/*!@file INVT/simple-saliency.C A very simple saliency map implementation */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2003   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file: Laurent Itti <itti@pollux.usc.edu>
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/INVT/simple-saliency.C $
// $Id: simple-saliency.C 12962 2010-03-06 02:13:53Z irock $
//

#include "Util/log.H"          // logging facilities, provides LINFO(), LFATAL(), etc functions
#include "Util/sformat.H"      // sformat() is similar to sprintf() but returns a C++ string

#include "Image/PixelsTypes.H" // for PixRGB<T>, etc
#include "Image/Image.H"       // template image class
#include "Image/ImageSet.H"    // for ImageSet<T>, a set of images (e.g., a pyramid)

#include "Image/ColorOps.H"    // for luminance(), getRGBY(), etc
#include "Image/PyramidOps.H"  // for buildPyrGeneric(), centerSurround(), etc
#include "Image/PyrBuilder.H"  // for the various builders for different types of pyramids
#include "Image/ShapeOps.H"    // for downSize()
#include "Image/fancynorm.H"   // for maxNormalize()
#include "Image/ImageSetOps.H" // for doLowThresh()

#include "Raster/Raster.H"     // for reading/writing images from/to disk
#include "Image/DrawOps.H"     // for colGreyCombo(), used only to create an output display
#include "Image/MathOps.H"     // for getMinMax, used only to print informational messages
#include "Image/Normalize.H"   // for normalizeFloat(), used only to write outputs to disk


// This program is a simplified version of the Itti et al (1998) visual saliency algorithm, designed for didactic
// purposes. Here, we compute a saliency map as if you were using: ezvision --maxnorm-type=Maxnorm --nouse-random

// To understand what is happening in this program, you should:
//
// - familiarize yourself with the Dims class in src/Image/Dims.H; it's just a simple struct with the width and height
//   of an image;
//
// - familiarize yourself with the PixRGB<T> pixel type in src/Image/PixelsTypes.H; it's just a simple struct with 3
//   components for red, green and blue;
//
// - familiarize yourself with the Image<T> class in src/Image/Image.H; it's just a 2D array of pixels with some Dims;
//   you do not need to worry about the internal handling of the array (copy-on-write and ref-counting). But check out
//   the begin() and beginw() functions, as those are how one gets direct access to the raw pixel array through C++
//   iterators (linear access, scanning the image row by row like a TV electron gun);
//
// - familiarize yourself with ImageSet<T>; it's just a 1D array of images, which we use here to store image pyramids
//   (a short illustrative sketch follows this list);
//
// - then, have a look at the following functions used in the present program:
//
//   getRGBY() in Image/ColorOps.C
//   downSize() in Image/ShapeOps.C
//   centerSurround() in Image/PyramidOps.C, which internally uses centerSurround() of Image/FilterOps.C; note that this
//     one in turn internally uses inplaceAttenuateBorders() defined in Image/MathOps.C
//   maxNormalize() in Image/fancynorm.C; here we use the VCXNORM_MAXNORM version of this algorithm
//   buildPyrGeneric() in Image/PyramidOps.C -- this is probably the most complicated one; it internally calls
//     builders for various types of image pyramids, which are defined in Image/PyrBuilder.C
//   doLowThresh() in Image/ImageSetOps.C, used by the motion channel

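// For illustration only (a minimal sketch, not part of this program): each level of an ImageSet<float> pyramid is an
// Image<float>, accessed with operator[], so one could inspect a pyramid built from a luminance image 'lum' like this:
//
//   ImageSet<float> pyr = buildPyrGeneric(lum, 0, maxdepth, Gaussian5, 0.0F);
//   for (uint i = 0; i < pyr.size(); ++i)
//     LINFO("pyramid level %u has dims %dx%d", i, pyr[i].getWidth(), pyr[i].getHeight());
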
// The following papers will help:
//
// * http://ilab.usc.edu/publications/doc/Itti_etal98pami.pdf
//   Check out Fig. 1 for the general flow diagram, and Fig. 2 for the max-normalization operator. Note that eq. 2 and 3
//   in this paper have a typo.
//
// * http://ilab.usc.edu/publications/doc/Itti_Koch00vr.pdf
//   Check out Fig. 2.a for how the center-surround operations work. Note that this also introduces a different
//   max-normalization scheme (Fig. 2b) which is not used in the present program.
//
// * http://ilab.usc.edu/publications/doc/Itti_Koch01ei.pdf
//   Have a look at Fig. 4 for the "truncated filter" boundary condition which we use to create image pyramids.
//
// * http://ilab.usc.edu/publications/doc/Itti_etal03spienn.pdf
//   This version of the model includes flicker and motion channels and all the equations are correct, so use this as
//   the reference for the full algorithm.

// ######################################################################
// ##### Global options:
// ######################################################################
#define sml            4   /* pyramid level of the saliency map */
#define level_min      2   /* min center level */
#define level_max      4   /* max center level */
#define delta_min      3   /* min center-surround delta */
#define delta_max      4   /* max center-surround delta */
#define maxdepth       (level_max + delta_max + 1)
#define num_feat_maps  ((level_max - level_min + 1) * (delta_max - delta_min + 1)) /* num of feature maps per pyramid */
#define motionlowthresh 3.0F
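// With the settings above, centers live at pyramid levels 2..4 and surrounds at center+3..center+4, giving the
// (center, surround) pairs (2,5), (2,6), (3,6), (3,7), (4,7), (4,8): num_feat_maps == 3 * 2 == 6 feature maps per
// pyramid, and the pyramids must reach down to level level_max + delta_max == 8, hence maxdepth == 9 levels (0..8).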

// ######################################################################
//! Compute the luminance of an RGB image, and convert from byte to float pixels
/*! note that we could as well use luminance() defined in Image/ColorOps.H, except that the rounding side effects are
    slightly different. This function corresponds to the one we use in our master code (it's buried in
    Channels/InputFrame.C).*/
Image<float> computeLuminance(const Image<PixRGB<byte> >& img)
{
  Image<float> lum(img.getDims(), NO_INIT);
  Image<PixRGB<byte> >::const_iterator in = img.begin(), stop = img.end();
  Image<float>::iterator dest = lum.beginw();
  const float one_third = 1.0F / 3.0F;

  while(in != stop) {
    *dest++ = one_third * (in->red() + in->green() + in->blue());
    ++in;
  }

  return lum;
}

// ######################################################################
//! Max-normalize a feature map or conspicuity map
Image<float> normalizMAP(const Image<float>& ima, const Dims& dims, const char *label)
{
  // do we want to downsize the map?
  Image<float> dsima;
  if (dims.isNonEmpty()) dsima = downSize(ima, dims); else dsima = ima;

  // apply spatial competition for salience (max-normalization):
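  // (The Maxnorm operator is the one described in Itti, Koch & Niebur (1998), Fig. 2: roughly, the map is first
  // rescaled to a fixed range [MAXNORMMIN .. MAXNORMMAX], then globally multiplied by (M - mbar)^2, where M is the
  // map's global maximum and mbar is the average of its other local maxima; this promotes maps with a few strong,
  // isolated peaks and suppresses maps with many comparable peaks.)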
  Image<float> result = maxNormalize(dsima, MAXNORMMIN, MAXNORMMAX, VCXNORM_MAXNORM);

  // show some info about this map:
  float mi, ma, nmi, nma; getMinMax(ima, mi, ma); getMinMax(result, nmi, nma);
  LINFO("%s: raw range [%f .. %f] max-normalized to [%f .. %f]", label, mi, ma, nmi, nma);

  return result;
}

// ######################################################################
// Compute a conspicuity map, called with various pyramid types for the various channels
Image<float> computeCMAP(const Image<float>& fima, const PyramidType ptyp, const char *label,
                         const bool absol, const float ori = 0.0F)
{
  LINFO("Building %s channel:", label);

  // compute pyramid:
  ImageSet<float> pyr = buildPyrGeneric(fima, 0, maxdepth, ptyp, ori);

  // alloc conspicuity map and clear it:
  Image<float> cmap(pyr[sml].getDims(), ZEROS);

  // get all the center-surround maps and combine them:
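  // (Each feature map is an across-scale difference, as in Itti & Koch (2000), Fig. 2a: centerSurround() rescales the
  // coarser 'surround' level to the dims of the finer 'center' level and takes the point-wise difference, optionally
  // keeping only its absolute value, with the image borders attenuated to reduce edge artifacts.)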
  for (int delta = delta_min; delta <= delta_max; ++delta)
    for (int lev = level_min; lev <= level_max; ++lev)
      {
        Image<float> tmp = centerSurround(pyr, lev, lev + delta, absol);
        std::string lbl = sformat("  %s(%d,%d)", label, lev, lev + delta);

        tmp = normalizMAP(tmp, cmap.getDims(), lbl.c_str());

        cmap += tmp;
      }

  float mi, ma;
  getMinMax(cmap, mi, ma); LINFO("%s: final cmap range [%f .. %f]", label, mi, ma);

  return cmap;
}

// ######################################################################
// Compute a motion conspicuity map from a motion pyramid
Image<float> computeMMAP(ImageSet<float>& pyr, const char *label)
{
  LINFO("Building %s channel:", label);

  // apply a low threshold to cut small motion values:
  doLowThresh(pyr, motionlowthresh);

  // alloc conspicuity map and clear it:
  Image<float> cmap(pyr[sml].getDims(), ZEROS);

  // get all the center-surround maps and combine them:
  for (int delta = delta_min; delta <= delta_max; ++delta)
    for (int lev = level_min; lev <= level_max; ++lev)
      {
        Image<float> tmp = centerSurround(pyr, lev, lev + delta, true);
        std::string lbl = sformat("  %s(%d,%d)", label, lev, lev + delta);

        tmp = normalizMAP(tmp, cmap.getDims(), lbl.c_str());

        cmap += tmp;
      }

  float mi, ma;
  getMinMax(cmap, mi, ma); LINFO("%s: final cmap range [%f .. %f]", label, mi, ma);

  return cmap;
}

// ######################################################################
//! Simple saliency map computation
/*! This is a barebones implementation of the Itti, Koch & Niebur (1998) saliency map algorithm */
int main(const int argc, const char **argv)
{
  if (argc != 4) LFATAL("Usage: %s <input-stem> <traj-stem> <sm-stem>\n"
                        "Will read <input-stem>000000.png, <input-stem>000001.png, etc and \n"
                        "create the corresponding outputs.", argv[0]);
  const char *instem = argv[1];
  const char *tstem = argv[2];
  const char *sstem = argv[3];
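
  // Example invocation (purely hypothetical file names, for illustration): simple-saliency in/frame traj/frame sm/frame
  // would read in/frame000000.png, in/frame000001.png, etc., and for each frame write traj/frameNNNNNN.png (a composite
  // of the input and the saliency map), sm/frameNNNNNN.png (the saliency map scaled to [0..255]) and sm/frameNNNNNN.pfm
  // (the raw float saliency map).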

  uint frame = 0; // frame number
  Image<float> prevlum; // luminance image of the previous frame, initially empty

  // Set up some pyramid builders for the motion detection, using Reichardt motion detectors in 4 directions. NOTE: the
  // very small values below should be 0.0F; however, in our reference code they are computed from an arbitrary motion
  // direction specified in degrees, using sin() and cos(), which apparently yields these almost-zero values:
  ReichardtPyrBuilder<float> dir0( 1.0F,         -0.0F,         Gaussian5,  90.0F);  // right
  ReichardtPyrBuilder<float> dir1( 6.12323e-17F, -1.0F,         Gaussian5, 180.0F);  // up
  ReichardtPyrBuilder<float> dir2(-1.0F,         -1.22465e-16F, Gaussian5, 270.0F);  // left
  ReichardtPyrBuilder<float> dir3(-1.83697e-16F,  1.0F,         Gaussian5, 360.0F);  // down
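  // For reference (derived from the values above): the (dx, dy) pairs are approximately (cos(theta), -sin(theta)) for
  // theta = 0, 90, 180 and 270 degrees, i.e., unit steps to the right, up, left and down in image coordinates; the
  // tiny non-zero entries are just floating-point round-off from computing cos() and sin() at those angles.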

  try {
    // run the main loop forever until some exception occurs (e.g., no more input frames):
    while(true)
      {
        // read the next input frame from disk into a 24-bit RGB image:
        if (Raster::fileExists(sformat("%s%06u.png", instem, frame)) == false)
          { LINFO("Input frames exhausted -- Quit"); break; }

        Image< PixRGB<byte> > in = Raster::ReadRGB(sformat("%s%06u.png", instem, frame));

        LINFO("########## Processing frame %06u ##########", frame);

        // compute the luminance of the image, in float values:
        Image<float> lum = computeLuminance(in);

        // convert the RGB image to float color pixels:
        Image<PixRGB<float> > inf = in;

        // get the R-G and B-Y color opponency maps, plus the flicker component (absolute difference from previous frame):
        Image<float> rg, by; getRGBY(inf, rg, by, 25.5F); // getRGBY() defined in Image/ColorOps.H
        Image<float> flick; if (prevlum.initialized()) flick = absDiff(lum, prevlum);

        // Compute image pyramids, center-surround differences across pairs of pyramid levels, and final resulting
        // conspicuity map for each channel. The saliency map will sum all the conspicuity maps, where each channel is
        // weighted by the number of feature maps it contains:
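        // With the defaults above, num_feat_maps == 6, so w == 1/6; channels composed of several sub-maps split this
        // weight accordingly below: color (red-green + blue-yellow) is scaled by w/2, orientation (4 orientations) and
        // motion (4 directions) by w/4, while intensity and flicker are single maps scaled by w.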
        const float w = 1.0F / float(num_feat_maps);

        // *** color channel:
        Image<float> chanc = normalizMAP(computeCMAP(by, Gaussian5, "blue-yellow", true) +
                                         computeCMAP(rg, Gaussian5, "red-green", true), Dims(0,0), "Color");

        Image<float> smap = chanc * (w / 2.0F);

        // *** flicker channel:
        if (flick.initialized()) {
          Image<float> chanf = normalizMAP(computeCMAP(flick, Gaussian5, "flicker", true), Dims(0,0), "Flicker");
          smap += chanf * w;
        }

        // *** intensity channel:
        const Image<float> chani = normalizMAP(computeCMAP(lum, Gaussian5, "intensity", true), Dims(0,0), "Intensity");

        smap += chani * w;

        // *** orientation channel:
        Image<float> chano = normalizMAP(computeCMAP(lum, Oriented9, "ori0", false, 0.0F), Dims(0,0), "ori0");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori45", false, 45.0F), Dims(0,0), "ori45");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori90", false, 90.0F), Dims(0,0), "ori90");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori135", false, 135.0F), Dims(0,0), "ori135");

        smap += normalizMAP(chano, Dims(0,0), "Orientation") * (w / 4.0F);

        // *** and motion channel:
        ImageSet<float> d = dir0.build(lum, level_min, level_max + delta_max + 1);
        Image<float> chanm = normalizMAP(computeMMAP(d, "dir0"), Dims(0,0), "dir0");

        d = dir1.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir1"), Dims(0,0), "dir1");

        d = dir2.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir2"), Dims(0,0), "dir2");

        d = dir3.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir3"), Dims(0,0), "dir3");

        smap += normalizMAP(chanm, Dims(0,0), "Motion") * (w / 4.0F);

        // *** do one final round of spatial competition for salience (maxnorm) on the combined saliency map:
        smap = maxNormalize(smap, 0.0F, 2.0F, VCXNORM_MAXNORM);

        // finally, interpret the saliency map values as nA of synaptic current and convert them to Amps:
        smap *= 1.0e-9F;

        float mi, ma; getMinMax(smap, mi, ma);
        LINFO("Saliency Map: final range [%f .. %f] nA", mi * 1.0e9F, ma * 1.0e9F);

        // create a composite display image from the input and the normalized saliency map:
        Image<float> smapn = smap; inplaceNormalize(smapn, 0.0F, 255.0F);
        Image<PixRGB<byte> > traj = colGreyCombo(in, smapn, true, false);

        // write our results:
        Raster::WriteRGB(traj, sformat("%s%06u.png", tstem, frame));
        Raster::WriteFloat(smap, FLOAT_NORM_0_255, sformat("%s%06u.png", sstem, frame));
        Raster::WriteFloat(smap, FLOAT_NORM_PRESERVE, sformat("%s%06u.pfm", sstem, frame));

        // ready for next frame:
        prevlum = lum; ++frame;
      }
  } catch(...) { LERROR("Exception caught -- Quitting."); }

  return(0);
}


// ######################################################################
/* So things look consistent in everyone's emacs... */
/* Local Variables: */
/* indent-tabs-mode: nil */
/* End: */