/*!@file INVT/simple-saliency.C A very simple saliency map implementation */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2003   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file: Laurent Itti <itti@pollux.usc.edu>
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/INVT/simple-saliency.C $
// $Id: simple-saliency.C 12962 2010-03-06 02:13:53Z irock $
//

#include "Util/log.H"          // logging facilities, provides LINFO(), LFATAL(), etc. functions
#include "Util/sformat.H"      // sformat() is similar to sprintf() but returns a C++ string

#include "Image/PixelsTypes.H" // for PixRGB<T>, etc
#include "Image/Image.H"       // template image class
#include "Image/ImageSet.H"    // for ImageSet<T>, a set of images (e.g., a pyramid)

#include "Image/ColorOps.H"    // for luminance(), getRGBY(), etc
#include "Image/PyramidOps.H"  // for buildPyrGeneric(), centerSurround(), etc
#include "Image/PyrBuilder.H"  // for the various builders for different types of pyramids
#include "Image/ShapeOps.H"    // for downSize()
#include "Image/fancynorm.H"   // for maxNormalize()
#include "Image/ImageSetOps.H" // for doLowThresh()

#include "Raster/Raster.H"     // for reading/writing images from/to disk
#include "Image/DrawOps.H"     // for colGreyCombo(), used only to create an output display
#include "Image/MathOps.H"     // for getMinMax(), used only to print informational messages
#include "Image/Normalize.H"   // for normalizeFloat(), used only to write outputs to disk


// This program is a simplified version of the Itti et al (1998) visual saliency algorithm, designed for didactic
// purposes. Here, we compute a saliency map as if you were using: ezvision --maxnorm-type=Maxnorm --nouse-random

// To understand what is happening in this program, you should:
//
// - familiarize yourself with the Dims class in src/Image/Dims.H; it's just a simple struct with a width and a height
//   of an image;
//
// - familiarize yourself with the PixRGB<T> pixel type in src/Image/PixelsTypes.H; it's just a simple struct with 3
//   components for red, green and blue;
//
// - familiarize yourself with the Image<T> class in src/Image/Image.H; it's just a 2D array of pixels with some Dims;
//   you do not need to worry about the internal handling of the array (copy-on-write and ref-counting). But check out
//   the begin() and beginw() functions, as those are how one gets direct access to the raw pixel array through C++
//   iterators (linear access, scanning the image like a TV gun) -- a short sketch of this idiom follows right after
//   this list;
//
// - familiarize yourself with ImageSet<T>; it's just a 1D array of images, and we use it here to store image pyramids;
//
// - then, have a look at the following functions used in the present program:
//
//   getRGBY() in Image/ColorOps.C
//   downSize() in Image/ShapeOps.C
//   centerSurround() in Image/PyramidOps.C, which internally uses centerSurround() of Image/FilterOps.C; note that
//     this one in turn internally uses inplaceAttenuateBorders() defined in Image/MathOps.C
//   maxNormalize() in Image/fancynorm.C; here we use the VCXNORM_MAXNORM version of this algo
//   buildPyrGeneric() in Image/PyramidOps.C -- this is probably the most complicated one, it internally calls
//     builders for various types of image pyramids, which are defined in Image/PyrBuilder.C
//   doLowThresh() in ImageSetOps.C, used by the motion channel
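
// As promised above, here is a minimal sketch of the begin()/beginw() iterator idiom. The helper name is hypothetical
// and this snippet is not used by this program; it simply mirrors the structure of computeLuminance() defined further
// down, returning a scaled copy of a float image:
//
//   static Image<float> scaleCopy(const Image<float>& src, const float factor)
//   {
//     Image<float> result(src.getDims(), NO_INIT);                      // uninitialized destination, same dims
//     Image<float>::const_iterator in = src.begin(), stop = src.end();  // read-only iterators over the source
//     Image<float>::iterator out = result.beginw();                     // writable iterator over the destination
//     while (in != stop) *out++ = factor * (*in++);                     // linear scan over all pixels
//     return result;
//   }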

// The following papers will help:
//
// * http://ilab.usc.edu/publications/doc/Itti_etal98pami.pdf
//   Check out Fig. 1 for the general flow diagram, and Fig. 2 for the max-normalization operator. Note that eq. 2 and
//   3 in this paper have a typo.
//
// * http://ilab.usc.edu/publications/doc/Itti_Koch00vr.pdf
//   Check out Fig. 2.a for how the center-surround operations work. Note that this paper also introduces a different
//   max-normalization scheme (Fig. 2b) which is not used in the present program.
//
// * http://ilab.usc.edu/publications/doc/Itti_Koch01ei.pdf
//   Have a look at Fig. 4 for the "truncated filter" boundary condition which we use to create image pyramids.
//
// * http://ilab.usc.edu/publications/doc/Itti_etal03spienn.pdf
//   This version of the model includes flicker and motion channels and all the equations are correct, so use this as
//   the reference for the full algorithm.

// ######################################################################
// ##### Global options:
// ######################################################################
#define sml 4             /* pyramid level of the saliency map */
#define level_min 2       /* min center level */
#define level_max 4       /* max center level */
#define delta_min 3       /* min center-surround delta */
#define delta_max 4       /* max center-surround delta */
#define maxdepth (level_max + delta_max + 1)
#define num_feat_maps ((level_max - level_min + 1) * (delta_max - delta_min + 1)) /* num of feature maps per pyramid */
#define motionlowthresh 3.0F
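
// A quick sanity check on the settings above (these numbers simply restate the #defines, they are not extra
// parameters): center levels run over {2, 3, 4} and center-surround deltas over {3, 4}, so the deepest surround level
// is level_max + delta_max = 8 and we need maxdepth = 9 pyramid levels (0 through 8); likewise, each pyramid yields
// num_feat_maps = (4 - 2 + 1) * (4 - 3 + 1) = 3 * 2 = 6 center-surround feature maps.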

// ######################################################################
//! Compute the luminance of an RGB image, and convert from byte to float pixels
/*! Note that we could as well use luminance() defined in Image/ColorOps.H, except that the rounding side effects are
    slightly different. This function corresponds to the one we use in our master code (it's buried in
    Channels/InputFrame.C). */
Image<float> computeLuminance(const Image<PixRGB<byte> >& img)
{
  Image<float> lum(img.getDims(), NO_INIT);
  Image<PixRGB<byte> >::const_iterator in = img.begin(), stop = img.end();
  Image<float>::iterator dest = lum.beginw();
  const float one_third = 1.0F / 3.0F;

  while (in != stop) {
    *dest++ = one_third * (in->red() + in->green() + in->blue());
    ++in;
  }

  return lum;
}

// ######################################################################
//! Max-normalize a feature map or conspicuity map
Image<float> normalizMAP(const Image<float>& ima, const Dims& dims, const char *label)
{
  // do we want to downsize the map?
  Image<float> dsima;
  if (dims.isNonEmpty()) dsima = downSize(ima, dims); else dsima = ima;

  // apply spatial competition for salience (max-normalization):
  Image<float> result = maxNormalize(dsima, MAXNORMMIN, MAXNORMMAX, VCXNORM_MAXNORM);

  // show some info about this map:
  float mi, ma, nmi, nma; getMinMax(ima, mi, ma); getMinMax(result, nmi, nma);
  LINFO("%s: raw range [%f .. %f] max-normalized to [%f .. %f]", label, mi, ma, nmi, nma);

  return result;
}

// ######################################################################
// Compute a conspicuity map, called with various pyramid types for the various channels
Image<float> computeCMAP(const Image<float>& fima, const PyramidType ptyp, const char *label,
                         const bool absol, const float ori = 0.0F)
{
  LINFO("Building %s channel:", label);

  // compute pyramid:
  ImageSet<float> pyr = buildPyrGeneric(fima, 0, maxdepth, ptyp, ori);

  // alloc conspicuity map and clear it:
  Image<float> cmap(pyr[sml].getDims(), ZEROS);

  // get all the center-surround maps and combine them:
  for (int delta = delta_min; delta <= delta_max; ++delta)
    for (int lev = level_min; lev <= level_max; ++lev)
      {
        Image<float> tmp = centerSurround(pyr, lev, lev + delta, absol);
        std::string lbl = sformat(" %s(%d,%d)", label, lev, lev + delta);

        tmp = normalizMAP(tmp, cmap.getDims(), lbl.c_str());

        cmap += tmp;
      }

  float mi, ma;
  getMinMax(cmap, mi, ma); LINFO("%s: final cmap range [%f .. %f]", label, mi, ma);

  return cmap;
}
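
// Note (just restating the global options, nothing new): with level_min = 2, level_max = 4, delta_min = 3 and
// delta_max = 4, the double loop above visits the (center, surround) level pairs (2,5), (3,6), (4,7), (2,6), (3,7)
// and (4,8), i.e., the num_feat_maps = 6 feature maps that get summed into each conspicuity map.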

// ######################################################################
// Compute a motion conspicuity map from a motion pyramid
Image<float> computeMMAP(ImageSet<float>& pyr, const char *label)
{
  LINFO("Building %s channel:", label);

  // apply a low threshold to cut small motion values:
  doLowThresh(pyr, motionlowthresh);

  // alloc conspicuity map and clear it:
  Image<float> cmap(pyr[sml].getDims(), ZEROS);

  // get all the center-surround maps and combine them:
  for (int delta = delta_min; delta <= delta_max; ++delta)
    for (int lev = level_min; lev <= level_max; ++lev)
      {
        Image<float> tmp = centerSurround(pyr, lev, lev + delta, true);
        std::string lbl = sformat(" %s(%d,%d)", label, lev, lev + delta);

        tmp = normalizMAP(tmp, cmap.getDims(), lbl.c_str());

        cmap += tmp;
      }

  float mi, ma;
  getMinMax(cmap, mi, ma); LINFO("%s: final cmap range [%f .. %f]", label, mi, ma);

  return cmap;
}

// ######################################################################
//! Simple saliency map computation
/*! This is a barebones implementation of the Itti, Koch & Niebur (1998) saliency map algorithm */
int main(const int argc, const char **argv)
{
  if (argc != 4) LFATAL("Usage: %s <input-stem> <traj-stem> <sm-stem>\n"
                        "Will read <input-stem>000000.png, <input-stem>000001.png, etc, and\n"
                        "create the corresponding outputs.", argv[0]);
  const char *instem = argv[1];
  const char *tstem = argv[2];
  const char *sstem = argv[3];

  uint frame = 0;        // frame number
  Image<float> prevlum;  // luminance image of the previous frame, initially empty

  // Setup some pyramid builders for the motion detection, using Reichardt motion detectors in 4 directions. NOTE: the
  // very small values below should be 0.0F; however, in our reference code, these are computed from an arbitrary
  // motion direction specified in degrees, using sin() and cos(), which apparently yields these almost-zero values:
  ReichardtPyrBuilder<float> dir0( 1.0F, -0.0F, Gaussian5, 90.0F);           // right
  ReichardtPyrBuilder<float> dir1( 6.12323e-17F, -1.0F, Gaussian5, 180.0F);  // up
  ReichardtPyrBuilder<float> dir2(-1.0F, -1.22465e-16F, Gaussian5, 270.0F);  // left
  ReichardtPyrBuilder<float> dir3(-1.83697e-16F, 1.0F, Gaussian5, 360.0F);   // down

  try {
    // run the main loop forever until some exception occurs (e.g., no more input frames):
    while (true)
      {
        // read the next input frame from disk into a 24-bit RGB image:
        if (Raster::fileExists(sformat("%s%06u.png", instem, frame)) == false)
          { LINFO("Input frames exhausted -- Quit"); break; }

        Image< PixRGB<byte> > in = Raster::ReadRGB(sformat("%s%06u.png", instem, frame));

        LINFO("########## Processing frame %06u ##########", frame);

        // compute the luminance of the image, in float values:
        Image<float> lum = computeLuminance(in);

        // convert the RGB image to float color pixels:
        Image<PixRGB<float> > inf = in;

        // get R-G, B-Y, and flicker components:
        Image<float> rg, by; getRGBY(inf, rg, by, 25.5F); // getRGBY() defined in Image/ColorOps.H
        Image<float> flick; if (prevlum.initialized()) flick = absDiff(lum, prevlum);

        // Compute image pyramids, center-surround differences across pairs of pyramid levels, and the final resulting
        // conspicuity map for each channel. The saliency map will sum all the conspicuity maps, where each channel is
        // weighted by the number of feature maps it contains:
        const float w = 1.0F / float(num_feat_maps);
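        // As a worked example of that weighting (this simply restates the multiplications below, with
        // num_feat_maps = 6 and hence w = 1/6): intensity and flicker each contain 6 feature maps and are weighted
        // by w = 1/6; color contains 12 (two pyramids of 6) and is weighted by w/2 = 1/12; orientation and motion
        // each contain 24 (four pyramids of 6) and are weighted by w/4 = 1/24.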

        // *** color channel:
        Image<float> chanc = normalizMAP(computeCMAP(by, Gaussian5, "blue-yellow", true) +
                                         computeCMAP(rg, Gaussian5, "red-green", true), Dims(0,0), "Color");

        Image<float> smap = chanc * (w / 2.0F);

        // *** flicker channel:
        if (flick.initialized()) {
          Image<float> chanf = normalizMAP(computeCMAP(flick, Gaussian5, "flicker", true), Dims(0,0), "Flicker");
          smap += chanf * w;
        }

        // *** intensity channel:
        const Image<float> chani = normalizMAP(computeCMAP(lum, Gaussian5, "intensity", true), Dims(0,0), "Intensity");

        smap += chani * w;

        // *** orientation channel:
        Image<float> chano = normalizMAP(computeCMAP(lum, Oriented9, "ori0", false, 0.0F), Dims(0,0), "ori0");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori45", false, 45.0F), Dims(0,0), "ori45");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori90", false, 90.0F), Dims(0,0), "ori90");
        chano += normalizMAP(computeCMAP(lum, Oriented9, "ori135", false, 135.0F), Dims(0,0), "ori135");

        smap += normalizMAP(chano, Dims(0,0), "Orientation") * (w / 4.0F);

        // *** and motion channel:
        ImageSet<float> d = dir0.build(lum, level_min, level_max + delta_max + 1);
        Image<float> chanm = normalizMAP(computeMMAP(d, "dir0"), Dims(0,0), "dir0");

        d = dir1.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir1"), Dims(0,0), "dir1");

        d = dir2.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir2"), Dims(0,0), "dir2");

        d = dir3.build(lum, level_min, level_max + delta_max + 1);
        chanm += normalizMAP(computeMMAP(d, "dir3"), Dims(0,0), "dir3");

        smap += normalizMAP(chanm, Dims(0,0), "Motion") * (w / 4.0F);

        // *** do one final round of spatial competition for salience (maxnorm) on the combined saliency map:
        smap = maxNormalize(smap, 0.0F, 2.0F, VCXNORM_MAXNORM);

        // finally convert saliency map values to nA of synaptic current:
        smap *= 1.0e-9F;

        float mi, ma; getMinMax(smap, mi, ma);
        LINFO("Saliency Map: final range [%f .. %f] nA", mi * 1.0e9F, ma * 1.0e9F);

        // create a composite image with input + saliency map + attention trajectory:
        Image<float> smapn = smap; inplaceNormalize(smapn, 0.0F, 255.0F);
        Image<PixRGB<byte> > traj = colGreyCombo(in, smapn, true, false);

        // write our results:
        Raster::WriteRGB(traj, sformat("%s%06u.png", tstem, frame));
        Raster::WriteFloat(smap, FLOAT_NORM_0_255, sformat("%s%06u.png", sstem, frame));
        Raster::WriteFloat(smap, FLOAT_NORM_PRESERVE, sformat("%s%06u.pfm", sstem, frame));

        // ready for next frame:
        prevlum = lum; ++frame;
      }
  } catch (...) { LERROR("Exception caught -- Quitting."); }

  return(0);
}


// ######################################################################
/* So things look consistent in everyone's emacs... */
/* Local Variables: */
/* indent-tabs-mode: nil */
/* End: */