00001 /*!@file Neuro/GistEstimatorTexton.C */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. 
// 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: Manu Viswanathan <mviswana at usc dot edu> 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Neuro/GistEstimatorTexton.C $ 00035 // $Id: GistEstimatorTexton.C 13103 2010-03-31 02:24:47Z itti $ 00036 // 00037 00038 //------------------------------ HEADERS -------------------------------- 00039 00040 // Gist specific headers 00041 #include "Neuro/GistEstimatorTexton.H" 00042 //#include "Neuro/gistParams.H" 00043 00044 // Other INVT headers 00045 #include "Neuro/VisualCortex.H" 00046 #include "Neuro/NeuroSimEvents.H" 00047 00048 #include "Simulation/SimEventQueue.H" 00049 00050 #include "Channels/GaborChannel.H" 00051 #include "Channels/OrientationChannel.H" 00052 00053 #include "Image/ShapeOps.H" 00054 #include "Image/CutPaste.H" 00055 #include "Image/Dims.H" 00056 00057 #include "nub/ref.h" 00058 #include "rutz/shared_ptr.h" 00059 00060 // Standard C++ headers 00061 #include <sstream> 00062 #include <vector> 00063 #include <numeric> 00064 #include <algorithm> 00065 #include <functional> 00066 #include <stdexcept> 00067 #include <utility> 00068 #include <limits> 00069 #include <cmath> 00070 #include <ctime> 00071 00072 //----------------------------- TYPEDEFS -------------------------------- 00073 00074 // Some useful shortcuts 00075 typedef GistEstimatorTexton::PixelType PixelType ; 00076 typedef GistEstimatorTexton::ImageType ImageType ; 00077 typedef std::vector<ImageType> FilterationResults ; 00078 00079 //------------------------ STATIC DATA MEMBERS -------------------------- 00080 00081 // The GistEstimatorTexton relies on its client to load the universal 00082 // textons prior to using the textons computation facility. We could load 00083 // the universal textons here and thereby not bother the client with this 00084 // chore. 
// However, client programs may not be interested in figuring out
// which universal textons occur in a given input image. Instead, they
// may just be interested in getting their hands on the input image's
// "raw" textons via the training hook. (The train-texton program, for
// instance, has such a mode of operation.)
//
// If we were to always load the universal textons each time this class
// is used, it would always impose the universal textons load overhead
// even if the client is not interested in using that portion of this
// class. By moving the universal textons loading responsibility into
// client space, we avoid this unnecessary overhead.
//
// Moreover, this approach gives clients the flexibility to save textons
// to disk in whatever format they choose. All the GistEstimatorTexton
// needs is an Image specifying the universal textons regardless of how
// that Image is actually obtained.
//
// To ensure that clients properly set the universal textons, we require
// them to provide the address of such an Image rather than the contents.
// This allows us to check that this pointer is valid prior to performing
// the texton histogram computations.
00105 const ImageType* GistEstimatorTexton::itsUniversalTextons ; 00106 00107 //-------------------------- INITIALIZATION ----------------------------- 00108 00109 GistEstimatorTexton::GistEstimatorTexton(OptionManager& mgr, 00110 const std::string& descrName, 00111 const std::string& tagName) 00112 : GistEstimatorAdapter(mgr, descrName, tagName), 00113 SIMCALLBACK_INIT(SimEventVisualCortexOutput), 00114 itsTrainingHook(0) 00115 {} 00116 00117 //----------------------------- CLEAN-UP -------------------------------- 00118 00119 GistEstimatorTexton::~GistEstimatorTexton() 00120 {} 00121 00122 //------------------ GIST FEATURE VECTOR COMPUTATION -------------------- 00123 00124 // Forward declarations 00125 namespace { 00126 00127 ImageType apply_texton_filter(const VisualCortex*, uint, uint) ; 00128 ImageType compute_textons(const FilterationResults&) ; 00129 Image<double> histogram(const ImageType&, const ImageType&) ; 00130 00131 } 00132 00133 // The processing method filters the "current" image passed in by the INVT 00134 // simulation framework and computes this image's textons. Then, 00135 // depending on whether we are in training mode or not, it either passes 00136 // the textons to the texton trainer (src/Gist/train-texton.C) via the 00137 // training hook or performs K-nearest neighbour search to figure out 00138 // the instances of the universal textons in the input image. 00139 // 00140 // DEVNOTE: We already know how many filters we're going to be applying 00141 // to the input image. So we ought to be able to initialize the 00142 // filteration results (STL) vector with these many elements so as to 00143 // avoid reallocations later on. 00144 // 00145 // Unfortunately, for some strange reason, preallocating this number of 00146 // elements and then calling push_back() doesn't quite work. The vector 00147 // ends up with the specified number of preallocated elements and then 00148 // push_back() appends beyond this preallocated range! 
00149 // 00150 // This is *not* standards-compliant behaviour. The vector reserve() 00151 // method is not supposed to affect the vector's size; it should only 00152 // change the vector's capacity. A small test program written for 00153 // verification purposes confirmed this. Yet the application of the same 00154 // technique here backfires. (Perhaps something to do with the INVT build 00155 // environment?) 00156 00157 // ###################################################################### 00158 void GistEstimatorTexton:: 00159 onSimEventVisualCortexOutput(SimEventQueue& q, rutz::shared_ptr<SimEventVisualCortexOutput>& e) 00160 { 00161 LFATAL("FIXME, this should be done using a SimReq"); 00162 /* 00163 VisualCortex* vc = dynamic_cast<VisualCortex*>(e->source()) ; 00164 00165 //FilterationResults results(NUM_FILTERS) ; // doesn't work; see above 00166 FilterationResults results ; 00167 for (uint orientation = 0; orientation < NUM_ORIENTATIONS; ++orientation) 00168 for (uint scale = 0; scale < NUM_SCALES; ++scale) 00169 results.push_back(apply_texton_filter(vc, orientation, scale)) ; 00170 00171 ImageType textons = compute_textons(results) ; 00172 LINFO("MVN: computed %dx%d texton \"matrix\" for input image", 00173 textons.getHeight(), textons.getWidth()) ; 00174 if (itsTrainingHook) 00175 itsTrainingHook(textons) ; 00176 else 00177 { 00178 if (! 
itsUniversalTextons) 00179 throw std::runtime_error("GistEstimatorTexton requires " 00180 "universal textons \"database\"") ; 00181 clock_t start_time = clock() ; 00182 itsGistVector = histogram(textons, *itsUniversalTextons) ; 00183 LINFO("MVN: %g seconds to compute histogram, i.e., %dx%d gist vector", 00184 static_cast<double>(clock() - start_time)/CLOCKS_PER_SEC, 00185 itsGistVector.getHeight(), itsGistVector.getWidth()) ; 00186 } 00187 00188 rutz::shared_ptr<SimEventGistOutput> 00189 gistOutputEvent(new SimEventGistOutput(this, itsGistVector)) ; 00190 q.post(gistOutputEvent) ; 00191 */ 00192 } 00193 00194 //------------------------ TEXTON COMPUTATIONS -------------------------- 00195 00196 // This section is local to this file. Thus, stuffing it in an anonymous 00197 // namespace ensures that its definitions don't clash with identically 00198 // named entities in other modules. 00199 namespace { 00200 00201 // As per the Renninger and Malik paper, their texton filters are 00202 // equivalent to Gabor filters applied at different orientations and 00203 // scales. Thus, the following function simply retrieves the Gabor 00204 // channel for the specified orientation and scale. 00205 // 00206 // One thing to keep in mind though is that at coarser scales, the 00207 // filtered image will have a correspondingly smaller size. At scale 0, 00208 // the filtered image is the same size as the input image; at scale 1, it 00209 // is half the size of the input image; at scale 2, 1/4th the size of the 00210 // input image; so on and so forth. Basically, at scale n, it will be 00211 // 1/(2^n) the size of the input image. 00212 // 00213 // To get the texton for the (i,j)th pixel, we extract the (i,j)th pixel 00214 // from each filter's resultant image. However, if the resultant and 00215 // input images are different sizes, there may be no pixel to extract 00216 // from the resultant image. 
For example, if the input image is 320x240 00217 // pixels, at scale 2, the filtered image will be 80x60 pixels. And we 00218 // won't be able to create the texton for all pixels beyond the 80th row 00219 // and 60th column. 00220 // 00221 // To fix this problem, we rescale the filtered image back to the input 00222 // image's size. 00223 ImageType 00224 apply_texton_filter(const VisualCortex* vc, uint orientation, uint scale) 00225 { 00226 00227 LFATAL("Please talk to Laurent to fix this"); 00228 return ImageType(); 00229 /* 00230 nub::soft_ref<OrientationChannel> oc ; 00231 dynCastWeakToFrom(oc, vc->subChan("orientation")) ; 00232 GaborChannel& gc = oc->gabor(orientation) ; 00233 ImageType I = gc.getImage(scale) ; 00234 if (scale > 0) 00235 I = rescale(I, I.getDims() * (1 << scale)) ; // blow up by 2^scale 00236 return I ; 00237 */ 00238 } 00239 00240 // Forward declaration 00241 ImageType get_textons(int i, int j, const FilterationResults&) ; 00242 00243 // The following function returns the textons for the entire input image 00244 // given the filteration results. The textons are returned as an NxR 00245 // image where N is the number of filters applied and R is the product 00246 // of the width and height of the input image. Thus, the textons Image 00247 // returned by this function will have R (i.e., WxH) rows and N columns. 00248 ImageType compute_textons(const FilterationResults& results) 00249 { 00250 int width = results[0].getWidth() ; 00251 int height = results[0].getHeight() ; 00252 ImageType textons(GistEstimatorTexton::NUM_FILTERS, width * height, 00253 NO_INIT) ; 00254 int row = 0 ; 00255 for (int i = 0; i < width; ++i) 00256 for (int j = 0; j < height; ++j) 00257 inplacePaste(textons, get_textons(i, j, results), 00258 Point2D<int>(0, row++)) ; 00259 00260 return textons ; 00261 } 00262 00263 // Quick helper to extract the (i,j)th pixel from a given Image. 
00264 // 00265 // DEVNOTE: We could write this as a std::binary_function and then use 00266 // it in conjunction with std::bind2nd(). But that doesn't really reduce 00267 // the amount of code to be written here. Keeping this a unary_function 00268 // and using it directly makes the intent somewhat clearer. 00269 // 00270 // DEVNOTE 2: Another possibility is to use std::mem_fun_ref() in 00271 // conjunction with std::bind2nd(). Unfortunately, std::mem_fun_ref() 00272 // doesn't work when the argument of its function call operator is a 00273 // reference (compiler issues "reference to reference" error). That 00274 // requires partial specialization of the std::binary_function so that 00275 // the second argument is a reference. 00276 // 00277 // But that too doesn't really reduce the amount of code to be written. 00278 // As a matter of fact, with this second approach, we have to write a 00279 // lot more code because of the extra partial specialization plus several 00280 // typedefs required to disambiguate the call to Image<T>::getVal(). 00281 // 00282 // These extra bits of code simply serve to further obfuscate intent. 00283 // Thus, it's best to just stick with this basic custom function object. 00284 // It gets the job done and makes fairly clear what's going on. 00285 class get_pixel : std::unary_function<ImageType, PixelType> { 00286 Point2D<int> coordinates ; 00287 public : 00288 get_pixel(int i, int j) : coordinates(i, j) {} 00289 PixelType operator()(const ImageType& I) const { 00290 return I.getVal(coordinates) ; 00291 } 00292 } ; 00293 00294 // A texton is simply the vector of filter responses for a given pixel. 00295 // That is, if we apply 36 filters to an input image, we will get 36 00296 // Images as the filteration results. The texton for pixel (i,j) will be 00297 // the vector of 36 numbers formed by taking pixel (i,j) from each of the 00298 // 36 Images in the filteration results. 
00299 // 00300 // The following function returns the textons corresponding to the 00301 // (i,j)th pixel of the input image given the filteration results. The 00302 // textons for this pixel are returned as an Nx1 Image, i.e., 1 row of N 00303 // values (where N is the number of filters applied). 00304 ImageType get_textons(int i, int j, const FilterationResults& images) 00305 { 00306 ImageType textons(GistEstimatorTexton::NUM_FILTERS, 1, NO_INIT) ; 00307 std::transform(images.begin(), images.end(), textons.beginw(), 00308 get_pixel(i, j)) ; 00309 return textons ; 00310 } 00311 00312 } // end of local namespace encapsulating above definitions 00313 00314 //---------------------- HISTOGRAM COMPUTATIONS ------------------------- 00315 00316 namespace { 00317 00318 // Quick helper to extract the r-th row of an Image 00319 inline ImageType get_row(int r, const ImageType& I) 00320 { 00321 return crop(I, Point2D<int>(0, r), Dims(I.getWidth(), 1)) ; 00322 } 00323 00324 // Forward declarations 00325 int nearest_universal_texton(const ImageType&, const ImageType&) ; 00326 Image<double> normalized_histogram(const std::vector<int>&, int) ; 00327 double dist2(const ImageType&, const ImageType&, double) ; 00328 00329 // Given the set of input textons, I, and the universal textons, U, this 00330 // function returns the normalized histogram counting the occurences of 00331 // the supplied universal textons in I. This histogram is the "gist 00332 // signature" of the input image and forms the basis for image 00333 // classification. 
00334 Image<double> histogram(const ImageType& I, const ImageType& U) 00335 { 00336 std::vector<int> counts(U.getHeight()) ; 00337 std::fill(counts.begin(), counts.end(), 0) ; 00338 00339 for (int i = 0; i < I.getHeight(); ++i) 00340 ++counts[nearest_universal_texton(get_row(i, I), U)] ; 00341 00342 return normalized_histogram(counts, I.getHeight()) ; 00343 } 00344 00345 // Given a row of the input image and the set of universal textons, this 00346 // function returns the index of the universal texton nearest to the 00347 // input image row. 00348 // 00349 // DEVNOTE: The Renninger-Malik implementation uses the Netlab (Matlab) 00350 // toolbox's knn function to compute the nearest universal texton (i.e., 00351 // k = 1). Rather than implement the K-nn algorithm here and call it 00352 // with k = 1, we simply take a shortcut and perform a simple nearest 00353 // neighbour test. 00354 int nearest_universal_texton(const ImageType& I, const ImageType& U) 00355 { 00356 double D ; 00357 00358 std::pair<double, int> min(std::numeric_limits<double>::max(), -1) ; 00359 for (int i = 0; i < U.getHeight(); ++i) 00360 if ((D = dist2(I, get_row(i, U), min.first)) < min.first) 00361 min = std::make_pair(D, i) ; 00362 00363 return min.second ; 00364 } 00365 00366 // The following function returns the square of the Euclidean distance 00367 // between two vectors subject to a supplied minimum, i.e., if the 00368 // square distance exceeds the minimum, the computation is 00369 // short-circuited and the minimum returned. 00370 // 00371 // This short-circuiting helps reduce the nearest universal texton 00372 // computation by about 2 seconds. Without it, each image's histogram 00373 // was taking over 8 seconds to compute no thanks to the O(nU) nature of 00374 // the problem (where n is the number of input image textons and U the 00375 // number of universal textons). 
00376 // 00377 // The vectors are passed in as Images that are assumed to have a single 00378 // row and the same width. 00379 double dist2(const ImageType& L, const ImageType& R, double min) 00380 { 00381 double D = 0 ; 00382 for (int i = 0; i < L.getWidth(); ++i) { 00383 double l = L.getVal(i, 0) ; 00384 double r = R.getVal(i, 0) ; 00385 D += (l-r) * (l-r) ; 00386 if (D > min) 00387 return min ; 00388 } 00389 return D ; 00390 } 00391 00392 // The following function converts a vector of counts and the total of 00393 // all those counts to an Image containing the normalized counts. 00394 Image<double> normalized_histogram(const std::vector<int>& counts, int total) 00395 { 00396 Image<double> I(counts.size(), 1, ZEROS) ; 00397 std::transform(counts.begin(), counts.end(), I.beginw(), 00398 std::bind2nd(std::divides<double>(), total)) ; 00399 return I ; 00400 } 00401 00402 } // end of local namespace encapsulating above definitions 00403 00404 //----------------------------------------------------------------------- 00405 00406 /* So things look consistent in everyone's emacs... */ 00407 /* Local Variables: */ 00408 /* indent-tabs-mode: nil */ 00409 /* End: */