GistEstimatorTexton.C

Go to the documentation of this file.
00001 /*!@file Neuro/GistEstimatorTexton.C */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Manu Viswanathan <mviswana at usc dot edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Neuro/GistEstimatorTexton.C $
00035 // $Id: GistEstimatorTexton.C 13103 2010-03-31 02:24:47Z itti $
00036 //
00037 
00038 //------------------------------ HEADERS --------------------------------
00039 
00040 // Gist specific headers
00041 #include "Neuro/GistEstimatorTexton.H"
00042 //#include "Neuro/gistParams.H"
00043 
00044 // Other INVT headers
00045 #include "Neuro/VisualCortex.H"
00046 #include "Neuro/NeuroSimEvents.H"
00047 
00048 #include "Simulation/SimEventQueue.H"
00049 
00050 #include "Channels/GaborChannel.H"
00051 #include "Channels/OrientationChannel.H"
00052 
00053 #include "Image/ShapeOps.H"
00054 #include "Image/CutPaste.H"
00055 #include "Image/Dims.H"
00056 
00057 #include "nub/ref.h"
00058 #include "rutz/shared_ptr.h"
00059 
00060 // Standard C++ headers
00061 #include <sstream>
00062 #include <vector>
00063 #include <numeric>
00064 #include <algorithm>
00065 #include <functional>
00066 #include <stdexcept>
00067 #include <utility>
00068 #include <limits>
00069 #include <cmath>
00070 #include <ctime>
00071 
00072 //----------------------------- TYPEDEFS --------------------------------
00073 
00074 // Some useful shortcuts
00075 typedef GistEstimatorTexton::PixelType PixelType ;
00076 typedef GistEstimatorTexton::ImageType ImageType ;
00077 typedef std::vector<ImageType> FilterationResults ;
00078 
00079 //------------------------ STATIC DATA MEMBERS --------------------------
00080 
00081 // The GistEstimatorTexton relies on its client to load the universal
00082 // textons prior to using the textons computation facility. We could load
00083 // the universal textons here and thereby not bother the client with this
00084 // chore. However, client programs may not be interested in figuring out
00085 // which universal textons occur in a given input image. Instead, they
00086 // may just be interested in getting their hands on the input image's
00087 // "raw" textons via the training hook. (The train-texton program, for
00088 // instance, has such a mode of operation.)
00089 //
00090 // If we were to always load the universal textons each time this class
00091 // is used, it would always impose the universal textons load overhead
00092 // even if the client is not interested in using that portion of this
00093 // class. By moving the universal textons loading responsibility into
00094 // client space, we avoid this unnecessary overhead.
00095 //
00096 // Moreover, this approach gives clients the flexibility to save textons
00097 // to disk in whatever format they choose. All the GistEstimatorTexton
00098 // needs is an Image specifying the universal textons regardless of how
00099 // that Image is actually obtained.
00100 //
00101 // To ensure that clients properly set the universal textons, we require
00102 // them to provide the address of such an Image rather than the contents.
00103 // This allows us to check that this pointer is valid prior to performing
00104 // the texton histogram computations.
00105 const ImageType* GistEstimatorTexton::itsUniversalTextons ;
00106 
00107 //-------------------------- INITIALIZATION -----------------------------
00108 
00109 GistEstimatorTexton::GistEstimatorTexton(OptionManager& mgr,
00110                                          const std::string& descrName,
00111                                          const std::string& tagName)
00112    : GistEstimatorAdapter(mgr, descrName, tagName),
00113      SIMCALLBACK_INIT(SimEventVisualCortexOutput),
00114      itsTrainingHook(0)
00115 {}
00116 
00117 //----------------------------- CLEAN-UP --------------------------------
00118 
00119 GistEstimatorTexton::~GistEstimatorTexton()
00120 {}
00121 
00122 //------------------ GIST FEATURE VECTOR COMPUTATION --------------------
00123 
00124 // Forward declarations
00125 namespace {
00126 
00127 ImageType apply_texton_filter(const VisualCortex*, uint, uint) ;
00128 ImageType compute_textons(const FilterationResults&) ;
00129 Image<double> histogram(const ImageType&, const ImageType&) ;
00130 
00131 }
00132 
00133 // The processing method filters the "current" image passed in by the INVT
00134 // simulation framework and computes this image's textons. Then,
00135 // depending on whether we are in training mode or not, it either passes
00136 // the textons to the texton trainer (src/Gist/train-texton.C) via the
00137 // training hook or performs K-nearest neighbour search to figure out
00138 // the instances of the universal textons in the input image.
00139 //
// DEVNOTE: We already know how many filters we're going to be applying
// to the input image. So we ought to be able to size the filteration
// results (STL) vector up front so as to avoid reallocations later on.
//
// Note, however, that constructing the vector with an element count,
// i.e., FilterationResults results(NUM_FILTERS), and then calling
// push_back() does not achieve this: the sized constructor creates
// NUM_FILTERS default-constructed elements and push_back() then appends
// beyond them.
//
// This *is* standards-compliant behaviour (it has nothing to do with
// the INVT build environment). Only the vector reserve() method leaves
// the vector's size alone and changes just its capacity. Thus, to
// preallocate, default-construct the vector and call
// results.reserve(NUM_FILTERS) before the push_back() loop.
00156 
00157 // ######################################################################
00158 void GistEstimatorTexton::
00159 onSimEventVisualCortexOutput(SimEventQueue& q, rutz::shared_ptr<SimEventVisualCortexOutput>& e)
00160 {
00161   LFATAL("FIXME, this should be done using a SimReq");
00162   /*
00163   VisualCortex* vc = dynamic_cast<VisualCortex*>(e->source()) ;
00164 
00165   //FilterationResults results(NUM_FILTERS) ; // doesn't work; see above
00166   FilterationResults results ;
00167   for (uint orientation = 0; orientation < NUM_ORIENTATIONS; ++orientation)
00168     for (uint scale = 0; scale < NUM_SCALES; ++scale)
00169       results.push_back(apply_texton_filter(vc, orientation, scale)) ;
00170 
00171   ImageType textons = compute_textons(results) ;
00172   LINFO("MVN: computed %dx%d texton \"matrix\" for input image",
00173         textons.getHeight(), textons.getWidth()) ;
00174   if (itsTrainingHook)
00175     itsTrainingHook(textons) ;
00176   else
00177     {
00178       if (! itsUniversalTextons)
00179         throw std::runtime_error("GistEstimatorTexton requires "
00180                                  "universal textons \"database\"") ;
00181       clock_t start_time = clock() ;
00182       itsGistVector = histogram(textons, *itsUniversalTextons) ;
00183       LINFO("MVN: %g seconds to compute histogram, i.e., %dx%d gist vector",
00184             static_cast<double>(clock() - start_time)/CLOCKS_PER_SEC,
00185             itsGistVector.getHeight(), itsGistVector.getWidth()) ;
00186     }
00187 
00188   rutz::shared_ptr<SimEventGistOutput>
00189     gistOutputEvent(new SimEventGistOutput(this, itsGistVector)) ;
00190   q.post(gistOutputEvent) ;
00191   */
00192 }
00193 
00194 //------------------------ TEXTON COMPUTATIONS --------------------------
00195 
00196 // This section is local to this file. Thus, stuffing it in an anonymous
00197 // namespace ensures that its definitions don't clash with identically
00198 // named entities in other modules.
00199 namespace {
00200 
00201 // As per the Renninger and Malik paper, their texton filters are
00202 // equivalent to Gabor filters applied at different orientations and
00203 // scales. Thus, the following function simply retrieves the Gabor
00204 // channel for the specified orientation and scale.
00205 //
00206 // One thing to keep in mind though is that at coarser scales, the
00207 // filtered image will have a correspondingly smaller size. At scale 0,
00208 // the filtered image is the same size as the input image; at scale 1, it
00209 // is half the size of the input image; at scale 2, 1/4th the size of the
00210 // input image; so on and so forth. Basically, at scale n, it will be
00211 // 1/(2^n) the size of the input image.
00212 //
00213 // To get the texton for the (i,j)th pixel, we extract the (i,j)th pixel
00214 // from each filter's resultant image. However, if the resultant and
00215 // input images are different sizes, there may be no pixel to extract
00216 // from the resultant image. For example, if the input image is 320x240
00217 // pixels, at scale 2, the filtered image will be 80x60 pixels. And we
00218 // won't be able to create the texton for all pixels beyond the 80th row
00219 // and 60th column.
00220 //
00221 // To fix this problem, we rescale the filtered image back to the input
00222 // image's size.
00223 ImageType
00224 apply_texton_filter(const VisualCortex* vc, uint orientation, uint scale)
00225 {
00226 
00227   LFATAL("Please talk to Laurent to fix this");
00228   return ImageType();
00229   /*
00230    nub::soft_ref<OrientationChannel> oc ;
00231    dynCastWeakToFrom(oc, vc->subChan("orientation")) ;
00232    GaborChannel& gc = oc->gabor(orientation) ;
00233    ImageType I = gc.getImage(scale) ;
00234    if (scale > 0)
00235       I = rescale(I, I.getDims() * (1 << scale)) ; // blow up by 2^scale
00236    return I ;
00237   */
00238 }
00239 
00240 // Forward declaration
00241 ImageType get_textons(int i, int j, const FilterationResults&) ;
00242 
00243 // The following function returns the textons for the entire input image
00244 // given the filteration results. The textons are returned as an NxR
00245 // image where N is the number of filters applied and R is the product
00246 // of the width and height of the input image. Thus, the textons Image
00247 // returned by this function will have R (i.e., WxH) rows and N columns.
00248 ImageType compute_textons(const FilterationResults& results)
00249 {
00250    int width  = results[0].getWidth() ;
00251    int height = results[0].getHeight() ;
00252    ImageType textons(GistEstimatorTexton::NUM_FILTERS, width * height,
00253                      NO_INIT) ;
00254    int row = 0 ;
00255    for (int i = 0; i < width; ++i)
00256       for (int j = 0; j < height; ++j)
00257          inplacePaste(textons, get_textons(i, j, results),
00258                       Point2D<int>(0, row++)) ;
00259 
00260    return textons ;
00261 }
00262 
00263 // Quick helper to extract the (i,j)th pixel from a given Image.
00264 //
00265 // DEVNOTE: We could write this as a std::binary_function and then use
00266 // it in conjunction with std::bind2nd(). But that doesn't really reduce
00267 // the amount of code to be written here. Keeping this a unary_function
00268 // and using it directly makes the intent somewhat clearer.
00269 //
00270 // DEVNOTE 2: Another possibility is to use std::mem_fun_ref() in
00271 // conjunction with std::bind2nd(). Unfortunately, std::mem_fun_ref()
00272 // doesn't work when the argument of its function call operator is a
00273 // reference (compiler issues "reference to reference" error). That
00274 // requires partial specialization of the std::binary_function so that
00275 // the second argument is a reference.
00276 //
00277 // But that too doesn't really reduce the amount of code to be written.
00278 // As a matter of fact, with this second approach, we have to write a
00279 // lot more code because of the extra partial specialization plus several
00280 // typedefs required to disambiguate the call to Image<T>::getVal().
00281 //
00282 // These extra bits of code simply serve to further obfuscate intent.
00283 // Thus, it's best to just stick with this basic custom function object.
00284 // It gets the job done and makes fairly clear what's going on.
00285 class get_pixel : std::unary_function<ImageType, PixelType> {
00286    Point2D<int> coordinates ;
00287 public :
00288    get_pixel(int i, int j) : coordinates(i, j) {}
00289    PixelType operator()(const ImageType& I) const {
00290       return I.getVal(coordinates) ;
00291    }
00292 } ;
00293 
00294 // A texton is simply the vector of filter responses for a given pixel.
00295 // That is, if we apply 36 filters to an input image, we will get 36
00296 // Images as the filteration results. The texton for pixel (i,j) will be
00297 // the vector of 36 numbers formed by taking pixel (i,j) from each of the
00298 // 36 Images in the filteration results.
00299 //
00300 // The following function returns the textons corresponding to the
00301 // (i,j)th pixel of the input image given the filteration results. The
00302 // textons for this pixel are returned as an Nx1 Image, i.e., 1 row of N
00303 // values (where N is the number of filters applied).
00304 ImageType get_textons(int i, int j, const FilterationResults& images)
00305 {
00306    ImageType textons(GistEstimatorTexton::NUM_FILTERS, 1, NO_INIT) ;
00307    std::transform(images.begin(), images.end(), textons.beginw(),
00308                   get_pixel(i, j)) ;
00309    return textons ;
00310 }
00311 
00312 } // end of local namespace encapsulating above definitions
00313 
00314 //---------------------- HISTOGRAM COMPUTATIONS -------------------------
00315 
00316 namespace {
00317 
00318 // Quick helper to extract the r-th row of an Image
00319 inline ImageType get_row(int r, const ImageType& I)
00320 {
00321    return crop(I, Point2D<int>(0, r), Dims(I.getWidth(), 1)) ;
00322 }
00323 
00324 // Forward declarations
00325 int nearest_universal_texton(const ImageType&, const ImageType&) ;
00326 Image<double> normalized_histogram(const std::vector<int>&, int) ;
00327 double dist2(const ImageType&, const ImageType&, double) ;
00328 
00329 // Given the set of input textons, I, and the universal textons, U, this
00330 // function returns the normalized histogram counting the occurences of
00331 // the supplied universal textons in I. This histogram is the "gist
00332 // signature" of the input image and forms the basis for image
00333 // classification.
00334 Image<double> histogram(const ImageType& I, const ImageType& U)
00335 {
00336    std::vector<int> counts(U.getHeight()) ;
00337    std::fill(counts.begin(), counts.end(), 0) ;
00338 
00339    for (int i = 0; i < I.getHeight(); ++i)
00340       ++counts[nearest_universal_texton(get_row(i, I), U)] ;
00341 
00342    return normalized_histogram(counts, I.getHeight()) ;
00343 }
00344 
00345 // Given a row of the input image and the set of universal textons, this
00346 // function returns the index of the universal texton nearest to the
00347 // input image row.
00348 //
00349 // DEVNOTE: The Renninger-Malik implementation uses the Netlab (Matlab)
00350 // toolbox's knn function to compute the nearest universal texton (i.e.,
00351 // k = 1). Rather than implement the K-nn algorithm here and call it
00352 // with k = 1, we simply take a shortcut and perform a simple nearest
00353 // neighbour test.
00354 int nearest_universal_texton(const ImageType& I, const ImageType& U)
00355 {
00356    double D ;
00357 
00358    std::pair<double, int> min(std::numeric_limits<double>::max(), -1) ;
00359    for (int i = 0; i < U.getHeight(); ++i)
00360       if ((D = dist2(I, get_row(i, U), min.first)) < min.first)
00361          min = std::make_pair(D, i) ;
00362 
00363    return min.second ;
00364 }
00365 
00366 // The following function returns the square of the Euclidean distance
00367 // between two vectors subject to a supplied minimum, i.e., if the
00368 // square distance exceeds the minimum, the computation is
00369 // short-circuited and the minimum returned.
00370 //
00371 // This short-circuiting helps reduce the nearest universal texton
00372 // computation by about 2 seconds. Without it, each image's histogram
00373 // was taking over 8 seconds to compute no thanks to the O(nU) nature of
00374 // the problem (where n is the number of input image textons and U the
00375 // number of universal textons).
00376 //
00377 // The vectors are passed in as Images that are assumed to have a single
00378 // row and the same width.
00379 double dist2(const ImageType& L, const ImageType& R, double min)
00380 {
00381    double D = 0 ;
00382    for (int i = 0; i < L.getWidth(); ++i) {
00383       double l = L.getVal(i, 0) ;
00384       double r = R.getVal(i, 0) ;
00385       D += (l-r) * (l-r) ;
00386       if (D > min)
00387          return min ;
00388    }
00389    return D ;
00390 }
00391 
00392 // The following function converts a vector of counts and the total of
00393 // all those counts to an Image containing the normalized counts.
00394 Image<double> normalized_histogram(const std::vector<int>& counts, int total)
00395 {
00396    Image<double> I(counts.size(), 1, ZEROS) ;
00397    std::transform(counts.begin(), counts.end(), I.beginw(),
00398                   std::bind2nd(std::divides<double>(), total)) ;
00399    return I ;
00400 }
00401 
00402 } // end of local namespace encapsulating above definitions
00403 
00404 //-----------------------------------------------------------------------
00405 
00406 /* So things look consistent in everyone's emacs... */
00407 /* Local Variables: */
00408 /* indent-tabs-mode: nil */
00409 /* End: */