GistEstimatorBeyondBoF.C

Go to the documentation of this file.
00001 /*!
00002    \file Neuro/GistEstimatorBeyondBoF.C
00003 
00004    This file defines the member functions, static members, etc. of the
00005    GistEstimatorBeyondBoF class. Further details are in the header file.
00006 */
00007 
00008 // //////////////////////////////////////////////////////////////////// //
00009 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00010 // by the University of Southern California (USC) and the iLab at USC.  //
00011 // See http://iLab.usc.edu for information about this project.          //
00012 // //////////////////////////////////////////////////////////////////// //
00013 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00014 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00015 // in Visual Environments, and Applications'' by Christof Koch and      //
00016 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00017 // pending; application number 09/912,225 filed July 23, 2001; see      //
00018 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00019 // //////////////////////////////////////////////////////////////////// //
00020 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00023 // redistribute it and/or modify it under the terms of the GNU General  //
00024 // Public License as published by the Free Software Foundation; either  //
00025 // version 2 of the License, or (at your option) any later version.     //
00026 //                                                                      //
00027 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00028 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00029 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00030 // PURPOSE.  See the GNU General Public License for more details.       //
00031 //                                                                      //
00032 // You should have received a copy of the GNU General Public License    //
00033 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00034 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00035 // Boston, MA 02111-1307 USA.                                           //
00036 // //////////////////////////////////////////////////////////////////// //
00037 //
00038 // Primary maintainer for this file: Manu Viswanathan <mviswana at usc dot edu>
00039 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Neuro/GistEstimatorBeyondBoF.C $
00040 // $Id: GistEstimatorBeyondBoF.C 13065 2010-03-28 00:01:00Z itti $
00041 //
00042 
00043 //------------------------------ HEADERS --------------------------------
00044 
00045 // Gist specific headers
00046 #include "Neuro/GistEstimatorBeyondBoF.H"
00047 
00048 // Other INVT headers
00049 #include "Neuro/VisualCortex.H"
00050 #include "Neuro/NeuroSimEvents.H"
00051 
00052 #include "Simulation/SimEventQueue.H"
00053 
00054 #include "SIFT/ScaleSpace.H"
00055 #include "SIFT/Keypoint.H"
00056 
00057 #include "Image/Kernels.H"
00058 #include "Image/Convolutions.H"
00059 #include "Image/MathOps.H"
00060 #include "Image/ShapeOps.H"
00061 #include "Image/CutPaste.H"
00062 #include "Image/Point2D.H"
00063 #include "Image/Dims.H"
00064 
00065 #include "Util/Timer.H"
00066 
00067 #include "nub/ref.h"
00068 #include "rutz/shared_ptr.h"
00069 
00070 // Standard C++ headers
00071 #include <sstream>
00072 #include <numeric>
00073 #include <algorithm>
00074 #include <functional>
00075 #include <map>
00076 #include <list>
00077 #include <stdexcept>
00078 #include <utility>
00079 #include <limits>
00080 #include <ctime>
00081 
00082 //------------------------------ DEFINES --------------------------------
00083 
00084 // Error message for exceptions when vocabulary is required but not
00085 // specified.
00086 #define GE_BBOF_ERR_NO_VOCABULARY \
00087    "GistEstimatorBeyondBoF requires vocabulary of " \
00088    "prototypical SIFT descriptors"
00089 
00090 //----------------------------- TYPEDEFS --------------------------------
00091 
00092 // Some useful shortcuts
00093 typedef GistEstimatorBeyondBoF BBoF ;
00094 typedef BBoF::Vocabulary       Vocabulary ;
00095 typedef BBoF::SiftDescriptor   SiftDescriptor ;
00096 typedef BBoF::SiftGrid         SiftGrid ;
00097 
00098 // Other frequently used types
00099 typedef float PixelType ;
00100 typedef Image<PixelType> ImageType ;
00101 typedef Image<double>    GistVectorType ;
00102 
00103 //-------------- STATIC DATA MEMBERS AND OTHER CONSTANTS ----------------
00104 
00105 // As noted in the header file, the following static members, being
00106 // consts, do not really need to be defined here (i.e., they can be
00107 // initialized directly in the header). But they are defined here to
00108 // resolve some weirdness with GCC 3.4.
00109 int GistEstimatorBeyondBoF::NUM_CHANNELS = 200 ;
00110 int GistEstimatorBeyondBoF::NUM_LEVELS   = 2 ;
00111 
00112 // The gist vector consists of counts of feature types at each level of
00113 // the spatial matching pyramid (see Lazebnik paper and comments in other
00114 // parts of this file for the low-down). At level 0 of the pyramid, the
00115 // histogram for each type consists of a single number; at level 1, the
00116 // histogram has 4 numbers; at level 2, 16; and so on.
00117 //
00118 // The following constant uses the formula in the paper to compute the
00119 // total size of the "spatial" histogram for all the levels of the
00120 // pyramid that are used.
00121 //
00122 // If we use a two-level pyramid, we will get a total of 1 + 4 + 16 = 21
00123 // numbers. These 21 numbers will simply be "collapsed" into long
00124 // (21-dimensional) vectors to create the histograms for each feature
00125 // type.
00126 static int HISTOGRAM_SIZE_PER_LEVEL = 21 ;
00127 
00128 // Just as the histograms for each feature type are formed by
00129 // concatenating the spatial histograms at each level of the pyramid into
00130 // one long vector, the entire gist vector is constructed from the
00131 // "collapsed" spatial histograms of all feature types by stringing them
00132 // all together.
00133 //
00134 // For a two-level pyramid and a vocabulary of 200 feature types, we will
00135 // get 4200-dimensional gist vectors.
00136 int BBoF::GIST_VECTOR_SIZE = BBoF::NUM_CHANNELS * HISTOGRAM_SIZE_PER_LEVEL ;
00137 
00138 // Changing the number of channels and pyramid size.
00139 void GistEstimatorBeyondBoF::num_channels(int n)
00140 {
00141    if (n >= 5 && n <= 500) { // basic sanity checks
00142       NUM_CHANNELS = n ;
00143       GIST_VECTOR_SIZE = n * HISTOGRAM_SIZE_PER_LEVEL ;
00144    }
00145 }
00146 
00147 void GistEstimatorBeyondBoF::num_levels(int n)
00148 {
00149    if (n >= 0 && n <= 5) { // basic sanity checks
00150       NUM_LEVELS = n ;
00151       HISTOGRAM_SIZE_PER_LEVEL = ((1 << (2*n + 2)) - 1)/3 ;
00152       GIST_VECTOR_SIZE = NUM_CHANNELS * HISTOGRAM_SIZE_PER_LEVEL ;
00153    }
00154 }
00155 
00156 //-------------------------- INITIALIZATION -----------------------------
00157 
00158 GistEstimatorBeyondBoF::
00159 GistEstimatorBeyondBoF(OptionManager& mgr,
00160                        const std::string& descrName,
00161                        const std::string& tagName)
00162    : GistEstimatorAdapter(mgr, descrName, tagName),
00163      SIMCALLBACK_INIT(SimEventRetinaImage),
00164      itsTrainingHook(0)
00165 {}
00166 
00167 GistEstimatorBeyondBoF::SiftDescriptor::SiftDescriptor()
00168 {
00169    const int n = sizeof values / sizeof values[0] ;
00170    std::fill_n(values, n, 0) ;
00171 }
00172 
00173 // The following SiftDescriptor constructor expects the input image to
00174 // essentially be a vector of 128 values and simply copies them into its
00175 // internal SIFT descriptor values array. If the input image does not
00176 // have at least 128 values, this *will* bomb!
00177 GistEstimatorBeyondBoF::SiftDescriptor::SiftDescriptor(const ImageType& I)
00178 {
00179    const int n = sizeof values / sizeof values[0] ;
00180    std::copy(I.begin(), I.begin() + n, values) ;
00181 }
00182 
00183 GistEstimatorBeyondBoF::
00184 SiftDescriptor::SiftDescriptor(const rutz::shared_ptr<Keypoint>& kp)
00185 {
00186    std::copy(kp->begin(), kp->end(), values) ;
00187 }
00188 
00189 // Quick helper to extract the r-th row of an Image
00190 static inline ImageType get_row(int r, const ImageType& I)
00191 {
00192    return crop(I, Point2D<int>(0, r), Dims(I.getWidth(), 1)) ;
00193 }
00194 
00195 // The vocabulary is a list of SIFT descriptors. But for convenience,
00196 // clients can pass it in as an image. The image must have 128 columns
00197 // (for the 128 values that make up a SIFT descriptor) and will usually
00198 // have 200 rows (the size of the vocabulary). Thus, the dimensions of
00199 // the input image used to represent the SIFT descriptor vocabulary in
00200 // client space will be 128x200.
00201 //
00202 // The following method simply converts this image into the list of SIFT
00203 // descriptors used to represent the vocabulary internally by this class.
00204 void GistEstimatorBeyondBoF::setVocabulary(const ImageType& V)
00205 {
00206    itsVocabulary.clear() ;
00207 
00208    const int H = V.getHeight() ;
00209    for (int i = 0; i < H; ++i)
00210       itsVocabulary.push_back(SiftDescriptor(get_row(i, V))) ;
00211    num_channels(H) ;
00212 }
00213 
00214 //----------------------------- CLEAN-UP --------------------------------
00215 
00216 GistEstimatorBeyondBoF::~GistEstimatorBeyondBoF(){}
00217 
00218 //------------------ GIST FEATURE VECTOR COMPUTATION --------------------
00219 
00220 // Forward declarations
00221 namespace {
00222 
00223 SiftGrid       apply_sift_on_patches(ImageType&) ;
00224 GistVectorType flattened_multi_level_histogram(const SiftGrid&,
00225                                                const Vocabulary&) ;
00226 
00227 }
00228 
00229 // The processing method divides the current frame of the series of input
00230 // images into 16x16 pixel patches and computes the SIFT descriptors for
00231 // each of these patches. Then, it either passes this grid of
00232 // descriptors back to its client (in training mode) or computes the
00233 // required gist vector using the supplied vocabulary of prototypical
00234 // descriptors (in normal operational mode).
00235 //
00236 // COMPLEXITY ANALYSIS: O(?) [? time]
00237 // ---------------------------------------
00238 // During normal operation (i.e., non-training mode), this function
00239 // calls apply_sift_on_patches() and flattened_multi_level_histogram().
00240 // The time complexity latter is O(n). That of the former is not yet
00241 // known as it depends on how the SIFT black box will work.
00242 //
00243 // The other operations in this function are all constant time or at
00244 // worst O(n), where n is the number of pixels in the input image.
00245 // Therefore, as it stands now, the overall complexity of this function
00246 // will be determined by that of the SIFT black box; if that is O(n),
00247 // then so is this; if that is O(n^2), so is this; so on and so forth.
00248 void
00249 GistEstimatorBeyondBoF::
00250 onSimEventRetinaImage(SimEventQueue& q,
00251                       rutz::shared_ptr<SimEventRetinaImage>& e)
00252 {
00253    Timer T ;
00254 
00255    ImageType current_frame = e->frame().grayFloat() ;
00256    SiftGrid sift_patches = apply_sift_on_patches(current_frame) ;
00257    LINFO("MVN: %g seconds to compute SIFT grid", T.getSecs()) ;
00258    if (itsTrainingHook)
00259      itsTrainingHook(sift_patches) ;
00260    else
00261    {
00262       if (itsVocabulary.empty())
00263          throw std::runtime_error(GE_BBOF_ERR_NO_VOCABULARY) ;
00264 
00265       T.reset() ;
00266       itsGistVector =
00267          flattened_multi_level_histogram(sift_patches, itsVocabulary) ;
00268       LINFO("MVN: %g seconds to compute %dx%d gist vector", T.getSecs(),
00269             itsGistVector.getHeight(), itsGistVector.getWidth()) ;
00270    }
00271 
00272    rutz::shared_ptr<SimEventGistOutput>
00273      gist_output_event(new SimEventGistOutput(this, itsGistVector)) ;
00274    q.post(gist_output_event) ;
00275 }
00276 
00277 //---------------------- INPUT IMAGE FILTERATION ------------------------
00278 
00279 namespace {
00280 
00281 // This function applies a Gaussian blur using the supplied sigma value
00282 // to take out salt and pepper noise from the input image. The process
00283 // involves a simple sequence of 2X interpolation, blurring and
00284 // decimation back to the original size.
00285 ImageType denoise(ImageType& lum, float sigma = 1.6f)
00286 {
00287    lum = interpolate(lum) ;
00288 
00289    const float blursig = sqrtf(sigma * sigma - 1) ;
00290    Image<float> kernel = gaussian<float>(1, blursig, lum.getWidth(), 1) ;
00291    kernel = kernel / static_cast<float>(sum(kernel)) ;
00292    lum = sepFilter(lum, kernel, kernel, CONV_BOUNDARY_CLEAN) ;
00293 
00294    lum = decXY(lum) ;
00295 
00296    return lum ;
00297 }
00298 
00299 /*
00300    The following function breaks the image into fixed patches of 16x16
00301    pixels with 8 pixel overlaps and computes SIFT descriptors for each
00302    patch. All SIFT descriptors in the output grid are computed at
00303    orientation zero (i.e., upright).
00304 
00305    COMPLEXITY ANALYSIS: O(n) [linear time]
00306    ---------------------------------------
00307    In the worst case, i.e., when the regular grid into which the input
00308    image is to be divided is just one pixel (rather than the usual
00309    16x16), the outer loop will execute a maximum of H times, where H is
00310    the height of the input image. Similarly, the inner loop will iterate
00311    a maximum of W times, where W is the input image's width. Together,
00312    these two loops will end up executing a total of n = WxH times.
00313 
00314    Inside the loops, we have an image crop operation, which will be
00315    O(n). The next step involves computing the SIFT descriptor for the
00316    image patch cropped out by the previous one. The overall complexity
00317    of this function depends on the complexity of this SIFT computation
00318    step. If it is O(n), so is this function; if it is O(n^2), so is this
00319    function; so on and so forth.
00320 */
00321 SiftGrid apply_sift_on_patches(ImageType& I)
00322 {
00323    I = denoise(I) ;
00324 
00325    // ScaleSpace needs image set rather than single image
00326    ImageSet<PixelType> images(1, I.getDims()) ;
00327    images.getImageMut(0) = I ;
00328 
00329    // Divide input image into regular grid of cells sized 16x16 pixels
00330    // with 8 pixels overlap and get SIFT keypoints for each cell in the
00331    // grid. The gridding is handled internally by ScaleSpace.
00332    typedef rutz::shared_ptr<Keypoint> KP ;
00333    std::vector<KP> keypoints ;
00334    ScaleSpace ss(images, 1);
00335    const int n = ss.getGridKeypoints(keypoints) ;
00336 
00337    // Convert std::vector of keypoints retuned by ScaleSpace to grid of
00338    // SIFT descriptors required by other functions here and by this
00339    // class's clients (mainly just src/Gist/train-bbof.C).
00340    const int W = I.getWidth()/8  - 1 ;
00341    const int H = I.getHeight()/8 - 1 ;
00342    LINFO("SIFT descriptor grid size for %dx%d image: %dx%d (= %d descriptors)",
00343          I.getWidth(), I.getHeight(), H, W, n) ;
00344 
00345    int i = 0 ;
00346    SiftGrid G(W, H, NO_INIT) ;
00347    for (int y = 0; y < H; ++y)
00348       for (int x = 0; x < W; ++x)
00349          G.setVal(x, y, SiftDescriptor(keypoints[i++])) ;
00350    return G ;
00351 }
00352 
00353 } // end of local namespace encapsulating input image filteration section
00354 
00355 //---------------------- HISTOGRAM COMPUTATIONS -------------------------
00356 
00357 namespace {
00358 
00359 /*
00360    Once we have the SIFT descriptors for the 16x16 pixel patches, we
00361    compute a feature map using the vocabulary of prototypical SIFT
00362    descriptors. The feature map holds the list of coordinates
00363    corresponding to each feature type.
00364 
00365    For example, with a vocabulary size of 200, the feature map will hold
00366    (at most) 200 "bins" corresponding to the 200 feature types (or
00367    words). Each bin will contain a list of the coordinates at which that
00368    feature type occurs. These coordinates are in the SIFT descriptors
00369    grid space.
00370 
00371    To clarify further: consider a 160x120 input image. We first divide
00372    this image into 16x16 pixel patches with an 8 pixel overlap in each
00373    direction. We then proceed to compute SIFT descriptors for these
00374    patches. These descriptors are arranged in a grid whose size (in this
00375    case) is 20x15.
00376 
00377    Then, we figure out which bin each of the descriptors in the SIFT
00378    descriptor grid belongs to by matching against the vocabulary of
00379    prototypical SIFT descriptors. This "transforms" the grid of SIFT
00380    descriptors into a grid of bin indices.
00381 
00382    The feature map is obtained (essentially) by "flipping" the grid of
00383    bin indices so that each bin index points to a list of coordinates. As
00384    mentioned above these coordinates are the coordinates of the cells in
00385    20x15 descriptor grid.
00386 
00387    Hopefully, the following picture helps visualize the entire process:
00388 
00389 
00390                   0   1   ...   19              0   1   ...   19
00391                 +---+---+-----+---+           +---+---+-----+---+
00392               0 | S | S | ... | S |         0 | 7 | 3 | ... | 3 |
00393                 +---+---+-----+---+           +---+---+-----+---+
00394               1 | S | S | ... | S |         1 | 4 | 7 | ... | 3 |
00395                 +---+---+-----+---+ =====>    +---+---+-----+---+
00396               : | : | : |  :  | : |         : | : | : |  :  | : |
00397               : | : | : |  :  | : |         : | : | : |  :  | : |
00398                 +---+---+-----+---+           +---+---+-----+---+
00399              14 | S | S | ... | S |        14 |123| 8 | ... | 9 |
00400                 +---+---+-----+---+           +---+---+-----+---+
00401 
00402                    20x15 grid of                 20x15 grid of
00403                  SIFT descriptors                 bin indices
00404                                                        |
00405                                                        |
00406                 +---+                                  |
00407                 | : |                                  |
00408                 | : |                                  |
00409                 +---+                                  |
00410               3 | --|--> (1,0) (19,0) (19,1) ...       |
00411                 +---+                                  |
00412               4 | --|--> (0,1) ...                     |
00413                 +---+                                  |
00414                 | : |                                  |
00415                 | : |                                  |
00416                 +---+                                  |
00417               7 | --|--> (0,0) (1,1) ...         <=====+
00418                 +---+
00419               8 | --|--> (1,14) ...
00420                 +---+
00421               9 | --|--> (19,14) ...
00422                 +---+
00423                 | : |
00424                 | : |
00425                 +---+
00426             123 | --|--> (0,14) ...
00427                 +---+
00428                 | : |
00429                 | : |
00430                 +---+
00431 
00432                Feature
00433                  Map
00434 
00435    The following types are for the above data structure.
00436 */
00437 typedef Point2D<int>             GridCoord ;
00438 typedef std::list<GridCoord>     CoordList ;
00439 typedef std::map<int, CoordList> FeatureMap ;
00440 
00441 // To compute the gist vector for an input image, we need to count the
00442 // number of instances of each feature type at each level of the spatial
00443 // matching pyramid. At level 0, there will be only one count for a given
00444 // feature type. At level 1, the SIFT descriptor grid is divided into a
00445 // 2x2 grid and so there are 4 counts. At level 2, the SIFT descriptor
00446 // grid is divided into a 4x4 grid for a total of 16 counts.
00447 //
00448 // The following type is used to store the weighted, normalized feature
00449 // type counts at each level of the pyramid.
00450 typedef Image<float> Histogram ;
00451 
00452 // Forward declarations
00453 FeatureMap compute_feature_map(const SiftGrid&, const Vocabulary&) ;
00454 void       compute_spatial_histogram(const FeatureMap&, const Dims&,
00455                                      GistVectorType*) ;
00456 Histogram  spatial_counts_vector(const CoordList&, int, const Dims&) ;
00457 int        find_bin(const SiftDescriptor&, const Vocabulary&) ;
00458 float      dist2(const SiftDescriptor&, const SiftDescriptor&, float) ;
00459 void       normalize(GistVectorType*, double) ;
00460 
00461 // Top-level routine to compute the gist vector for the input image given
00462 // the grid of SIFT descriptors for the pixel patches and the vocabulary
00463 // of prototypical SIFT descriptors.
00464 //
00465 // COMPLEXITY ANALYSIS: O(n) [linear time]
00466 // ---------------------------------------
00467 // Both the compute_feature_map() and compute_spatial_histogram()
00468 // functions are O(n), where n is the number of pixels in the input
00469 // image. The normalize routine is constant time. Therefore, this
00470 // function runs in time O(n).
00471 //
00472 // DEVNOTE: See the complexity analyses for each of the above-mentioned
00473 // functions to understand why they are linear and constant time
00474 // respectively.
00475 GistVectorType
00476 flattened_multi_level_histogram(const SiftGrid& G, const Vocabulary& V)
00477 {
00478    const Dims gist_vector_size(BBoF::gist_vector_size(), 1) ;
00479 
00480    GistVectorType H(gist_vector_size, ZEROS) ;
00481    FeatureMap F = compute_feature_map(G, V) ;
00482    compute_spatial_histogram(F, G.getDims(), &H) ;
00483    normalize(&H, G.getSize()) ;
00484    return H ;
00485 }
00486 
00487 // The following function implements the procedure described in the
00488 // comment and figure at the start of this section to produce the feature
00489 // map from the SIFT descriptors grid and vocabulary.
00490 //
00491 // COMPLEXITY ANALYSIS: O(n) [linear time]
00492 // ---------------------------------------
00493 // The outer and inner loops combined will execute a total of WxH times
00494 // where W is the width of the SIFT descriptor for the input image and H
00495 // this grid's height. W and H are both going to be some fraction of the
00496 // width and height of the input image itself. Thus, in the worst case,
00497 // the inner and outer loops execute O(n) times, where n is the number
00498 // of pixels in the input image.
00499 //
00500 // To illustrate, consider a 160x120 input image. It is partitioned into
00501 // a regular grid of 16x16 pixels with an 8 pixel overlap for each cell.
00502 // This yields a 20x15 grid of SIFT descriptors for the original 160x120
00503 // image. A 320x240 image will have a 40x30 corresponding SIFT grid and
00504 // an 80x60 image will have a 10x8 grid.
00505 //
00506 // A WxH image will have a (W/8)x(H/8) SIFT grid. If we use n = WxH, we
00507 // can say that the size of the SIFT grid for an input image containing
00508 // n pixels is some constant k times n. (NOTE: k won't always be 1/64
00509 // because the spacing/overlap between cells as well as the size of the
00510 // cells can be something other than 8 and 16x16 pixels respectively.)
00511 //
00512 // Now, having established that the inner and outer loops in this
00513 // function execute O(n) times, we only have to determine the complexity
00514 // of the code inside the loops to ascertain the overall complexity of
00515 // this function.
00516 //
00517 // The find_bin() function is constant time (refer to its complexity
00518 // analysis for the explanation of why this is so).
00519 //
00520 // As per the complexity guarantees provided by the STL, finding items
00521 // in an std::map is at worst logarithmic. In our case, the size of the
00522 // feature map can be no more than M, the number of channels we are
00523 // using. Therefore, looking up feature types in the feature map can be
00524 // no worse than O(log M).
00525 //
00526 // Similarly, inserting an element into the feature map is O(log M). And
00527 // appending an element to the end of an std::list (L.push_back) is
00528 // O(1).
00529 //
00530 // Therefore, the inner loop is dominated by the map look-up and
00531 // insertion operations. This means that the overall complexity of the
00532 // following function is O(log M * n).
00533 //
00534 // M, the number of channels, is a fixed external parameter that cannot
00535 // be arbitrarily changed. That is, we cannot train the scene classifier
00536 // with M = 200 (say) and then classify with M = 400. This fact allows
00537 // us to treat M as a constant.
00538 //
00539 // As a result, we can write the time complexity for this function as
00540 // O(kn) or O(n), where n is the number of pixels in the input image.
00541 FeatureMap compute_feature_map(const SiftGrid& G, const Vocabulary& V)
00542 {
00543    const int W = G.getWidth() ;
00544    const int H = G.getHeight() ;
00545 
00546    FeatureMap F ;
00547    for (int y = 0; y < H; ++y)
00548       for (int x = 0; x < W; ++x)
00549       {
00550          int bin = find_bin(G.getVal(x,y), V) ;
00551          FeatureMap::iterator it = F.find(bin) ;
00552          if (it == F.end()) {
00553             CoordList L ;
00554             L.push_back(GridCoord(x,y)) ;
00555             F.insert(std::make_pair(bin, L)) ;
00556          }
00557          else
00558             it->second.push_back(GridCoord(x,y)) ;
00559       }
00560 
00561    return F ;
00562 }
00563 
00564 // Once we have the feature map, we need to compute the histogram for
00565 // each feature at different levels of the spatial match pyramid. The
00566 // spatial histograms for each level are concatenated to form a single
00567 // vector for each feature. All these vectors for all the features are
00568 // then concatenated to form the final gist vector. (See the Lazebnik
00569 // paper for details.)
00570 //
00571 // COMPLEXITY ANALYSIS: O(n) [linear time]
00572 // ---------------------------------------
00573 // The outer loop will execute M times, where M is the number of
00574 // channels. Finding the coordinate list for a feature type is at most
00575 // O(log M). The inner for loop will execute L times, where L is the
00576 // number of levels in the spatial matching pyramid. The
00577 // spatial_counts_vector() function will thus be called a maximum of ML
00578 // times.
00579 //
00580 // For an input image with n pixels, spatial_counts_vector() takes time
00581 // O(n). Therefore, the overall time complexity for this function is
00582 // O(M * L * log M * n).
00583 //
00584 // M and L are both external parameters to the algorithm but remain
00585 // fixed for any given "experiment" involving all the steps related to
00586 // training, gist vector computations and subsequent classification.
00587 // That is, we cannot train with some particular values of M and L and
00588 // then classify using some other values. Consequently, we can treat
00589 // these parameters as constants and replace them in the above
00590 // expression with k = M * L * log M.
00591 //
00592 // Thus, this function's run time grows linearly with the size of the
00593 // input image. In other words, its complexity is O(kn) or just O(n) if
00594 // we ignore the constant of proportionality.
00595 void compute_spatial_histogram(const FeatureMap& F,
00596                                const Dims& sift_grid_size,
00597                                GistVectorType* H)
00598 {
00599    GistVectorType::iterator target = H->beginw() ;
00600    for (int m = 0; m < BBoF::num_channels(); ++m)
00601    {
00602       FeatureMap::const_iterator it = F.find(m) ;
00603       if (it == F.end()) // no instances of feature type m in input image
00604          target += HISTOGRAM_SIZE_PER_LEVEL ;
00605       else // count instances of feature type m at each level of spatial pyr.
00606       {
00607          const CoordList& coords = it->second ;
00608          for (int l = 0; l <= BBoF::num_levels(); ++l)
00609          {
00610             Histogram h = spatial_counts_vector(coords, l, sift_grid_size) ;
00611             std::copy(h.begin(), h.end(), target) ;
00612             target += h.getSize() ;
00613          }
00614       }
00615    }
00616 }
00617 
00618 /*
00619    In an ordinary bag-of-features matching, we would only be interested
00620    in knowing how many times each feature type occurs in the input
00621    image. The gist vector would thus simply be a histogram showing the
00622    counts for each feature type. For example, for 200 feature types, the
00623    gist vector is just 200 counts.
00624 
00625    When we factor in the spatial matching pyramid as described in the
00626    Lazebnik paper, we have to subdivide the SIFT descriptors grid into
00627    smaller regions and count the occurences of each feature type in each
00628    of these regions. This is why the feature map needs to record the
00629    coordinates of each feature type within the SIFT descriptors grid.
00630 
00631    The spatial matching pyramid is simply a mechanism for specifying the
00632    resolution of the "grid space" subdivisions. At higher resolutions,
00633    we perform a more fine-grained spatial feature matching; at lower
00634    resolutions, we get a coarse "overview" of the feature occurences. In
00635    fact, at level 0, the spatial matching pyramid is exactly a standard
00636    bag-of-features with a single count for each feature type.
00637 
00638    To illustrate how the spatial matching process works, let us consider
00639    a 160x120 input image that has been "transformed" into a 20x15 grid
00640    of bin indices as described in an earlier comment. At level 1 of the
00641    pyramid, our 20x15 grid space will be partitioned into 4 regions as
00642    shown below. At level 2 of the pyramid, the grid space will be
00643    divided into 16 regions (also shown below).
00644 
00645 
00646                 Level 0                          Level 1
00647 
00648           0                 19            0        9        19
00649          +--------------------+          +----------+---------+
00650         0|                    |         0|          |         |
00651          |                    |          |          |         |
00652          |                    |          |          |         |
00653          |                    |          |          |         |
00654          |                    |         7|          |         |
00655          |                    |          +----------+---------+
00656          |                    |          |          |         |
00657          |                    |          |          |         |
00658          |                    |          |          |         |
00659        14|                    |        14|          |         |
00660          +--------------------+          +----------+---------+
00661 
00662 
00663 
00664                                   Level 2
00665 
00666                            0   4    9   14   19
00667                           +----+----+----+----+
00668                          0|    |    |    |    |
00669                           |    |    |    |    |
00670                          3+----+----+----+----+
00671                           |    |    |    |    |
00672                           |    |    |    |    |
00673                          7+----+----+----+----+
00674                           |    |    |    |    |
00675                           |    |    |    |    |
00676                         11+----+----+----+----+
00677                           |    |    |    |    |
00678                         14+----+----+----+----+
00679 
00680 
00681    Thus, at each level of the pyramid, the grid space is partitioned
00682    into a grid with N = 2^l cells in each direction. The histogram for
00683    each feature type at each level mirrors this partitioning and is
00684    representend as an NxN image of integers (counts).
00685 
00686    To compute this histogram, we need to figure out the cell of the
00687    pyramid in which a given coordinate in grid space lies. To illustrate
00688    how to do this, consider the pyramid at level 1. The 20x15 grid space
00689    has to be partitioned into a 2x2 area. Thus, each "quadrant" will
00690    encompass a 10x7.5 area of the 20x15 grid space. Now, let us say we
00691    have grid space coordinates (5,9). This will lie in the bottom left
00692    quadrant. Thus, its coordinates in "pyramid space" are (0,1). Notice
00693    that to transform (5,9) to (0,1) all we need to do is divide the x and
00694    y components of the grid space coordinates by 10 and 7.5 respectively
00695    and discard any fractional portion of the resulting quotient.
00696 
00697    In essence, the transformation from grid space to pyramid space is a
00698    scaling operation. We compute appropriate x and y scale factors and
00699    then simply scale each feature type's coordinates by these factors.
00700 
00701    The scale factors for each direction are:
00702 
00703          Sx = w/N
00704          Sy = h/N
00705 
00706    Each coordinate in grid space simply needs to be divided by the above
00707    factors to effect its transformation into pyramid space.
00708 
00709    NOTE: Scale factors are usually multiplicative, i.e., to scale from
00710    one coordinate system to another, we multiply rather than divide by an
00711    appropriate scale factor. Here, we use the scale factors as divisors
00712    because the above example explains this entire rigmarole in terms of
00713    division and it's just easier to think of it in that way. We could
00714    also have computed the reciprocals of the above factors and then used
00715    multiplication instead.
00716 */
00717 
00718 // The following function object performs the above transformation from
00719 // grid space to pyramid space and updates the histogram count associated
00720 // with each cell in pyramid space. When invoked on every element of a
00721 // coordinate list corresponding to a feature type from the feature map,
00722 // it results in computing the spatial histogram of that feature type at
00723 // the pyramid level currently in effect.
00724 class update_spatial_histogram {
00725    Histogram& target ;
00726    float Sx, Sy ;
00727 public:
00728    update_spatial_histogram(Histogram&, const Dims&) ;
00729    void operator()(const GridCoord&) const ;
00730 } ;
00731 
00732 update_spatial_histogram::update_spatial_histogram(Histogram& H, const Dims& d)
00733    : target(H),
00734      Sx(d.w()/static_cast<float>(H.getWidth())),
00735      Sy(d.h()/static_cast<float>(H.getHeight()))
00736 {}
00737 
00738 void update_spatial_histogram::operator()(const GridCoord& c) const
00739 {
00740    GridCoord p(static_cast<int>(c.i/Sx), static_cast<int>(c.j/Sy)) ;
00741    ++target[p] ;
00742 }
00743 
00744 // The following function returns the histogram for the specified
00745 // feature type at the specified level of the spatial matching pyramid.
00746 // The histogram values are weighted according to the pyramid level as
00747 // described in the Lazebnik paper.
00748 //
00749 // Briefly: at level 0, we divide the feature count by 2^L, where L is
00750 // the maximum number of levels of the spatial matching pyramid. At
00751 // higher levels l, we divide by 2^(L-l+1). The idea is to give more
00752 // weightage to feature matches at higher resolutions and lower
00753 // weightage to matches at coarser resolutions.
00754 //
00755 // Rather than using the feature type to look up the list of SIFT grid
00756 // coordinates from the SIFT descriptors grid, it expects its caller to
00757 // pass it the coordinate list for the desired feature type. Then, for
00758 // each of the coordinates where the feature type occurs, it figures out
00759 // (using the above function object) which cell of the spatial match
00760 // pyramid the feature is in and updates the count associated with that
00761 // cell.
00762 //
00763 // There is a special case for level 0 of the spatial matching pyramid.
00764 // At this level, the grid associated with the pyramid has just one
00765 // cell. So all occurences of all feature types will be in this cell.
00766 // Thus, the histogram for level 0 consists of a single number, viz.,
00767 // the number of entries in the coordinate list for the feature type.
00768 // This handy fact allows us to short-circuit the counting procedure we
00769 // would normally have to perform for other levels of the pyramid.
00770 //
00771 // COMPLEXITY ANALYSIS: O(n) [linear time]
00772 // ---------------------------------------
00773 // For level 0, this function executes a single operation. But at higher
00774 // levels, it needs to compute, for each coordinate where a given
00775 // feature type appears in the input image's SIFT grid, the appropriate
00776 // mapping from "grid space" to "pyramid space" (refer to the comment
00777 // preceding the definition of the update_spatial_histogram function
00778 // object).
00779 //
00780 // The number of coordinate mapping operations to be performed will be
00781 // determined by the size of the coordinates list passed in. This list
00782 // cannot ever have more elements than there are cells in the source
00783 // SIFT grid of the input image.
00784 //
00785 // To illustrate, consider a 160x120 input image. It is partitioned into
00786 // a regular grid of 16x16 pixels with an 8 pixel overlap. This yields a
00787 // 20x15 grid of SIFT descriptors for the original 160x120 image. Thus,
00788 // the SIFT grid contains a total of 300 cells.
00789 //
00790 // Now, in the worst case, we could have some feature type appear in
00791 // every cell of the SIFT grid. In this situation, the coordinates list
00792 // for this feature type will have 300 elements.
00793 //
00794 // The number of cells in the SIFT grid is proportional to the size of
00795 // the input image. A 320x240 image will have a 40x30 corresponding SIFT
00796 // grid and an 80x60 image will have a 10x8 grid. A WxH image will have
00797 // a (W/8)x(H/8) SIFT grid. If we use n = WxH, we can say that the size
00798 // of the SIFT grid for an input image containing n pixels is some
00799 // constant k times n.
00800 //
00801 // Thus, in the worst case, this function executes the operations
00802 // implemented by the update_spatial_histogram function object at most
00803 // kn times. Since the above-mentioned function object only uses a
00804 // couple of elementary operations (division to perform mapping from
00805 // grid space to pyramid space and addition to increment the feature
00806 // type count), the complexity of this function is O(kn) or just O(n).
00807 //
00808 // NOTE: After the spatial histogram updating, there is weighting
00809 // computation. This loop (implicit in the std::transform call) executes
00810 // N^2 times where N is 2^L. However, L, the number of levels in the
00811 // spatial matching pyramid is a fixed external parameter to the gist
00812 // vector computation algorithm. Thus, we can consider the weighting
00813 // step to be constant time. This means that the preceding
00814 // update_spatial_histogram step is the one that dominates the running
00815 // time of this function.
00816 Histogram spatial_counts_vector(const CoordList& coords, int l,
00817                                 const Dims& source_grid_size)
00818 {
00819    const int L = BBoF::num_levels() ;
00820    const int N = 1 << l ;
00821 
00822    Histogram H(N, N, ZEROS) ; // NxN partition of grid space at pyramid level l
00823    if (N == 1) // level 0 ==> no point iterating over entire coord. list
00824       H.setVal(0, 0, coords.size()/(1 << L)) ;
00825    else // level > 0 ==> do spatial pyramid thingummy
00826    {
00827       std::for_each(coords.begin(), coords.end(),
00828                     update_spatial_histogram(H, source_grid_size)) ;
00829       std::transform(H.begin(), H.end(), H.beginw(),
00830                      std::bind2nd(std::divides<float>(), 1 << (L - l + 1))) ;
00831    }
00832    return H ;
00833 }
00834 
00835 // The following function finds the prototypical SIFT descriptor that
00836 // most closely matches the supplied descriptor and returns the
00837 // vocabulary index of the prototypical descriptor. This index is the
00838 // input descriptor's bin.
00839 //
00840 // COMPLEXITY ANALYSIS: 0(1) [constant time]
00841 // -----------------------------------------
00842 // The loop in this function calls the dist2() function and performs
00843 // elementary operations. The dist2() function is O(1) [as per its
00844 // complexity analysis]. Therefore, this function's complexity is O(M),
00845 // where M is the number of channels and the size of the SIFT descriptor
00846 // vocabulary.
00847 //
00848 // M is an external parameter to the gist vector computation algorithm.
00849 // However, once picked, it cannot be arbitrarily changed. That is, if
00850 // we train the scene classifier using gist vectors computed on the
00851 // basis of a vocabulary consisting of (say) 200 words, then we cannot
00852 // change this to 400 and attempt subsequent classifications.
00853 //
00854 // Therefore, for all practical purposes this function is constant time.
00855 // More accurately, it is O(M) and M is essentially a constant.
00856 int find_bin(const SiftDescriptor& S, const Vocabulary& V)
00857 {
00858    int bin = -1 ;
00859    float min_dist = std::numeric_limits<float>::max() ;
00860 
00861    int i = 0 ;
00862    for (Vocabulary::const_iterator it = V.begin(); it != V.end(); ++it, ++i)
00863    {
00864       float d = dist2(S, *it, min_dist) ;
00865       if (d < min_dist) {
00866          bin = i ;
00867          min_dist = d ;
00868       }
00869    }
00870 
00871    return bin ;
00872 }
00873 
00874 // The following function returns the square of the Euclidean distance
00875 // between two SIFT descriptors subject to a supplied minimum, i.e., if
00876 // the square distance exceeds the minimum, the computation is
00877 // short-circuited and the minimum returned.
00878 //
00879 // COMPLEXITY ANALYSIS: O(1) [constant time]
00880 // -----------------------------------------
00881 // The loop in this function contains only elementary operations. The
00882 // number of iterations is equal to the size of a SIFT descriptor, i.e.,
00883 // 128. Thus, this function executes a bunch of elementary operations
00884 // exactly 128 times. Since the size of a SIFT descriptor is independent
00885 // of the size of the input images to the gist vector computations, we
00886 // can consider this function as a constant time operation, i.e., O(1).
00887 float dist2(const SiftDescriptor& L, const SiftDescriptor& R, float min)
00888 {
00889    float d = 0 ;
00890    for (int i = 0; i < SiftDescriptor::SIZE; ++i) {
00891       d += (L[i] - R[i]) * (L[i] - R[i]) ;
00892       if (d > min)
00893          return min ;
00894    }
00895    return d ;
00896 }
00897 
00898 // COMPLEXITY ANALYSIS: O(1) [constant time]
00899 // -----------------------------------------
00900 // This function divides each element of the gist vector by the
00901 // normalizer. Therefore, it is linear in the size of the gist vector.
00902 // This size is determined by the number of channels (M) and the number
00903 // of levels of the spatial matching pyramid (L) [see Lazebnik paper for
00904 // details].
00905 //
00906 // The exact formula for the size of the gist vector is
00907 // (M * (4^(L+1) - 1))/3. Both M and L are parameters to the algorithm
00908 // but remain fixed for any given "experiment" involving all the steps
00909 // related to training, gist vector computations and subsequent
00910 // classification. That is, we cannot train with some particular values
00911 // of M and L and then classify using some other values.
00912 //
00913 // Thus, in some sense, both M and L are predetermined constants so that
00914 // the gist vector size remains fixed. In our case, M = 200 and L = 2.
00915 // So, our gist vectors have 4200 dimensions. This won't change.
00916 // Consequently, we can consider this normalization step as being
00917 // constant time, i.e., O(1).
00918 void normalize(GistVectorType* G, double normalizer)
00919 {
00920    std::transform(G->begin(), G->end(), G->beginw(),
00921                   std::bind2nd(std::divides<double>(), normalizer)) ;
00922 }
00923 
00924 } // end of local namespace encapsulating histogram computations section
00925 
00926 //---------------------- MISCELLANEOUS FUNCTIONS ------------------------
00927 
00928 // Stream I/O for SIFT descriptors
00929 std::ostream& operator<<(std::ostream& os, const SiftDescriptor& d)
00930 {
00931    for (int i = 0; i < SiftDescriptor::SIZE; ++i)
00932       os << d[i] << ' ' ;
00933    return os ;
00934 }
00935 
00936 //-----------------------------------------------------------------------
00937 
00938 /* So things look consistent in everyone's emacs... */
00939 /* Local Variables: */
00940 /* indent-tabs-mode: nil */
00941 /* End: */