00001 /*! 00002 \file Neuro/GistEstimatorBeyondBoF.C 00003 00004 This file defines the member functions, static members, etc. of the 00005 GistEstimatorBeyondBoF class. Further details are in the header file. 00006 */ 00007 00008 // //////////////////////////////////////////////////////////////////// // 00009 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00010 // by the University of Southern California (USC) and the iLab at USC. // 00011 // See http://iLab.usc.edu for information about this project. // 00012 // //////////////////////////////////////////////////////////////////// // 00013 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00014 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00015 // in Visual Environments, and Applications'' by Christof Koch and // 00016 // Laurent Itti, California Institute of Technology, 2001 (patent // 00017 // pending; application number 09/912,225 filed July 23, 2001; see // 00018 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00019 // //////////////////////////////////////////////////////////////////// // 00020 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00023 // redistribute it and/or modify it under the terms of the GNU General // 00024 // Public License as published by the Free Software Foundation; either // 00025 // version 2 of the License, or (at your option) any later version. // 00026 // // 00027 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00028 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00029 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00030 // PURPOSE. See the GNU General Public License for more details. // 00031 // // 00032 // You should have received a copy of the GNU General Public License // 00033 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00034 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00035 // Boston, MA 02111-1307 USA. // 00036 // //////////////////////////////////////////////////////////////////// // 00037 // 00038 // Primary maintainer for this file: Manu Viswanathan <mviswana at usc dot edu> 00039 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Neuro/GistEstimatorBeyondBoF.C $ 00040 // $Id: GistEstimatorBeyondBoF.C 13065 2010-03-28 00:01:00Z itti $ 00041 // 00042 00043 //------------------------------ HEADERS -------------------------------- 00044 00045 // Gist specific headers 00046 #include "Neuro/GistEstimatorBeyondBoF.H" 00047 00048 // Other INVT headers 00049 #include "Neuro/VisualCortex.H" 00050 #include "Neuro/NeuroSimEvents.H" 00051 00052 #include "Simulation/SimEventQueue.H" 00053 00054 #include "SIFT/ScaleSpace.H" 00055 #include "SIFT/Keypoint.H" 00056 00057 #include "Image/Kernels.H" 00058 #include "Image/Convolutions.H" 00059 #include "Image/MathOps.H" 00060 #include "Image/ShapeOps.H" 00061 #include "Image/CutPaste.H" 00062 #include "Image/Point2D.H" 00063 #include "Image/Dims.H" 00064 00065 #include "Util/Timer.H" 00066 00067 #include "nub/ref.h" 00068 #include "rutz/shared_ptr.h" 00069 00070 // Standard C++ headers 00071 #include <sstream> 00072 #include <numeric> 00073 #include <algorithm> 00074 #include <functional> 00075 #include <map> 00076 #include <list> 00077 #include <stdexcept> 00078 #include <utility> 00079 #include <limits> 00080 #include <ctime> 00081 00082 //------------------------------ DEFINES -------------------------------- 00083 00084 // Error message for exceptions when vocabulary is required but not 00085 // specified. 00086 #define GE_BBOF_ERR_NO_VOCABULARY \ 00087 "GistEstimatorBeyondBoF requires vocabulary of " \ 00088 "prototypical SIFT descriptors" 00089 00090 //----------------------------- TYPEDEFS -------------------------------- 00091 00092 // Some useful shortcuts 00093 typedef GistEstimatorBeyondBoF BBoF ; 00094 typedef BBoF::Vocabulary Vocabulary ; 00095 typedef BBoF::SiftDescriptor SiftDescriptor ; 00096 typedef BBoF::SiftGrid SiftGrid ; 00097 00098 // Other frequently used types 00099 typedef float PixelType ; 00100 typedef Image<PixelType> ImageType ; 00101 typedef Image<double> GistVectorType ; 00102 00103 //-------------- STATIC DATA MEMBERS AND OTHER CONSTANTS ---------------- 00104 00105 // As noted in the header file, the following static members, being 00106 // consts, do not really need to be defined here (i.e., they can be 00107 // initialized directly in the header). But they are defined here to 00108 // resolve some weirdness with GCC 3.4. 00109 int GistEstimatorBeyondBoF::NUM_CHANNELS = 200 ; 00110 int GistEstimatorBeyondBoF::NUM_LEVELS = 2 ; 00111 00112 // The gist vector consists of counts of feature types at each level of 00113 // the spatial matching pyramid (see Lazebnik paper and comments in other 00114 // parts of this file for the low-down). At level 0 of the pyramid, the 00115 // histogram for each type consists of a single number; at level 1, the 00116 // histogram has 4 numbers; at level 2, 16; and so on. 00117 // 00118 // The following constant uses the formula in the paper to compute the 00119 // total size of the "spatial" histogram for all the levels of the 00120 // pyramid that are used. 00121 // 00122 // If we use a two-level pyramid, we will get a total of 1 + 4 + 16 = 21 00123 // numbers. These 21 numbers will simply be "collapsed" into long 00124 // (21-dimensional) vectors to create the histograms for each feature 00125 // type. 00126 static int HISTOGRAM_SIZE_PER_LEVEL = 21 ; 00127 00128 // Just as the histograms for each feature type are formed by 00129 // concatenating the spatial histograms at each level of the pyramid into 00130 // one long vector, the entire gist vector is constructed from the 00131 // "collapsed" spatial histograms of all feature types by stringing them 00132 // all together. 00133 // 00134 // For a two-level pyramid and a vocabulary of 200 feature types, we will 00135 // get 4200-dimensional gist vectors. 00136 int BBoF::GIST_VECTOR_SIZE = BBoF::NUM_CHANNELS * HISTOGRAM_SIZE_PER_LEVEL ; 00137 00138 // Changing the number of channels and pyramid size. 00139 void GistEstimatorBeyondBoF::num_channels(int n) 00140 { 00141 if (n >= 5 && n <= 500) { // basic sanity checks 00142 NUM_CHANNELS = n ; 00143 GIST_VECTOR_SIZE = n * HISTOGRAM_SIZE_PER_LEVEL ; 00144 } 00145 } 00146 00147 void GistEstimatorBeyondBoF::num_levels(int n) 00148 { 00149 if (n >= 0 && n <= 5) { // basic sanity checks 00150 NUM_LEVELS = n ; 00151 HISTOGRAM_SIZE_PER_LEVEL = ((1 << (2*n + 2)) - 1)/3 ; 00152 GIST_VECTOR_SIZE = NUM_CHANNELS * HISTOGRAM_SIZE_PER_LEVEL ; 00153 } 00154 } 00155 00156 //-------------------------- INITIALIZATION ----------------------------- 00157 00158 GistEstimatorBeyondBoF:: 00159 GistEstimatorBeyondBoF(OptionManager& mgr, 00160 const std::string& descrName, 00161 const std::string& tagName) 00162 : GistEstimatorAdapter(mgr, descrName, tagName), 00163 SIMCALLBACK_INIT(SimEventRetinaImage), 00164 itsTrainingHook(0) 00165 {} 00166 00167 GistEstimatorBeyondBoF::SiftDescriptor::SiftDescriptor() 00168 { 00169 const int n = sizeof values / sizeof values[0] ; 00170 std::fill_n(values, n, 0) ; 00171 } 00172 00173 // The following SiftDescriptor constructor expects the input image to 00174 // essentially be a vector of 128 values and simply copies them into its 00175 // internal SIFT descriptor values array. If the input image does not 00176 // have at least 128 values, this *will* bomb! 00177 GistEstimatorBeyondBoF::SiftDescriptor::SiftDescriptor(const ImageType& I) 00178 { 00179 const int n = sizeof values / sizeof values[0] ; 00180 std::copy(I.begin(), I.begin() + n, values) ; 00181 } 00182 00183 GistEstimatorBeyondBoF:: 00184 SiftDescriptor::SiftDescriptor(const rutz::shared_ptr<Keypoint>& kp) 00185 { 00186 std::copy(kp->begin(), kp->end(), values) ; 00187 } 00188 00189 // Quick helper to extract the r-th row of an Image 00190 static inline ImageType get_row(int r, const ImageType& I) 00191 { 00192 return crop(I, Point2D<int>(0, r), Dims(I.getWidth(), 1)) ; 00193 } 00194 00195 // The vocabulary is a list of SIFT descriptors. But for convenience, 00196 // clients can pass it in as an image. The image must have 128 columns 00197 // (for the 128 values that make up a SIFT descriptor) and will usually 00198 // have 200 rows (the size of the vocabulary). Thus, the dimensions of 00199 // the input image used to represent the SIFT descriptor vocabulary in 00200 // client space will be 128x200. 00201 // 00202 // The following method simply converts this image into the list of SIFT 00203 // descriptors used to represent the vocabulary internally by this class. 00204 void GistEstimatorBeyondBoF::setVocabulary(const ImageType& V) 00205 { 00206 itsVocabulary.clear() ; 00207 00208 const int H = V.getHeight() ; 00209 for (int i = 0; i < H; ++i) 00210 itsVocabulary.push_back(SiftDescriptor(get_row(i, V))) ; 00211 num_channels(H) ; 00212 } 00213 00214 //----------------------------- CLEAN-UP -------------------------------- 00215 00216 GistEstimatorBeyondBoF::~GistEstimatorBeyondBoF(){} 00217 00218 //------------------ GIST FEATURE VECTOR COMPUTATION -------------------- 00219 00220 // Forward declarations 00221 namespace { 00222 00223 SiftGrid apply_sift_on_patches(ImageType&) ; 00224 GistVectorType flattened_multi_level_histogram(const SiftGrid&, 00225 const Vocabulary&) ; 00226 00227 } 00228 00229 // The processing method divides the current frame of the series of input 00230 // images into 16x16 pixel patches and computes the SIFT descriptors for 00231 // each of these patches. Then, it either passes this grid of 00232 // descriptors back to its client (in training mode) or computes the 00233 // required gist vector using the supplied vocabulary of prototypical 00234 // descriptors (in normal operational mode). 00235 // 00236 // COMPLEXITY ANALYSIS: O(?) [? time] 00237 // --------------------------------------- 00238 // During normal operation (i.e., non-training mode), this function 00239 // calls apply_sift_on_patches() and flattened_multi_level_histogram(). 00240 // The time complexity latter is O(n). That of the former is not yet 00241 // known as it depends on how the SIFT black box will work. 00242 // 00243 // The other operations in this function are all constant time or at 00244 // worst O(n), where n is the number of pixels in the input image. 00245 // Therefore, as it stands now, the overall complexity of this function 00246 // will be determined by that of the SIFT black box; if that is O(n), 00247 // then so is this; if that is O(n^2), so is this; so on and so forth. 00248 void 00249 GistEstimatorBeyondBoF:: 00250 onSimEventRetinaImage(SimEventQueue& q, 00251 rutz::shared_ptr<SimEventRetinaImage>& e) 00252 { 00253 Timer T ; 00254 00255 ImageType current_frame = e->frame().grayFloat() ; 00256 SiftGrid sift_patches = apply_sift_on_patches(current_frame) ; 00257 LINFO("MVN: %g seconds to compute SIFT grid", T.getSecs()) ; 00258 if (itsTrainingHook) 00259 itsTrainingHook(sift_patches) ; 00260 else 00261 { 00262 if (itsVocabulary.empty()) 00263 throw std::runtime_error(GE_BBOF_ERR_NO_VOCABULARY) ; 00264 00265 T.reset() ; 00266 itsGistVector = 00267 flattened_multi_level_histogram(sift_patches, itsVocabulary) ; 00268 LINFO("MVN: %g seconds to compute %dx%d gist vector", T.getSecs(), 00269 itsGistVector.getHeight(), itsGistVector.getWidth()) ; 00270 } 00271 00272 rutz::shared_ptr<SimEventGistOutput> 00273 gist_output_event(new SimEventGistOutput(this, itsGistVector)) ; 00274 q.post(gist_output_event) ; 00275 } 00276 00277 //---------------------- INPUT IMAGE FILTERATION ------------------------ 00278 00279 namespace { 00280 00281 // This function applies a Gaussian blur using the supplied sigma value 00282 // to take out salt and pepper noise from the input image. The process 00283 // involves a simple sequence of 2X interpolation, blurring and 00284 // decimation back to the original size. 00285 ImageType denoise(ImageType& lum, float sigma = 1.6f) 00286 { 00287 lum = interpolate(lum) ; 00288 00289 const float blursig = sqrtf(sigma * sigma - 1) ; 00290 Image<float> kernel = gaussian<float>(1, blursig, lum.getWidth(), 1) ; 00291 kernel = kernel / static_cast<float>(sum(kernel)) ; 00292 lum = sepFilter(lum, kernel, kernel, CONV_BOUNDARY_CLEAN) ; 00293 00294 lum = decXY(lum) ; 00295 00296 return lum ; 00297 } 00298 00299 /* 00300 The following function breaks the image into fixed patches of 16x16 00301 pixels with 8 pixel overlaps and computes SIFT descriptors for each 00302 patch. All SIFT descriptors in the output grid are computed at 00303 orientation zero (i.e., upright). 00304 00305 COMPLEXITY ANALYSIS: O(n) [linear time] 00306 --------------------------------------- 00307 In the worst case, i.e., when the regular grid into which the input 00308 image is to be divided is just one pixel (rather than the usual 00309 16x16), the outer loop will execute a maximum of H times, where H is 00310 the height of the input image. Similarly, the inner loop will iterate 00311 a maximum of W times, where W is the input image's width. Together, 00312 these two loops will end up executing a total of n = WxH times. 00313 00314 Inside the loops, we have an image crop operation, which will be 00315 O(n). The next step involves computing the SIFT descriptor for the 00316 image patch cropped out by the previous one. The overall complexity 00317 of this function depends on the complexity of this SIFT computation 00318 step. If it is O(n), so is this function; if it is O(n^2), so is this 00319 function; so on and so forth. 00320 */ 00321 SiftGrid apply_sift_on_patches(ImageType& I) 00322 { 00323 I = denoise(I) ; 00324 00325 // ScaleSpace needs image set rather than single image 00326 ImageSet<PixelType> images(1, I.getDims()) ; 00327 images.getImageMut(0) = I ; 00328 00329 // Divide input image into regular grid of cells sized 16x16 pixels 00330 // with 8 pixels overlap and get SIFT keypoints for each cell in the 00331 // grid. The gridding is handled internally by ScaleSpace. 00332 typedef rutz::shared_ptr<Keypoint> KP ; 00333 std::vector<KP> keypoints ; 00334 ScaleSpace ss(images, 1); 00335 const int n = ss.getGridKeypoints(keypoints) ; 00336 00337 // Convert std::vector of keypoints retuned by ScaleSpace to grid of 00338 // SIFT descriptors required by other functions here and by this 00339 // class's clients (mainly just src/Gist/train-bbof.C). 00340 const int W = I.getWidth()/8 - 1 ; 00341 const int H = I.getHeight()/8 - 1 ; 00342 LINFO("SIFT descriptor grid size for %dx%d image: %dx%d (= %d descriptors)", 00343 I.getWidth(), I.getHeight(), H, W, n) ; 00344 00345 int i = 0 ; 00346 SiftGrid G(W, H, NO_INIT) ; 00347 for (int y = 0; y < H; ++y) 00348 for (int x = 0; x < W; ++x) 00349 G.setVal(x, y, SiftDescriptor(keypoints[i++])) ; 00350 return G ; 00351 } 00352 00353 } // end of local namespace encapsulating input image filteration section 00354 00355 //---------------------- HISTOGRAM COMPUTATIONS ------------------------- 00356 00357 namespace { 00358 00359 /* 00360 Once we have the SIFT descriptors for the 16x16 pixel patches, we 00361 compute a feature map using the vocabulary of prototypical SIFT 00362 descriptors. The feature map holds the list of coordinates 00363 corresponding to each feature type. 00364 00365 For example, with a vocabulary size of 200, the feature map will hold 00366 (at most) 200 "bins" corresponding to the 200 feature types (or 00367 words). Each bin will contain a list of the coordinates at which that 00368 feature type occurs. These coordinates are in the SIFT descriptors 00369 grid space. 00370 00371 To clarify further: consider a 160x120 input image. We first divide 00372 this image into 16x16 pixel patches with an 8 pixel overlap in each 00373 direction. We then proceed to compute SIFT descriptors for these 00374 patches. These descriptors are arranged in a grid whose size (in this 00375 case) is 20x15. 00376 00377 Then, we figure out which bin each of the descriptors in the SIFT 00378 descriptor grid belongs to by matching against the vocabulary of 00379 prototypical SIFT descriptors. This "transforms" the grid of SIFT 00380 descriptors into a grid of bin indices. 00381 00382 The feature map is obtained (essentially) by "flipping" the grid of 00383 bin indices so that each bin index points to a list of coordinates. As 00384 mentioned above these coordinates are the coordinates of the cells in 00385 20x15 descriptor grid. 00386 00387 Hopefully, the following picture helps visualize the entire process: 00388 00389 00390 0 1 ... 19 0 1 ... 19 00391 +---+---+-----+---+ +---+---+-----+---+ 00392 0 | S | S | ... | S | 0 | 7 | 3 | ... | 3 | 00393 +---+---+-----+---+ +---+---+-----+---+ 00394 1 | S | S | ... | S | 1 | 4 | 7 | ... | 3 | 00395 +---+---+-----+---+ =====> +---+---+-----+---+ 00396 : | : | : | : | : | : | : | : | : | : | 00397 : | : | : | : | : | : | : | : | : | : | 00398 +---+---+-----+---+ +---+---+-----+---+ 00399 14 | S | S | ... | S | 14 |123| 8 | ... | 9 | 00400 +---+---+-----+---+ +---+---+-----+---+ 00401 00402 20x15 grid of 20x15 grid of 00403 SIFT descriptors bin indices 00404 | 00405 | 00406 +---+ | 00407 | : | | 00408 | : | | 00409 +---+ | 00410 3 | --|--> (1,0) (19,0) (19,1) ... | 00411 +---+ | 00412 4 | --|--> (0,1) ... | 00413 +---+ | 00414 | : | | 00415 | : | | 00416 +---+ | 00417 7 | --|--> (0,0) (1,1) ... <=====+ 00418 +---+ 00419 8 | --|--> (1,14) ... 00420 +---+ 00421 9 | --|--> (19,14) ... 00422 +---+ 00423 | : | 00424 | : | 00425 +---+ 00426 123 | --|--> (0,14) ... 00427 +---+ 00428 | : | 00429 | : | 00430 +---+ 00431 00432 Feature 00433 Map 00434 00435 The following types are for the above data structure. 00436 */ 00437 typedef Point2D<int> GridCoord ; 00438 typedef std::list<GridCoord> CoordList ; 00439 typedef std::map<int, CoordList> FeatureMap ; 00440 00441 // To compute the gist vector for an input image, we need to count the 00442 // number of instances of each feature type at each level of the spatial 00443 // matching pyramid. At level 0, there will be only one count for a given 00444 // feature type. At level 1, the SIFT descriptor grid is divided into a 00445 // 2x2 grid and so there are 4 counts. At level 2, the SIFT descriptor 00446 // grid is divided into a 4x4 grid for a total of 16 counts. 00447 // 00448 // The following type is used to store the weighted, normalized feature 00449 // type counts at each level of the pyramid. 00450 typedef Image<float> Histogram ; 00451 00452 // Forward declarations 00453 FeatureMap compute_feature_map(const SiftGrid&, const Vocabulary&) ; 00454 void compute_spatial_histogram(const FeatureMap&, const Dims&, 00455 GistVectorType*) ; 00456 Histogram spatial_counts_vector(const CoordList&, int, const Dims&) ; 00457 int find_bin(const SiftDescriptor&, const Vocabulary&) ; 00458 float dist2(const SiftDescriptor&, const SiftDescriptor&, float) ; 00459 void normalize(GistVectorType*, double) ; 00460 00461 // Top-level routine to compute the gist vector for the input image given 00462 // the grid of SIFT descriptors for the pixel patches and the vocabulary 00463 // of prototypical SIFT descriptors. 00464 // 00465 // COMPLEXITY ANALYSIS: O(n) [linear time] 00466 // --------------------------------------- 00467 // Both the compute_feature_map() and compute_spatial_histogram() 00468 // functions are O(n), where n is the number of pixels in the input 00469 // image. The normalize routine is constant time. Therefore, this 00470 // function runs in time O(n). 00471 // 00472 // DEVNOTE: See the complexity analyses for each of the above-mentioned 00473 // functions to understand why they are linear and constant time 00474 // respectively. 00475 GistVectorType 00476 flattened_multi_level_histogram(const SiftGrid& G, const Vocabulary& V) 00477 { 00478 const Dims gist_vector_size(BBoF::gist_vector_size(), 1) ; 00479 00480 GistVectorType H(gist_vector_size, ZEROS) ; 00481 FeatureMap F = compute_feature_map(G, V) ; 00482 compute_spatial_histogram(F, G.getDims(), &H) ; 00483 normalize(&H, G.getSize()) ; 00484 return H ; 00485 } 00486 00487 // The following function implements the procedure described in the 00488 // comment and figure at the start of this section to produce the feature 00489 // map from the SIFT descriptors grid and vocabulary. 00490 // 00491 // COMPLEXITY ANALYSIS: O(n) [linear time] 00492 // --------------------------------------- 00493 // The outer and inner loops combined will execute a total of WxH times 00494 // where W is the width of the SIFT descriptor for the input image and H 00495 // this grid's height. W and H are both going to be some fraction of the 00496 // width and height of the input image itself. Thus, in the worst case, 00497 // the inner and outer loops execute O(n) times, where n is the number 00498 // of pixels in the input image. 00499 // 00500 // To illustrate, consider a 160x120 input image. It is partitioned into 00501 // a regular grid of 16x16 pixels with an 8 pixel overlap for each cell. 00502 // This yields a 20x15 grid of SIFT descriptors for the original 160x120 00503 // image. A 320x240 image will have a 40x30 corresponding SIFT grid and 00504 // an 80x60 image will have a 10x8 grid. 00505 // 00506 // A WxH image will have a (W/8)x(H/8) SIFT grid. If we use n = WxH, we 00507 // can say that the size of the SIFT grid for an input image containing 00508 // n pixels is some constant k times n. (NOTE: k won't always be 1/64 00509 // because the spacing/overlap between cells as well as the size of the 00510 // cells can be something other than 8 and 16x16 pixels respectively.) 00511 // 00512 // Now, having established that the inner and outer loops in this 00513 // function execute O(n) times, we only have to determine the complexity 00514 // of the code inside the loops to ascertain the overall complexity of 00515 // this function. 00516 // 00517 // The find_bin() function is constant time (refer to its complexity 00518 // analysis for the explanation of why this is so). 00519 // 00520 // As per the complexity guarantees provided by the STL, finding items 00521 // in an std::map is at worst logarithmic. In our case, the size of the 00522 // feature map can be no more than M, the number of channels we are 00523 // using. Therefore, looking up feature types in the feature map can be 00524 // no worse than O(log M). 00525 // 00526 // Similarly, inserting an element into the feature map is O(log M). And 00527 // appending an element to the end of an std::list (L.push_back) is 00528 // O(1). 00529 // 00530 // Therefore, the inner loop is dominated by the map look-up and 00531 // insertion operations. This means that the overall complexity of the 00532 // following function is O(log M * n). 00533 // 00534 // M, the number of channels, is a fixed external parameter that cannot 00535 // be arbitrarily changed. That is, we cannot train the scene classifier 00536 // with M = 200 (say) and then classify with M = 400. This fact allows 00537 // us to treat M as a constant. 00538 // 00539 // As a result, we can write the time complexity for this function as 00540 // O(kn) or O(n), where n is the number of pixels in the input image. 00541 FeatureMap compute_feature_map(const SiftGrid& G, const Vocabulary& V) 00542 { 00543 const int W = G.getWidth() ; 00544 const int H = G.getHeight() ; 00545 00546 FeatureMap F ; 00547 for (int y = 0; y < H; ++y) 00548 for (int x = 0; x < W; ++x) 00549 { 00550 int bin = find_bin(G.getVal(x,y), V) ; 00551 FeatureMap::iterator it = F.find(bin) ; 00552 if (it == F.end()) { 00553 CoordList L ; 00554 L.push_back(GridCoord(x,y)) ; 00555 F.insert(std::make_pair(bin, L)) ; 00556 } 00557 else 00558 it->second.push_back(GridCoord(x,y)) ; 00559 } 00560 00561 return F ; 00562 } 00563 00564 // Once we have the feature map, we need to compute the histogram for 00565 // each feature at different levels of the spatial match pyramid. The 00566 // spatial histograms for each level are concatenated to form a single 00567 // vector for each feature. All these vectors for all the features are 00568 // then concatenated to form the final gist vector. (See the Lazebnik 00569 // paper for details.) 00570 // 00571 // COMPLEXITY ANALYSIS: O(n) [linear time] 00572 // --------------------------------------- 00573 // The outer loop will execute M times, where M is the number of 00574 // channels. Finding the coordinate list for a feature type is at most 00575 // O(log M). The inner for loop will execute L times, where L is the 00576 // number of levels in the spatial matching pyramid. The 00577 // spatial_counts_vector() function will thus be called a maximum of ML 00578 // times. 00579 // 00580 // For an input image with n pixels, spatial_counts_vector() takes time 00581 // O(n). Therefore, the overall time complexity for this function is 00582 // O(M * L * log M * n). 00583 // 00584 // M and L are both external parameters to the algorithm but remain 00585 // fixed for any given "experiment" involving all the steps related to 00586 // training, gist vector computations and subsequent classification. 00587 // That is, we cannot train with some particular values of M and L and 00588 // then classify using some other values. Consequently, we can treat 00589 // these parameters as constants and replace them in the above 00590 // expression with k = M * L * log M. 00591 // 00592 // Thus, this function's run time grows linearly with the size of the 00593 // input image. In other words, its complexity is O(kn) or just O(n) if 00594 // we ignore the constant of proportionality. 00595 void compute_spatial_histogram(const FeatureMap& F, 00596 const Dims& sift_grid_size, 00597 GistVectorType* H) 00598 { 00599 GistVectorType::iterator target = H->beginw() ; 00600 for (int m = 0; m < BBoF::num_channels(); ++m) 00601 { 00602 FeatureMap::const_iterator it = F.find(m) ; 00603 if (it == F.end()) // no instances of feature type m in input image 00604 target += HISTOGRAM_SIZE_PER_LEVEL ; 00605 else // count instances of feature type m at each level of spatial pyr. 00606 { 00607 const CoordList& coords = it->second ; 00608 for (int l = 0; l <= BBoF::num_levels(); ++l) 00609 { 00610 Histogram h = spatial_counts_vector(coords, l, sift_grid_size) ; 00611 std::copy(h.begin(), h.end(), target) ; 00612 target += h.getSize() ; 00613 } 00614 } 00615 } 00616 } 00617 00618 /* 00619 In an ordinary bag-of-features matching, we would only be interested 00620 in knowing how many times each feature type occurs in the input 00621 image. The gist vector would thus simply be a histogram showing the 00622 counts for each feature type. For example, for 200 feature types, the 00623 gist vector is just 200 counts. 00624 00625 When we factor in the spatial matching pyramid as described in the 00626 Lazebnik paper, we have to subdivide the SIFT descriptors grid into 00627 smaller regions and count the occurences of each feature type in each 00628 of these regions. This is why the feature map needs to record the 00629 coordinates of each feature type within the SIFT descriptors grid. 00630 00631 The spatial matching pyramid is simply a mechanism for specifying the 00632 resolution of the "grid space" subdivisions. At higher resolutions, 00633 we perform a more fine-grained spatial feature matching; at lower 00634 resolutions, we get a coarse "overview" of the feature occurences. In 00635 fact, at level 0, the spatial matching pyramid is exactly a standard 00636 bag-of-features with a single count for each feature type. 00637 00638 To illustrate how the spatial matching process works, let us consider 00639 a 160x120 input image that has been "transformed" into a 20x15 grid 00640 of bin indices as described in an earlier comment. At level 1 of the 00641 pyramid, our 20x15 grid space will be partitioned into 4 regions as 00642 shown below. At level 2 of the pyramid, the grid space will be 00643 divided into 16 regions (also shown below). 00644 00645 00646 Level 0 Level 1 00647 00648 0 19 0 9 19 00649 +--------------------+ +----------+---------+ 00650 0| | 0| | | 00651 | | | | | 00652 | | | | | 00653 | | | | | 00654 | | 7| | | 00655 | | +----------+---------+ 00656 | | | | | 00657 | | | | | 00658 | | | | | 00659 14| | 14| | | 00660 +--------------------+ +----------+---------+ 00661 00662 00663 00664 Level 2 00665 00666 0 4 9 14 19 00667 +----+----+----+----+ 00668 0| | | | | 00669 | | | | | 00670 3+----+----+----+----+ 00671 | | | | | 00672 | | | | | 00673 7+----+----+----+----+ 00674 | | | | | 00675 | | | | | 00676 11+----+----+----+----+ 00677 | | | | | 00678 14+----+----+----+----+ 00679 00680 00681 Thus, at each level of the pyramid, the grid space is partitioned 00682 into a grid with N = 2^l cells in each direction. The histogram for 00683 each feature type at each level mirrors this partitioning and is 00684 representend as an NxN image of integers (counts). 00685 00686 To compute this histogram, we need to figure out the cell of the 00687 pyramid in which a given coordinate in grid space lies. To illustrate 00688 how to do this, consider the pyramid at level 1. The 20x15 grid space 00689 has to be partitioned into a 2x2 area. Thus, each "quadrant" will 00690 encompass a 10x7.5 area of the 20x15 grid space. Now, let us say we 00691 have grid space coordinates (5,9). This will lie in the bottom left 00692 quadrant. Thus, its coordinates in "pyramid space" are (0,1). Notice 00693 that to transform (5,9) to (0,1) all we need to do is divide the x and 00694 y components of the grid space coordinates by 10 and 7.5 respectively 00695 and discard any fractional portion of the resulting quotient. 00696 00697 In essence, the transformation from grid space to pyramid space is a 00698 scaling operation. We compute appropriate x and y scale factors and 00699 then simply scale each feature type's coordinates by these factors. 00700 00701 The scale factors for each direction are: 00702 00703 Sx = w/N 00704 Sy = h/N 00705 00706 Each coordinate in grid space simply needs to be divided by the above 00707 factors to effect its transformation into pyramid space. 00708 00709 NOTE: Scale factors are usually multiplicative, i.e., to scale from 00710 one coordinate system to another, we multiply rather than divide by an 00711 appropriate scale factor. Here, we use the scale factors as divisors 00712 because the above example explains this entire rigmarole in terms of 00713 division and it's just easier to think of it in that way. We could 00714 also have computed the reciprocals of the above factors and then used 00715 multiplication instead. 00716 */ 00717 00718 // The following function object performs the above transformation from 00719 // grid space to pyramid space and updates the histogram count associated 00720 // with each cell in pyramid space. When invoked on every element of a 00721 // coordinate list corresponding to a feature type from the feature map, 00722 // it results in computing the spatial histogram of that feature type at 00723 // the pyramid level currently in effect. 00724 class update_spatial_histogram { 00725 Histogram& target ; 00726 float Sx, Sy ; 00727 public: 00728 update_spatial_histogram(Histogram&, const Dims&) ; 00729 void operator()(const GridCoord&) const ; 00730 } ; 00731 00732 update_spatial_histogram::update_spatial_histogram(Histogram& H, const Dims& d) 00733 : target(H), 00734 Sx(d.w()/static_cast<float>(H.getWidth())), 00735 Sy(d.h()/static_cast<float>(H.getHeight())) 00736 {} 00737 00738 void update_spatial_histogram::operator()(const GridCoord& c) const 00739 { 00740 GridCoord p(static_cast<int>(c.i/Sx), static_cast<int>(c.j/Sy)) ; 00741 ++target[p] ; 00742 } 00743 00744 // The following function returns the histogram for the specified 00745 // feature type at the specified level of the spatial matching pyramid. 00746 // The histogram values are weighted according to the pyramid level as 00747 // described in the Lazebnik paper. 00748 // 00749 // Briefly: at level 0, we divide the feature count by 2^L, where L is 00750 // the maximum number of levels of the spatial matching pyramid. At 00751 // higher levels l, we divide by 2^(L-l+1). The idea is to give more 00752 // weightage to feature matches at higher resolutions and lower 00753 // weightage to matches at coarser resolutions. 00754 // 00755 // Rather than using the feature type to look up the list of SIFT grid 00756 // coordinates from the SIFT descriptors grid, it expects its caller to 00757 // pass it the coordinate list for the desired feature type. Then, for 00758 // each of the coordinates where the feature type occurs, it figures out 00759 // (using the above function object) which cell of the spatial match 00760 // pyramid the feature is in and updates the count associated with that 00761 // cell. 00762 // 00763 // There is a special case for level 0 of the spatial matching pyramid. 00764 // At this level, the grid associated with the pyramid has just one 00765 // cell. So all occurences of all feature types will be in this cell. 00766 // Thus, the histogram for level 0 consists of a single number, viz., 00767 // the number of entries in the coordinate list for the feature type. 00768 // This handy fact allows us to short-circuit the counting procedure we 00769 // would normally have to perform for other levels of the pyramid. 00770 // 00771 // COMPLEXITY ANALYSIS: O(n) [linear time] 00772 // --------------------------------------- 00773 // For level 0, this function executes a single operation. But at higher 00774 // levels, it needs to compute, for each coordinate where a given 00775 // feature type appears in the input image's SIFT grid, the appropriate 00776 // mapping from "grid space" to "pyramid space" (refer to the comment 00777 // preceding the definition of the update_spatial_histogram function 00778 // object). 00779 // 00780 // The number of coordinate mapping operations to be performed will be 00781 // determined by the size of the coordinates list passed in. This list 00782 // cannot ever have more elements than there are cells in the source 00783 // SIFT grid of the input image. 00784 // 00785 // To illustrate, consider a 160x120 input image. It is partitioned into 00786 // a regular grid of 16x16 pixels with an 8 pixel overlap. This yields a 00787 // 20x15 grid of SIFT descriptors for the original 160x120 image. Thus, 00788 // the SIFT grid contains a total of 300 cells. 00789 // 00790 // Now, in the worst case, we could have some feature type appear in 00791 // every cell of the SIFT grid. In this situation, the coordinates list 00792 // for this feature type will have 300 elements. 00793 // 00794 // The number of cells in the SIFT grid is proportional to the size of 00795 // the input image. A 320x240 image will have a 40x30 corresponding SIFT 00796 // grid and an 80x60 image will have a 10x8 grid. A WxH image will have 00797 // a (W/8)x(H/8) SIFT grid. If we use n = WxH, we can say that the size 00798 // of the SIFT grid for an input image containing n pixels is some 00799 // constant k times n. 00800 // 00801 // Thus, in the worst case, this function executes the operations 00802 // implemented by the update_spatial_histogram function object at most 00803 // kn times. Since the above-mentioned function object only uses a 00804 // couple of elementary operations (division to perform mapping from 00805 // grid space to pyramid space and addition to increment the feature 00806 // type count), the complexity of this function is O(kn) or just O(n). 00807 // 00808 // NOTE: After the spatial histogram updating, there is weighting 00809 // computation. This loop (implicit in the std::transform call) executes 00810 // N^2 times where N is 2^L. However, L, the number of levels in the 00811 // spatial matching pyramid is a fixed external parameter to the gist 00812 // vector computation algorithm. Thus, we can consider the weighting 00813 // step to be constant time. This means that the preceding 00814 // update_spatial_histogram step is the one that dominates the running 00815 // time of this function. 00816 Histogram spatial_counts_vector(const CoordList& coords, int l, 00817 const Dims& source_grid_size) 00818 { 00819 const int L = BBoF::num_levels() ; 00820 const int N = 1 << l ; 00821 00822 Histogram H(N, N, ZEROS) ; // NxN partition of grid space at pyramid level l 00823 if (N == 1) // level 0 ==> no point iterating over entire coord. list 00824 H.setVal(0, 0, coords.size()/(1 << L)) ; 00825 else // level > 0 ==> do spatial pyramid thingummy 00826 { 00827 std::for_each(coords.begin(), coords.end(), 00828 update_spatial_histogram(H, source_grid_size)) ; 00829 std::transform(H.begin(), H.end(), H.beginw(), 00830 std::bind2nd(std::divides<float>(), 1 << (L - l + 1))) ; 00831 } 00832 return H ; 00833 } 00834 00835 // The following function finds the prototypical SIFT descriptor that 00836 // most closely matches the supplied descriptor and returns the 00837 // vocabulary index of the prototypical descriptor. This index is the 00838 // input descriptor's bin. 00839 // 00840 // COMPLEXITY ANALYSIS: 0(1) [constant time] 00841 // ----------------------------------------- 00842 // The loop in this function calls the dist2() function and performs 00843 // elementary operations. The dist2() function is O(1) [as per its 00844 // complexity analysis]. Therefore, this function's complexity is O(M), 00845 // where M is the number of channels and the size of the SIFT descriptor 00846 // vocabulary. 00847 // 00848 // M is an external parameter to the gist vector computation algorithm. 00849 // However, once picked, it cannot be arbitrarily changed. That is, if 00850 // we train the scene classifier using gist vectors computed on the 00851 // basis of a vocabulary consisting of (say) 200 words, then we cannot 00852 // change this to 400 and attempt subsequent classifications. 00853 // 00854 // Therefore, for all practical purposes this function is constant time. 00855 // More accurately, it is O(M) and M is essentially a constant. 00856 int find_bin(const SiftDescriptor& S, const Vocabulary& V) 00857 { 00858 int bin = -1 ; 00859 float min_dist = std::numeric_limits<float>::max() ; 00860 00861 int i = 0 ; 00862 for (Vocabulary::const_iterator it = V.begin(); it != V.end(); ++it, ++i) 00863 { 00864 float d = dist2(S, *it, min_dist) ; 00865 if (d < min_dist) { 00866 bin = i ; 00867 min_dist = d ; 00868 } 00869 } 00870 00871 return bin ; 00872 } 00873 00874 // The following function returns the square of the Euclidean distance 00875 // between two SIFT descriptors subject to a supplied minimum, i.e., if 00876 // the square distance exceeds the minimum, the computation is 00877 // short-circuited and the minimum returned. 00878 // 00879 // COMPLEXITY ANALYSIS: O(1) [constant time] 00880 // ----------------------------------------- 00881 // The loop in this function contains only elementary operations. The 00882 // number of iterations is equal to the size of a SIFT descriptor, i.e., 00883 // 128. Thus, this function executes a bunch of elementary operations 00884 // exactly 128 times. Since the size of a SIFT descriptor is independent 00885 // of the size of the input images to the gist vector computations, we 00886 // can consider this function as a constant time operation, i.e., O(1). 00887 float dist2(const SiftDescriptor& L, const SiftDescriptor& R, float min) 00888 { 00889 float d = 0 ; 00890 for (int i = 0; i < SiftDescriptor::SIZE; ++i) { 00891 d += (L[i] - R[i]) * (L[i] - R[i]) ; 00892 if (d > min) 00893 return min ; 00894 } 00895 return d ; 00896 } 00897 00898 // COMPLEXITY ANALYSIS: O(1) [constant time] 00899 // ----------------------------------------- 00900 // This function divides each element of the gist vector by the 00901 // normalizer. Therefore, it is linear in the size of the gist vector. 00902 // This size is determined by the number of channels (M) and the number 00903 // of levels of the spatial matching pyramid (L) [see Lazebnik paper for 00904 // details]. 00905 // 00906 // The exact formula for the size of the gist vector is 00907 // (M * (4^(L+1) - 1))/3. Both M and L are parameters to the algorithm 00908 // but remain fixed for any given "experiment" involving all the steps 00909 // related to training, gist vector computations and subsequent 00910 // classification. That is, we cannot train with some particular values 00911 // of M and L and then classify using some other values. 00912 // 00913 // Thus, in some sense, both M and L are predetermined constants so that 00914 // the gist vector size remains fixed. In our case, M = 200 and L = 2. 00915 // So, our gist vectors have 4200 dimensions. This won't change. 00916 // Consequently, we can consider this normalization step as being 00917 // constant time, i.e., O(1). 00918 void normalize(GistVectorType* G, double normalizer) 00919 { 00920 std::transform(G->begin(), G->end(), G->beginw(), 00921 std::bind2nd(std::divides<double>(), normalizer)) ; 00922 } 00923 00924 } // end of local namespace encapsulating histogram computations section 00925 00926 //---------------------- MISCELLANEOUS FUNCTIONS ------------------------ 00927 00928 // Stream I/O for SIFT descriptors 00929 std::ostream& operator<<(std::ostream& os, const SiftDescriptor& d) 00930 { 00931 for (int i = 0; i < SiftDescriptor::SIZE; ++i) 00932 os << d[i] << ' ' ; 00933 return os ; 00934 } 00935 00936 //----------------------------------------------------------------------- 00937 00938 /* So things look consistent in everyone's emacs... */ 00939 /* Local Variables: */ 00940 /* indent-tabs-mode: nil */ 00941 /* End: */