/*!@file CUDA/CudaAlloc.C memory allocation routines for CUDA device memory */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file: Rob Peters <rjpeters at usc dot edu>
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/CudaAlloc.C $
// $Id: CudaAlloc.C 12962 2010-03-06 02:13:53Z irock $
//

#ifndef CUDAALLOC_C_DEFINED
#define CUDAALLOC_C_DEFINED

#include "Util/Assert.H"
#include "Util/log.H"
#include "Util/sformat.H"
#include "CUDA/cudafreelist.H"
#include "rutz/mutex.h"
#include "rutz/trace.h"
#include "CUDA/CudaDevices.H"

#include <map>
#include <pthread.h>
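/* Design note: the caching allocator defined below (cuda_fastcache_alloc)
   boils down to keeping one free list per distinct allocation size, so that
   repeated requests for the same size can skip the relatively expensive
   device malloc/free calls. As a rough illustration only, a minimal
   host-side analogue of that idea, using std::malloc()/std::free() in place
   of CudaDevices::malloc()/free() and ignoring devices, locking and
   statistics (all names here are hypothetical, not part of the toolkit),
   might look like this:

     #include <cstddef>
     #include <cstdlib>
     #include <map>
     #include <vector>

     class SizeCache
     {
     public:
       void* allocate(size_t n)
       {
         std::vector<void*>& bucket = itsFree[n];
         if (!bucket.empty())
           {
             void* p = bucket.back(); // cache hit: reuse a previously freed block
             bucket.pop_back();
             return p;
           }
         return std::malloc(n);       // cache miss: get a fresh block
       }

       void deallocate(void* p, size_t n)
       {
         itsFree[n].push_back(p);     // keep the block for the next same-size request
       }

     private:
       std::map<size_t, std::vector<void*> > itsFree; // one free list per size
     };

   The real cuda_fastcache_alloc below differs in that it uses a fixed-size
   table of rutz::cuda_free_list_base objects, keeps per-device accounting,
   and falls back to a plain CudaDevices::malloc() once all NCACHE table
   slots are occupied by other sizes. */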
namespace
{
  //! Trivial allocator that just wraps CudaDevices::malloc() and CudaDevices::free()
  struct cuda_trivial_alloc
  {
    void set_debug(bool /*do_debug*/)
    {
      // no debug settings for this allocator type
    }

    void set_allow_caching(bool /*on*/)
    {
      // no caching here in any case
    }

    void show_stats(int /*verbosity*/, const char* /*pfx*/,
                    const size_t /*block_size*/, const size_t /*overhead*/) const
    {
      // nothing to do here
    }

    // NOTE: these signatures must match those of cuda_fastcache_alloc, since
    // the public functions at the bottom of this file call whichever
    // allocator type is selected by DO_FASTCACHE below
    void* allocate(size_t nbytes, int dev)
    {
      void* ret;
      CudaDevices::malloc(&ret, nbytes, dev);
      return ret;
    }

    void deallocate(void* space, int dev, size_t /*nbytes*/)
    {
      CudaDevices::free(space, dev);
    }

    void release_free_mem()
    {
      // nothing to do here
    }
  };

  //! Caching allocator with free lists for common allocation sizes
  template <size_t cache_size>
  struct cuda_fastcache_alloc
  {
    rutz::cuda_free_list_base* cache[cache_size];
    mutable size_t num_alloc[cache_size][MAX_CUDA_DEVICES];
    bool allow_caching;

    cuda_fastcache_alloc()
      :
      allow_caching(true)
    {
      for (size_t i = 0; i < cache_size; ++i)
        {
          this->cache[i] = 0;
          for (int d = 0; d < MAX_CUDA_DEVICES; ++d)
            this->num_alloc[i][d] = 0;
        }
    }

    void set_debug(bool /*do_debug*/)
    {
      // no debug settings for this allocator type
    }

    void set_allow_caching(bool on)
    {
      if (!on && this->allow_caching)
        {
          // if we are currently caching but are being asked to turn
          // off caching, then let's first free any existing caches
          this->release_free_mem();
        }

      this->allow_caching = on;
    }

    void show_stats(int verbosity, const char* pfx,
                    const size_t block_size, const size_t overhead) const
    {
      size_t nused = 0;

      std::map<size_t, std::string> msgs;

      size_t bytes_allocated = 0;

      for (size_t i = 0; i < cache_size; ++i)
        {
          if (this->cache[i] == 0)
            continue;

          ++nused;

          // walk over every CUDA device that has used this free list
          std::map<int,int>::const_iterator devAt =
            this->cache[i]->getDevicesBegin();
          std::map<int,int>::const_iterator devStop =
            this->cache[i]->getDevicesEnd();

          for (; devAt != devStop; ++devAt)
            {
              const int dev = (*devAt).first;
              const int dev_index = (*devAt).second;

              const size_t nb = (this->cache[i]->num_allocations(dev)
                                 * this->cache[i]->alloc_size());

              const size_t extra = (this->cache[i]->num_allocations(dev)
                                    - this->num_alloc[i][dev_index]);

              this->num_alloc[i][dev_index] = this->cache[i]->num_allocations(dev);

              bytes_allocated += nb;

              if (verbosity <= 0)
                continue;

              std::string msg =
                sformat("%s%sfastcache[%02"ZU"/%02"ZU"]: CUDA device %d, "
                        "%10.4fMB in %4"ZU" allocations of %10.4fkB",
                        pfx ? pfx : "", pfx ? ": " : "",
                        i, cache_size, dev, nb / (1024.0*1024.0),
                        this->cache[i]->num_allocations(dev),
                        this->cache[i]->alloc_size() / 1024.0);

              if (block_size > 0)
                {
                  if (this->cache[i]->alloc_size() - overhead >= block_size
                      || this->cache[i]->alloc_size() - overhead <= 1)
                    msg += sformat(" (%.2fkB * %7.1f + %"ZU"B)",
                                   block_size / 1024.0,
                                   (double(this->cache[i]->alloc_size() - overhead)
                                    / double(block_size)),
                                   overhead);
                  else
                    msg += sformat(" (%.2fkB / %7.1f + %"ZU"B)",
                                   block_size / 1024.0,
                                   (double(block_size)
                                    / double(this->cache[i]->alloc_size() - overhead)),
                                   overhead);
                }

              if (extra > 0)
                msg += sformat(" (+%"ZU" new)", extra);

              msgs[this->cache[i]->alloc_size()] = msg;
            }
        }

      // print the overall summary just once, now that the totals are complete
      {
        std::string msg =
          sformat("%s%sfastcache_alloc<%"ZU">: %"ZU"/%"ZU" cache table "
                  "entries in use, %fMB total allocated",
                  pfx ? pfx : "", pfx ? ": " : "",
                  cache_size, nused, cache_size,
                  bytes_allocated / (1024.0*1024.0));

        if (block_size > 0)
          msg += sformat(" (%.2fkB * %7.1f)",
                         block_size / 1024.0,
                         double(bytes_allocated) / double(block_size));

        LINFO("%s", msg.c_str());
      }

      for (std::map<size_t, std::string>::const_iterator
             itr = msgs.begin(), stop = msgs.end();
           itr != stop; ++itr)
        LINFO("%s", (*itr).second.c_str());
    }

    // allocate a block of nbytes on CUDA device dev, drawing from (and, if
    // necessary, first setting up) the free list that matches that size
    void* allocate(size_t nbytes, int dev)
    {
      if (this->allow_caching)
        for (size_t i = 0; i < cache_size; ++i)
          {
            if (this->cache[i] != 0)
              {
                // we found a filled slot, let's see if it matches our
                // requested size
                if (this->cache[i]->alloc_size() == nbytes)
                  return this->cache[i]->allocate(nbytes, dev);

                // else, continue
              }
            else // this->cache[i] == 0
              {
                // we found an empty slot, let's set up a new free
                // list for our requested size:
                this->cache[i] = new rutz::cuda_free_list_base(nbytes);
                return this->cache[i]->allocate(nbytes, dev);
              }
          }

      // either caching is off, or every cache slot is already taken by some
      // other size; fall back to a plain device allocation
      void* ret;
      CudaDevices::malloc(&ret, nbytes, dev);
      return ret;
    }

    // return the block to the free list that matches its size, otherwise
    // free it on the device
    void deallocate(void* space, int dev, size_t nbytes)
    {
      if (this->allow_caching)
        for (size_t i = 0; i < cache_size; ++i)
          {
            if (this->cache[i] != 0)
              {
                // we found a filled slot, let's see if it matches our
                // requested size
                if (this->cache[i]->alloc_size() == nbytes)
                  {
                    this->cache[i]->deallocate(space, dev);
                    return;
                  }
                // else, continue
              }
            else // this->cache[i] == 0
              {
                // we found an empty slot, let's set up a new free
                // list to hold our deallocated block:
                this->cache[i] = new rutz::cuda_free_list_base(nbytes);
                this->cache[i]->deallocate(space, dev);
                return; // don't fall through and hand the same block to another slot
              }
          }

      // caching is off, or every cache slot is taken by some other size
      CudaDevices::free(space, dev);
    }

    void release_free_mem()
    {
      for (size_t i = 0; i < cache_size; ++i)
        if (this->cache[i] != 0)
          this->cache[i]->release_free_nodes();
    }
  };
  /* Here are the macros that you can twiddle if you need to change
     the allocation strategy: you can have optional freelist caching
     (DO_FASTCACHE) of (NCACHE) commonly-requested memory sizes.

     If you turn off DO_FASTCACHE, you will end up using
     cuda_trivial_alloc, which is just a bare wrapper around
     CudaDevices::malloc() and CudaDevices::free(). Unlike the
     CPU-side allocator in Util/AllocAux.C, there is no DO_ALIGN or
     NALIGN option here, since cudaMalloc() already returns suitably
     aligned device memory.

     DO_FASTCACHE is here primarily for performance; since our memory
     usage pattern tends to involve many many allocations of Image
     objects with only a few different Dims shapes, it helps to cache
     those memory allocations in a freelist. Profiling tests showed
     that this can give a 15-20% speedup.
  */

#define DO_FASTCACHE
#define NCACHE 64

#ifdef DO_FASTCACHE
  typedef cuda_fastcache_alloc<NCACHE> cuda_alloc_type;
#else
  typedef cuda_trivial_alloc cuda_alloc_type;
#endif

  // Here is our global allocator object, whose type is determined by
  // the macro settings above, and a corresponding mutex. For now, we
  // use a heavy-handed approach and just use the mutex to lock the
  // entire structure during each call to any of the public
  // functions. If this turns out to be a performance problem, we
  // could turn to finer-grained locking within the various allocator
  // classes themselves.
  cuda_alloc_type cuda_alloc;
  pthread_mutex_t cuda_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

  size_t cuda_stats_units = 0;
}

void* cuda_invt_allocate(size_t user_nbytes, int dev)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  return cuda_alloc.allocate(user_nbytes, dev);
}

void cuda_invt_deallocate(void* mem, int dev, size_t nbytes)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.deallocate(mem, dev, nbytes);
}
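/* A minimal usage sketch for the two entry points above (hypothetical caller
   code, not part of this file): the caller must hand back exactly the same
   byte count and device id that it passed to cuda_invt_allocate(), since the
   fastcache allocator relies on the size to locate the matching free list.

     const size_t nbytes = 640 * 480 * sizeof(float); // e.g., one float image
     const int dev = 0;                               // some valid CUDA device
     void* buf = cuda_invt_allocate(nbytes, dev);
     // ... use buf as device memory ...
     cuda_invt_deallocate(buf, dev, nbytes);
*/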
void cuda_invt_allocation_release_free_mem()
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.release_free_mem();
}

void cuda_invt_allocation_allow_caching(bool on)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_allow_caching(on);
}

void cuda_invt_allocation_debug_print(bool do_debug)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_debug(do_debug);
}

void cuda_invt_allocation_show_stats(int verbosity, const char* pfx,
                                     const size_t block_size)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.show_stats(verbosity, pfx,
                        block_size ? block_size : cuda_stats_units, 0);
}

void cuda_invt_allocation_set_stats_units(const size_t units)
{
  cuda_stats_units = units;
}

// ######################################################################
/* So things look consistent in everyone's emacs... */
/* Local Variables:      */
/* indent-tabs-mode: nil */
/* End:                  */

#endif // CUDAALLOC_C_DEFINED
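/* A short sketch of how the caching/statistics controls above might be driven
   by client code (hypothetical calls, not part of this file):

     // report cache statistics in units of one 640x480 float image
     cuda_invt_allocation_set_stats_units(640 * 480 * sizeof(float));
     cuda_invt_allocation_show_stats(1, "cuda", 0);

     // under memory pressure, hand any cached blocks back to the devices...
     cuda_invt_allocation_release_free_mem();

     // ...or flush the caches and disable caching altogether
     cuda_invt_allocation_allow_caching(false);
*/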