/*!@file CUDA/CudaAlloc.C memory allocation routines for CUDA device memory */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file: Rob Peters <rjpeters at usc dot edu>
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/CudaAlloc.C $
// $Id: CudaAlloc.C 12962 2010-03-06 02:13:53Z irock $
//

#ifndef CUDAALLOC_C_DEFINED
#define CUDAALLOC_C_DEFINED

#include "Util/Assert.H"
#include "Util/log.H"
#include "Util/sformat.H"
#include "CUDA/cudafreelist.H"
#include "rutz/mutex.h"
#include "rutz/trace.h"
#include "CUDA/CudaDevices.H"

#include <map>
#include <pthread.h>
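/* Design note: the caching allocator defined below (cuda_fastcache_alloc)
   boils down to keeping one free list per distinct allocation size, so that
   repeated requests for the same size can skip the relatively expensive
   device malloc/free calls. As a rough illustration only, a minimal
   host-side analogue of that idea, using std::malloc()/std::free() in place
   of CudaDevices::malloc()/free() and ignoring devices, locking and
   statistics (all names here are hypothetical, not part of the toolkit),
   might look like this:

     #include <cstddef>
     #include <cstdlib>
     #include <map>
     #include <vector>

     class SizeCache
     {
     public:
       void* allocate(size_t n)
       {
         std::vector<void*>& bucket = itsFree[n];
         if (!bucket.empty())
           {
             void* p = bucket.back(); // cache hit: reuse a previously freed block
             bucket.pop_back();
             return p;
           }
         return std::malloc(n);       // cache miss: get a fresh block
       }

       void deallocate(void* p, size_t n)
       {
         itsFree[n].push_back(p);     // keep the block for the next same-size request
       }

     private:
       std::map<size_t, std::vector<void*> > itsFree; // one free list per size
     };

   The real cuda_fastcache_alloc below differs in that it uses a fixed-size
   table of rutz::cuda_free_list_base objects, keeps per-device accounting,
   and falls back to a plain CudaDevices::malloc() once all NCACHE table
   slots are occupied by other sizes. */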
namespace
{
  //! Trivial allocator that just wraps CudaDevices::malloc() and CudaDevices::free()
  struct cuda_trivial_alloc
  {
    void set_debug(bool /*do_debug*/)
    {
      // no debug settings for this allocator type
    }

    void set_allow_caching(bool /*on*/)
    {
      // no caching here in any case
    }

    void show_stats(int /*verbosity*/, const char* /*pfx*/,
                    const size_t /*block_size*/, const size_t /*overhead*/) const
    {
      // nothing to do here
    }

    // NOTE: these signatures must match those of cuda_fastcache_alloc, since
    // the public functions at the bottom of this file call whichever
    // allocator type is selected by DO_FASTCACHE below
    void* allocate(size_t nbytes, int dev)
    {
      void* ret;
      CudaDevices::malloc(&ret, nbytes, dev);
      return ret;
    }

    void deallocate(void* space, int dev, size_t /*nbytes*/)
    {
      CudaDevices::free(space, dev);
    }

    void release_free_mem()
    {
      // nothing to do here
    }
  };

  //! Caching allocator with free lists for common allocation sizes
  template <size_t cache_size>
  struct cuda_fastcache_alloc
  {
    rutz::cuda_free_list_base* cache[cache_size];
    mutable size_t num_alloc[cache_size][MAX_CUDA_DEVICES];
    bool allow_caching;

    cuda_fastcache_alloc()
      :
      allow_caching(true)
    {
      for (size_t i = 0; i < cache_size; ++i)
        {
          this->cache[i] = 0;
          for (int d = 0; d < MAX_CUDA_DEVICES; ++d)
            this->num_alloc[i][d] = 0;
        }
    }

    void set_debug(bool /*do_debug*/)
    {
      // no debug settings for this allocator type
    }

    void set_allow_caching(bool on)
    {
      if (!on && this->allow_caching)
        {
          // if we are currently caching but are being asked to turn
          // off caching, then let's first free any existing caches
          this->release_free_mem();
        }

      this->allow_caching = on;
    }

    void show_stats(int verbosity, const char* pfx,
                    const size_t block_size, const size_t overhead) const
    {
      size_t nused = 0;

      std::map<size_t, std::string> msgs;

      size_t bytes_allocated = 0;

      for (size_t i = 0; i < cache_size; ++i)
        {
          if (this->cache[i] == 0)
            continue;

          ++nused;

          // walk over every CUDA device that has used this free list
          std::map<int,int>::const_iterator devAt =
            this->cache[i]->getDevicesBegin();
          std::map<int,int>::const_iterator devStop =
            this->cache[i]->getDevicesEnd();

          for (; devAt != devStop; ++devAt)
            {
              const int dev = (*devAt).first;
              const int dev_index = (*devAt).second;

              const size_t nb = (this->cache[i]->num_allocations(dev)
                                 * this->cache[i]->alloc_size());

              const size_t extra = (this->cache[i]->num_allocations(dev)
                                    - this->num_alloc[i][dev_index]);

              this->num_alloc[i][dev_index] = this->cache[i]->num_allocations(dev);

              bytes_allocated += nb;

              if (verbosity <= 0)
                continue;

              std::string msg =
                sformat("%s%sfastcache[%02"ZU"/%02"ZU"]: CUDA device %d, "
                        "%10.4fMB in %4"ZU" allocations of %10.4fkB",
                        pfx ? pfx : "", pfx ? ": " : "",
                        i, cache_size, dev, nb / (1024.0*1024.0),
                        this->cache[i]->num_allocations(dev),
                        this->cache[i]->alloc_size() / 1024.0);

              if (block_size > 0)
                {
                  if (this->cache[i]->alloc_size() - overhead >= block_size
                      || this->cache[i]->alloc_size() - overhead <= 1)
                    msg += sformat(" (%.2fkB * %7.1f + %"ZU"B)",
                                   block_size / 1024.0,
                                   (double(this->cache[i]->alloc_size() - overhead)
                                    / double(block_size)),
                                   overhead);
                  else
                    msg += sformat(" (%.2fkB / %7.1f + %"ZU"B)",
                                   block_size / 1024.0,
                                   (double(block_size)
                                    / double(this->cache[i]->alloc_size() - overhead)),
                                   overhead);
                }

              if (extra > 0)
                msg += sformat(" (+%"ZU" new)", extra);

              msgs[this->cache[i]->alloc_size()] = msg;
            }
        }

      // print the overall summary just once, now that the totals are complete
      {
        std::string msg =
          sformat("%s%sfastcache_alloc<%"ZU">: %"ZU"/%"ZU" cache table "
                  "entries in use, %fMB total allocated",
                  pfx ? pfx : "", pfx ? ": " : "",
                  cache_size, nused, cache_size,
                  bytes_allocated / (1024.0*1024.0));

        if (block_size > 0)
          msg += sformat(" (%.2fkB * %7.1f)",
                         block_size / 1024.0,
                         double(bytes_allocated) / double(block_size));

        LINFO("%s", msg.c_str());
      }

      for (std::map<size_t, std::string>::const_iterator
             itr = msgs.begin(), stop = msgs.end();
           itr != stop; ++itr)
        LINFO("%s", (*itr).second.c_str());
    }

    // allocate a block of nbytes on CUDA device dev, drawing from (and, if
    // necessary, first setting up) the free list that matches that size
    void* allocate(size_t nbytes, int dev)
    {
      if (this->allow_caching)
        for (size_t i = 0; i < cache_size; ++i)
          {
            if (this->cache[i] != 0)
              {
                // we found a filled slot, let's see if it matches our
                // requested size
                if (this->cache[i]->alloc_size() == nbytes)
                  return this->cache[i]->allocate(nbytes, dev);

                // else, continue
              }
            else // this->cache[i] == 0
              {
                // we found an empty slot, let's set up a new free
                // list for our requested size:
                this->cache[i] = new rutz::cuda_free_list_base(nbytes);
                return this->cache[i]->allocate(nbytes, dev);
              }
          }

      // either caching is off, or every cache slot is already taken by some
      // other size; fall back to a plain device allocation
      void* ret;
      CudaDevices::malloc(&ret, nbytes, dev);
      return ret;
    }

    // return the block to the free list that matches its size, otherwise
    // free it on the device
    void deallocate(void* space, int dev, size_t nbytes)
    {
      if (this->allow_caching)
        for (size_t i = 0; i < cache_size; ++i)
          {
            if (this->cache[i] != 0)
              {
                // we found a filled slot, let's see if it matches our
                // requested size
                if (this->cache[i]->alloc_size() == nbytes)
                  {
                    this->cache[i]->deallocate(space, dev);
                    return;
                  }
                // else, continue
              }
            else // this->cache[i] == 0
              {
                // we found an empty slot, let's set up a new free
                // list to hold our deallocated block:
                this->cache[i] = new rutz::cuda_free_list_base(nbytes);
                this->cache[i]->deallocate(space, dev);
                return; // don't fall through and hand the same block to another slot
              }
          }

      // caching is off, or every cache slot is taken by some other size
      CudaDevices::free(space, dev);
    }

    void release_free_mem()
    {
      for (size_t i = 0; i < cache_size; ++i)
        if (this->cache[i] != 0)
          this->cache[i]->release_free_nodes();
    }
  };
  /* Here are the macros that you can twiddle if you need to change
     the allocation strategy: you can have optional freelist caching
     (DO_FASTCACHE) of (NCACHE) commonly-requested memory sizes.

     If you turn off DO_FASTCACHE, you will end up using
     cuda_trivial_alloc, which is just a bare wrapper around
     CudaDevices::malloc() and CudaDevices::free(). Unlike the
     CPU-side allocator in Util/AllocAux.C, there is no DO_ALIGN or
     NALIGN option here, since cudaMalloc() already returns suitably
     aligned device memory.

     DO_FASTCACHE is here primarily for performance; since our memory
     usage pattern tends to involve many many allocations of Image
     objects with only a few different Dims shapes, it helps to cache
     those memory allocations in a freelist. Profiling tests showed
     that this can give a 15-20% speedup.
  */

#define DO_FASTCACHE
#define NCACHE 64

#ifdef DO_FASTCACHE
  typedef cuda_fastcache_alloc<NCACHE> cuda_alloc_type;
#else
  typedef cuda_trivial_alloc cuda_alloc_type;
#endif

  // Here is our global allocator object, whose type is determined by
  // the macro settings above, and a corresponding mutex. For now, we
  // use a heavy-handed approach and just use the mutex to lock the
  // entire structure during each call to any of the public
  // functions. If this turns out to be a performance problem, we
  // could turn to finer-grained locking within the various allocator
  // classes themselves.
  cuda_alloc_type cuda_alloc;
  pthread_mutex_t cuda_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

  size_t cuda_stats_units = 0;
}

void* cuda_invt_allocate(size_t user_nbytes, int dev)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  return cuda_alloc.allocate(user_nbytes, dev);
}

void cuda_invt_deallocate(void* mem, int dev, size_t nbytes)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.deallocate(mem, dev, nbytes);
}
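/* A minimal usage sketch for the two entry points above (hypothetical caller
   code, not part of this file): the caller must hand back exactly the same
   byte count and device id that it passed to cuda_invt_allocate(), since the
   fastcache allocator relies on the size to locate the matching free list.

     const size_t nbytes = 640 * 480 * sizeof(float); // e.g., one float image
     const int dev = 0;                               // some valid CUDA device
     void* buf = cuda_invt_allocate(nbytes, dev);
     // ... use buf as device memory ...
     cuda_invt_deallocate(buf, dev, nbytes);
*/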
void cuda_invt_allocation_release_free_mem()
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.release_free_mem();
}

void cuda_invt_allocation_allow_caching(bool on)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_allow_caching(on);
}

void cuda_invt_allocation_debug_print(bool do_debug)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_debug(do_debug);
}

void cuda_invt_allocation_show_stats(int verbosity, const char* pfx,
                                     const size_t block_size)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.show_stats(verbosity, pfx,
                        block_size ? block_size : cuda_stats_units, 0);
}

void cuda_invt_allocation_set_stats_units(const size_t units)
{
  cuda_stats_units = units;
}

// ######################################################################
/* So things look consistent in everyone's emacs... */
/* Local Variables:      */
/* indent-tabs-mode: nil */
/* End:                  */

#endif // CUDAALLOC_C_DEFINED
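/* A short sketch of how the caching/statistics controls above might be driven
   by client code (hypothetical calls, not part of this file):

     // report cache statistics in units of one 640x480 float image
     cuda_invt_allocation_set_stats_units(640 * 480 * sizeof(float));
     cuda_invt_allocation_show_stats(1, "cuda", 0);

     // under memory pressure, hand any cached blocks back to the devices...
     cuda_invt_allocation_release_free_mem();

     // ...or flush the caches and disable caching altogether
     cuda_invt_allocation_allow_caching(false);
*/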