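// Thread-safe allocation of CUDA device memory: a trivial pass-through
// allocator and a size-caching allocator built on
// rutz::cuda_free_list_base, selected via DO_FASTCACHE below.
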
#ifndef CUDAALLOC_C_DEFINED
#define CUDAALLOC_C_DEFINED

#include "Util/Assert.H"
#include "Util/log.H"
#include "Util/sformat.H"
#include "CUDA/cudafreelist.H"
#include "rutz/mutex.h"
#include "rutz/trace.h"
#include "CUDA/CudaDevices.H"
#include <map>
#include <pthread.h>

namespace
{

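// cuda_trivial_alloc forwards every request straight to
// CudaDevices::malloc()/CudaDevices::free(), with no caching and no
// statistics; it exists as a drop-in alternative to the caching
// allocator below.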
struct cuda_trivial_alloc
{
  void set_debug(bool)
  {
    // no debug mode for this allocator
  }

  void set_allow_caching(bool)
  {
    // this allocator never caches
  }

  void show_stats(int, const char*,
                  const size_t, const size_t) const
  {
    // no statistics are kept
  }

  // NOTE: the signatures here mirror cuda_fastcache_alloc so that
  // either type can be selected as cuda_alloc_type below
  void* allocate(size_t nbytes, int dev)
  {
    void* ret;
    CudaDevices::malloc(&ret, nbytes, dev);
    return ret;
  }

  void deallocate(void* space, int dev, size_t /* nbytes */)
  {
    CudaDevices::free(space, dev);
  }

  void release_free_mem()
  {
    // nothing is cached, so there is nothing to release
  }
};

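// cuda_fastcache_alloc keeps a table of up to cache_size free lists,
// one per distinct allocation size, so that freed CUDA device blocks
// can be recycled instead of going back to the device each time.
// Per-device allocation counts are kept (mutable) so that show_stats()
// can report how many allocations are new since its previous call.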
template <size_t cache_size>
struct cuda_fastcache_alloc
{
  // one free list per distinct allocation size seen so far (0 = unused slot)
  rutz::cuda_free_list_base* cache[cache_size];

  // per-slot, per-device allocation counts as of the previous show_stats() call
  mutable size_t num_alloc[cache_size][MAX_CUDA_DEVICES];

  bool allow_caching;

  cuda_fastcache_alloc()
    :
    allow_caching(true)
  {
    for (size_t i = 0; i < cache_size; ++i)
      {
        this->cache[i] = 0;
        for (int d = 0; d < MAX_CUDA_DEVICES; ++d)
          this->num_alloc[i][d] = 0;
      }
  }

  void set_debug(bool)
  {
    // no debug mode for this allocator
  }

  void set_allow_caching(bool on)
  {
    if (!on && this->allow_caching)
      {
        // caching is being switched off, so hand back any blocks that
        // we are currently holding in the free lists
        this->release_free_mem();
      }
    this->allow_caching = on;
  }

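  // Log allocation statistics: one line per (cache entry, device) pair
  // when verbosity > 0, plus a one-line summary of overall usage.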
  void show_stats(int verbosity, const char* pfx,
                  const size_t block_size, const size_t overhead) const
  {
    size_t nused = 0;
    size_t bytes_allocated = 0;

    // per-entry messages, keyed by allocation size so they print sorted
    std::map<size_t, std::string> msgs;

    for (size_t i = 0; i < cache_size; ++i)
      {
        if (this->cache[i] == 0)
          continue;

        // walk over every CUDA device that has drawn allocations from
        // this cache entry
        std::map<int,int>::const_iterator devAt =
          this->cache[i]->getDevicesBegin();
        std::map<int,int>::const_iterator devStop =
          this->cache[i]->getDevicesEnd();

        for (; devAt != devStop; ++devAt)
          {
            const int dev = (*devAt).first;
            const int dev_index = (*devAt).second;

            ++nused;
            const size_t nb = (this->cache[i]->num_allocations(dev)
                               * this->cache[i]->alloc_size());

            const size_t extra = (this->cache[i]->num_allocations(dev)
                                  - this->num_alloc[i][dev_index]);

            this->num_alloc[i][dev_index] =
              this->cache[i]->num_allocations(dev);

            bytes_allocated += nb;

            if (verbosity <= 0)
              continue;

            std::string msg =
              sformat("%s%sfastcache[%02" ZU "/%02" ZU "]: CUDA device %d, "
                      "%10.4fMB in %4" ZU " allocations of %10.4fkB",
                      pfx ? pfx : "", pfx ? ": " : "",
                      i, cache_size, dev, nb / (1024.0*1024.0),
                      this->cache[i]->num_allocations(dev),
                      this->cache[i]->alloc_size() / 1024.0);

            if (block_size > 0)
              {
                if (this->cache[i]->alloc_size() - overhead >= block_size
                    || this->cache[i]->alloc_size() - overhead <= 1)
                  msg += sformat(" (%.2fkB * %7.1f + %" ZU "B)",
                                 block_size / 1024.0,
                                 (double(this->cache[i]->alloc_size() - overhead)
                                  / double(block_size)),
                                 overhead);
                else
                  msg += sformat(" (%.2fkB / %7.1f + %" ZU "B)",
                                 block_size / 1024.0,
                                 (double(block_size)
                                  / double(this->cache[i]->alloc_size() - overhead)),
                                 overhead);
              }

            if (extra > 0)
              msg += sformat(" (+%" ZU " new)", extra);

            msgs[this->cache[i]->alloc_size()] = msg;
          }
      }

    for (std::map<size_t, std::string>::const_iterator
           itr = msgs.begin(), stop = msgs.end();
         itr != stop; ++itr)
      LINFO("%s", (*itr).second.c_str());

    // summary line, printed once rather than once per cache entry
    std::string msg =
      sformat("%s%sfastcache_alloc<%" ZU ">: %" ZU "/%" ZU " cache table "
              "entries in use, %fMB total allocated",
              pfx ? pfx : "", pfx ? ": " : "",
              cache_size, nused, cache_size,
              bytes_allocated / (1024.0*1024.0));

    if (block_size > 0)
      msg += sformat(" (%.2fkB * %7.1f)",
                     block_size / 1024.0,
                     double(bytes_allocated) / double(block_size));

    LINFO("%s", msg.c_str());
  }

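  // Allocate nbytes of device memory on CUDA device 'dev', recycling a
  // block from the matching free list when possible; falls back to a
  // direct CudaDevices::malloc() if caching is off or no cache slot
  // matches.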
  void* allocate(size_t nbytes, int dev)
  {
    if (this->allow_caching)
      for (size_t i = 0; i < cache_size; ++i)
        {
          if (this->cache[i] != 0)
            {
              // we found a filled slot; see whether its free list
              // matches the requested size
              if (this->cache[i]->alloc_size() == nbytes)
                {
                  return this->cache[i]->allocate(nbytes, dev);
                }
            }
          else
            {
              // we found an empty slot; create a new free list for the
              // requested size and allocate from it
              this->cache[i] = new rutz::cuda_free_list_base(nbytes);
              return this->cache[i]->allocate(nbytes, dev);
            }
        }

    // caching is off, or the cache table is full of other sizes
    void* ret;
    CudaDevices::malloc(&ret, nbytes, dev);
    return ret;
  }

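  // Return a block of nbytes on device 'dev' to the free list matching
  // its size, or free it directly if caching is disabled (or no cache
  // slot is available).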
  void deallocate(void* space, int dev, size_t nbytes)
  {
    if (this->allow_caching)
      {
        for (size_t i = 0; i < cache_size; ++i)
          {
            if (this->cache[i] != 0)
              {
                // we found a filled slot; if its free list matches the
                // block's size, hand the block back to it
                if (this->cache[i]->alloc_size() == nbytes)
                  {
                    this->cache[i]->deallocate(space, dev);
                    return;
                  }
              }
            else
              {
                // we found an empty slot; create a free list for this
                // size and give it the block (and stop searching, so
                // the block is not handed to more than one list)
                this->cache[i] = new rutz::cuda_free_list_base(nbytes);
                this->cache[i]->deallocate(space, dev);
                return;
              }
          }

        // every cache slot is occupied by a different size; free the
        // block directly rather than leaking it
        CudaDevices::free(space, dev);
      }
    else
      {
        CudaDevices::free(space, dev);
      }
  }

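  // Release all cached-but-unused device blocks held by the free lists.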
  void release_free_mem()
  {
    for (size_t i = 0; i < cache_size; ++i)
      if (this->cache[i] != 0)
        this->cache[i]->release_free_nodes();
  }
};

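// Define DO_FASTCACHE to use the size-caching allocator with NCACHE
// cache table entries; leave it undefined to fall back to the trivial
// pass-through allocator.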
#define DO_FASTCACHE
#define NCACHE 64

#ifdef DO_FASTCACHE
typedef cuda_fastcache_alloc<NCACHE> cuda_alloc_type;
#else
typedef cuda_trivial_alloc cuda_alloc_type;
#endif

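// A single process-wide allocator instance; all access is serialized
// through cuda_alloc_mutex, which makes the public functions below
// thread-safe.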
cuda_alloc_type cuda_alloc;
pthread_mutex_t cuda_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

size_t cuda_stats_units = 0;

} // anonymous namespace

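// Public entry points. The caller is responsible for remembering both
// the device index and the allocation size and passing them back to
// cuda_invt_deallocate(). A minimal usage sketch (the variable names
// 'buf', 'nbytes' and 'dev' are illustrative only):
//
//   void* buf = cuda_invt_allocate(nbytes, dev);
//   // ... use buf as device memory on CUDA device 'dev' ...
//   cuda_invt_deallocate(buf, dev, nbytes);
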
void* cuda_invt_allocate(size_t user_nbytes, int dev)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  return cuda_alloc.allocate(user_nbytes, dev);
}

void cuda_invt_deallocate(void* mem, int dev, size_t nbytes)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.deallocate(mem, dev, nbytes);
}

void cuda_invt_allocation_release_free_mem()
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.release_free_mem();
}

void cuda_invt_allocation_allow_caching(bool on)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_allow_caching(on);
}

void cuda_invt_allocation_debug_print(bool do_debug)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.set_debug(do_debug);
}

void cuda_invt_allocation_show_stats(int verbosity, const char* pfx,
                                     const size_t block_size)
{
  GVX_MUTEX_LOCK(&cuda_alloc_mutex);
  cuda_alloc.show_stats(verbosity, pfx,
                        block_size ? block_size : cuda_stats_units, 0);
}

void cuda_invt_allocation_set_stats_units(const size_t units)
{
  cuda_stats_units = units;
}

#endif // CUDAALLOC_C_DEFINED