00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #ifndef CUDADEVICES_H_DEFINED
00040 #define CUDADEVICES_H_DEFINED
00041
00042 #include <map>
00043 #include <list>
00044 #include <cmath>
00045 #include "Image/Dims.H"
00046 #include "CUDA/cutil.h"
00047 #include <cuda_runtime_api.h>
00048 #include "Util/log.H"
00049 #include <stdlib.h>
00050 #include <stdio.h>
00051
00052 using std::map;
00053 using std::list;
00054
00055 class CudaDevices
00056 {
00057 public:
00058 static void initDevice(const int deviceNum)
00059 {
00060
00061 if(isValid(deviceNum))
00062 return;
00063
00064 int deviceCount;
00065 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
00066
00067 if (deviceCount == 0)
00068 LFATAL("No devices supporting CUDA");
00069
00070 if (deviceNum > deviceCount-1)
00071 LFATAL("Device [%d] out of valid device range [0-%d]",deviceNum,deviceCount-1);
00072
00073 cudaDeviceProp deviceProp;
00074 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, deviceNum));
00075
00076 if (deviceProp.major < 1)
00077 LFATAL("Device [%d] does not support CUDA",deviceNum);
00078
00079
00080
00081
00082 int sqSize = nextpow2down((int) sqrt(deviceProp.maxThreadsPerBlock));
00083 Dims tileSize = Dims(sqSize,sqSize);
00084 int smSize = deviceProp.sharedMemPerBlock;
00085 setDeviceTileSize(deviceNum,tileSize);
00086 setDeviceSharedMemorySize(deviceNum,smSize);
00087 }
00088
00089 static inline int nextpow2down (int x)
00090 {
00091 if (x < 0)
00092 return 0;
00093 --x;
00094 x |= x >> 1;
00095 x |= x >> 2;
00096 x |= x >> 4;
00097 x |= x >> 8;
00098 x |= x >> 16;
00099 x = x>>1;
00100 return x+1;
00101 }
00102
00103 static void memcpyHostToDevice(void *trg, const void *src, const int len, const int deviceNum)
00104 {
00105 setCurrentDevice(deviceNum);
00106 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyHostToDevice));
00107 }
00108
00109 static void memcpyDeviceToHost(void *trg, const void *src, const int len, const int deviceNum)
00110 {
00111 setCurrentDevice(deviceNum);
00112 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyDeviceToHost));
00113 }
00114
00115 static void memcpyDeviceToDevice(void *trg, const void *src, const int len, const int trgDev, const int srcDev)
00116 {
00117 if(trgDev == srcDev)
00118 {
00119 setCurrentDevice(srcDev);
00120 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyDeviceToDevice));
00121 }
00122 else
00123 {
00124 char *tmp;
00125 tmp = (char *) std::malloc(len);
00126 memcpyDeviceToHost(tmp,src,len,srcDev);
00127 memcpyHostToDevice(trg,tmp,len,trgDev);
00128 std::free(tmp);
00129 }
00130 }
00131
00132 static void malloc(void **ret, const int sz, const int deviceNum)
00133 {
00134 setCurrentDevice(deviceNum);
00135 cudaError err = cudaMalloc(ret,sz);
00136 if( cudaSuccess != err) {
00137 fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
00138 __FILE__, __LINE__, cudaGetErrorString( err) );
00139 printf("Attempted to malloc %d bytes on cuda device\n",sz);
00140 abort();
00141 }
00142
00143 if(*ret == NULL && sz > 0)
00144 {
00145 LFATAL("CUDA Device [%d] failed to allocate %d bytes of memory",deviceNum,sz);
00146 }
00147 }
00148
00149 static void free(void *mem, const int deviceNum)
00150 {
00151 setCurrentDevice(deviceNum);
00152 cudaError err = cudaFree(mem);
00153 if( cudaSuccess != err) {
00154 fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
00155 __FILE__, __LINE__, cudaGetErrorString( err) );
00156 printf("Attempted to free %p address on cuda device\n",mem);
00157 abort();
00158 }
00159
00160
00161 }
00162
00163 static void memset(void *mem, const char val, const int sz, const int deviceNum)
00164 {
00165 setCurrentDevice(deviceNum);
00166 CUDA_SAFE_CALL(cudaMemset(mem, val, sz) );
00167 }
00168
00169 static inline Dims getDeviceTileSize(const int deviceNum)
00170 {
00171 return deviceTileSizes[deviceNum];
00172 }
00173
00174 static inline int getDeviceSharedMemorySize(const int deviceNum)
00175 {
00176 return deviceSharedMemorySizes[deviceNum];
00177 }
00178
00179
00180 static inline Dims getDeviceTileSize1D(const int deviceNum)
00181 {
00182 Dims tile1D = Dims(deviceTileSizes[deviceNum].sz(),1);
00183 return tile1D;
00184 }
00185
00186 static inline bool isValid(const int deviceNum)
00187 {
00188 if(deviceTileSizes[deviceNum].isEmpty())
00189 return false;
00190 return true;
00191 }
00192
00193 static void displayProperties( const int deviceNum )
00194 {
00195 setCurrentDevice(deviceNum);
00196 cudaDeviceProp deviceProp;
00197 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, deviceNum));
00198 if (deviceProp.major < 1)
00199 {
00200 LINFO("Video Device [%d] does not support CUDA",deviceNum);
00201 return;
00202 }
00203 printf( "\nCUDA Device Name \t - %s ", deviceProp.name );
00204 printf( "\n**************************************");
00205 printf( "\nTotal Global Memory\t\t -%lu KB", deviceProp.totalGlobalMem/1024 );
00206 printf( "\nShared memory available per block \t - %lu KB", deviceProp.sharedMemPerBlock/1024 );
00207 printf( "\nNumber of registers per thread block \t - %d", deviceProp.regsPerBlock );
00208 printf( "\nWarp size in threads \t - %d", deviceProp.warpSize );
00209 printf( "\nMemory Pitch \t - %d bytes", (int) deviceProp.memPitch );
00210 printf( "\nMaximum threads per block \t - %d", deviceProp.maxThreadsPerBlock );
00211 printf( "\nMaximum Thread Dimension (block) \t - %d %d %d", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2] );
00212 printf( "\nMaximum Thread Dimension (grid) \t - %d %d %d", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2] );
00213 printf( "\nTotal constant memory \t - %d bytes", (int) deviceProp.totalConstMem );
00214 printf( "\nCUDA ver \t - %d.%d", deviceProp.major, deviceProp.minor );
00215 printf( "\nClock rate \t - %d KHz", deviceProp.clockRate );
00216 printf( "\nTexture Alignment \t - %d bytes", (int) deviceProp.textureAlignment );
00217 printf( "\nDevice Overlap \t - %s", deviceProp. deviceOverlap?"Allowed":"Not Allowed" );
00218 printf( "\nNumber of Multi processors \t - %d\n", deviceProp.multiProcessorCount );
00219 }
00220
00221
00222 static inline void setCurrentDevice(const int deviceNum)
00223 {
00224 if(!isValid(deviceNum)) initDevice(deviceNum);
00225 _setCurrentDevice(deviceNum);
00226 }
00227
00228
00229 private:
00230
00231 CudaDevices();
00232
00233
00234 static inline void _setCurrentDevice(const int deviceNum)
00235 {
00236
00237 if(deviceNum != currentDevice)
00238 {
00239 if(deviceNum < 0)
00240 {
00241 fprintf(stderr,"Attempting to set device to %d from %d in file '%s' in line %i.\n",deviceNum,currentDevice,
00242 __FILE__, __LINE__ );
00243 abort();
00244 }
00245
00246 CUDA_SAFE_CALL(cudaSetDevice(deviceNum));
00247 currentDevice = deviceNum;
00248 }
00249 }
00250
00251 static inline void setDeviceTileSize(const int deviceNum, const Dims tileSize)
00252 {
00253 deviceTileSizes[deviceNum] = tileSize;
00254 }
00255
00256 static inline void setDeviceSharedMemorySize(const int deviceNum, const int smSize)
00257 {
00258 deviceSharedMemorySizes[deviceNum] = smSize;
00259 }
00260
00261 static map<int,Dims> deviceTileSizes;
00262 static map<int,int> deviceSharedMemorySizes;
00263 static int currentDevice;
00264 };
00265
00266 #endif