00001 /*!@file CUDA/CudaDevices.H Static class to keep track of status and properties of CUDA devices */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/CudaDevices.H $ 00035 // $Id: CudaDevices.H 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 00039 #ifndef CUDADEVICES_H_DEFINED 00040 #define CUDADEVICES_H_DEFINED 00041 00042 #include <map> 00043 #include <list> 00044 #include <cmath> 00045 #include "Image/Dims.H" 00046 #include "CUDA/cutil.h" 00047 #include <cuda_runtime_api.h> 00048 #include "Util/log.H" 00049 #include <stdlib.h> 00050 #include <stdio.h> 00051 00052 using std::map; 00053 using std::list; 00054 00055 class CudaDevices 00056 { 00057 public: 00058 static void initDevice(const int deviceNum) 00059 { 00060 // Optimization, avoid repeated initializations 00061 if(isValid(deviceNum)) 00062 return; 00063 // Initializing CUDA devices should not happen often, we can be thorough 00064 int deviceCount; 00065 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); 00066 00067 if (deviceCount == 0) 00068 LFATAL("No devices supporting CUDA"); 00069 00070 if (deviceNum > deviceCount-1) 00071 LFATAL("Device [%d] out of valid device range [0-%d]",deviceNum,deviceCount-1); 00072 00073 cudaDeviceProp deviceProp; 00074 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, deviceNum)); 00075 00076 if (deviceProp.major < 1) 00077 LFATAL("Device [%d] does not support CUDA",deviceNum); 00078 00079 // Determine the optimal tile size for this device 00080 // Take the square root of the maximum num of threads per block and then find 00081 // the next smallest power of 2 00082 int sqSize = nextpow2down((int) sqrt(deviceProp.maxThreadsPerBlock)); 00083 Dims tileSize = Dims(sqSize,sqSize); 00084 int smSize = deviceProp.sharedMemPerBlock; 00085 setDeviceTileSize(deviceNum,tileSize); 00086 setDeviceSharedMemorySize(deviceNum,smSize); 00087 } 00088 00089 static inline int nextpow2down (int x) 00090 { 00091 if (x < 0) 00092 return 0; 00093 --x; 00094 x |= x >> 1; 00095 x |= x >> 2; 00096 x |= x >> 4; 00097 x |= x >> 8; 00098 x |= x >> 16; 00099 x = x>>1; 00100 return x+1; 00101 } 00102 00103 static void memcpyHostToDevice(void *trg, const void *src, const int len, const int deviceNum) 00104 { 00105 setCurrentDevice(deviceNum); 00106 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyHostToDevice)); 00107 } 00108 00109 static void memcpyDeviceToHost(void *trg, const void *src, const int len, const int deviceNum) 00110 { 00111 setCurrentDevice(deviceNum); 00112 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyDeviceToHost)); 00113 } 00114 00115 static void memcpyDeviceToDevice(void *trg, const void *src, const int len, const int trgDev, const int srcDev) 00116 { 00117 if(trgDev == srcDev) 00118 { 00119 setCurrentDevice(srcDev); 00120 CUDA_SAFE_CALL( cudaMemcpy(trg, src, len, cudaMemcpyDeviceToDevice)); 00121 } 00122 else 00123 { 00124 char *tmp; 00125 tmp = (char *) std::malloc(len); 00126 memcpyDeviceToHost(tmp,src,len,srcDev); 00127 memcpyHostToDevice(trg,tmp,len,trgDev); 00128 std::free(tmp); 00129 } 00130 } 00131 00132 static void malloc(void **ret, const int sz, const int deviceNum) 00133 { 00134 setCurrentDevice(deviceNum); 00135 cudaError err = cudaMalloc(ret,sz); 00136 if( cudaSuccess != err) { 00137 fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", 00138 __FILE__, __LINE__, cudaGetErrorString( err) ); 00139 printf("Attempted to malloc %d bytes on cuda device\n",sz); 00140 abort(); 00141 } 00142 //CUDA_SAFE_CALL(cudaMalloc(ret, sz) ); 00143 if(*ret == NULL && sz > 0) 00144 { 00145 LFATAL("CUDA Device [%d] failed to allocate %d bytes of memory",deviceNum,sz); 00146 } 00147 } 00148 00149 static void free(void *mem, const int deviceNum) 00150 { 00151 setCurrentDevice(deviceNum); 00152 cudaError err = cudaFree(mem); 00153 if( cudaSuccess != err) { 00154 fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", 00155 __FILE__, __LINE__, cudaGetErrorString( err) ); 00156 printf("Attempted to free %p address on cuda device\n",mem); 00157 abort(); 00158 } 00159 00160 //CUDA_SAFE_CALL(cudaFree(mem) ); 00161 } 00162 00163 static void memset(void *mem, const char val, const int sz, const int deviceNum) 00164 { 00165 setCurrentDevice(deviceNum); 00166 CUDA_SAFE_CALL(cudaMemset(mem, val, sz) ); 00167 } 00168 00169 static inline Dims getDeviceTileSize(const int deviceNum) 00170 { 00171 return deviceTileSizes[deviceNum]; 00172 } 00173 00174 static inline int getDeviceSharedMemorySize(const int deviceNum) 00175 { 00176 return deviceSharedMemorySizes[deviceNum]; 00177 } 00178 00179 00180 static inline Dims getDeviceTileSize1D(const int deviceNum) 00181 { 00182 Dims tile1D = Dims(deviceTileSizes[deviceNum].sz(),1); 00183 return tile1D; 00184 } 00185 00186 static inline bool isValid(const int deviceNum) 00187 { 00188 if(deviceTileSizes[deviceNum].isEmpty()) 00189 return false; 00190 return true; 00191 } 00192 00193 static void displayProperties( const int deviceNum ) 00194 { 00195 setCurrentDevice(deviceNum); 00196 cudaDeviceProp deviceProp; 00197 CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, deviceNum)); 00198 if (deviceProp.major < 1) 00199 { 00200 LINFO("Video Device [%d] does not support CUDA",deviceNum); 00201 return; 00202 } 00203 printf( "\nCUDA Device Name \t - %s ", deviceProp.name ); 00204 printf( "\n**************************************"); 00205 printf( "\nTotal Global Memory\t\t -%lu KB", deviceProp.totalGlobalMem/1024 ); 00206 printf( "\nShared memory available per block \t - %lu KB", deviceProp.sharedMemPerBlock/1024 ); 00207 printf( "\nNumber of registers per thread block \t - %d", deviceProp.regsPerBlock ); 00208 printf( "\nWarp size in threads \t - %d", deviceProp.warpSize ); 00209 printf( "\nMemory Pitch \t - %d bytes", (int) deviceProp.memPitch ); 00210 printf( "\nMaximum threads per block \t - %d", deviceProp.maxThreadsPerBlock ); 00211 printf( "\nMaximum Thread Dimension (block) \t - %d %d %d", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2] ); 00212 printf( "\nMaximum Thread Dimension (grid) \t - %d %d %d", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2] ); 00213 printf( "\nTotal constant memory \t - %d bytes", (int) deviceProp.totalConstMem ); 00214 printf( "\nCUDA ver \t - %d.%d", deviceProp.major, deviceProp.minor ); 00215 printf( "\nClock rate \t - %d KHz", deviceProp.clockRate ); 00216 printf( "\nTexture Alignment \t - %d bytes", (int) deviceProp.textureAlignment ); 00217 printf( "\nDevice Overlap \t - %s", deviceProp. deviceOverlap?"Allowed":"Not Allowed" ); 00218 printf( "\nNumber of Multi processors \t - %d\n", deviceProp.multiProcessorCount ); 00219 } 00220 00221 00222 static inline void setCurrentDevice(const int deviceNum) 00223 { 00224 if(!isValid(deviceNum)) initDevice(deviceNum); 00225 _setCurrentDevice(deviceNum); 00226 } 00227 00228 00229 private: 00230 // Don't allow objects of CudaDevices 00231 CudaDevices(); 00232 00233 // Don't check if our setup is valid 00234 static inline void _setCurrentDevice(const int deviceNum) 00235 { 00236 // Only change if it's different 00237 if(deviceNum != currentDevice) 00238 { 00239 if(deviceNum < 0) 00240 { 00241 fprintf(stderr,"Attempting to set device to %d from %d in file '%s' in line %i.\n",deviceNum,currentDevice, 00242 __FILE__, __LINE__ ); 00243 abort(); 00244 } 00245 00246 CUDA_SAFE_CALL(cudaSetDevice(deviceNum)); 00247 currentDevice = deviceNum; 00248 } 00249 } 00250 00251 static inline void setDeviceTileSize(const int deviceNum, const Dims tileSize) 00252 { 00253 deviceTileSizes[deviceNum] = tileSize; 00254 } 00255 00256 static inline void setDeviceSharedMemorySize(const int deviceNum, const int smSize) 00257 { 00258 deviceSharedMemorySizes[deviceNum] = smSize; 00259 } 00260 00261 static map<int,Dims> deviceTileSizes; 00262 static map<int,int> deviceSharedMemorySizes; 00263 static int currentDevice; 00264 }; 00265 00266 #endif