00001 //********************************************************// 00002 // CUDA SIFT extractor by Marten Bjorkman aka Celebrandil // 00003 //********************************************************// 00004 00005 #ifndef CUDASIFTD_DEVICE_H 00006 #define CUDASIFTD_DEVICE_H 00007 00008 #include "cudaSiftD.h" 00009 00010 /////////////////////////////////////////////////////////////////////////////// 00011 // Loop unrolling templates, needed for best performance 00012 /////////////////////////////////////////////////////////////////////////////// 00013 template<int i> 00014 __device__ float ConvRow(float *data) 00015 { 00016 return data[i]*d_Kernel[i] + ConvRow<i-1>(data); 00017 } 00018 00019 template<> 00020 __device__ float ConvRow<-1>(float *data) 00021 { 00022 return 0; 00023 } 00024 00025 template<int i> 00026 __device__ float ConvCol(float *data) 00027 { 00028 return data[i*COLUMN_TILE_W]*d_Kernel[i] + ConvCol<i-1>(data); 00029 } 00030 00031 template<> 00032 __device__ float ConvCol<-1>(float *data) 00033 { 00034 return 0; 00035 } 00036 00037 /////////////////////////////////////////////////////////////////////////////// 00038 // Row convolution filter 00039 /////////////////////////////////////////////////////////////////////////////// 00040 template<int RADIUS> 00041 __global__ void ConvRowGPU(float *d_Result, float *d_Data, 00042 int width, int height) 00043 { 00044 //Data cache 00045 __shared__ float data[RADIUS+ROW_TILE_W+RADIUS+1]; 00046 //Current tile and apron limits, relative to row start 00047 const int tileStart = __mul24(blockIdx.x, ROW_TILE_W); 00048 00049 //Row start index in d_Data[] 00050 const int rowStart = __mul24(blockIdx.y, width); 00051 const int rowEnd = rowStart + width - 1; 00052 const int loadPos = threadIdx.x - WARP_SIZE + tileStart; 00053 const int smemPos = threadIdx.x - WARP_SIZE + RADIUS; 00054 00055 //Set the entire data cache contents 00056 if (smemPos>=0) { 00057 if (loadPos<0) 00058 data[smemPos] = d_Data[rowStart]; 00059 else if (loadPos>=width) 00060 data[smemPos] = d_Data[rowEnd]; 00061 else 00062 data[smemPos] = d_Data[rowStart + loadPos]; 00063 #if 0 00064 if (rowStart+loadPos<0 || (rowStart+loadPos>= width*height || 00065 smemPos<0 || (smemPos >=(RADIUS+ROW_TILE_W+RADIUS+1)) 00066 printf("smemPos: %d\n", smemPos); 00067 #endif 00068 } 00069 __syncthreads(); 00070 00071 //Clamp tile and apron limits by image borders 00072 const int tileEnd = tileStart + ROW_TILE_W - 1; 00073 const int tileEndClamped = min(tileEnd, width - 1); 00074 const int writePos = tileStart + threadIdx.x; 00075 00076 if (writePos <= tileEndClamped){ 00077 const int smemPos = threadIdx.x + RADIUS; 00078 #if 0 00079 if (rowStart+writePos<0 || rowStart+writePos>=width*height || 00080 (smemPos - RADIUS*COLUMN_TILE_W)<0 || 00081 (smemPos + RADIUS*COLUMN_TILE_W)>=(RADIUS+ROW_TILE_W+RADIUS+1)) 00082 printf("gmemPos: %d\n", rowStart+writePos); 00083 #endif 00084 d_Result[rowStart + writePos] = 00085 ConvRow<2 * RADIUS>(data + smemPos - RADIUS);; 00086 } 00087 __syncthreads(); 00088 } 00089 00090 /////////////////////////////////////////////////////////////////////////////// 00091 // Column convolution filter 00092 /////////////////////////////////////////////////////////////////////////////// 00093 template<int RADIUS> 00094 __global__ void ConvColGPU(float *d_Result, float *d_Data, int width, 00095 int height, int smemStride, int gmemStride) 00096 { 00097 // Data cache 00098 __shared__ float data[COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1)]; 00099 00100 // Current tile and apron limits, in rows 00101 const int tileStart = __mul24(blockIdx.y, COLUMN_TILE_H); 00102 const int tileEnd = tileStart + COLUMN_TILE_H - 1; 00103 const int apronStart = tileStart - RADIUS; 00104 const int apronEnd = tileEnd + RADIUS; 00105 00106 // Current column index 00107 const int columnStart = __mul24(blockIdx.x, COLUMN_TILE_W) + threadIdx.x; 00108 const int columnEnd = columnStart + __mul24(height-1, width); 00109 00110 if (columnStart<width) { 00111 // Shared and global memory indices for current column 00112 int smemPos = __mul24(threadIdx.y, COLUMN_TILE_W) + threadIdx.x; 00113 int gmemPos = __mul24(apronStart + threadIdx.y, width) + columnStart; 00114 // Cycle through the entire data cache 00115 for (int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){ 00116 if (y<0) 00117 data[smemPos] = d_Data[columnStart]; 00118 else if (y>=height) 00119 data[smemPos] = d_Data[columnEnd]; 00120 else 00121 data[smemPos] = d_Data[gmemPos]; 00122 #if 0 00123 if (columnStart<0 || columnEnd>= width*height || smemPos<0 || 00124 smemPos>=COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1) || 00125 gmemPos<0 || gmemPos>=width*height) 00126 printf("pos: %d %d\n", smemPos, gmemPos); 00127 #endif 00128 smemPos += smemStride; 00129 gmemPos += gmemStride; 00130 } 00131 } 00132 __syncthreads(); 00133 00134 if (columnStart<width) { 00135 // Shared and global memory indices for current column 00136 // Clamp tile and apron limits by image borders 00137 const int tileEndClamped = min(tileEnd, height - 1); 00138 int smemPos = __mul24(threadIdx.y + RADIUS, COLUMN_TILE_W) + threadIdx.x; 00139 int gmemPos = __mul24(tileStart + threadIdx.y , width) + columnStart; 00140 // Cycle through the tile body, clamped by image borders 00141 // Calculate and output the results 00142 for (int y=tileStart+threadIdx.y;y<=tileEndClamped;y+=blockDim.y){ 00143 #if 0 00144 if (gmemPos<0 || gmemPos>=width*height || 00145 (smemPos - RADIUS*COLUMN_TILE_W)<0 || 00146 (smemPos + RADIUS*COLUMN_TILE_W)>= 00147 (COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1))) 00148 printf("pos = %d, smemPos = %d, width = %d, height = %d\n", 00149 gmemPos, smemPos, width, height); 00150 #endif 00151 d_Result[gmemPos] = 00152 ConvCol<2*RADIUS>(data + smemPos - RADIUS*COLUMN_TILE_W);; 00153 smemPos += smemStride; 00154 gmemPos += gmemStride; 00155 } 00156 } 00157 __syncthreads(); 00158 } 00159 00160 #endif