cudaSiftD_device.h
00001
00002
00003
00004
00005 #ifndef CUDASIFTD_DEVICE_H
00006 #define CUDASIFTD_DEVICE_H
00007
00008 #include "cudaSiftD.h"
00009
00010
00011
00012
00013 template<int i>
00014 __device__ float ConvRow(float *data)
00015 {
00016 return data[i]*d_Kernel[i] + ConvRow<i-1>(data);
00017 }
00018
00019 template<>
00020 __device__ float ConvRow<-1>(float *data)
00021 {
00022 return 0;
00023 }
00024
00025 template<int i>
00026 __device__ float ConvCol(float *data)
00027 {
00028 return data[i*COLUMN_TILE_W]*d_Kernel[i] + ConvCol<i-1>(data);
00029 }
00030
00031 template<>
00032 __device__ float ConvCol<-1>(float *data)
00033 {
00034 return 0;
00035 }
00036
00037
00038
00039
00040 template<int RADIUS>
00041 __global__ void ConvRowGPU(float *d_Result, float *d_Data,
00042 int width, int height)
00043 {
00044
00045 __shared__ float data[RADIUS+ROW_TILE_W+RADIUS+1];
00046
00047 const int tileStart = __mul24(blockIdx.x, ROW_TILE_W);
00048
00049
00050 const int rowStart = __mul24(blockIdx.y, width);
00051 const int rowEnd = rowStart + width - 1;
00052 const int loadPos = threadIdx.x - WARP_SIZE + tileStart;
00053 const int smemPos = threadIdx.x - WARP_SIZE + RADIUS;
00054
00055
00056 if (smemPos>=0) {
00057 if (loadPos<0)
00058 data[smemPos] = d_Data[rowStart];
00059 else if (loadPos>=width)
00060 data[smemPos] = d_Data[rowEnd];
00061 else
00062 data[smemPos] = d_Data[rowStart + loadPos];
00063 #if 0
00064 if (rowStart+loadPos<0 || (rowStart+loadPos>= width*height ||
00065 smemPos<0 || (smemPos >=(RADIUS+ROW_TILE_W+RADIUS+1))
00066 printf("smemPos: %d\n", smemPos);
00067 #endif
00068 }
00069 __syncthreads();
00070
00071
00072 const int tileEnd = tileStart + ROW_TILE_W - 1;
00073 const int tileEndClamped = min(tileEnd, width - 1);
00074 const int writePos = tileStart + threadIdx.x;
00075
00076 if (writePos <= tileEndClamped){
00077 const int smemPos = threadIdx.x + RADIUS;
00078 #if 0
00079 if (rowStart+writePos<0 || rowStart+writePos>=width*height ||
00080 (smemPos - RADIUS*COLUMN_TILE_W)<0 ||
00081 (smemPos + RADIUS*COLUMN_TILE_W)>=(RADIUS+ROW_TILE_W+RADIUS+1))
00082 printf("gmemPos: %d\n", rowStart+writePos);
00083 #endif
00084 d_Result[rowStart + writePos] =
00085 ConvRow<2 * RADIUS>(data + smemPos - RADIUS);;
00086 }
00087 __syncthreads();
00088 }
00089
00090
00091
00092
00093 template<int RADIUS>
00094 __global__ void ConvColGPU(float *d_Result, float *d_Data, int width,
00095 int height, int smemStride, int gmemStride)
00096 {
00097
00098 __shared__ float data[COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1)];
00099
00100
00101 const int tileStart = __mul24(blockIdx.y, COLUMN_TILE_H);
00102 const int tileEnd = tileStart + COLUMN_TILE_H - 1;
00103 const int apronStart = tileStart - RADIUS;
00104 const int apronEnd = tileEnd + RADIUS;
00105
00106
00107 const int columnStart = __mul24(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00108 const int columnEnd = columnStart + __mul24(height-1, width);
00109
00110 if (columnStart<width) {
00111
00112 int smemPos = __mul24(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
00113 int gmemPos = __mul24(apronStart + threadIdx.y, width) + columnStart;
00114
00115 for (int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
00116 if (y<0)
00117 data[smemPos] = d_Data[columnStart];
00118 else if (y>=height)
00119 data[smemPos] = d_Data[columnEnd];
00120 else
00121 data[smemPos] = d_Data[gmemPos];
00122 #if 0
00123 if (columnStart<0 || columnEnd>= width*height || smemPos<0 ||
00124 smemPos>=COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1) ||
00125 gmemPos<0 || gmemPos>=width*height)
00126 printf("pos: %d %d\n", smemPos, gmemPos);
00127 #endif
00128 smemPos += smemStride;
00129 gmemPos += gmemStride;
00130 }
00131 }
00132 __syncthreads();
00133
00134 if (columnStart<width) {
00135
00136
00137 const int tileEndClamped = min(tileEnd, height - 1);
00138 int smemPos = __mul24(threadIdx.y + RADIUS, COLUMN_TILE_W) + threadIdx.x;
00139 int gmemPos = __mul24(tileStart + threadIdx.y , width) + columnStart;
00140
00141
00142 for (int y=tileStart+threadIdx.y;y<=tileEndClamped;y+=blockDim.y){
00143 #if 0
00144 if (gmemPos<0 || gmemPos>=width*height ||
00145 (smemPos - RADIUS*COLUMN_TILE_W)<0 ||
00146 (smemPos + RADIUS*COLUMN_TILE_W)>=
00147 (COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1)))
00148 printf("pos = %d, smemPos = %d, width = %d, height = %d\n",
00149 gmemPos, smemPos, width, height);
00150 #endif
00151 d_Result[gmemPos] =
00152 ConvCol<2*RADIUS>(data + smemPos - RADIUS*COLUMN_TILE_W);;
00153 smemPos += smemStride;
00154 gmemPos += gmemStride;
00155 }
00156 }
00157 __syncthreads();
00158 }
00159
00160 #endif