cudaSiftD_device.h

00001 //********************************************************//
00002 // CUDA SIFT extractor by Marten Bjorkman aka Celebrandil //
00003 //********************************************************//
00004 
00005 #ifndef CUDASIFTD_DEVICE_H
00006 #define CUDASIFTD_DEVICE_H
00007 
00008 #include "cudaSiftD.h"
00009 
00010 ///////////////////////////////////////////////////////////////////////////////
00011 // Loop unrolling templates, needed for best performance
00012 ///////////////////////////////////////////////////////////////////////////////
00013 template<int i>
00014 __device__ float ConvRow(float *data)
00015 {
00016   return data[i]*d_Kernel[i] + ConvRow<i-1>(data);
00017 }
00018 
00019 template<>
00020 __device__ float ConvRow<-1>(float *data)
00021 {
00022   return 0;
00023 }
00024 
00025 template<int i>
00026 __device__ float ConvCol(float *data)
00027 {
00028   return data[i*COLUMN_TILE_W]*d_Kernel[i] + ConvCol<i-1>(data);
00029 }
00030 
00031 template<>
00032 __device__ float ConvCol<-1>(float *data)
00033 {
00034   return 0;
00035 }
00036 
00037 ///////////////////////////////////////////////////////////////////////////////
00038 // Row convolution filter
00039 ///////////////////////////////////////////////////////////////////////////////
00040 template<int RADIUS>
00041 __global__ void ConvRowGPU(float *d_Result, float *d_Data,
00042                            int width, int height)
00043 {
00044   //Data cache
00045   __shared__ float data[RADIUS+ROW_TILE_W+RADIUS+1];
00046   //Current tile and apron limits, relative to row start
00047   const int tileStart = __mul24(blockIdx.x, ROW_TILE_W);
00048 
00049   //Row start index in d_Data[]
00050   const int rowStart = __mul24(blockIdx.y, width);
00051   const int rowEnd = rowStart + width - 1;
00052   const int loadPos = threadIdx.x - WARP_SIZE + tileStart;
00053   const int smemPos =  threadIdx.x - WARP_SIZE + RADIUS;
00054 
00055   //Set the entire data cache contents
00056   if (smemPos>=0) {
00057     if (loadPos<0)
00058       data[smemPos] = d_Data[rowStart];
00059     else if (loadPos>=width)
00060       data[smemPos] = d_Data[rowEnd];
00061     else
00062       data[smemPos] = d_Data[rowStart + loadPos];
00063 #if 0
00064     if (rowStart+loadPos<0 || (rowStart+loadPos>= width*height ||
00065                                smemPos<0 || (smemPos >=(RADIUS+ROW_TILE_W+RADIUS+1))
00066                                printf("smemPos: %d\n", smemPos);
00067 #endif
00068                                }
00069         __syncthreads();
00070 
00071         //Clamp tile and apron limits by image borders
00072         const int tileEnd = tileStart + ROW_TILE_W - 1;
00073         const int tileEndClamped = min(tileEnd, width - 1);
00074         const int writePos = tileStart + threadIdx.x;
00075 
00076         if (writePos <= tileEndClamped){
00077           const int smemPos = threadIdx.x + RADIUS;
00078 #if 0
00079           if (rowStart+writePos<0 || rowStart+writePos>=width*height ||
00080               (smemPos - RADIUS*COLUMN_TILE_W)<0 ||
00081               (smemPos + RADIUS*COLUMN_TILE_W)>=(RADIUS+ROW_TILE_W+RADIUS+1))
00082             printf("gmemPos: %d\n", rowStart+writePos);
00083 #endif
00084           d_Result[rowStart + writePos] =
00085             ConvRow<2 * RADIUS>(data + smemPos - RADIUS);;
00086         }
00087         __syncthreads();
00088         }
00089 
00090     ///////////////////////////////////////////////////////////////////////////////
00091     // Column convolution filter
00092     ///////////////////////////////////////////////////////////////////////////////
00093     template<int RADIUS>
00094       __global__ void ConvColGPU(float *d_Result, float *d_Data, int width,
00095                                  int height, int smemStride, int gmemStride)
00096       {
00097         // Data cache
00098         __shared__ float data[COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1)];
00099 
00100         // Current tile and apron limits, in rows
00101         const int tileStart = __mul24(blockIdx.y, COLUMN_TILE_H);
00102         const int tileEnd = tileStart + COLUMN_TILE_H - 1;
00103         const int apronStart = tileStart - RADIUS;
00104         const int apronEnd = tileEnd + RADIUS;
00105 
00106         // Current column index
00107         const int columnStart = __mul24(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00108         const int columnEnd = columnStart + __mul24(height-1, width);
00109 
00110         if (columnStart<width) {
00111           // Shared and global memory indices for current column
00112           int smemPos = __mul24(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
00113           int gmemPos = __mul24(apronStart + threadIdx.y, width) + columnStart;
00114           // Cycle through the entire data cache
00115           for (int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y){
00116             if (y<0)
00117               data[smemPos] = d_Data[columnStart];
00118             else if (y>=height)
00119               data[smemPos] = d_Data[columnEnd];
00120             else
00121               data[smemPos] = d_Data[gmemPos];
00122 #if 0
00123             if (columnStart<0 || columnEnd>= width*height || smemPos<0 ||
00124                 smemPos>=COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1) ||
00125                 gmemPos<0 || gmemPos>=width*height)
00126               printf("pos: %d %d\n", smemPos, gmemPos);
00127 #endif
00128             smemPos += smemStride;
00129             gmemPos += gmemStride;
00130           }
00131         }
00132         __syncthreads();
00133 
00134         if (columnStart<width) {
00135           // Shared and global memory indices for current column
00136           // Clamp tile and apron limits by image borders
00137           const int tileEndClamped = min(tileEnd, height - 1);
00138           int smemPos = __mul24(threadIdx.y + RADIUS, COLUMN_TILE_W) + threadIdx.x;
00139           int gmemPos = __mul24(tileStart + threadIdx.y , width) + columnStart;
00140           // Cycle through the tile body, clamped by image borders
00141           // Calculate and output the results
00142           for (int y=tileStart+threadIdx.y;y<=tileEndClamped;y+=blockDim.y){
00143 #if 0
00144             if (gmemPos<0 || gmemPos>=width*height ||
00145                 (smemPos - RADIUS*COLUMN_TILE_W)<0 ||
00146                 (smemPos + RADIUS*COLUMN_TILE_W)>=
00147                 (COLUMN_TILE_W*(RADIUS + COLUMN_TILE_H + RADIUS+1)))
00148               printf("pos = %d, smemPos = %d, width = %d, height = %d\n",
00149                      gmemPos, smemPos, width, height);
00150 #endif
00151             d_Result[gmemPos] =
00152               ConvCol<2*RADIUS>(data + smemPos - RADIUS*COLUMN_TILE_W);;
00153             smemPos += smemStride;
00154             gmemPos += gmemStride;
00155           }
00156         }
00157         __syncthreads();
00158       }
00159 
00160 #endif