00001 #ifndef CUDASIFTH_DEVICE_H
00002 #define CUDASIFTH_DEVICE_H
00003
00004 #include "cudaSiftD.h"
00005 #include "cudaImage.h"
00006
00007
00008
00009
00010
// Host-side entry points for the individual SIFT extraction stages.
// Only the declarations are visible in this header; the definitions live
// elsewhere. Every stage returns a double -- what that value represents is
// not visible from here (presumably a timing figure; TODO confirm against
// the definitions). Parameter names prefixed with h_/d_ presumably denote
// host and device pointers respectively -- verify against the callers.

// Stage operating on three images plus a threshold; fills `minmax`.
double Find3DMinMax(CudaArray *minmax,
                    CudaImage *data1,
                    CudaImage *data2,
                    CudaImage *data3,
                    float thresh,
                    int maxPts);

// Produces `ptrs` and `*numPts` from `minmax` (at most `maxPts` entries,
// presumably -- confirm).
double UnpackPointers(CudaArray *minmax,
                      int maxPts,
                      int *ptrs,
                      int *numPts);

// Stage taking the same three images and the pointer list; writes `sift`.
double ComputePositions(CudaImage *data1,
                        CudaImage *data2,
                        CudaImage *data3,
                        int *h_ptrs,
                        CudaArray *sift,
                        int numPts,
                        int maxPts,
                        float scale,
                        float factor);

// Takes the point count through `initNumPts` (in/out, presumably).
double RemoveEdgePoints(CudaArray *sift,
                        int *initNumPts,
                        int maxPts,
                        float edgeLimit);

double ComputeOrientations(CudaImage *img,
                           int *h_ptrs,
                           CudaArray *sift,
                           int numPts,
                           int maxPts);

// Takes the point count through `initNumPts` (in/out, presumably).
double SecondOrientations(CudaArray *sift,
                          int *initNumPts,
                          int maxPts);

double ExtractSiftDescriptors(CudaImage *img,
                              CudaArray *sift,
                              CudaArray *desc,
                              int numPts,
                              int maxPts);

double AddSiftData(SiftData *data,
                   float *d_sift,
                   float *d_desc,
                   int numPts,
                   int maxPts,
                   float subsampling);
00026
00027
00028
00029
/**
 * Applies a separable 1-D filter of 2*RADIUS+1 taps to an image in two
 * passes: a row pass (ConvRowGPU) followed by a column pass (ConvColGPU).
 *
 * @param dataA    Image whose d_data is handed to the row pass together
 *                 with its width/height (the input, presumably).
 * @param dataB    Image whose d_data is handed to the column pass as its
 *                 first argument (the final output, presumably).
 * @param temp     Image whose d_data carries the intermediate result from
 *                 the row pass to the column pass.
 * @param h_Kernel Host array of 2*RADIUS+1 filter taps; copied to the
 *                 d_Kernel symbol before the kernels are launched.
 * @return Always 0.0.
 *
 * The launch geometry is derived from dataA only; dataB and temp are
 * assumed to be at least as large as dataA -- this is not checked here.
 * NOTE(review): the row-pass block size is WARP_SIZE + ROW_TILE_W + RADIUS,
 * which presumably requires RADIUS <= WARP_SIZE -- confirm against the
 * ConvRowGPU kernel.
 */
template<int RADIUS>
double SeparableFilter(CudaImage *dataA, CudaImage *dataB,
                       CudaImage *temp, float *h_Kernel)
{
  // Validate the handles themselves before reading any of their fields.
  if (dataA==NULL || dataB==NULL || temp==NULL || h_Kernel==NULL) {
    printf("SeparableFilter: missing data\n");
    return 0.0;
  }
  float *d_DataA = dataA->d_data;
  float *d_DataB = dataB->d_data;
  float *d_Temp = temp->d_data;
  if (d_DataA==NULL || d_DataB==NULL || d_Temp==NULL) {
    printf("SeparableFilter: missing data\n");
    return 0.0;
  }

  const unsigned int width = dataA->width;
  const unsigned int height = dataA->height;
  // An empty image would yield a grid dimension of zero, which is not a
  // valid launch configuration; there is nothing to filter anyway.
  if (width==0 || height==0)
    return 0.0;

  // Upload the filter taps to the d_Kernel symbol used by both passes.
  const unsigned int kernelSize = (2*RADIUS+1)*sizeof(float);
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_Kernel, h_Kernel, kernelSize));

  // Row pass: d_DataA -> d_Temp, one block row per image row.
  dim3 blockGridRows(iDivUp(width, ROW_TILE_W), height);
  dim3 threadBlockRows(WARP_SIZE + ROW_TILE_W + RADIUS);
  ConvRowGPU<RADIUS><<<blockGridRows, threadBlockRows>>>(d_Temp,
    d_DataA, width, height);
  CUT_CHECK_ERROR("ConvRowGPU() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  // Column pass: d_Temp -> d_DataB. The two trailing arguments are strides
  // scaled by the block's y-extent (8).
  dim3 blockGridColumns(iDivUp(width, COLUMN_TILE_W),
    iDivUp(height, COLUMN_TILE_H));
  dim3 threadBlockColumns(COLUMN_TILE_W, 8);
  ConvColGPU<RADIUS><<<blockGridColumns, threadBlockColumns>>>(d_DataB,
    d_Temp, width, height, COLUMN_TILE_W*8, width*8);
  CUT_CHECK_ERROR("ConvColGPU() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  return 0.0;
}
00076
/**
 * Builds a normalized 1-D Gaussian of 2*RADIUS+1 taps with variance `var`
 * and applies it through SeparableFilter<RADIUS>.
 *
 * Mind the argument order: dataB comes first in this signature, but it is
 * forwarded as SeparableFilter's second argument, while dataA is forwarded
 * as its first.
 *
 * @param dataB Image forwarded as SeparableFilter's second image argument.
 * @param dataA Image forwarded as SeparableFilter's first image argument.
 * @param temp  Scratch image forwarded unchanged.
 * @param var   Variance of the Gaussian. Zero yields a unit impulse
 *              (centre tap 1, all others 0); negative or NaN values are
 *              rejected with a message and no filtering is performed.
 * @return The value returned by SeparableFilter, or 0.0 when `var` is
 *         rejected.
 */
template<int RADIUS>
double LowPass(CudaImage *dataB, CudaImage *dataA, CudaImage *temp, double var)
{
  // Written as a negated >= so that NaN is rejected as well.
  if (!(var >= 0.0)) {
    printf("LowPass: invalid variance\n");
    return 0.0;
  }
  float kernel[2*RADIUS+1];
  float kernelSum = 0.0f;
  for (int j=-RADIUS;j<=RADIUS;j++) {
    float tap;
    if (var > 0.0)
      tap = (float)expf(-(double)j*j/2.0/var);
    else
      // Zero variance: evaluating the formula would give 0/0 = NaN at the
      // centre tap, so use the limiting unit impulse instead.
      tap = (j==0) ? 1.0f : 0.0f;
    kernel[j+RADIUS] = tap;
    kernelSum += tap;
  }
  // The centre tap is always 1, so kernelSum >= 1 and the division is safe.
  for (int j=-RADIUS;j<=RADIUS;j++)
    kernel[j+RADIUS] /= kernelSum;
  return SeparableFilter<RADIUS>(dataA, dataB, temp, kernel);
}
00090
00091 #endif