//********************************************************//
// CUDA SIFT extractor by Marten Bjorkman aka Celebrandil  //
//********************************************************//

#include <stdio.h>
#include <string.h>
#include <math.h>
#include "CUDA/cutil.h"

#undef VERBOSE

#include "cudaImage.h"
#include "cudaSift.h"
#include "cudaSiftD_device.h"
#include "cudaSiftH.h"
#include "cudaSiftH_device.h"

#include "cudaSiftD.cu"

void InitCuda(int argc, char **argv)
{
  //CUT_DEVICE_INIT(argc, argv); // Doesn't like the iLab command line
  CUDA_SAFE_CALL(cudaSetDevice(3)); // Use a non-display GPU
}

void ExtractSift(SiftData *siftData, CudaImage *img, int numLayers,
  int numOctaves, double initBlur, float thresh, float subsampling)
{
  int w = img->width;
  int h = img->height;
  int p = img->pitch;
  if (numOctaves>1) {
    CudaImage subImg;
    AllocCudaImage(&subImg, w/2, h/2, p/2, false, true);
    ScaleDown(&subImg, img, 0.5f);
    float totInitBlur = (float)sqrt(initBlur*initBlur + 0.5f*0.5f) / 2.0f;
    ExtractSift(siftData, &subImg, numLayers, numOctaves-1, totInitBlur,
      thresh, subsampling*2.0f);
    FreeCudaImage(&subImg);
  }
  ExtractSiftOctave(siftData, img, numLayers, initBlur, thresh, subsampling);
}

void ExtractSiftOctave(SiftData *siftData, CudaImage *img, int numLayers,
  double initBlur, float thresh, float subsampling)
{
  const double baseBlur = 1.0;
  const int maxPts = 512;
  int w = img->width;
  int h = img->height;
  int p = img->pitch;
  // Images and arrays
  CudaImage blurImg[2];
  CudaImage diffImg[3];
  CudaImage tempImg;
  CudaImage textImg;
  CudaArray sift; // { xpos, ypos, scale, strength, edge, orient1, orient2 };
  CudaArray desc;
  CudaArray minmax;

  AllocCudaImage(&blurImg[0], w, h, p, false, true);
  AllocCudaImage(&blurImg[1], w, h, p, false, true);
  AllocCudaImage(&diffImg[0], w, h, p, false, true);
  AllocCudaImage(&diffImg[1], w, h, p, false, true);
  AllocCudaImage(&diffImg[2], w, h, p, false, true);
  AllocCudaImage(&tempImg, w, h, p, false, true);
  AllocCudaImage(&minmax, w, iDivUp(h,32), w, true, true);
  AllocCudaImage(&sift, maxPts, 7, maxPts, true, true);
  AllocCudaImage(&desc, 128, maxPts, 128, true, true);
  int *ptrs = 0;
  //ptrs = (int *)malloc(sizeof(int)*maxPts);
  // Now allocates page-locked memory instead of the malloc above.
  CUDA_SAFE_CALL(cudaMallocHost((void **)&ptrs, sizeof(int)*maxPts));
  AllocCudaImage(&textImg, w, h, p, false, false);
  InitTexture(&textImg);
  CUT_CHECK_ERROR("Memory allocation failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  double var = baseBlur*baseBlur - initBlur*initBlur;
  double gpuTime = 0.0, gpuTimeDoG = 0.0, gpuTimeSift = 0.0;
  int totPts = 0;

  if (var>0.0)
    gpuTime += LowPass<2>(&blurImg[0], img, &tempImg, var);
  // Blurred images cycle through a two-image ping-pong buffer (mod 2) and
  // difference-of-Gaussians images through a three-image ring buffer (mod 3).
  for (int i=0;i<numLayers+2;i++) {
    double pSigma = baseBlur*pow(2.0, (double)(i-1)/numLayers);
    double oSigma = baseBlur*pow(2.0, (double)i/numLayers);
    double nSigma = baseBlur*pow(2.0, (double)(i+1)/numLayers);
    double var = nSigma*nSigma - oSigma*oSigma;
    if (i<=1) {
      gpuTime += LowPass<7>(&blurImg[(i+1)%2], &blurImg[i%2], &tempImg, var);
      gpuTime += Subtract(&diffImg[(i+2)%3], &blurImg[i%2], &blurImg[(i+1)%2]);
    } else {
      gpuTimeDoG += LowPass<7>(&blurImg[(i+1)%2], &blurImg[i%2], &tempImg, var);
      gpuTimeDoG += Subtract(&diffImg[(i+2)%3], &blurImg[i%2],
        &blurImg[(i+1)%2]);
      gpuTimeDoG += Find3DMinMax(&minmax, &diffImg[(i+2)%3], &diffImg[(i+1)%3],
        &diffImg[i%3], thresh, maxPts);
      int numPts = 0;
      gpuTimeSift += UnpackPointers(&minmax, maxPts, ptrs, &numPts);
      if (numPts>0) {
        gpuTimeDoG += CopyToTexture(&blurImg[i%2], &textImg, false);
        gpuTimeSift += ComputePositions(&diffImg[(i+2)%3], &diffImg[(i+1)%3],
          &diffImg[i%3], ptrs, &sift, numPts, maxPts,
          pSigma, 1.0f/numLayers);
        gpuTimeSift += RemoveEdgePoints(&sift, &numPts, maxPts, 10.0f);
        if (numPts>0) { // Need to check again?
          gpuTimeSift += ComputeOrientations(&blurImg[i%2], ptrs, &sift,
            numPts, maxPts);
          gpuTimeSift += SecondOrientations(&sift, &numPts, maxPts);
          gpuTimeSift += ExtractSiftDescriptors(&textImg, &sift, &desc,
            numPts, maxPts);
          gpuTimeSift += AddSiftData(siftData, sift.d_data, desc.d_data,
            numPts, maxPts, subsampling);
        }
      }
      totPts += numPts;
    }
  }

#if 0
  Readback(&diffImg[(2+2)%3]);
  float *imd = diffImg[(2+2)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test2.pgm",
    diffImg[(2+2)%3].h_data, w, h));
  Readback(&diffImg[(2+1)%3]);
  imd = diffImg[(2+1)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test1.pgm",
    diffImg[(2+1)%3].h_data, w, h));
  Readback(&diffImg[(2+0)%3]);
  imd = diffImg[(2+0)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test0.pgm",
    diffImg[(2+0)%3].h_data, w, h));
#endif

  CUDA_SAFE_CALL(cudaThreadSynchronize());
  FreeCudaImage(&textImg);
  //free(ptrs);
  CUDA_SAFE_CALL(cudaFreeHost(ptrs));
  FreeCudaImage(&desc);
  FreeCudaImage(&sift);
  FreeCudaImage(&minmax);
  FreeCudaImage(&tempImg);
  FreeCudaImage(&diffImg[2]);
  FreeCudaImage(&diffImg[1]);
  FreeCudaImage(&diffImg[0]);
  FreeCudaImage(&blurImg[1]);
  FreeCudaImage(&blurImg[0]);
}

void InitSiftData(SiftData *data, int num, bool host, bool dev)
{
  data->numPts = 0;
  data->maxPts = num;
  int sz = sizeof(SiftPoint)*num;
  data->h_data = NULL;
  if (host)
    //data->h_data = (SiftPoint *)malloc(sz);
    CUDA_SAFE_CALL(cudaMallocHost((void **)&data->h_data, sz));
  data->d_data = NULL;
  if (dev)
    CUDA_SAFE_CALL(cudaMalloc((void **)&data->d_data, sz));
}

void FreeSiftData(SiftData *data)
{
  if (data->d_data!=NULL)
    CUDA_SAFE_CALL(cudaFree(data->d_data));
  data->d_data = NULL;
  if (data->h_data!=NULL)
    //free(data->h_data);
    CUDA_SAFE_CALL(cudaFreeHost(data->h_data));
  data->numPts = 0;
  data->maxPts = 0;
}

double AddSiftData(SiftData *data, float *d_sift, float *d_desc,
  int numPts, int maxPts, float subsampling)
{
  int newNum = data->numPts + numPts;
  if (data->maxPts<(data->numPts+numPts)) {
    int newMaxNum = 2*data->maxPts;
    while (newNum>newMaxNum)
      newMaxNum *= 2;
    if (data->h_data!=NULL) {
      SiftPoint *h_data = 0;
      //h_data = (SiftPoint *)malloc(sizeof(SiftPoint)*newMaxNum);
      CUDA_SAFE_CALL(cudaMallocHost((void**)&h_data, sizeof(SiftPoint)*newMaxNum));
      memcpy(h_data, data->h_data, sizeof(SiftPoint)*data->numPts);
      //free(data->h_data);
      //CUDA_SAFE_CALL(cudaFree(data->h_data)); // Was incorrect, should be cudaFreeHost.
      CUDA_SAFE_CALL(cudaFreeHost(data->h_data));
      data->h_data = h_data;
    }
    if (data->d_data!=NULL) {
      SiftPoint *d_data = NULL;
      CUDA_SAFE_CALL(cudaMalloc((void**)&d_data, sizeof(SiftPoint)*newMaxNum));
      CUDA_SAFE_CALL(cudaMemcpy(d_data, data->d_data,
        sizeof(SiftPoint)*data->numPts, cudaMemcpyDeviceToDevice));
      CUDA_SAFE_CALL(cudaFree(data->d_data));
      data->d_data = d_data;
    }
    data->maxPts = newMaxNum;
  }
  int pitch = sizeof(SiftPoint);
  float *buffer;
  //buffer = (float *)malloc(sizeof(float)*3*numPts);
  CUDA_SAFE_CALL(cudaMallocHost((void**)&buffer, sizeof(float)*3*numPts));
  int bwidth = sizeof(float)*numPts;
  CUDA_SAFE_CALL(cudaMemcpy2D(buffer, bwidth, d_sift, sizeof(float)*maxPts,
    bwidth, 3, cudaMemcpyDeviceToHost));
  for (int i=0;i<3*numPts;i++)
    buffer[i] *= subsampling;
  CUDA_SAFE_CALL(cudaMemcpy2D(d_sift, sizeof(float)*maxPts, buffer, bwidth,
    bwidth, 3, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  if (data->h_data!=NULL) {
    float *ptr = (float*)&data->h_data[data->numPts];
    for (int i=0;i<6;i++)
      CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[i], pitch, &d_sift[i*maxPts], 4, 4,
        numPts, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[16], pitch, d_desc, sizeof(float)*128,
      sizeof(float)*128, numPts, cudaMemcpyDeviceToHost));
  }
  if (data->d_data!=NULL) {
    float *ptr = (float*)&data->d_data[data->numPts];
    for (int i=0;i<6;i++)
      CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[i], pitch, &d_sift[i*maxPts], 4, 4,
        numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[16], pitch, d_desc, sizeof(float)*128,
      sizeof(float)*128, numPts, cudaMemcpyDeviceToDevice));
  }
  data->numPts = newNum;
  CUDA_SAFE_CALL(cudaFreeHost(buffer));
  //free(buffer);
  return 0;
}

void PrintSiftData(SiftData *data)
{
  SiftPoint *h_data = data->h_data;
  if (data->h_data==NULL) {
    //h_data = (SiftPoint *)malloc(sizeof(SiftPoint)*data->maxPts);
    CUDA_SAFE_CALL(cudaMallocHost((void **)&h_data, sizeof(SiftPoint)*data->maxPts));
    CUDA_SAFE_CALL(cudaMemcpy(h_data, data->d_data,
      sizeof(SiftPoint)*data->numPts, cudaMemcpyDeviceToHost));
    data->h_data = h_data;
  }
  for (int i=0;i<data->numPts;i++) {
    printf("xpos = %.2f\n", h_data[i].xpos);
    printf("ypos = %.2f\n", h_data[i].ypos);
    printf("scale = %.2f\n", h_data[i].scale);
    printf("sharpness = %.2f\n", h_data[i].sharpness);
    printf("edgeness = %.2f\n", h_data[i].edgeness);
    printf("orientation = %.2f\n", h_data[i].orientation);
    float *siftData = (float*)&h_data[i].data;
    for (int j=0;j<8;j++) {
      if (j==0) printf("data = ");
      else printf("       ");
      for (int k=0;k<16;k++)
        if (siftData[j+8*k]<0.05)
          printf(" .   ");
        else
          printf("%.2f ", siftData[j+8*k]);
      printf("\n");
    }
  }
  printf("Number of available points: %d\n", data->numPts);
  printf("Number of allocated points: %d\n", data->maxPts);
}

double MatchSiftData(SiftData *data1, SiftData *data2)
{
  if (data1->d_data==NULL || data2->d_data==NULL)
    return 0.0f;
  int numPts1 = data1->numPts;
  int numPts2 = data2->numPts;
  SiftPoint *sift1 = data1->d_data;
  SiftPoint *sift2 = data2->d_data;

  float *d_corrData;
  int corrWidth = iDivUp(numPts2, 16)*16;
  int corrSize = sizeof(float)*numPts1*corrWidth;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_corrData, corrSize));
  dim3 blocks(numPts1, iDivUp(numPts2, 16));
  dim3 threads(16, 16); // each block: 1 point x 16 points
  MatchSiftPoints<<<blocks, threads>>>(sift1, sift2,
    d_corrData, numPts1, numPts2);
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  dim3 blocksMax(iDivUp(numPts1, 16));
  dim3 threadsMax(16, 16);
  FindMaxCorr<<<blocksMax, threadsMax>>>(d_corrData, sift1, sift2,
    numPts1, corrWidth, sizeof(SiftPoint));
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUT_CHECK_ERROR("MatchSiftPoints() execution failed\n");
  CUDA_SAFE_CALL(cudaFree(d_corrData));
  if (data1->h_data!=NULL) {
    float *h_ptr = &data1->h_data[0].score;
    float *d_ptr = &data1->d_data[0].score;
    CUDA_SAFE_CALL(cudaMemcpy2D(h_ptr, sizeof(SiftPoint), d_ptr,
      sizeof(SiftPoint), 5*sizeof(float), data1->numPts,
      cudaMemcpyDeviceToHost));
  }

  return 0;
}

double FindHomography(SiftData *data, float *homography, int *numMatches,
  int numLoops, float minScore, float maxAmbiguity, float thresh)
{
  *numMatches = 0;
  homography[0] = homography[4] = homography[8] = 1.0f;
  homography[1] = homography[2] = homography[3] = 0.0f;
  homography[5] = homography[6] = homography[7] = 0.0f;
  if (data->d_data==NULL)
    return 0.0f;
  SiftPoint *d_sift = data->d_data;
  numLoops = iDivUp(numLoops,16)*16;
  int numPts = data->numPts;
  if (numPts<8)
    return 0.0f;
  int numPtsUp = iDivUp(numPts, 16)*16;
  float *d_coord, *d_homo;
  int *d_randPts, *h_randPts;
  int randSize = 4*sizeof(int)*numLoops;
  int szFl = sizeof(float);
  int szPt = sizeof(SiftPoint);
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_coord, 4*sizeof(float)*numPtsUp));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_randPts, randSize));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_homo, 8*sizeof(float)*numLoops));
  h_randPts = (int*)malloc(randSize);
  float *h_scores = (float *)malloc(sizeof(float)*numPtsUp);
  float *h_ambiguities = (float *)malloc(sizeof(float)*numPtsUp);
  CUDA_SAFE_CALL(cudaMemcpy2D(h_scores, szFl, &d_sift[0].score, szPt,
    szFl, numPts,
    cudaMemcpyDeviceToHost));
  CUDA_SAFE_CALL(cudaMemcpy2D(h_ambiguities, szFl, &d_sift[0].ambiguity, szPt,
    szFl, numPts, cudaMemcpyDeviceToHost));
  int *validPts = (int *)malloc(sizeof(int)*numPts);
  int numValid = 0;
  for (int i=0;i<numPts;i++) {
    if (h_scores[i]>minScore && h_ambiguities[i]<maxAmbiguity)
      validPts[numValid++] = i;
  }
  free(h_scores);
  free(h_ambiguities);
  if (numValid>=8) {
    for (int i=0;i<numLoops;i++) {
      int p1 = rand() % numValid;
      int p2 = rand() % numValid;
      int p3 = rand() % numValid;
      int p4 = rand() % numValid;
      while (p2==p1) p2 = rand() % numValid;
      while (p3==p1 || p3==p2) p3 = rand() % numValid;
      while (p4==p1 || p4==p2 || p4==p3) p4 = rand() % numValid;
      h_randPts[i+0*numLoops] = validPts[p1];
      h_randPts[i+1*numLoops] = validPts[p2];
      h_randPts[i+2*numLoops] = validPts[p3];
      h_randPts[i+3*numLoops] = validPts[p4];
    }
    CUDA_SAFE_CALL(cudaMemcpy(d_randPts, h_randPts, randSize,
      cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[0*numPtsUp], szFl, &d_sift[0].xpos,
      szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[1*numPtsUp], szFl, &d_sift[0].ypos,
      szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[2*numPtsUp], szFl,
      &d_sift[0].match_xpos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[3*numPtsUp], szFl,
      &d_sift[0].match_ypos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    ComputeHomographies<<<numLoops/16, 16>>>(d_coord, d_randPts,
      d_homo, numPtsUp);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("ComputeHomographies() execution failed\n");
    dim3 blocks(1, numLoops/TESTHOMO_LOOPS);
    dim3 threads(TESTHOMO_TESTS, TESTHOMO_LOOPS);
    TestHomographies<<<blocks, threads>>>(d_coord, d_homo,
      d_randPts, numPtsUp, thresh*thresh);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("TestHomographies() execution failed\n");
    CUDA_SAFE_CALL(cudaMemcpy(h_randPts, d_randPts, sizeof(int)*numLoops,
      cudaMemcpyDeviceToHost));
    int maxIndex = -1, maxCount = -1;
    for (int i=0;i<numLoops;i++)
      if (h_randPts[i]>maxCount) {
        maxCount = h_randPts[i];
        maxIndex = i;
      }
    CUDA_SAFE_CALL(cudaMemcpy2D(homography, szFl, &d_homo[maxIndex],
      sizeof(float)*numLoops, szFl, 8, cudaMemcpyDeviceToHost));
  }
  free(validPts);
  free(h_randPts);
  CUDA_SAFE_CALL(cudaFree(d_homo));
  CUDA_SAFE_CALL(cudaFree(d_randPts));
  CUDA_SAFE_CALL(cudaFree(d_coord));

  return 0;
}

///////////////////////////////////////////////////////////////////////////////
// Host side master functions
///////////////////////////////////////////////////////////////////////////////

double LowPass5(CudaImage *res, CudaImage *data, float variance)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("LowPass5: missing data\n");
    return 0.0;
  }
  float h_Kernel[5];
  float kernelSum = 0.0f;
  for (int j=0;j<5;j++) {
    h_Kernel[j] = (float)expf(-(double)(j-2)*(j-2)/2.0/variance);
    kernelSum += h_Kernel[j];
  }
  for (int j=0;j<5;j++)
    h_Kernel[j] /= kernelSum;
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_Kernel, h_Kernel,
    5*sizeof(float)));
  dim3 blocks(iDivUp(w, LOWPASS5_DX), iDivUp(h, LOWPASS5_DY));
  dim3 threads(LOWPASS5_DX + WARP_SIZE + 2);
  LowPass5<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("LowPass5() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  return 0;
}

double ScaleDown(CudaImage *res, CudaImage *data, float variance)
{
  int w = data->width;
  int h = data->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("ScaleDown: missing data\n");
    return 0.0;
  }
  float h_Kernel[5];
  float kernelSum = 0.0f;
  for (int j=0;j<5;j++) {
    h_Kernel[j] = (float)expf(-(double)(j-2)*(j-2)/2.0/variance);
    kernelSum += h_Kernel[j];
  }
  for (int j=0;j<5;j++)
    h_Kernel[j] /= kernelSum;
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_Kernel, h_Kernel, 5*sizeof(float)));
  dim3 blocks(iDivUp(w, LOWPASS5_DX), iDivUp(h, LOWPASS5_DY));
  dim3 threads(LOWPASS5_DX + WARP_SIZE + 2);
  //printf("File %s, Number %d\n", __FILE__, __LINE__);
  //printf("blocks.x=%d, blocks.y=%d, threads.x=%d, res->d_data=%x, data->d_data=%x, w=%d, h=%d\n",
  //       blocks.x, blocks.y, threads.x, res->d_data, data->d_data, w, h);
  ScaleDown<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("ScaleDown() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  return 0;
}

double Subtract(CudaImage *res, CudaImage *dataA, CudaImage *dataB)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || dataA->d_data==NULL || dataB->d_data==NULL) {
    printf("Subtract: missing data\n");
    return 0.0;
  }
  dim3 blocks(iDivUp(w, 16), iDivUp(h, 16));
  dim3 threads(16, 16);
  Subtract<<<blocks, threads>>>(res->d_data, dataA->d_data, dataB->d_data,
    w, h);
  CUT_CHECK_ERROR("Subtract() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  return 0;
}

double MultiplyAdd(CudaImage *res, CudaImage *data, float constA, float constB)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("MultiplyAdd: missing data\n");
    return 0.0;
  }
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, &constA, sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantB, &constB, sizeof(float)));

  dim3 blocks(iDivUp(w, 16), iDivUp(h, 16));
  dim3 threads(16, 16);
  MultiplyAdd<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("MultiplyAdd() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  return 0;
}

double FindMinMax(CudaImage *img, float *minval, float *maxval)
{
  int w = img->width;
  int h = img->height;

  int dx = iDivUp(w, 128);
  int dy = iDivUp(h, 16);
  int sz = 2*dx*dy*sizeof(float);
  float *d_minmax = 0;
  float *h_minmax = 0;
  //h_minmax = (float *)malloc(sz);
  CUDA_SAFE_CALL(cudaMallocHost((void **)&h_minmax, sz));
  for (int i=0;i<2*dx*dy;i+=2) {
    h_minmax[i+0] = 1e6;
    h_minmax[i+1] = -1e6;
  }
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_minmax, sz));
  CUDA_SAFE_CALL(cudaMemcpy(d_minmax, h_minmax, sz, cudaMemcpyHostToDevice));

  dim3 blocks(dx, dy);
  dim3 threads(128, 1);
  FindMinMax<<<blocks, threads>>>(d_minmax,
    img->d_data, w, h);
  CUT_CHECK_ERROR("FindMinMax() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  CUDA_SAFE_CALL(cudaMemcpy(h_minmax, d_minmax, sz, cudaMemcpyDeviceToHost));
  *minval = 1e6;
  *maxval = -1e6;
  for (int i=0;i<2*dx*dy;i+=2) {
    if (h_minmax[i+0]<*minval)
      *minval = h_minmax[i];
    if (h_minmax[i+1]>*maxval)
      *maxval = h_minmax[i+1];
  }

  CUDA_SAFE_CALL(cudaFree(d_minmax));
  CUDA_SAFE_CALL(cudaFreeHost(h_minmax));
  return 0;
}

double Find3DMinMax(CudaArray *minmax, CudaImage *data1, CudaImage *data2,
  CudaImage *data3, float thresh, int maxPts)
{
  int *h_res = (int *)minmax->h_data;
  int *d_res = (int *)minmax->d_data;
  if (data1->d_data==NULL || data2->d_data==NULL || data3->d_data==NULL ||
      h_res==NULL || d_res==NULL) {
    printf("Find3DMinMax: missing data %p %p %p %p %p\n",
      data1->d_data, data2->d_data, data3->d_data, h_res, d_res);
    return 0.0;
  }
  int w = data1->width;
  int h = data1->height;
  float threshs[2] = { thresh, -thresh };
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, &threshs, 2*sizeof(float)));

  dim3 blocks(iDivUp(w, MINMAX_SIZE), iDivUp(h,32));
  dim3 threads(WARP_SIZE + MINMAX_SIZE + 1);
  Find3DMinMax<<<blocks, threads>>>(d_res, data1->d_data, data2->d_data,
    data3->d_data, w, h);
  CUT_CHECK_ERROR("Find3DMinMax() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaMemcpy(h_res, d_res,
    sizeof(int)*minmax->pitch*minmax->height, cudaMemcpyDeviceToHost));
  return 0;
}

double UnpackPointers(CudaArray *minmax, int maxPts, int *ptrs, int *numPts)
{
  unsigned int *minmax_data = (unsigned int *)minmax->h_data;
  if (minmax_data==NULL || ptrs==NULL) {
    printf("UnpackPointers: missing data %p %p\n", minmax_data, ptrs);
    return 0.0;
  }
  int w = minmax->pitch;
  int h = 32*minmax->height;
  int num = 0;
  for (int y=0;y<h/32;y++) {
    for (int x=0;x<w;x++) {
      unsigned int val = minmax_data[y*w+x];
      if (val) {
        //printf("%d %d %08x\n", x, y, val);
        for (int k=0;k<32;k++) {
          if (val&0x1 && num<maxPts)
            ptrs[num++] = (y*32+k)*w + x;
          val >>= 1;
        }
      }
    }
  }
  *numPts = num;
  return 0;
}

double ComputePositions(CudaImage *data1, CudaImage *data2, CudaImage *data3,
  int *h_ptrs, CudaArray *sift, int numPts, int maxPts, float scale,
  float factor)
{
  int w = data1->width;
  int h = data1->height;
  int *d_ptrs = 0;
  float *d_sift = sift->d_data;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_ptrs, sizeof(int)*numPts));
  CUDA_SAFE_CALL(cudaMemcpy(d_ptrs, h_ptrs, sizeof(int)*numPts,
    cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, &scale, sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantB, &factor, sizeof(float)));

  dim3 blocks(iDivUp(numPts, POSBLK_SIZE));
  dim3 threads(POSBLK_SIZE);
  ComputePositions<<<blocks, threads>>>(data1->d_data, data2->d_data,
    data3->d_data, d_ptrs, d_sift, numPts, maxPts, w, h);
  CUT_CHECK_ERROR("ComputePositions() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaFree(d_ptrs)); // Release the temporary device pointer list

  return 0;
}

double RemoveEdgePoints(CudaArray *sift, int *initNumPts, int maxPts,
  float edgeLimit)
{
  int numPts = *initNumPts;
  float *d_sift = sift->d_data;
  int bw = sizeof(float)*numPts;
  float *h_sift = 0;
  //h_sift = (float *)malloc(5*bw);
  CUDA_SAFE_CALL(cudaMallocHost((void **)&h_sift, 5*bw));
  CUDA_SAFE_CALL(cudaMemcpy2D(h_sift, bw, d_sift, sizeof(float)*maxPts,
    bw, 5, cudaMemcpyDeviceToHost));
  int num = 0;
  for (int i=0;i<numPts;i++)
    if (h_sift[4*numPts+i]<edgeLimit) {
      for (int j=0;j<5;j++)
        h_sift[j*numPts+num] = h_sift[j*numPts+i];
      num++;
    }
  CUDA_SAFE_CALL(cudaMemcpy2D(d_sift, sizeof(float)*maxPts, h_sift, bw,
    bw, 5, cudaMemcpyHostToDevice));
  //free(h_sift);
  CUDA_SAFE_CALL(cudaFreeHost(h_sift));
  *initNumPts = num;
  return 0;
}

double ComputeOrientations(CudaImage *img, int *h_ptrs, CudaArray *sift,
  int numPts, int maxPts)
{
  int w = img->pitch;
  int h = img->height;
  int *d_ptrs = 0;
  float *d_orient = &sift->d_data[5*maxPts];
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_ptrs, sizeof(int)*numPts));
  CUDA_SAFE_CALL(cudaMemcpy(d_ptrs, h_ptrs, sizeof(int)*numPts,
    cudaMemcpyHostToDevice));

  dim3 blocks(numPts);
  dim3 threads(32);
  ComputeOrientations<<<blocks, threads>>>(img->d_data,
    d_ptrs, d_orient, maxPts, w, h);
  CUT_CHECK_ERROR("ComputeOrientations() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaFree(d_ptrs)); // Release the temporary device pointer list

  return 0;
}

double SecondOrientations(CudaArray *sift, int *initNumPts, int maxPts)
{
  int numPts = *initNumPts;
  int numPts2 = 2*numPts;
  float *d_sift = sift->d_data;
  int bw = sizeof(float)*numPts2;
  float *h_sift = 0;
  h_sift = (float *)malloc(7*bw);
  //CUDA_SAFE_CALL(cudaMallocHost((void **)&h_sift, 7*bw));
  CUDA_SAFE_CALL(cudaMemcpy2D(h_sift, bw, d_sift, sizeof(float)*maxPts,
    sizeof(float)*numPts, 7, cudaMemcpyDeviceToHost));
  int num = numPts;
  for (int i=0;i<numPts;i++)
    if (h_sift[6*numPts2+i]>=0.0f && num<maxPts) {
      for (int j=0;j<5;j++)
        h_sift[j*numPts2+num] = h_sift[j*numPts2+i];
      h_sift[5*numPts2+num] = h_sift[6*numPts2+i];
      h_sift[6*numPts2+num] = -1.0f;
      num++;
    }
  CUDA_SAFE_CALL(cudaMemcpy2D(&d_sift[numPts], sizeof(float)*maxPts,
    &h_sift[numPts], bw, sizeof(float)*(num-numPts), 7,
    cudaMemcpyHostToDevice));
  free(h_sift);
  //CUDA_SAFE_CALL(cudaFreeHost(h_sift));
  *initNumPts = num;
  return 0;
}

double ExtractSiftDescriptors(CudaImage *img, CudaArray *sift,
  CudaArray *desc, int numPts, int maxPts)
{
  float *d_sift = sift->d_data, *d_desc = desc->d_data;

  tex.addressMode[0] = cudaAddressModeClamp;
  tex.addressMode[1] = cudaAddressModeClamp;
  tex.filterMode = cudaFilterModeLinear;
  tex.normalized = false;
  CUDA_SAFE_CALL(cudaBindTextureToArray(tex, (cudaArray *)img->t_data));

  dim3 blocks(numPts);
  dim3 threads(16);
  ExtractSiftDescriptors<<<blocks, threads>>>(img->d_data,
    d_sift, d_desc, maxPts);
  CUT_CHECK_ERROR("ExtractSiftDescriptors() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaUnbindTexture(tex));

  return 0;
}
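
// Example usage (illustrative sketch, not part of the original extractor): the
// function below shows one way the host-side entry points defined in this file
// can be chained together for two images that have already been loaded into
// CudaImage structures elsewhere (see cudaImage.h). The name ExampleSiftPipeline
// and the parameter values (128 initial points, 5 layers, 6 octaves, and the
// RANSAC settings passed to FindHomography) are assumptions chosen purely for
// illustration, not values prescribed by this code.
void ExampleSiftPipeline(int argc, char **argv, CudaImage *img1, CudaImage *img2)
{
  InitCuda(argc, argv);

  SiftData siftData1, siftData2;
  InitSiftData(&siftData1, 128, true, true); // host + device storage; grows in AddSiftData
  InitSiftData(&siftData2, 128, true, true);

  // Extract features over 6 octaves with 5 layers per octave.
  ExtractSift(&siftData1, img1, 5, 6, 1.0, 0.01f, 1.0f);
  ExtractSift(&siftData2, img2, 5, 6, 1.0, 0.01f, 1.0f);

  // Match siftData1 against siftData2, then fit a homography with RANSAC.
  MatchSiftData(&siftData1, &siftData2);
  float homography[9];
  int numMatches = 0;
  FindHomography(&siftData1, homography, &numMatches, 1000, 0.85f, 0.95f, 5.0f);

  PrintSiftData(&siftData1);
  FreeSiftData(&siftData1);
  FreeSiftData(&siftData2);
}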