#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "CUDA/cutil.h"

#undef VERBOSE

#include "cudaImage.h"
#include "cudaSift.h"
#include "cudaSiftD_device.h"
#include "cudaSiftH.h"
#include "cudaSiftH_device.h"

#include "cudaSiftD.cu"

void InitCuda(int argc, char **argv)
{
  // The device index is hard-coded here; argc and argv are currently unused.
  CUDA_SAFE_CALL(cudaSetDevice(3));
}

void ExtractSift(SiftData *siftData, CudaImage *img, int numLayers,
  int numOctaves, double initBlur, float thresh, float subsampling)
{
  int w = img->width;
  int h = img->height;
  int p = img->pitch;
  if (numOctaves>1) {
    // Recurse on a half-resolution copy first, so coarser octaves are
    // processed before this one.
    CudaImage subImg;
    AllocCudaImage(&subImg, w/2, h/2, p/2, false, true);
    ScaleDown(&subImg, img, 0.5f);
    float totInitBlur = (float)sqrt(initBlur*initBlur + 0.5f*0.5f) / 2.0f;
    ExtractSift(siftData, &subImg, numLayers, numOctaves-1, totInitBlur,
      thresh, subsampling*2.0f);
    FreeCudaImage(&subImg);
  }
  ExtractSiftOctave(siftData, img, numLayers, initBlur, thresh, subsampling);
}
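
// A minimal usage sketch (not part of this file's API): it assumes the
// helpers declared in cudaImage.h/cudaSift.h behave as they are used above
// (AllocCudaImage, FreeCudaImage, InitSiftData, FreeSiftData) and that the
// caller already holds a w*h float image in host memory. Parameter values
// are illustrative only.
#if 0
void ExampleExtract(float *h_pixels, int w, int h)
{
  InitCuda(0, NULL);
  CudaImage img;
  AllocCudaImage(&img, w, h, w, false, true);          // device-only image
  CUDA_SAFE_CALL(cudaMemcpy(img.d_data, h_pixels, sizeof(float)*w*h,
    cudaMemcpyHostToDevice));
  SiftData siftData;
  InitSiftData(&siftData, 1024, true, true);           // host + device buffers
  ExtractSift(&siftData, &img, 3, 3, 0.5, 5.0f, 1.0f);
  PrintSiftData(&siftData);
  FreeSiftData(&siftData);
  FreeCudaImage(&img);
}
#endif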

void ExtractSiftOctave(SiftData *siftData, CudaImage *img, int numLayers,
  double initBlur, float thresh, float subsampling)
{
  const double baseBlur = 1.0;
  const int maxPts = 512;
  int w = img->width;
  int h = img->height;
  int p = img->pitch;

  CudaImage blurImg[2];
  CudaImage diffImg[3];
  CudaImage tempImg;
  CudaImage textImg;
  CudaArray sift;
  CudaArray desc;
  CudaArray minmax;

  AllocCudaImage(&blurImg[0], w, h, p, false, true);
  AllocCudaImage(&blurImg[1], w, h, p, false, true);
  AllocCudaImage(&diffImg[0], w, h, p, false, true);
  AllocCudaImage(&diffImg[1], w, h, p, false, true);
  AllocCudaImage(&diffImg[2], w, h, p, false, true);
  AllocCudaImage(&tempImg, w, h, p, false, true);
  AllocCudaImage(&minmax, w, iDivUp(h,32), w, true, true);
  AllocCudaImage(&sift, maxPts, 7, maxPts, true, true);
  AllocCudaImage(&desc, 128, maxPts, 128, true, true);
  int *ptrs = 0;
  CUDA_SAFE_CALL(cudaMallocHost((void **)&ptrs, sizeof(int)*maxPts));
  AllocCudaImage(&textImg, w, h, p, false, false);
  InitTexture(&textImg);
  CUT_CHECK_ERROR("Memory allocation failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  double var = baseBlur*baseBlur - initBlur*initBlur;
  double gpuTime = 0.0, gpuTimeDoG = 0.0, gpuTimeSift = 0.0;
  int totPts = 0;

  if (var>0.0)
    gpuTime += LowPass<2>(&blurImg[0], img, &tempImg, var);
  for (int i=0;i<numLayers+2;i++) {
    double pSigma = baseBlur*pow(2.0, (double)(i-1)/numLayers);
    double oSigma = baseBlur*pow(2.0, (double)i/numLayers);
    double nSigma = baseBlur*pow(2.0, (double)(i+1)/numLayers);
    double var = nSigma*nSigma - oSigma*oSigma;
    if (i<=1) {
      // Warm up the ring buffers: three DoG levels are needed before any
      // extrema can be detected.
      gpuTime += LowPass<7>(&blurImg[(i+1)%2], &blurImg[i%2], &tempImg, var);
      gpuTime += Subtract(&diffImg[(i+2)%3], &blurImg[i%2], &blurImg[(i+1)%2]);
    } else {
      gpuTimeDoG += LowPass<7>(&blurImg[(i+1)%2], &blurImg[i%2], &tempImg, var);
      gpuTimeDoG += Subtract(&diffImg[(i+2)%3], &blurImg[i%2],
        &blurImg[(i+1)%2]);
      gpuTimeDoG += Find3DMinMax(&minmax, &diffImg[(i+2)%3], &diffImg[(i+1)%3],
        &diffImg[i%3], thresh, maxPts);
      int numPts = 0;
      gpuTimeSift += UnpackPointers(&minmax, maxPts, ptrs, &numPts);
      if (numPts>0) {
        gpuTimeDoG += CopyToTexture(&blurImg[i%2], &textImg, false);
        gpuTimeSift += ComputePositions(&diffImg[(i+2)%3], &diffImg[(i+1)%3],
          &diffImg[i%3], ptrs, &sift, numPts, maxPts,
          pSigma, 1.0f/numLayers);
        gpuTimeSift += RemoveEdgePoints(&sift, &numPts, maxPts, 10.0f);
        if (numPts>0) {
          gpuTimeSift += ComputeOrientations(&blurImg[i%2], ptrs, &sift,
            numPts, maxPts);
          gpuTimeSift += SecondOrientations(&sift, &numPts, maxPts);
          gpuTimeSift += ExtractSiftDescriptors(&textImg, &sift, &desc,
            numPts, maxPts);
          gpuTimeSift += AddSiftData(siftData, sift.d_data, desc.d_data,
            numPts, maxPts, subsampling);
        }
      }
      totPts += numPts;
    }
  }

#if 0
  Readback(&diffImg[(2+2)%3]);
  float *imd = diffImg[(2+2)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test2.pgm",
    diffImg[(2+2)%3].h_data, w, h));
  Readback(&diffImg[(2+1)%3]);
  imd = diffImg[(2+1)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test1.pgm",
    diffImg[(2+1)%3].h_data, w, h));
  Readback(&diffImg[(2+0)%3]);
  imd = diffImg[(2+0)%3].h_data;
  for (int i=0;i<w*h;i++) imd[i] = 8*imd[i] + 128;
  CUT_SAFE_CALL(cutSavePGMf("data/limg_test0.pgm",
    diffImg[(2+0)%3].h_data, w, h));
#endif

  CUDA_SAFE_CALL(cudaThreadSynchronize());
  FreeCudaImage(&textImg);
  CUDA_SAFE_CALL(cudaFreeHost(ptrs));
  FreeCudaImage(&desc);
  FreeCudaImage(&sift);
  FreeCudaImage(&minmax);
  FreeCudaImage(&tempImg);
  FreeCudaImage(&diffImg[2]);
  FreeCudaImage(&diffImg[1]);
  FreeCudaImage(&diffImg[0]);
  FreeCudaImage(&blurImg[1]);
  FreeCudaImage(&blurImg[0]);
}
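
// Worked example of the ring-buffer indexing above (derived from the loop
// itself): at iteration i the freshly computed DoG level lands in
// diffImg[(i+2)%3], the one from iteration i-1 sits in diffImg[(i+1)%3] and
// the one from i-2 in diffImg[i%3], so Find3DMinMax() always sees
// (newest, middle, oldest). With numLayers = 3 the loop runs i = 0..4 and
// detection first fires at i = 2:
//
//   i = 2: newest diffImg[1], middle diffImg[0], oldest diffImg[2]
//   i = 3: newest diffImg[2], middle diffImg[1], oldest diffImg[0]
//   i = 4: newest diffImg[0], middle diffImg[2], oldest diffImg[1]
//
// The two blurred images alternate the same way via blurImg[i%2] and
// blurImg[(i+1)%2].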

void InitSiftData(SiftData *data, int num, bool host, bool dev)
{
  data->numPts = 0;
  data->maxPts = num;
  int sz = sizeof(SiftPoint)*num;
  data->h_data = NULL;
  if (host)
    CUDA_SAFE_CALL(cudaMallocHost((void **)&data->h_data, sz));
  data->d_data = NULL;
  if (dev)
    CUDA_SAFE_CALL(cudaMalloc((void **)&data->d_data, sz));
}

void FreeSiftData(SiftData *data)
{
  if (data->d_data!=NULL)
    CUDA_SAFE_CALL(cudaFree(data->d_data));
  data->d_data = NULL;
  if (data->h_data!=NULL)
    CUDA_SAFE_CALL(cudaFreeHost(data->h_data));
  data->h_data = NULL;
  data->numPts = 0;
  data->maxPts = 0;
}

double AddSiftData(SiftData *data, float *d_sift, float *d_desc,
  int numPts, int maxPts, float subsampling)
{
  int newNum = data->numPts + numPts;
  if (data->maxPts<(data->numPts+numPts)) {
    // Grow the point buffers geometrically until the new points fit.
    int newMaxNum = 2*data->maxPts;
    while (newNum>newMaxNum)
      newMaxNum *= 2;
    if (data->h_data!=NULL) {
      SiftPoint *h_data = 0;
      CUDA_SAFE_CALL(cudaMallocHost((void**)&h_data,
        sizeof(SiftPoint)*newMaxNum));
      memcpy(h_data, data->h_data, sizeof(SiftPoint)*data->numPts);
      CUDA_SAFE_CALL(cudaFreeHost(data->h_data));
      data->h_data = h_data;
    }
    if (data->d_data!=NULL) {
      SiftPoint *d_data = NULL;
      CUDA_SAFE_CALL(cudaMalloc((void**)&d_data, sizeof(SiftPoint)*newMaxNum));
      CUDA_SAFE_CALL(cudaMemcpy(d_data, data->d_data,
        sizeof(SiftPoint)*data->numPts, cudaMemcpyDeviceToDevice));
      CUDA_SAFE_CALL(cudaFree(data->d_data));
      data->d_data = d_data;
    }
    data->maxPts = newMaxNum;
  }
  // Rescale x, y and scale (the first three rows of the sift array) to the
  // resolution of the original image before copying them out.
  int pitch = sizeof(SiftPoint);
  float *buffer;
  CUDA_SAFE_CALL(cudaMallocHost((void**)&buffer, sizeof(float)*3*numPts));
  int bwidth = sizeof(float)*numPts;
  CUDA_SAFE_CALL(cudaMemcpy2D(buffer, bwidth, d_sift, sizeof(float)*maxPts,
    bwidth, 3, cudaMemcpyDeviceToHost));
  for (int i=0;i<3*numPts;i++)
    buffer[i] *= subsampling;
  CUDA_SAFE_CALL(cudaMemcpy2D(d_sift, sizeof(float)*maxPts, buffer, bwidth,
    bwidth, 3, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  if (data->h_data!=NULL) {
    // Scatter the six per-point attributes and the 128-float descriptor into
    // the array-of-structs layout of SiftPoint.
    float *ptr = (float*)&data->h_data[data->numPts];
    for (int i=0;i<6;i++)
      CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[i], pitch, &d_sift[i*maxPts],
        sizeof(float), sizeof(float), numPts, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[16], pitch, d_desc, sizeof(float)*128,
      sizeof(float)*128, numPts, cudaMemcpyDeviceToHost));
  }
  if (data->d_data!=NULL) {
    float *ptr = (float*)&data->d_data[data->numPts];
    for (int i=0;i<6;i++)
      CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[i], pitch, &d_sift[i*maxPts],
        sizeof(float), sizeof(float), numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&ptr[16], pitch, d_desc, sizeof(float)*128,
      sizeof(float)*128, numPts, cudaMemcpyDeviceToDevice));
  }
  data->numPts = newNum;
  CUDA_SAFE_CALL(cudaFreeHost(buffer));
  return 0;
}
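
// Note on the magic constants above (an inference from this file, not a
// statement about cudaSift.h): copying rows 0..5 into &ptr[0..5] assumes the
// first six floats of SiftPoint are xpos, ypos, scale, sharpness, edgeness
// and orientation (the order PrintSiftData() reads them in), and copying the
// descriptor into &ptr[16] assumes data[128] starts at the 17th float, after
// the match-related fields touched by MatchSiftData() and FindHomography().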

void PrintSiftData(SiftData *data)
{
  SiftPoint *h_data = data->h_data;
  if (data->h_data==NULL) {
    // No host copy yet; fetch the points from the device for printing.
    CUDA_SAFE_CALL(cudaMallocHost((void **)&h_data,
      sizeof(SiftPoint)*data->maxPts));
    CUDA_SAFE_CALL(cudaMemcpy(h_data, data->d_data,
      sizeof(SiftPoint)*data->numPts, cudaMemcpyDeviceToHost));
    data->h_data = h_data;
  }
  for (int i=0;i<data->numPts;i++) {
    printf("xpos = %.2f\n", h_data[i].xpos);
    printf("ypos = %.2f\n", h_data[i].ypos);
    printf("scale = %.2f\n", h_data[i].scale);
    printf("sharpness = %.2f\n", h_data[i].sharpness);
    printf("edgeness = %.2f\n", h_data[i].edgeness);
    printf("orientation = %.2f\n", h_data[i].orientation);
    float *siftData = (float*)&h_data[i].data;
    for (int j=0;j<8;j++) {
      if (j==0) printf("data = ");
      else printf("       ");
      for (int k=0;k<16;k++)
        if (siftData[j+8*k]<0.05)
          printf(" .   ");
        else
          printf("%.2f ", siftData[j+8*k]);
      printf("\n");
    }
  }
  printf("Number of available points: %d\n", data->numPts);
  printf("Number of allocated points: %d\n", data->maxPts);
}

double MatchSiftData(SiftData *data1, SiftData *data2)
{
  if (data1->d_data==NULL || data2->d_data==NULL)
    return 0.0f;
  int numPts1 = data1->numPts;
  int numPts2 = data2->numPts;
  SiftPoint *sift1 = data1->d_data;
  SiftPoint *sift2 = data2->d_data;

  // Correlation matrix between all descriptor pairs, padded to a multiple of
  // 16 columns to match the block layout of the kernels.
  float *d_corrData;
  int corrWidth = iDivUp(numPts2, 16)*16;
  int corrSize = sizeof(float)*numPts1*corrWidth;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_corrData, corrSize));
  dim3 blocks(numPts1, iDivUp(numPts2, 16));
  dim3 threads(16, 16);
  MatchSiftPoints<<<blocks, threads>>>(sift1, sift2,
    d_corrData, numPts1, numPts2);
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  dim3 blocksMax(iDivUp(numPts1, 16));
  dim3 threadsMax(16, 16);
  FindMaxCorr<<<blocksMax, threadsMax>>>(d_corrData, sift1, sift2,
    numPts1, corrWidth, sizeof(SiftPoint));
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUT_CHECK_ERROR("MatchSiftPoints() execution failed\n");
  CUDA_SAFE_CALL(cudaFree(d_corrData));
  if (data1->h_data!=NULL) {
    // Mirror the five match-related floats (starting at score) back into the
    // host copy of data1.
    float *h_ptr = &data1->h_data[0].score;
    float *d_ptr = &data1->d_data[0].score;
    CUDA_SAFE_CALL(cudaMemcpy2D(h_ptr, sizeof(SiftPoint), d_ptr,
      sizeof(SiftPoint), 5*sizeof(float), data1->numPts,
      cudaMemcpyDeviceToHost));
  }
  return 0;
}
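
// A minimal end-to-end sketch of the matching path (illustrative only; it
// assumes two SiftData sets filled by ExtractSift() with both host and
// device buffers enabled, as InitSiftData(..., true, true) provides, and the
// threshold values below are placeholders, not recommended settings).
#if 0
void ExampleMatch(SiftData *siftData1, SiftData *siftData2)
{
  MatchSiftData(siftData1, siftData2);       // fills score/ambiguity/match_*
  float homography[9];
  int numMatches = 0;
  FindHomography(siftData1, homography, &numMatches, 1000, 0.85f, 0.95f, 5.0f);
  printf("Matches consistent with best homography: %d\n", numMatches);
}
#endif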

double FindHomography(SiftData *data, float *homography, int *numMatches,
  int numLoops, float minScore, float maxAmbiguity, float thresh)
{
  *numMatches = 0;
  homography[0] = homography[4] = homography[8] = 1.0f;
  homography[1] = homography[2] = homography[3] = 0.0f;
  homography[5] = homography[6] = homography[7] = 0.0f;
  if (data->d_data==NULL)
    return 0.0f;
  SiftPoint *d_sift = data->d_data;
  numLoops = iDivUp(numLoops,16)*16;
  int numPts = data->numPts;
  if (numPts<8)
    return 0.0f;
  int numPtsUp = iDivUp(numPts, 16)*16;
  float *d_coord, *d_homo;
  int *d_randPts, *h_randPts;
  int randSize = 4*sizeof(int)*numLoops;
  int szFl = sizeof(float);
  int szPt = sizeof(SiftPoint);
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_coord, 4*sizeof(float)*numPtsUp));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_randPts, randSize));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_homo, 8*sizeof(float)*numLoops));
  h_randPts = (int*)malloc(randSize);
  float *h_scores = (float *)malloc(sizeof(float)*numPtsUp);
  float *h_ambiguities = (float *)malloc(sizeof(float)*numPtsUp);
  CUDA_SAFE_CALL(cudaMemcpy2D(h_scores, szFl, &d_sift[0].score, szPt,
    szFl, numPts, cudaMemcpyDeviceToHost));
  CUDA_SAFE_CALL(cudaMemcpy2D(h_ambiguities, szFl, &d_sift[0].ambiguity, szPt,
    szFl, numPts, cudaMemcpyDeviceToHost));
  // Keep only points whose best match is both strong and unambiguous.
  int *validPts = (int *)malloc(sizeof(int)*numPts);
  int numValid = 0;
  for (int i=0;i<numPts;i++) {
    if (h_scores[i]>minScore && h_ambiguities[i]<maxAmbiguity)
      validPts[numValid++] = i;
  }
  free(h_scores);
  free(h_ambiguities);
  if (numValid>=8) {
    // RANSAC: draw four distinct correspondences for each hypothesis.
    for (int i=0;i<numLoops;i++) {
      int p1 = rand() % numValid;
      int p2 = rand() % numValid;
      int p3 = rand() % numValid;
      int p4 = rand() % numValid;
      while (p2==p1) p2 = rand() % numValid;
      while (p3==p1 || p3==p2) p3 = rand() % numValid;
      while (p4==p1 || p4==p2 || p4==p3) p4 = rand() % numValid;
      h_randPts[i+0*numLoops] = validPts[p1];
      h_randPts[i+1*numLoops] = validPts[p2];
      h_randPts[i+2*numLoops] = validPts[p3];
      h_randPts[i+3*numLoops] = validPts[p4];
    }
    CUDA_SAFE_CALL(cudaMemcpy(d_randPts, h_randPts, randSize,
      cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[0*numPtsUp], szFl, &d_sift[0].xpos,
      szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[1*numPtsUp], szFl, &d_sift[0].ypos,
      szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[2*numPtsUp], szFl,
      &d_sift[0].match_xpos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(&d_coord[3*numPtsUp], szFl,
      &d_sift[0].match_ypos, szPt, szFl, numPts, cudaMemcpyDeviceToDevice));
    ComputeHomographies<<<numLoops/16, 16>>>(d_coord, d_randPts,
      d_homo, numPtsUp);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("ComputeHomographies() execution failed\n");
    dim3 blocks(1, numLoops/TESTHOMO_LOOPS);
    dim3 threads(TESTHOMO_TESTS, TESTHOMO_LOOPS);
    TestHomographies<<<blocks, threads>>>(d_coord, d_homo,
      d_randPts, numPtsUp, thresh*thresh);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("TestHomographies() execution failed\n");
    // Pick the hypothesis with the largest inlier count.
    CUDA_SAFE_CALL(cudaMemcpy(h_randPts, d_randPts, sizeof(int)*numLoops,
      cudaMemcpyDeviceToHost));
    int maxIndex = -1, maxCount = -1;
    for (int i=0;i<numLoops;i++)
      if (h_randPts[i]>maxCount) {
        maxCount = h_randPts[i];
        maxIndex = i;
      }
    // The first eight parameters are copied back; homography[8] keeps its
    // initial value of 1.0.
    CUDA_SAFE_CALL(cudaMemcpy2D(homography, szFl, &d_homo[maxIndex],
      sizeof(float)*numLoops, szFl, 8, cudaMemcpyDeviceToHost));
    *numMatches = maxCount;
  }
  free(validPts);
  free(h_randPts);
  CUDA_SAFE_CALL(cudaFree(d_homo));
  CUDA_SAFE_CALL(cudaFree(d_randPts));
  CUDA_SAFE_CALL(cudaFree(d_coord));
  return 0;
}
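
// Hedged helper sketch: the homography produced above is stored as nine
// floats h[0..8] with h[8] fixed to 1.0, which the row-major mapping below
// assumes (this file never applies the matrix itself, so treat the layout as
// an assumption to be checked against ComputeHomographies()).
#if 0
static void ApplyHomography(const float h[9], float x, float y,
  float *xp, float *yp)
{
  float wp = h[6]*x + h[7]*y + h[8];   // perspective denominator
  *xp = (h[0]*x + h[1]*y + h[2]) / wp;
  *yp = (h[3]*x + h[4]*y + h[5]) / wp;
}
#endif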

double LowPass5(CudaImage *res, CudaImage *data, float variance)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("LowPass5: missing data\n");
    return 0.0;
  }
  // Build a normalized 5-tap Gaussian kernel and upload it to constant memory.
  float h_Kernel[5];
  float kernelSum = 0.0f;
  for (int j=0;j<5;j++) {
    h_Kernel[j] = (float)exp(-(double)(j-2)*(j-2)/2.0/variance);
    kernelSum += h_Kernel[j];
  }
  for (int j=0;j<5;j++)
    h_Kernel[j] /= kernelSum;
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_Kernel, h_Kernel, 5*sizeof(float)));
  dim3 blocks(iDivUp(w, LOWPASS5_DX), iDivUp(h, LOWPASS5_DY));
  dim3 threads(LOWPASS5_DX + WARP_SIZE + 2);
  LowPass5<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("LowPass5() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  return 0;
}

double ScaleDown(CudaImage *res, CudaImage *data, float variance)
{
  int w = data->width;
  int h = data->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("ScaleDown: missing data\n");
    return 0.0;
  }
  // Same 5-tap Gaussian as LowPass5(), applied before the 2x downsampling.
  float h_Kernel[5];
  float kernelSum = 0.0f;
  for (int j=0;j<5;j++) {
    h_Kernel[j] = (float)exp(-(double)(j-2)*(j-2)/2.0/variance);
    kernelSum += h_Kernel[j];
  }
  for (int j=0;j<5;j++)
    h_Kernel[j] /= kernelSum;
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_Kernel, h_Kernel, 5*sizeof(float)));
  dim3 blocks(iDivUp(w, LOWPASS5_DX), iDivUp(h, LOWPASS5_DY));
  dim3 threads(LOWPASS5_DX + WARP_SIZE + 2);
  ScaleDown<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("ScaleDown() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  return 0;
}
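
// Worked example of the kernel above: ExtractSift() calls ScaleDown() with
// variance = 0.5, so the unnormalized taps are exp(-(j-2)^2) =
// [0.0183, 0.3679, 1.0000, 0.3679, 0.0183]; after dividing by their sum
// (about 1.7724) the kernel is roughly [0.010, 0.208, 0.564, 0.208, 0.010].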

double Subtract(CudaImage *res, CudaImage *dataA, CudaImage *dataB)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || dataA->d_data==NULL || dataB->d_data==NULL) {
    printf("Subtract: missing data\n");
    return 0.0;
  }
  dim3 blocks(iDivUp(w, 16), iDivUp(h, 16));
  dim3 threads(16, 16);
  Subtract<<<blocks, threads>>>(res->d_data, dataA->d_data, dataB->d_data,
    w, h);
  CUT_CHECK_ERROR("Subtract() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  return 0;
}

double MultiplyAdd(CudaImage *res, CudaImage *data, float constA, float constB)
{
  int w = res->width;
  int h = res->height;
  if (res->d_data==NULL || data->d_data==NULL) {
    printf("MultiplyAdd: missing data\n");
    return 0.0;
  }
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, &constA, sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantB, &constB, sizeof(float)));

  dim3 blocks(iDivUp(w, 16), iDivUp(h, 16));
  dim3 threads(16, 16);
  MultiplyAdd<<<blocks, threads>>>(res->d_data, data->d_data, w, h);
  CUT_CHECK_ERROR("MultiplyAdd() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  return 0;
}

double FindMinMax(CudaImage *img, float *minval, float *maxval)
{
  int w = img->width;
  int h = img->height;
  int dx = iDivUp(w, 128);
  int dy = iDivUp(h, 16);
  int sz = 2*dx*dy*sizeof(float);
  float *d_minmax = 0;
  float *h_minmax = 0;
  CUDA_SAFE_CALL(cudaMallocHost((void **)&h_minmax, sz));
  for (int i=0;i<2*dx*dy;i+=2) {
    h_minmax[i+0] = 1e6;
    h_minmax[i+1] = -1e6;
  }
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_minmax, sz));
  CUDA_SAFE_CALL(cudaMemcpy(d_minmax, h_minmax, sz, cudaMemcpyHostToDevice));

  dim3 blocks(dx, dy);
  dim3 threads(128, 1);
  FindMinMax<<<blocks, threads>>>(d_minmax, img->d_data, w, h);
  CUT_CHECK_ERROR("FindMinMax() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());

  // Each block wrote one (min, max) pair; finish the reduction on the host.
  CUDA_SAFE_CALL(cudaMemcpy(h_minmax, d_minmax, sz, cudaMemcpyDeviceToHost));
  *minval = 1e6;
  *maxval = -1e6;
  for (int i=0;i<2*dx*dy;i+=2) {
    if (h_minmax[i+0]<*minval)
      *minval = h_minmax[i+0];
    if (h_minmax[i+1]>*maxval)
      *maxval = h_minmax[i+1];
  }

  CUDA_SAFE_CALL(cudaFree(d_minmax));
  CUDA_SAFE_CALL(cudaFreeHost(h_minmax));
  return 0;
}

double Find3DMinMax(CudaArray *minmax, CudaImage *data1, CudaImage *data2,
  CudaImage *data3, float thresh, int maxPts)
{
  int *h_res = (int *)minmax->h_data;
  int *d_res = (int *)minmax->d_data;
  if (data1->d_data==NULL || data2->d_data==NULL || data3->d_data==NULL ||
      h_res==NULL || d_res==NULL) {
    printf("Find3DMinMax: missing data %p %p %p %p %p\n",
      data1->d_data, data2->d_data, data3->d_data, h_res, d_res);
    return 0.0;
  }
  int w = data1->width;
  int h = data1->height;
  float threshs[2] = { thresh, -thresh };
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, threshs, 2*sizeof(float)));

  dim3 blocks(iDivUp(w, MINMAX_SIZE), iDivUp(h,32));
  dim3 threads(WARP_SIZE + MINMAX_SIZE + 1);
  Find3DMinMax<<<blocks, threads>>>(d_res, data1->d_data, data2->d_data,
    data3->d_data, w, h);
  CUT_CHECK_ERROR("Find3DMinMax() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  // The result is a bitmask image: each 32-bit word covers 32 image rows of
  // one column (see UnpackPointers() below).
  CUDA_SAFE_CALL(cudaMemcpy(h_res, d_res,
    sizeof(int)*minmax->pitch*minmax->height, cudaMemcpyDeviceToHost));
  return 0;
}

double UnpackPointers(CudaArray *minmax, int maxPts, int *ptrs, int *numPts)
{
  unsigned int *minmax_data = (unsigned int *)minmax->h_data;
  if (minmax_data==NULL || ptrs==NULL) {
    printf("UnpackPointers: missing data %p %p\n", minmax_data, ptrs);
    return 0.0;
  }
  int w = minmax->pitch;
  int h = 32*minmax->height;
  int num = 0;
  for (int y=0;y<h/32;y++) {
    for (int x=0;x<w;x++) {
      unsigned int val = minmax_data[y*w+x];
      if (val) {
        // Bit k of this word marks an extremum at image row y*32+k, column x.
        for (int k=0;k<32;k++) {
          if (val&0x1 && num<maxPts)
            ptrs[num++] = (y*32+k)*w + x;
          val >>= 1;
        }
      }
    }
  }
  *numPts = num;
  return 0;
}
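
// Worked example of the unpacking above: with w = 640, a word at (x=10, y=2)
// equal to 0x00000005 has bits 0 and 2 set, so it contributes the linear
// offsets (2*32+0)*640+10 = 40970 and (2*32+2)*640+10 = 42250, i.e. the
// extrema at rows 64 and 66 of column 10.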

double ComputePositions(CudaImage *data1, CudaImage *data2, CudaImage *data3,
  int *h_ptrs, CudaArray *sift, int numPts, int maxPts, float scale,
  float factor)
{
  int w = data1->width;
  int h = data1->height;
  int *d_ptrs = 0;
  float *d_sift = sift->d_data;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_ptrs, sizeof(int)*numPts));
  CUDA_SAFE_CALL(cudaMemcpy(d_ptrs, h_ptrs, sizeof(int)*numPts,
    cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantA, &scale, sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_ConstantB, &factor, sizeof(float)));

  dim3 blocks(iDivUp(numPts, POSBLK_SIZE));
  dim3 threads(POSBLK_SIZE);
  ComputePositions<<<blocks, threads>>>(data1->d_data, data2->d_data,
    data3->d_data, d_ptrs, d_sift, numPts, maxPts, w, h);
  CUT_CHECK_ERROR("ComputePositions() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaFree(d_ptrs));
  return 0;
}

double RemoveEdgePoints(CudaArray *sift, int *initNumPts, int maxPts,
  float edgeLimit)
{
  int numPts = *initNumPts;
  float *d_sift = sift->d_data;
  int bw = sizeof(float)*numPts;
  float *h_sift = 0;
  // Pull the first five rows of the sift array to the host, compact the
  // points whose edgeness (row 4) is below the limit, and push them back.
  CUDA_SAFE_CALL(cudaMallocHost((void **)&h_sift, 5*bw));
  CUDA_SAFE_CALL(cudaMemcpy2D(h_sift, bw, d_sift, sizeof(float)*maxPts,
    bw, 5, cudaMemcpyDeviceToHost));
  int num = 0;
  for (int i=0;i<numPts;i++)
    if (h_sift[4*numPts+i]<edgeLimit) {
      for (int j=0;j<5;j++)
        h_sift[j*numPts+num] = h_sift[j*numPts+i];
      num++;
    }
  CUDA_SAFE_CALL(cudaMemcpy2D(d_sift, sizeof(float)*maxPts, h_sift, bw,
    bw, 5, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaFreeHost(h_sift));
  *initNumPts = num;
  return 0;
}

double ComputeOrientations(CudaImage *img, int *h_ptrs, CudaArray *sift,
  int numPts, int maxPts)
{
  int w = img->pitch;
  int h = img->height;
  int *d_ptrs = 0;
  float *d_orient = &sift->d_data[5*maxPts];   // row 5 holds the orientations
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_ptrs, sizeof(int)*numPts));
  CUDA_SAFE_CALL(cudaMemcpy(d_ptrs, h_ptrs, sizeof(int)*numPts,
    cudaMemcpyHostToDevice));

  dim3 blocks(numPts);
  dim3 threads(32);
  ComputeOrientations<<<blocks, threads>>>(img->d_data,
    d_ptrs, d_orient, maxPts, w, h);
  CUT_CHECK_ERROR("ComputeOrientations() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaFree(d_ptrs));
  return 0;
}

double SecondOrientations(CudaArray *sift, int *initNumPts, int maxPts)
{
  int numPts = *initNumPts;
  int numPts2 = 2*numPts;
  float *d_sift = sift->d_data;
  int bw = sizeof(float)*numPts2;
  float *h_sift = 0;
  h_sift = (float *)malloc(7*bw);
  CUDA_SAFE_CALL(cudaMemcpy2D(h_sift, bw, d_sift, sizeof(float)*maxPts,
    sizeof(float)*numPts, 7, cudaMemcpyDeviceToHost));
  // Points with a valid secondary orientation (row 6) are duplicated, the
  // copy taking that orientation as its primary one.
  int num = numPts;
  for (int i=0;i<numPts;i++)
    if (h_sift[6*numPts2+i]>=0.0f && num<maxPts) {
      for (int j=0;j<5;j++)
        h_sift[j*numPts2+num] = h_sift[j*numPts2+i];
      h_sift[5*numPts2+num] = h_sift[6*numPts2+i];
      h_sift[6*numPts2+num] = -1.0f;
      num++;
    }
  CUDA_SAFE_CALL(cudaMemcpy2D(&d_sift[numPts], sizeof(float)*maxPts,
    &h_sift[numPts], bw, sizeof(float)*(num-numPts), 7,
    cudaMemcpyHostToDevice));
  free(h_sift);
  *initNumPts = num;
  return 0;
}

double ExtractSiftDescriptors(CudaImage *img, CudaArray *sift,
  CudaArray *desc, int numPts, int maxPts)
{
  float *d_sift = sift->d_data, *d_desc = desc->d_data;

  // Sample the blurred image through a bilinearly filtered texture while
  // building the descriptors.
  tex.addressMode[0] = cudaAddressModeClamp;
  tex.addressMode[1] = cudaAddressModeClamp;
  tex.filterMode = cudaFilterModeLinear;
  tex.normalized = false;
  CUDA_SAFE_CALL(cudaBindTextureToArray(tex, (cudaArray *)img->t_data));

  dim3 blocks(numPts);
  dim3 threads(16);
  ExtractSiftDescriptors<<<blocks, threads>>>(img->d_data,
    d_sift, d_desc, maxPts);
  CUT_CHECK_ERROR("ExtractSiftDescriptors() execution failed\n");
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  CUDA_SAFE_CALL(cudaUnbindTexture(tex));
  return 0;
}