00001
00002
00003
00004
00005
00006
00007 #include <cuda.h>
00008 #include "CUDA/cutil.h"
00009 #include <stdio.h>
00010 #include <assert.h>
00011 #include "cbcl_model.h"
00012
00013 #define BLOCK_SIZE 16
00014
00015
00016 const float* min_element(const float* start,const float* end)
00017 {
00018
00019 const float* minptr=start;
00020 float minval=*start;
00021 float val = minval;
00022 for(const float* ptr=start;ptr!=end;ptr++)
00023 {
00024 val=*ptr;
00025 if(val<minval)
00026 {
00027 minval=val;
00028 minptr=ptr;
00029 }
00030 }
00031 return minptr;
00032 }
00033
00034
00035 const float* max_element(const float* start,const float* end)
00036 {
00037 const float* maxptr=start;
00038 float maxval=*start;
00039 float val = maxval;
00040 for(const float* ptr=start;ptr!=end;ptr++)
00041 {
00042 val=*ptr;
00043 if(val>maxval)
00044 {
00045 maxval=val;
00046 maxptr=ptr;
00047 }
00048 }
00049 return maxptr;
00050 }
00051
00052
00053 void cpu_create_filters(band_info** ppfilt,int nfilts, int width, int height, int depth)
00054 {
00055 *ppfilt = new band_info[nfilts];
00056 assert(*ppfilt !=NULL);
00057 for(int i=0;i<nfilts;i++)
00058 {
00059 band_info* pfilt = *ppfilt+i;
00060 pfilt->depth = depth;
00061 pfilt->height = height;
00062 pfilt->width = width;
00063 pfilt->where = ONHOST;
00064
00065 pfilt->pitch=pfilt->width*sizeof(float);
00066 pfilt->ptr =new float[pfilt->depth*pfilt->height*pfilt->width];
00067 assert(pfilt->ptr);
00068 }
00069 }
00070
00071 int cpu_get_width(band_info** ppfilt, int nfilts, int index)
00072 {
00073 band_info* pfilt = *ppfilt+index;
00074 assert(index < nfilts);
00075 return pfilt->width;
00076 }
00077
00078 int cpu_get_height(band_info** ppfilt, int nfilts, int index)
00079 {
00080 band_info* pfilt = *ppfilt+index;
00081 assert(index < nfilts);
00082 return pfilt->height;
00083 }
00084
00085 int cpu_get_depth(band_info** ppfilt, int nfilts, int index)
00086 {
00087 band_info* pfilt = *ppfilt+index;
00088 assert(index < nfilts);
00089 return pfilt->depth;
00090 }
00091
00092 void cpu_copy_filter(band_info** ppfilt, int nfilts, int index, int x1, int y1, band_info **ppfiltin, int indexin, int x2, int y2)
00093 {
00094 band_info* pfilt = *ppfilt+index;
00095 band_info* pfiltin = *ppfiltin+indexin;
00096 assert(index < nfilts);
00097 assert(x1 < pfilt->width);
00098 assert(y1 < pfilt->height);
00099 assert(x2 < pfiltin->width);
00100 assert(y2 < pfiltin->height);
00101 assert(pfilt->width > 0);
00102 assert(pfilt->height > 0);
00103 assert(pfilt->depth > 0);
00104 assert(pfiltin->width > 0);
00105 assert(pfiltin->height > 0);
00106 assert(pfiltin->depth > 0);
00107
00108 assert(pfiltin->depth == pfilt->depth);
00109
00110 for(int d=0;d<pfilt->depth;d++)
00111 {
00112 float* ptr=pfilt->ptr+d*pfilt->height*pfilt->width;
00113 float* ptrin=pfiltin->ptr+d*pfiltin->height*pfiltin->width;
00114 ptr[y1*pfilt->width+x1] = ptrin[y2*pfiltin->width+x2];
00115 }
00116 }
00117
00118 void cpu_read_filters(const char *fname, band_info** ppfilt,int* pnfilts)
00119 {
00120
00121 int num_filters=0;
00122
00123 FILE *fp;
00124
00125 fp = fopen(fname, "r");
00126 if(!fp) exit(0);
00127 fscanf(fp,"%d",&num_filters);
00128
00129 printf("Loading %d filters\n",num_filters);
00130 assert(num_filters >= 1);
00131 *pnfilts= num_filters;
00132 *ppfilt = new band_info[num_filters];
00133 assert(*ppfilt !=NULL);
00134
00135 for(int i=0;i<num_filters;i++)
00136 {
00137 band_info* pfilt = *ppfilt+i;
00138 fscanf(fp,"%d",&(pfilt->depth));
00139 fscanf(fp,"%d",&(pfilt->height));
00140 fscanf(fp,"%d",&(pfilt->width));
00141
00142 pfilt->pitch=pfilt->width*sizeof(float);
00143 pfilt->ptr =new float[pfilt->depth*pfilt->height*pfilt->width];
00144 pfilt->where=ONHOST;
00145 assert(pfilt->ptr);
00146 for(int d=0;d<pfilt->depth;d++)
00147 {
00148 float* ptr=pfilt->ptr+d*pfilt->height*pfilt->width;
00149 for(int y=0;y<pfilt->height;y++)
00150 {
00151 for(int x=0;x<pfilt->width;x++)
00152 fscanf(fp,"%f",&(ptr[y*pfilt->width+x]));
00153 }
00154 }
00155 }
00156 fclose(fp);
00157
00158 }
00159
00160 void cpu_write_filters(band_info* pfilt,int nfilts,const char* filename)
00161 {
00162 FILE *fp;
00163 fp = fopen(filename,"w");
00164 assert(nfilts>=1);
00165 assert(pfilt!=NULL);
00166
00167 fprintf(fp,"%d\n",nfilts);
00168
00169 for(int i=0;i<nfilts;i++,pfilt++)
00170 {
00171 fprintf(fp,"%d\n",pfilt->depth);
00172 fprintf(fp,"%d\n",pfilt->height);
00173 fprintf(fp,"%d\n",pfilt->width);
00174
00175 for(int d=0;d<pfilt->depth;d++)
00176 {
00177
00178 for(int y=0;y<pfilt->height;y++)
00179 {
00180 for(int x=0;x<pfilt->width;x++)
00181 fprintf(fp,"%.8f ",*elptr(pfilt->ptr,d,y,x,pfilt->height,pfilt->pitch));
00182 fprintf(fp,"\n");
00183 }
00184 }
00185 }
00186 fclose(fp);
00187 }
00188
00189
00190
00191
00192
00193
00194 texture<float,2,cudaReadModeElementType> teximg;
00195 texture<float,2,cudaReadModeElementType> texfilt;
00196
00197 __host__ __device__ float* elptr(float* base,int depth,int row,int col,int height,int pitch)
00198 {
00199 return (float*)((char*)base+depth*height*pitch+row*pitch)+col;
00200 }
00201
00202 __global__ void kernel_c_local(float* dest,int pitch,int depth,float wt,float ht,int poolxy,int stepxy,int srcwidth,int srcheight,float xscale,float yscale)
00203 {
00204 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x;
00205 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
00206 int x = col*stepxy;
00207 int y = row*stepxy;
00208 if(row>=ht) return;
00209 if(col>=wt) return;
00210 for(int d=0;d<depth;d++)
00211 {
00212 float maxval = 0;
00213 float pixval = 0;
00214 int istride = d*srcheight;
00215 for(int v=0;v<poolxy;v++)
00216 for(int u=0;u<poolxy;u++)
00217 {
00218 pixval=tex2D(teximg,(x+u)*xscale,istride+(y+v)*yscale);
00219 maxval=maxval>pixval?maxval:pixval;
00220 }
00221 float* ptr = elptr(dest,d,row,col,ht,pitch);
00222 if(*ptr<maxval)
00223 *ptr = maxval;
00224 }
00225 }
00226
00227 void gpu_c_local(
00228 IN band_info* sin,
00229 IN int in_bands,
00230 IN int pool_xy,
00231 IN int step_xy,
00232 IN int pool_scale,
00233 IN int step_scale,
00234 OUT band_info** c,
00235 OUT int* pout_bands,
00236 IN bool copy
00237 )
00238 {
00239 cudaArray* imgarray;
00240 band_info* h_outbands;
00241 float* d_ptr;
00242 cudaMemcpyKind copydir=cudaMemcpyHostToDevice;
00243 int i,o,b;
00244 size_t pitch;
00245 int out_bands = (in_bands-pool_scale)/step_scale+1;
00246
00247 h_outbands = new band_info[out_bands];
00248 assert(h_outbands!=NULL);
00249 for(i=0,o=0;i<=in_bands-pool_scale;i+=step_scale,o++)
00250 {
00251 h_outbands[o].height = (sin[i].height-pool_xy)/step_xy+1;
00252 h_outbands[o].width = (sin[i].width-pool_xy)/step_xy+1;
00253 h_outbands[o].depth = sin[i].depth;
00254 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[o].width*sizeof(float),h_outbands[o].depth*h_outbands[o].height));
00255 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*h_outbands[o].depth*h_outbands[o].height));
00256 h_outbands[o].pitch = pitch;
00257 h_outbands[o].ptr = d_ptr;
00258 h_outbands[o].where = ONDEVICE;
00259 }
00260 *pout_bands = out_bands;
00261 *c = h_outbands;
00262
00263 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>();
00264
00265 teximg.addressMode[0] = cudaAddressModeClamp;
00266 teximg.addressMode[1] = cudaAddressModeClamp;
00267 teximg.filterMode = cudaFilterModePoint;
00268 teximg.normalized = false;
00269
00270
00271 for(i=0,o=0;i<=in_bands-pool_scale;i+=step_scale,o++)
00272 {
00273 for(b=0;b<pool_scale;b++)
00274 {
00275 copydir = cudaMemcpyHostToDevice;
00276 if(sin[i+b].where==ONDEVICE)
00277 copydir=cudaMemcpyDeviceToDevice;
00278 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,sin[i+b].width,sin[i+b].height*sin[i+b].depth));
00279 assert(imgarray!=NULL);
00280 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0,
00281 sin[i+b].ptr,sin[i+b].pitch,
00282 sin[i+b].width*sizeof(float),sin[i+b].height*sin[i+b].depth,
00283 copydir));
00284 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray));
00285
00286 uint3 gridsz = make_uint3(ceilf((float)h_outbands[o].width/BLOCK_SIZE),ceilf((float)h_outbands[o].height/BLOCK_SIZE),1);
00287 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1);
00288 kernel_c_local<<<gridsz,blocksz>>>(h_outbands[o].ptr,h_outbands[o].pitch,
00289 h_outbands[o].depth,h_outbands[o].width,h_outbands[o].height,
00290 pool_xy,step_xy,
00291 sin[i+b].width,sin[i+b].height,
00292 (float)sin[i+b].width/sin[i].width,(float)sin[i+b].height/sin[i].height);
00293 CUDA_SAFE_CALL(cudaThreadSynchronize());
00294 CUDA_SAFE_CALL(cudaUnbindTexture(teximg));
00295 CUDA_SAFE_CALL(cudaFreeArray(imgarray));
00296 }
00297 }
00298
00299 for(b=0;copy && b<out_bands;b++)
00300 {
00301 int sz = h_outbands[b].height*h_outbands[b].width*h_outbands[b].depth;
00302 float* ptr = new float[sz];
00303 assert(ptr!=NULL);
00304 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float),
00305 h_outbands[b].ptr,h_outbands[b].pitch,
00306 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth,
00307 cudaMemcpyDeviceToHost));
00308 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr));
00309 h_outbands[b].ptr =ptr;
00310 h_outbands[b].pitch =h_outbands[b].width*sizeof(float);
00311 h_outbands[b].where =ONHOST;
00312 }
00313 CUDA_SAFE_CALL(cudaThreadSynchronize());
00314 }
00315
00316
00317 void cpu_release_images(band_info** ppbands,int num_bands)
00318 {
00319 for(int i=0;i<num_bands;i++)
00320 {
00321 band_info* pb=*ppbands+i;
00322 if(pb->where==ONHOST)
00323 delete[] pb->ptr;
00324 else
00325 CUDA_SAFE_CALL(cudaFree(pb->ptr));
00326 }
00327 delete [] *ppbands;
00328 *ppbands = NULL;
00329 }
00330
00331 __global__ void kernel_resize(float *dest,int pitch,int wt,int ht,float xscale,float yscale)
00332 {
00333 int row=blockIdx.y*BLOCK_SIZE+threadIdx.y;
00334 int col=blockIdx.x*BLOCK_SIZE+threadIdx.x;
00335 if(row>=ht) return;
00336 if(col>=wt) return;
00337 float* ptr = elptr(dest,0,row,col,ht,pitch);
00338 *ptr = tex2D(teximg,(float)col*xscale,(float)row*yscale);
00339 }
00340
00341 void gpu_create_c0(const float* pimg,int width,int height,band_info** ppc,int* pbands,float scale,int num_scales,bool copy)
00342 {
00343 size_t pitch = 0;
00344 *ppc = new band_info[num_scales];
00345 *pbands = num_scales;
00346 assert(*ppc!=NULL);
00347 assert(*pbands>=1);
00348 float* ptr = NULL;
00349 CUDA_SAFE_CALL(cudaMallocPitch((void**)&ptr,&pitch,width*sizeof(float),height));
00350 assert(ptr);
00351 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,pitch,pimg,width*sizeof(float),width*sizeof(float),height,cudaMemcpyHostToDevice));
00352
00353 (*ppc)->height = height;
00354 (*ppc)->width = width;
00355 (*ppc)->depth = 1;
00356 (*ppc)->pitch = pitch;
00357 (*ppc)->ptr = ptr;
00358 cudaArray* pdimg;
00359 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>();
00360
00361 teximg.addressMode[0] = cudaAddressModeClamp;
00362 teximg.addressMode[1] = cudaAddressModeClamp;
00363 teximg.filterMode = cudaFilterModeLinear;
00364 teximg.normalized = false;
00365
00366 for(int b=1;b<num_scales;b++)
00367 {
00368 band_info* prev = *ppc+b-1;
00369 band_info* pc = *ppc+b;
00370 int bht = roundf(prev->height/scale);
00371 int bwt = roundf(prev->width/scale);
00372
00373 CUDA_SAFE_CALL(cudaMallocArray(&pdimg,&imgdesc,prev->width,prev->height));
00374 CUDA_SAFE_CALL(cudaMemcpy2DToArray(pdimg,0,0,
00375 prev->ptr,prev->pitch,
00376 prev->width*sizeof(float),prev->height,
00377 cudaMemcpyDeviceToDevice));
00378 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,pdimg));
00379
00380 float* pdout = NULL;
00381 CUDA_SAFE_CALL(cudaMallocPitch((void**)&pdout,&pitch,bwt*sizeof(float),bht));
00382 CUDA_SAFE_CALL(cudaMemset(pdout,0,bht*pitch));
00383 pc->height = bht;
00384 pc->width = bwt;
00385 pc->pitch = pitch;
00386 pc->depth = 1;
00387 pc->ptr = pdout;
00388 pc->where = ONDEVICE;
00389
00390 uint3 gridsz = make_uint3(ceilf((float)bwt/BLOCK_SIZE),ceilf((float)bht/BLOCK_SIZE),1);
00391 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1);
00392 kernel_resize<<<gridsz,blocksz>>>(pdout,pitch,bwt,bht,(float)prev->width/bwt,(float)prev->height/bht);
00393 CUDA_SAFE_CALL(cudaThreadSynchronize());
00394 CUDA_SAFE_CALL(cudaFreeArray(pdimg));
00395 CUDA_SAFE_CALL(cudaUnbindTexture(teximg));
00396 }
00397 for(int b=0;copy && b<num_scales;b++)
00398 {
00399 band_info* pc =*ppc+b;
00400 float* ptr=new float[pc->height*pc->width];
00401 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,pc->width*sizeof(float),
00402 pc->ptr,pc->pitch,
00403 pc->width*sizeof(float),pc->height,
00404 cudaMemcpyDeviceToHost));
00405 CUDA_SAFE_CALL(cudaFree(pc->ptr));
00406 pc->ptr = ptr;
00407 pc->pitch = pc->width*sizeof(float);
00408 pc->where = ONHOST;
00409 }
00410 }
00411
00412 void cpu_create_c0(const float* pimg,int width,int height,band_info** ppc,int* pbands,float scale,int num_scales)
00413 {
00414 *ppc = new band_info[num_scales];
00415 *pbands = num_scales;
00416 assert(*ppc!=NULL);
00417 assert(*pbands>=1);
00418
00419 band_info* prev = *ppc;
00420 prev->height = height;
00421 prev->width = width;
00422 prev->depth = 1;
00423 prev->pitch = width*sizeof(float);
00424 prev->ptr = new float[height*width];
00425 prev->where = ONHOST;
00426 memcpy(prev->ptr,pimg,height*width*sizeof(float));
00427 assert(prev->ptr);
00428
00429 float minval = *min_element(pimg,&pimg[height*width]);
00430 float maxval = *max_element(pimg,&pimg[height*width]);
00431 float range = maxval-minval+1e-6;
00432
00433
00434 for(int b=1;b<num_scales;b++,prev++)
00435 {
00436 pimg = prev->ptr;
00437 width = prev->width;
00438 height = prev->height;
00439
00440 band_info* pc = *ppc+b;
00441 int bht = roundf(height/scale);
00442 int bwt = roundf(width/scale);
00443 pc->height = bht;
00444 pc->width = bwt;
00445 pc->pitch = bwt*sizeof(float);
00446 pc->depth = 1;
00447 pc->ptr = new float[bht*bwt];
00448 pc->where = ONHOST;
00449 assert(pc->ptr!=NULL);
00450 float cmin = 1e6;
00451 float cmax = 0;
00452 for(int x=0;x<bwt;x++)
00453 {
00454 for(int y=0;y<bht;y++)
00455 {
00456 float sx = x*scale;
00457 float sy = y*scale;
00458 int fx = floorf(sx); int cx = ceilf(sx);cx=(cx>=width)?(width-1):cx;
00459 int fy = floorf(sy); int cy = ceilf(sy);cy=(cy>=height)?(height-1):cy;
00460 float xalpha=sx-fx;
00461 float yalpha=sy-fy;
00462 float val =pimg[fx+fy*width]*(1-xalpha)*(1-yalpha)+
00463 pimg[cx+fy*width]*(xalpha)*(1-yalpha)+
00464 pimg[fx+cy*width]*(1-xalpha)*(yalpha)+
00465 pimg[cx+cy*width]*(xalpha)*(yalpha);
00466 pc->ptr[y*bwt+x]=val;
00467 if(val<cmin) cmin=val;
00468 if(val>cmax) cmax=val;
00469 }
00470 }
00471 float crange = cmax-cmin+1e-6;
00472 float factor = range/crange;
00473 for(int i=0;i<bht*bwt;i++)
00474 pc->ptr[i]=(pc->ptr[i]-cmin)*factor+minval;
00475 }
00476 }
00477
00478 template<int u>
00479 __device__ void colNorm(int irow,int frow,int col,float* pnum,float* pden)
00480 {
00481 float pval=0;
00482 float fval=0;
00483 pval = tex2D(teximg,col+u-1,irow);
00484 fval = tex2D(texfilt,u-1,frow);
00485 *pnum += pval*fval;
00486 *pden += pval*pval;
00487 colNorm<u-1>(irow,frow,col,pnum,pden);
00488 }
00489
00490 template<>
00491 __device__ void colNorm<0>(int irow,int frow,int col,float *pnum,float *pden)
00492 {
00493 return;
00494 }
00495
00496
00497 __global__ void kernel_s_norm_filter(float* dest,int pitch,int wt,int ht,int srcwidth,int srcheight,int depth,int fsz,int num_filt)
00498 {
00499 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
00500 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x;
00501 int fstride = fsz*depth;
00502 int u,v,d,l,f,iy,fy;
00503 float den = 0;
00504 float num = 0;
00505 float pixval = 0;
00506 float filtval = 0;
00507 int nloops = fsz/4;
00508 int residue = fsz%4;
00509 if(row>=ht) return;
00510 if(col>=wt) return;
00511 for(f=0;f<num_filt;f++)
00512 {
00513 den = 1e-6;
00514 num = 0;
00515 for(d=0;d<depth;d++)
00516 {
00517 iy = d*srcheight+row;
00518 fy = d*fsz+f*fstride;
00519 switch(fsz)
00520 {
00521 case 5:
00522 v = 0;
00523 colNorm<5>(iy+v,fy+v,col,&num,&den);v++;
00524 colNorm<5>(iy+v,fy+v,col,&num,&den);v++;
00525 colNorm<5>(iy+v,fy+v,col,&num,&den);v++;
00526 colNorm<5>(iy+v,fy+v,col,&num,&den);v++;
00527 colNorm<5>(iy+v,fy+v,col,&num,&den);v++;
00528 case 7:
00529 v = 0;
00530 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00531 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00532 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00533 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00534 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00535 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00536 colNorm<7>(iy+v,fy+v,col,&num,&den);v++;
00537 break;
00538 default:
00539 for(v=0;v<fsz;v++,iy++,fy++)
00540 {
00541 u =0;
00542 for(l=0;l<nloops;l++)
00543 {
00544
00545 pixval =tex2D(teximg,col+u,iy);
00546 filtval=tex2D(texfilt,u,fy);
00547 num +=filtval*pixval;
00548 den +=pixval*pixval;
00549 u++;
00550
00551
00552 pixval =tex2D(teximg,col+u,iy);
00553 filtval=tex2D(texfilt,u,fy);
00554 num +=filtval*pixval;
00555 den +=pixval*pixval;
00556 u++;
00557
00558
00559 pixval =tex2D(teximg,col+u,iy);
00560 filtval=tex2D(texfilt,u,fy);
00561 num +=filtval*pixval;
00562 den +=pixval*pixval;
00563 u++;
00564
00565
00566 pixval =tex2D(teximg,col+u,iy);
00567 filtval=tex2D(texfilt,u,fy);
00568 num +=filtval*pixval;
00569 den +=pixval*pixval;
00570 u++;
00571 }
00572 switch(residue)
00573 {
00574 case 3:
00575 pixval =tex2D(teximg,col+u,iy);
00576 filtval=tex2D(texfilt,u,fy);
00577 num +=filtval*pixval;
00578 den +=pixval*pixval;
00579 u++;
00580 case 2:
00581 pixval =tex2D(teximg,col+u,iy);
00582 filtval=tex2D(texfilt,u,fy);
00583 num +=filtval*pixval;
00584 den +=pixval*pixval;
00585 u++;
00586 case 1:
00587 pixval =tex2D(teximg,col+u,iy);
00588 filtval=tex2D(texfilt,u,fy);
00589 num +=filtval*pixval;
00590 den +=pixval*pixval;
00591 u++;
00592 case 0:
00593 break;
00594 }
00595 }
00596 break;
00597 }
00598 }
00599 *elptr(dest,f,row,col,ht,pitch)=fabs(num)/sqrtf(den);
00600 }
00601 }
00602
00603
00604
00605
00606
00607
00608 void gpu_s_norm_filter(band_info* cin,int in_bands,band_info* filt,int num_filt, band_info** pps, int *out_bands,bool copy)
00609 {
00610 cudaArray* imgarray;
00611 cudaArray* filtarray;
00612 band_info* h_outbands;
00613 float* d_ptr;
00614 cudaMemcpyKind copydir;
00615 size_t pitch;
00616
00617
00618
00619 int fsz = filt[0].width;
00620 int fdepth = filt[0].depth;
00621
00622 h_outbands = new band_info[in_bands];
00623 for(int b=0;b<in_bands;b++)
00624 {
00625 h_outbands[b].height = cin[b].height-fsz+1;
00626 h_outbands[b].width = cin[b].width-fsz+1;
00627 h_outbands[b].depth = num_filt;
00628 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[b].width*sizeof(float),num_filt*h_outbands[b].height));
00629 assert(d_ptr!=NULL);
00630 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*num_filt*h_outbands[b].height));
00631 h_outbands[b].pitch = pitch;
00632 h_outbands[b].where = ONDEVICE;
00633 h_outbands[b].ptr = d_ptr;
00634 }
00635 *pps = h_outbands;
00636 *out_bands= in_bands;
00637
00638
00639 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>();
00640
00641 teximg.addressMode[0] = cudaAddressModeClamp;
00642 teximg.addressMode[1] = cudaAddressModeClamp;
00643 teximg.filterMode = cudaFilterModePoint;
00644 teximg.normalized = false;
00645 cudaChannelFormatDesc filtdesc=cudaCreateChannelDesc<float>();
00646 CUDA_SAFE_CALL(cudaMallocArray(&filtarray,&filtdesc,fsz,num_filt*fsz*fdepth));
00647 texfilt.addressMode[0] = cudaAddressModeClamp;
00648 texfilt.addressMode[1] = cudaAddressModeClamp;
00649 texfilt.filterMode = cudaFilterModePoint;
00650 texfilt.normalized = false;
00651
00652 for(int f=0;f<num_filt;f++)
00653 {
00654 CUDA_SAFE_CALL(cudaMemcpy2DToArray(filtarray,0,f*fsz*fdepth,
00655 filt[f].ptr,filt[f].width*sizeof(float),
00656 filt[f].width*sizeof(float),filt[f].height*filt[f].depth,
00657 cudaMemcpyHostToDevice));
00658 }
00659 CUDA_SAFE_CALL(cudaBindTextureToArray(texfilt,filtarray));
00660
00661
00662 for(int b=0;b<in_bands;b++)
00663 {
00664
00665 copydir = cudaMemcpyHostToDevice;
00666 if(cin[b].where==ONDEVICE)
00667 copydir = cudaMemcpyDeviceToDevice;
00668 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,cin[b].width,cin[b].height*cin[b].depth));
00669 assert(imgarray!=NULL);
00670 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0,
00671 cin[b].ptr,cin[b].pitch,
00672 cin[b].width*sizeof(float),cin[b].height*cin[b].depth,
00673 copydir));
00674 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray));
00675 uint3 gridsz = make_uint3(ceilf((float)h_outbands[b].width/BLOCK_SIZE),ceilf((float)h_outbands[b].height/BLOCK_SIZE),1);
00676 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1);
00677 kernel_s_norm_filter<<<gridsz,blocksz>>>(h_outbands[b].ptr,
00678 h_outbands[b].pitch,h_outbands[b].width,h_outbands[b].height,
00679 cin[b].width,cin[b].height,
00680 fdepth,fsz,num_filt);
00681 CUDA_SAFE_CALL(cudaThreadSynchronize());
00682 CUDA_SAFE_CALL(cudaUnbindTexture(teximg));
00683 CUDA_SAFE_CALL(cudaFreeArray(imgarray));
00684 }
00685 CUDA_SAFE_CALL(cudaUnbindTexture(texfilt));
00686 for(int b=0;copy && b<in_bands;b++)
00687 {
00688 int sz = h_outbands[b].height*h_outbands[b].width*num_filt;
00689 float* ptr = new float[sz];
00690 assert(ptr!=NULL);
00691 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float),
00692 h_outbands[b].ptr,h_outbands[b].pitch,
00693 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth,
00694 cudaMemcpyDeviceToHost));
00695 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr));
00696 h_outbands[b].ptr =ptr;
00697 h_outbands[b].pitch =h_outbands[b].width*sizeof(float);
00698 h_outbands[b].where =ONHOST;
00699 }
00700 CUDA_SAFE_CALL(cudaThreadSynchronize());
00701
00702 CUDA_SAFE_CALL(cudaFreeArray(filtarray));
00703 }
00704
00705 template<int u>
00706 __device__ float colDist(int irow,int frow,int col)
00707 {
00708 float pval=0;
00709 float fval=0;
00710 pval = tex2D(teximg,col+u-1,irow);
00711 fval = tex2D(texfilt,u-1,frow);
00712 return (pval-fval)*(pval-fval)+colDist<u-1>(irow,frow,col);
00713 }
00714
00715 template<>
00716 __device__ float colDist<0>(int irow,int frow,int col)
00717 {
00718 return 0;
00719 }
00720
00721 __global__ void kernel_s_rbf(float* dest,int pitch,int wt,int ht,int srcwidth,int srcheight,int depth,int fsz,int num_filt,float sigma)
00722 {
00723 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y;
00724 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x;
00725 int fstride = fsz*depth;
00726 int u,v,d,l,iy,fy;
00727 float num = 0;
00728 float pixval = 0;
00729 float filtval = 0;
00730 int nloops = fsz/4;
00731 int residue = fsz%4;
00732 if(row>=ht) return;
00733 if(col>=wt) return;
00734 for(int f=0;f<num_filt;f++)
00735 {
00736 num = 0;
00737 for(d=0;d<depth;d++)
00738 {
00739 iy = d*srcheight+row;
00740 fy = d*fsz+f*fstride;
00741 switch(fsz)
00742 {
00743 case 4:
00744 v = 0;
00745 num += colDist<4>(iy+v,fy+v,col);v++;
00746 num += colDist<4>(iy+v,fy+v,col);v++;
00747 num += colDist<4>(iy+v,fy+v,col);v++;
00748 num += colDist<4>(iy+v,fy+v,col);v++;
00749 break;
00750 case 6:
00751 v = 0;
00752 num += colDist<6>(iy+v,fy+v,col);v++;
00753 num += colDist<6>(iy+v,fy+v,col);v++;
00754 num += colDist<6>(iy+v,fy+v,col);v++;
00755 num += colDist<6>(iy+v,fy+v,col);v++;
00756 num += colDist<6>(iy+v,fy+v,col);v++;
00757 num += colDist<6>(iy+v,fy+v,col);v++;
00758 break;
00759 case 8:
00760 v = 0;
00761 num += colDist<8>(iy+v,fy+v,col);v++;
00762 num += colDist<8>(iy+v,fy+v,col);v++;
00763 num += colDist<8>(iy+v,fy+v,col);v++;
00764 num += colDist<8>(iy+v,fy+v,col);v++;
00765 num += colDist<8>(iy+v,fy+v,col);v++;
00766 num += colDist<8>(iy+v,fy+v,col);v++;
00767 num += colDist<8>(iy+v,fy+v,col);v++;
00768 num += colDist<8>(iy+v,fy+v,col);v++;
00769 default:
00770 for(v=0;v<fsz;v++,iy++,fy++)
00771 {
00772 for(l=0;l<nloops;l++)
00773 {
00774 pixval =tex2D(teximg,col+u,iy);
00775 filtval=tex2D(texfilt,u,fy);
00776 num +=(filtval-pixval)*(filtval-pixval);
00777 u++;
00778
00779 pixval =tex2D(teximg,col+u,iy);
00780 filtval=tex2D(texfilt,u,fy);
00781 num +=(filtval-pixval)*(filtval-pixval);
00782 u++;
00783
00784 pixval =tex2D(teximg,col+u,iy);
00785 filtval=tex2D(texfilt,u,fy);
00786 num +=(filtval-pixval)*(filtval-pixval);
00787 u++;
00788
00789 pixval =tex2D(teximg,col+u,iy);
00790 filtval=tex2D(texfilt,u,fy);
00791 num +=(filtval-pixval)*(filtval-pixval);
00792 u++;
00793 }
00794 switch(residue)
00795 {
00796 case 3:
00797 pixval =tex2D(teximg,col+u,iy);
00798 filtval=tex2D(texfilt,u,fy);
00799 num +=(filtval-pixval)*(filtval-pixval);
00800 u++;
00801 case 2:
00802 pixval =tex2D(teximg,col+u,iy);
00803 filtval=tex2D(texfilt,u,fy);
00804 num +=(filtval-pixval)*(filtval-pixval);
00805 u++;
00806 case 1:
00807 pixval =tex2D(teximg,col+u,iy);
00808 filtval=tex2D(texfilt,u,fy);
00809 num +=(filtval-pixval)*(filtval-pixval);
00810 u++;
00811 case 0:
00812 break;
00813 }
00814 }
00815 break;
00816 }
00817 }
00818 *elptr(dest,f,row,col,ht,pitch)=exp(-num/sigma);
00819 }
00820 }
00821
00822
00823
00824
00825
00826
00827 void gpu_s_rbf(band_info* cin,int in_bands,band_info* filt,int num_filt, float sigma,band_info** pps, int *out_bands,bool copy)
00828 {
00829 cudaArray* imgarray;
00830 cudaArray* filtarray;
00831 band_info* h_outbands;
00832 float* d_ptr;
00833 cudaMemcpyKind copydir=cudaMemcpyHostToDevice;
00834 size_t pitch;
00835
00836 int fsz = filt[0].width;
00837 int fdepth = filt[0].depth;
00838
00839
00840 h_outbands = new band_info[in_bands];
00841 for(int b=0;b<in_bands;b++)
00842 {
00843 h_outbands[b].height = cin[b].height-fsz+1;
00844 h_outbands[b].width = cin[b].width-fsz+1;
00845 h_outbands[b].depth = num_filt;
00846 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[b].width*sizeof(float),num_filt*h_outbands[b].height));
00847 assert(d_ptr!=NULL);
00848 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*num_filt*h_outbands[b].height));
00849 h_outbands[b].pitch = pitch;
00850 h_outbands[b].ptr = d_ptr;
00851 h_outbands[b].where = ONDEVICE;
00852 }
00853 *pps = h_outbands;
00854 *out_bands= in_bands;
00855
00856
00857 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>();
00858
00859 teximg.addressMode[0] = cudaAddressModeClamp;
00860 teximg.addressMode[1] = cudaAddressModeClamp;
00861 teximg.filterMode = cudaFilterModePoint;
00862 teximg.normalized = false;
00863
00864 cudaChannelFormatDesc filtdesc=cudaCreateChannelDesc<float>();
00865 CUDA_SAFE_CALL(cudaMallocArray(&filtarray,&filtdesc,fsz,num_filt*fsz*fdepth));
00866 texfilt.addressMode[0] = cudaAddressModeClamp;
00867 texfilt.addressMode[1] = cudaAddressModeClamp;
00868 texfilt.filterMode = cudaFilterModePoint;
00869 texfilt.normalized = false;
00870
00871 sigma = 2*sigma*sigma;
00872
00873 for(int f=0;f<num_filt;f++)
00874 {
00875 CUDA_SAFE_CALL(cudaMemcpy2DToArray(filtarray,0,f*fsz*fdepth,
00876 filt[f].ptr,filt[f].width*sizeof(float),
00877 filt[f].width*sizeof(float),filt[f].height*filt[f].depth,
00878 cudaMemcpyHostToDevice));
00879 }
00880 CUDA_SAFE_CALL(cudaBindTextureToArray(texfilt,filtarray));
00881
00882 for(int b=0;b<in_bands;b++)
00883 {
00884 copydir = cudaMemcpyHostToDevice;
00885 if(cin[b].where==ONDEVICE)
00886 copydir=cudaMemcpyDeviceToDevice;
00887 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,cin[b].width,cin[b].height*cin[b].depth));
00888 assert(imgarray!=NULL);
00889
00890 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0,
00891 cin[b].ptr,cin[b].pitch,
00892 cin[b].width*sizeof(float),cin[b].height*cin[b].depth,
00893 copydir));
00894 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray));
00895 uint3 gridsz = make_uint3(ceilf((float)h_outbands[b].width/BLOCK_SIZE),ceilf((float)h_outbands[b].height/BLOCK_SIZE),1);
00896 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1);
00897 kernel_s_rbf<<<gridsz,blocksz>>>(h_outbands[b].ptr,
00898 h_outbands[b].pitch,h_outbands[b].width,h_outbands[b].height,
00899 cin[b].width,cin[b].height,
00900 fdepth,fsz,num_filt,sigma);
00901 CUDA_SAFE_CALL(cudaThreadSynchronize());
00902 CUDA_SAFE_CALL(cudaUnbindTexture(teximg));
00903 CUDA_SAFE_CALL(cudaFreeArray(imgarray));
00904 }
00905 CUDA_SAFE_CALL(cudaUnbindTexture(texfilt));
00906 for(int b=0;copy && b<in_bands;b++)
00907 {
00908 int sz = h_outbands[b].height*h_outbands[b].width*num_filt;
00909 float* ptr = new float[sz];
00910 assert(ptr!=NULL);
00911 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float),
00912 h_outbands[b].ptr,h_outbands[b].pitch,
00913 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth,
00914 cudaMemcpyDeviceToHost));
00915 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr));
00916 h_outbands[b].ptr =ptr;
00917 h_outbands[b].pitch =h_outbands[b].width*sizeof(float);
00918 h_outbands[b].where =ONHOST;
00919 }
00920 CUDA_SAFE_CALL(cudaThreadSynchronize());
00921
00922 CUDA_SAFE_CALL(cudaFreeArray(filtarray));
00923 }
00924
00925 void cpu_c_global(
00926 IN band_info* s,
00927 IN int in_bands,
00928 OUT float** ppc,
00929 OUT int* out_units
00930 )
00931 {
00932 *out_units = s[0].depth;
00933 *ppc = new float[*out_units];
00934 assert(*ppc);
00935
00936 float* pc = *ppc;
00937 memset(pc,0,sizeof(float)*(*out_units));
00938
00939 for(int d=0;d<s[0].depth;d++)
00940 {
00941 for(int b=0;b<in_bands;b++)
00942 {
00943 int numel = s[b].height*s[b].width;
00944 float* ptr = s[b].ptr+d*numel;
00945 const float* pmaxval= max_element(ptr,ptr+numel);
00946 pc[d] = max(*pmaxval,pc[d]);
00947 }
00948 }
00949 }
00950
00951