00001 /* 00002 * Modified from MIT CBCL SVN repository 00003 * Author: Sharat Chikkerur 00004 */ 00005 00006 00007 #include <cuda.h> 00008 #include "CUDA/cutil.h" 00009 #include <stdio.h> 00010 #include <assert.h> 00011 #include "cbcl_model.h" 00012 00013 #define BLOCK_SIZE 16 00014 00015 00016 const float* min_element(const float* start,const float* end) 00017 { 00018 00019 const float* minptr=start; 00020 float minval=*start; 00021 float val = minval; 00022 for(const float* ptr=start;ptr!=end;ptr++) 00023 { 00024 val=*ptr; 00025 if(val<minval) 00026 { 00027 minval=val; 00028 minptr=ptr; 00029 } 00030 } 00031 return minptr; 00032 } 00033 00034 00035 const float* max_element(const float* start,const float* end) 00036 { 00037 const float* maxptr=start; 00038 float maxval=*start; 00039 float val = maxval; 00040 for(const float* ptr=start;ptr!=end;ptr++) 00041 { 00042 val=*ptr; 00043 if(val>maxval) 00044 { 00045 maxval=val; 00046 maxptr=ptr; 00047 } 00048 } 00049 return maxptr; 00050 } 00051 00052 00053 void cpu_create_filters(band_info** ppfilt,int nfilts, int width, int height, int depth) 00054 { 00055 *ppfilt = new band_info[nfilts]; 00056 assert(*ppfilt !=NULL); 00057 for(int i=0;i<nfilts;i++) 00058 { 00059 band_info* pfilt = *ppfilt+i; 00060 pfilt->depth = depth; 00061 pfilt->height = height; 00062 pfilt->width = width; 00063 pfilt->where = ONHOST; 00064 /*allocate memory for the image*/ 00065 pfilt->pitch=pfilt->width*sizeof(float); 00066 pfilt->ptr =new float[pfilt->depth*pfilt->height*pfilt->width]; 00067 assert(pfilt->ptr); 00068 } 00069 } 00070 00071 int cpu_get_width(band_info** ppfilt, int nfilts, int index) 00072 { 00073 band_info* pfilt = *ppfilt+index; 00074 assert(index < nfilts); 00075 return pfilt->width; 00076 } 00077 00078 int cpu_get_height(band_info** ppfilt, int nfilts, int index) 00079 { 00080 band_info* pfilt = *ppfilt+index; 00081 assert(index < nfilts); 00082 return pfilt->height; 00083 } 00084 00085 int cpu_get_depth(band_info** ppfilt, int nfilts, int index) 00086 { 00087 band_info* pfilt = *ppfilt+index; 00088 assert(index < nfilts); 00089 return pfilt->depth; 00090 } 00091 00092 void cpu_copy_filter(band_info** ppfilt, int nfilts, int index, int x1, int y1, band_info **ppfiltin, int indexin, int x2, int y2) 00093 { 00094 band_info* pfilt = *ppfilt+index; 00095 band_info* pfiltin = *ppfiltin+indexin; 00096 assert(index < nfilts); 00097 assert(x1 < pfilt->width); 00098 assert(y1 < pfilt->height); 00099 assert(x2 < pfiltin->width); 00100 assert(y2 < pfiltin->height); 00101 assert(pfilt->width > 0); 00102 assert(pfilt->height > 0); 00103 assert(pfilt->depth > 0); 00104 assert(pfiltin->width > 0); 00105 assert(pfiltin->height > 0); 00106 assert(pfiltin->depth > 0); 00107 00108 assert(pfiltin->depth == pfilt->depth); 00109 00110 for(int d=0;d<pfilt->depth;d++) 00111 { 00112 float* ptr=pfilt->ptr+d*pfilt->height*pfilt->width; 00113 float* ptrin=pfiltin->ptr+d*pfiltin->height*pfiltin->width; 00114 ptr[y1*pfilt->width+x1] = ptrin[y2*pfiltin->width+x2]; 00115 } 00116 } 00117 00118 void cpu_read_filters(const char *fname, band_info** ppfilt,int* pnfilts) 00119 { 00120 /*read number of filters*/ 00121 int num_filters=0; 00122 00123 FILE *fp; 00124 00125 fp = fopen(fname, "r"); 00126 if(!fp) exit(0); 00127 fscanf(fp,"%d",&num_filters); 00128 00129 printf("Loading %d filters\n",num_filters); 00130 assert(num_filters >= 1); 00131 *pnfilts= num_filters; 00132 *ppfilt = new band_info[num_filters]; 00133 assert(*ppfilt !=NULL); 00134 00135 for(int i=0;i<num_filters;i++) 00136 { 00137 band_info* pfilt = *ppfilt+i; 00138 fscanf(fp,"%d",&(pfilt->depth)); 00139 fscanf(fp,"%d",&(pfilt->height)); 00140 fscanf(fp,"%d",&(pfilt->width)); 00141 /*allocate memory for the image*/ 00142 pfilt->pitch=pfilt->width*sizeof(float); 00143 pfilt->ptr =new float[pfilt->depth*pfilt->height*pfilt->width]; 00144 pfilt->where=ONHOST; 00145 assert(pfilt->ptr); 00146 for(int d=0;d<pfilt->depth;d++) 00147 { 00148 float* ptr=pfilt->ptr+d*pfilt->height*pfilt->width; 00149 for(int y=0;y<pfilt->height;y++) 00150 { 00151 for(int x=0;x<pfilt->width;x++) 00152 fscanf(fp,"%f",&(ptr[y*pfilt->width+x])); 00153 } 00154 } 00155 } 00156 fclose(fp); 00157 00158 } 00159 00160 void cpu_write_filters(band_info* pfilt,int nfilts,const char* filename) 00161 { 00162 FILE *fp; 00163 fp = fopen(filename,"w"); 00164 assert(nfilts>=1); 00165 assert(pfilt!=NULL); 00166 00167 fprintf(fp,"%d\n",nfilts); 00168 00169 for(int i=0;i<nfilts;i++,pfilt++) 00170 { 00171 fprintf(fp,"%d\n",pfilt->depth); 00172 fprintf(fp,"%d\n",pfilt->height); 00173 fprintf(fp,"%d\n",pfilt->width); 00174 00175 for(int d=0;d<pfilt->depth;d++) 00176 { 00177 //float* ptr=pfilt->ptr+d*pfilt->height*pfilt->width; 00178 for(int y=0;y<pfilt->height;y++) 00179 { 00180 for(int x=0;x<pfilt->width;x++) 00181 fprintf(fp,"%.8f ",*elptr(pfilt->ptr,d,y,x,pfilt->height,pfilt->pitch)); 00182 fprintf(fp,"\n"); 00183 } 00184 } 00185 } 00186 fclose(fp); 00187 } 00188 00189 /* 00190 image texture 00191 */ 00192 00193 00194 texture<float,2,cudaReadModeElementType> teximg; 00195 texture<float,2,cudaReadModeElementType> texfilt; 00196 00197 __host__ __device__ float* elptr(float* base,int depth,int row,int col,int height,int pitch) 00198 { 00199 return (float*)((char*)base+depth*height*pitch+row*pitch)+col; 00200 } 00201 00202 __global__ void kernel_c_local(float* dest,int pitch,int depth,float wt,float ht,int poolxy,int stepxy,int srcwidth,int srcheight,float xscale,float yscale) 00203 { 00204 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x; 00205 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y; 00206 int x = col*stepxy; 00207 int y = row*stepxy; 00208 if(row>=ht) return; 00209 if(col>=wt) return; 00210 for(int d=0;d<depth;d++) 00211 { 00212 float maxval = 0; 00213 float pixval = 0; 00214 int istride = d*srcheight; 00215 for(int v=0;v<poolxy;v++) 00216 for(int u=0;u<poolxy;u++) 00217 { 00218 pixval=tex2D(teximg,(x+u)*xscale,istride+(y+v)*yscale); 00219 maxval=maxval>pixval?maxval:pixval; 00220 } 00221 float* ptr = elptr(dest,d,row,col,ht,pitch); 00222 if(*ptr<maxval) 00223 *ptr = maxval; 00224 } 00225 } 00226 00227 void gpu_c_local( 00228 IN band_info* sin, /*pointer to DEVICE storage*/ 00229 IN int in_bands, /*number of input bands*/ 00230 IN int pool_xy, /*spatial pooling: subsampling by pool_xy/2*/ 00231 IN int step_xy, /*spatial subsampling factor*/ 00232 IN int pool_scale, /*scale wise pooling: out_bands=in_bands/pool_scale*/ 00233 IN int step_scale, /*scale incremenet step*/ 00234 OUT band_info** c, /*pointer to DEVICE storage*/ 00235 OUT int* pout_bands, /*number of output bands*/ 00236 IN bool copy 00237 ) 00238 { 00239 cudaArray* imgarray; 00240 band_info* h_outbands; 00241 float* d_ptr; 00242 cudaMemcpyKind copydir=cudaMemcpyHostToDevice; 00243 int i,o,b; 00244 size_t pitch; 00245 int out_bands = (in_bands-pool_scale)/step_scale+1; 00246 /*stage output*/ 00247 h_outbands = new band_info[out_bands]; 00248 assert(h_outbands!=NULL); 00249 for(i=0,o=0;i<=in_bands-pool_scale;i+=step_scale,o++) 00250 { 00251 h_outbands[o].height = (sin[i].height-pool_xy)/step_xy+1; 00252 h_outbands[o].width = (sin[i].width-pool_xy)/step_xy+1; 00253 h_outbands[o].depth = sin[i].depth; 00254 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[o].width*sizeof(float),h_outbands[o].depth*h_outbands[o].height)); 00255 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*h_outbands[o].depth*h_outbands[o].height)); 00256 h_outbands[o].pitch = pitch; 00257 h_outbands[o].ptr = d_ptr; 00258 h_outbands[o].where = ONDEVICE; 00259 } 00260 *pout_bands = out_bands; 00261 *c = h_outbands; 00262 00263 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>(); 00264 /*bind the texture*/ 00265 teximg.addressMode[0] = cudaAddressModeClamp; 00266 teximg.addressMode[1] = cudaAddressModeClamp; 00267 teximg.filterMode = cudaFilterModePoint; 00268 teximg.normalized = false; 00269 00270 /*copy image*/ 00271 for(i=0,o=0;i<=in_bands-pool_scale;i+=step_scale,o++) 00272 { 00273 for(b=0;b<pool_scale;b++) 00274 { 00275 copydir = cudaMemcpyHostToDevice; 00276 if(sin[i+b].where==ONDEVICE) 00277 copydir=cudaMemcpyDeviceToDevice; 00278 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,sin[i+b].width,sin[i+b].height*sin[i+b].depth)); 00279 assert(imgarray!=NULL); 00280 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0, 00281 sin[i+b].ptr,sin[i+b].pitch, 00282 sin[i+b].width*sizeof(float),sin[i+b].height*sin[i+b].depth, 00283 copydir)); 00284 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray)); 00285 /*call the kernel*/ 00286 uint3 gridsz = make_uint3(ceilf((float)h_outbands[o].width/BLOCK_SIZE),ceilf((float)h_outbands[o].height/BLOCK_SIZE),1); 00287 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1); 00288 kernel_c_local<<<gridsz,blocksz>>>(h_outbands[o].ptr,h_outbands[o].pitch, 00289 h_outbands[o].depth,h_outbands[o].width,h_outbands[o].height, 00290 pool_xy,step_xy, 00291 sin[i+b].width,sin[i+b].height, 00292 (float)sin[i+b].width/sin[i].width,(float)sin[i+b].height/sin[i].height); 00293 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00294 CUDA_SAFE_CALL(cudaUnbindTexture(teximg)); 00295 CUDA_SAFE_CALL(cudaFreeArray(imgarray)); 00296 } 00297 } 00298 /*copy image back to host*/ 00299 for(b=0;copy && b<out_bands;b++) 00300 { 00301 int sz = h_outbands[b].height*h_outbands[b].width*h_outbands[b].depth; 00302 float* ptr = new float[sz]; 00303 assert(ptr!=NULL); 00304 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float), 00305 h_outbands[b].ptr,h_outbands[b].pitch, 00306 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth, 00307 cudaMemcpyDeviceToHost)); 00308 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr)); 00309 h_outbands[b].ptr =ptr; 00310 h_outbands[b].pitch =h_outbands[b].width*sizeof(float); 00311 h_outbands[b].where =ONHOST; 00312 } 00313 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00314 } 00315 00316 00317 void cpu_release_images(band_info** ppbands,int num_bands) 00318 { 00319 for(int i=0;i<num_bands;i++) 00320 { 00321 band_info* pb=*ppbands+i; 00322 if(pb->where==ONHOST) 00323 delete[] pb->ptr; 00324 else 00325 CUDA_SAFE_CALL(cudaFree(pb->ptr)); 00326 } 00327 delete [] *ppbands; 00328 *ppbands = NULL; 00329 } 00330 00331 __global__ void kernel_resize(float *dest,int pitch,int wt,int ht,float xscale,float yscale) 00332 { 00333 int row=blockIdx.y*BLOCK_SIZE+threadIdx.y; 00334 int col=blockIdx.x*BLOCK_SIZE+threadIdx.x; 00335 if(row>=ht) return; 00336 if(col>=wt) return; 00337 float* ptr = elptr(dest,0,row,col,ht,pitch); 00338 *ptr = tex2D(teximg,(float)col*xscale,(float)row*yscale); 00339 } 00340 00341 void gpu_create_c0(const float* pimg,int width,int height,band_info** ppc,int* pbands,float scale,int num_scales,bool copy) 00342 { 00343 size_t pitch = 0; 00344 *ppc = new band_info[num_scales]; 00345 *pbands = num_scales; 00346 assert(*ppc!=NULL); 00347 assert(*pbands>=1); 00348 float* ptr = NULL; 00349 CUDA_SAFE_CALL(cudaMallocPitch((void**)&ptr,&pitch,width*sizeof(float),height)); 00350 assert(ptr); 00351 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,pitch,pimg,width*sizeof(float),width*sizeof(float),height,cudaMemcpyHostToDevice)); 00352 /*create first band*/ 00353 (*ppc)->height = height; 00354 (*ppc)->width = width; 00355 (*ppc)->depth = 1; 00356 (*ppc)->pitch = pitch; 00357 (*ppc)->ptr = ptr; 00358 cudaArray* pdimg; 00359 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>(); 00360 /*bind the texture*/ 00361 teximg.addressMode[0] = cudaAddressModeClamp; 00362 teximg.addressMode[1] = cudaAddressModeClamp; 00363 teximg.filterMode = cudaFilterModeLinear; 00364 teximg.normalized = false; 00365 /*copy to array*/ 00366 for(int b=1;b<num_scales;b++) 00367 { 00368 band_info* prev = *ppc+b-1; 00369 band_info* pc = *ppc+b; 00370 int bht = roundf(prev->height/scale); 00371 int bwt = roundf(prev->width/scale); 00372 /*map to texture*/ 00373 CUDA_SAFE_CALL(cudaMallocArray(&pdimg,&imgdesc,prev->width,prev->height)); 00374 CUDA_SAFE_CALL(cudaMemcpy2DToArray(pdimg,0,0, 00375 prev->ptr,prev->pitch, 00376 prev->width*sizeof(float),prev->height, 00377 cudaMemcpyDeviceToDevice)); 00378 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,pdimg)); 00379 /*allocate the memory*/ 00380 float* pdout = NULL; 00381 CUDA_SAFE_CALL(cudaMallocPitch((void**)&pdout,&pitch,bwt*sizeof(float),bht)); 00382 CUDA_SAFE_CALL(cudaMemset(pdout,0,bht*pitch)); 00383 pc->height = bht; 00384 pc->width = bwt; 00385 pc->pitch = pitch; 00386 pc->depth = 1; 00387 pc->ptr = pdout; 00388 pc->where = ONDEVICE; 00389 /*call the kernel*/ 00390 uint3 gridsz = make_uint3(ceilf((float)bwt/BLOCK_SIZE),ceilf((float)bht/BLOCK_SIZE),1); 00391 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1); 00392 kernel_resize<<<gridsz,blocksz>>>(pdout,pitch,bwt,bht,(float)prev->width/bwt,(float)prev->height/bht); 00393 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00394 CUDA_SAFE_CALL(cudaFreeArray(pdimg)); 00395 CUDA_SAFE_CALL(cudaUnbindTexture(teximg)); 00396 } 00397 for(int b=0;copy && b<num_scales;b++) 00398 { 00399 band_info* pc =*ppc+b; 00400 float* ptr=new float[pc->height*pc->width]; 00401 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,pc->width*sizeof(float), 00402 pc->ptr,pc->pitch, 00403 pc->width*sizeof(float),pc->height, 00404 cudaMemcpyDeviceToHost)); 00405 CUDA_SAFE_CALL(cudaFree(pc->ptr)); 00406 pc->ptr = ptr; 00407 pc->pitch = pc->width*sizeof(float); 00408 pc->where = ONHOST; 00409 } 00410 } 00411 00412 void cpu_create_c0(const float* pimg,int width,int height,band_info** ppc,int* pbands,float scale,int num_scales) 00413 { 00414 *ppc = new band_info[num_scales]; 00415 *pbands = num_scales; 00416 assert(*ppc!=NULL); 00417 assert(*pbands>=1); 00418 /*create first band*/ 00419 band_info* prev = *ppc; 00420 prev->height = height; 00421 prev->width = width; 00422 prev->depth = 1; 00423 prev->pitch = width*sizeof(float); 00424 prev->ptr = new float[height*width]; 00425 prev->where = ONHOST; 00426 memcpy(prev->ptr,pimg,height*width*sizeof(float)); 00427 assert(prev->ptr); 00428 /*determine dynamic range*/ 00429 float minval = *min_element(pimg,&pimg[height*width]); 00430 float maxval = *max_element(pimg,&pimg[height*width]); 00431 float range = maxval-minval+1e-6; 00432 00433 /*create the other bands recursively*/ 00434 for(int b=1;b<num_scales;b++,prev++) 00435 { 00436 pimg = prev->ptr; 00437 width = prev->width; 00438 height = prev->height; 00439 00440 band_info* pc = *ppc+b; 00441 int bht = roundf(height/scale); 00442 int bwt = roundf(width/scale); 00443 pc->height = bht; 00444 pc->width = bwt; 00445 pc->pitch = bwt*sizeof(float); 00446 pc->depth = 1; 00447 pc->ptr = new float[bht*bwt]; 00448 pc->where = ONHOST; 00449 assert(pc->ptr!=NULL); 00450 float cmin = 1e6; /*current min*/ 00451 float cmax = 0; /*current max*/ 00452 for(int x=0;x<bwt;x++) 00453 { 00454 for(int y=0;y<bht;y++) 00455 { 00456 float sx = x*scale; 00457 float sy = y*scale; 00458 int fx = floorf(sx); int cx = ceilf(sx);cx=(cx>=width)?(width-1):cx; 00459 int fy = floorf(sy); int cy = ceilf(sy);cy=(cy>=height)?(height-1):cy; 00460 float xalpha=sx-fx; 00461 float yalpha=sy-fy; 00462 float val =pimg[fx+fy*width]*(1-xalpha)*(1-yalpha)+ 00463 pimg[cx+fy*width]*(xalpha)*(1-yalpha)+ 00464 pimg[fx+cy*width]*(1-xalpha)*(yalpha)+ 00465 pimg[cx+cy*width]*(xalpha)*(yalpha); 00466 pc->ptr[y*bwt+x]=val; 00467 if(val<cmin) cmin=val; 00468 if(val>cmax) cmax=val; 00469 } 00470 } 00471 float crange = cmax-cmin+1e-6; 00472 float factor = range/crange; 00473 for(int i=0;i<bht*bwt;i++) 00474 pc->ptr[i]=(pc->ptr[i]-cmin)*factor+minval; 00475 } 00476 } 00477 00478 template<int u> 00479 __device__ void colNorm(int irow,int frow,int col,float* pnum,float* pden) 00480 { 00481 float pval=0; 00482 float fval=0; 00483 pval = tex2D(teximg,col+u-1,irow); 00484 fval = tex2D(texfilt,u-1,frow); 00485 *pnum += pval*fval; 00486 *pden += pval*pval; 00487 colNorm<u-1>(irow,frow,col,pnum,pden); 00488 } 00489 00490 template<> 00491 __device__ void colNorm<0>(int irow,int frow,int col,float *pnum,float *pden) 00492 { 00493 return; 00494 } 00495 00496 00497 __global__ void kernel_s_norm_filter(float* dest,int pitch,int wt,int ht,int srcwidth,int srcheight,int depth,int fsz,int num_filt) 00498 { 00499 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y; 00500 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x; 00501 int fstride = fsz*depth; 00502 int u,v,d,l,f,iy,fy; 00503 float den = 0; 00504 float num = 0; 00505 float pixval = 0; 00506 float filtval = 0; 00507 int nloops = fsz/4; 00508 int residue = fsz%4; 00509 if(row>=ht) return; 00510 if(col>=wt) return; 00511 for(f=0;f<num_filt;f++) 00512 { 00513 den = 1e-6; 00514 num = 0; 00515 for(d=0;d<depth;d++) 00516 { 00517 iy = d*srcheight+row; 00518 fy = d*fsz+f*fstride; 00519 switch(fsz) 00520 { 00521 case 5: 00522 v = 0; 00523 colNorm<5>(iy+v,fy+v,col,&num,&den);v++; 00524 colNorm<5>(iy+v,fy+v,col,&num,&den);v++; 00525 colNorm<5>(iy+v,fy+v,col,&num,&den);v++; 00526 colNorm<5>(iy+v,fy+v,col,&num,&den);v++; 00527 colNorm<5>(iy+v,fy+v,col,&num,&den);v++; 00528 case 7: 00529 v = 0; 00530 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00531 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00532 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00533 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00534 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00535 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00536 colNorm<7>(iy+v,fy+v,col,&num,&den);v++; 00537 break; 00538 default: 00539 for(v=0;v<fsz;v++,iy++,fy++) 00540 { 00541 u =0; 00542 for(l=0;l<nloops;l++) 00543 { 00544 //1 00545 pixval =tex2D(teximg,col+u,iy); 00546 filtval=tex2D(texfilt,u,fy); 00547 num +=filtval*pixval; 00548 den +=pixval*pixval; 00549 u++; 00550 00551 //2 00552 pixval =tex2D(teximg,col+u,iy); 00553 filtval=tex2D(texfilt,u,fy); 00554 num +=filtval*pixval; 00555 den +=pixval*pixval; 00556 u++; 00557 00558 //3 00559 pixval =tex2D(teximg,col+u,iy); 00560 filtval=tex2D(texfilt,u,fy); 00561 num +=filtval*pixval; 00562 den +=pixval*pixval; 00563 u++; 00564 00565 //4 00566 pixval =tex2D(teximg,col+u,iy); 00567 filtval=tex2D(texfilt,u,fy); 00568 num +=filtval*pixval; 00569 den +=pixval*pixval; 00570 u++; 00571 } 00572 switch(residue) 00573 { 00574 case 3: 00575 pixval =tex2D(teximg,col+u,iy); 00576 filtval=tex2D(texfilt,u,fy); 00577 num +=filtval*pixval; 00578 den +=pixval*pixval; 00579 u++; 00580 case 2: 00581 pixval =tex2D(teximg,col+u,iy); 00582 filtval=tex2D(texfilt,u,fy); 00583 num +=filtval*pixval; 00584 den +=pixval*pixval; 00585 u++; 00586 case 1: 00587 pixval =tex2D(teximg,col+u,iy); 00588 filtval=tex2D(texfilt,u,fy); 00589 num +=filtval*pixval; 00590 den +=pixval*pixval; 00591 u++; 00592 case 0: 00593 break; 00594 } 00595 }//v 00596 break; 00597 }//switch 00598 }//d 00599 *elptr(dest,f,row,col,ht,pitch)=fabs(num)/sqrtf(den); 00600 } 00601 } 00602 00603 /* 00604 put the image into texture memory 00605 put the filter into global memory 00606 call the kernel for each band of the input (maybe change later) 00607 */ 00608 void gpu_s_norm_filter(band_info* cin,int in_bands,band_info* filt,int num_filt, band_info** pps, int *out_bands,bool copy) 00609 { 00610 cudaArray* imgarray; 00611 cudaArray* filtarray; 00612 band_info* h_outbands; 00613 float* d_ptr; 00614 cudaMemcpyKind copydir; 00615 size_t pitch; 00616 /*channel description*/ 00617 00618 /*determine largest filter size*/ 00619 int fsz = filt[0].width; 00620 int fdepth = filt[0].depth; 00621 /*stage output*/ 00622 h_outbands = new band_info[in_bands]; 00623 for(int b=0;b<in_bands;b++) 00624 { 00625 h_outbands[b].height = cin[b].height-fsz+1; 00626 h_outbands[b].width = cin[b].width-fsz+1; 00627 h_outbands[b].depth = num_filt; 00628 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[b].width*sizeof(float),num_filt*h_outbands[b].height)); 00629 assert(d_ptr!=NULL); 00630 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*num_filt*h_outbands[b].height)); 00631 h_outbands[b].pitch = pitch; 00632 h_outbands[b].where = ONDEVICE; 00633 h_outbands[b].ptr = d_ptr; 00634 } 00635 *pps = h_outbands; 00636 *out_bands= in_bands; 00637 00638 /*copy image*/ 00639 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>(); 00640 /*fix address modes*/ 00641 teximg.addressMode[0] = cudaAddressModeClamp; 00642 teximg.addressMode[1] = cudaAddressModeClamp; 00643 teximg.filterMode = cudaFilterModePoint; 00644 teximg.normalized = false; 00645 cudaChannelFormatDesc filtdesc=cudaCreateChannelDesc<float>(); 00646 CUDA_SAFE_CALL(cudaMallocArray(&filtarray,&filtdesc,fsz,num_filt*fsz*fdepth)); 00647 texfilt.addressMode[0] = cudaAddressModeClamp; 00648 texfilt.addressMode[1] = cudaAddressModeClamp; 00649 texfilt.filterMode = cudaFilterModePoint; 00650 texfilt.normalized = false; 00651 /*copy filters*/ 00652 for(int f=0;f<num_filt;f++) 00653 { 00654 CUDA_SAFE_CALL(cudaMemcpy2DToArray(filtarray,0,f*fsz*fdepth, 00655 filt[f].ptr,filt[f].width*sizeof(float), 00656 filt[f].width*sizeof(float),filt[f].height*filt[f].depth, 00657 cudaMemcpyHostToDevice)); 00658 } 00659 CUDA_SAFE_CALL(cudaBindTextureToArray(texfilt,filtarray)); 00660 00661 /*call the kernel*/ 00662 for(int b=0;b<in_bands;b++) 00663 { 00664 /*copy to array*/ 00665 copydir = cudaMemcpyHostToDevice; 00666 if(cin[b].where==ONDEVICE) 00667 copydir = cudaMemcpyDeviceToDevice; 00668 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,cin[b].width,cin[b].height*cin[b].depth)); 00669 assert(imgarray!=NULL); 00670 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0, 00671 cin[b].ptr,cin[b].pitch, 00672 cin[b].width*sizeof(float),cin[b].height*cin[b].depth, 00673 copydir)); 00674 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray)); 00675 uint3 gridsz = make_uint3(ceilf((float)h_outbands[b].width/BLOCK_SIZE),ceilf((float)h_outbands[b].height/BLOCK_SIZE),1); 00676 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1); 00677 kernel_s_norm_filter<<<gridsz,blocksz>>>(h_outbands[b].ptr, 00678 h_outbands[b].pitch,h_outbands[b].width,h_outbands[b].height, 00679 cin[b].width,cin[b].height, 00680 fdepth,fsz,num_filt); 00681 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00682 CUDA_SAFE_CALL(cudaUnbindTexture(teximg)); 00683 CUDA_SAFE_CALL(cudaFreeArray(imgarray)); 00684 } 00685 CUDA_SAFE_CALL(cudaUnbindTexture(texfilt)); 00686 for(int b=0;copy && b<in_bands;b++) 00687 { 00688 int sz = h_outbands[b].height*h_outbands[b].width*num_filt; 00689 float* ptr = new float[sz]; 00690 assert(ptr!=NULL); 00691 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float), 00692 h_outbands[b].ptr,h_outbands[b].pitch, 00693 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth, 00694 cudaMemcpyDeviceToHost)); 00695 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr)); 00696 h_outbands[b].ptr =ptr; 00697 h_outbands[b].pitch =h_outbands[b].width*sizeof(float); 00698 h_outbands[b].where =ONHOST; 00699 } 00700 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00701 /*copy image to output*/ 00702 CUDA_SAFE_CALL(cudaFreeArray(filtarray)); 00703 } 00704 00705 template<int u> 00706 __device__ float colDist(int irow,int frow,int col) 00707 { 00708 float pval=0; 00709 float fval=0; 00710 pval = tex2D(teximg,col+u-1,irow); 00711 fval = tex2D(texfilt,u-1,frow); 00712 return (pval-fval)*(pval-fval)+colDist<u-1>(irow,frow,col); 00713 } 00714 00715 template<> 00716 __device__ float colDist<0>(int irow,int frow,int col) 00717 { 00718 return 0; 00719 } 00720 00721 __global__ void kernel_s_rbf(float* dest,int pitch,int wt,int ht,int srcwidth,int srcheight,int depth,int fsz,int num_filt,float sigma) 00722 { 00723 int row = blockIdx.y*BLOCK_SIZE+threadIdx.y; 00724 int col = blockIdx.x*BLOCK_SIZE+threadIdx.x; 00725 int fstride = fsz*depth; 00726 int u,v,d,l,iy,fy; 00727 float num = 0; 00728 float pixval = 0; 00729 float filtval = 0; 00730 int nloops = fsz/4; 00731 int residue = fsz%4; 00732 if(row>=ht) return; 00733 if(col>=wt) return; 00734 for(int f=0;f<num_filt;f++) 00735 { 00736 num = 0; 00737 for(d=0;d<depth;d++) 00738 { 00739 iy = d*srcheight+row; 00740 fy = d*fsz+f*fstride; 00741 switch(fsz) 00742 { 00743 case 4: 00744 v = 0; 00745 num += colDist<4>(iy+v,fy+v,col);v++; 00746 num += colDist<4>(iy+v,fy+v,col);v++; 00747 num += colDist<4>(iy+v,fy+v,col);v++; 00748 num += colDist<4>(iy+v,fy+v,col);v++; 00749 break; 00750 case 6: 00751 v = 0; 00752 num += colDist<6>(iy+v,fy+v,col);v++; 00753 num += colDist<6>(iy+v,fy+v,col);v++; 00754 num += colDist<6>(iy+v,fy+v,col);v++; 00755 num += colDist<6>(iy+v,fy+v,col);v++; 00756 num += colDist<6>(iy+v,fy+v,col);v++; 00757 num += colDist<6>(iy+v,fy+v,col);v++; 00758 break; 00759 case 8: 00760 v = 0; 00761 num += colDist<8>(iy+v,fy+v,col);v++; 00762 num += colDist<8>(iy+v,fy+v,col);v++; 00763 num += colDist<8>(iy+v,fy+v,col);v++; 00764 num += colDist<8>(iy+v,fy+v,col);v++; 00765 num += colDist<8>(iy+v,fy+v,col);v++; 00766 num += colDist<8>(iy+v,fy+v,col);v++; 00767 num += colDist<8>(iy+v,fy+v,col);v++; 00768 num += colDist<8>(iy+v,fy+v,col);v++; 00769 default: 00770 for(v=0;v<fsz;v++,iy++,fy++) 00771 { 00772 for(l=0;l<nloops;l++) 00773 { 00774 pixval =tex2D(teximg,col+u,iy); 00775 filtval=tex2D(texfilt,u,fy); 00776 num +=(filtval-pixval)*(filtval-pixval); 00777 u++; 00778 00779 pixval =tex2D(teximg,col+u,iy); 00780 filtval=tex2D(texfilt,u,fy); 00781 num +=(filtval-pixval)*(filtval-pixval); 00782 u++; 00783 00784 pixval =tex2D(teximg,col+u,iy); 00785 filtval=tex2D(texfilt,u,fy); 00786 num +=(filtval-pixval)*(filtval-pixval); 00787 u++; 00788 00789 pixval =tex2D(teximg,col+u,iy); 00790 filtval=tex2D(texfilt,u,fy); 00791 num +=(filtval-pixval)*(filtval-pixval); 00792 u++; 00793 } 00794 switch(residue) 00795 { 00796 case 3: 00797 pixval =tex2D(teximg,col+u,iy); 00798 filtval=tex2D(texfilt,u,fy); 00799 num +=(filtval-pixval)*(filtval-pixval); 00800 u++; 00801 case 2: 00802 pixval =tex2D(teximg,col+u,iy); 00803 filtval=tex2D(texfilt,u,fy); 00804 num +=(filtval-pixval)*(filtval-pixval); 00805 u++; 00806 case 1: 00807 pixval =tex2D(teximg,col+u,iy); 00808 filtval=tex2D(texfilt,u,fy); 00809 num +=(filtval-pixval)*(filtval-pixval); 00810 u++; 00811 case 0: 00812 break; 00813 } 00814 }//v 00815 break; 00816 }//switch 00817 }//d 00818 *elptr(dest,f,row,col,ht,pitch)=exp(-num/sigma); 00819 }//f 00820 } 00821 00822 /* 00823 put the image into texture memory 00824 put the filter into global memory 00825 call the kernel for each band of the input (maybe change later) 00826 */ 00827 void gpu_s_rbf(band_info* cin,int in_bands,band_info* filt,int num_filt, float sigma,band_info** pps, int *out_bands,bool copy) 00828 { 00829 cudaArray* imgarray; 00830 cudaArray* filtarray; 00831 band_info* h_outbands; 00832 float* d_ptr; 00833 cudaMemcpyKind copydir=cudaMemcpyHostToDevice; 00834 size_t pitch; 00835 /*determine largest filter size*/ 00836 int fsz = filt[0].width; 00837 int fdepth = filt[0].depth; 00838 00839 /*stage output*/ 00840 h_outbands = new band_info[in_bands]; 00841 for(int b=0;b<in_bands;b++) 00842 { 00843 h_outbands[b].height = cin[b].height-fsz+1; 00844 h_outbands[b].width = cin[b].width-fsz+1; 00845 h_outbands[b].depth = num_filt; 00846 CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ptr,&pitch,h_outbands[b].width*sizeof(float),num_filt*h_outbands[b].height)); 00847 assert(d_ptr!=NULL); 00848 CUDA_SAFE_CALL(cudaMemset(d_ptr,0,pitch*num_filt*h_outbands[b].height)); 00849 h_outbands[b].pitch = pitch; 00850 h_outbands[b].ptr = d_ptr; 00851 h_outbands[b].where = ONDEVICE; 00852 } 00853 *pps = h_outbands; 00854 *out_bands= in_bands; 00855 00856 /*copy image*/ 00857 cudaChannelFormatDesc imgdesc=cudaCreateChannelDesc<float>(); 00858 /*fix address modes*/ 00859 teximg.addressMode[0] = cudaAddressModeClamp; 00860 teximg.addressMode[1] = cudaAddressModeClamp; 00861 teximg.filterMode = cudaFilterModePoint; 00862 teximg.normalized = false; 00863 00864 cudaChannelFormatDesc filtdesc=cudaCreateChannelDesc<float>(); 00865 CUDA_SAFE_CALL(cudaMallocArray(&filtarray,&filtdesc,fsz,num_filt*fsz*fdepth)); 00866 texfilt.addressMode[0] = cudaAddressModeClamp; 00867 texfilt.addressMode[1] = cudaAddressModeClamp; 00868 texfilt.filterMode = cudaFilterModePoint; 00869 texfilt.normalized = false; 00870 00871 sigma = 2*sigma*sigma; 00872 /*copy filters*/ 00873 for(int f=0;f<num_filt;f++) 00874 { 00875 CUDA_SAFE_CALL(cudaMemcpy2DToArray(filtarray,0,f*fsz*fdepth, 00876 filt[f].ptr,filt[f].width*sizeof(float), 00877 filt[f].width*sizeof(float),filt[f].height*filt[f].depth, 00878 cudaMemcpyHostToDevice)); 00879 } 00880 CUDA_SAFE_CALL(cudaBindTextureToArray(texfilt,filtarray)); 00881 /*call the kernel*/ 00882 for(int b=0;b<in_bands;b++) 00883 { 00884 copydir = cudaMemcpyHostToDevice; 00885 if(cin[b].where==ONDEVICE) 00886 copydir=cudaMemcpyDeviceToDevice; 00887 CUDA_SAFE_CALL(cudaMallocArray(&imgarray,&imgdesc,cin[b].width,cin[b].height*cin[b].depth)); 00888 assert(imgarray!=NULL); 00889 /*copy to array*/ 00890 CUDA_SAFE_CALL(cudaMemcpy2DToArray(imgarray,0,0, 00891 cin[b].ptr,cin[b].pitch, 00892 cin[b].width*sizeof(float),cin[b].height*cin[b].depth, 00893 copydir)); 00894 CUDA_SAFE_CALL(cudaBindTextureToArray(teximg,imgarray)); 00895 uint3 gridsz = make_uint3(ceilf((float)h_outbands[b].width/BLOCK_SIZE),ceilf((float)h_outbands[b].height/BLOCK_SIZE),1); 00896 uint3 blocksz = make_uint3(BLOCK_SIZE,BLOCK_SIZE,1); 00897 kernel_s_rbf<<<gridsz,blocksz>>>(h_outbands[b].ptr, 00898 h_outbands[b].pitch,h_outbands[b].width,h_outbands[b].height, 00899 cin[b].width,cin[b].height, 00900 fdepth,fsz,num_filt,sigma); 00901 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00902 CUDA_SAFE_CALL(cudaUnbindTexture(teximg)); 00903 CUDA_SAFE_CALL(cudaFreeArray(imgarray)); 00904 } 00905 CUDA_SAFE_CALL(cudaUnbindTexture(texfilt)); 00906 for(int b=0;copy && b<in_bands;b++) 00907 { 00908 int sz = h_outbands[b].height*h_outbands[b].width*num_filt; 00909 float* ptr = new float[sz]; 00910 assert(ptr!=NULL); 00911 CUDA_SAFE_CALL(cudaMemcpy2D(ptr,h_outbands[b].width*sizeof(float), 00912 h_outbands[b].ptr,h_outbands[b].pitch, 00913 h_outbands[b].width*sizeof(float),h_outbands[b].height*h_outbands[b].depth, 00914 cudaMemcpyDeviceToHost)); 00915 CUDA_SAFE_CALL(cudaFree(h_outbands[b].ptr)); 00916 h_outbands[b].ptr =ptr; 00917 h_outbands[b].pitch =h_outbands[b].width*sizeof(float); 00918 h_outbands[b].where =ONHOST; 00919 } 00920 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00921 /*copy image to output*/ 00922 CUDA_SAFE_CALL(cudaFreeArray(filtarray)); 00923 } 00924 00925 void cpu_c_global( 00926 IN band_info* s, /*pointer to device storage*/ 00927 IN int in_bands, /*number of input bands*/ 00928 OUT float** ppc, /*pointer to DEVICE storage*/ 00929 OUT int* out_units /*=input depth*/ 00930 ) 00931 { 00932 *out_units = s[0].depth; 00933 *ppc = new float[*out_units]; 00934 assert(*ppc); 00935 00936 float* pc = *ppc; 00937 memset(pc,0,sizeof(float)*(*out_units)); 00938 00939 for(int d=0;d<s[0].depth;d++) 00940 { 00941 for(int b=0;b<in_bands;b++) 00942 { 00943 int numel = s[b].height*s[b].width; 00944 float* ptr = s[b].ptr+d*numel; 00945 const float* pmaxval= max_element(ptr,ptr+numel); 00946 pc[d] = max(*pmaxval,pc[d]); 00947 } 00948 } 00949 } 00950 00951