00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #ifndef CUDA_CONVOLUTIONS_H_DEFINED
00039 #define CUDA_CONVOLUTIONS_H_DEFINED
00040
00041 #include <cuda.h>
00042 #include "CUDA/cutil.h"
00043 #include "cudadefs.h"
00044
00045
00046
00047 __global__ void cuda_global_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h,
00048 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00049 {
00050 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00051 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00052 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00053 const int res_idx = IMUL(y_pos,src_w)+x_pos;
00054
00055
00056 float *data = (float *) shared_data;
00057 const int tile_sz = IMUL(tile_height,tile_width);
00058 const int fil_sz = IMUL(Nx,Ny);
00059
00060 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00061 {
00062 const int curld_idx = ld_idx+i;
00063 data[curld_idx] = f[curld_idx];
00064 }
00065 __syncthreads();
00066
00067 if(y_pos < src_h && x_pos < src_w)
00068 {
00069 const int Nx2 = (Nx - 1) / 2;
00070 const int Ny2 = (Ny - 1) / 2;
00071 float sum = 0.0F;
00072 float sumw = 0.0F;
00073 float im;
00074 for(int i=0;i<Ny;i++)
00075 {
00076 const int s_y=y_pos-Ny2+i;
00077 const int f_y=Ny-i-1;
00078 if(s_y >= 0 && s_y < src_h)
00079 {
00080 for(int j=0;j<Nx;j++)
00081 {
00082 const int s_x=x_pos-Nx2+j;
00083 const int f_x=Nx-j-1;
00084 const int flt_idx = IMUL(f_y,Nx)+f_x;
00085 const int src_idx = IMUL(s_y,src_w)+s_x;
00086 if(s_x >=0 && s_x < src_w)
00087 {
00088 im = src[src_idx];
00089
00090 sum = __fadd_rn(__fmul_rn(im,data[flt_idx]),sum);
00091
00092 sumw = __fadd_rn(__fmul_rn(im,im),sumw);
00093 }
00094 }
00095 }
00096 }
00097 if(sumw > 0.0F)
00098 res[res_idx] = fabs(sum)/sqrt(sumw);
00099 else
00100 res[res_idx] = sum;
00101 }
00102
00103 }
00104
00105
00106 __global__ void cuda_global_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h,
00107 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00108 {
00109 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00110 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00111 const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00112 const int res_idx = y_pos*src_w+x_pos;
00113
00114
00115 float *data = (float *) shared_data;
00116 const int tile_sz = IMUL(tile_height,tile_width);
00117 const int fil_sz = IMUL(Nx,Ny);
00118
00119 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00120 {
00121 const int curld_idx = ld_idx+i;
00122 data[curld_idx] = f[curld_idx];
00123 }
00124
00125 __syncthreads();
00126
00127 if(y_pos < src_h && x_pos < src_w)
00128 {
00129 const int Nx2 = (Nx - 1) >>1;
00130 const int Ny2 = (Ny - 1) >>1;
00131 float sum = 0.0F;
00132 for(int i=0;i<Ny;i++)
00133 {
00134 const int s_y=y_pos-Ny2+i;
00135 const int f_y=Ny-i-1;
00136 if(s_y >= 0 && s_y < src_h)
00137 {
00138 for(int j=0;j<Nx;j++)
00139 {
00140 const int s_x=x_pos-Nx2+j;
00141 const int f_x=Nx-j-1;
00142 if(s_x >=0 && s_x < src_w)
00143 {
00144 const int flt_idx = IMUL(f_y,Nx)+f_x;
00145 const int src_idx = IMUL(s_y,src_w)+s_x;
00146
00147
00148 sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum);
00149 }
00150 }
00151 }
00152 }
00153 res[res_idx] = sum;
00154 }
00155
00156 }
00157
00158 __global__ void cuda_global_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h,
00159 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00160 {
00161 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00162 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00163 const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00164 const int res_idx = y_pos*src_w+x_pos;
00165
00166
00167 float *data = (float *) shared_data;
00168 const int tile_sz = IMUL(tile_height,tile_width);
00169 const int fil_sz = IMUL(Nx,Ny);
00170
00171 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00172 {
00173 const int curld_idx = ld_idx+i;
00174 data[curld_idx] = f[curld_idx];
00175 }
00176
00177 __syncthreads();
00178
00179 if(y_pos < src_h && x_pos < src_w)
00180 {
00181 const int Nx2 = (Nx - 1) / 2;
00182 const int Ny2 = (Ny - 1) / 2;
00183 float sum = 0.0F;
00184 float sumf = 0.0F;
00185 float sumw = 0.0F;
00186 for(int i=0;i<Ny;i++)
00187 {
00188 const int s_y=y_pos-Ny2+i;
00189 const int f_y=Ny-i-1;
00190 for(int j=0;j<Nx;j++)
00191 {
00192 const int s_x=x_pos-Nx2+j;
00193 const int f_x=Nx-j-1;
00194 const int flt_idx = IMUL(f_y,Nx)+f_x;
00195 const int src_idx = IMUL(s_y,src_w)+s_x;
00196 if(s_y >= 0 && s_y < src_h && s_x >=0 && s_x < src_w)
00197 {
00198
00199 sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum);
00200 sumw += data[flt_idx];
00201 sumf += data[flt_idx];
00202 }
00203 else
00204 {
00205 sumf += data[flt_idx];
00206 }
00207 }
00208 }
00209
00210 res[res_idx] = sum*sumf/sumw;
00211 }
00212
00213 }
00214
00215
00216 __global__ void cuda_global_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00217 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00218 {
00219 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00220 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00221 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00222 const int res_idx = IMUL(y_pos,src_w)+x_pos;
00223
00224
00225 const int tile_sz = IMUL(tile_height,tile_width);
00226 const int fil_sz = IMUL(Nx,Ny);
00227
00228 const int Nx2 = (Nx - 1) >>1;
00229 const int Ny2 = (Ny - 1) >>1;
00230
00231 float *filt = (float *) shared_data;
00232 float *data = (float *) &(filt[fil_sz]);
00233 const int dw = tile_width+Nx;
00234
00235 const int shr_x = threadIdx.x+Nx2;
00236 const int shr_y = threadIdx.y+Ny2;
00237 const int shr_idx = IMUL(shr_y,dw)+shr_x;
00238
00239
00240 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00241 {
00242 const int curld_idx = ld_idx+i;
00243 filt[curld_idx] = f[curld_idx];
00244 }
00245 if(y_pos < src_h && x_pos < src_w)
00246 {
00247
00248 data[shr_idx]=src[res_idx];
00249 const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2);
00250 const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1);
00251 const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2);
00252 const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1);
00253
00254 if(topBorder)
00255 data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)];
00256
00257 if(bottomBorder)
00258 data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)];
00259
00260 if(leftBorder)
00261 data[shr_idx-Nx2] = src[res_idx-Nx2];
00262
00263 if(rightBorder)
00264 data[shr_idx+Nx2] = src[res_idx+Nx2];
00265
00266 if(topBorder && leftBorder)
00267 data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2];
00268 if(topBorder && rightBorder)
00269 data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2];
00270 if(bottomBorder && leftBorder)
00271 data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2];
00272 if(bottomBorder && rightBorder)
00273 data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2];
00274 __syncthreads();
00275
00276 float sum = 0.0F, sumw = 0.0F;
00277 for(int i=0;i<Ny;i++)
00278 {
00279 const int s_y=shr_y-Ny2+i;
00280 const int y_loc = y_pos-Ny2+i;
00281 const int f_y=Ny-i-1;
00282 if(y_loc >= 0 && y_loc < src_h)
00283 {
00284 for(int j=0;j<Nx;j++)
00285 {
00286 const int s_x=shr_x-Nx2+j;
00287 const int x_loc = x_pos-Nx2+j;
00288 const int f_x=Nx-j-1;
00289 if(x_loc >=0 && x_loc < src_w)
00290 {
00291 const int flt_idx = IMUL(f_y,Nx)+f_x;
00292 const int dat_idx = IMUL(s_y,dw)+s_x;
00293
00294 sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum);
00295
00296 sumw = __fadd_rn(__fmul_rn(data[dat_idx],data[dat_idx]),sumw);
00297 }
00298 }
00299 }
00300 }
00301 if(sumw > 0.0F)
00302 res[res_idx] = fabs(sum)/sqrt(sumw);
00303 else
00304 res[res_idx] = sum;
00305 }
00306 }
00307
00308
00309
00310 __global__ void cuda_global_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00311 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00312 {
00313 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00314 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00315 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00316 const int res_idx = IMUL(y_pos,src_w)+x_pos;
00317
00318
00319 const int tile_sz = IMUL(tile_height,tile_width);
00320 const int fil_sz = IMUL(Nx,Ny);
00321
00322 const int Nx2 = (Nx - 1) >>1;
00323 const int Ny2 = (Ny - 1) >>1;
00324
00325 float *filt = (float *) shared_data;
00326 float *data = (float *) &(filt[fil_sz]);
00327 const int dw = tile_width+Nx;
00328
00329 const int shr_x = threadIdx.x+Nx2;
00330 const int shr_y = threadIdx.y+Ny2;
00331 const int shr_idx = IMUL(shr_y,dw)+shr_x;
00332
00333
00334 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00335 {
00336 const int curld_idx = ld_idx+i;
00337 filt[curld_idx] = f[curld_idx];
00338 }
00339 if(y_pos < src_h && x_pos < src_w)
00340 {
00341
00342 data[shr_idx]=src[res_idx];
00343 const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2);
00344 const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1);
00345 const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2);
00346 const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1);
00347
00348 if(topBorder)
00349 data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)];
00350
00351 if(bottomBorder)
00352 data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)];
00353
00354 if(leftBorder)
00355 data[shr_idx-Nx2] = src[res_idx-Nx2];
00356
00357 if(rightBorder)
00358 data[shr_idx+Nx2] = src[res_idx+Nx2];
00359
00360 if(topBorder && leftBorder)
00361 data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2];
00362 if(topBorder && rightBorder)
00363 data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2];
00364 if(bottomBorder && leftBorder)
00365 data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2];
00366 if(bottomBorder && rightBorder)
00367 data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2];
00368 __syncthreads();
00369
00370 float sum = 0.0F;
00371 for(int i=0;i<Ny;i++)
00372 {
00373 const int s_y=shr_y-Ny2+i;
00374 const int y_loc = y_pos-Ny2+i;
00375 const int f_y=Ny-i-1;
00376 if(y_loc >= 0 && y_loc < src_h)
00377 {
00378 for(int j=0;j<Nx;j++)
00379 {
00380 const int s_x=shr_x-Nx2+j;
00381 const int x_loc = x_pos-Nx2+j;
00382 const int f_x=Nx-j-1;
00383 if(x_loc >=0 && x_loc < src_w)
00384 {
00385 const int flt_idx = IMUL(f_y,Nx)+f_x;
00386 const int dat_idx = IMUL(s_y,dw)+s_x;
00387
00388
00389 sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum);
00390 }
00391 }
00392 }
00393 }
00394 res[res_idx] = sum;
00395 }
00396 }
00397
00398
00399
00400
00401 __global__ void cuda_global_optConvolve(float *res, const float *src, const int src_w, const int src_h,
00402 const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height)
00403 {
00404 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00405 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00406 const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00407 const int res_idx = y_pos*src_w+x_pos;
00408
00409
00410 float *data = (float *) shared_data;
00411 const int tile_sz = IMUL(tile_height,tile_width);
00412 const int fil_sz = IMUL(fil_w,fil_h);
00413
00414 for(int i=0;i<fil_sz;i+=tile_sz)
00415 {
00416 const int curld_idx = ld_idx+IMUL(i,tile_sz);
00417 if(curld_idx < fil_sz)
00418 data[curld_idx] = f[curld_idx];
00419 }
00420 __syncthreads();
00421
00422 const int Nx2 = (fil_w - 1) / 2;
00423 const int Ny2 = (fil_h - 1) / 2;
00424
00425
00426
00427
00428
00429 const bool y_clean = y_pos >= Ny2 && y_pos < (src_h - Nx2);
00430
00431
00432
00433 const bool x_clean = x_pos >= Nx2 && x_pos < (src_w - Nx2);
00434
00435
00436
00437
00438
00439
00440 if (x_clean && y_clean)
00441 {
00442 float dst_val = 0.0f;
00443 for(int i=0;i<fil_h;i++)
00444 {
00445 const int s_y=y_pos-Ny2+i;
00446 const int f_y=fil_h-i-1;
00447 for(int j=0;j<fil_w;j++)
00448 {
00449 const int s_x=x_pos-Nx2+j;
00450 const int f_x=fil_w-j-1;
00451 const int flt_idx = IMUL(f_y,fil_w)+f_x;
00452 const int src_idx = IMUL(s_y,src_w)+s_x;
00453 dst_val += src[src_idx]*data[flt_idx];
00454 }
00455 }
00456 res[res_idx] = dst_val;
00457 }
00458 else if(x_pos < src_w && y_pos < src_h)
00459 {
00460
00461
00462
00463
00464
00465
00466 float dst_val = 0.0f;
00467 float src_sum = 0.0f;
00468 int src_cnt = 0;
00469 float fil_sum_skipped = 0.0f;
00470
00471 for(int i=0;i<fil_h;i++)
00472 {
00473 const int s_y=y_pos-Ny2+i;
00474 const int f_y=fil_h-i-1;
00475 if(s_y >= 0 && s_y < src_h)
00476 {
00477 for(int j=0;j<fil_w;j++)
00478 {
00479 const int s_x=x_pos-Nx2+j;
00480 const int f_x=fil_w-j-1;
00481 const int flt_idx = IMUL(f_y,fil_w)+f_x;
00482 const int src_idx = IMUL(s_y,src_w)+s_x;
00483 if(s_x >=0 && s_x < src_w)
00484 {
00485 float src_val = src[src_idx];
00486 dst_val += src_val*data[flt_idx];
00487 src_sum += src_val;
00488 src_cnt++;
00489 }
00490 else
00491 {
00492 fil_sum_skipped += data[flt_idx];
00493 }
00494 }
00495 }
00496 else
00497 {
00498 for (int f_x = fil_w-1; f_x >= 0; --f_x)
00499 fil_sum_skipped += data[IMUL(f_y,fil_w)+f_x];
00500 }
00501 }
00502 const float src_avg = src_sum / src_cnt;
00503 res[res_idx] = dst_val + (fil_sum_skipped*src_avg);
00504
00505
00506 }
00507
00508 }
00509
00510
00511
00512 __global__ void cuda_global_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00513 {
00514 const int hfs2 = (hfs-1) / 2;
00515 const int y_pos = blockIdx.y;
00516 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00517 const int ld_idx = threadIdx.x;
00518 const int res_idx = y_pos*src_w+x_pos;
00519
00520
00521 float *data = (float *) shared_data;
00522 float sum = 0;
00523 const int numIters = IDIVUP(hfs,shr_len);
00524
00525 for(int iter=0;iter<numIters;iter++)
00526 {
00527 const int filt_beg = IMUL(iter,shr_len);
00528 const int filt_end = filt_beg+shr_len;
00529 const int load_max = min(filt_end,hfs)-filt_beg;
00530 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00531 {
00532 const int filt_idx = data_idx + filt_beg;
00533 data[data_idx] = f[filt_idx];
00534 }
00535 __syncthreads();
00536
00537 if(y_pos < src_h && x_pos < src_w)
00538 {
00539
00540 for(int j=0;j<load_max;j++)
00541 {
00542 const int s_x=x_pos+hfs2-(filt_beg+j);
00543 const int f_x=j;
00544 if(s_x >=0 && s_x < src_w)
00545 {
00546 const int src_idx = IMUL(y_pos,src_w)+s_x;
00547 sum += src[src_idx]*data[f_x];
00548 }
00549 }
00550
00551 }
00552
00553 if(numIters > 1)
00554 __syncthreads();
00555
00556 }
00557 if(y_pos < src_h && x_pos < src_w)
00558 {
00559 res[res_idx] = sum;
00560 }
00561
00562 }
00563
00564 __global__ void cuda_global_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00565 {
00566 const int hfs2 = (hfs - 1) / 2;
00567 const int y_pos = blockIdx.y;
00568 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00569 const int ld_idx = threadIdx.x;
00570 const int res_idx = y_pos*src_w+x_pos;
00571
00572
00573 float *data = (float *) shared_data;
00574 float sum = 0, sumw = 0, sumf = 0;
00575 const int numIters = IDIVUP(hfs,shr_len);
00576
00577 for(int iter=0;iter<numIters;iter++)
00578 {
00579 const int filt_beg = IMUL(iter,shr_len);
00580 const int filt_end = filt_beg+shr_len;
00581 const int load_max = min(filt_end,hfs)-filt_beg;
00582 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00583 {
00584 const int filt_idx = data_idx + filt_beg;
00585 data[data_idx] = f[filt_idx];
00586 }
00587 __syncthreads();
00588
00589 if(y_pos < src_h && x_pos < src_w)
00590 {
00591
00592 for(int j=0;j<load_max;j++)
00593 {
00594 const int s_x=x_pos+hfs2-(filt_beg+j);
00595 const int f_x=j;
00596 if(s_x >=0 && s_x < src_w)
00597 {
00598 const int src_idx = IMUL(y_pos,src_w)+s_x;
00599 sum += src[src_idx]*data[f_x];
00600 sumw += data[f_x];
00601 }
00602 else
00603 {
00604 sumf += data[f_x];
00605 }
00606 }
00607 }
00608
00609 if(numIters > 1)
00610 __syncthreads();
00611
00612 }
00613 if(y_pos < src_h && x_pos < src_w)
00614 {
00615 sumf+=sumw;
00616 res[res_idx] = sum*sumf/sumw;
00617 }
00618
00619 }
00620
00621 __global__ void cuda_global_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00622 {
00623 const int hfs2 = (hfs - 1) / 2;
00624 const int y_pos = blockIdx.y;
00625 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00626 const int ld_idx = threadIdx.x;
00627 const int res_idx = y_pos*src_w+x_pos;
00628
00629
00630 float *data = (float *) shared_data;
00631 float sum = 0;
00632 const int numIters = IDIVUP(hfs,shr_len);
00633
00634 for(int iter=0;iter<numIters;iter++)
00635 {
00636 const int filt_beg = IMUL(iter,shr_len);
00637 const int filt_end = filt_beg+shr_len;
00638 const int load_max = min(filt_end,hfs)-filt_beg;
00639 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00640 {
00641 const int filt_idx = data_idx + filt_beg;
00642 data[data_idx] = f[filt_idx];
00643 }
00644 __syncthreads();
00645
00646 if(y_pos < src_h && x_pos < src_w)
00647 {
00648
00649 for(int j=0;j<load_max;j++)
00650 {
00651 const int s_x=x_pos+hfs2-(filt_beg+j);
00652 const int f_x=j;
00653 if(s_x < 0)
00654 {
00655 sum += src[IMUL(y_pos,src_w)]*data[f_x];
00656 }
00657 else if(s_x >= src_w)
00658 {
00659 sum += src[IMUL(y_pos+1,src_w)-1]*data[f_x];
00660 }
00661 else
00662 {
00663 const int src_idx = IMUL(y_pos,src_w)+s_x;
00664 sum += src[src_idx]*data[f_x];
00665 }
00666 }
00667 }
00668
00669 if(numIters > 1)
00670 __syncthreads();
00671
00672 }
00673 if(y_pos < src_h && x_pos < src_w)
00674 {
00675 res[res_idx] = sum;
00676 }
00677
00678
00679 }
00680
00681
00682 __global__ void cuda_global_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00683 {
00684 const int hfs2 = (hfs - 1) / 2;
00685 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00686 const int x_pos = blockIdx.x;
00687 const int ld_idx = threadIdx.y;
00688 const int res_idx = y_pos*src_w+x_pos;
00689
00690
00691 float *data = (float *) shared_data;
00692 float sum = 0;
00693 const int numIters = IDIVUP(hfs,shr_len);
00694
00695 for(int iter=0;iter<numIters;iter++)
00696 {
00697 const int filt_beg = IMUL((iter),shr_len);
00698 const int filt_end = filt_beg+shr_len;
00699 const int load_max = min(filt_end,hfs)-filt_beg;
00700 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00701 {
00702 const int filt_idx = data_idx + filt_beg;
00703 data[data_idx] = f[filt_idx];
00704 }
00705 __syncthreads();
00706
00707 if(y_pos < src_h && x_pos < src_w)
00708 {
00709
00710 for(int j=0;j<load_max;j++)
00711 {
00712 const int s_y=y_pos+hfs2-(filt_beg+j);
00713 const int f_y=j;
00714 if(s_y >=0 && s_y < src_h)
00715 {
00716 const int src_idx = IMUL(s_y,src_w)+x_pos;
00717 sum += src[src_idx]*data[f_y];
00718 }
00719 }
00720 }
00721
00722 if(numIters > 1)
00723 __syncthreads();
00724
00725 }
00726 if(y_pos < src_h && x_pos < src_w)
00727 {
00728 res[res_idx] = sum;
00729 }
00730
00731 }
00732
00733 __global__ void cuda_global_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00734 {
00735 const int hfs2 = (hfs - 1) / 2;
00736 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00737 const int x_pos = blockIdx.x;
00738 const int ld_idx = threadIdx.y;
00739 const int res_idx = y_pos*src_w+x_pos;
00740
00741
00742 float *data = (float *) shared_data;
00743 float sum = 0, sumw = 0, sumf = 0;
00744 const int numIters = IDIVUP(hfs,shr_len);
00745
00746 for(int iter=0;iter<numIters;iter++)
00747 {
00748 const int filt_beg = IMUL((iter),shr_len);
00749 const int filt_end = filt_beg+shr_len;
00750 const int load_max = min(filt_end,hfs)-filt_beg;
00751 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00752 {
00753 const int filt_idx = data_idx + filt_beg;
00754 data[data_idx] = f[filt_idx];
00755 }
00756 __syncthreads();
00757
00758 if(y_pos < src_h && x_pos < src_w)
00759 {
00760
00761 for(int j=0;j<load_max;j++)
00762 {
00763 const int s_y=y_pos+hfs2-(filt_beg+j);
00764 const int f_y=j;
00765 if(s_y >=0 && s_y < src_h)
00766 {
00767 const int src_idx = IMUL(s_y,src_w)+x_pos;
00768 sum += src[src_idx]*data[f_y];
00769 sumw += data[f_y];
00770 }
00771 else
00772 {
00773 sumf += data[f_y];
00774 }
00775 }
00776 }
00777
00778 if(numIters > 1)
00779 __syncthreads();
00780 }
00781 if(y_pos < src_h && x_pos < src_w)
00782 {
00783 sumf+=sumw;
00784 res[res_idx] = sum*sumf/sumw;
00785 }
00786
00787 }
00788
00789 __global__ void cuda_global_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00790 {
00791 const int hfs2 = (hfs - 1) / 2;
00792 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00793 const int x_pos = blockIdx.x;
00794 const int ld_idx = threadIdx.y;
00795 const int res_idx = y_pos*src_w+x_pos;
00796
00797
00798
00799 float *data = (float *) shared_data;
00800 float sum = 0;
00801 const int numIters = IDIVUP(hfs,shr_len);
00802
00803 for(int iter=0;iter<numIters;iter++)
00804 {
00805 const int filt_beg = IMUL((iter),shr_len);
00806 const int filt_end = filt_beg+shr_len;
00807 const int load_max = min(filt_end,hfs)-filt_beg;
00808 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00809 {
00810 const int filt_idx = data_idx + filt_beg;
00811 data[data_idx] = f[filt_idx];
00812 }
00813 __syncthreads();
00814
00815 if(y_pos < src_h && x_pos < src_w)
00816 {
00817 for(int j=0;j<load_max;j++)
00818 {
00819 const int s_y=y_pos+hfs2-(filt_beg+j);
00820 const int f_y=j;
00821 if(s_y < 0)
00822 {
00823 sum += src[x_pos]*data[f_y];
00824 }
00825 else if(s_y >= src_h)
00826 {
00827 sum += src[IMUL(src_h-1,src_w)+x_pos]*data[f_y];
00828 }
00829 else
00830 {
00831 const int src_idx = IMUL(s_y,src_w)+x_pos;
00832 sum += src[src_idx]*data[f_y];
00833 }
00834 }
00835 }
00836
00837 if(numIters > 1)
00838 __syncthreads();
00839 }
00840 if(y_pos < src_h && x_pos < src_w)
00841 {
00842 res[res_idx] = sum;
00843 }
00844
00845
00846 }
00847
00848
00849 __global__ void cuda_global_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00850 {
00851 const int hfs2 = (hfs-1) / 2;
00852 const int y_pos = blockIdx.y;
00853 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00854 const int data_idx = threadIdx.x;
00855 const int res_idx = y_pos*src_w+x_pos;
00856 const int ctr_off = hfs2;
00857
00858
00859 float *filt = (float *) shared_data;
00860 float *data = (float *) &(shared_data[hfs]);
00861
00862 float sum = 0;
00863
00864 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00865 {
00866 filt[filt_idx] = f[filt_idx];
00867 }
00868 data[data_idx+ctr_off] = src[res_idx];
00869
00870 if(data_idx < ctr_off && x_pos > ctr_off)
00871 {
00872 data[data_idx] = src[res_idx-ctr_off];
00873 }
00874
00875 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00876 {
00877 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00878 }
00879 __syncthreads();
00880
00881
00882 if(y_pos < src_h && x_pos < src_w)
00883 {
00884
00885 for(int j=0;j<hfs;j++)
00886 {
00887 const int s_x=data_idx+ctr_off+hfs2-j;
00888 const int x_loc=x_pos+hfs2-j;
00889 const int f_x=j;
00890 if(x_loc >=0 && x_loc < src_w)
00891 {
00892 sum += data[s_x]*filt[f_x];
00893 }
00894 }
00895
00896 }
00897 if(y_pos < src_h && x_pos < src_w)
00898 {
00899 res[res_idx] = sum;
00900 }
00901
00902 }
00903
00904
00905 __global__ void cuda_global_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00906 {
00907 const int hfs2 = (hfs - 1) / 2;
00908 const int y_pos = blockIdx.y;
00909 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00910 const int data_idx = threadIdx.x;
00911 const int res_idx = y_pos*src_w+x_pos;
00912 const int ctr_off = hfs2;
00913
00914
00915 float *filt = (float *) shared_data;
00916 float *data = (float *) &(shared_data[hfs]);
00917
00918 float sum = 0, sumw = 0, sumf = 0;
00919
00920 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00921 {
00922 filt[filt_idx] = f[filt_idx];
00923 }
00924 if(y_pos < src_h && x_pos < src_w)
00925 {
00926
00927 data[data_idx+ctr_off] = src[res_idx];
00928
00929 if(data_idx < ctr_off && x_pos > ctr_off)
00930 {
00931 data[data_idx] = src[res_idx-ctr_off];
00932 }
00933
00934 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00935 {
00936 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00937 }
00938 __syncthreads();
00939
00940
00941 for(int j=0;j<hfs;j++)
00942 {
00943 const int s_x=data_idx+ctr_off+hfs2-j;
00944 const int x_loc=x_pos+hfs2-j;
00945 const int f_x=j;
00946 if(x_loc >=0 && x_loc < src_w)
00947 {
00948 sum += data[s_x]*filt[f_x];
00949 sumw += filt[f_x];
00950 }
00951 else
00952 {
00953 sumf += filt[f_x];
00954 }
00955 }
00956 sumf+=sumw;
00957 res[res_idx] = sum*sumf/sumw;
00958 }
00959
00960 }
00961
00962
00963 __global__ void cuda_global_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00964 {
00965 const int hfs2 = (hfs - 1) / 2;
00966 const int y_pos = blockIdx.y;
00967 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00968 const int data_idx = threadIdx.x;
00969 const int res_idx = y_pos*src_w+x_pos;
00970 const int ctr_off = hfs2;
00971
00972
00973 float *filt = (float *) shared_data;
00974 float *data = (float *) &(shared_data[hfs]);
00975
00976 float sum = 0;
00977
00978 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00979 {
00980 filt[filt_idx] = f[filt_idx];
00981 }
00982 if(y_pos < src_h && x_pos < src_w)
00983 {
00984
00985 data[data_idx+ctr_off] = src[res_idx];
00986
00987 if(data_idx < ctr_off && x_pos > ctr_off)
00988 {
00989 data[data_idx] = src[res_idx-ctr_off];
00990 }
00991
00992 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00993 {
00994 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00995 }
00996 __syncthreads();
00997
00998 for(int j=0;j<hfs;j++)
00999 {
01000 const int x_loc=x_pos+hfs2-j;
01001 const int f_x=j;
01002 if(x_loc < 0)
01003 {
01004
01005 sum += data[ctr_off]*filt[f_x];
01006 }
01007 else if(x_loc >= src_w)
01008 {
01009
01010 const int max_x = src_w-IMUL(blockIdx.x, tile_len)+ctr_off-1;
01011 sum += data[max_x]*filt[f_x];
01012 }
01013 else
01014 {
01015 const int s_x=data_idx+ctr_off+hfs2-j;
01016 sum += data[s_x]*filt[f_x];
01017 }
01018 }
01019
01020 res[res_idx] = sum;
01021 }
01022
01023 }
01024
01025
01026 __global__ void cuda_global_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01027 {
01028 const int hfs2 = (hfs-1) / 2;
01029 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
01030 const int x_pos = blockIdx.x;
01031 const int data_idx = threadIdx.y;
01032 const int res_idx = y_pos*src_w+x_pos;
01033 const int ctr_off = hfs2;
01034
01035
01036 float *filt = (float *) shared_data;
01037 float *data = (float *) &(shared_data[hfs]);
01038
01039 float sum = 0;
01040
01041 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01042 {
01043 filt[filt_idx] = f[filt_idx];
01044 }
01045 if(y_pos < src_h && x_pos < src_w)
01046 {
01047 data[data_idx+ctr_off] = src[res_idx];
01048
01049 if(data_idx < ctr_off && y_pos > ctr_off)
01050 {
01051 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01052 }
01053
01054 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01055 {
01056 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01057 }
01058 __syncthreads();
01059
01060
01061 for(int j=0;j<hfs;j++)
01062 {
01063 const int s_y=data_idx+ctr_off+hfs2-j;
01064 const int y_loc=y_pos+hfs2-j;
01065 const int f_y=j;
01066 if(y_loc >=0 && y_loc < src_h)
01067 {
01068 sum += data[s_y]*filt[f_y];
01069 }
01070 }
01071
01072 res[res_idx] = sum;
01073 }
01074
01075 }
01076
01077
01078 __global__ void cuda_global_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01079 {
01080 const int hfs2 = (hfs - 1) / 2;
01081 const int y_pos = IMUL(blockIdx.y,tile_len) + threadIdx.y;
01082 const int x_pos = blockIdx.x;
01083 const int data_idx = threadIdx.y;
01084 const int res_idx = y_pos*src_w+x_pos;
01085 const int ctr_off = hfs2;
01086
01087
01088 float *filt = (float *) shared_data;
01089 float *data = (float *) &(shared_data[hfs]);
01090
01091 float sum = 0, sumw = 0, sumf = 0;
01092
01093
01094 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01095 {
01096 filt[filt_idx] = f[filt_idx];
01097 }
01098 if(y_pos < src_h && x_pos < src_w)
01099 {
01100
01101 data[data_idx+ctr_off] = src[res_idx];
01102
01103 if(data_idx < ctr_off && y_pos > ctr_off)
01104 {
01105 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01106 }
01107
01108 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01109 {
01110 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01111 }
01112 __syncthreads();
01113
01114 for(int j=0;j<hfs;j++)
01115 {
01116 const int s_y=data_idx+ctr_off+hfs2-j;
01117 const int y_loc=y_pos+hfs2-j;
01118 const int f_y=j;
01119 if(y_loc >=0 && y_loc < src_h)
01120 {
01121 sum += data[s_y]*filt[f_y];
01122 sumw += filt[f_y];
01123 }
01124 else
01125 {
01126 sumf += filt[f_y];
01127 }
01128 }
01129 }
01130 if(y_pos < src_h && x_pos < src_w)
01131 {
01132 sumf+=sumw;
01133 res[res_idx] = sum*sumf/sumw;
01134 }
01135
01136 }
01137
01138 __global__ void cuda_global_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01139 {
01140 const int hfs2 = (hfs - 1) / 2;
01141 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
01142 const int x_pos = blockIdx.x;
01143 const int data_idx = threadIdx.y;
01144 const int res_idx = y_pos*src_w+x_pos;
01145 const int ctr_off = hfs2;
01146
01147
01148 float *filt = (float *) shared_data;
01149 float *data = (float *) &(shared_data[hfs]);
01150
01151 float sum = 0;
01152
01153 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01154 {
01155 filt[filt_idx] = f[filt_idx];
01156 }
01157 if(y_pos < src_h && x_pos < src_w)
01158 {
01159 data[data_idx+ctr_off] = src[res_idx];
01160
01161 if(data_idx < ctr_off && y_pos > ctr_off)
01162 {
01163 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01164 }
01165
01166 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01167 {
01168 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01169 }
01170 __syncthreads();
01171
01172 for(int j=0;j<hfs;j++)
01173 {
01174 const int y_loc=y_pos+hfs2-j;
01175 const int f_y=j;
01176 if(y_loc < 0)
01177 {
01178
01179 sum += data[ctr_off]*filt[f_y];
01180 }
01181 else if(y_loc >= src_h)
01182 {
01183
01184 const int max_y = src_h-IMUL(blockIdx.y, tile_len)+ctr_off-1;
01185 sum += data[max_y]*filt[f_y];
01186 }
01187 else
01188 {
01189 const int s_y=data_idx+ctr_off+hfs2-j;
01190 sum += data[s_y]*filt[f_y];
01191 }
01192 }
01193 }
01194 if(y_pos < src_h && x_pos < src_w)
01195 {
01196 res[res_idx] = sum;
01197 }
01198
01199 }
01200
01201
01202 #endif