00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #ifndef CUDA_LOWPASS_H_DEFINED
00039 #define CUDA_LOWPASS_H_DEFINED
00040
00041 #include <cuda.h>
00042 #include "CUDA/cutil.h"
00043 #include "cudadefs.h"
00044
00045
00046
00047
00048 __global__ void cuda_global_lowpass_5_x_dec_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00049 {
00050
00051
00052
00053
00054
00055
00056 float *data = (float *) shared_data;
00057 float *border = (float *) &data[tile_width];
00058
00059 const int sx = threadIdx.x;
00060 const int dx = (sx >> 1);
00061 const int sts = IMUL(blockIdx.x, tile_width);
00062 const int dts = (sts >>1);
00063 const int srs = IMUL(blockIdx.y, w);
00064 const int drs = IMUL(blockIdx.y, (w >> 1));
00065
00066
00067 const int loadIdx = sts + sx;
00068 if (loadIdx < w) data[sx] = src[srs + loadIdx];
00069
00070
00071 if (sx < 2 && sts > 0)
00072 border[sx] = src[srs + loadIdx - 2];
00073
00074 else if(sx >= tile_width-2 && sts+tile_width < w-2)
00075 border[4+sx-tile_width] = src[srs + sts + sx + 2];
00076
00077 const int ww = (w & 0xfffe);
00078
00079
00080
00081 __syncthreads();
00082
00083
00084 if ( (sx & 1) == 0 && loadIdx < ww) {
00085 const int writeIdx = dts + dx;
00086 const float *dptr = data + sx;
00087
00088
00089
00090
00091 if(w < 5)
00092 {
00093 int numAhead = max(0,min(w-1-(sts+sx),2));
00094 int numBehind = max(0,min(sts+sx,2));
00095 int situation = numBehind*10+numAhead;
00096 switch(situation)
00097 {
00098 case 0:
00099 dst[drs + writeIdx] = dptr[0];
00100 break;
00101 case 1:
00102 dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
00103 dptr[1] * (4.0F / 10.0F));
00104 break;
00105 case 2:
00106 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00107 dptr[1] * (4.0F / 11.0F) +
00108 dptr[2] * (1.0F / 11.0F));
00109 break;
00110 case 10:
00111 dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
00112 dptr[-1] * (4.0F / 10.0F));
00113 break;
00114 case 11:
00115 dst[drs + writeIdx] = (dptr[0] * (6.0F / 14.0F) +
00116 (dptr[-1] + dptr[1]) * (4.0F / 14.0F));
00117 break;
00118 case 12:
00119 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
00120 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00121 dptr[2] * (1.0F / 15.0F));
00122 break;
00123 case 20:
00124 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00125 dptr[-1] * (4.0F / 11.0F) +
00126 dptr[-2] * (1.0F / 11.0F));
00127 break;
00128 case 21:
00129 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
00130 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00131 dptr[-2] * (1.0F / 15.0F));
00132 break;
00133 }
00134 }
00135
00136 else if(sts+sx < 2)
00137 {
00138 switch(sx)
00139 {
00140 case 0:
00141 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00142 dptr[1] * (4.0F / 11.0F) +
00143 dptr[2] * (1.0F / 11.0F));
00144 break;
00145 case 1:
00146 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
00147 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00148 dptr[2] * (1.0F / 15.0F));
00149 break;
00150 }
00151 }
00152
00153 else if(sx < 2 && sts+sx < w-2)
00154 {
00155 switch(sx)
00156 {
00157 case 0:
00158 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
00159 (border[1] + dptr[1]) * (4.0F / 16.0F) +
00160 (border[0] + dptr[2]) * (1.0F / 16.0F));
00161 break;
00162 case 1:
00163 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
00164 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
00165 (border[1] + dptr[2]) * (1.0F / 16.0F));
00166 break;
00167 }
00168 }
00169
00170 else if(sx < tile_width-2 && sts+sx < w-2)
00171 {
00172 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
00173 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
00174 (dptr[-2] + dptr[2]) * (1.0F / 16.0F));
00175 }
00176
00177 else if(sts+sx < w-2)
00178 {
00179 switch(tile_width-sx)
00180 {
00181 case 2:
00182 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
00183 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
00184 (dptr[-2] + border[2]) * (1.0F / 16.0F));
00185 break;
00186 case 1:
00187 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
00188 (dptr[-1] + border[2]) * (4.0F / 16.0F) +
00189 (dptr[-2] + border[3]) * (1.0F / 16.0F));
00190 break;
00191 }
00192 }
00193
00194 else
00195 {
00196 switch(w-(sts+sx))
00197 {
00198 case 2:
00199 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
00200 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00201 (dptr[-2]) * (1.0F / 15.0F));
00202 break;
00203 case 1:
00204 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00205 (dptr[-1]) * (4.0F / 11.0F) +
00206 (dptr[-2]) * (1.0F / 11.0F));
00207 break;
00208 }
00209 }
00210 }
00211 }
00212
00213
00214
00215
00216 __global__ void cuda_global_lowpass_5_y_dec_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00217 {
00218
00219
00220
00221
00222
00223
00224
00225 float *data = (float *) shared_data;
00226 float *border = (float *) &data[tile_width*tile_height];
00227
00228 const int sy = threadIdx.y;
00229 const int dy = (sy >> 1);
00230
00231 const int sts = IMUL(blockIdx.y, tile_height);
00232 const int ste = sts + tile_height;
00233
00234 const int dts = (sts >> 1);
00235 const int dte = (ste >> 1);
00236
00237
00238 const int stec = min(ste, h);
00239 const int dtec = min(dte, (h >> 1));
00240
00241
00242 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00243 const int dcs = scs;
00244
00245
00246 if (scs < w) {
00247
00248 int smemPos = IMUL(sy, tile_width) + threadIdx.x;
00249 int gmemPos = IMUL(sts + sy, w) + scs;
00250
00251
00252 data[smemPos] = src[gmemPos];
00253
00254
00255 if (sy < 2 && gmemPos > IMUL(2,w))
00256 border[smemPos] = src[gmemPos-IMUL(2,w)];
00257
00258 int bordOff = 4+sy-tile_height;
00259
00260 if (sy >= tile_height-2 && ste+2 < h)
00261 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)];
00262
00263
00264
00265 __syncthreads();
00266
00267
00268 if ((sy & 1) == 0) {
00269
00270 smemPos = IMUL(sy, tile_width) + threadIdx.x;
00271 gmemPos = IMUL(dts + dy, w) + dcs;
00272
00273
00274
00275 float *dptr = data + smemPos;
00276 const int sw = tile_width, sw2 = sw + sw;
00277 const int nsw = -sw, nsw2 = nsw - sw;
00278 const int bn2 = threadIdx.x, bn1 = bn2 + tile_width;
00279 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width;
00280
00281
00282
00283
00284
00285 if(h < 5)
00286 {
00287 int numAhead = max(0,min(h-1-(sts+sy),2));
00288 int numBehind = max(0,min(sts+sy,2));
00289 int situation = numBehind*10+numAhead;
00290 switch(situation)
00291 {
00292 case 0:
00293 dst[gmemPos] = dptr[0];
00294 break;
00295 case 1:
00296 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
00297 dptr[sw] * (4.0F / 10.0F));
00298 break;
00299 case 2:
00300 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
00301 dptr[sw] * (4.0F / 11.0F) +
00302 dptr[sw2] * (1.0F / 11.0F));
00303 break;
00304 case 10:
00305 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
00306 dptr[nsw] * (4.0F / 10.0F));
00307 break;
00308 case 11:
00309 dst[gmemPos] = (dptr[0] * (6.0F / 14.0F) +
00310 (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F));
00311 break;
00312 case 12:
00313 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) +
00314 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00315 dptr[sw2] * (1.0F / 15.0F));
00316 break;
00317 case 20:
00318 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
00319 dptr[nsw] * (4.0F / 11.0F) +
00320 dptr[nsw2] * (1.0F / 11.0F));
00321 break;
00322 case 21:
00323 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) +
00324 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00325 dptr[nsw2] * (1.0F / 15.0F));
00326 break;
00327 }
00328 }
00329
00330 else if(sts + sy < 2)
00331 {
00332 switch(sts+sy)
00333 {
00334 case 0:
00335 dst[gmemPos] =
00336 (dptr[0] * (6.0F / 11.0F) +
00337 dptr[sw] * (4.0F / 11.0F) +
00338 dptr[sw2] * (1.0F / 11.0F)
00339 );
00340 break;
00341 case 1:
00342 dst[gmemPos] =
00343 (dptr[0] * (6.0F / 15.0F) +
00344 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00345 dptr[sw2] * (1.0F / 15.0F)
00346 );
00347 break;
00348 }
00349 }
00350 else if(sy < 2 && sts+sy<h-2)
00351 {
00352 switch(sy)
00353 {
00354 case 0:
00355 dst[gmemPos] =
00356 (dptr[0] * (6.0F / 16.0F) +
00357 (border[bn1] + dptr[sw]) * (4.0F / 16.0F) +
00358 (border[bn2] + dptr[sw2]) * (1.0F / 16.0F)
00359 );
00360 break;
00361 case 1:
00362 dst[gmemPos] =
00363 (dptr[0] * (6.0F / 16.0F) +
00364 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
00365 (border[bn1] + dptr[sw2]) * (1.0F / 16.0F)
00366 );
00367 break;
00368 }
00369 }
00370 else if(sy <tile_height-2 && sts+sy<h-2)
00371 {
00372 dst[gmemPos] =
00373 ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) +
00374 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
00375 dptr[0] * (6.0F / 16.0F)
00376 );
00377 }
00378 else if(sts + sy < h-2)
00379 {
00380 switch(tile_height-sy)
00381 {
00382 case 2:
00383 dst[gmemPos] =
00384 (dptr[0] * (6.0F / 16.0F) +
00385 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
00386 (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F)
00387 );
00388 break;
00389 case 1:
00390 dst[gmemPos] =
00391 (dptr[0] * (6.0F / 16.0F) +
00392 (dptr[nsw] + border[bp1]) * (4.0F / 16.0F) +
00393 (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F)
00394 );
00395 break;
00396 }
00397 }
00398 else
00399 {
00400 switch(h-(sts+sy))
00401 {
00402 case 2:
00403 dst[gmemPos] =
00404 (dptr[0] * (6.0F / 15.0F) +
00405 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00406 dptr[nsw2] * (1.0F / 15.0F)
00407 );
00408 break;
00409 case 1:
00410 dst[gmemPos] =
00411 (dptr[0] * (6.0F / 11.0F) +
00412 dptr[nsw] * (4.0F / 11.0F) +
00413 dptr[nsw2] * (1.0F / 11.0F)
00414 );
00415 break;
00416 }
00417 }
00418
00419 }
00420 }
00421 }
00422
00423
00424 __global__ void cuda_global_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width)
00425 {
00426
00427 const int src_tile_width = tile_width<<1;
00428
00429 float *data = (float *) shared_data;
00430
00431 float *border = (float *) &data[src_tile_width];
00432 const int dx = threadIdx.x;
00433 const int dts = IMUL(blockIdx.x, tile_width);
00434 const int drs = IMUL(blockIdx.y,dw);
00435 const int sx = dx<<1;
00436 const int sts = (dts <<1);
00437 const int srs = IMUL(blockIdx.y, w);
00438
00439 const int loadIdx = sts + sx;
00440 const int writeIdx = dts + dx;
00441 const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3;
00442 const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7;
00443 float *dptr = &data[sx];
00444
00445
00446 if (dx < 4 && sts > 0) border[dx] = src[srs + sts - (4-dx)];
00447 if (dx >= tile_width-4 && sts+src_tile_width < w-4) border[4+dx-(tile_width-4)] = src[srs + sts + tile_width + dx + 4];
00448
00449
00450 if (loadIdx < w)
00451 {
00452 data[sx] = src[srs + loadIdx];
00453 if(sx+1 < src_tile_width && loadIdx+1 < w)
00454 data[sx+1] = src[srs + loadIdx + 1];
00455 }
00456
00457
00458
00459 __syncthreads();
00460
00461
00462
00463
00464
00465
00466 if(writeIdx < dw && blockIdx.y < dh)
00467 {
00468
00469
00470
00471
00472 if(w < 9)
00473 {
00474
00475 const int numAhead = max(0,min(w-1-(sts+sx),4));
00476 const int numBehind = max(0,min(sts+sx,4));
00477 const int situation = numBehind*10+numAhead;
00478
00479 switch(situation)
00480 {
00481 case 0:
00482 dst[drs+writeIdx] = dptr[0];
00483 break;
00484 case 2:
00485 dst[drs+writeIdx] = (dptr[0] * (70.0F / 154.0F) +
00486 dptr[1] * (56.0F / 154.0F) +
00487 dptr[2] * (28.0F / 154.0F));
00488 break;
00489 case 4:
00490 dst[drs+writeIdx] = (dptr[0] * (70.0F / 163.0F) +
00491 dptr[1] * (56.0F / 163.0F) +
00492 dptr[2] * (28.0F / 163.0F) +
00493 dptr[3] * ( 8.0F / 163.0F) +
00494 dptr[4] * ( 1.0F / 163.0F));
00495 break;
00496 case 10:
00497 dst[drs+writeIdx] = (dptr[0] * (70.0F / 126.0F) +
00498 dptr[-1] * (56.0F / 126.0F));
00499 break;
00500 case 12:
00501 dst[drs+writeIdx] = (dptr[0] * (70.0F / 210.0F) +
00502 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) +
00503 dptr[2] * (28.0F / 210.0F));
00504 break;
00505 case 14:
00506 dst[drs+writeIdx] = (dptr[0] * (70.0F / 219.0F) +
00507 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
00508 dptr[2] * (28.0F / 219.0F) +
00509 dptr[3] * ( 8.0F / 219.0F) +
00510 dptr[4] * ( 1.0F / 219.0F));
00511 break;
00512 case 20:
00513 dst[drs+writeIdx] = (dptr[0] * (70.0F / 154.0F) +
00514 dptr[-1] * (56.0F / 154.0F) +
00515 dptr[-2] * (28.0F / 154.0F));
00516 break;
00517 case 22:
00518 dst[drs+writeIdx] = (dptr[0] * (70.0F / 238.0F) +
00519 (dptr[-1] + dptr[1]) * (56.0F / 238.0F) +
00520 (dptr[-2] + dptr[2]) * (28.0F / 238.0F));
00521 break;
00522 case 24:
00523 dst[drs+writeIdx] = (dptr[0] * (70.0F / 247.0F) +
00524 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00525 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00526 dptr[3] * ( 8.0F / 247.0F) +
00527 dptr[4] * ( 1.0F / 247.0F));
00528 break;
00529 case 30:
00530 dst[drs+writeIdx] = (dptr[0] * (70.0F / 162.0F) +
00531 dptr[-1] * (56.0F / 162.0F) +
00532 dptr[-2] * (28.0F / 162.0F) +
00533 dptr[-3] * ( 8.0F / 162.0F));
00534 break;
00535 case 32:
00536 dst[drs+writeIdx] = (dptr[0] * (70.0F / 246.0F) +
00537 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
00538 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
00539 dptr[-3] * ( 8.0F / 246.0F));
00540 break;
00541 case 34:
00542 dst[drs+writeIdx] = (dptr[0] * (70.0F / 255.0F) +
00543 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00544 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00545 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00546 dptr[4] * ( 1.0F / 255.0F));
00547 break;
00548 case 40:
00549 dst[drs+writeIdx] = (dptr[0] * (70.0F / 163.0F) +
00550 dptr[-1] * (56.0F / 163.0F) +
00551 dptr[-2] * (28.0F / 163.0F) +
00552 dptr[-3] * ( 8.0F / 163.0F) +
00553 dptr[-4] * ( 1.0F / 163.0F));
00554 break;
00555 case 42:
00556 dst[drs+writeIdx] = (dptr[0] * (70.0F / 247.0F) +
00557 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00558 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00559 dptr[-3] * ( 8.0F / 247.0F) +
00560 dptr[-4] * ( 1.0F / 247.0F));
00561 break;
00562 }
00563 }
00564
00565 else if(sts+sx < 4)
00566 {
00567 switch(sx)
00568 {
00569 case 0:
00570 dst[drs + writeIdx] =
00571 (dptr[0] * (70.0F / 163.0F) +
00572 dptr[1] * (56.0F / 163.0F) +
00573 dptr[2] * (28.0F / 163.0F) +
00574 dptr[3] * ( 8.0F / 163.0F) +
00575 dptr[4] * ( 1.0F / 163.0F)
00576 );
00577 break;
00578 case 1:
00579 dst[drs + writeIdx] =
00580 (dptr[0] * (70.0F / 219.0F) +
00581 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
00582 dptr[2] * (28.0F / 219.0F) +
00583 dptr[3] * ( 8.0F / 219.0F) +
00584 dptr[4] * ( 1.0F / 219.0F)
00585 );
00586 break;
00587 case 2:
00588 dst[drs + writeIdx] =
00589 (dptr[0] * (70.0F / 247.0F) +
00590 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00591 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00592 dptr[3] * ( 8.0F / 247.0F) +
00593 dptr[4] * ( 1.0F / 247.0F)
00594 );
00595 break;
00596 case 3:
00597 dst[drs + writeIdx] =
00598 (dptr[0] * (70.0F / 255.0F) +
00599 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00600 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00601 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00602 dptr[4] * ( 1.0F / 255.0F)
00603 );
00604 break;
00605 default:
00606
00607 break;
00608 }
00609 }
00610
00611 else if(sx < 4 && sts+sx < w-4)
00612 {
00613 switch(sx)
00614 {
00615 case 0:
00616 dst[drs + writeIdx] =
00617 ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) +
00618 (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) +
00619 (border[bn2] + dptr[2]) * (28.0F / 256.0F) +
00620 (border[bn1] + dptr[1]) * (56.0F / 256.0F) +
00621 dptr[0] * (70.0F / 256.0F)
00622 );
00623 break;
00624 case 1:
00625 dst[drs + writeIdx] =
00626 ((border[bn3] + dptr[4]) * ( 1.0F / 256.0F) +
00627 (border[bn2] + dptr[3]) * ( 8.0F / 256.0F) +
00628 (border[bn1] + dptr[2]) * (28.0F / 256.0F) +
00629 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00630 dptr[0] * (70.0F / 256.0F)
00631 );
00632 break;
00633 case 2:
00634 dst[drs + writeIdx] =
00635 ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) +
00636 (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) +
00637 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00638 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00639 dptr[0] * (70.0F / 256.0F)
00640 );
00641 break;
00642 case 3:
00643 dst[drs + writeIdx] =
00644 ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) +
00645 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
00646 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00647 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00648 dptr[0] * (70.0F / 256.0F)
00649 );
00650 break;
00651 }
00652 }
00653
00654 else if(sx>=4 && sx<src_tile_width-4 && sts+sx < w-4)
00655 {
00656 dst[drs + writeIdx] =
00657 ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) +
00658 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
00659 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00660 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00661 dptr[0] * (70.0F / 256.0F)
00662 );
00663 }
00664
00665 else if(sx>=4 && sts+sx < w-4)
00666 {
00667 switch(src_tile_width-sx)
00668 {
00669 case 4:
00670 dst[drs + writeIdx] =
00671 ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) +
00672 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
00673 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00674 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00675 dptr[0] * (70.0F / 256.0F)
00676 );
00677 break;
00678 case 3:
00679 dst[drs + writeIdx] =
00680 ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) +
00681 (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) +
00682 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00683 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00684 dptr[0] * (70.0F / 256.0F)
00685 );
00686 break;
00687 case 2:
00688 dst[drs + writeIdx] =
00689 ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) +
00690 (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) +
00691 (dptr[-2] + border[bp1]) * (28.0F / 256.0F) +
00692 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00693 dptr[0] * (70.0F / 256.0F)
00694 );
00695 break;
00696 case 1:
00697 dst[drs + writeIdx] =
00698 ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) +
00699 (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) +
00700 (dptr[-2] + border[bp2]) * (28.0F / 256.0F) +
00701 (dptr[-1] + border[bp1]) * (56.0F / 256.0F) +
00702 dptr[0] * (70.0F / 256.0F)
00703 );
00704 break;
00705 }
00706 }
00707 else if(sx < 4 && sts + sx >= w-4)
00708 {
00709 switch(sx)
00710 {
00711 case 0:
00712 switch(w-(sts+sx))
00713 {
00714 case 4:
00715 dst[drs + writeIdx] =
00716 (dptr[0] * (70.0F / 255.0F) +
00717 (border[bn1] + dptr[1]) * (56.0F / 255.0F) +
00718 (border[bn2] + dptr[2]) * (28.0F / 255.0F) +
00719 (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) +
00720 border[bn4] * ( 1.0F / 255.0F)
00721 );
00722 break;
00723 case 3:
00724 dst[drs + writeIdx] =
00725 (dptr[0] * (70.0F / 247.0F) +
00726 (border[bn1] + dptr[1]) * (56.0F / 247.0F) +
00727 (border[bn2] + dptr[2]) * (28.0F / 247.0F) +
00728 border[bn3] * ( 8.0F / 247.0F) +
00729 border[bn4] * ( 1.0F / 247.0F)
00730 );
00731 break;
00732 case 2:
00733 dst[drs + writeIdx] =
00734 (dptr[0] * (70.0F / 219.0F) +
00735 (border[bn1] + dptr[1]) * (56.0F / 219.0F) +
00736 border[bn2] * (28.0F / 219.0F) +
00737 border[bn3] * ( 8.0F / 219.0F) +
00738 border[bn4] * ( 1.0F / 219.0F)
00739 );
00740 break;
00741 case 1:
00742 dst[drs + writeIdx] =
00743 (dptr[0] * (70.0F / 163.0F) +
00744 border[bn1] * (56.0F / 163.0F) +
00745 border[bn2] * (28.0F / 163.0F) +
00746 border[bn3] * ( 8.0F / 163.0F) +
00747 border[bn4] * ( 1.0F / 163.0F)
00748 );
00749 break;
00750 }
00751 break;
00752 case 1:
00753 switch(w-(sts+sx))
00754 {
00755 case 4:
00756 dst[drs + writeIdx] =
00757 (dptr[0] * (70.0F / 255.0F) +
00758 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
00759 (border[bn1] + dptr[2]) * (28.0F / 255.0F) +
00760 (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) +
00761 border[bn3] * ( 1.0F / 255.0F)
00762 );
00763 break;
00764 case 3:
00765 dst[drs + writeIdx] =
00766 (dptr[0] * (70.0F / 247.0F) +
00767 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
00768 (border[bn1] + dptr[2]) * (28.0F / 247.0F) +
00769 border[bn2] * ( 8.0F / 247.0F) +
00770 border[bn3] * ( 1.0F / 247.0F)
00771 );
00772 break;
00773 case 2:
00774 dst[drs + writeIdx] =
00775 (dptr[0] * (70.0F / 219.0F) +
00776 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
00777 border[bn1] * (28.0F / 219.0F) +
00778 border[bn2] * ( 8.0F / 219.0F) +
00779 border[bn3] * ( 1.0F / 219.0F)
00780 );
00781 break;
00782 case 1:
00783 dst[drs + writeIdx] =
00784 (dptr[0] * (70.0F / 163.0F) +
00785 dptr[1] * (56.0F / 163.0F) +
00786 border[bn1] * (28.0F / 163.0F) +
00787 border[bn2] * ( 8.0F / 163.0F) +
00788 border[bn3] * ( 1.0F / 163.0F)
00789 );
00790 break;
00791 }
00792 break;
00793 case 2:
00794 switch(w-(sts+sx))
00795 {
00796 case 4:
00797 dst[drs + writeIdx] =
00798 (dptr[0] * (70.0F / 255.0F) +
00799 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
00800 (dptr[2] + dptr[2]) * (28.0F / 255.0F) +
00801 (border[bn1] + dptr[3]) * ( 8.0F / 255.0F) +
00802 border[bn2] * ( 1.0F / 255.0F)
00803 );
00804 break;
00805 case 3:
00806 dst[drs + writeIdx] =
00807 (dptr[0] * (70.0F / 247.0F) +
00808 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
00809 (dptr[2] + dptr[2]) * (28.0F / 247.0F) +
00810 border[bn1] * ( 8.0F / 247.0F) +
00811 border[bn2] * ( 1.0F / 247.0F)
00812 );
00813 break;
00814 case 2:
00815 dst[drs + writeIdx] =
00816 (dptr[0] * (70.0F / 219.0F) +
00817 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
00818 dptr[2] * (28.0F / 219.0F) +
00819 border[bn1] * ( 8.0F / 219.0F) +
00820 border[bn2] * ( 1.0F / 219.0F)
00821 );
00822 break;
00823 case 1:
00824 dst[drs + writeIdx] =
00825 (dptr[0] * (70.0F / 163.0F) +
00826 dptr[1] * (56.0F / 163.0F) +
00827 dptr[2] * (28.0F / 163.0F) +
00828 border[bn1] * ( 8.0F / 163.0F) +
00829 border[bn2] * ( 1.0F / 163.0F)
00830 );
00831 break;
00832 }
00833 break;
00834 case 3:
00835 switch(w-(sts+sx))
00836 {
00837 case 4:
00838 dst[drs + writeIdx] =
00839 (dptr[0] * (70.0F / 255.0F) +
00840 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
00841 (dptr[2] + dptr[2]) * (28.0F / 255.0F) +
00842 (dptr[3] + dptr[3]) * ( 8.0F / 255.0F) +
00843 border[bn1] * ( 1.0F / 255.0F)
00844 );
00845 break;
00846 case 3:
00847 dst[drs + writeIdx] =
00848 (dptr[0] * (70.0F / 247.0F) +
00849 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
00850 (dptr[2] + dptr[2]) * (28.0F / 247.0F) +
00851 dptr[3] * ( 8.0F / 247.0F) +
00852 border[bn1] * ( 1.0F / 247.0F)
00853 );
00854 break;
00855 case 2:
00856 dst[drs + writeIdx] =
00857 (dptr[0] * (70.0F / 219.0F) +
00858 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
00859 dptr[2] * (28.0F / 219.0F) +
00860 dptr[3] * ( 8.0F / 219.0F) +
00861 border[bn1] * ( 1.0F / 219.0F)
00862 );
00863 break;
00864 case 1:
00865 dst[drs + writeIdx] =
00866 (dptr[0] * (70.0F / 163.0F) +
00867 dptr[1] * (56.0F / 163.0F) +
00868 dptr[2] * (28.0F / 163.0F) +
00869 dptr[3] * ( 8.0F / 163.0F) +
00870 border[bn1] * ( 1.0F / 163.0F)
00871 );
00872 break;
00873 }
00874 break;
00875 }
00876 }
00877
00878
00879 else if(sts + sx < w)
00880 {
00881 switch(w-(sts+sx))
00882 {
00883 case 4:
00884 dst[drs + writeIdx] =
00885 (dptr[-4] * ( 1.0F / 255.0F) +
00886 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00887 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00888 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00889 dptr[0] * (70.0F / 255.0F)
00890 );
00891 break;
00892 case 3:
00893 dst[drs + writeIdx] =
00894 (dptr[-4] * ( 1.0F / 247.0F) +
00895 dptr[-3] * ( 8.0F / 247.0F) +
00896 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00897 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00898 dptr[0] * (70.0F / 247.0F)
00899 );
00900 break;
00901 case 2:
00902 dst[drs + writeIdx] =
00903 (dptr[-4] * ( 1.0F / 219.0F) +
00904 dptr[-3] * ( 8.0F / 219.0F) +
00905 dptr[-2] * (28.0F / 219.0F) +
00906 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
00907 dptr[0] * (70.0F / 219.0F)
00908 );
00909 break;
00910 case 1:
00911 dst[drs + writeIdx] =
00912 (dptr[-4] * ( 1.0F / 163.0F) +
00913 dptr[-3] * ( 8.0F / 163.0F) +
00914 dptr[-2] * (28.0F / 163.0F) +
00915 dptr[-1] * (56.0F / 163.0F) +
00916 dptr[ 0] * (70.0F / 163.0F)
00917 );
00918 break;
00919 }
00920 }
00921 }
00922 }
00923
00924
00925 __global__ void cuda_global_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height)
00926 {
00927
00928 const int src_tile_height = tile_height<<1;
00929 float *data = (float *) shared_data;
00930 float *border = (float *) &data[tile_width*src_tile_height];
00931
00932 const int dy = threadIdx.y;
00933 const int dts = IMUL(blockIdx.y, tile_height);
00934 const int dcs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00935 const int sy = dy<<1;
00936 const int sts = dts<<1;
00937 const int ste = sts + src_tile_height;
00938 const int scs = dcs;
00939
00940
00941 const int stec = min(ste, h);
00942
00943
00944
00945 if (dy < 4 && sts+sy > 4)
00946 {
00947 border[IMUL(dy,tile_width)+threadIdx.x] = src[IMUL(sts+dy,w)+scs-IMUL(4,w)];
00948 }
00949
00950 else if (dy >= tile_height-4)
00951 {
00952 int bordOff = 8+dy-tile_height;
00953 if(ste+4 <= h)
00954
00955 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[IMUL(sts+dy+tile_height+4,w)+scs];
00956 else
00957
00958
00959
00960 border[threadIdx.x+IMUL(bordOff,tile_width)] = 0;
00961 }
00962
00963
00964
00965 if (scs < w && sts+sy < stec)
00966 {
00967
00968 int smemPos = IMUL(sy, tile_width) + threadIdx.x;
00969 int gmemPos = IMUL(sts + sy, w) + scs;
00970
00971
00972 data[smemPos] = src[gmemPos];
00973 if(sts+sy+1 < stec)
00974 data[smemPos+tile_width] = src[gmemPos+w];
00975
00976
00977
00978
00979 __syncthreads();
00980
00981
00982 smemPos = IMUL(sy, tile_width) + threadIdx.x;
00983 gmemPos = IMUL(dts + dy, dw) + dcs;
00984
00985
00986
00987 float *dptr = data + smemPos;
00988 const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw;
00989 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw;
00990 const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width;
00991 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width;
00992
00993 if(h < 9)
00994 {
00995
00996 const int numAhead = max(0,min(h-1-(sts+sy),4));
00997 const int numBehind = max(0,min(sts+sy,4));
00998 const int situation = numBehind*10+numAhead;
00999 switch(situation)
01000 {
01001 case 0:
01002 dst[gmemPos] = dptr[0];
01003 break;
01004 case 1:
01005 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
01006 dptr[sw] * (56.0F / 126.0F));
01007 break;
01008 case 2:
01009 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) +
01010 dptr[sw] * (56.0F / 154.0F) +
01011 dptr[sw2] * (28.0F / 154.0F));
01012 break;
01013 case 3:
01014 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) +
01015 dptr[sw] * (56.0F / 162.0F) +
01016 dptr[sw2] * (28.0F / 162.0F) +
01017 dptr[sw3] * ( 8.0F / 162.0F));
01018 break;
01019 case 4:
01020 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) +
01021 dptr[sw] * (56.0F / 163.0F) +
01022 dptr[sw2] * (28.0F / 163.0F) +
01023 dptr[sw3] * ( 8.0F / 163.0F) +
01024 dptr[sw4] * ( 1.0F / 163.0F));
01025 break;
01026 case 20:
01027 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) +
01028 dptr[nsw] * (56.0F / 154.0F) +
01029 dptr[nsw2] * (28.0F / 154.0F));
01030 break;
01031 case 21:
01032 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) +
01033 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) +
01034 dptr[nsw2] * (28.0F / 210.0F));
01035 break;
01036 case 22:
01037 dst[gmemPos] = (dptr[0] * (70.0F / 238.0F) +
01038 (dptr[nsw] + dptr[sw]) * (56.0F / 238.0F) +
01039 (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F));
01040 break;
01041 case 23:
01042 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) +
01043 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) +
01044 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
01045 dptr[sw3] * ( 8.0F / 246.0F));
01046 break;
01047 case 24:
01048 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) +
01049 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01050 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01051 dptr[sw3] * ( 8.0F / 247.0F) +
01052 dptr[sw4] * ( 1.0F / 247.0F));
01053 break;
01054 case 40:
01055 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) +
01056 dptr[nsw] * (56.0F / 163.0F) +
01057 dptr[nsw2] * (28.0F / 163.0F) +
01058 dptr[nsw3] * ( 8.0F / 163.0F) +
01059 dptr[nsw4] * ( 1.0F / 163.0F));
01060 break;
01061 case 41:
01062 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) +
01063 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01064 dptr[nsw2] * (28.0F / 219.0F) +
01065 dptr[nsw3] * ( 8.0F / 219.0F) +
01066 dptr[nsw4] * ( 1.0F / 219.0F));
01067 break;
01068 case 42:
01069 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) +
01070 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01071 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01072 dptr[nsw3] * ( 8.0F / 247.0F) +
01073 dptr[nsw4] * ( 1.0F / 247.0F));
01074 break;
01075 case 43:
01076 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) +
01077 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01078 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01079 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01080 dptr[nsw4] * ( 1.0F / 255.0F));
01081 break;
01082 }
01083 }
01084
01085 else if(sts + sy < 4)
01086 {
01087 switch(sts+sy)
01088 {
01089 case 0:
01090 dst[gmemPos] =
01091 (dptr[0] * (70.0F / 163.0F) +
01092 dptr[sw] * (56.0F / 163.0F) +
01093 dptr[sw2] * (28.0F / 163.0F) +
01094 dptr[sw3] * ( 8.0F / 163.0F) +
01095 dptr[sw4] * ( 1.0F / 163.0F)
01096 );
01097 break;
01098 case 1:
01099 dst[gmemPos] =
01100 (dptr[0] * (70.0F / 219.0F) +
01101 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01102 dptr[sw2] * (28.0F / 219.0F) +
01103 dptr[sw3] * ( 8.0F / 219.0F) +
01104 dptr[sw4] * ( 1.0F / 219.0F)
01105 );
01106 break;
01107 case 2:
01108 dst[gmemPos] =
01109 (dptr[0] * (70.0F / 247.0F) +
01110 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01111 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01112 dptr[sw3] * ( 8.0F / 247.0F) +
01113 dptr[sw4] * ( 1.0F / 247.0F)
01114 );
01115 break;
01116 case 3:
01117 dst[gmemPos] =
01118 (dptr[0] * (70.0F / 255.0F) +
01119 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01120 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01121 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01122 dptr[sw4] * ( 1.0F / 255.0F)
01123 );
01124 break;
01125 }
01126 }
01127 else if(sy < 4 && sts+sy<h-4)
01128 {
01129 switch(sy)
01130 {
01131 case 0:
01132 dst[gmemPos] =
01133 (dptr[0] * (70.0F / 256.0F) +
01134 (border[bn1] + dptr[sw]) * (56.0F / 256.0F) +
01135 (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) +
01136 (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01137 (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F)
01138 );
01139 break;
01140 case 1:
01141 dst[gmemPos] =
01142 (dptr[0] * (70.0F / 256.0F) +
01143 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01144 (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) +
01145 (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) +
01146 (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F)
01147 );
01148 break;
01149 case 2:
01150 dst[gmemPos] =
01151 (dptr[0] * (70.0F / 256.0F) +
01152 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01153 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01154 (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) +
01155 (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F)
01156 );
01157 break;
01158 case 3:
01159 dst[gmemPos] =
01160 (dptr[0] * (70.0F / 256.0F) +
01161 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01162 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01163 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01164 (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F)
01165 );
01166 break;
01167 }
01168 }
01169 else if(sy >= 4 && sy <src_tile_height-4 && sts+sy<h-4)
01170 {
01171 dst[gmemPos] =
01172 ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) +
01173 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01174 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01175 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01176 dptr[0] * (70.0F / 256.0F)
01177 );
01178 }
01179 else if(sy >= 4 && sts + sy < h-4)
01180 {
01181 switch(src_tile_height-sy)
01182 {
01183 case 4:
01184 dst[gmemPos] =
01185 (dptr[0] * (70.0F / 256.0F) +
01186 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01187 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01188 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01189 (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F)
01190 );
01191 break;
01192 case 3:
01193 dst[gmemPos] =
01194 (dptr[0] * (70.0F / 256.0F) +
01195 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01196 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01197 (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) +
01198 (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F)
01199 );
01200 break;
01201 case 2:
01202 dst[gmemPos] =
01203 (dptr[0] * (70.0F / 256.0F) +
01204 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
01205 (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) +
01206 (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) +
01207 (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F)
01208 );
01209 break;
01210 case 1:
01211 dst[gmemPos] =
01212 (dptr[0] * (70.0F / 256.0F) +
01213 (dptr[nsw] + border[bp1]) * (56.0F / 256.0F) +
01214 (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) +
01215 (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) +
01216 (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F)
01217 );
01218 break;
01219 }
01220 }
01221 else if(sy < 4 && sts + sy >= h-4)
01222 {
01223 switch(sy)
01224 {
01225 case 0:
01226 switch(h-(sts+sy))
01227 {
01228 case 4:
01229 dst[gmemPos] =
01230 (dptr[0] * (70.0F / 255.0F) +
01231 (border[bn1] + dptr[sw]) * (56.0F / 255.0F) +
01232 (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) +
01233 (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01234 border[bn4] * ( 1.0F / 255.0F)
01235 );
01236 break;
01237 case 3:
01238 dst[gmemPos] =
01239 (dptr[0] * (70.0F / 247.0F) +
01240 (border[bn1] + dptr[sw]) * (56.0F / 247.0F) +
01241 (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) +
01242 border[bn3] * ( 8.0F / 247.0F) +
01243 border[bn4] * ( 1.0F / 247.0F)
01244 );
01245 break;
01246 case 2:
01247 dst[gmemPos] =
01248 (dptr[0] * (70.0F / 219.0F) +
01249 (border[bn1] + dptr[sw]) * (56.0F / 219.0F) +
01250 border[bn2] * (28.0F / 219.0F) +
01251 border[bn3] * ( 8.0F / 219.0F) +
01252 border[bn4] * ( 1.0F / 219.0F)
01253 );
01254 break;
01255 case 1:
01256 dst[gmemPos] =
01257 (dptr[0] * (70.0F / 163.0F) +
01258 border[bn1] * (56.0F / 163.0F) +
01259 border[bn2] * (28.0F / 163.0F) +
01260 border[bn3] * ( 8.0F / 163.0F) +
01261 border[bn4] * ( 1.0F / 163.0F)
01262 );
01263 break;
01264 }
01265 break;
01266 case 1:
01267 switch(h-(sts+sy))
01268 {
01269 case 4:
01270 dst[gmemPos] =
01271 (dptr[0] * (70.0F / 255.0F) +
01272 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01273 (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) +
01274 (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) +
01275 border[bn3] * ( 1.0F / 255.0F)
01276 );
01277 break;
01278 case 3:
01279 dst[gmemPos] =
01280 (dptr[0] * (70.0F / 247.0F) +
01281 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01282 (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) +
01283 border[bn2] * ( 8.0F / 247.0F) +
01284 border[bn3] * ( 1.0F / 247.0F)
01285 );
01286 break;
01287 case 2:
01288 dst[gmemPos] =
01289 (dptr[0] * (70.0F / 219.0F) +
01290 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01291 border[bn1] * (28.0F / 219.0F) +
01292 border[bn2] * ( 8.0F / 219.0F) +
01293 border[bn3] * ( 1.0F / 219.0F)
01294 );
01295 break;
01296 case 1:
01297 dst[gmemPos] =
01298 (dptr[0] * (70.0F / 163.0F) +
01299 dptr[nsw] * (56.0F / 163.0F) +
01300 border[bn1] * (28.0F / 163.0F) +
01301 border[bn2] * ( 8.0F / 163.0F) +
01302 border[bn3] * ( 1.0F / 163.0F)
01303 );
01304 break;
01305 }
01306 break;
01307 case 2:
01308 switch(h-(sts+sy))
01309 {
01310 case 4:
01311 dst[gmemPos] =
01312 (dptr[0] * (70.0F / 255.0F) +
01313 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01314 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01315 (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) +
01316 border[bn2] * ( 1.0F / 255.0F)
01317 );
01318 break;
01319 case 3:
01320 dst[gmemPos] =
01321 (dptr[0] * (70.0F / 247.0F) +
01322 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01323 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01324 border[bn1] * ( 8.0F / 247.0F) +
01325 border[bn2] * ( 1.0F / 247.0F)
01326 );
01327 break;
01328 case 2:
01329 dst[gmemPos] =
01330 (dptr[0] * (70.0F / 219.0F) +
01331 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01332 dptr[nsw2] * (28.0F / 219.0F) +
01333 border[bn1] * ( 8.0F / 219.0F) +
01334 border[bn2] * ( 1.0F / 219.0F)
01335 );
01336 break;
01337 case 1:
01338 dst[gmemPos] =
01339 (dptr[0] * (70.0F / 163.0F) +
01340 dptr[nsw] * (56.0F / 163.0F) +
01341 dptr[nsw2] * (28.0F / 163.0F) +
01342 border[bn1] * ( 8.0F / 163.0F) +
01343 border[bn2] * ( 1.0F / 163.0F)
01344 );
01345 break;
01346 }
01347 break;
01348 case 3:
01349 switch(h-(sts+sy))
01350 {
01351 case 4:
01352 dst[gmemPos] =
01353 (dptr[0] * (70.0F / 255.0F) +
01354 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01355 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01356 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01357 border[bn1] * ( 1.0F / 255.0F)
01358 );
01359 break;
01360 case 3:
01361 dst[gmemPos] =
01362 (dptr[0] * (70.0F / 247.0F) +
01363 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01364 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01365 dptr[nsw3] * ( 8.0F / 247.0F) +
01366 border[bn1] * ( 1.0F / 247.0F)
01367 );
01368 break;
01369 case 2:
01370 dst[gmemPos] =
01371 (dptr[0] * (70.0F / 219.0F) +
01372 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01373 dptr[nsw2] * (28.0F / 219.0F) +
01374 dptr[nsw3] * ( 8.0F / 219.0F) +
01375 border[bn1] * ( 1.0F / 219.0F)
01376 );
01377 break;
01378 case 1:
01379 dst[gmemPos] =
01380 (dptr[0] * (70.0F / 163.0F) +
01381 dptr[nsw] * (56.0F / 163.0F) +
01382 dptr[nsw2] * (28.0F / 163.0F) +
01383 dptr[nsw3] * ( 8.0F / 163.0F) +
01384 border[bn1] * ( 1.0F / 163.0F)
01385 );
01386 break;
01387 }
01388 break;
01389 }
01390 }
01391 else
01392 {
01393 switch(h-(sts+sy))
01394 {
01395 case 4:
01396 dst[gmemPos] =
01397 (dptr[0] * (70.0F / 255.0F) +
01398 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
01399 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01400 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01401 dptr[nsw4] * ( 1.0F / 255.0F)
01402 );
01403 break;
01404 case 3:
01405 dst[gmemPos] =
01406 (dptr[0] * (70.0F / 247.0F) +
01407 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
01408 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01409 dptr[nsw3] * ( 8.0F / 247.0F) +
01410 dptr[nsw4] * ( 1.0F / 247.0F)
01411 );
01412 break;
01413 case 2:
01414 dst[gmemPos] =
01415 (dptr[0] * (70.0F / 219.0F) +
01416 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01417 dptr[nsw2] * (28.0F / 219.0F) +
01418 dptr[nsw3] * ( 8.0F / 219.0F) +
01419 dptr[nsw4] * ( 1.0F / 219.0F)
01420 );
01421 break;
01422 case 1:
01423 dst[gmemPos] =
01424 (dptr[0] * (70.0F / 163.0F) +
01425 dptr[nsw] * (56.0F / 163.0F) +
01426 dptr[nsw2] * (28.0F / 163.0F) +
01427 dptr[nsw3] * ( 8.0F / 163.0F) +
01428 dptr[nsw4] * ( 1.0F / 163.0F)
01429 );
01430 break;
01431 }
01432 }
01433 }
01434 }
01435
01436
01437
01438 __global__ void cuda_global_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
01439 {
01440
01441 float *data = (float *) shared_data;
01442
01443 float *border = (float *) &data[tile_width];
01444 const int sx = threadIdx.x;
01445 const int sts = IMUL(blockIdx.x, tile_width);
01446 const int srs = IMUL(blockIdx.y,w);
01447
01448 const int loadIdx = sts + sx;
01449 const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3;
01450 const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7;
01451 float *dptr = &data[sx];
01452
01453 if (sx < 4 && sts > 0) border[sx] = src[srs + sts - (4-sx)];
01454 if (sx >= tile_width-4 && sts+tile_width < w-4) border[4+sx-(tile_width-4)] = src[srs + sts + sx + 4];
01455
01456
01457 if (loadIdx < w)
01458 data[sx] = src[srs + loadIdx];
01459 else
01460 return;
01461
01462
01463
01464 __syncthreads();
01465
01466
01467
01468 if(w < 9)
01469 {
01470
01471 const int numAhead = max(0,min(w-1-(sts+sx),4));
01472 const int numBehind = max(0,min(sts+sx,4));
01473 const int situation = numBehind*10+numAhead;
01474
01475 switch(situation)
01476 {
01477 case 0:
01478 dst[srs+loadIdx] = dptr[0];
01479 break;
01480 case 1:
01481 dst[srs+loadIdx] = (dptr[0] * (70.0F / 126.0F) +
01482 dptr[1] * (56.0F / 126.0F));
01483 break;
01484 case 2:
01485 dst[srs+loadIdx] = (dptr[0] * (70.0F / 154.0F) +
01486 dptr[1] * (56.0F / 154.0F) +
01487 dptr[2] * (28.0F / 154.0F));
01488 break;
01489 case 3:
01490 dst[srs+loadIdx] = (dptr[0] * (70.0F / 162.0F) +
01491 dptr[1] * (56.0F / 162.0F) +
01492 dptr[2] * (28.0F / 162.0F) +
01493 dptr[3] * ( 8.0F / 162.0F));
01494 break;
01495 case 4:
01496 dst[srs+loadIdx] = (dptr[0] * (70.0F / 163.0F) +
01497 dptr[1] * (56.0F / 163.0F) +
01498 dptr[2] * (28.0F / 163.0F) +
01499 dptr[3] * ( 8.0F / 163.0F) +
01500 dptr[4] * ( 1.0F / 163.0F));
01501 break;
01502 case 10:
01503 dst[srs+loadIdx] = (dptr[0] * (70.0F / 126.0F) +
01504 dptr[-1] * (56.0F / 126.0F));
01505 break;
01506 case 11:
01507 dst[srs+loadIdx] = (dptr[0] * (70.0F / 182.0F) +
01508 (dptr[-1] + dptr[1]) * (56.0F / 182.0F));
01509 break;
01510 case 12:
01511 dst[srs+loadIdx] = (dptr[0] * (70.0F / 210.0F) +
01512 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) +
01513 dptr[2] * (28.0F / 210.0F));
01514 break;
01515 case 13:
01516 dst[srs+loadIdx] = (dptr[0] * (70.0F / 218.0F) +
01517 (dptr[-1] + dptr[1]) * (56.0F / 218.0F) +
01518 dptr[2] * (28.0F / 218.0F) +
01519 dptr[3] * ( 8.0F / 218.0F));
01520 break;
01521 case 14:
01522 dst[srs+loadIdx] = (dptr[0] * (70.0F / 219.0F) +
01523 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01524 dptr[2] * (28.0F / 219.0F) +
01525 dptr[3] * ( 8.0F / 219.0F) +
01526 dptr[4] * ( 1.0F / 219.0F));
01527 break;
01528 case 20:
01529 dst[srs+loadIdx] = (dptr[0] * (70.0F / 154.0F) +
01530 dptr[-1] * (56.0F / 154.0F) +
01531 dptr[-2] * (28.0F / 154.0F));
01532 break;
01533 case 21:
01534 dst[srs+loadIdx] = (dptr[0] * (70.0F / 210.0F) +
01535 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) +
01536 dptr[-2] * (28.0F / 210.0F));
01537 break;
01538 case 22:
01539 dst[srs+loadIdx] = (dptr[0] * (70.0F / 238.0F) +
01540 (dptr[-1] + dptr[1]) * (56.0F / 238.0F) +
01541 (dptr[-2] + dptr[2]) * (28.0F / 238.0F));
01542 break;
01543 case 23:
01544 dst[srs+loadIdx] = (dptr[0] * (70.0F / 246.0F) +
01545 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
01546 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
01547 dptr[3] * ( 8.0F / 246.0F));
01548 break;
01549 case 24:
01550 dst[srs+loadIdx] = (dptr[0] * (70.0F / 247.0F) +
01551 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01552 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01553 dptr[3] * ( 8.0F / 247.0F) +
01554 dptr[4] * ( 1.0F / 247.0F));
01555 break;
01556 case 30:
01557 dst[srs+loadIdx] = (dptr[0] * (70.0F / 162.0F) +
01558 dptr[-1] * (56.0F / 162.0F) +
01559 dptr[-2] * (28.0F / 162.0F) +
01560 dptr[-3] * ( 8.0F / 162.0F));
01561 break;
01562 case 31:
01563 dst[srs+loadIdx] = (dptr[0] * (70.0F / 218.0F) +
01564 (dptr[-1] + dptr[1]) * (56.0F / 218.0F) +
01565 dptr[-2] * (28.0F / 218.0F) +
01566 dptr[-3] * ( 8.0F / 218.0F));
01567 break;
01568 case 32:
01569 dst[srs+loadIdx] = (dptr[0] * (70.0F / 246.0F) +
01570 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
01571 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
01572 dptr[-3] * ( 8.0F / 246.0F));
01573 break;
01574 case 33:
01575 dst[srs+loadIdx] = (dptr[0] * (70.0F / 254.0F) +
01576 (dptr[-1] + dptr[1]) * (56.0F / 254.0F) +
01577 (dptr[-2] + dptr[2]) * (28.0F / 254.0F) +
01578 (dptr[-3] + dptr[3]) * ( 8.0F / 254.0F));
01579 break;
01580 case 34:
01581 dst[srs+loadIdx] = (dptr[0] * (70.0F / 255.0F) +
01582 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01583 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01584 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01585 dptr[4] * ( 1.0F / 255.0F));
01586 break;
01587 case 40:
01588 dst[srs+loadIdx] = (dptr[0] * (70.0F / 163.0F) +
01589 dptr[-1] * (56.0F / 163.0F) +
01590 dptr[-2] * (28.0F / 163.0F) +
01591 dptr[-3] * ( 8.0F / 163.0F) +
01592 dptr[-4] * ( 1.0F / 163.0F));
01593 break;
01594 case 41:
01595 dst[srs+loadIdx] = (dptr[0] * (70.0F / 219.0F) +
01596 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01597 dptr[-2] * (28.0F / 219.0F) +
01598 dptr[-3] * ( 8.0F / 219.0F) +
01599 dptr[-4] * ( 1.0F / 219.0F));
01600 break;
01601 case 42:
01602 dst[srs+loadIdx] = (dptr[0] * (70.0F / 247.0F) +
01603 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01604 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01605 dptr[-3] * ( 8.0F / 247.0F) +
01606 dptr[-4] * ( 1.0F / 247.0F));
01607 break;
01608 case 43:
01609 dst[srs+loadIdx] = (dptr[0] * (70.0F / 255.0F) +
01610 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01611 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01612 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01613 dptr[-4] * ( 1.0F / 255.0F));
01614 break;
01615 }
01616 }
01617
01618 else if(sts+sx < 4)
01619 {
01620 switch(sx)
01621 {
01622 case 0:
01623 dst[srs + loadIdx] =
01624 (dptr[0] * (70.0F / 163.0F) +
01625 dptr[1] * (56.0F / 163.0F) +
01626 dptr[2] * (28.0F / 163.0F) +
01627 dptr[3] * ( 8.0F / 163.0F) +
01628 dptr[4] * ( 1.0F / 163.0F)
01629 );
01630 break;
01631 case 1:
01632 dst[srs + loadIdx] =
01633 (dptr[0] * (70.0F / 219.0F) +
01634 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01635 dptr[2] * (28.0F / 219.0F) +
01636 dptr[3] * ( 8.0F / 219.0F) +
01637 dptr[4] * ( 1.0F / 219.0F)
01638 );
01639 break;
01640 case 2:
01641 dst[srs + loadIdx] =
01642 (dptr[0] * (70.0F / 247.0F) +
01643 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01644 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01645 dptr[3] * ( 8.0F / 247.0F) +
01646 dptr[4] * ( 1.0F / 247.0F)
01647 );
01648 break;
01649 case 3:
01650 dst[srs + loadIdx] =
01651 (dptr[0] * (70.0F / 255.0F) +
01652 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01653 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01654 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01655 dptr[4] * ( 1.0F / 255.0F)
01656 );
01657 break;
01658 default:
01659
01660 break;
01661 }
01662 }
01663
01664 else if(sx < 4 && sts+sx < w-4)
01665 {
01666 switch(sx)
01667 {
01668 case 0:
01669 dst[srs + loadIdx] =
01670 ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) +
01671 (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) +
01672 (border[bn2] + dptr[2]) * (28.0F / 256.0F) +
01673 (border[bn1] + dptr[1]) * (56.0F / 256.0F) +
01674 dptr[0] * (70.0F / 256.0F)
01675 );
01676 break;
01677 case 1:
01678 dst[srs + loadIdx] =
01679 ((border[bn3] + dptr[4]) * ( 1.0F / 256.0F) +
01680 (border[bn2] + dptr[3]) * ( 8.0F / 256.0F) +
01681 (border[bn1] + dptr[2]) * (28.0F / 256.0F) +
01682 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01683 dptr[0] * (70.0F / 256.0F)
01684 );
01685 break;
01686 case 2:
01687 dst[srs + loadIdx] =
01688 ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) +
01689 (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) +
01690 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01691 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01692 dptr[0] * (70.0F / 256.0F)
01693 );
01694 break;
01695 case 3:
01696 dst[srs + loadIdx] =
01697 ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) +
01698 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
01699 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01700 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01701 dptr[0] * (70.0F / 256.0F)
01702 );
01703 break;
01704 }
01705 }
01706
01707 else if(sx>=4 && sx<tile_width-4 && sts+sx < w-4)
01708 {
01709 dst[srs + loadIdx] =
01710 ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) +
01711 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
01712 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01713 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01714 dptr[0] * (70.0F / 256.0F)
01715 );
01716 }
01717
01718 else if(sx>=4 && sts+sx < w-4)
01719 {
01720 switch(tile_width-sx)
01721 {
01722 case 4:
01723 dst[srs + loadIdx] =
01724 ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) +
01725 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
01726 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01727 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01728 dptr[0] * (70.0F / 256.0F)
01729 );
01730 break;
01731 case 3:
01732 dst[srs + loadIdx] =
01733 ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) +
01734 (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) +
01735 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01736 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01737 dptr[0] * (70.0F / 256.0F)
01738 );
01739 break;
01740 case 2:
01741 dst[srs + loadIdx] =
01742 ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) +
01743 (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) +
01744 (dptr[-2] + border[bp1]) * (28.0F / 256.0F) +
01745 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01746 dptr[0] * (70.0F / 256.0F)
01747 );
01748 break;
01749 case 1:
01750 dst[srs + loadIdx] =
01751 ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) +
01752 (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) +
01753 (dptr[-2] + border[bp2]) * (28.0F / 256.0F) +
01754 (dptr[-1] + border[bp1]) * (56.0F / 256.0F) +
01755 dptr[0] * (70.0F / 256.0F)
01756 );
01757 break;
01758 }
01759 }
01760 else if(sx < 4 && sts + sx >= w-4)
01761 {
01762 switch(sx)
01763 {
01764 case 0:
01765 switch(w-(sts+sx))
01766 {
01767 case 4:
01768 dst[srs + loadIdx] =
01769 (dptr[0] * (70.0F / 255.0F) +
01770 (border[bn1] + dptr[1]) * (56.0F / 255.0F) +
01771 (border[bn2] + dptr[2]) * (28.0F / 255.0F) +
01772 (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) +
01773 border[bn4] * ( 1.0F / 255.0F)
01774 );
01775 break;
01776 case 3:
01777 dst[srs + loadIdx] =
01778 (dptr[0] * (70.0F / 247.0F) +
01779 (border[bn1] + dptr[1]) * (56.0F / 247.0F) +
01780 (border[bn2] + dptr[2]) * (28.0F / 247.0F) +
01781 border[bn3] * ( 8.0F / 247.0F) +
01782 border[bn4] * ( 1.0F / 247.0F)
01783 );
01784 break;
01785 case 2:
01786 dst[srs + loadIdx] =
01787 (dptr[0] * (70.0F / 219.0F) +
01788 (border[bn1] + dptr[1]) * (56.0F / 219.0F) +
01789 border[bn2] * (28.0F / 219.0F) +
01790 border[bn3] * ( 8.0F / 219.0F) +
01791 border[bn4] * ( 1.0F / 219.0F)
01792 );
01793 break;
01794 case 1:
01795 dst[srs + loadIdx] =
01796 (dptr[0] * (70.0F / 163.0F) +
01797 border[bn1] * (56.0F / 163.0F) +
01798 border[bn2] * (28.0F / 163.0F) +
01799 border[bn3] * ( 8.0F / 163.0F) +
01800 border[bn4] * ( 1.0F / 163.0F)
01801 );
01802 break;
01803 }
01804 break;
01805 case 1:
01806 switch(w-(sts+sx))
01807 {
01808 case 4:
01809 dst[srs + loadIdx] =
01810 (dptr[0] * (70.0F / 255.0F) +
01811 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
01812 (border[bn1] + dptr[2]) * (28.0F / 255.0F) +
01813 (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) +
01814 border[bn3] * ( 1.0F / 255.0F)
01815 );
01816 break;
01817 case 3:
01818 dst[srs + loadIdx] =
01819 (dptr[0] * (70.0F / 247.0F) +
01820 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
01821 (border[bn1] + dptr[2]) * (28.0F / 247.0F) +
01822 border[bn2] * ( 8.0F / 247.0F) +
01823 border[bn3] * ( 1.0F / 247.0F)
01824 );
01825 break;
01826 case 2:
01827 dst[srs + loadIdx] =
01828 (dptr[0] * (70.0F / 219.0F) +
01829 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
01830 border[bn1] * (28.0F / 219.0F) +
01831 border[bn2] * ( 8.0F / 219.0F) +
01832 border[bn3] * ( 1.0F / 219.0F)
01833 );
01834 break;
01835 case 1:
01836 dst[srs + loadIdx] =
01837 (dptr[0] * (70.0F / 163.0F) +
01838 dptr[1] * (56.0F / 163.0F) +
01839 border[bn1] * (28.0F / 163.0F) +
01840 border[bn2] * ( 8.0F / 163.0F) +
01841 border[bn3] * ( 1.0F / 163.0F)
01842 );
01843 break;
01844 }
01845 break;
01846 case 2:
01847 switch(w-(sts+sx))
01848 {
01849 case 4:
01850 dst[srs + loadIdx] =
01851 (dptr[0] * (70.0F / 255.0F) +
01852 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
01853 (dptr[2] + dptr[2]) * (28.0F / 255.0F) +
01854 (border[bn1] + dptr[3]) * ( 8.0F / 255.0F) +
01855 border[bn2] * ( 1.0F / 255.0F)
01856 );
01857 break;
01858 case 3:
01859 dst[srs + loadIdx] =
01860 (dptr[0] * (70.0F / 247.0F) +
01861 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
01862 (dptr[2] + dptr[2]) * (28.0F / 247.0F) +
01863 border[bn1] * ( 8.0F / 247.0F) +
01864 border[bn2] * ( 1.0F / 247.0F)
01865 );
01866 break;
01867 case 2:
01868 dst[srs + loadIdx] =
01869 (dptr[0] * (70.0F / 219.0F) +
01870 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
01871 dptr[2] * (28.0F / 219.0F) +
01872 border[bn1] * ( 8.0F / 219.0F) +
01873 border[bn2] * ( 1.0F / 219.0F)
01874 );
01875 break;
01876 case 1:
01877 dst[srs + loadIdx] =
01878 (dptr[0] * (70.0F / 163.0F) +
01879 dptr[1] * (56.0F / 163.0F) +
01880 dptr[2] * (28.0F / 163.0F) +
01881 border[bn1] * ( 8.0F / 163.0F) +
01882 border[bn2] * ( 1.0F / 163.0F)
01883 );
01884 break;
01885 }
01886 break;
01887 case 3:
01888 switch(w-(sts+sx))
01889 {
01890 case 4:
01891 dst[srs + loadIdx] =
01892 (dptr[0] * (70.0F / 255.0F) +
01893 (dptr[1] + dptr[1]) * (56.0F / 255.0F) +
01894 (dptr[2] + dptr[2]) * (28.0F / 255.0F) +
01895 (dptr[3] + dptr[3]) * ( 8.0F / 255.0F) +
01896 border[bn1] * ( 1.0F / 255.0F)
01897 );
01898 break;
01899 case 3:
01900 dst[srs + loadIdx] =
01901 (dptr[0] * (70.0F / 247.0F) +
01902 (dptr[1] + dptr[1]) * (56.0F / 247.0F) +
01903 (dptr[2] + dptr[2]) * (28.0F / 247.0F) +
01904 dptr[3] * ( 8.0F / 247.0F) +
01905 border[bn1] * ( 1.0F / 247.0F)
01906 );
01907 break;
01908 case 2:
01909 dst[srs + loadIdx] =
01910 (dptr[0] * (70.0F / 219.0F) +
01911 (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
01912 dptr[2] * (28.0F / 219.0F) +
01913 dptr[3] * ( 8.0F / 219.0F) +
01914 border[bn1] * ( 1.0F / 219.0F)
01915 );
01916 break;
01917 case 1:
01918 dst[srs + loadIdx] =
01919 (dptr[0] * (70.0F / 163.0F) +
01920 dptr[1] * (56.0F / 163.0F) +
01921 dptr[2] * (28.0F / 163.0F) +
01922 dptr[3] * ( 8.0F / 163.0F) +
01923 border[bn1] * ( 1.0F / 163.0F)
01924 );
01925 break;
01926 }
01927 break;
01928 }
01929 }
01930
01931
01932 else if(sts + sx < w)
01933 {
01934 switch(w-(sts+sx))
01935 {
01936 case 4:
01937 dst[srs + loadIdx] =
01938 (dptr[-4] * ( 1.0F / 255.0F) +
01939 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01940 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01941 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01942 dptr[0] * (70.0F / 255.0F)
01943 );
01944 break;
01945 case 3:
01946 dst[srs + loadIdx] =
01947 (dptr[-4] * ( 1.0F / 247.0F) +
01948 dptr[-3] * ( 8.0F / 247.0F) +
01949 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01950 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01951 dptr[0] * (70.0F / 247.0F)
01952 );
01953 break;
01954 case 2:
01955 dst[srs + loadIdx] =
01956 (dptr[-4] * ( 1.0F / 219.0F) +
01957 dptr[-3] * ( 8.0F / 219.0F) +
01958 dptr[-2] * (28.0F / 219.0F) +
01959 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01960 dptr[0] * (70.0F / 219.0F)
01961 );
01962 break;
01963 case 1:
01964 dst[srs + loadIdx] =
01965 (dptr[-4] * ( 1.0F / 163.0F) +
01966 dptr[-3] * ( 8.0F / 163.0F) +
01967 dptr[-2] * (28.0F / 163.0F) +
01968 dptr[-1] * (56.0F / 163.0F) +
01969 dptr[ 0] * (70.0F / 163.0F)
01970 );
01971 break;
01972 }
01973 }
01974 }
01975
01976
01977 __global__ void cuda_global_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
01978 {
01979
01980 float *data = (float *) shared_data;
01981 float *border = (float *) &data[tile_width*tile_height];
01982
01983 const int sy = threadIdx.y;
01984
01985 const int sts = IMUL(blockIdx.y, tile_height);
01986 const int ste = sts + tile_height;
01987 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
01988
01989
01990 const int stec = min(ste, h);
01991
01992
01993
01994
01995
01996 if (scs < w && sts+sy < stec)
01997 {
01998
01999 int smemPos = IMUL(sy, tile_width) + threadIdx.x;
02000 int gmemPos = IMUL(sts + sy, w) + scs;
02001
02002 data[smemPos] = src[gmemPos];
02003
02004 if (sy < 4 && gmemPos > IMUL(4,w))
02005 border[smemPos] = src[gmemPos-IMUL(4,w)];
02006
02007 int bordOff = 8+sy-tile_height;
02008
02009 if (sy >= tile_height-4)
02010 {
02011 if(ste+4 <= h)
02012 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(4,w)];
02013 else
02014
02015
02016
02017 border[threadIdx.x+IMUL(bordOff,tile_width)] = 0;
02018 }
02019
02020
02021
02022 __syncthreads();
02023
02024
02025 smemPos = IMUL(sy, tile_width) + threadIdx.x;
02026 gmemPos = IMUL(sts + sy, w) + scs;
02027
02028
02029
02030 float *dptr = data + smemPos;
02031 const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw;
02032 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw;
02033 const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width;
02034 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width;
02035
02036 if(h < 9)
02037 {
02038
02039 const int numAhead = max(0,min(h-1-(sts+sy),4));
02040 const int numBehind = max(0,min(sts+sy,4));
02041 const int situation = numBehind*10+numAhead;
02042 switch(situation)
02043 {
02044 case 0:
02045 dst[gmemPos] = dptr[0];
02046 break;
02047 case 1:
02048 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
02049 dptr[sw] * (56.0F / 126.0F));
02050 break;
02051 case 2:
02052 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) +
02053 dptr[sw] * (56.0F / 154.0F) +
02054 dptr[sw2] * (28.0F / 154.0F));
02055 break;
02056 case 3:
02057 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) +
02058 dptr[sw] * (56.0F / 162.0F) +
02059 dptr[sw2] * (28.0F / 162.0F) +
02060 dptr[sw3] * ( 8.0F / 162.0F));
02061 break;
02062 case 4:
02063 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) +
02064 dptr[sw] * (56.0F / 163.0F) +
02065 dptr[sw2] * (28.0F / 163.0F) +
02066 dptr[sw3] * ( 8.0F / 163.0F) +
02067 dptr[sw4] * ( 1.0F / 163.0F));
02068 break;
02069 case 10:
02070 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
02071 dptr[nsw] * (56.0F / 126.0F));
02072 break;
02073 case 11:
02074 dst[gmemPos] = (dptr[0] * (70.0F / 182.0F) +
02075 (dptr[nsw] + dptr[sw]) * (56.0F / 182.0F));
02076 break;
02077 case 12:
02078 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) +
02079 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) +
02080 dptr[sw2] * (28.0F / 210.0F));
02081 break;
02082 case 13:
02083 dst[gmemPos] = (dptr[0] * (70.0F / 218.0F) +
02084 (dptr[nsw] + dptr[sw]) * (56.0F / 218.0F) +
02085 dptr[sw2] * (28.0F / 218.0F) +
02086 dptr[sw3] * ( 8.0F / 218.0F));
02087 break;
02088 case 14:
02089 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) +
02090 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02091 dptr[sw2] * (28.0F / 219.0F) +
02092 dptr[sw3] * ( 8.0F / 219.0F) +
02093 dptr[sw4] * ( 1.0F / 219.0F));
02094 break;
02095 case 20:
02096 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) +
02097 dptr[nsw] * (56.0F / 154.0F) +
02098 dptr[nsw2] * (28.0F / 154.0F));
02099 break;
02100 case 21:
02101 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) +
02102 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) +
02103 dptr[nsw2] * (28.0F / 210.0F));
02104 break;
02105 case 22:
02106 dst[gmemPos] = (dptr[0] * (70.0F / 238.0F) +
02107 (dptr[nsw] + dptr[sw]) * (56.0F / 238.0F) +
02108 (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F));
02109 break;
02110 case 23:
02111 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) +
02112 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) +
02113 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
02114 dptr[sw3] * ( 8.0F / 246.0F));
02115 break;
02116 case 24:
02117 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) +
02118 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02119 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02120 dptr[sw3] * ( 8.0F / 247.0F) +
02121 dptr[sw4] * ( 1.0F / 247.0F));
02122 break;
02123 case 30:
02124 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) +
02125 dptr[nsw] * (56.0F / 162.0F) +
02126 dptr[nsw2] * (28.0F / 162.0F) +
02127 dptr[nsw3] * ( 8.0F / 162.0F));
02128 break;
02129 case 31:
02130 dst[gmemPos] = (dptr[0] * (70.0F / 218.0F) +
02131 (dptr[nsw] + dptr[sw]) * (56.0F / 218.0F) +
02132 dptr[nsw2] * (28.0F / 218.0F) +
02133 dptr[nsw3] * ( 8.0F / 218.0F));
02134 break;
02135 case 32:
02136 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) +
02137 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) +
02138 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
02139 dptr[nsw3] * ( 8.0F / 246.0F));
02140 break;
02141 case 33:
02142 dst[gmemPos] = (dptr[0] * (70.0F / 254.0F) +
02143 (dptr[nsw] + dptr[sw]) * (56.0F / 254.0F) +
02144 (dptr[nsw2] + dptr[sw2]) * (28.0F / 254.0F) +
02145 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 254.0F));
02146 break;
02147 case 34:
02148 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) +
02149 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02150 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02151 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02152 dptr[sw4] * ( 1.0F / 255.0F));
02153 break;
02154 case 40:
02155 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) +
02156 dptr[nsw] * (56.0F / 163.0F) +
02157 dptr[nsw2] * (28.0F / 163.0F) +
02158 dptr[nsw3] * ( 8.0F / 163.0F) +
02159 dptr[nsw4] * ( 1.0F / 163.0F));
02160 break;
02161 case 41:
02162 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) +
02163 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02164 dptr[nsw2] * (28.0F / 219.0F) +
02165 dptr[nsw3] * ( 8.0F / 219.0F) +
02166 dptr[nsw4] * ( 1.0F / 219.0F));
02167 break;
02168 case 42:
02169 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) +
02170 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02171 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02172 dptr[nsw3] * ( 8.0F / 247.0F) +
02173 dptr[nsw4] * ( 1.0F / 247.0F));
02174 break;
02175 case 43:
02176 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) +
02177 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02178 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02179 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02180 dptr[nsw4] * ( 1.0F / 255.0F));
02181 break;
02182 }
02183 }
02184
02185 else if(sts + sy < 4)
02186 {
02187 switch(sts+sy)
02188 {
02189 case 0:
02190 dst[gmemPos] =
02191 (dptr[0] * (70.0F / 163.0F) +
02192 dptr[sw] * (56.0F / 163.0F) +
02193 dptr[sw2] * (28.0F / 163.0F) +
02194 dptr[sw3] * ( 8.0F / 163.0F) +
02195 dptr[sw4] * ( 1.0F / 163.0F)
02196 );
02197 break;
02198 case 1:
02199 dst[gmemPos] =
02200 (dptr[0] * (70.0F / 219.0F) +
02201 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02202 dptr[sw2] * (28.0F / 219.0F) +
02203 dptr[sw3] * ( 8.0F / 219.0F) +
02204 dptr[sw4] * ( 1.0F / 219.0F)
02205 );
02206 break;
02207 case 2:
02208 dst[gmemPos] =
02209 (dptr[0] * (70.0F / 247.0F) +
02210 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02211 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02212 dptr[sw3] * ( 8.0F / 247.0F) +
02213 dptr[sw4] * ( 1.0F / 247.0F)
02214 );
02215 break;
02216 case 3:
02217 dst[gmemPos] =
02218 (dptr[0] * (70.0F / 255.0F) +
02219 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02220 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02221 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02222 dptr[sw4] * ( 1.0F / 255.0F)
02223 );
02224 break;
02225 }
02226 }
02227 else if(sy < 4 && sts+sy<h-4)
02228 {
02229 switch(sy)
02230 {
02231 case 0:
02232 dst[gmemPos] =
02233 (dptr[0] * (70.0F / 256.0F) +
02234 (border[bn1] + dptr[sw]) * (56.0F / 256.0F) +
02235 (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) +
02236 (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02237 (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F)
02238 );
02239 break;
02240 case 1:
02241 dst[gmemPos] =
02242 (dptr[0] * (70.0F / 256.0F) +
02243 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02244 (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) +
02245 (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) +
02246 (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F)
02247 );
02248 break;
02249 case 2:
02250 dst[gmemPos] =
02251 (dptr[0] * (70.0F / 256.0F) +
02252 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02253 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02254 (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) +
02255 (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F)
02256 );
02257 break;
02258 case 3:
02259 dst[gmemPos] =
02260 (dptr[0] * (70.0F / 256.0F) +
02261 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02262 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02263 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02264 (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F)
02265 );
02266 break;
02267 }
02268 }
02269 else if(sy >= 4 && sy <tile_height-4 && sts+sy<h-4)
02270 {
02271 dst[gmemPos] =
02272 ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) +
02273 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02274 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02275 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02276 dptr[0] * (70.0F / 256.0F)
02277 );
02278 }
02279 else if(sy >= 4 && sts + sy < h-4)
02280 {
02281 switch(tile_height-sy)
02282 {
02283 case 4:
02284 dst[gmemPos] =
02285 (dptr[0] * (70.0F / 256.0F) +
02286 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02287 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02288 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02289 (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F)
02290 );
02291 break;
02292 case 3:
02293 dst[gmemPos] =
02294 (dptr[0] * (70.0F / 256.0F) +
02295 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02296 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02297 (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) +
02298 (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F)
02299 );
02300 break;
02301 case 2:
02302 dst[gmemPos] =
02303 (dptr[0] * (70.0F / 256.0F) +
02304 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) +
02305 (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) +
02306 (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) +
02307 (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F)
02308 );
02309 break;
02310 case 1:
02311 dst[gmemPos] =
02312 (dptr[0] * (70.0F / 256.0F) +
02313 (dptr[nsw] + border[bp1]) * (56.0F / 256.0F) +
02314 (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) +
02315 (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) +
02316 (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F)
02317 );
02318 break;
02319 }
02320 }
02321 else if(sy < 4 && sts + sy >= h-4)
02322 {
02323 switch(sy)
02324 {
02325 case 0:
02326 switch(h-(sts+sy))
02327 {
02328 case 4:
02329 dst[gmemPos] =
02330 (dptr[0] * (70.0F / 255.0F) +
02331 (border[bn1] + dptr[sw]) * (56.0F / 255.0F) +
02332 (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) +
02333 (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02334 border[bn4] * ( 1.0F / 255.0F)
02335 );
02336 break;
02337 case 3:
02338 dst[gmemPos] =
02339 (dptr[0] * (70.0F / 247.0F) +
02340 (border[bn1] + dptr[sw]) * (56.0F / 247.0F) +
02341 (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) +
02342 border[bn3] * ( 8.0F / 247.0F) +
02343 border[bn4] * ( 1.0F / 247.0F)
02344 );
02345 break;
02346 case 2:
02347 dst[gmemPos] =
02348 (dptr[0] * (70.0F / 219.0F) +
02349 (border[bn1] + dptr[sw]) * (56.0F / 219.0F) +
02350 border[bn2] * (28.0F / 219.0F) +
02351 border[bn3] * ( 8.0F / 219.0F) +
02352 border[bn4] * ( 1.0F / 219.0F)
02353 );
02354 break;
02355 case 1:
02356 dst[gmemPos] =
02357 (dptr[0] * (70.0F / 163.0F) +
02358 border[bn1] * (56.0F / 163.0F) +
02359 border[bn2] * (28.0F / 163.0F) +
02360 border[bn3] * ( 8.0F / 163.0F) +
02361 border[bn4] * ( 1.0F / 163.0F)
02362 );
02363 break;
02364 }
02365 break;
02366 case 1:
02367 switch(h-(sts+sy))
02368 {
02369 case 4:
02370 dst[gmemPos] =
02371 (dptr[0] * (70.0F / 255.0F) +
02372 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02373 (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) +
02374 (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) +
02375 border[bn3] * ( 1.0F / 255.0F)
02376 );
02377 break;
02378 case 3:
02379 dst[gmemPos] =
02380 (dptr[0] * (70.0F / 247.0F) +
02381 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02382 (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) +
02383 border[bn2] * ( 8.0F / 247.0F) +
02384 border[bn3] * ( 1.0F / 247.0F)
02385 );
02386 break;
02387 case 2:
02388 dst[gmemPos] =
02389 (dptr[0] * (70.0F / 219.0F) +
02390 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02391 border[bn1] * (28.0F / 219.0F) +
02392 border[bn2] * ( 8.0F / 219.0F) +
02393 border[bn3] * ( 1.0F / 219.0F)
02394 );
02395 break;
02396 case 1:
02397 dst[gmemPos] =
02398 (dptr[0] * (70.0F / 163.0F) +
02399 dptr[nsw] * (56.0F / 163.0F) +
02400 border[bn1] * (28.0F / 163.0F) +
02401 border[bn2] * ( 8.0F / 163.0F) +
02402 border[bn3] * ( 1.0F / 163.0F)
02403 );
02404 break;
02405 }
02406 break;
02407 case 2:
02408 switch(h-(sts+sy))
02409 {
02410 case 4:
02411 dst[gmemPos] =
02412 (dptr[0] * (70.0F / 255.0F) +
02413 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02414 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02415 (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) +
02416 border[bn2] * ( 1.0F / 255.0F)
02417 );
02418 break;
02419 case 3:
02420 dst[gmemPos] =
02421 (dptr[0] * (70.0F / 247.0F) +
02422 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02423 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02424 border[bn1] * ( 8.0F / 247.0F) +
02425 border[bn2] * ( 1.0F / 247.0F)
02426 );
02427 break;
02428 case 2:
02429 dst[gmemPos] =
02430 (dptr[0] * (70.0F / 219.0F) +
02431 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02432 dptr[nsw2] * (28.0F / 219.0F) +
02433 border[bn1] * ( 8.0F / 219.0F) +
02434 border[bn2] * ( 1.0F / 219.0F)
02435 );
02436 break;
02437 case 1:
02438 dst[gmemPos] =
02439 (dptr[0] * (70.0F / 163.0F) +
02440 dptr[nsw] * (56.0F / 163.0F) +
02441 dptr[nsw2] * (28.0F / 163.0F) +
02442 border[bn1] * ( 8.0F / 163.0F) +
02443 border[bn2] * ( 1.0F / 163.0F)
02444 );
02445 break;
02446 }
02447 break;
02448 case 3:
02449 switch(h-(sts+sy))
02450 {
02451 case 4:
02452 dst[gmemPos] =
02453 (dptr[0] * (70.0F / 255.0F) +
02454 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02455 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02456 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02457 border[bn1] * ( 1.0F / 255.0F)
02458 );
02459 break;
02460 case 3:
02461 dst[gmemPos] =
02462 (dptr[0] * (70.0F / 247.0F) +
02463 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02464 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02465 dptr[nsw3] * ( 8.0F / 247.0F) +
02466 border[bn1] * ( 1.0F / 247.0F)
02467 );
02468 break;
02469 case 2:
02470 dst[gmemPos] =
02471 (dptr[0] * (70.0F / 219.0F) +
02472 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02473 dptr[nsw2] * (28.0F / 219.0F) +
02474 dptr[nsw3] * ( 8.0F / 219.0F) +
02475 border[bn1] * ( 1.0F / 219.0F)
02476 );
02477 break;
02478 case 1:
02479 dst[gmemPos] =
02480 (dptr[0] * (70.0F / 163.0F) +
02481 dptr[nsw] * (56.0F / 163.0F) +
02482 dptr[nsw2] * (28.0F / 163.0F) +
02483 dptr[nsw3] * ( 8.0F / 163.0F) +
02484 border[bn1] * ( 1.0F / 163.0F)
02485 );
02486 break;
02487 }
02488 break;
02489 }
02490 }
02491 else
02492 {
02493 switch(h-(sts+sy))
02494 {
02495 case 4:
02496 dst[gmemPos] =
02497 (dptr[0] * (70.0F / 255.0F) +
02498 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) +
02499 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02500 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02501 dptr[nsw4] * ( 1.0F / 255.0F)
02502 );
02503 break;
02504 case 3:
02505 dst[gmemPos] =
02506 (dptr[0] * (70.0F / 247.0F) +
02507 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) +
02508 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02509 dptr[nsw3] * ( 8.0F / 247.0F) +
02510 dptr[nsw4] * ( 1.0F / 247.0F)
02511 );
02512 break;
02513 case 2:
02514 dst[gmemPos] =
02515 (dptr[0] * (70.0F / 219.0F) +
02516 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02517 dptr[nsw2] * (28.0F / 219.0F) +
02518 dptr[nsw3] * ( 8.0F / 219.0F) +
02519 dptr[nsw4] * ( 1.0F / 219.0F)
02520 );
02521 break;
02522 case 1:
02523 dst[gmemPos] =
02524 (dptr[0] * (70.0F / 163.0F) +
02525 dptr[nsw] * (56.0F / 163.0F) +
02526 dptr[nsw2] * (28.0F / 163.0F) +
02527 dptr[nsw3] * ( 8.0F / 163.0F) +
02528 dptr[nsw4] * ( 1.0F / 163.0F)
02529 );
02530 break;
02531 }
02532 }
02533 }
02534 }
02535
02536 __global__ void cuda_global_lowpass_5_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
02537 {
02538
02539
02540
02541
02542
02543
02544 float *data = (float *) shared_data;
02545 float *border = (float *) &data[tile_width];
02546
02547 const int sx = threadIdx.x;
02548 const int sts = IMUL(blockIdx.x, tile_width);
02549 const int srs = IMUL(blockIdx.y, w);
02550
02551
02552 const int loadIdx = sts + sx;
02553 if (loadIdx < w) data[sx] = src[srs + loadIdx];
02554
02555
02556 if (sx < 2 && sts > 0)
02557 border[sx] = src[srs + loadIdx - 2];
02558
02559 else if(sx >= tile_width-2 && sts+tile_width < w-2)
02560 border[4+sx-tile_width] = src[srs + sts + sx + 2];
02561
02562
02563
02564 __syncthreads();
02565
02566
02567 if ( loadIdx < w ) {
02568 const int writeIdx = sts + sx;
02569 const float *dptr = data + sx;
02570
02571
02572
02573
02574 if(w < 5)
02575 {
02576 int numAhead = max(0,min(w-1-(sts+sx),2));
02577 int numBehind = max(0,min(sts+sx,2));
02578 int situation = numBehind*10+numAhead;
02579 switch(situation)
02580 {
02581 case 0:
02582 dst[srs + writeIdx] = dptr[0];
02583 break;
02584 case 1:
02585 dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
02586 dptr[1] * (4.0F / 10.0F));
02587 break;
02588 case 2:
02589 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02590 dptr[1] * (4.0F / 11.0F) +
02591 dptr[2] * (1.0F / 11.0F));
02592 break;
02593 case 10:
02594 dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
02595 dptr[-1] * (4.0F / 10.0F));
02596 break;
02597 case 11:
02598 dst[srs + writeIdx] = (dptr[0] * (6.0F / 14.0F) +
02599 (dptr[-1] + dptr[1]) * (4.0F / 14.0F));
02600 break;
02601 case 12:
02602 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
02603 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02604 dptr[2] * (1.0F / 15.0F));
02605 break;
02606 case 20:
02607 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02608 dptr[-1] * (4.0F / 11.0F) +
02609 dptr[-2] * (1.0F / 11.0F));
02610 break;
02611 case 21:
02612 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
02613 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02614 dptr[-2] * (1.0F / 15.0F));
02615 break;
02616 }
02617 }
02618
02619 else if(sts+sx < 2)
02620 {
02621 switch(sx)
02622 {
02623 case 0:
02624 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02625 dptr[1] * (4.0F / 11.0F) +
02626 dptr[2] * (1.0F / 11.0F));
02627 break;
02628 case 1:
02629 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
02630 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02631 dptr[2] * (1.0F / 15.0F));
02632 break;
02633 }
02634 }
02635
02636 else if(sx < 2 && sts+sx < w-2)
02637 {
02638 switch(sx)
02639 {
02640 case 0:
02641 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
02642 (border[1] + dptr[1]) * (4.0F / 16.0F) +
02643 (border[0] + dptr[2]) * (1.0F / 16.0F));
02644 break;
02645 case 1:
02646 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
02647 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
02648 (border[1] + dptr[2]) * (1.0F / 16.0F));
02649 break;
02650 }
02651 }
02652
02653 else if(sx < tile_width-2 && sts+sx < w-2)
02654 {
02655 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
02656 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
02657 (dptr[-2] + dptr[2]) * (1.0F / 16.0F));
02658 }
02659
02660 else if(sts+sx < w-2)
02661 {
02662 switch(tile_width-sx)
02663 {
02664 case 2:
02665 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
02666 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
02667 (dptr[-2] + border[2]) * (1.0F / 16.0F));
02668 break;
02669 case 1:
02670 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) +
02671 (dptr[-1] + border[2]) * (4.0F / 16.0F) +
02672 (dptr[-2] + border[3]) * (1.0F / 16.0F));
02673 break;
02674 }
02675 }
02676
02677 else
02678 {
02679 switch(w-(sts+sx))
02680 {
02681 case 2:
02682 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) +
02683 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02684 (dptr[-2]) * (1.0F / 15.0F));
02685 break;
02686 case 1:
02687 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02688 (dptr[-1]) * (4.0F / 11.0F) +
02689 (dptr[-2]) * (1.0F / 11.0F));
02690 break;
02691 }
02692 }
02693 }
02694 }
02695
02696 __global__ void cuda_global_lowpass_5_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
02697 {
02698
02699
02700
02701
02702
02703
02704
02705 float *data = (float *) shared_data;
02706 float *border = (float *) &data[tile_width*tile_height];
02707
02708 const int sy = threadIdx.y;
02709
02710 const int sts = IMUL(blockIdx.y, tile_height);
02711 const int ste = sts + tile_height;
02712
02713
02714 const int stec = min(ste, h);
02715
02716
02717 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
02718
02719 int smemPos = IMUL(sy, tile_width) + threadIdx.x;
02720 int gmemPos = IMUL(sts + sy, w) + scs;
02721
02722
02723 if (scs < w && sts+sy < h) {
02724
02725
02726
02727 data[smemPos] = src[gmemPos];
02728
02729
02730 if (sy < 2 && gmemPos > IMUL(2,w))
02731 border[smemPos] = src[gmemPos-IMUL(2,w)];
02732
02733 int bordOff = 4+sy-tile_height;
02734
02735 if (sy >= tile_height-2 && ste+2 < h)
02736 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)];
02737
02738
02739
02740 __syncthreads();
02741
02742
02743
02744 float *dptr = data + smemPos;
02745 const int sw = tile_width, sw2 = sw + sw;
02746 const int nsw = -sw, nsw2 = nsw - sw;
02747 const int bn2 = threadIdx.x, bn1 = bn2 + tile_width;
02748 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width;
02749
02750
02751
02752
02753
02754 if(h < 5)
02755 {
02756 int numAhead = max(0,min(h-1-(sts+sy),2));
02757 int numBehind = max(0,min(sts+sy,2));
02758 int situation = numBehind*10+numAhead;
02759 switch(situation)
02760 {
02761 case 0:
02762 dst[gmemPos] = dptr[0];
02763 break;
02764 case 1:
02765 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
02766 dptr[sw] * (4.0F / 10.0F));
02767 break;
02768 case 2:
02769 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
02770 dptr[sw] * (4.0F / 11.0F) +
02771 dptr[sw2] * (1.0F / 11.0F));
02772 break;
02773 case 10:
02774 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
02775 dptr[nsw] * (4.0F / 10.0F));
02776 break;
02777 case 11:
02778 dst[gmemPos] = (dptr[0] * (6.0F / 14.0F) +
02779 (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F));
02780 break;
02781 case 12:
02782 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) +
02783 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02784 dptr[sw2] * (1.0F / 15.0F));
02785 break;
02786 case 20:
02787 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
02788 dptr[nsw] * (4.0F / 11.0F) +
02789 dptr[nsw2] * (1.0F / 11.0F));
02790 break;
02791 case 21:
02792 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) +
02793 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02794 dptr[nsw2] * (1.0F / 15.0F));
02795 break;
02796 }
02797 }
02798
02799 else if(sts + sy < 2)
02800 {
02801 switch(sts+sy)
02802 {
02803 case 0:
02804 dst[gmemPos] =
02805 (dptr[0] * (6.0F / 11.0F) +
02806 dptr[sw] * (4.0F / 11.0F) +
02807 dptr[sw2] * (1.0F / 11.0F)
02808 );
02809 break;
02810 case 1:
02811 dst[gmemPos] =
02812 (dptr[0] * (6.0F / 15.0F) +
02813 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02814 dptr[sw2] * (1.0F / 15.0F)
02815 );
02816 break;
02817 }
02818 }
02819 else if(sy < 2 && sts+sy<h-2)
02820 {
02821 switch(sy)
02822 {
02823 case 0:
02824 dst[gmemPos] =
02825 (dptr[0] * (6.0F / 16.0F) +
02826 (border[bn1] + dptr[sw]) * (4.0F / 16.0F) +
02827 (border[bn2] + dptr[sw2]) * (1.0F / 16.0F)
02828 );
02829 break;
02830 case 1:
02831 dst[gmemPos] =
02832 (dptr[0] * (6.0F / 16.0F) +
02833 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
02834 (border[bn1] + dptr[sw2]) * (1.0F / 16.0F)
02835 );
02836 break;
02837 }
02838 }
02839 else if(sy <tile_height-2 && sts+sy<h-2)
02840 {
02841 dst[gmemPos] =
02842 ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) +
02843 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
02844 dptr[0] * (6.0F / 16.0F)
02845 );
02846 }
02847 else if(sts + sy < h-2)
02848 {
02849 switch(tile_height-sy)
02850 {
02851 case 2:
02852 dst[gmemPos] =
02853 (dptr[0] * (6.0F / 16.0F) +
02854 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) +
02855 (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F)
02856 );
02857 break;
02858 case 1:
02859 dst[gmemPos] =
02860 (dptr[0] * (6.0F / 16.0F) +
02861 (dptr[nsw] + border[bp1]) * (4.0F / 16.0F) +
02862 (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F)
02863 );
02864 break;
02865 }
02866 }
02867 else
02868 {
02869 switch(h-(sts+sy))
02870 {
02871 case 2:
02872 dst[gmemPos] =
02873 (dptr[0] * (6.0F / 15.0F) +
02874 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02875 dptr[nsw2] * (1.0F / 15.0F)
02876 );
02877 break;
02878 case 1:
02879 dst[gmemPos] =
02880 (dptr[0] * (6.0F / 11.0F) +
02881 dptr[nsw] * (4.0F / 11.0F) +
02882 dptr[nsw2] * (1.0F / 11.0F)
02883 );
02884 break;
02885 }
02886 }
02887
02888 }
02889
02890 }
02891
02892
02893
02894 __global__ void cuda_global_lowpass_3_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
02895 {
02896
02897
02898 float *data = (float *) shared_data;
02899 float *border = (float *) &data[tile_width];
02900
02901 const int sx = threadIdx.x;
02902 const int sts = IMUL(blockIdx.x, tile_width);
02903 const int srs = IMUL(blockIdx.y, w);
02904
02905
02906 const int loadIdx = sts + sx;
02907 if (loadIdx < w) data[sx] = src[srs + loadIdx];
02908
02909
02910 if (sx < 1 && sts > 0)
02911 border[sx] = src[srs + loadIdx - 1];
02912
02913 else if(sx >= tile_width-1 && sts+tile_width < w-1)
02914 border[2+sx-tile_width] = src[srs + sts + sx + 1];
02915
02916
02917
02918 __syncthreads();
02919
02920
02921 if ( loadIdx < w ) {
02922 const int writeIdx = sts + sx;
02923 const float *dptr = data + sx;
02924
02925
02926
02927
02928 if(w < 3)
02929 {
02930 int numAhead = max(0,min(w-1-(sts+sx),1));
02931 int numBehind = max(0,min(sts+sx,1));
02932 int situation = numBehind*10+numAhead;
02933 switch(situation)
02934 {
02935 case 0:
02936 dst[srs + writeIdx] = dptr[0];
02937 break;
02938 case 1:
02939 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02940 dptr[1] * (1.0F / 3.0F));
02941 break;
02942 case 10:
02943 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02944 dptr[-1] * (1.0F / 3.0F));
02945 break;
02946 case 11:
02947 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) +
02948 (dptr[-1] + dptr[1]) * (1.0F / 4.0F));
02949 break;
02950 }
02951 }
02952
02953 else if(sts+sx < 1)
02954 {
02955 switch(sx)
02956 {
02957 case 0:
02958 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02959 dptr[1] * (1.0F / 3.0F));
02960 break;
02961 }
02962 }
02963
02964 else if(sx < 1 && sts+sx < w-1)
02965 {
02966 switch(sx)
02967 {
02968 case 0:
02969 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) +
02970 (border[0] + dptr[1]) * (1.0F / 4.0F));
02971 break;
02972 }
02973 }
02974
02975 else if(sx < tile_width-1 && sts+sx < w-1)
02976 {
02977 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) +
02978 (dptr[-1] + dptr[1]) * (1.0F / 4.0F));
02979 }
02980
02981 else if(sts+sx < w-1)
02982 {
02983 switch(tile_width-sx)
02984 {
02985 case 1:
02986 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) +
02987 (dptr[-1] + border[1]) * (1.0F / 4.0F));
02988 break;
02989 }
02990 }
02991
02992 else
02993 {
02994 switch(w-(sts+sx))
02995 {
02996 case 1:
02997 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02998 (dptr[-1]) * (1.0F / 3.0F));
02999 break;
03000 }
03001 }
03002 }
03003 }
03004
03005 __global__ void cuda_global_lowpass_3_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
03006 {
03007
03008
03009
03010
03011
03012
03013
03014 float *data = (float *) shared_data;
03015 float *border = (float *) &data[tile_width*tile_height];
03016
03017 const int sy = threadIdx.y;
03018
03019 const int sts = IMUL(blockIdx.y, tile_height);
03020 const int ste = sts + tile_height;
03021
03022
03023 const int stec = min(ste, h);
03024
03025
03026 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
03027
03028
03029 if (scs < w && sts+sy < h) {
03030
03031 int smemPos = IMUL(sy, tile_width) + threadIdx.x;
03032 int gmemPos = IMUL(sts + sy, w) + scs;
03033
03034
03035 data[smemPos] = src[gmemPos];
03036
03037
03038 if (sy < 1 && gmemPos > w)
03039 border[smemPos] = src[gmemPos-w];
03040
03041 int bordOff = 2+sy-tile_height;
03042
03043 if (sy >= tile_height-1 && ste+1 < h)
03044 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+w];
03045
03046
03047
03048 __syncthreads();
03049
03050
03051
03052
03053 float *dptr = data + smemPos;
03054 const int sw = tile_width;
03055 const int nsw = -sw;
03056 const int bn1 = threadIdx.x;
03057 const int bp1 = bn1+tile_width;
03058
03059
03060
03061
03062
03063 if(h < 3)
03064 {
03065 int numAhead = max(0,min(h-1-(sts+sy),1));
03066 int numBehind = max(0,min(sts+sy,1));
03067 int situation = numBehind*10+numAhead;
03068 switch(situation)
03069 {
03070 case 0:
03071 dst[gmemPos] = dptr[0];
03072 break;
03073 case 1:
03074 dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) +
03075 dptr[sw] * (1.0F / 3.0F));
03076 break;
03077 case 10:
03078 dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) +
03079 dptr[nsw] * (1.0F / 3.0F));
03080 break;
03081 case 11:
03082 dst[gmemPos] = (dptr[0] * (2.0F / 4.0F) +
03083 (dptr[nsw] + dptr[sw]) * (1.0F / 4.0F));
03084 break;
03085 }
03086 }
03087
03088 else if(sts + sy < 1)
03089 {
03090 switch(sts+sy)
03091 {
03092 case 0:
03093 dst[gmemPos] =
03094 (dptr[0] * (2.0F / 3.0F) +
03095 dptr[sw] * (1.0F / 3.0F));
03096 break;
03097 }
03098 }
03099 else if(sy < 1 && sts+sy<h-1)
03100 {
03101 switch(sy)
03102 {
03103 case 0:
03104 dst[gmemPos] =
03105 (dptr[0] * (2.0F / 4.0F) +
03106 (border[bn1] + dptr[sw]) * (1.0F / 4.0F));
03107 break;
03108 }
03109 }
03110 else if(sy <tile_height-1 && sts+sy<h-1)
03111 {
03112 dst[gmemPos] =
03113 (dptr[0] * (2.0F / 4.0F) +
03114 (dptr[nsw] + dptr[sw]) * (1.0F / 4.0F));
03115 }
03116 else if(sts + sy < h-1)
03117 {
03118 switch(tile_height-sy)
03119 {
03120 case 1:
03121 dst[gmemPos] =
03122 (dptr[0] * (2.0F / 4.0F) +
03123 (dptr[nsw] + border[bp1]) * (1.0F / 4.0F));
03124 break;
03125 }
03126 }
03127 else
03128 {
03129 switch(h-(sts+sy))
03130 {
03131 case 1:
03132 dst[gmemPos] =
03133 (dptr[0] * (2.0F / 3.0F) +
03134 dptr[nsw] * (1.0F / 3.0F));
03135 break;
03136 }
03137 }
03138
03139 }
03140
03141 }
03142
03143
03144 #include "CUDA/cutil.h"
03145
03146 texture<float, 2, cudaReadModeElementType> texRef;
03147
03148
03149 __global__ void cuda_global_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03150 {
03151
03152 float *data = (float *) shared_data;
03153 const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
03154 const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
03155 const int ySrcPos = yResPos;
03156 const int xSrcPos = xResPos<<1;
03157 const int ctrOff = 4;
03158 const int shrIdx = threadIdx.x+ctrOff;
03159
03160 if(xResPos < dw && yResPos < dh)
03161 {
03162 data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos);
03163 if(threadIdx.x < ctrOff)
03164 data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos-ctrOff,ySrcPos);
03165 else if(threadIdx.x > tile_width - ctrOff)
03166 data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos+ctrOff,ySrcPos);
03167 __syncthreads();
03168 dst[IMUL(yResPos,dw) + xResPos] =
03169 ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) +
03170 (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) +
03171 (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) +
03172 (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) +
03173 data[shrIdx] * (70.0F / 256.0F)
03174 );
03175 }
03176 }
03177
03178 void cuda_c_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03179 {
03180
03181 texRef.addressMode[0] = cudaAddressModeClamp;
03182 texRef.addressMode[1] = cudaAddressModeClamp;
03183 texRef.filterMode = cudaFilterModePoint;
03184 texRef.normalized = false;
03185 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
03186 CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float)));
03187 dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height));
03188 dim3 threadBlockRows(tile_width, tile_height);
03189 cuda_global_lowpass_texture_9_x_dec_x<<<blockGridRows, threadBlockRows, (tile_width+8)*tile_height*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height);
03190 CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef));
03191 }
03192
03193 __global__ void cuda_global_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03194 {
03195
03196 float *data = (float *) shared_data;
03197 const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
03198 const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
03199 const int ySrcPos = yResPos<<1;
03200 const int xSrcPos = xResPos;
03201 const int ctrOff = 4;
03202 const int shrIdx = threadIdx.y+ctrOff;
03203
03204 if(xResPos < dw && yResPos < dh)
03205 {
03206 data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos);
03207 if(threadIdx.y < ctrOff)
03208 data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos,ySrcPos-ctrOff);
03209 else if(threadIdx.y > tile_height - ctrOff)
03210 data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos,ySrcPos+ctrOff);
03211 __syncthreads();
03212 dst[IMUL(yResPos,dw) + xResPos] =
03213 ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) +
03214 (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) +
03215 (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) +
03216 (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) +
03217 data[shrIdx] * (70.0F / 256.0F)
03218 );
03219 }
03220 }
03221
03222 void cuda_c_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03223 {
03224
03225 texRef.addressMode[0] = cudaAddressModeClamp;
03226 texRef.addressMode[1] = cudaAddressModeClamp;
03227 texRef.filterMode = cudaFilterModePoint;
03228 texRef.normalized = false;
03229 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
03230 CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float)));
03231 dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height));
03232 dim3 threadBlockRows(tile_width, tile_height);
03233 cuda_global_lowpass_texture_9_y_dec_y<<<blockGridRows, threadBlockRows, tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height);
03234 CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef));
03235 }
03236
03237
03238 #endif