00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include "CUDA/cuda-lowpass.h"
00039 #include <cuda.h>
00040 #include "CUDA/cutil.h"
00041
00042 #define INT_IS_32_BITS
00043 #include "Envision/env_types.h"
00044
00045
00046
00047 #define IMUL(a, b) __mul24(a, b)
00048
00049 #define ROW_TILE_W 128
00050 #define COLUMN_TILE_W 16
00051 #define COLUMN_TILE_H 16 //48
00052
00053
00054
00055
00056
00057
00058 __global__ void cudalowpass5xdecx(const int *src, const unsigned int w, const unsigned int h, int* dst)
00059 {
00060 __shared__ int data[ROW_TILE_W];
00061
00062 const int sx = threadIdx.x;
00063 const int dx = (sx >> 1);
00064 const int sts = IMUL(blockIdx.x, ROW_TILE_W);
00065 const int dts = (sts >>1);
00066 const int srs = IMUL(blockIdx.y, w);
00067 const int drs = IMUL(blockIdx.y, (w >> 1));
00068
00069
00070 const int loadIdx = sts + sx;
00071 if (loadIdx < w) data[sx] = src[srs + loadIdx];
00072
00073 int border; if (sx == 0 && sts > 0) border = src[srs + sts - 1]; else border = 0;
00074 const int ww = (w & 0xfffe);
00075
00076
00077
00078 __syncthreads();
00079
00080
00081 if ( (sx & 1) == 0 && loadIdx < ww) {
00082 const int writeIdx = dts + dx;
00083 const int *dptr = data + sx;
00084
00085 if (loadIdx == 0) dst[drs + writeIdx] = (dptr[1] + ((*dptr) << 1)) / 3;
00086 else if (sx == 0) dst[drs + writeIdx] = (border + dptr[1] + ((*dptr) << 1) ) >> 2;
00087 else dst[drs + writeIdx] = (dptr[-1] + dptr[1] + ((*dptr) << 1)) >> 2;
00088 }
00089 }
00090
00091
00092
00093
00094 __global__ void cudalowpass5ydecy(const int *src, const unsigned int w, const unsigned int h,
00095 int* dst, int sms, int gms)
00096 {
00097
00098 __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H];
00099 __shared__ int border[COLUMN_TILE_W];
00100
00101 const int sy = threadIdx.y;
00102 const int dy = (sy >> 1);
00103
00104 const int sts = IMUL(blockIdx.y, COLUMN_TILE_H);
00105 const int ste = sts + COLUMN_TILE_H;
00106
00107 const int dts = (sts >> 1);
00108 const int dte = (ste >> 1);
00109
00110
00111 const int stec = min(ste, h);
00112 const int dtec = min(dte, (h >> 1));
00113
00114
00115 const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00116 const int dcs = scs;
00117
00118
00119 if (scs < w) {
00120
00121 int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00122 int gmemPos = IMUL(sts + sy, w) + scs;
00123
00124
00125
00126 for (int y = sts + sy; y < stec; y += blockDim.y) {
00127 data[smemPos] = src[gmemPos];
00128 smemPos += sms; gmemPos += gms;
00129 }
00130
00131 if (sy == 0 && sts > 0) border[threadIdx.x] = src[IMUL(sts - 1, w) + scs];
00132
00133
00134
00135 __syncthreads();
00136
00137
00138 if ((sy & 1) == 0) {
00139
00140 smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00141 gmemPos = IMUL(dts + dy, w) + dcs;
00142
00143
00144
00145 int *dptr = data + smemPos;
00146 int dgms = (gms >> 1);
00147
00148 if (sts + sy == 0) {
00149 dst[gmemPos] = (dptr[COLUMN_TILE_W] + ((*dptr) << 1)) / 3;
00150 dptr += sms; gmemPos += dgms;
00151 for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) {
00152 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00153 dptr += sms; gmemPos += dgms;
00154 }
00155 } else if (sy == 0) {
00156 dst[gmemPos] = (border[threadIdx.x] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00157 dptr += sms; gmemPos += dgms;
00158 for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) {
00159 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00160 dptr += sms; gmemPos += dgms;
00161 }
00162 } else {
00163 for (int y = sts + sy; y < stec; y += blockDim.y) {
00164 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00165 dptr += sms; gmemPos += dgms;
00166 }
00167 }
00168 }
00169 }
00170 }
00171
00172
00173 inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
00174
00175
00176 void cuda_lowpass_5_x_dec_x_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst)
00177 {
00178 dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h);
00179 dim3 threadBlockRows(ROW_TILE_W);
00180
00181 cudalowpass5xdecx<<<blockGridRows, threadBlockRows>>>(src, w, h, dst);
00182 }
00183
00184
00185 void cuda_lowpass_5_y_dec_y_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst)
00186 {
00187 dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H));
00188 dim3 threadBlockColumns(COLUMN_TILE_W, 8);
00189
00190 cudalowpass5ydecy<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst,
00191 COLUMN_TILE_W * threadBlockColumns.y,
00192 w * threadBlockColumns.y);
00193 }
00194
00195
00196 __global__ void cudalowpass9x(const int* src, const unsigned int w, const unsigned int h, int* dst)
00197 {
00198 __shared__ int data[ROW_TILE_W];
00199 __shared__ int border[6];
00200 const int sx = threadIdx.x;
00201 const int sts = IMUL(blockIdx.x, ROW_TILE_W);
00202 const int srs = IMUL(blockIdx.y,w);
00203
00204 const int loadIdx = sts + sx;
00205 const int off = sx - 3;
00206
00207
00208 if (sx < 3 && sts > 0) border[sx] = src[srs + sts - (3-sx)];
00209 if (sx >= ROW_TILE_W-3 && sts+ROW_TILE_W < w-3) border[3+sx-(ROW_TILE_W-3)] = src[srs + sts + sx + 3];
00210
00211
00212 if (loadIdx < w)
00213 data[sx] = src[srs + loadIdx];
00214 else
00215 return;
00216
00217
00218
00219 __syncthreads();
00220
00221
00222 if(sts+sx < 3)
00223 {
00224 switch(sx)
00225 {
00226 case 0:
00227 dst[srs + loadIdx] =
00228 (data[0]* 72 +
00229 data[1] * 56 +
00230 data[2] * 28 +
00231 data[3] * 8
00232 ) / 164;
00233 break;
00234 case 1:
00235 dst[srs + loadIdx] =
00236 ((data[0] + data[2]) * 56 +
00237 data[1] * 72 +
00238 data[3] * 28 +
00239 data[4] * 8
00240 ) / 220;
00241 break;
00242 case 2:
00243 dst[srs + loadIdx] =
00244 ((data[0] + data[4]) * 28 +
00245 (data[1] + data[3]) * 56 +
00246 data[2] * 72 +
00247 data[5] * 8
00248 ) / 248;
00249 default:
00250
00251 break;
00252 }
00253 }
00254
00255 else if(sx < 3 && sts+sx < w-3)
00256 {
00257 switch(sx)
00258 {
00259 case 0:
00260 dst[srs + loadIdx] =
00261 ((border[0] + data[off+6]) * 8 +
00262 (border[1] + data[off+5]) * 28 +
00263 (border[2] + data[off+4]) * 56 +
00264 data[off+3] * 72
00265 ) >> 8;
00266 break;
00267 case 1:
00268 dst[srs + loadIdx] =
00269 ((border[1] + data[off+6]) * 8 +
00270 (border[2] + data[off+5]) * 28 +
00271 (data[off+2] + data[off+4]) * 56 +
00272 data[off+3] * 72
00273 ) >> 8;
00274 break;
00275 case 2:
00276 dst[srs + loadIdx] =
00277 ((border[2] + data[off+6]) * 8 +
00278 (data[off+1] + data[off+5]) * 28 +
00279 (data[off+2] + data[off+4]) * 56 +
00280 data[off+3] * 72
00281 ) >> 8;
00282 break;
00283 }
00284 }
00285
00286 else if(sx < ROW_TILE_W-3 && sts +sx < w-3)
00287 {
00288 dst[srs + loadIdx] =
00289 ((data[off+0] + data[off+6]) * 8 +
00290 (data[off+1] + data[off+5]) * 28 +
00291 (data[off+2] + data[off+4]) * 56 +
00292 data[off+3] * 72
00293 ) >> 8;
00294 }
00295
00296 else if(sts+sx < w-3)
00297 {
00298 switch(sx)
00299 {
00300 case ROW_TILE_W-3:
00301 dst[srs + loadIdx] =
00302 ((data[off+0] + border[3]) * 8 +
00303 (data[off+1] + data[off+5]) * 28 +
00304 (data[off+2] + data[off+4]) * 56 +
00305 data[off+3] * 72
00306 ) >> 8;
00307 break;
00308 case ROW_TILE_W-2:
00309 dst[srs + loadIdx] =
00310 ((data[off+0] + border[4]) * 8 +
00311 (data[off+1] + border[3]) * 28 +
00312 (data[off+2] + data[off+4]) * 56 +
00313 data[off+3] * 72
00314 ) >> 8;
00315 break;
00316 case ROW_TILE_W-1:
00317 dst[srs + loadIdx] =
00318 ((data[off+0] + border[5]) * 8 +
00319 (data[off+1] + border[4]) * 28 +
00320 (data[off+2] + border[3]) * 56 +
00321 data[off+3] * 72
00322 ) >> 8;
00323 break;
00324 }
00325 }
00326
00327 else if(sts + sx < w)
00328 {
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338 switch(w-(sts+sx))
00339 {
00340 case 3:
00341 dst[srs + loadIdx] =
00342 (data[off+0] * 8 +
00343 (data[off+1] + data[off+5]) * 28 +
00344 (data[off+2] + data[off+4]) * 56 +
00345 data[off+3] * 72
00346 ) / 248;
00347 break;
00348 case 2:
00349 dst[srs + loadIdx] =
00350 (data[off+0] * 8 +
00351 data[off+1] * 28 +
00352 (data[off+2] + data[off+4]) * 56 +
00353 data[off+3] * 72
00354 ) / 220;
00355 break;
00356 case 1:
00357 dst[srs + loadIdx] =
00358 (data[off+0] * 8 +
00359 data[off+1] * 28 +
00360 data[off+2] * 56 +
00361 data[off+3] * 72
00362 ) / 164;
00363 break;
00364 default:
00365 dst[srs + loadIdx] = sx;
00366 }
00367 }
00368 }
00369
00370
00371 __global__ void cudalowpass9y(const int* src,
00372 const unsigned int w,
00373 const unsigned int h,
00374 int* dst, int sms, int gms)
00375 {
00376
00377
00378 __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H];
00379 __shared__ int border[COLUMN_TILE_W * 6];
00380
00381 const int sy = threadIdx.y;
00382
00383 const int sts = IMUL(blockIdx.y, COLUMN_TILE_H);
00384 const int ste = sts + COLUMN_TILE_H;
00385
00386
00387
00388 const int stec = min(ste, h);
00389
00390
00391 const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00392
00393
00394 if (scs < w && sts+sy < stec)
00395 {
00396
00397 int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00398 int gmemPos = IMUL(sts + sy, w) + scs;
00399
00400
00401
00402
00403
00404
00405
00406 data[smemPos] = src[gmemPos];
00407
00408 if (sy < 3 && gmemPos > IMUL(3,w))
00409 border[smemPos] = src[gmemPos-IMUL(3,w)];
00410
00411
00412 int bordOff = 6+sy-COLUMN_TILE_H;
00413
00414 if (sy >= COLUMN_TILE_H-3 && ste+3 < h)
00415 border[threadIdx.x+IMUL(bordOff,COLUMN_TILE_W)] = src[gmemPos+IMUL(3,w)];
00416
00417
00418
00419
00420 __syncthreads();
00421
00422
00423 smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00424 gmemPos = IMUL(sts + sy, w) + scs;
00425
00426
00427
00428 int *dptr = data + smemPos;
00429 const int sw = COLUMN_TILE_W, sw2 = sw + sw, sw3 = sw2 + sw;
00430 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw;
00431 const int bn3 = threadIdx.x, bn2 = bn3 + COLUMN_TILE_W, bn1 = bn2 + COLUMN_TILE_W;
00432 const int bp1 = bn1+COLUMN_TILE_W, bp2 = bp1 + COLUMN_TILE_W, bp3 = bp2 + COLUMN_TILE_W;
00433
00434
00435 if(sts + sy < 3)
00436 {
00437 switch(sts+sy)
00438 {
00439 case 0:
00440 dst[gmemPos] =
00441 (dptr[0] * 72 +
00442 dptr[sw] * 56 +
00443 dptr[sw2] * 28 +
00444 dptr[sw3] * 8
00445 ) / 164;
00446 break;
00447 case 1:
00448 dst[gmemPos] =
00449 (dptr[0] * 72 +
00450 (dptr[nsw] + dptr[sw]) * 56 +
00451 dptr[sw2] * 28 +
00452 dptr[sw3] * 8
00453 ) / 220;
00454 break;
00455 case 2:
00456 dst[gmemPos] =
00457 (dptr[0] * 72 +
00458 (dptr[nsw] + dptr[sw]) * 56 +
00459 (dptr[nsw2] + dptr[sw2]) * 28 +
00460 dptr[sw3] * 8
00461 ) / 248;
00462 break;
00463 }
00464 }
00465 else if(sy < 3 && sts+sy<h-3)
00466 {
00467 switch(sy)
00468 {
00469 case 0:
00470 dst[gmemPos] =
00471 (dptr[0] * 72 +
00472 (border[bn1] + dptr[sw]) * 56 +
00473 (border[bn2] + dptr[sw2]) * 28 +
00474 (border[bn3] + dptr[sw3]) * 8
00475 ) >> 8;
00476 break;
00477 case 1:
00478 dst[gmemPos] =
00479 (dptr[0] * 72 +
00480 (dptr[nsw] + dptr[sw]) * 56 +
00481 (border[bn1] + dptr[sw2]) * 28 +
00482 (border[bn2] + dptr[sw3]) * 8
00483 ) >> 8;
00484 break;
00485 case 2:
00486 dst[gmemPos] =
00487 (dptr[0] * 72 +
00488 (dptr[nsw] + dptr[sw]) * 56 +
00489 (dptr[nsw2] + dptr[sw2]) * 28 +
00490 (border[bn1] + dptr[sw3]) * 8
00491 ) >> 8;
00492 break;
00493 }
00494 }
00495 else if(sy <COLUMN_TILE_H-3 && sts+sy<h-3)
00496 {
00497 dst[gmemPos] =
00498 ((dptr[nsw3] + dptr[sw3]) * 8 +
00499 (dptr[nsw2] + dptr[sw2]) * 28 +
00500 (dptr[nsw] + dptr[sw]) * 56 +
00501 dptr[0] * 72
00502 ) >> 8;
00503 }
00504 else if(sts + sy < h-3)
00505 {
00506 switch(sy)
00507 {
00508 case COLUMN_TILE_H-3:
00509 dst[gmemPos] =
00510 (dptr[0] * 72 +
00511 (dptr[nsw] + dptr[sw]) * 56 +
00512 (dptr[nsw2] + dptr[sw2]) * 28 +
00513 (dptr[nsw3] + border[bp1]) * 8
00514 ) >> 8;
00515 break;
00516 case COLUMN_TILE_H-2:
00517 dst[gmemPos] =
00518 (dptr[0] * 72 +
00519 (dptr[nsw] + dptr[sw]) * 56 +
00520 (dptr[nsw2] + border[bp1]) * 28 +
00521 (dptr[nsw3] + border[bp2]) * 8
00522 ) >> 8;
00523 break;
00524 case COLUMN_TILE_H-1:
00525 dst[gmemPos] =
00526 (dptr[0] * 72 +
00527 (dptr[nsw] + border[bp1]) * 56 +
00528 (dptr[nsw2] + border[bp2]) * 28 +
00529 (dptr[nsw3] + border[bp3]) * 8
00530 ) >> 8;
00531 break;
00532 }
00533 }
00534 else
00535 {
00536 switch(h-(sts+sy))
00537 {
00538 case 3:
00539 dst[gmemPos] =
00540 (dptr[0] * 72 +
00541 (dptr[nsw] + dptr[sw]) * 56 +
00542 (dptr[nsw2] + dptr[sw2]) * 28 +
00543 dptr[nsw3] * 8
00544 ) / 248;
00545 break;
00546 case 2:
00547 dst[gmemPos] =
00548 (dptr[0] * 72 +
00549 (dptr[nsw] + dptr[sw]) * 56 +
00550 dptr[nsw2] * 28 +
00551 dptr[nsw3] * 8
00552 ) / 220;
00553 break;
00554 case 1:
00555 dst[gmemPos] =
00556 (dptr[0] * 72 +
00557 dptr[nsw] * 56 +
00558 dptr[nsw2] * 28 +
00559 dptr[nsw3] * 8
00560 ) / 164;
00561 break;
00562 }
00563 }
00564 }
00565 }
00566
00567
00568
00569 void cuda_lowpass_9_x_fewbits_optim(const int* src,
00570 const unsigned int w,
00571 const unsigned int h,
00572 int* dst)
00573 {
00574
00575 dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h);
00576 dim3 threadBlockRows(ROW_TILE_W);
00577 cudalowpass9x<<<blockGridRows, threadBlockRows>>>(src, w, h, dst);
00578 }
00579
00580 void cuda_lowpass_9_y_fewbits_optim(const int* src,
00581 const unsigned int w,
00582 const unsigned int h,
00583 int* dst)
00584 {
00585
00586 dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H));
00587 dim3 threadBlockColumns(COLUMN_TILE_W, COLUMN_TILE_H);
00588
00589 cudalowpass9y<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst, COLUMN_TILE_W*threadBlockColumns.y,
00590 w*threadBlockColumns.y);
00591 }
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602