00001 /*!@file CUDA/cuda-lowpass.cu CUDA/GPU optimized lowpass code */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL$ 00035 // $Id$ 00036 // 00037 00038 #include "CUDA/cuda-lowpass.h" 00039 #include <cuda.h> 00040 #include "CUDA/cutil.h" 00041 // define int as 32 bits on CUDA architecture to satisfy env_config.h 00042 #define INT_IS_32_BITS 00043 #include "Envision/env_types.h" 00044 00045 // 24-bit multiplication is faster on G80, but we must be sure to 00046 // multiply integers only within [-8M, 8M - 1] range 00047 #define IMUL(a, b) __mul24(a, b) 00048 00049 #define ROW_TILE_W 128 00050 #define COLUMN_TILE_W 16 00051 #define COLUMN_TILE_H 16 //48 00052 00053 00054 00055 //////////////////////////////////////////////////////////////////////////////// 00056 // Row convolution filter 00057 //////////////////////////////////////////////////////////////////////////////// 00058 __global__ void cudalowpass5xdecx(const int *src, const unsigned int w, const unsigned int h, int* dst) 00059 { 00060 __shared__ int data[ROW_TILE_W]; // Data cache in shared memory 00061 00062 const int sx = threadIdx.x; // source pixel within source tile 00063 const int dx = (sx >> 1); // dest pixel within dest tile (decimated 2x) 00064 const int sts = IMUL(blockIdx.x, ROW_TILE_W); // tile start for source, relative to row start 00065 const int dts = (sts >>1); // tile start for dest, relative to row start 00066 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 00067 const int drs = IMUL(blockIdx.y, (w >> 1)); // Row start index in dest data 00068 00069 // Load global memory values into our data cache: 00070 const int loadIdx = sts + sx; // index of one pixel value to load 00071 if (loadIdx < w) data[sx] = src[srs + loadIdx]; 00072 00073 int border; if (sx == 0 && sts > 0) border = src[srs + sts - 1]; else border = 0; 00074 const int ww = (w & 0xfffe); // evened-down source size 00075 00076 // Ensure the completness of loading stage because results emitted 00077 // by each thread depend on the data loaded by other threads: 00078 __syncthreads(); 00079 00080 // only process every other pixel 00081 if ( (sx & 1) == 0 && loadIdx < ww) { 00082 const int writeIdx = dts + dx; // write index relative to row start 00083 const int *dptr = data + sx; 00084 00085 if (loadIdx == 0) dst[drs + writeIdx] = (dptr[1] + ((*dptr) << 1)) / 3; // first pixel of image 00086 else if (sx == 0) dst[drs + writeIdx] = (border + dptr[1] + ((*dptr) << 1) ) >> 2; // first of tile 00087 else dst[drs + writeIdx] = (dptr[-1] + dptr[1] + ((*dptr) << 1)) >> 2; // all other pixels 00088 } 00089 } 00090 00091 //////////////////////////////////////////////////////////////////////////////// 00092 // Column convolution filter 00093 //////////////////////////////////////////////////////////////////////////////// 00094 __global__ void cudalowpass5ydecy(const int *src, const unsigned int w, const unsigned int h, 00095 int* dst, int sms, int gms) 00096 { 00097 // Data cache 00098 __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H]; 00099 __shared__ int border[COLUMN_TILE_W]; 00100 00101 const int sy = threadIdx.y; // source pixel row within source tile 00102 const int dy = (sy >> 1); // dest pixel row within dest tile (decimated 2x) 00103 00104 const int sts = IMUL(blockIdx.y, COLUMN_TILE_H); // tile start for source, in rows 00105 const int ste = sts + COLUMN_TILE_H; // tile end for source, in rows 00106 00107 const int dts = (sts >> 1); 00108 const int dte = (ste >> 1); 00109 00110 // Clamp tile and apron limits by image borders 00111 const int stec = min(ste, h); 00112 const int dtec = min(dte, (h >> 1)); 00113 00114 // Current column index 00115 const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x; 00116 const int dcs = scs; 00117 00118 // only process columns that are actually within image bounds: 00119 if (scs < w) { 00120 // Shared and global memory indices for current column 00121 int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x; 00122 int gmemPos = IMUL(sts + sy, w) + scs; 00123 00124 // Cycle through the entire data cache 00125 // Load global memory values, if indices are within the image borders: 00126 for (int y = sts + sy; y < stec; y += blockDim.y) { 00127 data[smemPos] = src[gmemPos]; 00128 smemPos += sms; gmemPos += gms; 00129 } 00130 00131 if (sy == 0 && sts > 0) border[threadIdx.x] = src[IMUL(sts - 1, w) + scs]; 00132 00133 // Ensure the completness of loading stage because results emitted 00134 // by each thread depend on the data loaded by other threads: 00135 __syncthreads(); 00136 00137 // only process every other row 00138 if ((sy & 1) == 0) { 00139 // Shared and global memory indices for current column 00140 smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x; 00141 gmemPos = IMUL(dts + dy, w) + dcs; 00142 00143 // Cycle through the tile body, clamped by image borders 00144 // Calculate and output the results 00145 int *dptr = data + smemPos; 00146 int dgms = (gms >> 1); // memory stride for dest 00147 00148 if (sts + sy == 0) { // top row of image 00149 dst[gmemPos] = (dptr[COLUMN_TILE_W] + ((*dptr) << 1)) / 3; 00150 dptr += sms; gmemPos += dgms; 00151 for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) { 00152 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2; 00153 dptr += sms; gmemPos += dgms; 00154 } 00155 } else if (sy == 0) { // top row of a tile 00156 dst[gmemPos] = (border[threadIdx.x] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2; 00157 dptr += sms; gmemPos += dgms; 00158 for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) { 00159 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2; 00160 dptr += sms; gmemPos += dgms; 00161 } 00162 } else { // all other rows 00163 for (int y = sts + sy; y < stec; y += blockDim.y) { 00164 dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2; 00165 dptr += sms; gmemPos += dgms; 00166 } 00167 } 00168 } 00169 } 00170 } 00171 00172 //Round a / b to nearest higher integer value 00173 inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } 00174 00175 // ###################################################################### 00176 void cuda_lowpass_5_x_dec_x_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst) 00177 { 00178 dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h); 00179 dim3 threadBlockRows(ROW_TILE_W); 00180 00181 cudalowpass5xdecx<<<blockGridRows, threadBlockRows>>>(src, w, h, dst); 00182 } 00183 00184 // ###################################################################### 00185 void cuda_lowpass_5_y_dec_y_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst) 00186 { 00187 dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H)); 00188 dim3 threadBlockColumns(COLUMN_TILE_W, 8); 00189 00190 cudalowpass5ydecy<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst, 00191 COLUMN_TILE_W * threadBlockColumns.y, 00192 w * threadBlockColumns.y); 00193 } 00194 00195 00196 __global__ void cudalowpass9x(const int* src, const unsigned int w, const unsigned int h, int* dst) 00197 { 00198 __shared__ int data[ROW_TILE_W]; // Data cache in shared memory 00199 __shared__ int border[6]; // Bordering data flanking this tile 00200 const int sx = threadIdx.x; // source pixel within source tile 00201 const int sts = IMUL(blockIdx.x, ROW_TILE_W); // tile start for source, relative to row start 00202 const int srs = IMUL(blockIdx.y,w); // Row start index in source data 00203 00204 const int loadIdx = sts + sx; // index of one pixel value to load 00205 const int off = sx - 3; // Offset for the data storing 00206 00207 // Load border pixels 00208 if (sx < 3 && sts > 0) border[sx] = src[srs + sts - (3-sx)]; 00209 if (sx >= ROW_TILE_W-3 && sts+ROW_TILE_W < w-3) border[3+sx-(ROW_TILE_W-3)] = src[srs + sts + sx + 3]; 00210 00211 // Load the row into shared memory among the thread block 00212 if (loadIdx < w) 00213 data[sx] = src[srs + loadIdx]; 00214 else 00215 return; // Threads that are over the edge of the image on the right most tile... 00216 00217 // Ensure the completness of loading stage because results emitted 00218 // by each thread depend on the data loaded by other threads: 00219 __syncthreads(); 00220 00221 // First part of source row, just reduce sample 00222 if(sts+sx < 3) 00223 { 00224 switch(sx) 00225 { 00226 case 0: 00227 dst[srs + loadIdx] = // [ (72^) 56 28 8 ] 00228 (data[0]* 72 + 00229 data[1] * 56 + 00230 data[2] * 28 + 00231 data[3] * 8 00232 ) / 164; 00233 break; 00234 case 1: 00235 dst[srs + loadIdx] = // [ 56^ (72) 56 28 8 ] 00236 ((data[0] + data[2]) * 56 + 00237 data[1] * 72 + 00238 data[3] * 28 + 00239 data[4] * 8 00240 ) / 220; 00241 break; 00242 case 2: 00243 dst[srs + loadIdx] = // [ 28^ 56 (72) 56 28 8 ] 00244 ((data[0] + data[4]) * 28 + 00245 (data[1] + data[3]) * 56 + 00246 data[2] * 72 + 00247 data[5] * 8 00248 ) / 248; 00249 default: 00250 //LERROR(); 00251 break; 00252 } 00253 } 00254 // If not the first part of the soure row, but is the first bit of this tile, use border 00255 else if(sx < 3 && sts+sx < w-3) 00256 { 00257 switch(sx) 00258 { 00259 case 0: 00260 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00261 ((border[0] + data[off+6]) * 8 + 00262 (border[1] + data[off+5]) * 28 + 00263 (border[2] + data[off+4]) * 56 + 00264 data[off+3] * 72 00265 ) >> 8; 00266 break; 00267 case 1: 00268 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00269 ((border[1] + data[off+6]) * 8 + 00270 (border[2] + data[off+5]) * 28 + 00271 (data[off+2] + data[off+4]) * 56 + 00272 data[off+3] * 72 00273 ) >> 8; 00274 break; 00275 case 2: 00276 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00277 ((border[2] + data[off+6]) * 8 + 00278 (data[off+1] + data[off+5]) * 28 + 00279 (data[off+2] + data[off+4]) * 56 + 00280 data[off+3] * 72 00281 ) >> 8; 00282 break; 00283 } 00284 } 00285 // If we are not near the edge of this tile, do standard way 00286 else if(sx < ROW_TILE_W-3 && sts +sx < w-3) 00287 { 00288 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00289 ((data[off+0] + data[off+6]) * 8 + 00290 (data[off+1] + data[off+5]) * 28 + 00291 (data[off+2] + data[off+4]) * 56 + 00292 data[off+3] * 72 00293 ) >> 8; 00294 } 00295 // If not the last part of the source row, but in the last bit of this tile, use border 00296 else if(sts+sx < w-3) 00297 { 00298 switch(sx) 00299 { 00300 case ROW_TILE_W-3: 00301 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00302 ((data[off+0] + border[3]) * 8 + 00303 (data[off+1] + data[off+5]) * 28 + 00304 (data[off+2] + data[off+4]) * 56 + 00305 data[off+3] * 72 00306 ) >> 8; 00307 break; 00308 case ROW_TILE_W-2: 00309 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00310 ((data[off+0] + border[4]) * 8 + 00311 (data[off+1] + border[3]) * 28 + 00312 (data[off+2] + data[off+4]) * 56 + 00313 data[off+3] * 72 00314 ) >> 8; 00315 break; 00316 case ROW_TILE_W-1: 00317 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 8 ] 00318 ((data[off+0] + border[5]) * 8 + 00319 (data[off+1] + border[4]) * 28 + 00320 (data[off+2] + border[3]) * 56 + 00321 data[off+3] * 72 00322 ) >> 8; 00323 break; 00324 } 00325 } 00326 // If in the last bit of the source row, reduce sample 00327 else if(sts + sx < w) 00328 { 00329 // dst[srs + loadIdx] = // [ 8^ 28 56 (72) ] 00330 // (data[off+0] * 8 + 00331 // data[off+1] * 28 + 00332 // data[off+2] * 56 + 00333 // data[off+3] * 72 00334 // ) / 164; 00335 // } 00336 // else if(sx < 0) 00337 // { 00338 switch(w-(sts+sx)) 00339 { 00340 case 3: 00341 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 28 ] 00342 (data[off+0] * 8 + 00343 (data[off+1] + data[off+5]) * 28 + 00344 (data[off+2] + data[off+4]) * 56 + 00345 data[off+3] * 72 00346 ) / 248; 00347 break; 00348 case 2: 00349 dst[srs + loadIdx] = // [ 8^ 28 56 (72) 56 ] 00350 (data[off+0] * 8 + 00351 data[off+1] * 28 + 00352 (data[off+2] + data[off+4]) * 56 + 00353 data[off+3] * 72 00354 ) / 220; 00355 break; 00356 case 1: 00357 dst[srs + loadIdx] = // [ 8^ 28 56 (72) ] 00358 (data[off+0] * 8 + 00359 data[off+1] * 28 + 00360 data[off+2] * 56 + 00361 data[off+3] * 72 00362 ) / 164; 00363 break; 00364 default: 00365 dst[srs + loadIdx] = sx; 00366 } 00367 } 00368 } 00369 00370 // ###################################################################### 00371 __global__ void cudalowpass9y(const int* src, 00372 const unsigned int w, 00373 const unsigned int h, 00374 int* dst, int sms, int gms) 00375 { 00376 00377 // Data cache 00378 __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H]; 00379 __shared__ int border[COLUMN_TILE_W * 6]; 00380 00381 const int sy = threadIdx.y; // source pixel row within source tile 00382 00383 const int sts = IMUL(blockIdx.y, COLUMN_TILE_H); // tile start for source, in rows 00384 const int ste = sts + COLUMN_TILE_H; // tile end for source, in rows 00385 00386 00387 // Clamp tile and apron limits by image borders 00388 const int stec = min(ste, h); 00389 00390 // Current column index 00391 const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x; 00392 00393 // only process columns that are actually within image bounds: 00394 if (scs < w && sts+sy < stec) 00395 { 00396 // Shared and global memory indices for current column 00397 int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x; 00398 int gmemPos = IMUL(sts + sy, w) + scs; 00399 00400 // Cycle through the entire data cache 00401 // Load global memory values, if indices are within the image borders: 00402 // for (int y = sts + sy; y < stec; y += blockDim.y) { 00403 // data[smemPos] = src[gmemPos]; 00404 // smemPos += sms; gmemPos += gms; 00405 // } 00406 data[smemPos] = src[gmemPos]; 00407 00408 if (sy < 3 && gmemPos > IMUL(3,w)) 00409 border[smemPos] = src[gmemPos-IMUL(3,w)]; 00410 //border[threadIdx.x+IMUL(sy,COLUMN_TILE_W)] = src[IMUL(sts-(3-sy), w) + scs]; 00411 00412 int bordOff = 6+sy-COLUMN_TILE_H; 00413 00414 if (sy >= COLUMN_TILE_H-3 && ste+3 < h) //blockIdx.y < blockDim.y-1) 00415 border[threadIdx.x+IMUL(bordOff,COLUMN_TILE_W)] = src[gmemPos+IMUL(3,w)]; 00416 //border[threadIdx.x+IMUL(bordOff,COLUMN_TILE_W)] = src[IMUL(ste-1+COLUMN_TILE_H-sy, w) + scs]; 00417 00418 // Ensure the completness of loading stage because results emitted 00419 // by each thread depend on the data loaded by other threads: 00420 __syncthreads(); 00421 00422 // Shared and global memory indices for current column 00423 smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x; 00424 gmemPos = IMUL(sts + sy, w) + scs; 00425 00426 00427 // Setup the offsets to get to the correct smem points in the arrays for both the data and the border 00428 int *dptr = data + smemPos; 00429 const int sw = COLUMN_TILE_W, sw2 = sw + sw, sw3 = sw2 + sw; 00430 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw; 00431 const int bn3 = threadIdx.x, bn2 = bn3 + COLUMN_TILE_W, bn1 = bn2 + COLUMN_TILE_W; 00432 const int bp1 = bn1+COLUMN_TILE_W, bp2 = bp1 + COLUMN_TILE_W, bp3 = bp2 + COLUMN_TILE_W; 00433 00434 // Are we in the top 3 rows of the whole image 00435 if(sts + sy < 3) 00436 { 00437 switch(sts+sy) 00438 { 00439 case 0: 00440 dst[gmemPos] = 00441 (dptr[0] * 72 + 00442 dptr[sw] * 56 + 00443 dptr[sw2] * 28 + 00444 dptr[sw3] * 8 00445 ) / 164; 00446 break; 00447 case 1: 00448 dst[gmemPos] = 00449 (dptr[0] * 72 + 00450 (dptr[nsw] + dptr[sw]) * 56 + 00451 dptr[sw2] * 28 + 00452 dptr[sw3] * 8 00453 ) / 220; 00454 break; 00455 case 2: 00456 dst[gmemPos] = 00457 (dptr[0] * 72 + 00458 (dptr[nsw] + dptr[sw]) * 56 + 00459 (dptr[nsw2] + dptr[sw2]) * 28 + 00460 dptr[sw3] * 8 00461 ) / 248; 00462 break; 00463 } 00464 } 00465 else if(sy < 3 && sts+sy<h-3) // If not top 3 in the whole image, are we in the top 3 rows of this tile 00466 { 00467 switch(sy) 00468 { 00469 case 0: 00470 dst[gmemPos] = 00471 (dptr[0] * 72 + 00472 (border[bn1] + dptr[sw]) * 56 + 00473 (border[bn2] + dptr[sw2]) * 28 + 00474 (border[bn3] + dptr[sw3]) * 8 00475 ) >> 8; 00476 break; 00477 case 1: 00478 dst[gmemPos] = 00479 (dptr[0] * 72 + 00480 (dptr[nsw] + dptr[sw]) * 56 + 00481 (border[bn1] + dptr[sw2]) * 28 + 00482 (border[bn2] + dptr[sw3]) * 8 00483 ) >> 8; 00484 break; 00485 case 2: 00486 dst[gmemPos] = 00487 (dptr[0] * 72 + 00488 (dptr[nsw] + dptr[sw]) * 56 + 00489 (dptr[nsw2] + dptr[sw2]) * 28 + 00490 (border[bn1] + dptr[sw3]) * 8 00491 ) >> 8; 00492 break; 00493 } 00494 } 00495 else if(sy <COLUMN_TILE_H-3 && sts+sy<h-3)//(sy < COLUMN_TILE_H-4 && sts+sy<h-3) // Are we in the middle of the tile 00496 { 00497 dst[gmemPos] = 00498 ((dptr[nsw3] + dptr[sw3]) * 8 + 00499 (dptr[nsw2] + dptr[sw2]) * 28 + 00500 (dptr[nsw] + dptr[sw]) * 56 + 00501 dptr[0] * 72 00502 ) >> 8; 00503 } 00504 else if(sts + sy < h-3) // Are we not at the bottom of the image, but bottom 3 of the tile 00505 { 00506 switch(sy) 00507 { 00508 case COLUMN_TILE_H-3: 00509 dst[gmemPos] = 00510 (dptr[0] * 72 + 00511 (dptr[nsw] + dptr[sw]) * 56 + 00512 (dptr[nsw2] + dptr[sw2]) * 28 + 00513 (dptr[nsw3] + border[bp1]) * 8 00514 ) >> 8; 00515 break; 00516 case COLUMN_TILE_H-2: 00517 dst[gmemPos] = 00518 (dptr[0] * 72 + 00519 (dptr[nsw] + dptr[sw]) * 56 + 00520 (dptr[nsw2] + border[bp1]) * 28 + 00521 (dptr[nsw3] + border[bp2]) * 8 00522 ) >> 8; 00523 break; 00524 case COLUMN_TILE_H-1: 00525 dst[gmemPos] = 00526 (dptr[0] * 72 + 00527 (dptr[nsw] + border[bp1]) * 56 + 00528 (dptr[nsw2] + border[bp2]) * 28 + 00529 (dptr[nsw3] + border[bp3]) * 8 00530 ) >> 8; 00531 break; 00532 } 00533 } 00534 else // We must be at the bottom 3 of the image 00535 { 00536 switch(h-(sts+sy)) 00537 { 00538 case 3: 00539 dst[gmemPos] = 00540 (dptr[0] * 72 + 00541 (dptr[nsw] + dptr[sw]) * 56 + 00542 (dptr[nsw2] + dptr[sw2]) * 28 + 00543 dptr[nsw3] * 8 00544 ) / 248; 00545 break; 00546 case 2: 00547 dst[gmemPos] = 00548 (dptr[0] * 72 + 00549 (dptr[nsw] + dptr[sw]) * 56 + 00550 dptr[nsw2] * 28 + 00551 dptr[nsw3] * 8 00552 ) / 220; 00553 break; 00554 case 1: 00555 dst[gmemPos] = 00556 (dptr[0] * 72 + 00557 dptr[nsw] * 56 + 00558 dptr[nsw2] * 28 + 00559 dptr[nsw3] * 8 00560 ) / 164; 00561 break; 00562 } 00563 } 00564 } 00565 } 00566 00567 00568 // ##################################################################### 00569 void cuda_lowpass_9_x_fewbits_optim(const int* src, 00570 const unsigned int w, 00571 const unsigned int h, 00572 int* dst) 00573 { 00574 //ENV_ASSERT(w >= 9); 00575 dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h); 00576 dim3 threadBlockRows(ROW_TILE_W); 00577 cudalowpass9x<<<blockGridRows, threadBlockRows>>>(src, w, h, dst); 00578 } 00579 00580 void cuda_lowpass_9_y_fewbits_optim(const int* src, 00581 const unsigned int w, 00582 const unsigned int h, 00583 int* dst) 00584 { 00585 //ENV_ASSERT(h >= 9); 00586 dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H)); 00587 dim3 threadBlockColumns(COLUMN_TILE_W, COLUMN_TILE_H); 00588 00589 cudalowpass9y<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst, COLUMN_TILE_W*threadBlockColumns.y, 00590 w*threadBlockColumns.y); 00591 } 00592 00593 00594 00595 00596 00597 // ###################################################################### 00598 /* So things look consistent in everyone's emacs... */ 00599 /* Local Variables: */ 00600 /* mode: c++ */ 00601 /* indent-tabs-mode: nil */ 00602 /* End: */