00001 /*!@file CUDA/cuda-lowpass.h CUDA/GPU optimized lowpass filter code */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_lowpass.h $ 00035 // $Id: cuda_lowpass.h 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 #ifndef CUDA_LOWPASS_H_DEFINED 00039 #define CUDA_LOWPASS_H_DEFINED 00040 00041 #include <cuda.h> 00042 #include "CUDA/cutil.h" 00043 #include "cudadefs.h" 00044 00045 //////////////////////////////////////////////////////////////////////////////// 00046 // Row convolution filter 00047 //////////////////////////////////////////////////////////////////////////////// 00048 __global__ void cuda_global_lowpass_5_x_dec_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 00049 { 00050 // Data cache in shared memory 00051 //__shared__ float data[CUDA_1D_TILE_W]; 00052 //__shared__ float border[4]; 00053 //tile_width=CUDA_1D_TILE_W; 00054 00055 // Save 4 pixels for the border 00056 float *data = (float *) shared_data; // size of tile_width 00057 float *border = (float *) &data[tile_width]; // size of 4 00058 00059 const int sx = threadIdx.x; // source pixel within source tile 00060 const int dx = (sx >> 1); // dest pixel within dest tile (decimated 2x) 00061 const int sts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 00062 const int dts = (sts >>1); // tile start for dest, relative to row start 00063 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 00064 const int drs = IMUL(blockIdx.y, (w >> 1)); // Row start index in dest data 00065 00066 // Load global memory values into our data cache: 00067 const int loadIdx = sts + sx; // index of one pixel value to load 00068 if (loadIdx < w) data[sx] = src[srs + loadIdx]; 00069 00070 // Load beginning border 00071 if (sx < 2 && sts > 0) 00072 border[sx] = src[srs + loadIdx - 2]; 00073 // Load ending border 00074 else if(sx >= tile_width-2 && sts+tile_width < w-2) 00075 border[4+sx-tile_width] = src[srs + sts + sx + 2]; 00076 00077 const int ww = (w & 0xfffe); // evened-down source size 00078 00079 // Ensure the completness of loading stage because results emitted 00080 // by each thread depend on the data loaded by other threads: 00081 __syncthreads(); 00082 00083 // only process every other pixel 00084 if ( (sx & 1) == 0 && loadIdx < ww) { 00085 const int writeIdx = dts + dx; // write index relative to row start 00086 const float *dptr = data + sx; 00087 // [ 1 4 (6) 4 1 ] / 16.0 00088 00089 // If we are smaller than the Gaussian filter we are using, special case this 00090 // this is not very efficient, just making sure it can handle small images 00091 if(w < 5) 00092 { 00093 int numAhead = max(0,min(w-1-(sts+sx),2)); 00094 int numBehind = max(0,min(sts+sx,2)); 00095 int situation = numBehind*10+numAhead; 00096 switch(situation) 00097 { 00098 case 0: // 00 00099 dst[drs + writeIdx] = dptr[0]; 00100 break; 00101 case 1: // 01 00102 dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) + 00103 dptr[1] * (4.0F / 10.0F)); 00104 break; 00105 case 2: // 02 00106 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 00107 dptr[1] * (4.0F / 11.0F) + 00108 dptr[2] * (1.0F / 11.0F)); 00109 break; 00110 case 10: 00111 dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) + 00112 dptr[-1] * (4.0F / 10.0F)); 00113 break; 00114 case 11: 00115 dst[drs + writeIdx] = (dptr[0] * (6.0F / 14.0F) + 00116 (dptr[-1] + dptr[1]) * (4.0F / 14.0F)); 00117 break; 00118 case 12: 00119 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 00120 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 00121 dptr[2] * (1.0F / 15.0F)); 00122 break; 00123 case 20: 00124 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 00125 dptr[-1] * (4.0F / 11.0F) + 00126 dptr[-2] * (1.0F / 11.0F)); 00127 break; 00128 case 21: 00129 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 00130 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 00131 dptr[-2] * (1.0F / 15.0F)); 00132 break; 00133 } 00134 } 00135 // First set of pixels in the row 00136 else if(sts+sx < 2) 00137 { 00138 switch(sx) 00139 { 00140 case 0: 00141 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 00142 dptr[1] * (4.0F / 11.0F) + 00143 dptr[2] * (1.0F / 11.0F)); 00144 break; 00145 case 1: 00146 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 00147 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 00148 dptr[2] * (1.0F / 15.0F)); 00149 break; 00150 } 00151 } 00152 // First two pixels in tile 00153 else if(sx < 2 && sts+sx < w-2) 00154 { 00155 switch(sx) 00156 { 00157 case 0: 00158 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 00159 (border[1] + dptr[1]) * (4.0F / 16.0F) + 00160 (border[0] + dptr[2]) * (1.0F / 16.0F)); 00161 break; 00162 case 1: 00163 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 00164 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 00165 (border[1] + dptr[2]) * (1.0F / 16.0F)); 00166 break; 00167 } 00168 } 00169 // In the middle of the tile 00170 else if(sx < tile_width-2 && sts+sx < w-2) 00171 { 00172 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 00173 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 00174 (dptr[-2] + dptr[2]) * (1.0F / 16.0F)); 00175 } 00176 // Last two pixels of the tile 00177 else if(sts+sx < w-2) 00178 { 00179 switch(tile_width-sx) 00180 { 00181 case 2: 00182 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 00183 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 00184 (dptr[-2] + border[2]) * (1.0F / 16.0F)); 00185 break; 00186 case 1: 00187 dst[drs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 00188 (dptr[-1] + border[2]) * (4.0F / 16.0F) + 00189 (dptr[-2] + border[3]) * (1.0F / 16.0F)); 00190 break; 00191 } 00192 } 00193 // Last two pixels of the row 00194 else 00195 { 00196 switch(w-(sts+sx)) 00197 { 00198 case 2: 00199 dst[drs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 00200 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 00201 (dptr[-2]) * (1.0F / 15.0F)); 00202 break; 00203 case 1: 00204 dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 00205 (dptr[-1]) * (4.0F / 11.0F) + 00206 (dptr[-2]) * (1.0F / 11.0F)); 00207 break; 00208 } 00209 } 00210 } 00211 } 00212 00213 //////////////////////////////////////////////////////////////////////////////// 00214 // Column convolution filter 00215 //////////////////////////////////////////////////////////////////////////////// 00216 __global__ void cuda_global_lowpass_5_y_dec_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 00217 { 00218 // Data cache 00219 //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H]; 00220 //__shared__ float border[CUDA_TILE_W*4]; 00221 //const int tile_width=CUDA_TILE_W; 00222 //const int tile_height= CUDA_TILE_H; 00223 00224 // Save 4 rows for the border 00225 float *data = (float *) shared_data; //tile_width * tile_height size 00226 float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*4 00227 00228 const int sy = threadIdx.y; // source pixel row within source tile 00229 const int dy = (sy >> 1); // dest pixel row within dest tile (decimated 2x) 00230 00231 const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows 00232 const int ste = sts + tile_height; // tile end for source, in rows 00233 00234 const int dts = (sts >> 1); 00235 const int dte = (ste >> 1); 00236 00237 // Clamp tile and apron limits by image borders 00238 const int stec = min(ste, h); 00239 const int dtec = min(dte, (h >> 1)); 00240 00241 // Current column index 00242 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00243 const int dcs = scs; 00244 00245 // only process columns that are actually within image bounds: 00246 if (scs < w) { 00247 // Shared and global (source) memory indices for current column 00248 int smemPos = IMUL(sy, tile_width) + threadIdx.x; 00249 int gmemPos = IMUL(sts + sy, w) + scs; 00250 00251 // Load data 00252 data[smemPos] = src[gmemPos]; 00253 00254 // Load border 00255 if (sy < 2 && gmemPos > IMUL(2,w)) 00256 border[smemPos] = src[gmemPos-IMUL(2,w)]; 00257 00258 int bordOff = 4+sy-tile_height; 00259 00260 if (sy >= tile_height-2 && ste+2 < h) 00261 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)]; 00262 00263 // Ensure the completness of loading stage because results emitted 00264 // by each thread depend on the data loaded by other threads: 00265 __syncthreads(); 00266 00267 // only process every other row 00268 if ((sy & 1) == 0) { 00269 // Shared and global (destination) memory indices for current column 00270 smemPos = IMUL(sy, tile_width) + threadIdx.x; 00271 gmemPos = IMUL(dts + dy, w) + dcs; 00272 00273 // Cycle through the tile body, clamped by image borders 00274 // Calculate and output the results 00275 float *dptr = data + smemPos; 00276 const int sw = tile_width, sw2 = sw + sw; 00277 const int nsw = -sw, nsw2 = nsw - sw; 00278 const int bn2 = threadIdx.x, bn1 = bn2 + tile_width; 00279 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width; 00280 00281 // [ 1 4 (6) 4 1 ] / 16 00282 00283 // If we are smaller than the Gaussian filter we are using, special case this 00284 // this is not very efficient, just making sure it can handle small images 00285 if(h < 5) 00286 { 00287 int numAhead = max(0,min(h-1-(sts+sy),2)); 00288 int numBehind = max(0,min(sts+sy,2)); 00289 int situation = numBehind*10+numAhead; 00290 switch(situation) 00291 { 00292 case 0: // 00 00293 dst[gmemPos] = dptr[0]; 00294 break; 00295 case 1: // 01 00296 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) + 00297 dptr[sw] * (4.0F / 10.0F)); 00298 break; 00299 case 2: // 02 00300 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) + 00301 dptr[sw] * (4.0F / 11.0F) + 00302 dptr[sw2] * (1.0F / 11.0F)); 00303 break; 00304 case 10: 00305 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) + 00306 dptr[nsw] * (4.0F / 10.0F)); 00307 break; 00308 case 11: 00309 dst[gmemPos] = (dptr[0] * (6.0F / 14.0F) + 00310 (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F)); 00311 break; 00312 case 12: 00313 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) + 00314 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 00315 dptr[sw2] * (1.0F / 15.0F)); 00316 break; 00317 case 20: 00318 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) + 00319 dptr[nsw] * (4.0F / 11.0F) + 00320 dptr[nsw2] * (1.0F / 11.0F)); 00321 break; 00322 case 21: 00323 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) + 00324 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 00325 dptr[nsw2] * (1.0F / 15.0F)); 00326 break; 00327 } 00328 } 00329 // Are we in the top 2 rows of the whole image 00330 else if(sts + sy < 2) 00331 { 00332 switch(sts+sy) 00333 { 00334 case 0: 00335 dst[gmemPos] = 00336 (dptr[0] * (6.0F / 11.0F) + 00337 dptr[sw] * (4.0F / 11.0F) + 00338 dptr[sw2] * (1.0F / 11.0F) 00339 ); 00340 break; 00341 case 1: 00342 dst[gmemPos] = 00343 (dptr[0] * (6.0F / 15.0F) + 00344 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 00345 dptr[sw2] * (1.0F / 15.0F) 00346 ); 00347 break; 00348 } 00349 } 00350 else if(sy < 2 && sts+sy<h-2) // If not top 2 in the whole image, are we in the top 2 rows of this tile 00351 { 00352 switch(sy) 00353 { 00354 case 0: 00355 dst[gmemPos] = 00356 (dptr[0] * (6.0F / 16.0F) + 00357 (border[bn1] + dptr[sw]) * (4.0F / 16.0F) + 00358 (border[bn2] + dptr[sw2]) * (1.0F / 16.0F) 00359 ); 00360 break; 00361 case 1: 00362 dst[gmemPos] = 00363 (dptr[0] * (6.0F / 16.0F) + 00364 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 00365 (border[bn1] + dptr[sw2]) * (1.0F / 16.0F) 00366 ); 00367 break; 00368 } 00369 } 00370 else if(sy <tile_height-2 && sts+sy<h-2) // Are we in the middle of the tile 00371 { 00372 dst[gmemPos] = 00373 ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) + 00374 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 00375 dptr[0] * (6.0F / 16.0F) 00376 ); 00377 } 00378 else if(sts + sy < h-2) // Are we not at the bottom of the image, but bottom 4 of the tile 00379 { 00380 switch(tile_height-sy) 00381 { 00382 case 2: 00383 dst[gmemPos] = 00384 (dptr[0] * (6.0F / 16.0F) + 00385 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 00386 (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F) 00387 ); 00388 break; 00389 case 1: 00390 dst[gmemPos] = 00391 (dptr[0] * (6.0F / 16.0F) + 00392 (dptr[nsw] + border[bp1]) * (4.0F / 16.0F) + 00393 (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F) 00394 ); 00395 break; 00396 } 00397 } 00398 else // We must be at the bottom 4 of the image 00399 { 00400 switch(h-(sts+sy)) 00401 { 00402 case 2: 00403 dst[gmemPos] = 00404 (dptr[0] * (6.0F / 15.0F) + 00405 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 00406 dptr[nsw2] * (1.0F / 15.0F) 00407 ); 00408 break; 00409 case 1: 00410 dst[gmemPos] = 00411 (dptr[0] * (6.0F / 11.0F) + 00412 dptr[nsw] * (4.0F / 11.0F) + 00413 dptr[nsw2] * (1.0F / 11.0F) 00414 ); 00415 break; 00416 } 00417 } 00418 00419 } 00420 } 00421 } 00422 00423 00424 __global__ void cuda_global_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width) 00425 { 00426 // w and h are from the original source image 00427 const int src_tile_width = tile_width<<1; // Size of the tile used from the source data 00428 // Data cache in shared memory 00429 float *data = (float *) shared_data; // size of src_tile_width 00430 // Bordering data flanking this tile 00431 float *border = (float *) &data[src_tile_width]; // size of 8 00432 const int dx = threadIdx.x; // dest pixel within dest tile (decimated 2x) 00433 const int dts = IMUL(blockIdx.x, tile_width); // tile start for dest, relative to row start 00434 const int drs = IMUL(blockIdx.y,dw); // Row start index in dest data 00435 const int sx = dx<<1; // source pixel within source tile 00436 const int sts = (dts <<1); // tile start for source, relative to row start 00437 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 00438 00439 const int loadIdx = sts + sx; // index of one pixel value to load 00440 const int writeIdx = dts + dx; // index of one pixel value to load 00441 const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3; 00442 const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7; 00443 float *dptr = &data[sx]; 00444 00445 // Load border pixels 00446 if (dx < 4 && sts > 0) border[dx] = src[srs + sts - (4-dx)]; 00447 if (dx >= tile_width-4 && sts+src_tile_width < w-4) border[4+dx-(tile_width-4)] = src[srs + sts + tile_width + dx + 4]; 00448 00449 // Load the row into shared memory among the thread block 00450 if (loadIdx < w) 00451 { 00452 data[sx] = src[srs + loadIdx]; 00453 if(sx+1 < src_tile_width && loadIdx+1 < w) 00454 data[sx+1] = src[srs + loadIdx + 1]; 00455 } 00456 00457 // Ensure the completness of loading stage because results emitted 00458 // by each thread depend on the data loaded by other threads: 00459 __syncthreads(); 00460 00461 // [ 1 8 28 56 (70) 56 28 8 1 ] 00462 00463 00464 00465 00466 if(writeIdx < dw && blockIdx.y < dh) 00467 { 00468 /* if(true) */ 00469 /* { */ 00470 /* dst[drs+writeIdx] = dptr[0]; */ 00471 /* } */ 00472 if(w < 9) 00473 { 00474 // This is not at all efficient, just here to ensure filter behaves properly in small image cases 00475 const int numAhead = max(0,min(w-1-(sts+sx),4)); 00476 const int numBehind = max(0,min(sts+sx,4)); 00477 const int situation = numBehind*10+numAhead; 00478 00479 switch(situation) 00480 { 00481 case 0: // 00 00482 dst[drs+writeIdx] = dptr[0]; 00483 break; 00484 case 2: // 02 00485 dst[drs+writeIdx] = (dptr[0] * (70.0F / 154.0F) + 00486 dptr[1] * (56.0F / 154.0F) + 00487 dptr[2] * (28.0F / 154.0F)); 00488 break; 00489 case 4: 00490 dst[drs+writeIdx] = (dptr[0] * (70.0F / 163.0F) + 00491 dptr[1] * (56.0F / 163.0F) + 00492 dptr[2] * (28.0F / 163.0F) + 00493 dptr[3] * ( 8.0F / 163.0F) + 00494 dptr[4] * ( 1.0F / 163.0F)); 00495 break; 00496 case 10: 00497 dst[drs+writeIdx] = (dptr[0] * (70.0F / 126.0F) + 00498 dptr[-1] * (56.0F / 126.0F)); 00499 break; 00500 case 12: 00501 dst[drs+writeIdx] = (dptr[0] * (70.0F / 210.0F) + 00502 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) + 00503 dptr[2] * (28.0F / 210.0F)); 00504 break; 00505 case 14: 00506 dst[drs+writeIdx] = (dptr[0] * (70.0F / 219.0F) + 00507 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 00508 dptr[2] * (28.0F / 219.0F) + 00509 dptr[3] * ( 8.0F / 219.0F) + 00510 dptr[4] * ( 1.0F / 219.0F)); 00511 break; 00512 case 20: 00513 dst[drs+writeIdx] = (dptr[0] * (70.0F / 154.0F) + 00514 dptr[-1] * (56.0F / 154.0F) + 00515 dptr[-2] * (28.0F / 154.0F)); 00516 break; 00517 case 22: 00518 dst[drs+writeIdx] = (dptr[0] * (70.0F / 238.0F) + 00519 (dptr[-1] + dptr[1]) * (56.0F / 238.0F) + 00520 (dptr[-2] + dptr[2]) * (28.0F / 238.0F)); 00521 break; 00522 case 24: 00523 dst[drs+writeIdx] = (dptr[0] * (70.0F / 247.0F) + 00524 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 00525 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 00526 dptr[3] * ( 8.0F / 247.0F) + 00527 dptr[4] * ( 1.0F / 247.0F)); 00528 break; 00529 case 30: 00530 dst[drs+writeIdx] = (dptr[0] * (70.0F / 162.0F) + 00531 dptr[-1] * (56.0F / 162.0F) + 00532 dptr[-2] * (28.0F / 162.0F) + 00533 dptr[-3] * ( 8.0F / 162.0F)); 00534 break; 00535 case 32: 00536 dst[drs+writeIdx] = (dptr[0] * (70.0F / 246.0F) + 00537 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) + 00538 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) + 00539 dptr[-3] * ( 8.0F / 246.0F)); 00540 break; 00541 case 34: 00542 dst[drs+writeIdx] = (dptr[0] * (70.0F / 255.0F) + 00543 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 00544 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 00545 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 00546 dptr[4] * ( 1.0F / 255.0F)); 00547 break; 00548 case 40: 00549 dst[drs+writeIdx] = (dptr[0] * (70.0F / 163.0F) + 00550 dptr[-1] * (56.0F / 163.0F) + 00551 dptr[-2] * (28.0F / 163.0F) + 00552 dptr[-3] * ( 8.0F / 163.0F) + 00553 dptr[-4] * ( 1.0F / 163.0F)); 00554 break; 00555 case 42: 00556 dst[drs+writeIdx] = (dptr[0] * (70.0F / 247.0F) + 00557 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 00558 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 00559 dptr[-3] * ( 8.0F / 247.0F) + 00560 dptr[-4] * ( 1.0F / 247.0F)); 00561 break; 00562 } 00563 } 00564 // First part of source row, just reduce sample 00565 else if(sts+sx < 4) 00566 { 00567 switch(sx) 00568 { 00569 case 0: 00570 dst[drs + writeIdx] = 00571 (dptr[0] * (70.0F / 163.0F) + 00572 dptr[1] * (56.0F / 163.0F) + 00573 dptr[2] * (28.0F / 163.0F) + 00574 dptr[3] * ( 8.0F / 163.0F) + 00575 dptr[4] * ( 1.0F / 163.0F) 00576 ); 00577 break; 00578 case 1: 00579 dst[drs + writeIdx] = 00580 (dptr[0] * (70.0F / 219.0F) + 00581 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 00582 dptr[2] * (28.0F / 219.0F) + 00583 dptr[3] * ( 8.0F / 219.0F) + 00584 dptr[4] * ( 1.0F / 219.0F) 00585 ); 00586 break; 00587 case 2: 00588 dst[drs + writeIdx] = 00589 (dptr[0] * (70.0F / 247.0F) + 00590 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 00591 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 00592 dptr[3] * ( 8.0F / 247.0F) + 00593 dptr[4] * ( 1.0F / 247.0F) 00594 ); 00595 break; 00596 case 3: 00597 dst[drs + writeIdx] = 00598 (dptr[0] * (70.0F / 255.0F) + 00599 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 00600 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 00601 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 00602 dptr[4] * ( 1.0F / 255.0F) 00603 ); 00604 break; 00605 default: 00606 //LERROR(); 00607 break; 00608 } 00609 } 00610 // If not the first part of the source row, but is the first bit of this tile, use border 00611 else if(sx < 4 && sts+sx < w-4) 00612 { 00613 switch(sx) 00614 { 00615 case 0: 00616 dst[drs + writeIdx] = 00617 ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) + 00618 (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) + 00619 (border[bn2] + dptr[2]) * (28.0F / 256.0F) + 00620 (border[bn1] + dptr[1]) * (56.0F / 256.0F) + 00621 dptr[0] * (70.0F / 256.0F) 00622 ); 00623 break; 00624 case 1: 00625 dst[drs + writeIdx] = 00626 ((border[bn3] + dptr[4]) * ( 1.0F / 256.0F) + 00627 (border[bn2] + dptr[3]) * ( 8.0F / 256.0F) + 00628 (border[bn1] + dptr[2]) * (28.0F / 256.0F) + 00629 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00630 dptr[0] * (70.0F / 256.0F) 00631 ); 00632 break; 00633 case 2: 00634 dst[drs + writeIdx] = 00635 ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) + 00636 (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) + 00637 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 00638 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00639 dptr[0] * (70.0F / 256.0F) 00640 ); 00641 break; 00642 case 3: 00643 dst[drs + writeIdx] = 00644 ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) + 00645 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 00646 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 00647 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00648 dptr[0] * (70.0F / 256.0F) 00649 ); 00650 break; 00651 } 00652 } 00653 // If we are not near the edge of this tile, do standard way 00654 else if(sx>=4 && sx<src_tile_width-4 && sts+sx < w-4) 00655 { 00656 dst[drs + writeIdx] = 00657 ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) + 00658 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 00659 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 00660 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00661 dptr[0] * (70.0F / 256.0F) 00662 ); 00663 } 00664 // If not the last part of the source row, but in the last bit of this tile, use border 00665 else if(sx>=4 && sts+sx < w-4) 00666 { 00667 switch(src_tile_width-sx) 00668 { 00669 case 4: 00670 dst[drs + writeIdx] = 00671 ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) + 00672 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 00673 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 00674 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00675 dptr[0] * (70.0F / 256.0F) 00676 ); 00677 break; 00678 case 3: 00679 dst[drs + writeIdx] = 00680 ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) + 00681 (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) + 00682 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 00683 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00684 dptr[0] * (70.0F / 256.0F) 00685 ); 00686 break; 00687 case 2: 00688 dst[drs + writeIdx] = 00689 ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) + 00690 (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) + 00691 (dptr[-2] + border[bp1]) * (28.0F / 256.0F) + 00692 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 00693 dptr[0] * (70.0F / 256.0F) 00694 ); 00695 break; 00696 case 1: 00697 dst[drs + writeIdx] = 00698 ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) + 00699 (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) + 00700 (dptr[-2] + border[bp2]) * (28.0F / 256.0F) + 00701 (dptr[-1] + border[bp1]) * (56.0F / 256.0F) + 00702 dptr[0] * (70.0F / 256.0F) 00703 ); 00704 break; 00705 } 00706 } 00707 else if(sx < 4 && sts + sx >= w-4) // We are in the beginning of a tile close to the end of the image 00708 { 00709 switch(sx) 00710 { 00711 case 0: 00712 switch(w-(sts+sx)) 00713 { 00714 case 4: 00715 dst[drs + writeIdx] = 00716 (dptr[0] * (70.0F / 255.0F) + 00717 (border[bn1] + dptr[1]) * (56.0F / 255.0F) + 00718 (border[bn2] + dptr[2]) * (28.0F / 255.0F) + 00719 (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) + 00720 border[bn4] * ( 1.0F / 255.0F) 00721 ); 00722 break; 00723 case 3: 00724 dst[drs + writeIdx] = 00725 (dptr[0] * (70.0F / 247.0F) + 00726 (border[bn1] + dptr[1]) * (56.0F / 247.0F) + 00727 (border[bn2] + dptr[2]) * (28.0F / 247.0F) + 00728 border[bn3] * ( 8.0F / 247.0F) + 00729 border[bn4] * ( 1.0F / 247.0F) 00730 ); 00731 break; 00732 case 2: 00733 dst[drs + writeIdx] = 00734 (dptr[0] * (70.0F / 219.0F) + 00735 (border[bn1] + dptr[1]) * (56.0F / 219.0F) + 00736 border[bn2] * (28.0F / 219.0F) + 00737 border[bn3] * ( 8.0F / 219.0F) + 00738 border[bn4] * ( 1.0F / 219.0F) 00739 ); 00740 break; 00741 case 1: 00742 dst[drs + writeIdx] = 00743 (dptr[0] * (70.0F / 163.0F) + 00744 border[bn1] * (56.0F / 163.0F) + 00745 border[bn2] * (28.0F / 163.0F) + 00746 border[bn3] * ( 8.0F / 163.0F) + 00747 border[bn4] * ( 1.0F / 163.0F) 00748 ); 00749 break; 00750 } 00751 break; 00752 case 1: 00753 switch(w-(sts+sx)) 00754 { 00755 case 4: 00756 dst[drs + writeIdx] = 00757 (dptr[0] * (70.0F / 255.0F) + 00758 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 00759 (border[bn1] + dptr[2]) * (28.0F / 255.0F) + 00760 (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) + 00761 border[bn3] * ( 1.0F / 255.0F) 00762 ); 00763 break; 00764 case 3: 00765 dst[drs + writeIdx] = 00766 (dptr[0] * (70.0F / 247.0F) + 00767 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 00768 (border[bn1] + dptr[2]) * (28.0F / 247.0F) + 00769 border[bn2] * ( 8.0F / 247.0F) + 00770 border[bn3] * ( 1.0F / 247.0F) 00771 ); 00772 break; 00773 case 2: 00774 dst[drs + writeIdx] = 00775 (dptr[0] * (70.0F / 219.0F) + 00776 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 00777 border[bn1] * (28.0F / 219.0F) + 00778 border[bn2] * ( 8.0F / 219.0F) + 00779 border[bn3] * ( 1.0F / 219.0F) 00780 ); 00781 break; 00782 case 1: 00783 dst[drs + writeIdx] = 00784 (dptr[0] * (70.0F / 163.0F) + 00785 dptr[1] * (56.0F / 163.0F) + 00786 border[bn1] * (28.0F / 163.0F) + 00787 border[bn2] * ( 8.0F / 163.0F) + 00788 border[bn3] * ( 1.0F / 163.0F) 00789 ); 00790 break; 00791 } 00792 break; 00793 case 2: 00794 switch(w-(sts+sx)) 00795 { 00796 case 4: 00797 dst[drs + writeIdx] = 00798 (dptr[0] * (70.0F / 255.0F) + 00799 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 00800 (dptr[2] + dptr[2]) * (28.0F / 255.0F) + 00801 (border[bn1] + dptr[3]) * ( 8.0F / 255.0F) + 00802 border[bn2] * ( 1.0F / 255.0F) 00803 ); 00804 break; 00805 case 3: 00806 dst[drs + writeIdx] = 00807 (dptr[0] * (70.0F / 247.0F) + 00808 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 00809 (dptr[2] + dptr[2]) * (28.0F / 247.0F) + 00810 border[bn1] * ( 8.0F / 247.0F) + 00811 border[bn2] * ( 1.0F / 247.0F) 00812 ); 00813 break; 00814 case 2: 00815 dst[drs + writeIdx] = 00816 (dptr[0] * (70.0F / 219.0F) + 00817 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 00818 dptr[2] * (28.0F / 219.0F) + 00819 border[bn1] * ( 8.0F / 219.0F) + 00820 border[bn2] * ( 1.0F / 219.0F) 00821 ); 00822 break; 00823 case 1: 00824 dst[drs + writeIdx] = 00825 (dptr[0] * (70.0F / 163.0F) + 00826 dptr[1] * (56.0F / 163.0F) + 00827 dptr[2] * (28.0F / 163.0F) + 00828 border[bn1] * ( 8.0F / 163.0F) + 00829 border[bn2] * ( 1.0F / 163.0F) 00830 ); 00831 break; 00832 } 00833 break; 00834 case 3: 00835 switch(w-(sts+sx)) 00836 { 00837 case 4: 00838 dst[drs + writeIdx] = 00839 (dptr[0] * (70.0F / 255.0F) + 00840 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 00841 (dptr[2] + dptr[2]) * (28.0F / 255.0F) + 00842 (dptr[3] + dptr[3]) * ( 8.0F / 255.0F) + 00843 border[bn1] * ( 1.0F / 255.0F) 00844 ); 00845 break; 00846 case 3: 00847 dst[drs + writeIdx] = 00848 (dptr[0] * (70.0F / 247.0F) + 00849 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 00850 (dptr[2] + dptr[2]) * (28.0F / 247.0F) + 00851 dptr[3] * ( 8.0F / 247.0F) + 00852 border[bn1] * ( 1.0F / 247.0F) 00853 ); 00854 break; 00855 case 2: 00856 dst[drs + writeIdx] = 00857 (dptr[0] * (70.0F / 219.0F) + 00858 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 00859 dptr[2] * (28.0F / 219.0F) + 00860 dptr[3] * ( 8.0F / 219.0F) + 00861 border[bn1] * ( 1.0F / 219.0F) 00862 ); 00863 break; 00864 case 1: 00865 dst[drs + writeIdx] = 00866 (dptr[0] * (70.0F / 163.0F) + 00867 dptr[1] * (56.0F / 163.0F) + 00868 dptr[2] * (28.0F / 163.0F) + 00869 dptr[3] * ( 8.0F / 163.0F) + 00870 border[bn1] * ( 1.0F / 163.0F) 00871 ); 00872 break; 00873 } 00874 break; 00875 } 00876 } 00877 00878 // If in the last bit of the source row, reduce sample 00879 else if(sts + sx < w) 00880 { 00881 switch(w-(sts+sx)) 00882 { 00883 case 4: 00884 dst[drs + writeIdx] = 00885 (dptr[-4] * ( 1.0F / 255.0F) + 00886 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 00887 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 00888 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 00889 dptr[0] * (70.0F / 255.0F) 00890 ); 00891 break; 00892 case 3: 00893 dst[drs + writeIdx] = 00894 (dptr[-4] * ( 1.0F / 247.0F) + 00895 dptr[-3] * ( 8.0F / 247.0F) + 00896 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 00897 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 00898 dptr[0] * (70.0F / 247.0F) 00899 ); 00900 break; 00901 case 2: 00902 dst[drs + writeIdx] = 00903 (dptr[-4] * ( 1.0F / 219.0F) + 00904 dptr[-3] * ( 8.0F / 219.0F) + 00905 dptr[-2] * (28.0F / 219.0F) + 00906 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 00907 dptr[0] * (70.0F / 219.0F) 00908 ); 00909 break; 00910 case 1: 00911 dst[drs + writeIdx] = 00912 (dptr[-4] * ( 1.0F / 163.0F) + 00913 dptr[-3] * ( 8.0F / 163.0F) + 00914 dptr[-2] * (28.0F / 163.0F) + 00915 dptr[-1] * (56.0F / 163.0F) + 00916 dptr[ 0] * (70.0F / 163.0F) 00917 ); 00918 break; 00919 } 00920 } 00921 } 00922 } 00923 00924 // ###################################################################### 00925 __global__ void cuda_global_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height) 00926 { 00927 // Data cache 00928 const int src_tile_height = tile_height<<1; 00929 float *data = (float *) shared_data; // size of tile_width * src_tile_height 00930 float *border = (float *) &data[tile_width*src_tile_height]; // size of tile_width * 8 00931 00932 const int dy = threadIdx.y; // dest pixel within dest tile (decimated 2x) 00933 const int dts = IMUL(blockIdx.y, tile_height); // tile start for dest, relative to row start 00934 const int dcs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // Current column index in dest data 00935 const int sy = dy<<1; // source pixel within source tile 00936 const int sts = dts<<1; // tile start for source, in rows 00937 const int ste = sts + src_tile_height; // tile end for source, in rows 00938 const int scs = dcs; // Current column index 00939 00940 // Clamp tile and apron limits by image borders 00941 const int stec = min(ste, h); 00942 00943 00944 // Load the top border 00945 if (dy < 4 && sts+sy > 4) 00946 { 00947 border[IMUL(dy,tile_width)+threadIdx.x] = src[IMUL(sts+dy,w)+scs-IMUL(4,w)]; 00948 } 00949 // Load the bottom border 00950 else if (dy >= tile_height-4) 00951 { 00952 int bordOff = 8+dy-tile_height; 00953 if(ste+4 <= h) 00954 //border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(4,w)]; 00955 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[IMUL(sts+dy+tile_height+4,w)+scs]; 00956 else 00957 // This is explicitly handling the case where the second to last tile is so close to the edge of the image, that the post border 00958 // can't fit in the remaining space, maybe we should be handling this as yet another special case, but there are a lot of special 00959 // cases already in this function (handling smaller than 9 imgs, handling all cases of small last tile) 00960 border[threadIdx.x+IMUL(bordOff,tile_width)] = 0; 00961 } 00962 00963 00964 // only process columns that are actually within image bounds: 00965 if (scs < w && sts+sy < stec) 00966 { 00967 // Shared and global memory indices for current column 00968 int smemPos = IMUL(sy, tile_width) + threadIdx.x; 00969 int gmemPos = IMUL(sts + sy, w) + scs; 00970 00971 // Each thread loads up to two pixels of data 00972 data[smemPos] = src[gmemPos]; 00973 if(sts+sy+1 < stec) 00974 data[smemPos+tile_width] = src[gmemPos+w]; 00975 00976 00977 // Ensure the completness of loading stage because results emitted 00978 // by each thread depend on the data loaded by other threads: 00979 __syncthreads(); 00980 00981 // Shared and global memory indices for current column 00982 smemPos = IMUL(sy, tile_width) + threadIdx.x; 00983 gmemPos = IMUL(dts + dy, dw) + dcs; 00984 00985 00986 // Setup the offsets to get to the correct smem points in the arrays for both the data and the border 00987 float *dptr = data + smemPos; 00988 const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw; 00989 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw; 00990 const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width; 00991 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width; 00992 00993 if(h < 9) 00994 { 00995 // This is not at all efficient, just here to ensure filter behaves properly in small image cases 00996 const int numAhead = max(0,min(h-1-(sts+sy),4)); 00997 const int numBehind = max(0,min(sts+sy,4)); 00998 const int situation = numBehind*10+numAhead; 00999 switch(situation) 01000 { 01001 case 0: // 00 01002 dst[gmemPos] = dptr[0]; 01003 break; 01004 case 1: // 01 01005 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) + 01006 dptr[sw] * (56.0F / 126.0F)); 01007 break; 01008 case 2: // 02 01009 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) + 01010 dptr[sw] * (56.0F / 154.0F) + 01011 dptr[sw2] * (28.0F / 154.0F)); 01012 break; 01013 case 3: 01014 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) + 01015 dptr[sw] * (56.0F / 162.0F) + 01016 dptr[sw2] * (28.0F / 162.0F) + 01017 dptr[sw3] * ( 8.0F / 162.0F)); 01018 break; 01019 case 4: 01020 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) + 01021 dptr[sw] * (56.0F / 163.0F) + 01022 dptr[sw2] * (28.0F / 163.0F) + 01023 dptr[sw3] * ( 8.0F / 163.0F) + 01024 dptr[sw4] * ( 1.0F / 163.0F)); 01025 break; 01026 case 20: 01027 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) + 01028 dptr[nsw] * (56.0F / 154.0F) + 01029 dptr[nsw2] * (28.0F / 154.0F)); 01030 break; 01031 case 21: 01032 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) + 01033 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) + 01034 dptr[nsw2] * (28.0F / 210.0F)); 01035 break; 01036 case 22: 01037 dst[gmemPos] = (dptr[0] * (70.0F / 238.0F) + 01038 (dptr[nsw] + dptr[sw]) * (56.0F / 238.0F) + 01039 (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F)); 01040 break; 01041 case 23: 01042 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) + 01043 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) + 01044 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) + 01045 dptr[sw3] * ( 8.0F / 246.0F)); 01046 break; 01047 case 24: 01048 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) + 01049 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01050 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01051 dptr[sw3] * ( 8.0F / 247.0F) + 01052 dptr[sw4] * ( 1.0F / 247.0F)); 01053 break; 01054 case 40: 01055 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) + 01056 dptr[nsw] * (56.0F / 163.0F) + 01057 dptr[nsw2] * (28.0F / 163.0F) + 01058 dptr[nsw3] * ( 8.0F / 163.0F) + 01059 dptr[nsw4] * ( 1.0F / 163.0F)); 01060 break; 01061 case 41: 01062 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) + 01063 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01064 dptr[nsw2] * (28.0F / 219.0F) + 01065 dptr[nsw3] * ( 8.0F / 219.0F) + 01066 dptr[nsw4] * ( 1.0F / 219.0F)); 01067 break; 01068 case 42: 01069 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) + 01070 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01071 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01072 dptr[nsw3] * ( 8.0F / 247.0F) + 01073 dptr[nsw4] * ( 1.0F / 247.0F)); 01074 break; 01075 case 43: 01076 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) + 01077 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01078 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 01079 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 01080 dptr[nsw4] * ( 1.0F / 255.0F)); 01081 break; 01082 } 01083 } 01084 // Are we in the top 4 rows of the whole image 01085 else if(sts + sy < 4) 01086 { 01087 switch(sts+sy) 01088 { 01089 case 0: 01090 dst[gmemPos] = 01091 (dptr[0] * (70.0F / 163.0F) + 01092 dptr[sw] * (56.0F / 163.0F) + 01093 dptr[sw2] * (28.0F / 163.0F) + 01094 dptr[sw3] * ( 8.0F / 163.0F) + 01095 dptr[sw4] * ( 1.0F / 163.0F) 01096 ); 01097 break; 01098 case 1: 01099 dst[gmemPos] = 01100 (dptr[0] * (70.0F / 219.0F) + 01101 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01102 dptr[sw2] * (28.0F / 219.0F) + 01103 dptr[sw3] * ( 8.0F / 219.0F) + 01104 dptr[sw4] * ( 1.0F / 219.0F) 01105 ); 01106 break; 01107 case 2: 01108 dst[gmemPos] = 01109 (dptr[0] * (70.0F / 247.0F) + 01110 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01111 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01112 dptr[sw3] * ( 8.0F / 247.0F) + 01113 dptr[sw4] * ( 1.0F / 247.0F) 01114 ); 01115 break; 01116 case 3: 01117 dst[gmemPos] = 01118 (dptr[0] * (70.0F / 255.0F) + 01119 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01120 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 01121 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 01122 dptr[sw4] * ( 1.0F / 255.0F) 01123 ); 01124 break; 01125 } 01126 } 01127 else if(sy < 4 && sts+sy<h-4) // If not top 4 in the whole image, are we in the top 4 rows of this tile 01128 { 01129 switch(sy) 01130 { 01131 case 0: 01132 dst[gmemPos] = 01133 (dptr[0] * (70.0F / 256.0F) + 01134 (border[bn1] + dptr[sw]) * (56.0F / 256.0F) + 01135 (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) + 01136 (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) + 01137 (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F) 01138 ); 01139 break; 01140 case 1: 01141 dst[gmemPos] = 01142 (dptr[0] * (70.0F / 256.0F) + 01143 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01144 (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) + 01145 (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) + 01146 (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F) 01147 ); 01148 break; 01149 case 2: 01150 dst[gmemPos] = 01151 (dptr[0] * (70.0F / 256.0F) + 01152 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01153 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 01154 (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) + 01155 (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F) 01156 ); 01157 break; 01158 case 3: 01159 dst[gmemPos] = 01160 (dptr[0] * (70.0F / 256.0F) + 01161 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01162 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 01163 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 01164 (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F) 01165 ); 01166 break; 01167 } 01168 } 01169 else if(sy >= 4 && sy <src_tile_height-4 && sts+sy<h-4) // Are we in the middle of the tile 01170 { 01171 dst[gmemPos] = 01172 ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) + 01173 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 01174 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 01175 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01176 dptr[0] * (70.0F / 256.0F) 01177 ); 01178 } 01179 else if(sy >= 4 && sts + sy < h-4) // Are we not at the bottom of the image, but bottom 4 of the tile 01180 { 01181 switch(src_tile_height-sy) 01182 { 01183 case 4: 01184 dst[gmemPos] = 01185 (dptr[0] * (70.0F / 256.0F) + 01186 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01187 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 01188 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 01189 (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F) 01190 ); 01191 break; 01192 case 3: 01193 dst[gmemPos] = 01194 (dptr[0] * (70.0F / 256.0F) + 01195 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01196 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 01197 (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) + 01198 (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F) 01199 ); 01200 break; 01201 case 2: 01202 dst[gmemPos] = 01203 (dptr[0] * (70.0F / 256.0F) + 01204 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 01205 (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) + 01206 (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) + 01207 (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F) 01208 ); 01209 break; 01210 case 1: 01211 dst[gmemPos] = 01212 (dptr[0] * (70.0F / 256.0F) + 01213 (dptr[nsw] + border[bp1]) * (56.0F / 256.0F) + 01214 (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) + 01215 (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) + 01216 (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F) 01217 ); 01218 break; 01219 } 01220 } 01221 else if(sy < 4 && sts + sy >= h-4) // We are in the top of a tile close to the bottom of the image 01222 { 01223 switch(sy) 01224 { 01225 case 0: 01226 switch(h-(sts+sy)) 01227 { 01228 case 4: 01229 dst[gmemPos] = 01230 (dptr[0] * (70.0F / 255.0F) + 01231 (border[bn1] + dptr[sw]) * (56.0F / 255.0F) + 01232 (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) + 01233 (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) + 01234 border[bn4] * ( 1.0F / 255.0F) 01235 ); 01236 break; 01237 case 3: 01238 dst[gmemPos] = 01239 (dptr[0] * (70.0F / 247.0F) + 01240 (border[bn1] + dptr[sw]) * (56.0F / 247.0F) + 01241 (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) + 01242 border[bn3] * ( 8.0F / 247.0F) + 01243 border[bn4] * ( 1.0F / 247.0F) 01244 ); 01245 break; 01246 case 2: 01247 dst[gmemPos] = 01248 (dptr[0] * (70.0F / 219.0F) + 01249 (border[bn1] + dptr[sw]) * (56.0F / 219.0F) + 01250 border[bn2] * (28.0F / 219.0F) + 01251 border[bn3] * ( 8.0F / 219.0F) + 01252 border[bn4] * ( 1.0F / 219.0F) 01253 ); 01254 break; 01255 case 1: 01256 dst[gmemPos] = 01257 (dptr[0] * (70.0F / 163.0F) + 01258 border[bn1] * (56.0F / 163.0F) + 01259 border[bn2] * (28.0F / 163.0F) + 01260 border[bn3] * ( 8.0F / 163.0F) + 01261 border[bn4] * ( 1.0F / 163.0F) 01262 ); 01263 break; 01264 } 01265 break; 01266 case 1: 01267 switch(h-(sts+sy)) 01268 { 01269 case 4: 01270 dst[gmemPos] = 01271 (dptr[0] * (70.0F / 255.0F) + 01272 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01273 (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) + 01274 (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) + 01275 border[bn3] * ( 1.0F / 255.0F) 01276 ); 01277 break; 01278 case 3: 01279 dst[gmemPos] = 01280 (dptr[0] * (70.0F / 247.0F) + 01281 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01282 (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) + 01283 border[bn2] * ( 8.0F / 247.0F) + 01284 border[bn3] * ( 1.0F / 247.0F) 01285 ); 01286 break; 01287 case 2: 01288 dst[gmemPos] = 01289 (dptr[0] * (70.0F / 219.0F) + 01290 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01291 border[bn1] * (28.0F / 219.0F) + 01292 border[bn2] * ( 8.0F / 219.0F) + 01293 border[bn3] * ( 1.0F / 219.0F) 01294 ); 01295 break; 01296 case 1: 01297 dst[gmemPos] = 01298 (dptr[0] * (70.0F / 163.0F) + 01299 dptr[nsw] * (56.0F / 163.0F) + 01300 border[bn1] * (28.0F / 163.0F) + 01301 border[bn2] * ( 8.0F / 163.0F) + 01302 border[bn3] * ( 1.0F / 163.0F) 01303 ); 01304 break; 01305 } 01306 break; 01307 case 2: 01308 switch(h-(sts+sy)) 01309 { 01310 case 4: 01311 dst[gmemPos] = 01312 (dptr[0] * (70.0F / 255.0F) + 01313 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01314 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 01315 (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) + 01316 border[bn2] * ( 1.0F / 255.0F) 01317 ); 01318 break; 01319 case 3: 01320 dst[gmemPos] = 01321 (dptr[0] * (70.0F / 247.0F) + 01322 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01323 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01324 border[bn1] * ( 8.0F / 247.0F) + 01325 border[bn2] * ( 1.0F / 247.0F) 01326 ); 01327 break; 01328 case 2: 01329 dst[gmemPos] = 01330 (dptr[0] * (70.0F / 219.0F) + 01331 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01332 dptr[nsw2] * (28.0F / 219.0F) + 01333 border[bn1] * ( 8.0F / 219.0F) + 01334 border[bn2] * ( 1.0F / 219.0F) 01335 ); 01336 break; 01337 case 1: 01338 dst[gmemPos] = 01339 (dptr[0] * (70.0F / 163.0F) + 01340 dptr[nsw] * (56.0F / 163.0F) + 01341 dptr[nsw2] * (28.0F / 163.0F) + 01342 border[bn1] * ( 8.0F / 163.0F) + 01343 border[bn2] * ( 1.0F / 163.0F) 01344 ); 01345 break; 01346 } 01347 break; 01348 case 3: 01349 switch(h-(sts+sy)) 01350 { 01351 case 4: 01352 dst[gmemPos] = 01353 (dptr[0] * (70.0F / 255.0F) + 01354 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01355 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 01356 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 01357 border[bn1] * ( 1.0F / 255.0F) 01358 ); 01359 break; 01360 case 3: 01361 dst[gmemPos] = 01362 (dptr[0] * (70.0F / 247.0F) + 01363 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01364 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01365 dptr[nsw3] * ( 8.0F / 247.0F) + 01366 border[bn1] * ( 1.0F / 247.0F) 01367 ); 01368 break; 01369 case 2: 01370 dst[gmemPos] = 01371 (dptr[0] * (70.0F / 219.0F) + 01372 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01373 dptr[nsw2] * (28.0F / 219.0F) + 01374 dptr[nsw3] * ( 8.0F / 219.0F) + 01375 border[bn1] * ( 1.0F / 219.0F) 01376 ); 01377 break; 01378 case 1: 01379 dst[gmemPos] = 01380 (dptr[0] * (70.0F / 163.0F) + 01381 dptr[nsw] * (56.0F / 163.0F) + 01382 dptr[nsw2] * (28.0F / 163.0F) + 01383 dptr[nsw3] * ( 8.0F / 163.0F) + 01384 border[bn1] * ( 1.0F / 163.0F) 01385 ); 01386 break; 01387 } 01388 break; 01389 } 01390 } 01391 else // We must be at the bottom 4 of the image 01392 { 01393 switch(h-(sts+sy)) 01394 { 01395 case 4: 01396 dst[gmemPos] = 01397 (dptr[0] * (70.0F / 255.0F) + 01398 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 01399 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 01400 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 01401 dptr[nsw4] * ( 1.0F / 255.0F) 01402 ); 01403 break; 01404 case 3: 01405 dst[gmemPos] = 01406 (dptr[0] * (70.0F / 247.0F) + 01407 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 01408 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 01409 dptr[nsw3] * ( 8.0F / 247.0F) + 01410 dptr[nsw4] * ( 1.0F / 247.0F) 01411 ); 01412 break; 01413 case 2: 01414 dst[gmemPos] = 01415 (dptr[0] * (70.0F / 219.0F) + 01416 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 01417 dptr[nsw2] * (28.0F / 219.0F) + 01418 dptr[nsw3] * ( 8.0F / 219.0F) + 01419 dptr[nsw4] * ( 1.0F / 219.0F) 01420 ); 01421 break; 01422 case 1: 01423 dst[gmemPos] = 01424 (dptr[0] * (70.0F / 163.0F) + 01425 dptr[nsw] * (56.0F / 163.0F) + 01426 dptr[nsw2] * (28.0F / 163.0F) + 01427 dptr[nsw3] * ( 8.0F / 163.0F) + 01428 dptr[nsw4] * ( 1.0F / 163.0F) 01429 ); 01430 break; 01431 } 01432 } 01433 } 01434 } 01435 01436 01437 01438 __global__ void cuda_global_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 01439 { 01440 // Data cache in shared memory 01441 float *data = (float *) shared_data; // size of tile_width 01442 // Bordering data flanking this tile 01443 float *border = (float *) &data[tile_width]; // size of 8 01444 const int sx = threadIdx.x; // source pixel within source tile 01445 const int sts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 01446 const int srs = IMUL(blockIdx.y,w); // Row start index in source data 01447 01448 const int loadIdx = sts + sx; // index of one pixel value to load 01449 const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3; 01450 const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7; 01451 float *dptr = &data[sx]; 01452 // Load border pixels 01453 if (sx < 4 && sts > 0) border[sx] = src[srs + sts - (4-sx)]; 01454 if (sx >= tile_width-4 && sts+tile_width < w-4) border[4+sx-(tile_width-4)] = src[srs + sts + sx + 4]; 01455 01456 // Load the row into shared memory among the thread block 01457 if (loadIdx < w) 01458 data[sx] = src[srs + loadIdx]; 01459 else 01460 return; // Threads that are over the edge of the image on the right most tile... 01461 01462 // Ensure the completness of loading stage because results emitted 01463 // by each thread depend on the data loaded by other threads: 01464 __syncthreads(); 01465 01466 // [ 1 8 28 56 (70) 56 28 8 1 ] 01467 01468 if(w < 9) 01469 { 01470 // This is not at all efficient, just here to ensure filter behaves properly in small image cases 01471 const int numAhead = max(0,min(w-1-(sts+sx),4)); 01472 const int numBehind = max(0,min(sts+sx,4)); 01473 const int situation = numBehind*10+numAhead; 01474 01475 switch(situation) 01476 { 01477 case 0: // 00 01478 dst[srs+loadIdx] = dptr[0]; 01479 break; 01480 case 1: // 01 01481 dst[srs+loadIdx] = (dptr[0] * (70.0F / 126.0F) + 01482 dptr[1] * (56.0F / 126.0F)); 01483 break; 01484 case 2: // 02 01485 dst[srs+loadIdx] = (dptr[0] * (70.0F / 154.0F) + 01486 dptr[1] * (56.0F / 154.0F) + 01487 dptr[2] * (28.0F / 154.0F)); 01488 break; 01489 case 3: 01490 dst[srs+loadIdx] = (dptr[0] * (70.0F / 162.0F) + 01491 dptr[1] * (56.0F / 162.0F) + 01492 dptr[2] * (28.0F / 162.0F) + 01493 dptr[3] * ( 8.0F / 162.0F)); 01494 break; 01495 case 4: 01496 dst[srs+loadIdx] = (dptr[0] * (70.0F / 163.0F) + 01497 dptr[1] * (56.0F / 163.0F) + 01498 dptr[2] * (28.0F / 163.0F) + 01499 dptr[3] * ( 8.0F / 163.0F) + 01500 dptr[4] * ( 1.0F / 163.0F)); 01501 break; 01502 case 10: 01503 dst[srs+loadIdx] = (dptr[0] * (70.0F / 126.0F) + 01504 dptr[-1] * (56.0F / 126.0F)); 01505 break; 01506 case 11: 01507 dst[srs+loadIdx] = (dptr[0] * (70.0F / 182.0F) + 01508 (dptr[-1] + dptr[1]) * (56.0F / 182.0F)); 01509 break; 01510 case 12: 01511 dst[srs+loadIdx] = (dptr[0] * (70.0F / 210.0F) + 01512 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) + 01513 dptr[2] * (28.0F / 210.0F)); 01514 break; 01515 case 13: 01516 dst[srs+loadIdx] = (dptr[0] * (70.0F / 218.0F) + 01517 (dptr[-1] + dptr[1]) * (56.0F / 218.0F) + 01518 dptr[2] * (28.0F / 218.0F) + 01519 dptr[3] * ( 8.0F / 218.0F)); 01520 break; 01521 case 14: 01522 dst[srs+loadIdx] = (dptr[0] * (70.0F / 219.0F) + 01523 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 01524 dptr[2] * (28.0F / 219.0F) + 01525 dptr[3] * ( 8.0F / 219.0F) + 01526 dptr[4] * ( 1.0F / 219.0F)); 01527 break; 01528 case 20: 01529 dst[srs+loadIdx] = (dptr[0] * (70.0F / 154.0F) + 01530 dptr[-1] * (56.0F / 154.0F) + 01531 dptr[-2] * (28.0F / 154.0F)); 01532 break; 01533 case 21: 01534 dst[srs+loadIdx] = (dptr[0] * (70.0F / 210.0F) + 01535 (dptr[-1] + dptr[1]) * (56.0F / 210.0F) + 01536 dptr[-2] * (28.0F / 210.0F)); 01537 break; 01538 case 22: 01539 dst[srs+loadIdx] = (dptr[0] * (70.0F / 238.0F) + 01540 (dptr[-1] + dptr[1]) * (56.0F / 238.0F) + 01541 (dptr[-2] + dptr[2]) * (28.0F / 238.0F)); 01542 break; 01543 case 23: 01544 dst[srs+loadIdx] = (dptr[0] * (70.0F / 246.0F) + 01545 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) + 01546 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) + 01547 dptr[3] * ( 8.0F / 246.0F)); 01548 break; 01549 case 24: 01550 dst[srs+loadIdx] = (dptr[0] * (70.0F / 247.0F) + 01551 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 01552 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 01553 dptr[3] * ( 8.0F / 247.0F) + 01554 dptr[4] * ( 1.0F / 247.0F)); 01555 break; 01556 case 30: 01557 dst[srs+loadIdx] = (dptr[0] * (70.0F / 162.0F) + 01558 dptr[-1] * (56.0F / 162.0F) + 01559 dptr[-2] * (28.0F / 162.0F) + 01560 dptr[-3] * ( 8.0F / 162.0F)); 01561 break; 01562 case 31: 01563 dst[srs+loadIdx] = (dptr[0] * (70.0F / 218.0F) + 01564 (dptr[-1] + dptr[1]) * (56.0F / 218.0F) + 01565 dptr[-2] * (28.0F / 218.0F) + 01566 dptr[-3] * ( 8.0F / 218.0F)); 01567 break; 01568 case 32: 01569 dst[srs+loadIdx] = (dptr[0] * (70.0F / 246.0F) + 01570 (dptr[-1] + dptr[1]) * (56.0F / 246.0F) + 01571 (dptr[-2] + dptr[2]) * (28.0F / 246.0F) + 01572 dptr[-3] * ( 8.0F / 246.0F)); 01573 break; 01574 case 33: 01575 dst[srs+loadIdx] = (dptr[0] * (70.0F / 254.0F) + 01576 (dptr[-1] + dptr[1]) * (56.0F / 254.0F) + 01577 (dptr[-2] + dptr[2]) * (28.0F / 254.0F) + 01578 (dptr[-3] + dptr[3]) * ( 8.0F / 254.0F)); 01579 break; 01580 case 34: 01581 dst[srs+loadIdx] = (dptr[0] * (70.0F / 255.0F) + 01582 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 01583 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 01584 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 01585 dptr[4] * ( 1.0F / 255.0F)); 01586 break; 01587 case 40: 01588 dst[srs+loadIdx] = (dptr[0] * (70.0F / 163.0F) + 01589 dptr[-1] * (56.0F / 163.0F) + 01590 dptr[-2] * (28.0F / 163.0F) + 01591 dptr[-3] * ( 8.0F / 163.0F) + 01592 dptr[-4] * ( 1.0F / 163.0F)); 01593 break; 01594 case 41: 01595 dst[srs+loadIdx] = (dptr[0] * (70.0F / 219.0F) + 01596 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 01597 dptr[-2] * (28.0F / 219.0F) + 01598 dptr[-3] * ( 8.0F / 219.0F) + 01599 dptr[-4] * ( 1.0F / 219.0F)); 01600 break; 01601 case 42: 01602 dst[srs+loadIdx] = (dptr[0] * (70.0F / 247.0F) + 01603 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 01604 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 01605 dptr[-3] * ( 8.0F / 247.0F) + 01606 dptr[-4] * ( 1.0F / 247.0F)); 01607 break; 01608 case 43: 01609 dst[srs+loadIdx] = (dptr[0] * (70.0F / 255.0F) + 01610 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 01611 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 01612 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 01613 dptr[-4] * ( 1.0F / 255.0F)); 01614 break; 01615 } 01616 } 01617 // First part of source row, just reduce sample 01618 else if(sts+sx < 4) 01619 { 01620 switch(sx) 01621 { 01622 case 0: 01623 dst[srs + loadIdx] = 01624 (dptr[0] * (70.0F / 163.0F) + 01625 dptr[1] * (56.0F / 163.0F) + 01626 dptr[2] * (28.0F / 163.0F) + 01627 dptr[3] * ( 8.0F / 163.0F) + 01628 dptr[4] * ( 1.0F / 163.0F) 01629 ); 01630 break; 01631 case 1: 01632 dst[srs + loadIdx] = 01633 (dptr[0] * (70.0F / 219.0F) + 01634 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 01635 dptr[2] * (28.0F / 219.0F) + 01636 dptr[3] * ( 8.0F / 219.0F) + 01637 dptr[4] * ( 1.0F / 219.0F) 01638 ); 01639 break; 01640 case 2: 01641 dst[srs + loadIdx] = 01642 (dptr[0] * (70.0F / 247.0F) + 01643 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 01644 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 01645 dptr[3] * ( 8.0F / 247.0F) + 01646 dptr[4] * ( 1.0F / 247.0F) 01647 ); 01648 break; 01649 case 3: 01650 dst[srs + loadIdx] = 01651 (dptr[0] * (70.0F / 255.0F) + 01652 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 01653 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 01654 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 01655 dptr[4] * ( 1.0F / 255.0F) 01656 ); 01657 break; 01658 default: 01659 //LERROR(); 01660 break; 01661 } 01662 } 01663 // If not the first part of the source row, but is the first bit of this tile, use border 01664 else if(sx < 4 && sts+sx < w-4) 01665 { 01666 switch(sx) 01667 { 01668 case 0: 01669 dst[srs + loadIdx] = 01670 ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) + 01671 (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) + 01672 (border[bn2] + dptr[2]) * (28.0F / 256.0F) + 01673 (border[bn1] + dptr[1]) * (56.0F / 256.0F) + 01674 dptr[0] * (70.0F / 256.0F) 01675 ); 01676 break; 01677 case 1: 01678 dst[srs + loadIdx] = 01679 ((border[bn3] + dptr[4]) * ( 1.0F / 256.0F) + 01680 (border[bn2] + dptr[3]) * ( 8.0F / 256.0F) + 01681 (border[bn1] + dptr[2]) * (28.0F / 256.0F) + 01682 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01683 dptr[0] * (70.0F / 256.0F) 01684 ); 01685 break; 01686 case 2: 01687 dst[srs + loadIdx] = 01688 ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) + 01689 (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) + 01690 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 01691 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01692 dptr[0] * (70.0F / 256.0F) 01693 ); 01694 break; 01695 case 3: 01696 dst[srs + loadIdx] = 01697 ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) + 01698 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 01699 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 01700 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01701 dptr[0] * (70.0F / 256.0F) 01702 ); 01703 break; 01704 } 01705 } 01706 // If we are not near the edge of this tile, do standard way 01707 else if(sx>=4 && sx<tile_width-4 && sts+sx < w-4) 01708 { 01709 dst[srs + loadIdx] = 01710 ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) + 01711 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 01712 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 01713 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01714 dptr[0] * (70.0F / 256.0F) 01715 ); 01716 } 01717 // If not the last part of the source row, but in the last bit of this tile, use border 01718 else if(sx>=4 && sts+sx < w-4) 01719 { 01720 switch(tile_width-sx) 01721 { 01722 case 4: 01723 dst[srs + loadIdx] = 01724 ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) + 01725 (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) + 01726 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 01727 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01728 dptr[0] * (70.0F / 256.0F) 01729 ); 01730 break; 01731 case 3: 01732 dst[srs + loadIdx] = 01733 ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) + 01734 (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) + 01735 (dptr[-2] + dptr[2]) * (28.0F / 256.0F) + 01736 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01737 dptr[0] * (70.0F / 256.0F) 01738 ); 01739 break; 01740 case 2: 01741 dst[srs + loadIdx] = 01742 ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) + 01743 (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) + 01744 (dptr[-2] + border[bp1]) * (28.0F / 256.0F) + 01745 (dptr[-1] + dptr[1]) * (56.0F / 256.0F) + 01746 dptr[0] * (70.0F / 256.0F) 01747 ); 01748 break; 01749 case 1: 01750 dst[srs + loadIdx] = 01751 ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) + 01752 (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) + 01753 (dptr[-2] + border[bp2]) * (28.0F / 256.0F) + 01754 (dptr[-1] + border[bp1]) * (56.0F / 256.0F) + 01755 dptr[0] * (70.0F / 256.0F) 01756 ); 01757 break; 01758 } 01759 } 01760 else if(sx < 4 && sts + sx >= w-4) // We are in the beginning of a tile close to the end of the image 01761 { 01762 switch(sx) 01763 { 01764 case 0: 01765 switch(w-(sts+sx)) 01766 { 01767 case 4: 01768 dst[srs + loadIdx] = 01769 (dptr[0] * (70.0F / 255.0F) + 01770 (border[bn1] + dptr[1]) * (56.0F / 255.0F) + 01771 (border[bn2] + dptr[2]) * (28.0F / 255.0F) + 01772 (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) + 01773 border[bn4] * ( 1.0F / 255.0F) 01774 ); 01775 break; 01776 case 3: 01777 dst[srs + loadIdx] = 01778 (dptr[0] * (70.0F / 247.0F) + 01779 (border[bn1] + dptr[1]) * (56.0F / 247.0F) + 01780 (border[bn2] + dptr[2]) * (28.0F / 247.0F) + 01781 border[bn3] * ( 8.0F / 247.0F) + 01782 border[bn4] * ( 1.0F / 247.0F) 01783 ); 01784 break; 01785 case 2: 01786 dst[srs + loadIdx] = 01787 (dptr[0] * (70.0F / 219.0F) + 01788 (border[bn1] + dptr[1]) * (56.0F / 219.0F) + 01789 border[bn2] * (28.0F / 219.0F) + 01790 border[bn3] * ( 8.0F / 219.0F) + 01791 border[bn4] * ( 1.0F / 219.0F) 01792 ); 01793 break; 01794 case 1: 01795 dst[srs + loadIdx] = 01796 (dptr[0] * (70.0F / 163.0F) + 01797 border[bn1] * (56.0F / 163.0F) + 01798 border[bn2] * (28.0F / 163.0F) + 01799 border[bn3] * ( 8.0F / 163.0F) + 01800 border[bn4] * ( 1.0F / 163.0F) 01801 ); 01802 break; 01803 } 01804 break; 01805 case 1: 01806 switch(w-(sts+sx)) 01807 { 01808 case 4: 01809 dst[srs + loadIdx] = 01810 (dptr[0] * (70.0F / 255.0F) + 01811 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 01812 (border[bn1] + dptr[2]) * (28.0F / 255.0F) + 01813 (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) + 01814 border[bn3] * ( 1.0F / 255.0F) 01815 ); 01816 break; 01817 case 3: 01818 dst[srs + loadIdx] = 01819 (dptr[0] * (70.0F / 247.0F) + 01820 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 01821 (border[bn1] + dptr[2]) * (28.0F / 247.0F) + 01822 border[bn2] * ( 8.0F / 247.0F) + 01823 border[bn3] * ( 1.0F / 247.0F) 01824 ); 01825 break; 01826 case 2: 01827 dst[srs + loadIdx] = 01828 (dptr[0] * (70.0F / 219.0F) + 01829 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 01830 border[bn1] * (28.0F / 219.0F) + 01831 border[bn2] * ( 8.0F / 219.0F) + 01832 border[bn3] * ( 1.0F / 219.0F) 01833 ); 01834 break; 01835 case 1: 01836 dst[srs + loadIdx] = 01837 (dptr[0] * (70.0F / 163.0F) + 01838 dptr[1] * (56.0F / 163.0F) + 01839 border[bn1] * (28.0F / 163.0F) + 01840 border[bn2] * ( 8.0F / 163.0F) + 01841 border[bn3] * ( 1.0F / 163.0F) 01842 ); 01843 break; 01844 } 01845 break; 01846 case 2: 01847 switch(w-(sts+sx)) 01848 { 01849 case 4: 01850 dst[srs + loadIdx] = 01851 (dptr[0] * (70.0F / 255.0F) + 01852 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 01853 (dptr[2] + dptr[2]) * (28.0F / 255.0F) + 01854 (border[bn1] + dptr[3]) * ( 8.0F / 255.0F) + 01855 border[bn2] * ( 1.0F / 255.0F) 01856 ); 01857 break; 01858 case 3: 01859 dst[srs + loadIdx] = 01860 (dptr[0] * (70.0F / 247.0F) + 01861 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 01862 (dptr[2] + dptr[2]) * (28.0F / 247.0F) + 01863 border[bn1] * ( 8.0F / 247.0F) + 01864 border[bn2] * ( 1.0F / 247.0F) 01865 ); 01866 break; 01867 case 2: 01868 dst[srs + loadIdx] = 01869 (dptr[0] * (70.0F / 219.0F) + 01870 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 01871 dptr[2] * (28.0F / 219.0F) + 01872 border[bn1] * ( 8.0F / 219.0F) + 01873 border[bn2] * ( 1.0F / 219.0F) 01874 ); 01875 break; 01876 case 1: 01877 dst[srs + loadIdx] = 01878 (dptr[0] * (70.0F / 163.0F) + 01879 dptr[1] * (56.0F / 163.0F) + 01880 dptr[2] * (28.0F / 163.0F) + 01881 border[bn1] * ( 8.0F / 163.0F) + 01882 border[bn2] * ( 1.0F / 163.0F) 01883 ); 01884 break; 01885 } 01886 break; 01887 case 3: 01888 switch(w-(sts+sx)) 01889 { 01890 case 4: 01891 dst[srs + loadIdx] = 01892 (dptr[0] * (70.0F / 255.0F) + 01893 (dptr[1] + dptr[1]) * (56.0F / 255.0F) + 01894 (dptr[2] + dptr[2]) * (28.0F / 255.0F) + 01895 (dptr[3] + dptr[3]) * ( 8.0F / 255.0F) + 01896 border[bn1] * ( 1.0F / 255.0F) 01897 ); 01898 break; 01899 case 3: 01900 dst[srs + loadIdx] = 01901 (dptr[0] * (70.0F / 247.0F) + 01902 (dptr[1] + dptr[1]) * (56.0F / 247.0F) + 01903 (dptr[2] + dptr[2]) * (28.0F / 247.0F) + 01904 dptr[3] * ( 8.0F / 247.0F) + 01905 border[bn1] * ( 1.0F / 247.0F) 01906 ); 01907 break; 01908 case 2: 01909 dst[srs + loadIdx] = 01910 (dptr[0] * (70.0F / 219.0F) + 01911 (dptr[1] + dptr[1]) * (56.0F / 219.0F) + 01912 dptr[2] * (28.0F / 219.0F) + 01913 dptr[3] * ( 8.0F / 219.0F) + 01914 border[bn1] * ( 1.0F / 219.0F) 01915 ); 01916 break; 01917 case 1: 01918 dst[srs + loadIdx] = 01919 (dptr[0] * (70.0F / 163.0F) + 01920 dptr[1] * (56.0F / 163.0F) + 01921 dptr[2] * (28.0F / 163.0F) + 01922 dptr[3] * ( 8.0F / 163.0F) + 01923 border[bn1] * ( 1.0F / 163.0F) 01924 ); 01925 break; 01926 } 01927 break; 01928 } 01929 } 01930 01931 // If in the last bit of the source row, reduce sample 01932 else if(sts + sx < w) 01933 { 01934 switch(w-(sts+sx)) 01935 { 01936 case 4: 01937 dst[srs + loadIdx] = 01938 (dptr[-4] * ( 1.0F / 255.0F) + 01939 (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) + 01940 (dptr[-2] + dptr[2]) * (28.0F / 255.0F) + 01941 (dptr[-1] + dptr[1]) * (56.0F / 255.0F) + 01942 dptr[0] * (70.0F / 255.0F) 01943 ); 01944 break; 01945 case 3: 01946 dst[srs + loadIdx] = 01947 (dptr[-4] * ( 1.0F / 247.0F) + 01948 dptr[-3] * ( 8.0F / 247.0F) + 01949 (dptr[-2] + dptr[2]) * (28.0F / 247.0F) + 01950 (dptr[-1] + dptr[1]) * (56.0F / 247.0F) + 01951 dptr[0] * (70.0F / 247.0F) 01952 ); 01953 break; 01954 case 2: 01955 dst[srs + loadIdx] = 01956 (dptr[-4] * ( 1.0F / 219.0F) + 01957 dptr[-3] * ( 8.0F / 219.0F) + 01958 dptr[-2] * (28.0F / 219.0F) + 01959 (dptr[-1] + dptr[1]) * (56.0F / 219.0F) + 01960 dptr[0] * (70.0F / 219.0F) 01961 ); 01962 break; 01963 case 1: 01964 dst[srs + loadIdx] = 01965 (dptr[-4] * ( 1.0F / 163.0F) + 01966 dptr[-3] * ( 8.0F / 163.0F) + 01967 dptr[-2] * (28.0F / 163.0F) + 01968 dptr[-1] * (56.0F / 163.0F) + 01969 dptr[ 0] * (70.0F / 163.0F) 01970 ); 01971 break; 01972 } 01973 } 01974 } 01975 01976 // ###################################################################### 01977 __global__ void cuda_global_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 01978 { 01979 // Data cache 01980 float *data = (float *) shared_data; // size of tile_width * tile_height 01981 float *border = (float *) &data[tile_width*tile_height]; // size of tile_width * 8 01982 01983 const int sy = threadIdx.y; // source pixel row within source tile 01984 01985 const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows 01986 const int ste = sts + tile_height; // tile end for source, in rows 01987 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // Current column index 01988 01989 // Clamp tile and apron limits by image borders 01990 const int stec = min(ste, h); 01991 01992 01993 01994 01995 // only process columns that are actually within image bounds: 01996 if (scs < w && sts+sy < stec) 01997 { 01998 // Shared and global memory indices for current column 01999 int smemPos = IMUL(sy, tile_width) + threadIdx.x; 02000 int gmemPos = IMUL(sts + sy, w) + scs; 02001 02002 data[smemPos] = src[gmemPos]; 02003 02004 if (sy < 4 && gmemPos > IMUL(4,w)) 02005 border[smemPos] = src[gmemPos-IMUL(4,w)]; 02006 02007 int bordOff = 8+sy-tile_height; 02008 02009 if (sy >= tile_height-4) 02010 { 02011 if(ste+4 <= h) 02012 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(4,w)]; 02013 else 02014 // This is explicitly handling the case where the second to last tile is so close to the edge of the image, that the post border 02015 // can't fit in the remaining space, maybe we should be handling this as yet another special case, but there are a lot of special 02016 // cases already in this function (handling smaller than 9 imgs, handling all cases of small last tile) 02017 border[threadIdx.x+IMUL(bordOff,tile_width)] = 0; 02018 } 02019 02020 // Ensure the completness of loading stage because results emitted 02021 // by each thread depend on the data loaded by other threads: 02022 __syncthreads(); 02023 02024 // Shared and global memory indices for current column 02025 smemPos = IMUL(sy, tile_width) + threadIdx.x; 02026 gmemPos = IMUL(sts + sy, w) + scs; 02027 02028 02029 // Setup the offsets to get to the correct smem points in the arrays for both the data and the border 02030 float *dptr = data + smemPos; 02031 const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw; 02032 const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw; 02033 const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width; 02034 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width; 02035 02036 if(h < 9) 02037 { 02038 // This is not at all efficient, just here to ensure filter behaves properly in small image cases 02039 const int numAhead = max(0,min(h-1-(sts+sy),4)); 02040 const int numBehind = max(0,min(sts+sy,4)); 02041 const int situation = numBehind*10+numAhead; 02042 switch(situation) 02043 { 02044 case 0: // 00 02045 dst[gmemPos] = dptr[0]; 02046 break; 02047 case 1: // 01 02048 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) + 02049 dptr[sw] * (56.0F / 126.0F)); 02050 break; 02051 case 2: // 02 02052 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) + 02053 dptr[sw] * (56.0F / 154.0F) + 02054 dptr[sw2] * (28.0F / 154.0F)); 02055 break; 02056 case 3: 02057 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) + 02058 dptr[sw] * (56.0F / 162.0F) + 02059 dptr[sw2] * (28.0F / 162.0F) + 02060 dptr[sw3] * ( 8.0F / 162.0F)); 02061 break; 02062 case 4: 02063 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) + 02064 dptr[sw] * (56.0F / 163.0F) + 02065 dptr[sw2] * (28.0F / 163.0F) + 02066 dptr[sw3] * ( 8.0F / 163.0F) + 02067 dptr[sw4] * ( 1.0F / 163.0F)); 02068 break; 02069 case 10: 02070 dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) + 02071 dptr[nsw] * (56.0F / 126.0F)); 02072 break; 02073 case 11: 02074 dst[gmemPos] = (dptr[0] * (70.0F / 182.0F) + 02075 (dptr[nsw] + dptr[sw]) * (56.0F / 182.0F)); 02076 break; 02077 case 12: 02078 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) + 02079 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) + 02080 dptr[sw2] * (28.0F / 210.0F)); 02081 break; 02082 case 13: 02083 dst[gmemPos] = (dptr[0] * (70.0F / 218.0F) + 02084 (dptr[nsw] + dptr[sw]) * (56.0F / 218.0F) + 02085 dptr[sw2] * (28.0F / 218.0F) + 02086 dptr[sw3] * ( 8.0F / 218.0F)); 02087 break; 02088 case 14: 02089 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) + 02090 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02091 dptr[sw2] * (28.0F / 219.0F) + 02092 dptr[sw3] * ( 8.0F / 219.0F) + 02093 dptr[sw4] * ( 1.0F / 219.0F)); 02094 break; 02095 case 20: 02096 dst[gmemPos] = (dptr[0] * (70.0F / 154.0F) + 02097 dptr[nsw] * (56.0F / 154.0F) + 02098 dptr[nsw2] * (28.0F / 154.0F)); 02099 break; 02100 case 21: 02101 dst[gmemPos] = (dptr[0] * (70.0F / 210.0F) + 02102 (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) + 02103 dptr[nsw2] * (28.0F / 210.0F)); 02104 break; 02105 case 22: 02106 dst[gmemPos] = (dptr[0] * (70.0F / 238.0F) + 02107 (dptr[nsw] + dptr[sw]) * (56.0F / 238.0F) + 02108 (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F)); 02109 break; 02110 case 23: 02111 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) + 02112 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) + 02113 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) + 02114 dptr[sw3] * ( 8.0F / 246.0F)); 02115 break; 02116 case 24: 02117 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) + 02118 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02119 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02120 dptr[sw3] * ( 8.0F / 247.0F) + 02121 dptr[sw4] * ( 1.0F / 247.0F)); 02122 break; 02123 case 30: 02124 dst[gmemPos] = (dptr[0] * (70.0F / 162.0F) + 02125 dptr[nsw] * (56.0F / 162.0F) + 02126 dptr[nsw2] * (28.0F / 162.0F) + 02127 dptr[nsw3] * ( 8.0F / 162.0F)); 02128 break; 02129 case 31: 02130 dst[gmemPos] = (dptr[0] * (70.0F / 218.0F) + 02131 (dptr[nsw] + dptr[sw]) * (56.0F / 218.0F) + 02132 dptr[nsw2] * (28.0F / 218.0F) + 02133 dptr[nsw3] * ( 8.0F / 218.0F)); 02134 break; 02135 case 32: 02136 dst[gmemPos] = (dptr[0] * (70.0F / 246.0F) + 02137 (dptr[nsw] + dptr[sw]) * (56.0F / 246.0F) + 02138 (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) + 02139 dptr[nsw3] * ( 8.0F / 246.0F)); 02140 break; 02141 case 33: 02142 dst[gmemPos] = (dptr[0] * (70.0F / 254.0F) + 02143 (dptr[nsw] + dptr[sw]) * (56.0F / 254.0F) + 02144 (dptr[nsw2] + dptr[sw2]) * (28.0F / 254.0F) + 02145 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 254.0F)); 02146 break; 02147 case 34: 02148 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) + 02149 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02150 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02151 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02152 dptr[sw4] * ( 1.0F / 255.0F)); 02153 break; 02154 case 40: 02155 dst[gmemPos] = (dptr[0] * (70.0F / 163.0F) + 02156 dptr[nsw] * (56.0F / 163.0F) + 02157 dptr[nsw2] * (28.0F / 163.0F) + 02158 dptr[nsw3] * ( 8.0F / 163.0F) + 02159 dptr[nsw4] * ( 1.0F / 163.0F)); 02160 break; 02161 case 41: 02162 dst[gmemPos] = (dptr[0] * (70.0F / 219.0F) + 02163 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02164 dptr[nsw2] * (28.0F / 219.0F) + 02165 dptr[nsw3] * ( 8.0F / 219.0F) + 02166 dptr[nsw4] * ( 1.0F / 219.0F)); 02167 break; 02168 case 42: 02169 dst[gmemPos] = (dptr[0] * (70.0F / 247.0F) + 02170 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02171 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02172 dptr[nsw3] * ( 8.0F / 247.0F) + 02173 dptr[nsw4] * ( 1.0F / 247.0F)); 02174 break; 02175 case 43: 02176 dst[gmemPos] = (dptr[0] * (70.0F / 255.0F) + 02177 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02178 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02179 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02180 dptr[nsw4] * ( 1.0F / 255.0F)); 02181 break; 02182 } 02183 } 02184 // Are we in the top 4 rows of the whole image 02185 else if(sts + sy < 4) 02186 { 02187 switch(sts+sy) 02188 { 02189 case 0: 02190 dst[gmemPos] = 02191 (dptr[0] * (70.0F / 163.0F) + 02192 dptr[sw] * (56.0F / 163.0F) + 02193 dptr[sw2] * (28.0F / 163.0F) + 02194 dptr[sw3] * ( 8.0F / 163.0F) + 02195 dptr[sw4] * ( 1.0F / 163.0F) 02196 ); 02197 break; 02198 case 1: 02199 dst[gmemPos] = 02200 (dptr[0] * (70.0F / 219.0F) + 02201 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02202 dptr[sw2] * (28.0F / 219.0F) + 02203 dptr[sw3] * ( 8.0F / 219.0F) + 02204 dptr[sw4] * ( 1.0F / 219.0F) 02205 ); 02206 break; 02207 case 2: 02208 dst[gmemPos] = 02209 (dptr[0] * (70.0F / 247.0F) + 02210 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02211 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02212 dptr[sw3] * ( 8.0F / 247.0F) + 02213 dptr[sw4] * ( 1.0F / 247.0F) 02214 ); 02215 break; 02216 case 3: 02217 dst[gmemPos] = 02218 (dptr[0] * (70.0F / 255.0F) + 02219 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02220 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02221 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02222 dptr[sw4] * ( 1.0F / 255.0F) 02223 ); 02224 break; 02225 } 02226 } 02227 else if(sy < 4 && sts+sy<h-4) // If not top 4 in the whole image, are we in the top 4 rows of this tile 02228 { 02229 switch(sy) 02230 { 02231 case 0: 02232 dst[gmemPos] = 02233 (dptr[0] * (70.0F / 256.0F) + 02234 (border[bn1] + dptr[sw]) * (56.0F / 256.0F) + 02235 (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) + 02236 (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) + 02237 (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F) 02238 ); 02239 break; 02240 case 1: 02241 dst[gmemPos] = 02242 (dptr[0] * (70.0F / 256.0F) + 02243 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02244 (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) + 02245 (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) + 02246 (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F) 02247 ); 02248 break; 02249 case 2: 02250 dst[gmemPos] = 02251 (dptr[0] * (70.0F / 256.0F) + 02252 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02253 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 02254 (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) + 02255 (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F) 02256 ); 02257 break; 02258 case 3: 02259 dst[gmemPos] = 02260 (dptr[0] * (70.0F / 256.0F) + 02261 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02262 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 02263 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 02264 (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F) 02265 ); 02266 break; 02267 } 02268 } 02269 else if(sy >= 4 && sy <tile_height-4 && sts+sy<h-4) // Are we in the middle of the tile 02270 { 02271 dst[gmemPos] = 02272 ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) + 02273 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 02274 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 02275 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02276 dptr[0] * (70.0F / 256.0F) 02277 ); 02278 } 02279 else if(sy >= 4 && sts + sy < h-4) // Are we not at the bottom of the image, but bottom 4 of the tile 02280 { 02281 switch(tile_height-sy) 02282 { 02283 case 4: 02284 dst[gmemPos] = 02285 (dptr[0] * (70.0F / 256.0F) + 02286 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02287 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 02288 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) + 02289 (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F) 02290 ); 02291 break; 02292 case 3: 02293 dst[gmemPos] = 02294 (dptr[0] * (70.0F / 256.0F) + 02295 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02296 (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) + 02297 (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) + 02298 (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F) 02299 ); 02300 break; 02301 case 2: 02302 dst[gmemPos] = 02303 (dptr[0] * (70.0F / 256.0F) + 02304 (dptr[nsw] + dptr[sw]) * (56.0F / 256.0F) + 02305 (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) + 02306 (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) + 02307 (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F) 02308 ); 02309 break; 02310 case 1: 02311 dst[gmemPos] = 02312 (dptr[0] * (70.0F / 256.0F) + 02313 (dptr[nsw] + border[bp1]) * (56.0F / 256.0F) + 02314 (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) + 02315 (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) + 02316 (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F) 02317 ); 02318 break; 02319 } 02320 } 02321 else if(sy < 4 && sts + sy >= h-4) // We are in the top of a tile close to the bottom of the image 02322 { 02323 switch(sy) 02324 { 02325 case 0: 02326 switch(h-(sts+sy)) 02327 { 02328 case 4: 02329 dst[gmemPos] = 02330 (dptr[0] * (70.0F / 255.0F) + 02331 (border[bn1] + dptr[sw]) * (56.0F / 255.0F) + 02332 (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) + 02333 (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02334 border[bn4] * ( 1.0F / 255.0F) 02335 ); 02336 break; 02337 case 3: 02338 dst[gmemPos] = 02339 (dptr[0] * (70.0F / 247.0F) + 02340 (border[bn1] + dptr[sw]) * (56.0F / 247.0F) + 02341 (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) + 02342 border[bn3] * ( 8.0F / 247.0F) + 02343 border[bn4] * ( 1.0F / 247.0F) 02344 ); 02345 break; 02346 case 2: 02347 dst[gmemPos] = 02348 (dptr[0] * (70.0F / 219.0F) + 02349 (border[bn1] + dptr[sw]) * (56.0F / 219.0F) + 02350 border[bn2] * (28.0F / 219.0F) + 02351 border[bn3] * ( 8.0F / 219.0F) + 02352 border[bn4] * ( 1.0F / 219.0F) 02353 ); 02354 break; 02355 case 1: 02356 dst[gmemPos] = 02357 (dptr[0] * (70.0F / 163.0F) + 02358 border[bn1] * (56.0F / 163.0F) + 02359 border[bn2] * (28.0F / 163.0F) + 02360 border[bn3] * ( 8.0F / 163.0F) + 02361 border[bn4] * ( 1.0F / 163.0F) 02362 ); 02363 break; 02364 } 02365 break; 02366 case 1: 02367 switch(h-(sts+sy)) 02368 { 02369 case 4: 02370 dst[gmemPos] = 02371 (dptr[0] * (70.0F / 255.0F) + 02372 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02373 (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) + 02374 (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) + 02375 border[bn3] * ( 1.0F / 255.0F) 02376 ); 02377 break; 02378 case 3: 02379 dst[gmemPos] = 02380 (dptr[0] * (70.0F / 247.0F) + 02381 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02382 (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) + 02383 border[bn2] * ( 8.0F / 247.0F) + 02384 border[bn3] * ( 1.0F / 247.0F) 02385 ); 02386 break; 02387 case 2: 02388 dst[gmemPos] = 02389 (dptr[0] * (70.0F / 219.0F) + 02390 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02391 border[bn1] * (28.0F / 219.0F) + 02392 border[bn2] * ( 8.0F / 219.0F) + 02393 border[bn3] * ( 1.0F / 219.0F) 02394 ); 02395 break; 02396 case 1: 02397 dst[gmemPos] = 02398 (dptr[0] * (70.0F / 163.0F) + 02399 dptr[nsw] * (56.0F / 163.0F) + 02400 border[bn1] * (28.0F / 163.0F) + 02401 border[bn2] * ( 8.0F / 163.0F) + 02402 border[bn3] * ( 1.0F / 163.0F) 02403 ); 02404 break; 02405 } 02406 break; 02407 case 2: 02408 switch(h-(sts+sy)) 02409 { 02410 case 4: 02411 dst[gmemPos] = 02412 (dptr[0] * (70.0F / 255.0F) + 02413 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02414 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02415 (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) + 02416 border[bn2] * ( 1.0F / 255.0F) 02417 ); 02418 break; 02419 case 3: 02420 dst[gmemPos] = 02421 (dptr[0] * (70.0F / 247.0F) + 02422 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02423 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02424 border[bn1] * ( 8.0F / 247.0F) + 02425 border[bn2] * ( 1.0F / 247.0F) 02426 ); 02427 break; 02428 case 2: 02429 dst[gmemPos] = 02430 (dptr[0] * (70.0F / 219.0F) + 02431 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02432 dptr[nsw2] * (28.0F / 219.0F) + 02433 border[bn1] * ( 8.0F / 219.0F) + 02434 border[bn2] * ( 1.0F / 219.0F) 02435 ); 02436 break; 02437 case 1: 02438 dst[gmemPos] = 02439 (dptr[0] * (70.0F / 163.0F) + 02440 dptr[nsw] * (56.0F / 163.0F) + 02441 dptr[nsw2] * (28.0F / 163.0F) + 02442 border[bn1] * ( 8.0F / 163.0F) + 02443 border[bn2] * ( 1.0F / 163.0F) 02444 ); 02445 break; 02446 } 02447 break; 02448 case 3: 02449 switch(h-(sts+sy)) 02450 { 02451 case 4: 02452 dst[gmemPos] = 02453 (dptr[0] * (70.0F / 255.0F) + 02454 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02455 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02456 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02457 border[bn1] * ( 1.0F / 255.0F) 02458 ); 02459 break; 02460 case 3: 02461 dst[gmemPos] = 02462 (dptr[0] * (70.0F / 247.0F) + 02463 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02464 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02465 dptr[nsw3] * ( 8.0F / 247.0F) + 02466 border[bn1] * ( 1.0F / 247.0F) 02467 ); 02468 break; 02469 case 2: 02470 dst[gmemPos] = 02471 (dptr[0] * (70.0F / 219.0F) + 02472 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02473 dptr[nsw2] * (28.0F / 219.0F) + 02474 dptr[nsw3] * ( 8.0F / 219.0F) + 02475 border[bn1] * ( 1.0F / 219.0F) 02476 ); 02477 break; 02478 case 1: 02479 dst[gmemPos] = 02480 (dptr[0] * (70.0F / 163.0F) + 02481 dptr[nsw] * (56.0F / 163.0F) + 02482 dptr[nsw2] * (28.0F / 163.0F) + 02483 dptr[nsw3] * ( 8.0F / 163.0F) + 02484 border[bn1] * ( 1.0F / 163.0F) 02485 ); 02486 break; 02487 } 02488 break; 02489 } 02490 } 02491 else // We must be at the bottom 4 of the image 02492 { 02493 switch(h-(sts+sy)) 02494 { 02495 case 4: 02496 dst[gmemPos] = 02497 (dptr[0] * (70.0F / 255.0F) + 02498 (dptr[nsw] + dptr[sw]) * (56.0F / 255.0F) + 02499 (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) + 02500 (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) + 02501 dptr[nsw4] * ( 1.0F / 255.0F) 02502 ); 02503 break; 02504 case 3: 02505 dst[gmemPos] = 02506 (dptr[0] * (70.0F / 247.0F) + 02507 (dptr[nsw] + dptr[sw]) * (56.0F / 247.0F) + 02508 (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) + 02509 dptr[nsw3] * ( 8.0F / 247.0F) + 02510 dptr[nsw4] * ( 1.0F / 247.0F) 02511 ); 02512 break; 02513 case 2: 02514 dst[gmemPos] = 02515 (dptr[0] * (70.0F / 219.0F) + 02516 (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) + 02517 dptr[nsw2] * (28.0F / 219.0F) + 02518 dptr[nsw3] * ( 8.0F / 219.0F) + 02519 dptr[nsw4] * ( 1.0F / 219.0F) 02520 ); 02521 break; 02522 case 1: 02523 dst[gmemPos] = 02524 (dptr[0] * (70.0F / 163.0F) + 02525 dptr[nsw] * (56.0F / 163.0F) + 02526 dptr[nsw2] * (28.0F / 163.0F) + 02527 dptr[nsw3] * ( 8.0F / 163.0F) + 02528 dptr[nsw4] * ( 1.0F / 163.0F) 02529 ); 02530 break; 02531 } 02532 } 02533 } 02534 } 02535 02536 __global__ void cuda_global_lowpass_5_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 02537 { 02538 // Data cache in shared memory 02539 //__shared__ float data[CUDA_1D_TILE_W]; 02540 //__shared__ float border[4]; 02541 //tile_width=CUDA_1D_TILE_W; 02542 02543 // Save 4 pixels for the border 02544 float *data = (float *) shared_data; // size of tile_width 02545 float *border = (float *) &data[tile_width]; // size of 4 02546 02547 const int sx = threadIdx.x; // source pixel within source tile 02548 const int sts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 02549 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 02550 02551 // Load global memory values into our data cache: 02552 const int loadIdx = sts + sx; // index of one pixel value to load 02553 if (loadIdx < w) data[sx] = src[srs + loadIdx]; 02554 02555 // Load beginning border 02556 if (sx < 2 && sts > 0) 02557 border[sx] = src[srs + loadIdx - 2]; 02558 // Load ending border 02559 else if(sx >= tile_width-2 && sts+tile_width < w-2) 02560 border[4+sx-tile_width] = src[srs + sts + sx + 2]; 02561 02562 // Ensure the completness of loading stage because results emitted 02563 // by each thread depend on the data loaded by other threads: 02564 __syncthreads(); 02565 02566 02567 if ( loadIdx < w ) { 02568 const int writeIdx = sts + sx; // write index relative to row start 02569 const float *dptr = data + sx; 02570 // [ 1 4 (6) 4 1 ] / 16.0 02571 02572 // If we are smaller than the Gaussian filter we are using, special case this 02573 // this is not very efficient, just making sure it can handle small images 02574 if(w < 5) 02575 { 02576 int numAhead = max(0,min(w-1-(sts+sx),2)); 02577 int numBehind = max(0,min(sts+sx,2)); 02578 int situation = numBehind*10+numAhead; 02579 switch(situation) 02580 { 02581 case 0: // 00 02582 dst[srs + writeIdx] = dptr[0]; 02583 break; 02584 case 1: // 01 02585 dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) + 02586 dptr[1] * (4.0F / 10.0F)); 02587 break; 02588 case 2: // 02 02589 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 02590 dptr[1] * (4.0F / 11.0F) + 02591 dptr[2] * (1.0F / 11.0F)); 02592 break; 02593 case 10: 02594 dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) + 02595 dptr[-1] * (4.0F / 10.0F)); 02596 break; 02597 case 11: 02598 dst[srs + writeIdx] = (dptr[0] * (6.0F / 14.0F) + 02599 (dptr[-1] + dptr[1]) * (4.0F / 14.0F)); 02600 break; 02601 case 12: 02602 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 02603 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 02604 dptr[2] * (1.0F / 15.0F)); 02605 break; 02606 case 20: 02607 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 02608 dptr[-1] * (4.0F / 11.0F) + 02609 dptr[-2] * (1.0F / 11.0F)); 02610 break; 02611 case 21: 02612 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 02613 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 02614 dptr[-2] * (1.0F / 15.0F)); 02615 break; 02616 } 02617 } 02618 // First set of pixels in the row 02619 else if(sts+sx < 2) 02620 { 02621 switch(sx) 02622 { 02623 case 0: 02624 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 02625 dptr[1] * (4.0F / 11.0F) + 02626 dptr[2] * (1.0F / 11.0F)); 02627 break; 02628 case 1: 02629 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 02630 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 02631 dptr[2] * (1.0F / 15.0F)); 02632 break; 02633 } 02634 } 02635 // First two pixels in tile 02636 else if(sx < 2 && sts+sx < w-2) 02637 { 02638 switch(sx) 02639 { 02640 case 0: 02641 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 02642 (border[1] + dptr[1]) * (4.0F / 16.0F) + 02643 (border[0] + dptr[2]) * (1.0F / 16.0F)); 02644 break; 02645 case 1: 02646 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 02647 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 02648 (border[1] + dptr[2]) * (1.0F / 16.0F)); 02649 break; 02650 } 02651 } 02652 // In the middle of the tile 02653 else if(sx < tile_width-2 && sts+sx < w-2) 02654 { 02655 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 02656 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 02657 (dptr[-2] + dptr[2]) * (1.0F / 16.0F)); 02658 } 02659 // Last two pixels of the tile 02660 else if(sts+sx < w-2) 02661 { 02662 switch(tile_width-sx) 02663 { 02664 case 2: 02665 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 02666 (dptr[-1] + dptr[1]) * (4.0F / 16.0F) + 02667 (dptr[-2] + border[2]) * (1.0F / 16.0F)); 02668 break; 02669 case 1: 02670 dst[srs + writeIdx] = (dptr[0] * (6.0F / 16.0F) + 02671 (dptr[-1] + border[2]) * (4.0F / 16.0F) + 02672 (dptr[-2] + border[3]) * (1.0F / 16.0F)); 02673 break; 02674 } 02675 } 02676 // Last two pixels of the row 02677 else 02678 { 02679 switch(w-(sts+sx)) 02680 { 02681 case 2: 02682 dst[srs + writeIdx] = (dptr[0] * (6.0F / 15.0F) + 02683 (dptr[-1] + dptr[1]) * (4.0F / 15.0F) + 02684 (dptr[-2]) * (1.0F / 15.0F)); 02685 break; 02686 case 1: 02687 dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) + 02688 (dptr[-1]) * (4.0F / 11.0F) + 02689 (dptr[-2]) * (1.0F / 11.0F)); 02690 break; 02691 } 02692 } 02693 } 02694 } 02695 02696 __global__ void cuda_global_lowpass_5_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 02697 { 02698 // Data cache 02699 //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H]; 02700 //__shared__ float border[CUDA_TILE_W*4]; 02701 //const int tile_width=CUDA_TILE_W; 02702 //const int tile_height= CUDA_TILE_H; 02703 02704 // Save 4 rows for the border 02705 float *data = (float *) shared_data; //tile_width * tile_height size 02706 float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*4 02707 02708 const int sy = threadIdx.y; // source pixel row within source tile 02709 02710 const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows 02711 const int ste = sts + tile_height; // tile end for source, in rows 02712 02713 // Clamp tile and apron limits by image borders 02714 const int stec = min(ste, h); 02715 02716 // Current column index 02717 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; 02718 02719 int smemPos = IMUL(sy, tile_width) + threadIdx.x; 02720 int gmemPos = IMUL(sts + sy, w) + scs; 02721 02722 // only process columns that are actually within image bounds: 02723 if (scs < w && sts+sy < h) { 02724 // Shared and global (source) memory indices for current column 02725 02726 // Load data 02727 data[smemPos] = src[gmemPos]; 02728 02729 // Load border 02730 if (sy < 2 && gmemPos > IMUL(2,w)) 02731 border[smemPos] = src[gmemPos-IMUL(2,w)]; 02732 02733 int bordOff = 4+sy-tile_height; 02734 02735 if (sy >= tile_height-2 && ste+2 < h) 02736 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)]; 02737 02738 // Ensure the completness of loading stage because results emitted 02739 // by each thread depend on the data loaded by other threads: 02740 __syncthreads(); 02741 02742 // Cycle through the tile body, clamped by image borders 02743 // Calculate and output the results 02744 float *dptr = data + smemPos; 02745 const int sw = tile_width, sw2 = sw + sw; 02746 const int nsw = -sw, nsw2 = nsw - sw; 02747 const int bn2 = threadIdx.x, bn1 = bn2 + tile_width; 02748 const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width; 02749 02750 // [ 1 4 (6) 4 1 ] / 16 02751 02752 // If we are smaller than the Gaussian filter we are using, special case this 02753 // this is not very efficient, just making sure it can handle small images 02754 if(h < 5) 02755 { 02756 int numAhead = max(0,min(h-1-(sts+sy),2)); 02757 int numBehind = max(0,min(sts+sy,2)); 02758 int situation = numBehind*10+numAhead; 02759 switch(situation) 02760 { 02761 case 0: // 00 02762 dst[gmemPos] = dptr[0]; 02763 break; 02764 case 1: // 01 02765 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) + 02766 dptr[sw] * (4.0F / 10.0F)); 02767 break; 02768 case 2: // 02 02769 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) + 02770 dptr[sw] * (4.0F / 11.0F) + 02771 dptr[sw2] * (1.0F / 11.0F)); 02772 break; 02773 case 10: 02774 dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) + 02775 dptr[nsw] * (4.0F / 10.0F)); 02776 break; 02777 case 11: 02778 dst[gmemPos] = (dptr[0] * (6.0F / 14.0F) + 02779 (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F)); 02780 break; 02781 case 12: 02782 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) + 02783 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 02784 dptr[sw2] * (1.0F / 15.0F)); 02785 break; 02786 case 20: 02787 dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) + 02788 dptr[nsw] * (4.0F / 11.0F) + 02789 dptr[nsw2] * (1.0F / 11.0F)); 02790 break; 02791 case 21: 02792 dst[gmemPos] = (dptr[0] * (6.0F / 15.0F) + 02793 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 02794 dptr[nsw2] * (1.0F / 15.0F)); 02795 break; 02796 } 02797 } 02798 // Are we in the top 2 rows of the whole image 02799 else if(sts + sy < 2) 02800 { 02801 switch(sts+sy) 02802 { 02803 case 0: 02804 dst[gmemPos] = 02805 (dptr[0] * (6.0F / 11.0F) + 02806 dptr[sw] * (4.0F / 11.0F) + 02807 dptr[sw2] * (1.0F / 11.0F) 02808 ); 02809 break; 02810 case 1: 02811 dst[gmemPos] = 02812 (dptr[0] * (6.0F / 15.0F) + 02813 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 02814 dptr[sw2] * (1.0F / 15.0F) 02815 ); 02816 break; 02817 } 02818 } 02819 else if(sy < 2 && sts+sy<h-2) // If not top 2 in the whole image, are we in the top 2 rows of this tile 02820 { 02821 switch(sy) 02822 { 02823 case 0: 02824 dst[gmemPos] = 02825 (dptr[0] * (6.0F / 16.0F) + 02826 (border[bn1] + dptr[sw]) * (4.0F / 16.0F) + 02827 (border[bn2] + dptr[sw2]) * (1.0F / 16.0F) 02828 ); 02829 break; 02830 case 1: 02831 dst[gmemPos] = 02832 (dptr[0] * (6.0F / 16.0F) + 02833 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 02834 (border[bn1] + dptr[sw2]) * (1.0F / 16.0F) 02835 ); 02836 break; 02837 } 02838 } 02839 else if(sy <tile_height-2 && sts+sy<h-2) // Are we in the middle of the tile 02840 { 02841 dst[gmemPos] = 02842 ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) + 02843 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 02844 dptr[0] * (6.0F / 16.0F) 02845 ); 02846 } 02847 else if(sts + sy < h-2) // Are we not at the bottom of the image, but bottom 4 of the tile 02848 { 02849 switch(tile_height-sy) 02850 { 02851 case 2: 02852 dst[gmemPos] = 02853 (dptr[0] * (6.0F / 16.0F) + 02854 (dptr[nsw] + dptr[sw]) * (4.0F / 16.0F) + 02855 (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F) 02856 ); 02857 break; 02858 case 1: 02859 dst[gmemPos] = 02860 (dptr[0] * (6.0F / 16.0F) + 02861 (dptr[nsw] + border[bp1]) * (4.0F / 16.0F) + 02862 (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F) 02863 ); 02864 break; 02865 } 02866 } 02867 else // We must be at the bottom 4 of the image 02868 { 02869 switch(h-(sts+sy)) 02870 { 02871 case 2: 02872 dst[gmemPos] = 02873 (dptr[0] * (6.0F / 15.0F) + 02874 (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) + 02875 dptr[nsw2] * (1.0F / 15.0F) 02876 ); 02877 break; 02878 case 1: 02879 dst[gmemPos] = 02880 (dptr[0] * (6.0F / 11.0F) + 02881 dptr[nsw] * (4.0F / 11.0F) + 02882 dptr[nsw2] * (1.0F / 11.0F) 02883 ); 02884 break; 02885 } 02886 } 02887 02888 } 02889 02890 } 02891 02892 02893 02894 __global__ void cuda_global_lowpass_3_x(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 02895 { 02896 02897 // Save 2 pixels for the border 02898 float *data = (float *) shared_data; // size of tile_width 02899 float *border = (float *) &data[tile_width]; // size of 2 02900 02901 const int sx = threadIdx.x; // source pixel within source tile 02902 const int sts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 02903 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 02904 02905 // Load global memory values into our data cache: 02906 const int loadIdx = sts + sx; // index of one pixel value to load 02907 if (loadIdx < w) data[sx] = src[srs + loadIdx]; 02908 02909 // Load beginning border 02910 if (sx < 1 && sts > 0) 02911 border[sx] = src[srs + loadIdx - 1]; 02912 // Load ending border 02913 else if(sx >= tile_width-1 && sts+tile_width < w-1) 02914 border[2+sx-tile_width] = src[srs + sts + sx + 1]; 02915 02916 // Ensure the completness of loading stage because results emitted 02917 // by each thread depend on the data loaded by other threads: 02918 __syncthreads(); 02919 02920 02921 if ( loadIdx < w ) { 02922 const int writeIdx = sts + sx; // write index relative to row start 02923 const float *dptr = data + sx; 02924 // [ 1 (2) 1 ] / 4.0 02925 02926 // If we are smaller than the Gaussian filter we are using, special case this 02927 // this is not very efficient, just making sure it can handle small images 02928 if(w < 3) 02929 { 02930 int numAhead = max(0,min(w-1-(sts+sx),1)); 02931 int numBehind = max(0,min(sts+sx,1)); 02932 int situation = numBehind*10+numAhead; 02933 switch(situation) 02934 { 02935 case 0: // 00 02936 dst[srs + writeIdx] = dptr[0]; 02937 break; 02938 case 1: // 01 02939 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) + 02940 dptr[1] * (1.0F / 3.0F)); 02941 break; 02942 case 10: 02943 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) + 02944 dptr[-1] * (1.0F / 3.0F)); 02945 break; 02946 case 11: 02947 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) + 02948 (dptr[-1] + dptr[1]) * (1.0F / 4.0F)); 02949 break; 02950 } 02951 } 02952 // First set of pixels in the row 02953 else if(sts+sx < 1) 02954 { 02955 switch(sx) 02956 { 02957 case 0: 02958 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) + 02959 dptr[1] * (1.0F / 3.0F)); 02960 break; 02961 } 02962 } 02963 // First pixel in tile 02964 else if(sx < 1 && sts+sx < w-1) 02965 { 02966 switch(sx) 02967 { 02968 case 0: 02969 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) + 02970 (border[0] + dptr[1]) * (1.0F / 4.0F)); 02971 break; 02972 } 02973 } 02974 // In the middle of the tile 02975 else if(sx < tile_width-1 && sts+sx < w-1) 02976 { 02977 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) + 02978 (dptr[-1] + dptr[1]) * (1.0F / 4.0F)); 02979 } 02980 // Last pixel of the tile 02981 else if(sts+sx < w-1) 02982 { 02983 switch(tile_width-sx) 02984 { 02985 case 1: 02986 dst[srs + writeIdx] = (dptr[0] * (2.0F / 4.0F) + 02987 (dptr[-1] + border[1]) * (1.0F / 4.0F)); 02988 break; 02989 } 02990 } 02991 // Last pixel of the row 02992 else 02993 { 02994 switch(w-(sts+sx)) 02995 { 02996 case 1: 02997 dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) + 02998 (dptr[-1]) * (1.0F / 3.0F)); 02999 break; 03000 } 03001 } 03002 } 03003 } 03004 03005 __global__ void cuda_global_lowpass_3_y(const float *src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 03006 { 03007 // Data cache 03008 //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H]; 03009 //__shared__ float border[CUDA_TILE_W*4]; 03010 //const int tile_width=CUDA_TILE_W; 03011 //const int tile_height= CUDA_TILE_H; 03012 03013 // Save 2 rows for the border 03014 float *data = (float *) shared_data; //tile_width * tile_height size 03015 float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*2 03016 03017 const int sy = threadIdx.y; // source pixel row within source tile 03018 03019 const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows 03020 const int ste = sts + tile_height; // tile end for source, in rows 03021 03022 // Clamp tile and apron limits by image borders 03023 const int stec = min(ste, h); 03024 03025 // Current column index 03026 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; 03027 03028 // only process columns that are actually within image bounds: 03029 if (scs < w && sts+sy < h) { 03030 // Shared and global (source) memory indices for current column 03031 int smemPos = IMUL(sy, tile_width) + threadIdx.x; 03032 int gmemPos = IMUL(sts + sy, w) + scs; 03033 03034 // Load data 03035 data[smemPos] = src[gmemPos]; 03036 03037 // Load border 03038 if (sy < 1 && gmemPos > w) 03039 border[smemPos] = src[gmemPos-w]; 03040 03041 int bordOff = 2+sy-tile_height; 03042 03043 if (sy >= tile_height-1 && ste+1 < h) 03044 border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+w]; 03045 03046 // Ensure the completness of loading stage because results emitted 03047 // by each thread depend on the data loaded by other threads: 03048 __syncthreads(); 03049 03050 03051 // Cycle through the tile body, clamped by image borders 03052 // Calculate and output the results 03053 float *dptr = data + smemPos; 03054 const int sw = tile_width; 03055 const int nsw = -sw; 03056 const int bn1 = threadIdx.x; 03057 const int bp1 = bn1+tile_width; 03058 03059 // [ 1 (2) 1 ] / 4 03060 03061 // If we are smaller than the Gaussian filter we are using, special case this 03062 // this is not very efficient, just making sure it can handle small images 03063 if(h < 3) 03064 { 03065 int numAhead = max(0,min(h-1-(sts+sy),1)); 03066 int numBehind = max(0,min(sts+sy,1)); 03067 int situation = numBehind*10+numAhead; 03068 switch(situation) 03069 { 03070 case 0: // 00 03071 dst[gmemPos] = dptr[0]; 03072 break; 03073 case 1: // 01 03074 dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) + 03075 dptr[sw] * (1.0F / 3.0F)); 03076 break; 03077 case 10: 03078 dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) + 03079 dptr[nsw] * (1.0F / 3.0F)); 03080 break; 03081 case 11: 03082 dst[gmemPos] = (dptr[0] * (2.0F / 4.0F) + 03083 (dptr[nsw] + dptr[sw]) * (1.0F / 4.0F)); 03084 break; 03085 } 03086 } 03087 // Are we in the top row of the whole image 03088 else if(sts + sy < 1) 03089 { 03090 switch(sts+sy) 03091 { 03092 case 0: 03093 dst[gmemPos] = 03094 (dptr[0] * (2.0F / 3.0F) + 03095 dptr[sw] * (1.0F / 3.0F)); 03096 break; 03097 } 03098 } 03099 else if(sy < 1 && sts+sy<h-1) // If not top in the whole image, are we in the top row of this tile 03100 { 03101 switch(sy) 03102 { 03103 case 0: 03104 dst[gmemPos] = 03105 (dptr[0] * (2.0F / 4.0F) + 03106 (border[bn1] + dptr[sw]) * (1.0F / 4.0F)); 03107 break; 03108 } 03109 } 03110 else if(sy <tile_height-1 && sts+sy<h-1) // Are we in the middle of the tile 03111 { 03112 dst[gmemPos] = 03113 (dptr[0] * (2.0F / 4.0F) + 03114 (dptr[nsw] + dptr[sw]) * (1.0F / 4.0F)); 03115 } 03116 else if(sts + sy < h-1) // Are we not at the bottom of the image, but bottom row of the tile 03117 { 03118 switch(tile_height-sy) 03119 { 03120 case 1: 03121 dst[gmemPos] = 03122 (dptr[0] * (2.0F / 4.0F) + 03123 (dptr[nsw] + border[bp1]) * (1.0F / 4.0F)); 03124 break; 03125 } 03126 } 03127 else // We must be at the bottom row of the image 03128 { 03129 switch(h-(sts+sy)) 03130 { 03131 case 1: 03132 dst[gmemPos] = 03133 (dptr[0] * (2.0F / 3.0F) + 03134 dptr[nsw] * (1.0F / 3.0F)); 03135 break; 03136 } 03137 } 03138 03139 } 03140 03141 } 03142 03143 03144 #include "CUDA/cutil.h" 03145 03146 texture<float, 2, cudaReadModeElementType> texRef; 03147 03148 03149 __global__ void cuda_global_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height) 03150 { 03151 // w and h are from the original source image 03152 float *data = (float *) shared_data; //(tile_width+8) * tile_height size 03153 const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y; // Y position 03154 const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x; // index of one pixel value to load 03155 const int ySrcPos = yResPos; // Y position (unchanged in this function) 03156 const int xSrcPos = xResPos<<1; // X position 03157 const int ctrOff = 4; 03158 const int shrIdx = threadIdx.x+ctrOff; 03159 // [ 1 8 28 56 (70) 56 28 8 1 ] 03160 if(xResPos < dw && yResPos < dh) 03161 { 03162 data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos); 03163 if(threadIdx.x < ctrOff) 03164 data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos-ctrOff,ySrcPos); 03165 else if(threadIdx.x > tile_width - ctrOff) 03166 data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos+ctrOff,ySrcPos); 03167 __syncthreads(); 03168 dst[IMUL(yResPos,dw) + xResPos] = 03169 ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) + 03170 (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) + 03171 (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) + 03172 (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) + 03173 data[shrIdx] * (70.0F / 256.0F) 03174 ); 03175 } 03176 } 03177 03178 void cuda_c_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height) 03179 { 03180 //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); 03181 texRef.addressMode[0] = cudaAddressModeClamp; 03182 texRef.addressMode[1] = cudaAddressModeClamp; 03183 texRef.filterMode = cudaFilterModePoint; 03184 texRef.normalized = false; 03185 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat); 03186 CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float))); 03187 dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height)); 03188 dim3 threadBlockRows(tile_width, tile_height); 03189 cuda_global_lowpass_texture_9_x_dec_x<<<blockGridRows, threadBlockRows, (tile_width+8)*tile_height*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height); 03190 CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef)); 03191 } 03192 03193 __global__ void cuda_global_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height) 03194 { 03195 // w and h are from the original source image 03196 float *data = (float *) shared_data; //tile_width * (tile_height+8) size 03197 const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y; // Y position 03198 const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x; // index of one pixel value to load 03199 const int ySrcPos = yResPos<<1; // Y position 03200 const int xSrcPos = xResPos; // X position (unchanged in this function) 03201 const int ctrOff = 4; 03202 const int shrIdx = threadIdx.y+ctrOff; 03203 // [ 1 8 28 56 (70) 56 28 8 1 ] 03204 if(xResPos < dw && yResPos < dh) 03205 { 03206 data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos); 03207 if(threadIdx.y < ctrOff) 03208 data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos,ySrcPos-ctrOff); 03209 else if(threadIdx.y > tile_height - ctrOff) 03210 data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos,ySrcPos+ctrOff); 03211 __syncthreads(); 03212 dst[IMUL(yResPos,dw) + xResPos] = 03213 ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) + 03214 (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) + 03215 (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) + 03216 (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) + 03217 data[shrIdx] * (70.0F / 256.0F) 03218 ); 03219 } 03220 } 03221 03222 void cuda_c_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height) 03223 { 03224 //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); 03225 texRef.addressMode[0] = cudaAddressModeClamp; 03226 texRef.addressMode[1] = cudaAddressModeClamp; 03227 texRef.filterMode = cudaFilterModePoint; 03228 texRef.normalized = false; 03229 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat); 03230 CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float))); 03231 dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height)); 03232 dim3 threadBlockRows(tile_width, tile_height); 03233 cuda_global_lowpass_texture_9_y_dec_y<<<blockGridRows, threadBlockRows, tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height); 03234 CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef)); 03235 } 03236 03237 03238 #endif