cuda_lowpass.h

00001 /*!@file CUDA/cuda-lowpass.h CUDA/GPU optimized lowpass filter code */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_lowpass.h $
00035 // $Id: cuda_lowpass.h 12962 2010-03-06 02:13:53Z irock $
00036 //
00037 
00038 #ifndef CUDA_LOWPASS_H_DEFINED
00039 #define CUDA_LOWPASS_H_DEFINED
00040 
00041 #include <cuda.h>
00042 #include "CUDA/cutil.h"
00043 #include "cudadefs.h"
00044 
00045 ////////////////////////////////////////////////////////////////////////////////
00046 // Row convolution filter
00047 ////////////////////////////////////////////////////////////////////////////////
00048 __global__ void cuda_global_lowpass_5_x_dec_x(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width)
00049 {
00050   // Data cache in shared memory
00051   //__shared__ float data[CUDA_1D_TILE_W];
00052   //__shared__ float border[4];
00053   //tile_width=CUDA_1D_TILE_W;
00054 
00055   // Save 4 pixels for the border
00056   float *data = (float *) shared_data; // size of tile_width
00057   float *border = (float *) &data[tile_width]; // size of 4
00058 
00059   const int sx = threadIdx.x;                    // source pixel within source tile
00060   const int dx = (sx >> 1);                      // dest pixel within dest tile (decimated 2x)
00061   const int sts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
00062   const int dts = (sts >>1);                     // tile start for dest, relative to row start
00063   const int srs = IMUL(blockIdx.y, w);           // Row start index in source data
00064   const int drs = IMUL(blockIdx.y, (w >> 1));    // Row start index in dest data
00065 
00066   // Load global memory values into our data cache:
00067   const int loadIdx = sts + sx;  // index of one pixel value to load
00068   if (loadIdx < w) data[sx] = src[srs + loadIdx];
00069 
00070   // Load beginning border
00071   if (sx < 2 && sts > 0)
00072     border[sx] = src[srs + loadIdx - 2];
00073   // Load ending border
00074   else if(sx >= tile_width-2 && sts+tile_width < w-2)
00075     border[4+sx-tile_width] = src[srs + sts + sx + 2];
00076 
00077   const int ww = (w & 0xfffe); // evened-down source size
00078 
00079   // Ensure the completness of loading stage because results emitted
00080   // by each thread depend on the data loaded by other threads:
00081   __syncthreads();
00082 
00083   // only process every other pixel
00084   if ( (sx & 1) == 0 && loadIdx < ww) {
00085     const int writeIdx = dts + dx; // write index relative to row start
00086     const float *dptr = data + sx;
00087     // [ 1 4 (6) 4 1 ] / 16.0
00088 
00089     // If we are smaller than the Gaussian filter we are using, special case this
00090     // this is not very efficient, just making sure it can handle small images
00091     if(w < 5)
00092     {
00093       int numAhead = max(0,min(w-1-(sts+sx),2));
00094       int numBehind = max(0,min(sts+sx,2));
00095       int situation = numBehind*10+numAhead;
00096       switch(situation)
00097       {
00098       case 0: // 00
00099         dst[drs + writeIdx] = dptr[0];
00100         break;
00101       case 1: // 01
00102         dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
00103                                dptr[1] * (4.0F / 10.0F));
00104         break;
00105       case 2: // 02
00106         dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00107                                dptr[1] * (4.0F / 11.0F) +
00108                                dptr[2] * (1.0F / 11.0F));
00109         break;
00110       case 10:
00111         dst[drs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
00112                                dptr[-1] * (4.0F / 10.0F));
00113         break;
00114       case 11:
00115         dst[drs + writeIdx] = (dptr[0]              * (6.0F / 14.0F) +
00116                                (dptr[-1] + dptr[1]) * (4.0F / 14.0F));
00117         break;
00118       case 12:
00119         dst[drs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
00120                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00121                                dptr[2]              * (1.0F / 15.0F));
00122         break;
00123       case 20:
00124         dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00125                                dptr[-1] * (4.0F / 11.0F) +
00126                                dptr[-2] * (1.0F / 11.0F));
00127         break;
00128       case 21:
00129         dst[drs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
00130                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00131                                dptr[-2]              * (1.0F / 15.0F));
00132         break;
00133       }
00134     }
00135     // First set of pixels in the row
00136     else if(sts+sx < 2)
00137     {
00138       switch(sx)
00139       {
00140       case 0:
00141         dst[drs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
00142                                dptr[1] * (4.0F / 11.0F) +
00143                                dptr[2] * (1.0F / 11.0F));
00144         break;
00145       case 1:
00146         dst[drs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
00147                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00148                                dptr[2]              * (1.0F / 15.0F));
00149         break;
00150       }
00151     }
00152     // First two pixels in tile
00153     else if(sx < 2 && sts+sx < w-2)
00154     {
00155       switch(sx)
00156       {
00157       case 0:
00158         dst[drs + writeIdx] = (dptr[0]               * (6.0F / 16.0F) +
00159                                (border[1] + dptr[1]) * (4.0F / 16.0F) +
00160                                (border[0] + dptr[2]) * (1.0F / 16.0F));
00161         break;
00162       case 1:
00163         dst[drs + writeIdx] = (dptr[0]               * (6.0F / 16.0F) +
00164                                (dptr[-1] + dptr[1])  * (4.0F / 16.0F) +
00165                                (border[1] + dptr[2]) * (1.0F / 16.0F));
00166         break;
00167       }
00168     }
00169     // In the middle of the tile
00170     else if(sx < tile_width-2 && sts+sx < w-2)
00171     {
00172       dst[drs + writeIdx] = (dptr[0]              * (6.0F / 16.0F) +
00173                              (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
00174                              (dptr[-2] + dptr[2]) * (1.0F / 16.0F));
00175     }
00176     // Last two pixels of the tile
00177     else if(sts+sx < w-2)
00178     {
00179       switch(tile_width-sx)
00180       {
00181       case 2:
00182         dst[drs + writeIdx] = (dptr[0]                * (6.0F / 16.0F) +
00183                                (dptr[-1] + dptr[1])   * (4.0F / 16.0F) +
00184                                (dptr[-2] + border[2]) * (1.0F / 16.0F));
00185         break;
00186       case 1:
00187         dst[drs + writeIdx] = (dptr[0]                * (6.0F / 16.0F) +
00188                                (dptr[-1] + border[2]) * (4.0F / 16.0F) +
00189                                (dptr[-2] + border[3]) * (1.0F / 16.0F));
00190         break;
00191       }
00192     }
00193     // Last two pixels of the row
00194     else
00195     {
00196       switch(w-(sts+sx))
00197       {
00198       case 2:
00199         dst[drs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
00200                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
00201                                (dptr[-2])           * (1.0F / 15.0F));
00202         break;
00203       case 1:
00204         dst[drs + writeIdx] = (dptr[0]    * (6.0F / 11.0F) +
00205                                (dptr[-1]) * (4.0F / 11.0F) +
00206                                (dptr[-2]) * (1.0F / 11.0F));
00207         break;
00208       }
00209     }
00210   }
00211 }
00212 
00213 ////////////////////////////////////////////////////////////////////////////////
00214 // Column convolution filter
00215 ////////////////////////////////////////////////////////////////////////////////
00216 __global__ void cuda_global_lowpass_5_y_dec_y(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00217 {
00218   // Data cache
00219   //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H];
00220   //__shared__ float border[CUDA_TILE_W*4];
00221   //const int tile_width=CUDA_TILE_W;
00222   //const int tile_height= CUDA_TILE_H;
00223 
00224   // Save 4 rows for the border
00225   float *data = (float *) shared_data; //tile_width * tile_height size
00226   float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*4
00227 
00228   const int sy = threadIdx.y; // source pixel row within source tile
00229   const int dy = (sy >> 1);   // dest pixel row within dest tile (decimated 2x)
00230 
00231   const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows
00232   const int ste = sts + tile_height; // tile end for source, in rows
00233 
00234   const int dts = (sts >> 1);
00235   const int dte = (ste >> 1);
00236 
00237   // Clamp tile and apron limits by image borders
00238   const int stec = min(ste, h);
00239   const int dtec = min(dte, (h >> 1));
00240 
00241   // Current column index
00242   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00243   const int dcs = scs;
00244 
00245   // only process columns that are actually within image bounds:
00246   if (scs < w) {
00247     // Shared and global (source) memory indices for current column
00248     int smemPos = IMUL(sy, tile_width) + threadIdx.x;
00249     int gmemPos = IMUL(sts + sy, w) + scs;
00250 
00251     // Load data
00252     data[smemPos] = src[gmemPos];
00253 
00254     // Load border
00255     if (sy < 2 && gmemPos > IMUL(2,w))
00256       border[smemPos] = src[gmemPos-IMUL(2,w)];
00257 
00258     int bordOff = 4+sy-tile_height;
00259 
00260     if (sy >= tile_height-2 && ste+2 < h)
00261       border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)];
00262 
00263     // Ensure the completness of loading stage because results emitted
00264     // by each thread depend on the data loaded by other threads:
00265     __syncthreads();
00266 
00267     // only process every other row
00268     if ((sy & 1) == 0) {
00269       // Shared and global (destination) memory indices for current column
00270       smemPos = IMUL(sy, tile_width) + threadIdx.x;
00271       gmemPos = IMUL(dts + dy, w) + dcs;
00272 
00273       // Cycle through the tile body, clamped by image borders
00274       // Calculate and output the results
00275       float *dptr = data + smemPos;
00276       const int sw = tile_width, sw2 = sw + sw;
00277       const int nsw = -sw, nsw2 = nsw - sw;
00278       const int bn2 = threadIdx.x, bn1 = bn2 + tile_width;
00279       const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width;
00280 
00281       //  [ 1 4 (6) 4 1 ] / 16
00282 
00283     // If we are smaller than the Gaussian filter we are using, special case this
00284     // this is not very efficient, just making sure it can handle small images
00285       if(h < 5)
00286       {
00287         int numAhead = max(0,min(h-1-(sts+sy),2));
00288         int numBehind = max(0,min(sts+sy,2));
00289         int situation = numBehind*10+numAhead;
00290         switch(situation)
00291         {
00292         case 0: // 00
00293           dst[gmemPos] = dptr[0];
00294           break;
00295         case 1: // 01
00296           dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
00297                           dptr[sw] * (4.0F / 10.0F));
00298           break;
00299         case 2: // 02
00300           dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
00301                           dptr[sw] * (4.0F / 11.0F) +
00302                           dptr[sw2] * (1.0F / 11.0F));
00303           break;
00304         case 10:
00305           dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
00306                           dptr[nsw] * (4.0F / 10.0F));
00307           break;
00308         case 11:
00309           dst[gmemPos] = (dptr[0]              * (6.0F / 14.0F) +
00310                           (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F));
00311           break;
00312         case 12:
00313           dst[gmemPos] = (dptr[0]              * (6.0F / 15.0F) +
00314                           (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00315                           dptr[sw2]              * (1.0F / 15.0F));
00316           break;
00317         case 20:
00318           dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
00319                           dptr[nsw] * (4.0F / 11.0F) +
00320                           dptr[nsw2] * (1.0F / 11.0F));
00321           break;
00322         case 21:
00323           dst[gmemPos] = (dptr[0]              * (6.0F / 15.0F) +
00324                           (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00325                           dptr[nsw2]              * (1.0F / 15.0F));
00326           break;
00327         }
00328       }
00329       // Are we in the top 2 rows of the whole image
00330       else if(sts + sy < 2)
00331       {
00332         switch(sts+sy)
00333         {
00334         case 0:
00335           dst[gmemPos] =
00336             (dptr[0]   * (6.0F / 11.0F) +
00337              dptr[sw]  * (4.0F / 11.0F) +
00338              dptr[sw2] * (1.0F / 11.0F)
00339              );
00340           break;
00341         case 1:
00342           dst[gmemPos] =
00343             (dptr[0]                * (6.0F / 15.0F) +
00344              (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00345              dptr[sw2]              * (1.0F / 15.0F)
00346              );
00347           break;
00348         }
00349       }
00350       else if(sy < 2 && sts+sy<h-2) // If not top 2 in the whole image, are we in the top 2 rows of this tile
00351       {
00352         switch(sy)
00353         {
00354         case 0:
00355           dst[gmemPos] =
00356             (dptr[0]                   * (6.0F / 16.0F) +
00357              (border[bn1] + dptr[sw])  * (4.0F / 16.0F) +
00358              (border[bn2] + dptr[sw2]) * (1.0F / 16.0F)
00359              );
00360           break;
00361         case 1:
00362           dst[gmemPos] =
00363             (dptr[0]                   * (6.0F / 16.0F) +
00364              (dptr[nsw] + dptr[sw])    * (4.0F / 16.0F) +
00365              (border[bn1] + dptr[sw2]) * (1.0F / 16.0F)
00366              );
00367           break;
00368         }
00369       }
00370       else if(sy <tile_height-2 && sts+sy<h-2) // Are we in the middle of the tile
00371       {
00372         dst[gmemPos] =
00373           ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) +
00374            (dptr[nsw] + dptr[sw])   * (4.0F / 16.0F) +
00375            dptr[0]                  * (6.0F / 16.0F)
00376            );
00377       }
00378       else if(sts + sy < h-2) // Are we not at the bottom of the image, but bottom 4 of the tile
00379       {
00380         switch(tile_height-sy)
00381         {
00382         case 2:
00383           dst[gmemPos] =
00384             (dptr[0]                    * (6.0F / 16.0F) +
00385              (dptr[nsw] + dptr[sw])     * (4.0F / 16.0F) +
00386              (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F)
00387              );
00388           break;
00389         case 1:
00390           dst[gmemPos] =
00391             (dptr[0]                    * (6.0F / 16.0F) +
00392              (dptr[nsw] +  border[bp1]) * (4.0F / 16.0F) +
00393              (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F)
00394              );
00395           break;
00396         }
00397       }
00398       else // We must be at the bottom 4 of the image
00399       {
00400         switch(h-(sts+sy))
00401           {
00402           case 2:
00403             dst[gmemPos] =
00404               (dptr[0]                * (6.0F / 15.0F) +
00405                (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
00406                dptr[nsw2]             * (1.0F / 15.0F)
00407                );
00408             break;
00409           case 1:
00410             dst[gmemPos] =
00411               (dptr[0]    * (6.0F / 11.0F) +
00412                dptr[nsw]  * (4.0F / 11.0F) +
00413                dptr[nsw2] * (1.0F / 11.0F)
00414                );
00415             break;
00416           }
00417       }
00418 
00419     }
00420   }
00421 }
00422 
00423 
00424 __global__ void cuda_global_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width)
00425 {
00426   // w and h are from the original source image
00427   const int src_tile_width = tile_width<<1; // Size of the tile used from the source data
00428   // Data cache in shared memory
00429   float *data = (float *) shared_data; // size of src_tile_width
00430   // Bordering data flanking this tile
00431   float *border = (float *) &data[src_tile_width]; // size of 8
00432   const int dx = threadIdx.x;                   // dest pixel within dest tile (decimated 2x)
00433   const int dts = IMUL(blockIdx.x, tile_width); // tile start for dest, relative to row start
00434   const int drs = IMUL(blockIdx.y,dw);           // Row start index in dest data
00435   const int sx = dx<<1;                         // source pixel within source tile
00436   const int sts = (dts <<1);                    // tile start for source, relative to row start
00437   const int srs = IMUL(blockIdx.y, w);   // Row start index in source data
00438 
00439   const int loadIdx = sts + sx; // index of one pixel value to load
00440   const int writeIdx = dts + dx; // index of one pixel value to load
00441   const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3;
00442   const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7;
00443   float *dptr = &data[sx];
00444 
00445   // Load border pixels
00446   if (dx < 4 && sts > 0) border[dx] = src[srs + sts - (4-dx)];
00447   if (dx >= tile_width-4 && sts+src_tile_width < w-4) border[4+dx-(tile_width-4)] = src[srs + sts + tile_width + dx + 4];
00448 
00449   // Load the row into shared memory among the thread block
00450   if (loadIdx < w)
00451     {
00452       data[sx] = src[srs + loadIdx];
00453       if(sx+1 < src_tile_width && loadIdx+1 < w)
00454           data[sx+1] = src[srs + loadIdx + 1];
00455     }
00456 
00457   // Ensure the completness of loading stage because results emitted
00458   // by each thread depend on the data loaded by other threads:
00459   __syncthreads();
00460 
00461   // [ 1 8 28 56 (70) 56 28 8 1 ]
00462 
00463 
00464 
00465 
00466   if(writeIdx < dw && blockIdx.y < dh)
00467     {
00468       /* if(true) */
00469       /*         { */
00470       /*           dst[drs+writeIdx] = dptr[0]; */
00471       /*         } */
00472       if(w < 9)
00473         {
00474           // This is not at all efficient, just here to ensure filter behaves properly in small image cases
00475           const int numAhead = max(0,min(w-1-(sts+sx),4));
00476           const int numBehind = max(0,min(sts+sx,4));
00477           const int situation = numBehind*10+numAhead;
00478 
00479           switch(situation)
00480             {
00481             case 0: // 00
00482               dst[drs+writeIdx] = dptr[0];
00483               break;
00484             case 2: // 02
00485               dst[drs+writeIdx] = (dptr[0] * (70.0F / 154.0F) +
00486                                   dptr[1]     * (56.0F / 154.0F) +
00487                                   dptr[2]     * (28.0F / 154.0F));
00488               break;
00489             case 4:
00490               dst[drs+writeIdx] = (dptr[0] * (70.0F / 163.0F) +
00491                                   dptr[1] * (56.0F / 163.0F) +
00492                                   dptr[2] * (28.0F / 163.0F) +
00493                                   dptr[3] * ( 8.0F / 163.0F) +
00494                                   dptr[4] * ( 1.0F / 163.0F));
00495               break;
00496             case 10:
00497               dst[drs+writeIdx] = (dptr[0]  * (70.0F / 126.0F) +
00498                                   dptr[-1] * (56.0F / 126.0F));
00499               break;
00500             case 12:
00501               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 210.0F) +
00502                                   (dptr[-1] + dptr[1]) * (56.0F / 210.0F) +
00503                                   dptr[2]              * (28.0F / 210.0F));
00504               break;
00505             case 14:
00506               dst[drs+writeIdx] = (dptr[0]               * (70.0F / 219.0F) +
00507                                   (dptr[-1] + dptr[1])  * (56.0F / 219.0F) +
00508                                   dptr[2]               * (28.0F / 219.0F) +
00509                                   dptr[3]               * ( 8.0F / 219.0F) +
00510                                   dptr[4]               * ( 1.0F / 219.0F));
00511               break;
00512             case 20:
00513               dst[drs+writeIdx] = (dptr[0]     * (70.0F / 154.0F) +
00514                                   dptr[-1]    * (56.0F / 154.0F) +
00515                                   dptr[-2]    * (28.0F / 154.0F));
00516               break;
00517             case 22:
00518               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 238.0F) +
00519                                   (dptr[-1] + dptr[1]) * (56.0F / 238.0F) +
00520                                   (dptr[-2] + dptr[2]) * (28.0F / 238.0F));
00521               break;
00522             case 24:
00523               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 247.0F) +
00524                                   (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00525                                   (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00526                                   dptr[3]              * ( 8.0F / 247.0F) +
00527                                   dptr[4]              * ( 1.0F / 247.0F));
00528               break;
00529             case 30:
00530               dst[drs+writeIdx] = (dptr[0]  * (70.0F / 162.0F) +
00531                                   dptr[-1] * (56.0F / 162.0F) +
00532                                   dptr[-2] * (28.0F / 162.0F) +
00533                                   dptr[-3] * ( 8.0F / 162.0F));
00534               break;
00535             case 32:
00536               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 246.0F) +
00537                                   (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
00538                                   (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
00539                                   dptr[-3]             * ( 8.0F / 246.0F));
00540               break;
00541             case 34:
00542               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 255.0F) +
00543                                   (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00544                                   (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00545                                   (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00546                                   dptr[4]              * ( 1.0F / 255.0F));
00547               break;
00548             case 40:
00549               dst[drs+writeIdx] = (dptr[0]  * (70.0F / 163.0F) +
00550                                   dptr[-1] * (56.0F / 163.0F) +
00551                                   dptr[-2] * (28.0F / 163.0F) +
00552                                   dptr[-3] * ( 8.0F / 163.0F) +
00553                                   dptr[-4] * ( 1.0F / 163.0F));
00554               break;
00555             case 42:
00556               dst[drs+writeIdx] = (dptr[0]              * (70.0F / 247.0F) +
00557                                   (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00558                                   (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00559                                   dptr[-3]             * ( 8.0F / 247.0F) +
00560                                   dptr[-4]             * ( 1.0F / 247.0F));
00561               break;
00562             }
00563         }
00564       // First part of source row, just reduce sample
00565       else if(sts+sx < 4)
00566         {
00567           switch(sx)
00568             {
00569             case 0:
00570               dst[drs + writeIdx] =
00571                 (dptr[0] * (70.0F / 163.0F) +
00572                  dptr[1] * (56.0F / 163.0F) +
00573                  dptr[2] * (28.0F / 163.0F) +
00574                  dptr[3] * ( 8.0F / 163.0F) +
00575                  dptr[4] * ( 1.0F / 163.0F)
00576                  );
00577               break;
00578             case 1:
00579               dst[drs + writeIdx] =
00580                 (dptr[0]              * (70.0F / 219.0F) +
00581                  (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
00582                  dptr[2]              * (28.0F / 219.0F) +
00583                  dptr[3]              * ( 8.0F / 219.0F) +
00584                  dptr[4]              * ( 1.0F / 219.0F)
00585                  );
00586               break;
00587             case 2:
00588               dst[drs + writeIdx] =
00589                 (dptr[0]              * (70.0F / 247.0F) +
00590                  (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00591                  (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00592                  dptr[3]              * ( 8.0F / 247.0F) +
00593                  dptr[4]              * ( 1.0F / 247.0F)
00594                  );
00595               break;
00596             case 3:
00597               dst[drs + writeIdx] =
00598                 (dptr[0]              * (70.0F / 255.0F) +
00599                  (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00600                  (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00601                  (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00602                  dptr[4]              * ( 1.0F / 255.0F)
00603                  );
00604               break;
00605             default:
00606               //LERROR();
00607               break;
00608             }
00609         }
00610       // If not the first part of the source row, but is the first bit of this tile, use border
00611       else if(sx < 4 && sts+sx < w-4)
00612         {
00613           switch(sx)
00614             {
00615             case 0:
00616               dst[drs + writeIdx] =
00617                 ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) +
00618                  (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) +
00619                  (border[bn2] + dptr[2]) * (28.0F / 256.0F) +
00620                  (border[bn1] + dptr[1]) * (56.0F / 256.0F) +
00621                  dptr[0]                 * (70.0F / 256.0F)
00622                  );
00623               break;
00624             case 1:
00625               dst[drs + writeIdx] =
00626                 ((border[bn3] + dptr[4])   * ( 1.0F / 256.0F) +
00627                  (border[bn2] + dptr[3])   * ( 8.0F / 256.0F) +
00628                  (border[bn1] + dptr[2])   * (28.0F / 256.0F) +
00629                  (dptr[-1] + dptr[1])      * (56.0F / 256.0F) +
00630                  dptr[0]                   * (70.0F / 256.0F)
00631                  );
00632               break;
00633             case 2:
00634               dst[drs + writeIdx] =
00635                 ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) +
00636                  (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) +
00637                  (dptr[-2] + dptr[2])    * (28.0F / 256.0F) +
00638                  (dptr[-1] + dptr[1])    * (56.0F / 256.0F) +
00639                  dptr[0]                 * (70.0F / 256.0F)
00640                  );
00641               break;
00642             case 3:
00643               dst[drs + writeIdx] =
00644                 ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) +
00645                  (dptr[-3] + dptr[3])    * ( 8.0F / 256.0F) +
00646                  (dptr[-2] + dptr[2])    * (28.0F / 256.0F) +
00647                  (dptr[-1] + dptr[1])    * (56.0F / 256.0F) +
00648                  dptr[0]                 * (70.0F / 256.0F)
00649                  );
00650               break;
00651             }
00652         }
00653       // If we are not near the edge of this tile, do standard way
00654       else if(sx>=4 && sx<src_tile_width-4 && sts+sx < w-4)
00655         {
00656           dst[drs + writeIdx] =
00657             ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) +
00658              (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
00659              (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
00660              (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
00661              dptr[0]              * (70.0F / 256.0F)
00662              );
00663         }
00664       // If not the last part of the source row, but in the last bit of this tile, use border
00665       else if(sx>=4 && sts+sx < w-4)
00666         {
00667           switch(src_tile_width-sx)
00668             {
00669             case 4:
00670               dst[drs + writeIdx] =
00671                 ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) +
00672                  (dptr[-3] + dptr[3])     * ( 8.0F / 256.0F) +
00673                  (dptr[-2] + dptr[2])     * (28.0F / 256.0F) +
00674                  (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
00675                  dptr[0]                  * (70.0F / 256.0F)
00676                  );
00677               break;
00678             case 3:
00679               dst[drs + writeIdx] =
00680                 ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) +
00681                  (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) +
00682                  (dptr[-2] + dptr[2])     * (28.0F / 256.0F) +
00683                  (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
00684                  dptr[0]                  * (70.0F / 256.0F)
00685                  );
00686               break;
00687             case 2:
00688               dst[drs + writeIdx] =
00689                 ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) +
00690                  (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) +
00691                  (dptr[-2] + border[bp1]) * (28.0F / 256.0F) +
00692                  (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
00693                  dptr[0]                  * (70.0F / 256.0F)
00694                  );
00695               break;
00696             case 1:
00697               dst[drs + writeIdx] =
00698                 ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) +
00699                  (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) +
00700                  (dptr[-2] + border[bp2]) * (28.0F / 256.0F) +
00701                  (dptr[-1] + border[bp1]) * (56.0F / 256.0F) +
00702                  dptr[0]                  * (70.0F / 256.0F)
00703                  );
00704               break;
00705             }
00706         }
00707       else if(sx < 4 && sts + sx >= w-4) // We are in the beginning of a tile close to the end of the image
00708         {
00709           switch(sx)
00710             {
00711             case 0:
00712               switch(w-(sts+sx))
00713                 {
00714                 case 4:
00715                   dst[drs + writeIdx] =
00716                     (dptr[0]                 * (70.0F / 255.0F) +
00717                      (border[bn1] + dptr[1]) * (56.0F / 255.0F) +
00718                      (border[bn2] + dptr[2]) * (28.0F / 255.0F) +
00719                      (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) +
00720                      border[bn4]             * ( 1.0F / 255.0F)
00721                      );
00722                   break;
00723                 case 3:
00724                   dst[drs + writeIdx] =
00725                     (dptr[0]                 * (70.0F / 247.0F) +
00726                      (border[bn1] + dptr[1]) * (56.0F / 247.0F) +
00727                      (border[bn2] + dptr[2]) * (28.0F / 247.0F) +
00728                      border[bn3]             * ( 8.0F / 247.0F) +
00729                      border[bn4]             * ( 1.0F / 247.0F)
00730                      );
00731                   break;
00732                 case 2:
00733                   dst[drs + writeIdx] =
00734                     (dptr[0]                 * (70.0F / 219.0F) +
00735                      (border[bn1] + dptr[1]) * (56.0F / 219.0F) +
00736                      border[bn2]             * (28.0F / 219.0F) +
00737                      border[bn3]             * ( 8.0F / 219.0F) +
00738                      border[bn4]             * ( 1.0F / 219.0F)
00739                      );
00740                   break;
00741                 case 1:
00742                   dst[drs + writeIdx] =
00743                     (dptr[0]     * (70.0F / 163.0F) +
00744                      border[bn1] * (56.0F / 163.0F) +
00745                      border[bn2] * (28.0F / 163.0F) +
00746                      border[bn3] * ( 8.0F / 163.0F) +
00747                      border[bn4] * ( 1.0F / 163.0F)
00748                      );
00749                   break;
00750                 }
00751               break;
00752             case 1:
00753               switch(w-(sts+sx))
00754                 {
00755                 case 4:
00756                   dst[drs + writeIdx] =
00757                     (dptr[0]                 * (70.0F / 255.0F) +
00758                      (dptr[1] + dptr[1])     * (56.0F / 255.0F) +
00759                      (border[bn1] + dptr[2]) * (28.0F / 255.0F) +
00760                      (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) +
00761                      border[bn3]             * ( 1.0F / 255.0F)
00762                      );
00763                   break;
00764                 case 3:
00765                   dst[drs + writeIdx] =
00766                     (dptr[0]                 * (70.0F / 247.0F) +
00767                      (dptr[1] + dptr[1])     * (56.0F / 247.0F) +
00768                      (border[bn1] + dptr[2]) * (28.0F / 247.0F) +
00769                      border[bn2]             * ( 8.0F / 247.0F) +
00770                      border[bn3]             * ( 1.0F / 247.0F)
00771                      );
00772                   break;
00773                 case 2:
00774                   dst[drs + writeIdx] =
00775                     (dptr[0]                * (70.0F / 219.0F) +
00776                      (dptr[1] + dptr[1])    * (56.0F / 219.0F) +
00777                      border[bn1]            * (28.0F / 219.0F) +
00778                      border[bn2]            * ( 8.0F / 219.0F) +
00779                      border[bn3]            * ( 1.0F / 219.0F)
00780                      );
00781                   break;
00782                 case 1:
00783                   dst[drs + writeIdx] =
00784                     (dptr[0]     * (70.0F / 163.0F) +
00785                      dptr[1]     * (56.0F / 163.0F) +
00786                      border[bn1] * (28.0F / 163.0F) +
00787                      border[bn2] * ( 8.0F / 163.0F) +
00788                      border[bn3] * ( 1.0F / 163.0F)
00789                      );
00790                   break;
00791                 }
00792               break;
00793             case 2:
00794               switch(w-(sts+sx))
00795                 {
00796                 case 4:
00797                   dst[drs + writeIdx] =
00798                     (dptr[0]                  * (70.0F / 255.0F) +
00799                      (dptr[1] + dptr[1])      * (56.0F / 255.0F) +
00800                      (dptr[2] + dptr[2])      * (28.0F / 255.0F) +
00801                      (border[bn1] + dptr[3])  * ( 8.0F / 255.0F) +
00802                      border[bn2]              * ( 1.0F / 255.0F)
00803                      );
00804                   break;
00805                 case 3:
00806                   dst[drs + writeIdx] =
00807                     (dptr[0]                 * (70.0F / 247.0F) +
00808                      (dptr[1] + dptr[1])     * (56.0F / 247.0F) +
00809                      (dptr[2] + dptr[2])     * (28.0F / 247.0F) +
00810                      border[bn1]             * ( 8.0F / 247.0F) +
00811                      border[bn2]             * ( 1.0F / 247.0F)
00812                      );
00813                   break;
00814                 case 2:
00815                   dst[drs + writeIdx] =
00816                     (dptr[0]                * (70.0F / 219.0F) +
00817                      (dptr[1] + dptr[1])    * (56.0F / 219.0F) +
00818                      dptr[2]                * (28.0F / 219.0F) +
00819                      border[bn1]            * ( 8.0F / 219.0F) +
00820                      border[bn2]            * ( 1.0F / 219.0F)
00821                      );
00822                   break;
00823                 case 1:
00824                   dst[drs + writeIdx] =
00825                     (dptr[0]          * (70.0F / 163.0F) +
00826                      dptr[1]          * (56.0F / 163.0F) +
00827                      dptr[2]          * (28.0F / 163.0F) +
00828                      border[bn1]      * ( 8.0F / 163.0F) +
00829                      border[bn2]      * ( 1.0F / 163.0F)
00830                      );
00831                   break;
00832                 }
00833               break;
00834             case 3:
00835               switch(w-(sts+sx))
00836                 {
00837                 case 4:
00838                   dst[drs + writeIdx] =
00839                     (dptr[0]               * (70.0F / 255.0F) +
00840                      (dptr[1] + dptr[1])   * (56.0F / 255.0F) +
00841                      (dptr[2] + dptr[2])   * (28.0F / 255.0F) +
00842                      (dptr[3] + dptr[3])   * ( 8.0F / 255.0F) +
00843                      border[bn1]           * ( 1.0F / 255.0F)
00844                      );
00845                   break;
00846                 case 3:
00847                   dst[drs + writeIdx] =
00848                     (dptr[0]               * (70.0F / 247.0F) +
00849                      (dptr[1] + dptr[1])   * (56.0F / 247.0F) +
00850                      (dptr[2] + dptr[2])   * (28.0F / 247.0F) +
00851                      dptr[3]               * ( 8.0F / 247.0F) +
00852                      border[bn1]           * ( 1.0F / 247.0F)
00853                      );
00854                   break;
00855                 case 2:
00856                   dst[drs + writeIdx] =
00857                     (dptr[0]             * (70.0F / 219.0F) +
00858                      (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
00859                      dptr[2]             * (28.0F / 219.0F) +
00860                      dptr[3]             * ( 8.0F / 219.0F) +
00861                      border[bn1]         * ( 1.0F / 219.0F)
00862                      );
00863                   break;
00864                 case 1:
00865                   dst[drs + writeIdx] =
00866                     (dptr[0]     * (70.0F / 163.0F) +
00867                      dptr[1]     * (56.0F / 163.0F) +
00868                      dptr[2]     * (28.0F / 163.0F) +
00869                      dptr[3]     * ( 8.0F / 163.0F) +
00870                      border[bn1] * ( 1.0F / 163.0F)
00871                      );
00872                   break;
00873                 }
00874               break;
00875             }
00876         }
00877 
00878       // If in the last bit of the source row, reduce sample
00879       else if(sts + sx < w)
00880         {
00881           switch(w-(sts+sx))
00882             {
00883             case 4:
00884               dst[drs + writeIdx] =
00885                 (dptr[-4]             * ( 1.0F / 255.0F) +
00886                  (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
00887                  (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
00888                  (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
00889                  dptr[0]              * (70.0F / 255.0F)
00890                  );
00891               break;
00892             case 3:
00893               dst[drs + writeIdx] =
00894                 (dptr[-4]             * ( 1.0F / 247.0F) +
00895                  dptr[-3]             * ( 8.0F / 247.0F) +
00896                  (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
00897                  (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
00898                  dptr[0]              * (70.0F / 247.0F)
00899                  );
00900               break;
00901             case 2:
00902               dst[drs + writeIdx] =
00903                 (dptr[-4]             * ( 1.0F / 219.0F) +
00904                  dptr[-3]             * ( 8.0F / 219.0F) +
00905                  dptr[-2]             * (28.0F / 219.0F) +
00906                  (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
00907                  dptr[0]              * (70.0F / 219.0F)
00908                  );
00909               break;
00910             case 1:
00911               dst[drs + writeIdx] =
00912                 (dptr[-4]             * ( 1.0F / 163.0F) +
00913                  dptr[-3]             * ( 8.0F / 163.0F) +
00914                  dptr[-2]             * (28.0F / 163.0F) +
00915                  dptr[-1]             * (56.0F / 163.0F) +
00916                  dptr[ 0]             * (70.0F / 163.0F)
00917                  );
00918               break;
00919             }
00920         }
00921     }
00922 }
00923 
00924 // ######################################################################
00925 __global__ void cuda_global_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height)
00926 {
00927   // Data cache
00928   const int src_tile_height = tile_height<<1;
00929   float *data = (float *) shared_data; // size of tile_width * src_tile_height
00930   float *border = (float *) &data[tile_width*src_tile_height]; // size of tile_width * 8
00931 
00932   const int dy = threadIdx.y;                   // dest pixel within dest tile (decimated 2x)
00933   const int dts = IMUL(blockIdx.y, tile_height); // tile start for dest, relative to row start
00934   const int dcs = IMUL(blockIdx.x, tile_width) + threadIdx.x;  // Current column index in dest data
00935   const int sy = dy<<1;                         // source pixel within source tile
00936   const int sts = dts<<1;                       // tile start for source, in rows
00937   const int ste = sts + src_tile_height;        // tile end for source, in rows
00938   const int scs = dcs;                           // Current column index
00939 
00940   // Clamp tile and apron limits by image borders
00941   const int stec = min(ste, h);
00942 
00943 
00944   // Load the top border
00945   if (dy < 4 && sts+sy > 4)
00946     {
00947       border[IMUL(dy,tile_width)+threadIdx.x] = src[IMUL(sts+dy,w)+scs-IMUL(4,w)];
00948     }
00949   // Load the bottom border
00950   else if (dy >= tile_height-4)
00951     {
00952       int bordOff = 8+dy-tile_height;
00953       if(ste+4 <= h)
00954         //border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(4,w)];
00955         border[threadIdx.x+IMUL(bordOff,tile_width)] = src[IMUL(sts+dy+tile_height+4,w)+scs];
00956       else
00957         // This is explicitly handling the case where the second to last tile is so close to the edge of the image, that the post border
00958         // can't fit in the remaining space, maybe we should be handling this as yet another special case, but there are a lot of special
00959         // cases already in this function (handling smaller than 9 imgs, handling all cases of small last tile)
00960         border[threadIdx.x+IMUL(bordOff,tile_width)] = 0;
00961     }
00962 
00963 
00964   // only process columns that are actually within image bounds:
00965   if (scs < w && sts+sy < stec)
00966   {
00967     // Shared and global memory indices for current column
00968     int smemPos = IMUL(sy, tile_width) + threadIdx.x;
00969     int gmemPos = IMUL(sts + sy, w) + scs;
00970 
00971     // Each thread loads up to two pixels of data
00972     data[smemPos] = src[gmemPos];
00973     if(sts+sy+1 < stec)
00974       data[smemPos+tile_width] = src[gmemPos+w];
00975 
00976 
00977     // Ensure the completness of loading stage because results emitted
00978     // by each thread depend on the data loaded by other threads:
00979     __syncthreads();
00980 
00981     // Shared and global memory indices for current column
00982     smemPos = IMUL(sy, tile_width) + threadIdx.x;
00983     gmemPos = IMUL(dts + dy, dw) + dcs;
00984 
00985 
00986     // Setup the offsets to get to the correct smem points in the arrays for both the data and the border
00987     float *dptr = data + smemPos;
00988     const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw;
00989     const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw;
00990     const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width;
00991     const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width;
00992 
00993     if(h < 9)
00994     {
00995       // This is not at all efficient, just here to ensure filter behaves properly in small image cases
00996       const int numAhead = max(0,min(h-1-(sts+sy),4));
00997       const int numBehind = max(0,min(sts+sy,4));
00998       const int situation = numBehind*10+numAhead;
00999       switch(situation)
01000       {
01001       case 0: // 00
01002         dst[gmemPos] = dptr[0];
01003         break;
01004       case 1: // 01
01005         dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
01006                         dptr[sw] * (56.0F / 126.0F));
01007         break;
01008       case 2: // 02
01009         dst[gmemPos] = (dptr[0]   * (70.0F / 154.0F) +
01010                         dptr[sw]  * (56.0F / 154.0F) +
01011                         dptr[sw2] * (28.0F / 154.0F));
01012         break;
01013       case 3:
01014         dst[gmemPos] = (dptr[0]   * (70.0F / 162.0F) +
01015                         dptr[sw]  * (56.0F / 162.0F) +
01016                         dptr[sw2] * (28.0F / 162.0F) +
01017                         dptr[sw3] * ( 8.0F / 162.0F));
01018         break;
01019       case 4:
01020         dst[gmemPos] = (dptr[0]   * (70.0F / 163.0F) +
01021                         dptr[sw]  * (56.0F / 163.0F) +
01022                         dptr[sw2] * (28.0F / 163.0F) +
01023                         dptr[sw3] * ( 8.0F / 163.0F) +
01024                         dptr[sw4] * ( 1.0F / 163.0F));
01025         break;
01026       case 20:
01027         dst[gmemPos] = (dptr[0]    * (70.0F / 154.0F) +
01028                         dptr[nsw]  * (56.0F / 154.0F) +
01029                         dptr[nsw2] * (28.0F / 154.0F));
01030         break;
01031       case 21:
01032         dst[gmemPos] = (dptr[0]                 * (70.0F / 210.0F) +
01033                         (dptr[nsw] + dptr[sw])  * (56.0F / 210.0F) +
01034                         dptr[nsw2]              * (28.0F / 210.0F));
01035         break;
01036       case 22:
01037         dst[gmemPos] = (dptr[0]                  * (70.0F / 238.0F) +
01038                         (dptr[nsw] + dptr[sw])   * (56.0F / 238.0F) +
01039                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F));
01040         break;
01041       case 23:
01042         dst[gmemPos] = (dptr[0]                  * (70.0F / 246.0F) +
01043                         (dptr[nsw] + dptr[sw])   * (56.0F / 246.0F) +
01044                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
01045                         dptr[sw3]                * ( 8.0F / 246.0F));
01046         break;
01047       case 24:
01048         dst[gmemPos] = (dptr[0]                  * (70.0F / 247.0F) +
01049                         (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01050                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01051                         dptr[sw3]                * ( 8.0F / 247.0F) +
01052                         dptr[sw4]                * ( 1.0F / 247.0F));
01053         break;
01054       case 40:
01055         dst[gmemPos] = (dptr[0]    * (70.0F / 163.0F) +
01056                         dptr[nsw]  * (56.0F / 163.0F) +
01057                         dptr[nsw2] * (28.0F / 163.0F) +
01058                         dptr[nsw3] * ( 8.0F / 163.0F) +
01059                         dptr[nsw4] * ( 1.0F / 163.0F));
01060         break;
01061       case 41:
01062         dst[gmemPos] = (dptr[0]                 * (70.0F / 219.0F) +
01063                         (dptr[nsw] + dptr[sw])  * (56.0F / 219.0F) +
01064                         dptr[nsw2]              * (28.0F / 219.0F) +
01065                         dptr[nsw3]              * ( 8.0F / 219.0F) +
01066                         dptr[nsw4]              * ( 1.0F / 219.0F));
01067         break;
01068       case 42:
01069         dst[gmemPos] = (dptr[0]                  * (70.0F / 247.0F) +
01070                         (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01071                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01072                         dptr[nsw3]               * ( 8.0F / 247.0F) +
01073                         dptr[nsw4]               * ( 1.0F / 247.0F));
01074         break;
01075       case 43:
01076         dst[gmemPos] = (dptr[0]                  * (70.0F / 255.0F) +
01077                         (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01078                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01079                         (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01080                         dptr[nsw4]               * ( 1.0F / 255.0F));
01081         break;
01082       }
01083     }
01084     // Are we in the top 4 rows of the whole image
01085     else if(sts + sy < 4)
01086     {
01087       switch(sts+sy)
01088       {
01089       case 0:
01090         dst[gmemPos] =
01091           (dptr[0]   * (70.0F / 163.0F) +
01092            dptr[sw]  * (56.0F / 163.0F) +
01093            dptr[sw2] * (28.0F / 163.0F) +
01094            dptr[sw3] * ( 8.0F / 163.0F) +
01095            dptr[sw4] * ( 1.0F / 163.0F)
01096            );
01097         break;
01098       case 1:
01099         dst[gmemPos] =
01100           (dptr[0]                * (70.0F / 219.0F) +
01101            (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01102            dptr[sw2]              * (28.0F / 219.0F) +
01103            dptr[sw3]              * ( 8.0F / 219.0F) +
01104            dptr[sw4]              * ( 1.0F / 219.0F)
01105            );
01106         break;
01107       case 2:
01108         dst[gmemPos] =
01109           (dptr[0]                  * (70.0F / 247.0F) +
01110            (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01111            (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01112            dptr[sw3]                * ( 8.0F / 247.0F) +
01113            dptr[sw4]                * ( 1.0F / 247.0F)
01114            );
01115         break;
01116       case 3:
01117         dst[gmemPos] =
01118           (dptr[0]                  * (70.0F / 255.0F) +
01119            (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01120            (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01121            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01122            dptr[sw4]                * ( 1.0F / 255.0F)
01123            );
01124         break;
01125       }
01126     }
01127     else if(sy < 4 && sts+sy<h-4) // If not top 4 in the whole image, are we in the top 4 rows of this tile
01128     {
01129       switch(sy)
01130       {
01131       case 0:
01132         dst[gmemPos] =
01133           (dptr[0]                   * (70.0F / 256.0F) +
01134            (border[bn1] + dptr[sw])  * (56.0F / 256.0F) +
01135            (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) +
01136            (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01137            (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F)
01138            );
01139         break;
01140       case 1:
01141         dst[gmemPos] =
01142           (dptr[0]                   * (70.0F / 256.0F) +
01143            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
01144            (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) +
01145            (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) +
01146            (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F)
01147            );
01148         break;
01149       case 2:
01150         dst[gmemPos] =
01151           (dptr[0]                   * (70.0F / 256.0F) +
01152            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
01153            (dptr[nsw2] + dptr[sw2])  * (28.0F / 256.0F) +
01154            (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) +
01155            (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F)
01156            );
01157         break;
01158       case 3:
01159         dst[gmemPos] =
01160           (dptr[0]                   * (70.0F / 256.0F) +
01161            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
01162            (dptr[nsw2] + dptr[sw2])  * (28.0F / 256.0F) +
01163            (dptr[nsw3] + dptr[sw3])  * ( 8.0F / 256.0F) +
01164            (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F)
01165            );
01166         break;
01167       }
01168     }
01169     else if(sy >= 4 && sy <src_tile_height-4 && sts+sy<h-4) // Are we in the middle of the tile
01170     {
01171         dst[gmemPos] =
01172           ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) +
01173            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
01174            (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
01175            (dptr[nsw] + dptr[sw])   * (56.0F / 256.0F) +
01176            dptr[0]                  * (70.0F / 256.0F)
01177            );
01178     }
01179     else if(sy >= 4 && sts + sy < h-4) // Are we not at the bottom of the image, but bottom 4 of the tile
01180     {
01181       switch(src_tile_height-sy)
01182       {
01183       case 4:
01184         dst[gmemPos] =
01185           (dptr[0]                    * (70.0F / 256.0F) +
01186            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
01187            (dptr[nsw2] + dptr[sw2])   * (28.0F / 256.0F) +
01188            (dptr[nsw3] + dptr[sw3])   * ( 8.0F / 256.0F) +
01189            (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F)
01190            );
01191         break;
01192       case 3:
01193         dst[gmemPos] =
01194           (dptr[0]                    * (70.0F / 256.0F) +
01195            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
01196            (dptr[nsw2] + dptr[sw2])   * (28.0F / 256.0F) +
01197            (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) +
01198            (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F)
01199            );
01200         break;
01201       case 2:
01202         dst[gmemPos] =
01203           (dptr[0]                    * (70.0F / 256.0F) +
01204            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
01205            (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) +
01206            (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) +
01207            (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F)
01208            );
01209         break;
01210       case 1:
01211         dst[gmemPos] =
01212           (dptr[0]                    * (70.0F / 256.0F) +
01213            (dptr[nsw] +  border[bp1]) * (56.0F / 256.0F) +
01214            (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) +
01215            (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) +
01216            (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F)
01217            );
01218         break;
01219       }
01220     }
01221     else if(sy < 4 && sts + sy >= h-4) // We are in the top of a tile close to the bottom of the image
01222     {
01223       switch(sy)
01224       {
01225       case 0:
01226         switch(h-(sts+sy))
01227         {
01228           case 4:
01229             dst[gmemPos] =
01230               (dptr[0]                  * (70.0F / 255.0F) +
01231                (border[bn1] + dptr[sw])   * (56.0F / 255.0F) +
01232                (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) +
01233                (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01234                border[bn4]               * ( 1.0F / 255.0F)
01235                );
01236             break;
01237           case 3:
01238             dst[gmemPos] =
01239               (dptr[0]                  * (70.0F / 247.0F) +
01240                (border[bn1] + dptr[sw])   * (56.0F / 247.0F) +
01241                (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) +
01242                border[bn3]               * ( 8.0F / 247.0F) +
01243                border[bn4]               * ( 1.0F / 247.0F)
01244                );
01245             break;
01246           case 2:
01247             dst[gmemPos] =
01248               (dptr[0]                * (70.0F / 219.0F) +
01249                (border[bn1] + dptr[sw]) * (56.0F / 219.0F) +
01250                border[bn2]             * (28.0F / 219.0F) +
01251                border[bn3]             * ( 8.0F / 219.0F) +
01252                border[bn4]             * ( 1.0F / 219.0F)
01253                );
01254             break;
01255           case 1:
01256             dst[gmemPos] =
01257               (dptr[0]    * (70.0F / 163.0F) +
01258                border[bn1]  * (56.0F / 163.0F) +
01259                border[bn2] * (28.0F / 163.0F) +
01260                border[bn3] * ( 8.0F / 163.0F) +
01261                border[bn4] * ( 1.0F / 163.0F)
01262                );
01263             break;
01264         }
01265         break;
01266       case 1:
01267         switch(h-(sts+sy))
01268         {
01269           case 4:
01270             dst[gmemPos] =
01271               (dptr[0]                  * (70.0F / 255.0F) +
01272                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01273                (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) +
01274                (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) +
01275                border[bn3]               * ( 1.0F / 255.0F)
01276                );
01277             break;
01278           case 3:
01279             dst[gmemPos] =
01280               (dptr[0]                  * (70.0F / 247.0F) +
01281                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01282                (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) +
01283                border[bn2]               * ( 8.0F / 247.0F) +
01284                border[bn3]               * ( 1.0F / 247.0F)
01285                );
01286             break;
01287           case 2:
01288             dst[gmemPos] =
01289               (dptr[0]                * (70.0F / 219.0F) +
01290                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01291                border[bn1]             * (28.0F / 219.0F) +
01292                border[bn2]             * ( 8.0F / 219.0F) +
01293                border[bn3]             * ( 1.0F / 219.0F)
01294                );
01295             break;
01296           case 1:
01297             dst[gmemPos] =
01298               (dptr[0]    * (70.0F / 163.0F) +
01299                dptr[nsw]  * (56.0F / 163.0F) +
01300                border[bn1] * (28.0F / 163.0F) +
01301                border[bn2] * ( 8.0F / 163.0F) +
01302                border[bn3] * ( 1.0F / 163.0F)
01303                );
01304             break;
01305         }
01306         break;
01307       case 2:
01308         switch(h-(sts+sy))
01309         {
01310           case 4:
01311             dst[gmemPos] =
01312               (dptr[0]                  * (70.0F / 255.0F) +
01313                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01314                (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01315                (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) +
01316                border[bn2]               * ( 1.0F / 255.0F)
01317                );
01318             break;
01319           case 3:
01320             dst[gmemPos] =
01321               (dptr[0]                  * (70.0F / 247.0F) +
01322                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01323                (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01324                border[bn1]               * ( 8.0F / 247.0F) +
01325                border[bn2]               * ( 1.0F / 247.0F)
01326                );
01327             break;
01328           case 2:
01329             dst[gmemPos] =
01330               (dptr[0]                * (70.0F / 219.0F) +
01331                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01332                dptr[nsw2]             * (28.0F / 219.0F) +
01333                border[bn1]             * ( 8.0F / 219.0F) +
01334                border[bn2]             * ( 1.0F / 219.0F)
01335                );
01336             break;
01337           case 1:
01338             dst[gmemPos] =
01339               (dptr[0]    * (70.0F / 163.0F) +
01340                dptr[nsw]  * (56.0F / 163.0F) +
01341                dptr[nsw2] * (28.0F / 163.0F) +
01342                border[bn1] * ( 8.0F / 163.0F) +
01343                border[bn2] * ( 1.0F / 163.0F)
01344                );
01345             break;
01346         }
01347         break;
01348       case 3:
01349         switch(h-(sts+sy))
01350         {
01351           case 4:
01352             dst[gmemPos] =
01353               (dptr[0]                  * (70.0F / 255.0F) +
01354                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01355                (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01356                (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01357                border[bn1]               * ( 1.0F / 255.0F)
01358                );
01359             break;
01360           case 3:
01361             dst[gmemPos] =
01362               (dptr[0]                  * (70.0F / 247.0F) +
01363                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01364                (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01365                dptr[nsw3]               * ( 8.0F / 247.0F) +
01366                border[bn1]               * ( 1.0F / 247.0F)
01367                );
01368             break;
01369           case 2:
01370             dst[gmemPos] =
01371               (dptr[0]                * (70.0F / 219.0F) +
01372                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01373                dptr[nsw2]             * (28.0F / 219.0F) +
01374                dptr[nsw3]             * ( 8.0F / 219.0F) +
01375                border[bn1]             * ( 1.0F / 219.0F)
01376                );
01377             break;
01378           case 1:
01379             dst[gmemPos] =
01380               (dptr[0]    * (70.0F / 163.0F) +
01381                dptr[nsw]  * (56.0F / 163.0F) +
01382                dptr[nsw2] * (28.0F / 163.0F) +
01383                dptr[nsw3] * ( 8.0F / 163.0F) +
01384                border[bn1] * ( 1.0F / 163.0F)
01385                );
01386             break;
01387         }
01388         break;
01389       }
01390     }
01391     else  // We must be at the bottom 4 of the image
01392     {
01393       switch(h-(sts+sy))
01394       {
01395       case 4:
01396         dst[gmemPos] =
01397           (dptr[0]                  * (70.0F / 255.0F) +
01398            (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
01399            (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
01400            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
01401            dptr[nsw4]               * ( 1.0F / 255.0F)
01402            );
01403         break;
01404       case 3:
01405         dst[gmemPos] =
01406           (dptr[0]                  * (70.0F / 247.0F) +
01407            (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
01408            (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
01409            dptr[nsw3]               * ( 8.0F / 247.0F) +
01410            dptr[nsw4]               * ( 1.0F / 247.0F)
01411            );
01412         break;
01413       case 2:
01414         dst[gmemPos] =
01415           (dptr[0]                * (70.0F / 219.0F) +
01416            (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
01417            dptr[nsw2]             * (28.0F / 219.0F) +
01418            dptr[nsw3]             * ( 8.0F / 219.0F) +
01419            dptr[nsw4]             * ( 1.0F / 219.0F)
01420            );
01421         break;
01422       case 1:
01423         dst[gmemPos] =
01424           (dptr[0]    * (70.0F / 163.0F) +
01425            dptr[nsw]  * (56.0F / 163.0F) +
01426            dptr[nsw2] * (28.0F / 163.0F) +
01427            dptr[nsw3] * ( 8.0F / 163.0F) +
01428            dptr[nsw4] * ( 1.0F / 163.0F)
01429            );
01430         break;
01431       }
01432     }
01433   }
01434 }
01435 
01436 
01437 
01438 __global__ void cuda_global_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
01439 {
01440   // Data cache in shared memory
01441   float *data = (float *) shared_data; // size of tile_width
01442   // Bordering data flanking this tile
01443   float *border = (float *) &data[tile_width]; // size of 8
01444   const int sx = threadIdx.x;                   // source pixel within source tile
01445   const int sts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start
01446   const int srs = IMUL(blockIdx.y,w);           // Row start index in source data
01447 
01448   const int loadIdx = sts + sx; // index of one pixel value to load
01449   const int bn4 = 0, bn3 = 1, bn2 = 2, bn1 = 3;
01450   const int bp1 = 4, bp2 = 5, bp3 = 6, bp4 = 7;
01451   float *dptr = &data[sx];
01452   // Load border pixels
01453   if (sx < 4 && sts > 0) border[sx] = src[srs + sts - (4-sx)];
01454   if (sx >= tile_width-4 && sts+tile_width < w-4) border[4+sx-(tile_width-4)] = src[srs + sts + sx + 4];
01455 
01456  // Load the row into shared memory among the thread block
01457   if (loadIdx < w)
01458     data[sx] = src[srs + loadIdx];
01459   else
01460     return; // Threads that are over the edge of the image on the right most tile...
01461 
01462   // Ensure the completness of loading stage because results emitted
01463   // by each thread depend on the data loaded by other threads:
01464   __syncthreads();
01465 
01466   // [ 1 8 28 56 (70) 56 28 8 1 ]
01467 
01468   if(w < 9)
01469   {
01470     // This is not at all efficient, just here to ensure filter behaves properly in small image cases
01471     const int numAhead = max(0,min(w-1-(sts+sx),4));
01472     const int numBehind = max(0,min(sts+sx,4));
01473     const int situation = numBehind*10+numAhead;
01474 
01475     switch(situation)
01476     {
01477     case 0: // 00
01478       dst[srs+loadIdx] = dptr[0];
01479       break;
01480     case 1: // 01
01481       dst[srs+loadIdx] = (dptr[0] * (70.0F / 126.0F) +
01482                       dptr[1]     * (56.0F / 126.0F));
01483       break;
01484     case 2: // 02
01485       dst[srs+loadIdx] = (dptr[0] * (70.0F / 154.0F) +
01486                       dptr[1]     * (56.0F / 154.0F) +
01487                       dptr[2]     * (28.0F / 154.0F));
01488       break;
01489     case 3:
01490       dst[srs+loadIdx] = (dptr[0] * (70.0F / 162.0F) +
01491                           dptr[1] * (56.0F / 162.0F) +
01492                           dptr[2] * (28.0F / 162.0F) +
01493                           dptr[3] * ( 8.0F / 162.0F));
01494       break;
01495     case 4:
01496       dst[srs+loadIdx] = (dptr[0] * (70.0F / 163.0F) +
01497                           dptr[1] * (56.0F / 163.0F) +
01498                           dptr[2] * (28.0F / 163.0F) +
01499                           dptr[3] * ( 8.0F / 163.0F) +
01500                           dptr[4] * ( 1.0F / 163.0F));
01501       break;
01502     case 10:
01503       dst[srs+loadIdx] = (dptr[0]  * (70.0F / 126.0F) +
01504                           dptr[-1] * (56.0F / 126.0F));
01505       break;
01506     case 11:
01507       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 182.0F) +
01508                           (dptr[-1] + dptr[1]) * (56.0F / 182.0F));
01509       break;
01510     case 12:
01511       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 210.0F) +
01512                           (dptr[-1] + dptr[1]) * (56.0F / 210.0F) +
01513                           dptr[2]              * (28.0F / 210.0F));
01514       break;
01515     case 13:
01516       dst[srs+loadIdx] = (dptr[0]               * (70.0F / 218.0F) +
01517                           (dptr[-1] + dptr[1])  * (56.0F / 218.0F) +
01518                           dptr[2]               * (28.0F / 218.0F) +
01519                           dptr[3]               * ( 8.0F / 218.0F));
01520       break;
01521     case 14:
01522       dst[srs+loadIdx] = (dptr[0]               * (70.0F / 219.0F) +
01523                           (dptr[-1] + dptr[1])  * (56.0F / 219.0F) +
01524                           dptr[2]               * (28.0F / 219.0F) +
01525                           dptr[3]               * ( 8.0F / 219.0F) +
01526                           dptr[4]               * ( 1.0F / 219.0F));
01527       break;
01528     case 20:
01529       dst[srs+loadIdx] = (dptr[0]     * (70.0F / 154.0F) +
01530                           dptr[-1]    * (56.0F / 154.0F) +
01531                           dptr[-2]    * (28.0F / 154.0F));
01532       break;
01533     case 21:
01534       dst[srs+loadIdx] = (dptr[0]               * (70.0F / 210.0F) +
01535                           (dptr[-1] + dptr[1])  * (56.0F / 210.0F) +
01536                           dptr[-2]              * (28.0F / 210.0F));
01537       break;
01538     case 22:
01539       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 238.0F) +
01540                           (dptr[-1] + dptr[1]) * (56.0F / 238.0F) +
01541                           (dptr[-2] + dptr[2]) * (28.0F / 238.0F));
01542       break;
01543     case 23:
01544       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 246.0F) +
01545                           (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
01546                           (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
01547                           dptr[3]              * ( 8.0F / 246.0F));
01548       break;
01549     case 24:
01550       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 247.0F) +
01551                           (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01552                           (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01553                           dptr[3]              * ( 8.0F / 247.0F) +
01554                           dptr[4]              * ( 1.0F / 247.0F));
01555       break;
01556     case 30:
01557       dst[srs+loadIdx] = (dptr[0]  * (70.0F / 162.0F) +
01558                           dptr[-1] * (56.0F / 162.0F) +
01559                           dptr[-2] * (28.0F / 162.0F) +
01560                           dptr[-3] * ( 8.0F / 162.0F));
01561       break;
01562     case 31:
01563       dst[srs+loadIdx] = (dptr[0]               * (70.0F / 218.0F) +
01564                           (dptr[-1] + dptr[1])  * (56.0F / 218.0F) +
01565                           dptr[-2]              * (28.0F / 218.0F) +
01566                           dptr[-3]              * ( 8.0F / 218.0F));
01567       break;
01568     case 32:
01569       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 246.0F) +
01570                           (dptr[-1] + dptr[1]) * (56.0F / 246.0F) +
01571                           (dptr[-2] + dptr[2]) * (28.0F / 246.0F) +
01572                           dptr[-3]             * ( 8.0F / 246.0F));
01573       break;
01574     case 33:
01575       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 254.0F) +
01576                           (dptr[-1] + dptr[1]) * (56.0F / 254.0F) +
01577                           (dptr[-2] + dptr[2]) * (28.0F / 254.0F) +
01578                           (dptr[-3] + dptr[3]) * ( 8.0F / 254.0F));
01579       break;
01580     case 34:
01581       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 255.0F) +
01582                           (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01583                           (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01584                           (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01585                           dptr[4]              * ( 1.0F / 255.0F));
01586       break;
01587     case 40:
01588       dst[srs+loadIdx] = (dptr[0]  * (70.0F / 163.0F) +
01589                           dptr[-1] * (56.0F / 163.0F) +
01590                           dptr[-2] * (28.0F / 163.0F) +
01591                           dptr[-3] * ( 8.0F / 163.0F) +
01592                           dptr[-4] * ( 1.0F / 163.0F));
01593       break;
01594     case 41:
01595       dst[srs+loadIdx] = (dptr[0]               * (70.0F / 219.0F) +
01596                           (dptr[-1] + dptr[1])  * (56.0F / 219.0F) +
01597                           dptr[-2]              * (28.0F / 219.0F) +
01598                           dptr[-3]              * ( 8.0F / 219.0F) +
01599                           dptr[-4]              * ( 1.0F / 219.0F));
01600       break;
01601     case 42:
01602       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 247.0F) +
01603                           (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01604                           (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01605                           dptr[-3]             * ( 8.0F / 247.0F) +
01606                           dptr[-4]             * ( 1.0F / 247.0F));
01607       break;
01608     case 43:
01609       dst[srs+loadIdx] = (dptr[0]              * (70.0F / 255.0F) +
01610                           (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01611                           (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01612                           (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01613                           dptr[-4]             * ( 1.0F / 255.0F));
01614       break;
01615     }
01616   }
01617   // First part of source row, just reduce sample
01618   else if(sts+sx < 4)
01619   {
01620     switch(sx)
01621     {
01622     case 0:
01623       dst[srs + loadIdx] =
01624         (dptr[0] * (70.0F / 163.0F) +
01625          dptr[1] * (56.0F / 163.0F) +
01626          dptr[2] * (28.0F / 163.0F) +
01627          dptr[3] * ( 8.0F / 163.0F) +
01628          dptr[4] * ( 1.0F / 163.0F)
01629          );
01630       break;
01631     case 1:
01632       dst[srs + loadIdx] =
01633         (dptr[0]              * (70.0F / 219.0F) +
01634          (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01635          dptr[2]              * (28.0F / 219.0F) +
01636          dptr[3]              * ( 8.0F / 219.0F) +
01637          dptr[4]              * ( 1.0F / 219.0F)
01638          );
01639       break;
01640     case 2:
01641       dst[srs + loadIdx] =
01642         (dptr[0]              * (70.0F / 247.0F) +
01643          (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01644          (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01645          dptr[3]              * ( 8.0F / 247.0F) +
01646          dptr[4]              * ( 1.0F / 247.0F)
01647          );
01648       break;
01649     case 3:
01650       dst[srs + loadIdx] =
01651         (dptr[0]              * (70.0F / 255.0F) +
01652          (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01653          (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01654          (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01655          dptr[4]              * ( 1.0F / 255.0F)
01656          );
01657       break;
01658     default:
01659       //LERROR();
01660       break;
01661     }
01662   }
01663   // If not the first part of the source row, but is the first bit of this tile, use border
01664   else if(sx < 4 && sts+sx < w-4)
01665   {
01666     switch(sx)
01667     {
01668     case 0:
01669       dst[srs + loadIdx] =
01670         ((border[bn4] + dptr[4]) * ( 1.0F / 256.0F) +
01671          (border[bn3] + dptr[3]) * ( 8.0F / 256.0F) +
01672          (border[bn2] + dptr[2]) * (28.0F / 256.0F) +
01673          (border[bn1] + dptr[1]) * (56.0F / 256.0F) +
01674          dptr[0]                 * (70.0F / 256.0F)
01675          );
01676       break;
01677     case 1:
01678       dst[srs + loadIdx] =
01679         ((border[bn3] + dptr[4])   * ( 1.0F / 256.0F) +
01680          (border[bn2] + dptr[3])   * ( 8.0F / 256.0F) +
01681          (border[bn1] + dptr[2])   * (28.0F / 256.0F) +
01682          (dptr[-1] + dptr[1])      * (56.0F / 256.0F) +
01683          dptr[0]                   * (70.0F / 256.0F)
01684          );
01685       break;
01686     case 2:
01687       dst[srs + loadIdx] =
01688         ((border[bn2] + dptr[4]) * ( 1.0F / 256.0F) +
01689          (border[bn1] + dptr[3]) * ( 8.0F / 256.0F) +
01690          (dptr[-2] + dptr[2])    * (28.0F / 256.0F) +
01691          (dptr[-1] + dptr[1])    * (56.0F / 256.0F) +
01692          dptr[0]                 * (70.0F / 256.0F)
01693          );
01694       break;
01695     case 3:
01696       dst[srs + loadIdx] =
01697         ((border[bn1] + dptr[4]) * ( 1.0F / 256.0F) +
01698          (dptr[-3] + dptr[3])    * ( 8.0F / 256.0F) +
01699          (dptr[-2] + dptr[2])    * (28.0F / 256.0F) +
01700          (dptr[-1] + dptr[1])    * (56.0F / 256.0F) +
01701          dptr[0]                 * (70.0F / 256.0F)
01702          );
01703       break;
01704     }
01705   }
01706   // If we are not near the edge of this tile, do standard way
01707   else if(sx>=4 && sx<tile_width-4 && sts+sx < w-4)
01708   {
01709     dst[srs + loadIdx] =
01710       ((dptr[-4] + dptr[4]) * ( 1.0F / 256.0F) +
01711        (dptr[-3] + dptr[3]) * ( 8.0F / 256.0F) +
01712        (dptr[-2] + dptr[2]) * (28.0F / 256.0F) +
01713        (dptr[-1] + dptr[1]) * (56.0F / 256.0F) +
01714        dptr[0]              * (70.0F / 256.0F)
01715        );
01716   }
01717   // If not the last part of the source row, but in the last bit of this tile, use border
01718   else if(sx>=4 && sts+sx < w-4)
01719   {
01720     switch(tile_width-sx)
01721       {
01722       case 4:
01723         dst[srs + loadIdx] =
01724           ((dptr[-4] + border[bp1]) * ( 1.0F / 256.0F) +
01725            (dptr[-3] + dptr[3])     * ( 8.0F / 256.0F) +
01726            (dptr[-2] + dptr[2])     * (28.0F / 256.0F) +
01727            (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
01728            dptr[0]                  * (70.0F / 256.0F)
01729            );
01730         break;
01731       case 3:
01732         dst[srs + loadIdx] =
01733           ((dptr[-4] + border[bp2]) * ( 1.0F / 256.0F) +
01734            (dptr[-3] + border[bp1]) * ( 8.0F / 256.0F) +
01735            (dptr[-2] + dptr[2])     * (28.0F / 256.0F) +
01736            (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
01737            dptr[0]                  * (70.0F / 256.0F)
01738            );
01739         break;
01740       case 2:
01741         dst[srs + loadIdx] =
01742           ((dptr[-4] + border[bp3]) * ( 1.0F / 256.0F) +
01743            (dptr[-3] + border[bp2]) * ( 8.0F / 256.0F) +
01744            (dptr[-2] + border[bp1]) * (28.0F / 256.0F) +
01745            (dptr[-1] + dptr[1])     * (56.0F / 256.0F) +
01746            dptr[0]                  * (70.0F / 256.0F)
01747            );
01748         break;
01749       case 1:
01750         dst[srs + loadIdx] =
01751           ((dptr[-4] + border[bp4]) * ( 1.0F / 256.0F) +
01752            (dptr[-3] + border[bp3]) * ( 8.0F / 256.0F) +
01753            (dptr[-2] + border[bp2]) * (28.0F / 256.0F) +
01754            (dptr[-1] + border[bp1]) * (56.0F / 256.0F) +
01755            dptr[0]                  * (70.0F / 256.0F)
01756            );
01757         break;
01758       }
01759   }
01760   else if(sx < 4 && sts + sx >= w-4) // We are in the beginning of a tile close to the end of the image
01761   {
01762     switch(sx)
01763     {
01764     case 0:
01765       switch(w-(sts+sx))
01766       {
01767       case 4:
01768         dst[srs + loadIdx] =
01769           (dptr[0]                 * (70.0F / 255.0F) +
01770            (border[bn1] + dptr[1]) * (56.0F / 255.0F) +
01771            (border[bn2] + dptr[2]) * (28.0F / 255.0F) +
01772            (border[bn3] + dptr[3]) * ( 8.0F / 255.0F) +
01773            border[bn4]             * ( 1.0F / 255.0F)
01774            );
01775         break;
01776       case 3:
01777         dst[srs + loadIdx] =
01778           (dptr[0]                 * (70.0F / 247.0F) +
01779            (border[bn1] + dptr[1]) * (56.0F / 247.0F) +
01780            (border[bn2] + dptr[2]) * (28.0F / 247.0F) +
01781            border[bn3]             * ( 8.0F / 247.0F) +
01782            border[bn4]             * ( 1.0F / 247.0F)
01783            );
01784         break;
01785       case 2:
01786         dst[srs + loadIdx] =
01787           (dptr[0]                 * (70.0F / 219.0F) +
01788            (border[bn1] + dptr[1]) * (56.0F / 219.0F) +
01789            border[bn2]             * (28.0F / 219.0F) +
01790            border[bn3]             * ( 8.0F / 219.0F) +
01791            border[bn4]             * ( 1.0F / 219.0F)
01792            );
01793         break;
01794       case 1:
01795         dst[srs + loadIdx] =
01796           (dptr[0]     * (70.0F / 163.0F) +
01797            border[bn1] * (56.0F / 163.0F) +
01798            border[bn2] * (28.0F / 163.0F) +
01799            border[bn3] * ( 8.0F / 163.0F) +
01800            border[bn4] * ( 1.0F / 163.0F)
01801            );
01802         break;
01803       }
01804       break;
01805     case 1:
01806       switch(w-(sts+sx))
01807         {
01808           case 4:
01809             dst[srs + loadIdx] =
01810               (dptr[0]                 * (70.0F / 255.0F) +
01811                (dptr[1] + dptr[1])     * (56.0F / 255.0F) +
01812                (border[bn1] + dptr[2]) * (28.0F / 255.0F) +
01813                (border[bn2] + dptr[3]) * ( 8.0F / 255.0F) +
01814                border[bn3]             * ( 1.0F / 255.0F)
01815                );
01816             break;
01817           case 3:
01818             dst[srs + loadIdx] =
01819               (dptr[0]                 * (70.0F / 247.0F) +
01820                (dptr[1] + dptr[1])     * (56.0F / 247.0F) +
01821                (border[bn1] + dptr[2]) * (28.0F / 247.0F) +
01822                border[bn2]             * ( 8.0F / 247.0F) +
01823                border[bn3]             * ( 1.0F / 247.0F)
01824                );
01825             break;
01826           case 2:
01827             dst[srs + loadIdx] =
01828               (dptr[0]                * (70.0F / 219.0F) +
01829                (dptr[1] + dptr[1])    * (56.0F / 219.0F) +
01830                border[bn1]            * (28.0F / 219.0F) +
01831                border[bn2]            * ( 8.0F / 219.0F) +
01832                border[bn3]            * ( 1.0F / 219.0F)
01833                );
01834             break;
01835           case 1:
01836             dst[srs + loadIdx] =
01837               (dptr[0]     * (70.0F / 163.0F) +
01838                dptr[1]     * (56.0F / 163.0F) +
01839                border[bn1] * (28.0F / 163.0F) +
01840                border[bn2] * ( 8.0F / 163.0F) +
01841                border[bn3] * ( 1.0F / 163.0F)
01842                );
01843             break;
01844         }
01845         break;
01846       case 2:
01847         switch(w-(sts+sx))
01848         {
01849           case 4:
01850             dst[srs + loadIdx] =
01851               (dptr[0]                  * (70.0F / 255.0F) +
01852                (dptr[1] + dptr[1])      * (56.0F / 255.0F) +
01853                (dptr[2] + dptr[2])      * (28.0F / 255.0F) +
01854                (border[bn1] + dptr[3])  * ( 8.0F / 255.0F) +
01855                border[bn2]              * ( 1.0F / 255.0F)
01856                );
01857             break;
01858           case 3:
01859             dst[srs + loadIdx] =
01860               (dptr[0]                 * (70.0F / 247.0F) +
01861                (dptr[1] + dptr[1])     * (56.0F / 247.0F) +
01862                (dptr[2] + dptr[2])     * (28.0F / 247.0F) +
01863                border[bn1]             * ( 8.0F / 247.0F) +
01864                border[bn2]             * ( 1.0F / 247.0F)
01865                );
01866             break;
01867           case 2:
01868             dst[srs + loadIdx] =
01869               (dptr[0]                * (70.0F / 219.0F) +
01870                (dptr[1] + dptr[1])    * (56.0F / 219.0F) +
01871                dptr[2]                * (28.0F / 219.0F) +
01872                border[bn1]            * ( 8.0F / 219.0F) +
01873                border[bn2]            * ( 1.0F / 219.0F)
01874                );
01875             break;
01876           case 1:
01877             dst[srs + loadIdx] =
01878               (dptr[0]          * (70.0F / 163.0F) +
01879                dptr[1]          * (56.0F / 163.0F) +
01880                dptr[2]          * (28.0F / 163.0F) +
01881                border[bn1]      * ( 8.0F / 163.0F) +
01882                border[bn2]      * ( 1.0F / 163.0F)
01883                );
01884             break;
01885         }
01886         break;
01887       case 3:
01888         switch(w-(sts+sx))
01889         {
01890           case 4:
01891             dst[srs + loadIdx] =
01892               (dptr[0]               * (70.0F / 255.0F) +
01893                (dptr[1] + dptr[1])   * (56.0F / 255.0F) +
01894                (dptr[2] + dptr[2])   * (28.0F / 255.0F) +
01895                (dptr[3] + dptr[3])   * ( 8.0F / 255.0F) +
01896                border[bn1]           * ( 1.0F / 255.0F)
01897                );
01898             break;
01899           case 3:
01900             dst[srs + loadIdx] =
01901               (dptr[0]               * (70.0F / 247.0F) +
01902                (dptr[1] + dptr[1])   * (56.0F / 247.0F) +
01903                (dptr[2] + dptr[2])   * (28.0F / 247.0F) +
01904                dptr[3]               * ( 8.0F / 247.0F) +
01905                border[bn1]           * ( 1.0F / 247.0F)
01906                );
01907             break;
01908           case 2:
01909             dst[srs + loadIdx] =
01910               (dptr[0]             * (70.0F / 219.0F) +
01911                (dptr[1] + dptr[1]) * (56.0F / 219.0F) +
01912                dptr[2]             * (28.0F / 219.0F) +
01913                dptr[3]             * ( 8.0F / 219.0F) +
01914                border[bn1]         * ( 1.0F / 219.0F)
01915                );
01916             break;
01917           case 1:
01918             dst[srs + loadIdx] =
01919               (dptr[0]     * (70.0F / 163.0F) +
01920                dptr[1]     * (56.0F / 163.0F) +
01921                dptr[2]     * (28.0F / 163.0F) +
01922                dptr[3]     * ( 8.0F / 163.0F) +
01923                border[bn1] * ( 1.0F / 163.0F)
01924                );
01925             break;
01926         }
01927         break;
01928       }
01929     }
01930 
01931   // If in the last bit of the source row, reduce sample
01932   else if(sts + sx < w)
01933   {
01934     switch(w-(sts+sx))
01935     {
01936     case 4:
01937       dst[srs + loadIdx] =
01938         (dptr[-4]             * ( 1.0F / 255.0F) +
01939          (dptr[-3] + dptr[3]) * ( 8.0F / 255.0F) +
01940          (dptr[-2] + dptr[2]) * (28.0F / 255.0F) +
01941          (dptr[-1] + dptr[1]) * (56.0F / 255.0F) +
01942          dptr[0]              * (70.0F / 255.0F)
01943          );
01944       break;
01945     case 3:
01946       dst[srs + loadIdx] =
01947         (dptr[-4]             * ( 1.0F / 247.0F) +
01948          dptr[-3]             * ( 8.0F / 247.0F) +
01949          (dptr[-2] + dptr[2]) * (28.0F / 247.0F) +
01950          (dptr[-1] + dptr[1]) * (56.0F / 247.0F) +
01951          dptr[0]              * (70.0F / 247.0F)
01952          );
01953       break;
01954     case 2:
01955       dst[srs + loadIdx] =
01956         (dptr[-4]             * ( 1.0F / 219.0F) +
01957          dptr[-3]             * ( 8.0F / 219.0F) +
01958          dptr[-2]             * (28.0F / 219.0F) +
01959          (dptr[-1] + dptr[1]) * (56.0F / 219.0F) +
01960          dptr[0]              * (70.0F / 219.0F)
01961          );
01962       break;
01963     case 1:
01964       dst[srs + loadIdx] =
01965         (dptr[-4]             * ( 1.0F / 163.0F) +
01966          dptr[-3]             * ( 8.0F / 163.0F) +
01967          dptr[-2]             * (28.0F / 163.0F) +
01968          dptr[-1]             * (56.0F / 163.0F) +
01969          dptr[ 0]             * (70.0F / 163.0F)
01970          );
01971       break;
01972     }
01973   }
01974 }
01975 
01976 // ######################################################################
01977 __global__ void cuda_global_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
01978 {
01979   // Data cache
01980   float *data = (float *) shared_data; // size of tile_width * tile_height
01981   float *border = (float *) &data[tile_width*tile_height]; // size of tile_width * 8
01982 
01983   const int sy = threadIdx.y; // source pixel row within source tile
01984 
01985   const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows
01986   const int ste = sts + tile_height; // tile end for source, in rows
01987   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;  // Current column index
01988 
01989   // Clamp tile and apron limits by image borders
01990   const int stec = min(ste, h);
01991 
01992 
01993 
01994 
01995   // only process columns that are actually within image bounds:
01996   if (scs < w && sts+sy < stec)
01997   {
01998     // Shared and global memory indices for current column
01999     int smemPos = IMUL(sy, tile_width) + threadIdx.x;
02000     int gmemPos = IMUL(sts + sy, w) + scs;
02001 
02002     data[smemPos] = src[gmemPos];
02003 
02004     if (sy < 4 && gmemPos > IMUL(4,w))
02005       border[smemPos] = src[gmemPos-IMUL(4,w)];
02006 
02007     int bordOff = 8+sy-tile_height;
02008 
02009     if (sy >= tile_height-4)
02010     {
02011       if(ste+4 <= h)
02012         border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(4,w)];
02013       else
02014         // This is explicitly handling the case where the second to last tile is so close to the edge of the image, that the post border
02015         // can't fit in the remaining space, maybe we should be handling this as yet another special case, but there are a lot of special
02016         // cases already in this function (handling smaller than 9 imgs, handling all cases of small last tile)
02017         border[threadIdx.x+IMUL(bordOff,tile_width)] = 0;
02018     }
02019 
02020     // Ensure the completness of loading stage because results emitted
02021     // by each thread depend on the data loaded by other threads:
02022     __syncthreads();
02023 
02024     // Shared and global memory indices for current column
02025     smemPos = IMUL(sy, tile_width) + threadIdx.x;
02026     gmemPos = IMUL(sts + sy, w) + scs;
02027 
02028 
02029     // Setup the offsets to get to the correct smem points in the arrays for both the data and the border
02030     float *dptr = data + smemPos;
02031     const int sw = tile_width, sw2 = sw + sw, sw3 = sw2 + sw, sw4 = sw3 + sw;
02032     const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw, nsw4 = nsw3 - sw;
02033     const int bn4 = threadIdx.x, bn3 = bn4 + tile_width, bn2 = bn3 + tile_width, bn1 = bn2 + tile_width;
02034     const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width, bp3 = bp2 + tile_width, bp4 = bp3 + tile_width;
02035 
02036     if(h < 9)
02037     {
02038       // This is not at all efficient, just here to ensure filter behaves properly in small image cases
02039       const int numAhead = max(0,min(h-1-(sts+sy),4));
02040       const int numBehind = max(0,min(sts+sy,4));
02041       const int situation = numBehind*10+numAhead;
02042       switch(situation)
02043       {
02044       case 0: // 00
02045         dst[gmemPos] = dptr[0];
02046         break;
02047       case 1: // 01
02048         dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
02049                         dptr[sw] * (56.0F / 126.0F));
02050         break;
02051       case 2: // 02
02052         dst[gmemPos] = (dptr[0]   * (70.0F / 154.0F) +
02053                         dptr[sw]  * (56.0F / 154.0F) +
02054                         dptr[sw2] * (28.0F / 154.0F));
02055         break;
02056       case 3:
02057         dst[gmemPos] = (dptr[0]   * (70.0F / 162.0F) +
02058                         dptr[sw]  * (56.0F / 162.0F) +
02059                         dptr[sw2] * (28.0F / 162.0F) +
02060                         dptr[sw3] * ( 8.0F / 162.0F));
02061         break;
02062       case 4:
02063         dst[gmemPos] = (dptr[0]   * (70.0F / 163.0F) +
02064                         dptr[sw]  * (56.0F / 163.0F) +
02065                         dptr[sw2] * (28.0F / 163.0F) +
02066                         dptr[sw3] * ( 8.0F / 163.0F) +
02067                         dptr[sw4] * ( 1.0F / 163.0F));
02068         break;
02069       case 10:
02070         dst[gmemPos] = (dptr[0] * (70.0F / 126.0F) +
02071                         dptr[nsw] * (56.0F / 126.0F));
02072         break;
02073       case 11:
02074         dst[gmemPos] = (dptr[0]                * (70.0F / 182.0F) +
02075                         (dptr[nsw] + dptr[sw]) * (56.0F / 182.0F));
02076         break;
02077       case 12:
02078         dst[gmemPos] = (dptr[0]                * (70.0F / 210.0F) +
02079                         (dptr[nsw] + dptr[sw]) * (56.0F / 210.0F) +
02080                         dptr[sw2]              * (28.0F / 210.0F));
02081         break;
02082       case 13:
02083         dst[gmemPos] = (dptr[0]                 * (70.0F / 218.0F) +
02084                         (dptr[nsw] + dptr[sw])  * (56.0F / 218.0F) +
02085                         dptr[sw2]               * (28.0F / 218.0F) +
02086                         dptr[sw3]               * ( 8.0F / 218.0F));
02087         break;
02088       case 14:
02089         dst[gmemPos] = (dptr[0]                 * (70.0F / 219.0F) +
02090                         (dptr[nsw] + dptr[sw])  * (56.0F / 219.0F) +
02091                         dptr[sw2]               * (28.0F / 219.0F) +
02092                         dptr[sw3]               * ( 8.0F / 219.0F) +
02093                         dptr[sw4]               * ( 1.0F / 219.0F));
02094         break;
02095       case 20:
02096         dst[gmemPos] = (dptr[0]    * (70.0F / 154.0F) +
02097                         dptr[nsw]  * (56.0F / 154.0F) +
02098                         dptr[nsw2] * (28.0F / 154.0F));
02099         break;
02100       case 21:
02101         dst[gmemPos] = (dptr[0]                 * (70.0F / 210.0F) +
02102                         (dptr[nsw] + dptr[sw])  * (56.0F / 210.0F) +
02103                         dptr[nsw2]              * (28.0F / 210.0F));
02104         break;
02105       case 22:
02106         dst[gmemPos] = (dptr[0]                  * (70.0F / 238.0F) +
02107                         (dptr[nsw] + dptr[sw])   * (56.0F / 238.0F) +
02108                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 238.0F));
02109         break;
02110       case 23:
02111         dst[gmemPos] = (dptr[0]                  * (70.0F / 246.0F) +
02112                         (dptr[nsw] + dptr[sw])   * (56.0F / 246.0F) +
02113                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
02114                         dptr[sw3]                * ( 8.0F / 246.0F));
02115         break;
02116       case 24:
02117         dst[gmemPos] = (dptr[0]                  * (70.0F / 247.0F) +
02118                         (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02119                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02120                         dptr[sw3]                * ( 8.0F / 247.0F) +
02121                         dptr[sw4]                * ( 1.0F / 247.0F));
02122         break;
02123       case 30:
02124         dst[gmemPos] = (dptr[0]    * (70.0F / 162.0F) +
02125                         dptr[nsw]  * (56.0F / 162.0F) +
02126                         dptr[nsw2] * (28.0F / 162.0F) +
02127                         dptr[nsw3] * ( 8.0F / 162.0F));
02128         break;
02129       case 31:
02130         dst[gmemPos] = (dptr[0]                 * (70.0F / 218.0F) +
02131                         (dptr[nsw] + dptr[sw])  * (56.0F / 218.0F) +
02132                         dptr[nsw2]              * (28.0F / 218.0F) +
02133                         dptr[nsw3]              * ( 8.0F / 218.0F));
02134         break;
02135       case 32:
02136         dst[gmemPos] = (dptr[0]                  * (70.0F / 246.0F) +
02137                         (dptr[nsw] + dptr[sw])   * (56.0F / 246.0F) +
02138                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 246.0F) +
02139                         dptr[nsw3]               * ( 8.0F / 246.0F));
02140         break;
02141       case 33:
02142         dst[gmemPos] = (dptr[0]                  * (70.0F / 254.0F) +
02143                         (dptr[nsw] + dptr[sw])   * (56.0F / 254.0F) +
02144                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 254.0F) +
02145                         (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 254.0F));
02146         break;
02147       case 34:
02148         dst[gmemPos] = (dptr[0]                  * (70.0F / 255.0F) +
02149                         (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02150                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02151                         (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02152                         dptr[sw4]                * ( 1.0F / 255.0F));
02153         break;
02154       case 40:
02155         dst[gmemPos] = (dptr[0]    * (70.0F / 163.0F) +
02156                         dptr[nsw]  * (56.0F / 163.0F) +
02157                         dptr[nsw2] * (28.0F / 163.0F) +
02158                         dptr[nsw3] * ( 8.0F / 163.0F) +
02159                         dptr[nsw4] * ( 1.0F / 163.0F));
02160         break;
02161       case 41:
02162         dst[gmemPos] = (dptr[0]                 * (70.0F / 219.0F) +
02163                         (dptr[nsw] + dptr[sw])  * (56.0F / 219.0F) +
02164                         dptr[nsw2]              * (28.0F / 219.0F) +
02165                         dptr[nsw3]              * ( 8.0F / 219.0F) +
02166                         dptr[nsw4]              * ( 1.0F / 219.0F));
02167         break;
02168       case 42:
02169         dst[gmemPos] = (dptr[0]                  * (70.0F / 247.0F) +
02170                         (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02171                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02172                         dptr[nsw3]               * ( 8.0F / 247.0F) +
02173                         dptr[nsw4]               * ( 1.0F / 247.0F));
02174         break;
02175       case 43:
02176         dst[gmemPos] = (dptr[0]                  * (70.0F / 255.0F) +
02177                         (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02178                         (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02179                         (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02180                         dptr[nsw4]               * ( 1.0F / 255.0F));
02181         break;
02182       }
02183     }
02184     // Are we in the top 4 rows of the whole image
02185     else if(sts + sy < 4)
02186     {
02187       switch(sts+sy)
02188       {
02189       case 0:
02190         dst[gmemPos] =
02191           (dptr[0]   * (70.0F / 163.0F) +
02192            dptr[sw]  * (56.0F / 163.0F) +
02193            dptr[sw2] * (28.0F / 163.0F) +
02194            dptr[sw3] * ( 8.0F / 163.0F) +
02195            dptr[sw4] * ( 1.0F / 163.0F)
02196            );
02197         break;
02198       case 1:
02199         dst[gmemPos] =
02200           (dptr[0]                * (70.0F / 219.0F) +
02201            (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02202            dptr[sw2]              * (28.0F / 219.0F) +
02203            dptr[sw3]              * ( 8.0F / 219.0F) +
02204            dptr[sw4]              * ( 1.0F / 219.0F)
02205            );
02206         break;
02207       case 2:
02208         dst[gmemPos] =
02209           (dptr[0]                  * (70.0F / 247.0F) +
02210            (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02211            (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02212            dptr[sw3]                * ( 8.0F / 247.0F) +
02213            dptr[sw4]                * ( 1.0F / 247.0F)
02214            );
02215         break;
02216       case 3:
02217         dst[gmemPos] =
02218           (dptr[0]                  * (70.0F / 255.0F) +
02219            (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02220            (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02221            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02222            dptr[sw4]                * ( 1.0F / 255.0F)
02223            );
02224         break;
02225       }
02226     }
02227     else if(sy < 4 && sts+sy<h-4) // If not top 4 in the whole image, are we in the top 4 rows of this tile
02228     {
02229       switch(sy)
02230       {
02231       case 0:
02232         dst[gmemPos] =
02233           (dptr[0]                   * (70.0F / 256.0F) +
02234            (border[bn1] + dptr[sw])  * (56.0F / 256.0F) +
02235            (border[bn2] + dptr[sw2]) * (28.0F / 256.0F) +
02236            (border[bn3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02237            (border[bn4] + dptr[sw4]) * ( 1.0F / 256.0F)
02238            );
02239         break;
02240       case 1:
02241         dst[gmemPos] =
02242           (dptr[0]                   * (70.0F / 256.0F) +
02243            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
02244            (border[bn1] + dptr[sw2]) * (28.0F / 256.0F) +
02245            (border[bn2] + dptr[sw3]) * ( 8.0F / 256.0F) +
02246            (border[bn3] + dptr[sw4]) * ( 1.0F / 256.0F)
02247            );
02248         break;
02249       case 2:
02250         dst[gmemPos] =
02251           (dptr[0]                   * (70.0F / 256.0F) +
02252            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
02253            (dptr[nsw2] + dptr[sw2])  * (28.0F / 256.0F) +
02254            (border[bn1] + dptr[sw3]) * ( 8.0F / 256.0F) +
02255            (border[bn2] + dptr[sw4]) * ( 1.0F / 256.0F)
02256            );
02257         break;
02258       case 3:
02259         dst[gmemPos] =
02260           (dptr[0]                   * (70.0F / 256.0F) +
02261            (dptr[nsw] + dptr[sw])    * (56.0F / 256.0F) +
02262            (dptr[nsw2] + dptr[sw2])  * (28.0F / 256.0F) +
02263            (dptr[nsw3] + dptr[sw3])  * ( 8.0F / 256.0F) +
02264            (border[bn1] + dptr[sw4]) * ( 1.0F / 256.0F)
02265            );
02266         break;
02267       }
02268     }
02269     else if(sy >= 4 && sy <tile_height-4 && sts+sy<h-4) // Are we in the middle of the tile
02270     {
02271         dst[gmemPos] =
02272           ((dptr[nsw4] + dptr[sw4]) * ( 1.0F / 256.0F) +
02273            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 256.0F) +
02274            (dptr[nsw2] + dptr[sw2]) * (28.0F / 256.0F) +
02275            (dptr[nsw] + dptr[sw])   * (56.0F / 256.0F) +
02276            dptr[0]                  * (70.0F / 256.0F)
02277            );
02278     }
02279     else if(sy >= 4 && sts + sy < h-4) // Are we not at the bottom of the image, but bottom 4 of the tile
02280     {
02281       switch(tile_height-sy)
02282       {
02283       case 4:
02284         dst[gmemPos] =
02285           (dptr[0]                    * (70.0F / 256.0F) +
02286            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
02287            (dptr[nsw2] + dptr[sw2])   * (28.0F / 256.0F) +
02288            (dptr[nsw3] + dptr[sw3])   * ( 8.0F / 256.0F) +
02289            (dptr[nsw4] + border[bp1]) * ( 1.0F / 256.0F)
02290            );
02291         break;
02292       case 3:
02293         dst[gmemPos] =
02294           (dptr[0]                    * (70.0F / 256.0F) +
02295            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
02296            (dptr[nsw2] + dptr[sw2])   * (28.0F / 256.0F) +
02297            (dptr[nsw3] + border[bp1]) * ( 8.0F / 256.0F) +
02298            (dptr[nsw4] + border[bp2]) * ( 1.0F / 256.0F)
02299            );
02300         break;
02301       case 2:
02302         dst[gmemPos] =
02303           (dptr[0]                    * (70.0F / 256.0F) +
02304            (dptr[nsw] + dptr[sw])     * (56.0F / 256.0F) +
02305            (dptr[nsw2] + border[bp1]) * (28.0F / 256.0F) +
02306            (dptr[nsw3] + border[bp2]) * ( 8.0F / 256.0F) +
02307            (dptr[nsw4] + border[bp3]) * ( 1.0F / 256.0F)
02308            );
02309         break;
02310       case 1:
02311         dst[gmemPos] =
02312           (dptr[0]                    * (70.0F / 256.0F) +
02313            (dptr[nsw] +  border[bp1]) * (56.0F / 256.0F) +
02314            (dptr[nsw2] + border[bp2]) * (28.0F / 256.0F) +
02315            (dptr[nsw3] + border[bp3]) * ( 8.0F / 256.0F) +
02316            (dptr[nsw4] + border[bp4]) * ( 1.0F / 256.0F)
02317            );
02318         break;
02319       }
02320     }
02321     else if(sy < 4 && sts + sy >= h-4) // We are in the top of a tile close to the bottom of the image
02322     {
02323       switch(sy)
02324       {
02325       case 0:
02326         switch(h-(sts+sy))
02327         {
02328           case 4:
02329             dst[gmemPos] =
02330               (dptr[0]                  * (70.0F / 255.0F) +
02331                (border[bn1] + dptr[sw])   * (56.0F / 255.0F) +
02332                (border[bn2] + dptr[sw2]) * (28.0F / 255.0F) +
02333                (border[bn3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02334                border[bn4]               * ( 1.0F / 255.0F)
02335                );
02336             break;
02337           case 3:
02338             dst[gmemPos] =
02339               (dptr[0]                  * (70.0F / 247.0F) +
02340                (border[bn1] + dptr[sw])   * (56.0F / 247.0F) +
02341                (border[bn2] + dptr[sw2]) * (28.0F / 247.0F) +
02342                border[bn3]               * ( 8.0F / 247.0F) +
02343                border[bn4]               * ( 1.0F / 247.0F)
02344                );
02345             break;
02346           case 2:
02347             dst[gmemPos] =
02348               (dptr[0]                * (70.0F / 219.0F) +
02349                (border[bn1] + dptr[sw]) * (56.0F / 219.0F) +
02350                border[bn2]             * (28.0F / 219.0F) +
02351                border[bn3]             * ( 8.0F / 219.0F) +
02352                border[bn4]             * ( 1.0F / 219.0F)
02353                );
02354             break;
02355           case 1:
02356             dst[gmemPos] =
02357               (dptr[0]    * (70.0F / 163.0F) +
02358                border[bn1]  * (56.0F / 163.0F) +
02359                border[bn2] * (28.0F / 163.0F) +
02360                border[bn3] * ( 8.0F / 163.0F) +
02361                border[bn4] * ( 1.0F / 163.0F)
02362                );
02363             break;
02364         }
02365         break;
02366       case 1:
02367         switch(h-(sts+sy))
02368         {
02369           case 4:
02370             dst[gmemPos] =
02371               (dptr[0]                  * (70.0F / 255.0F) +
02372                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02373                (border[bn1] + dptr[sw2]) * (28.0F / 255.0F) +
02374                (border[bn2] + dptr[sw3]) * ( 8.0F / 255.0F) +
02375                border[bn3]               * ( 1.0F / 255.0F)
02376                );
02377             break;
02378           case 3:
02379             dst[gmemPos] =
02380               (dptr[0]                  * (70.0F / 247.0F) +
02381                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02382                (border[bn1] + dptr[sw2]) * (28.0F / 247.0F) +
02383                border[bn2]               * ( 8.0F / 247.0F) +
02384                border[bn3]               * ( 1.0F / 247.0F)
02385                );
02386             break;
02387           case 2:
02388             dst[gmemPos] =
02389               (dptr[0]                * (70.0F / 219.0F) +
02390                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02391                border[bn1]             * (28.0F / 219.0F) +
02392                border[bn2]             * ( 8.0F / 219.0F) +
02393                border[bn3]             * ( 1.0F / 219.0F)
02394                );
02395             break;
02396           case 1:
02397             dst[gmemPos] =
02398               (dptr[0]    * (70.0F / 163.0F) +
02399                dptr[nsw]  * (56.0F / 163.0F) +
02400                border[bn1] * (28.0F / 163.0F) +
02401                border[bn2] * ( 8.0F / 163.0F) +
02402                border[bn3] * ( 1.0F / 163.0F)
02403                );
02404             break;
02405         }
02406         break;
02407       case 2:
02408         switch(h-(sts+sy))
02409         {
02410           case 4:
02411             dst[gmemPos] =
02412               (dptr[0]                  * (70.0F / 255.0F) +
02413                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02414                (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02415                (border[bn1] + dptr[sw3]) * ( 8.0F / 255.0F) +
02416                border[bn2]               * ( 1.0F / 255.0F)
02417                );
02418             break;
02419           case 3:
02420             dst[gmemPos] =
02421               (dptr[0]                  * (70.0F / 247.0F) +
02422                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02423                (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02424                border[bn1]               * ( 8.0F / 247.0F) +
02425                border[bn2]               * ( 1.0F / 247.0F)
02426                );
02427             break;
02428           case 2:
02429             dst[gmemPos] =
02430               (dptr[0]                * (70.0F / 219.0F) +
02431                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02432                dptr[nsw2]             * (28.0F / 219.0F) +
02433                border[bn1]             * ( 8.0F / 219.0F) +
02434                border[bn2]             * ( 1.0F / 219.0F)
02435                );
02436             break;
02437           case 1:
02438             dst[gmemPos] =
02439               (dptr[0]    * (70.0F / 163.0F) +
02440                dptr[nsw]  * (56.0F / 163.0F) +
02441                dptr[nsw2] * (28.0F / 163.0F) +
02442                border[bn1] * ( 8.0F / 163.0F) +
02443                border[bn2] * ( 1.0F / 163.0F)
02444                );
02445             break;
02446         }
02447         break;
02448       case 3:
02449         switch(h-(sts+sy))
02450         {
02451           case 4:
02452             dst[gmemPos] =
02453               (dptr[0]                  * (70.0F / 255.0F) +
02454                (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02455                (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02456                (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02457                border[bn1]               * ( 1.0F / 255.0F)
02458                );
02459             break;
02460           case 3:
02461             dst[gmemPos] =
02462               (dptr[0]                  * (70.0F / 247.0F) +
02463                (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02464                (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02465                dptr[nsw3]               * ( 8.0F / 247.0F) +
02466                border[bn1]               * ( 1.0F / 247.0F)
02467                );
02468             break;
02469           case 2:
02470             dst[gmemPos] =
02471               (dptr[0]                * (70.0F / 219.0F) +
02472                (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02473                dptr[nsw2]             * (28.0F / 219.0F) +
02474                dptr[nsw3]             * ( 8.0F / 219.0F) +
02475                border[bn1]             * ( 1.0F / 219.0F)
02476                );
02477             break;
02478           case 1:
02479             dst[gmemPos] =
02480               (dptr[0]    * (70.0F / 163.0F) +
02481                dptr[nsw]  * (56.0F / 163.0F) +
02482                dptr[nsw2] * (28.0F / 163.0F) +
02483                dptr[nsw3] * ( 8.0F / 163.0F) +
02484                border[bn1] * ( 1.0F / 163.0F)
02485                );
02486             break;
02487         }
02488         break;
02489       }
02490     }
02491     else  // We must be at the bottom 4 of the image
02492     {
02493       switch(h-(sts+sy))
02494       {
02495       case 4:
02496         dst[gmemPos] =
02497           (dptr[0]                  * (70.0F / 255.0F) +
02498            (dptr[nsw] + dptr[sw])   * (56.0F / 255.0F) +
02499            (dptr[nsw2] + dptr[sw2]) * (28.0F / 255.0F) +
02500            (dptr[nsw3] + dptr[sw3]) * ( 8.0F / 255.0F) +
02501            dptr[nsw4]               * ( 1.0F / 255.0F)
02502            );
02503         break;
02504       case 3:
02505         dst[gmemPos] =
02506           (dptr[0]                  * (70.0F / 247.0F) +
02507            (dptr[nsw] + dptr[sw])   * (56.0F / 247.0F) +
02508            (dptr[nsw2] + dptr[sw2]) * (28.0F / 247.0F) +
02509            dptr[nsw3]               * ( 8.0F / 247.0F) +
02510            dptr[nsw4]               * ( 1.0F / 247.0F)
02511            );
02512         break;
02513       case 2:
02514         dst[gmemPos] =
02515           (dptr[0]                * (70.0F / 219.0F) +
02516            (dptr[nsw] + dptr[sw]) * (56.0F / 219.0F) +
02517            dptr[nsw2]             * (28.0F / 219.0F) +
02518            dptr[nsw3]             * ( 8.0F / 219.0F) +
02519            dptr[nsw4]             * ( 1.0F / 219.0F)
02520            );
02521         break;
02522       case 1:
02523         dst[gmemPos] =
02524           (dptr[0]    * (70.0F / 163.0F) +
02525            dptr[nsw]  * (56.0F / 163.0F) +
02526            dptr[nsw2] * (28.0F / 163.0F) +
02527            dptr[nsw3] * ( 8.0F / 163.0F) +
02528            dptr[nsw4] * ( 1.0F / 163.0F)
02529            );
02530         break;
02531       }
02532     }
02533   }
02534 }
02535 
02536 __global__ void cuda_global_lowpass_5_x(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width)
02537 {
02538   // Data cache in shared memory
02539   //__shared__ float data[CUDA_1D_TILE_W];
02540   //__shared__ float border[4];
02541   //tile_width=CUDA_1D_TILE_W;
02542 
02543   // Save 4 pixels for the border
02544   float *data = (float *) shared_data; // size of tile_width
02545   float *border = (float *) &data[tile_width]; // size of 4
02546 
02547   const int sx = threadIdx.x;                    // source pixel within source tile
02548   const int sts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
02549   const int srs = IMUL(blockIdx.y, w);           // Row start index in source data
02550 
02551   // Load global memory values into our data cache:
02552   const int loadIdx = sts + sx;  // index of one pixel value to load
02553   if (loadIdx < w) data[sx] = src[srs + loadIdx];
02554 
02555   // Load beginning border
02556   if (sx < 2 && sts > 0)
02557     border[sx] = src[srs + loadIdx - 2];
02558   // Load ending border
02559   else if(sx >= tile_width-2 && sts+tile_width < w-2)
02560     border[4+sx-tile_width] = src[srs + sts + sx + 2];
02561 
02562   // Ensure the completness of loading stage because results emitted
02563   // by each thread depend on the data loaded by other threads:
02564   __syncthreads();
02565 
02566 
02567   if ( loadIdx < w ) {
02568     const int writeIdx = sts + sx; // write index relative to row start
02569     const float *dptr = data + sx;
02570     // [ 1 4 (6) 4 1 ] / 16.0
02571 
02572     // If we are smaller than the Gaussian filter we are using, special case this
02573     // this is not very efficient, just making sure it can handle small images
02574     if(w < 5)
02575     {
02576       int numAhead = max(0,min(w-1-(sts+sx),2));
02577       int numBehind = max(0,min(sts+sx,2));
02578       int situation = numBehind*10+numAhead;
02579       switch(situation)
02580       {
02581       case 0: // 00
02582         dst[srs + writeIdx] = dptr[0];
02583         break;
02584       case 1: // 01
02585         dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
02586                                dptr[1] * (4.0F / 10.0F));
02587         break;
02588       case 2: // 02
02589         dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02590                                dptr[1] * (4.0F / 11.0F) +
02591                                dptr[2] * (1.0F / 11.0F));
02592         break;
02593       case 10:
02594         dst[srs + writeIdx] = (dptr[0] * (6.0F / 10.0F) +
02595                                dptr[-1] * (4.0F / 10.0F));
02596         break;
02597       case 11:
02598         dst[srs + writeIdx] = (dptr[0]              * (6.0F / 14.0F) +
02599                                (dptr[-1] + dptr[1]) * (4.0F / 14.0F));
02600         break;
02601       case 12:
02602         dst[srs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
02603                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02604                                dptr[2]              * (1.0F / 15.0F));
02605         break;
02606       case 20:
02607         dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02608                                dptr[-1] * (4.0F / 11.0F) +
02609                                dptr[-2] * (1.0F / 11.0F));
02610         break;
02611       case 21:
02612         dst[srs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
02613                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02614                                dptr[-2]              * (1.0F / 15.0F));
02615         break;
02616       }
02617     }
02618     // First set of pixels in the row
02619     else if(sts+sx < 2)
02620     {
02621       switch(sx)
02622       {
02623       case 0:
02624         dst[srs + writeIdx] = (dptr[0] * (6.0F / 11.0F) +
02625                                dptr[1] * (4.0F / 11.0F) +
02626                                dptr[2] * (1.0F / 11.0F));
02627         break;
02628       case 1:
02629         dst[srs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
02630                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02631                                dptr[2]              * (1.0F / 15.0F));
02632         break;
02633       }
02634     }
02635     // First two pixels in tile
02636     else if(sx < 2 && sts+sx < w-2)
02637     {
02638       switch(sx)
02639       {
02640       case 0:
02641         dst[srs + writeIdx] = (dptr[0]               * (6.0F / 16.0F) +
02642                                (border[1] + dptr[1]) * (4.0F / 16.0F) +
02643                                (border[0] + dptr[2]) * (1.0F / 16.0F));
02644         break;
02645       case 1:
02646         dst[srs + writeIdx] = (dptr[0]               * (6.0F / 16.0F) +
02647                                (dptr[-1] + dptr[1])  * (4.0F / 16.0F) +
02648                                (border[1] + dptr[2]) * (1.0F / 16.0F));
02649         break;
02650       }
02651     }
02652     // In the middle of the tile
02653     else if(sx < tile_width-2 && sts+sx < w-2)
02654     {
02655       dst[srs + writeIdx] = (dptr[0]              * (6.0F / 16.0F) +
02656                              (dptr[-1] + dptr[1]) * (4.0F / 16.0F) +
02657                              (dptr[-2] + dptr[2]) * (1.0F / 16.0F));
02658     }
02659     // Last two pixels of the tile
02660     else if(sts+sx < w-2)
02661     {
02662       switch(tile_width-sx)
02663       {
02664       case 2:
02665         dst[srs + writeIdx] = (dptr[0]                * (6.0F / 16.0F) +
02666                                (dptr[-1] + dptr[1])   * (4.0F / 16.0F) +
02667                                (dptr[-2] + border[2]) * (1.0F / 16.0F));
02668         break;
02669       case 1:
02670         dst[srs + writeIdx] = (dptr[0]                * (6.0F / 16.0F) +
02671                                (dptr[-1] + border[2]) * (4.0F / 16.0F) +
02672                                (dptr[-2] + border[3]) * (1.0F / 16.0F));
02673         break;
02674       }
02675     }
02676     // Last two pixels of the row
02677     else
02678     {
02679       switch(w-(sts+sx))
02680       {
02681       case 2:
02682         dst[srs + writeIdx] = (dptr[0]              * (6.0F / 15.0F) +
02683                                (dptr[-1] + dptr[1]) * (4.0F / 15.0F) +
02684                                (dptr[-2])           * (1.0F / 15.0F));
02685         break;
02686       case 1:
02687         dst[srs + writeIdx] = (dptr[0]    * (6.0F / 11.0F) +
02688                                (dptr[-1]) * (4.0F / 11.0F) +
02689                                (dptr[-2]) * (1.0F / 11.0F));
02690         break;
02691       }
02692     }
02693   }
02694 }
02695 
02696 __global__ void cuda_global_lowpass_5_y(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
02697 {
02698   // Data cache
02699   //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H];
02700   //__shared__ float border[CUDA_TILE_W*4];
02701   //const int tile_width=CUDA_TILE_W;
02702   //const int tile_height= CUDA_TILE_H;
02703 
02704   // Save 4 rows for the border
02705   float *data = (float *) shared_data; //tile_width * tile_height size
02706   float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*4
02707 
02708   const int sy = threadIdx.y; // source pixel row within source tile
02709 
02710   const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows
02711   const int ste = sts + tile_height; // tile end for source, in rows
02712 
02713   // Clamp tile and apron limits by image borders
02714   const int stec = min(ste, h);
02715 
02716   // Current column index
02717   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
02718 
02719   int smemPos = IMUL(sy, tile_width) + threadIdx.x;
02720   int gmemPos = IMUL(sts + sy, w) + scs;
02721 
02722   // only process columns that are actually within image bounds:
02723   if (scs < w && sts+sy < h) {
02724     // Shared and global (source) memory indices for current column
02725 
02726     // Load data
02727     data[smemPos] = src[gmemPos];
02728 
02729     // Load border
02730     if (sy < 2 && gmemPos > IMUL(2,w))
02731       border[smemPos] = src[gmemPos-IMUL(2,w)];
02732 
02733     int bordOff = 4+sy-tile_height;
02734 
02735     if (sy >= tile_height-2 && ste+2 < h)
02736       border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+IMUL(2,w)];
02737 
02738     // Ensure the completness of loading stage because results emitted
02739     // by each thread depend on the data loaded by other threads:
02740     __syncthreads();
02741 
02742     // Cycle through the tile body, clamped by image borders
02743     // Calculate and output the results
02744     float *dptr = data + smemPos;
02745     const int sw = tile_width, sw2 = sw + sw;
02746     const int nsw = -sw, nsw2 = nsw - sw;
02747     const int bn2 = threadIdx.x, bn1 = bn2 + tile_width;
02748     const int bp1 = bn1+tile_width, bp2 = bp1 + tile_width;
02749 
02750     //  [ 1 4 (6) 4 1 ] / 16
02751 
02752     // If we are smaller than the Gaussian filter we are using, special case this
02753     // this is not very efficient, just making sure it can handle small images
02754     if(h < 5)
02755       {
02756         int numAhead = max(0,min(h-1-(sts+sy),2));
02757         int numBehind = max(0,min(sts+sy,2));
02758         int situation = numBehind*10+numAhead;
02759         switch(situation)
02760           {
02761           case 0: // 00
02762             dst[gmemPos] = dptr[0];
02763             break;
02764           case 1: // 01
02765             dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
02766                             dptr[sw] * (4.0F / 10.0F));
02767             break;
02768           case 2: // 02
02769             dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
02770                             dptr[sw] * (4.0F / 11.0F) +
02771                             dptr[sw2] * (1.0F / 11.0F));
02772             break;
02773           case 10:
02774             dst[gmemPos] = (dptr[0] * (6.0F / 10.0F) +
02775                             dptr[nsw] * (4.0F / 10.0F));
02776             break;
02777           case 11:
02778             dst[gmemPos] = (dptr[0]              * (6.0F / 14.0F) +
02779                             (dptr[nsw] + dptr[sw]) * (4.0F / 14.0F));
02780             break;
02781           case 12:
02782             dst[gmemPos] = (dptr[0]              * (6.0F / 15.0F) +
02783                             (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02784                             dptr[sw2]              * (1.0F / 15.0F));
02785             break;
02786           case 20:
02787             dst[gmemPos] = (dptr[0] * (6.0F / 11.0F) +
02788                             dptr[nsw] * (4.0F / 11.0F) +
02789                             dptr[nsw2] * (1.0F / 11.0F));
02790             break;
02791           case 21:
02792             dst[gmemPos] = (dptr[0]              * (6.0F / 15.0F) +
02793                             (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02794                             dptr[nsw2]              * (1.0F / 15.0F));
02795             break;
02796           }
02797       }
02798     // Are we in the top 2 rows of the whole image
02799     else if(sts + sy < 2)
02800       {
02801         switch(sts+sy)
02802           {
02803           case 0:
02804             dst[gmemPos] =
02805               (dptr[0]   * (6.0F / 11.0F) +
02806                dptr[sw]  * (4.0F / 11.0F) +
02807                dptr[sw2] * (1.0F / 11.0F)
02808                );
02809             break;
02810           case 1:
02811             dst[gmemPos] =
02812               (dptr[0]                * (6.0F / 15.0F) +
02813                (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02814                dptr[sw2]              * (1.0F / 15.0F)
02815                );
02816             break;
02817           }
02818       }
02819     else if(sy < 2 && sts+sy<h-2) // If not top 2 in the whole image, are we in the top 2 rows of this tile
02820       {
02821         switch(sy)
02822           {
02823           case 0:
02824             dst[gmemPos] =
02825               (dptr[0]                   * (6.0F / 16.0F) +
02826                (border[bn1] + dptr[sw])  * (4.0F / 16.0F) +
02827                (border[bn2] + dptr[sw2]) * (1.0F / 16.0F)
02828                );
02829             break;
02830           case 1:
02831             dst[gmemPos] =
02832               (dptr[0]                   * (6.0F / 16.0F) +
02833                (dptr[nsw] + dptr[sw])    * (4.0F / 16.0F) +
02834                (border[bn1] + dptr[sw2]) * (1.0F / 16.0F)
02835                );
02836             break;
02837           }
02838       }
02839     else if(sy <tile_height-2 && sts+sy<h-2) // Are we in the middle of the tile
02840       {
02841         dst[gmemPos] =
02842           ((dptr[nsw2] + dptr[sw2]) * (1.0F / 16.0F) +
02843            (dptr[nsw] + dptr[sw])   * (4.0F / 16.0F) +
02844            dptr[0]                  * (6.0F / 16.0F)
02845            );
02846       }
02847     else if(sts + sy < h-2) // Are we not at the bottom of the image, but bottom 4 of the tile
02848       {
02849         switch(tile_height-sy)
02850           {
02851           case 2:
02852             dst[gmemPos] =
02853               (dptr[0]                    * (6.0F / 16.0F) +
02854                (dptr[nsw] + dptr[sw])     * (4.0F / 16.0F) +
02855                (dptr[nsw2] + border[bp1]) * (1.0F / 16.0F)
02856                );
02857             break;
02858           case 1:
02859             dst[gmemPos] =
02860               (dptr[0]                    * (6.0F / 16.0F) +
02861                (dptr[nsw] +  border[bp1]) * (4.0F / 16.0F) +
02862                (dptr[nsw2] + border[bp2]) * (1.0F / 16.0F)
02863                );
02864             break;
02865           }
02866       }
02867     else // We must be at the bottom 4 of the image
02868       {
02869         switch(h-(sts+sy))
02870           {
02871           case 2:
02872             dst[gmemPos] =
02873               (dptr[0]                * (6.0F / 15.0F) +
02874                (dptr[nsw] + dptr[sw]) * (4.0F / 15.0F) +
02875                dptr[nsw2]             * (1.0F / 15.0F)
02876                );
02877             break;
02878           case 1:
02879             dst[gmemPos] =
02880               (dptr[0]    * (6.0F / 11.0F) +
02881                dptr[nsw]  * (4.0F / 11.0F) +
02882                dptr[nsw2] * (1.0F / 11.0F)
02883                );
02884             break;
02885           }
02886       }
02887 
02888   }
02889 
02890 }
02891 
02892 
02893 
02894 __global__ void cuda_global_lowpass_3_x(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width)
02895 {
02896 
02897   // Save 2 pixels for the border
02898   float *data = (float *) shared_data; // size of tile_width
02899   float *border = (float *) &data[tile_width]; // size of 2
02900 
02901   const int sx = threadIdx.x;                    // source pixel within source tile
02902   const int sts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
02903   const int srs = IMUL(blockIdx.y, w);           // Row start index in source data
02904 
02905   // Load global memory values into our data cache:
02906   const int loadIdx = sts + sx;  // index of one pixel value to load
02907   if (loadIdx < w) data[sx] = src[srs + loadIdx];
02908 
02909   // Load beginning border
02910   if (sx < 1 && sts > 0)
02911     border[sx] = src[srs + loadIdx - 1];
02912   // Load ending border
02913   else if(sx >= tile_width-1 && sts+tile_width < w-1)
02914     border[2+sx-tile_width] = src[srs + sts + sx + 1];
02915 
02916   // Ensure the completness of loading stage because results emitted
02917   // by each thread depend on the data loaded by other threads:
02918   __syncthreads();
02919 
02920 
02921   if ( loadIdx < w ) {
02922     const int writeIdx = sts + sx; // write index relative to row start
02923     const float *dptr = data + sx;
02924     // [ 1 (2) 1 ] / 4.0
02925 
02926     // If we are smaller than the Gaussian filter we are using, special case this
02927     // this is not very efficient, just making sure it can handle small images
02928     if(w < 3)
02929     {
02930       int numAhead = max(0,min(w-1-(sts+sx),1));
02931       int numBehind = max(0,min(sts+sx,1));
02932       int situation = numBehind*10+numAhead;
02933       switch(situation)
02934       {
02935       case 0: // 00
02936         dst[srs + writeIdx] = dptr[0];
02937         break;
02938       case 1: // 01
02939         dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02940                                dptr[1] * (1.0F / 3.0F));
02941         break;
02942       case 10:
02943         dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02944                                dptr[-1] * (1.0F / 3.0F));
02945         break;
02946       case 11:
02947         dst[srs + writeIdx] = (dptr[0]              * (2.0F / 4.0F) +
02948                                (dptr[-1] + dptr[1]) * (1.0F / 4.0F));
02949         break;
02950       }
02951     }
02952     // First set of pixels in the row
02953     else if(sts+sx < 1)
02954     {
02955       switch(sx)
02956       {
02957       case 0:
02958         dst[srs + writeIdx] = (dptr[0] * (2.0F / 3.0F) +
02959                                dptr[1] * (1.0F / 3.0F));
02960         break;
02961       }
02962     }
02963     // First pixel in tile
02964     else if(sx < 1 && sts+sx < w-1)
02965     {
02966       switch(sx)
02967       {
02968       case 0:
02969         dst[srs + writeIdx] = (dptr[0]               * (2.0F / 4.0F) +
02970                                (border[0] + dptr[1]) * (1.0F / 4.0F));
02971         break;
02972       }
02973     }
02974     // In the middle of the tile
02975     else if(sx < tile_width-1 && sts+sx < w-1)
02976     {
02977       dst[srs + writeIdx] = (dptr[0]              * (2.0F / 4.0F) +
02978                              (dptr[-1] + dptr[1]) * (1.0F / 4.0F));
02979     }
02980     // Last pixel of the tile
02981     else if(sts+sx < w-1)
02982     {
02983       switch(tile_width-sx)
02984       {
02985       case 1:
02986         dst[srs + writeIdx] = (dptr[0]                * (2.0F / 4.0F) +
02987                                (dptr[-1] + border[1]) * (1.0F / 4.0F));
02988         break;
02989       }
02990     }
02991     // Last pixel of the row
02992     else
02993     {
02994       switch(w-(sts+sx))
02995       {
02996       case 1:
02997         dst[srs + writeIdx] = (dptr[0]    * (2.0F / 3.0F) +
02998                                (dptr[-1]) * (1.0F / 3.0F));
02999         break;
03000       }
03001     }
03002   }
03003 }
03004 
03005 __global__ void cuda_global_lowpass_3_y(const float *src,  const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
03006 {
03007   // Data cache
03008   //__shared__ float data[CUDA_TILE_W*CUDA_TILE_H];
03009   //__shared__ float border[CUDA_TILE_W*4];
03010   //const int tile_width=CUDA_TILE_W;
03011   //const int tile_height= CUDA_TILE_H;
03012 
03013   // Save 2 rows for the border
03014   float *data = (float *) shared_data; //tile_width * tile_height size
03015   float *border = (float *) &data[tile_width*tile_height]; // size of tile_width*2
03016 
03017   const int sy = threadIdx.y; // source pixel row within source tile
03018 
03019   const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows
03020   const int ste = sts + tile_height; // tile end for source, in rows
03021 
03022   // Clamp tile and apron limits by image borders
03023   const int stec = min(ste, h);
03024 
03025   // Current column index
03026   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;
03027 
03028   // only process columns that are actually within image bounds:
03029   if (scs < w && sts+sy < h) {
03030     // Shared and global (source) memory indices for current column
03031     int smemPos = IMUL(sy, tile_width) + threadIdx.x;
03032     int gmemPos = IMUL(sts + sy, w) + scs;
03033 
03034     // Load data
03035     data[smemPos] = src[gmemPos];
03036 
03037     // Load border
03038     if (sy < 1 && gmemPos > w)
03039       border[smemPos] = src[gmemPos-w];
03040 
03041     int bordOff = 2+sy-tile_height;
03042 
03043     if (sy >= tile_height-1 && ste+1 < h)
03044       border[threadIdx.x+IMUL(bordOff,tile_width)] = src[gmemPos+w];
03045 
03046     // Ensure the completness of loading stage because results emitted
03047     // by each thread depend on the data loaded by other threads:
03048     __syncthreads();
03049 
03050 
03051     // Cycle through the tile body, clamped by image borders
03052     // Calculate and output the results
03053     float *dptr = data + smemPos;
03054     const int sw = tile_width;
03055     const int nsw = -sw;
03056     const int bn1 = threadIdx.x;
03057     const int bp1 = bn1+tile_width;
03058 
03059     //  [ 1 (2) 1 ] / 4
03060 
03061     // If we are smaller than the Gaussian filter we are using, special case this
03062     // this is not very efficient, just making sure it can handle small images
03063     if(h < 3)
03064       {
03065         int numAhead = max(0,min(h-1-(sts+sy),1));
03066         int numBehind = max(0,min(sts+sy,1));
03067         int situation = numBehind*10+numAhead;
03068         switch(situation)
03069           {
03070           case 0: // 00
03071             dst[gmemPos] = dptr[0];
03072             break;
03073           case 1: // 01
03074             dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) +
03075                             dptr[sw] * (1.0F / 3.0F));
03076             break;
03077           case 10:
03078             dst[gmemPos] = (dptr[0] * (2.0F / 3.0F) +
03079                             dptr[nsw] * (1.0F / 3.0F));
03080             break;
03081           case 11:
03082             dst[gmemPos] = (dptr[0]              * (2.0F / 4.0F) +
03083                             (dptr[nsw] + dptr[sw]) * (1.0F / 4.0F));
03084             break;
03085           }
03086       }
03087     // Are we in the top row of the whole image
03088     else if(sts + sy < 1)
03089       {
03090         switch(sts+sy)
03091           {
03092           case 0:
03093             dst[gmemPos] =
03094               (dptr[0]   * (2.0F / 3.0F) +
03095                dptr[sw]  * (1.0F / 3.0F));
03096             break;
03097           }
03098       }
03099     else if(sy < 1 && sts+sy<h-1) // If not top in the whole image, are we in the top row of this tile
03100       {
03101         switch(sy)
03102           {
03103           case 0:
03104             dst[gmemPos] =
03105               (dptr[0]                   * (2.0F / 4.0F) +
03106                (border[bn1] + dptr[sw])  * (1.0F / 4.0F));
03107             break;
03108           }
03109       }
03110     else if(sy <tile_height-1 && sts+sy<h-1) // Are we in the middle of the tile
03111       {
03112         dst[gmemPos] =
03113           (dptr[0]                  * (2.0F / 4.0F) +
03114            (dptr[nsw] + dptr[sw])   * (1.0F / 4.0F));
03115       }
03116     else if(sts + sy < h-1) // Are we not at the bottom of the image, but bottom row of the tile
03117       {
03118         switch(tile_height-sy)
03119           {
03120           case 1:
03121             dst[gmemPos] =
03122               (dptr[0]                    * (2.0F / 4.0F) +
03123                (dptr[nsw] +  border[bp1]) * (1.0F / 4.0F));
03124             break;
03125           }
03126       }
03127     else // We must be at the bottom row of the image
03128       {
03129         switch(h-(sts+sy))
03130           {
03131           case 1:
03132             dst[gmemPos] =
03133               (dptr[0]    * (2.0F / 3.0F) +
03134                dptr[nsw]  * (1.0F / 3.0F));
03135             break;
03136           }
03137       }
03138 
03139   }
03140 
03141 }
03142 
03143 
03144 #include "CUDA/cutil.h"
03145 
03146 texture<float, 2, cudaReadModeElementType> texRef;
03147 
03148 
03149 __global__ void cuda_global_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03150 {
03151   // w and h are from the original source image
03152   float *data = (float *) shared_data; //(tile_width+8) * tile_height size
03153   const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y; // Y position
03154   const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x; // index of one pixel value to load
03155   const int ySrcPos = yResPos; // Y position (unchanged in this function)
03156   const int xSrcPos = xResPos<<1; // X position
03157   const int ctrOff = 4;
03158   const int shrIdx = threadIdx.x+ctrOff;
03159   // [ 1 8 28 56 (70) 56 28 8 1 ]
03160   if(xResPos < dw && yResPos < dh)
03161     {
03162       data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos);
03163       if(threadIdx.x < ctrOff)
03164         data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos-ctrOff,ySrcPos);
03165       else if(threadIdx.x > tile_width - ctrOff)
03166         data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos+ctrOff,ySrcPos);
03167       __syncthreads();
03168       dst[IMUL(yResPos,dw) + xResPos] =
03169         ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) +
03170          (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) +
03171          (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) +
03172          (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) +
03173          data[shrIdx]              * (70.0F / 256.0F)
03174          );
03175     }
03176 }
03177 
03178 void cuda_c_lowpass_texture_9_x_dec_x(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03179 {
03180   //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
03181   texRef.addressMode[0] = cudaAddressModeClamp;
03182   texRef.addressMode[1] = cudaAddressModeClamp;
03183   texRef.filterMode = cudaFilterModePoint;
03184   texRef.normalized = false;
03185   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
03186   CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float)));
03187   dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height));
03188   dim3 threadBlockRows(tile_width, tile_height);
03189   cuda_global_lowpass_texture_9_x_dec_x<<<blockGridRows, threadBlockRows, (tile_width+8)*tile_height*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height);
03190   CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef));
03191 }
03192 
03193 __global__ void cuda_global_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03194 {
03195   // w and h are from the original source image
03196   float *data = (float *) shared_data; //tile_width * (tile_height+8) size
03197   const int yResPos = IMUL(blockIdx.y, tile_height) + threadIdx.y; // Y position
03198   const int xResPos = IMUL(blockIdx.x,tile_width) + threadIdx.x; // index of one pixel value to load
03199   const int ySrcPos = yResPos<<1; // Y position
03200   const int xSrcPos = xResPos; // X position (unchanged in this function)
03201   const int ctrOff = 4;
03202   const int shrIdx = threadIdx.y+ctrOff;
03203   // [ 1 8 28 56 (70) 56 28 8 1 ]
03204   if(xResPos < dw && yResPos < dh)
03205     {
03206       data[shrIdx] = tex2D(texRef,xSrcPos,ySrcPos);
03207       if(threadIdx.y < ctrOff)
03208         data[shrIdx-ctrOff] = tex2D(texRef,xSrcPos,ySrcPos-ctrOff);
03209       else if(threadIdx.y > tile_height - ctrOff)
03210         data[shrIdx+ctrOff] = tex2D(texRef,xSrcPos,ySrcPos+ctrOff);
03211       __syncthreads();
03212       dst[IMUL(yResPos,dw) + xResPos] =
03213         ((data[shrIdx-4] + data[shrIdx+4]) * ( 1.0F / 256.0F) +
03214          (data[shrIdx-3] + data[shrIdx+3]) * ( 8.0F / 256.0F) +
03215          (data[shrIdx-2] + data[shrIdx+2]) * (28.0F / 256.0F) +
03216          (data[shrIdx-1] + data[shrIdx+1]) * (56.0F / 256.0F) +
03217          data[shrIdx]              * (70.0F / 256.0F)
03218          );
03219     }
03220 }
03221 
03222 void cuda_c_lowpass_texture_9_y_dec_y(const float *src, int w, int h, float *dst, int dw, int dh, int tile_width, int tile_height)
03223 {
03224   //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
03225   texRef.addressMode[0] = cudaAddressModeClamp;
03226   texRef.addressMode[1] = cudaAddressModeClamp;
03227   texRef.filterMode = cudaFilterModePoint;
03228   texRef.normalized = false;
03229   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);
03230   CUDA_SAFE_CALL_NO_SYNC(cudaBindTexture2D(NULL,texRef,src,channelDesc,w,h,w*sizeof(float)));
03231   dim3 blockGridRows(iDivUp(dw, tile_width), iDivUp(dh, tile_height));
03232   dim3 threadBlockRows(tile_width, tile_height);
03233   cuda_global_lowpass_texture_9_y_dec_y<<<blockGridRows, threadBlockRows, tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, tile_height);
03234   CUDA_SAFE_CALL_NO_SYNC(cudaUnbindTexture(texRef));
03235 }
03236 
03237 
03238 #endif
Generated on Sun May 8 08:40:23 2011 for iLab Neuromorphic Vision Toolkit by  doxygen 1.6.3