cuda_convolutions.h

00001 /*!@file CUDA/cuda-convolutions.h CUDA/GPU convolution methods */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_convolutions.h $
00035 // $Id: cuda_convolutions.h 12962 2010-03-06 02:13:53Z irock $
00036 //
00037 
00038 #ifndef CUDA_CONVOLUTIONS_H_DEFINED
00039 #define CUDA_CONVOLUTIONS_H_DEFINED
00040 
00041 #include <cuda.h>
00042 #include "CUDA/cutil.h"
00043 #include "cudadefs.h"
00044 
00045 
00046 
00047 __global__ void cuda_global_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h,
00048                                                 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00049 {
00050   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00051   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00052   const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00053   const int res_idx = IMUL(y_pos,src_w)+x_pos;
00054 
00055   // Load the filter into shared memory
00056   float *data = (float *) shared_data; // size of tile_width
00057   const int tile_sz = IMUL(tile_height,tile_width);
00058   const int fil_sz = IMUL(Nx,Ny);
00059 
00060   for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00061   {
00062     const int curld_idx = ld_idx+i;
00063     data[curld_idx] = f[curld_idx];
00064   }
00065   __syncthreads();
00066 
00067   if(y_pos < src_h && x_pos < src_w)
00068   {
00069     const int Nx2 = (Nx - 1) / 2;
00070     const int Ny2 = (Ny - 1) / 2;
00071     float sum = 0.0F;
00072     float sumw = 0.0F;
00073     float im;
00074     for(int i=0;i<Ny;i++)
00075       {
00076         const int s_y=y_pos-Ny2+i;
00077         const int f_y=Ny-i-1;
00078         if(s_y >= 0 && s_y < src_h)
00079         {
00080           for(int j=0;j<Nx;j++)
00081           {
00082             const int s_x=x_pos-Nx2+j;
00083             const int f_x=Nx-j-1;
00084             const int flt_idx = IMUL(f_y,Nx)+f_x;
00085             const int src_idx = IMUL(s_y,src_w)+s_x;
00086             if(s_x >=0 && s_x < src_w)
00087               {
00088                 im = src[src_idx];
00089                 //sum += im*data[flt_idx];
00090                 sum = __fadd_rn(__fmul_rn(im,data[flt_idx]),sum);
00091                 //sumw += im*im;
00092                 sumw = __fadd_rn(__fmul_rn(im,im),sumw);
00093               }
00094           }
00095         }
00096       }
00097     if(sumw > 0.0F)
00098       res[res_idx] = fabs(sum)/sqrt(sumw);
00099     else
00100       res[res_idx] = sum;
00101   }
00102 
00103 }
00104 
00105 
00106 __global__ void cuda_global_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h,
00107                                                const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00108 {
00109   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00110   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00111   const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00112   const int res_idx = y_pos*src_w+x_pos;
00113 
00114   // Load the filter into shared memory
00115   float *data = (float *) shared_data; // size of tile_width
00116   const int tile_sz = IMUL(tile_height,tile_width);
00117   const int fil_sz = IMUL(Nx,Ny);
00118 
00119   for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00120   {
00121     const int curld_idx = ld_idx+i;
00122     data[curld_idx] = f[curld_idx];
00123   }
00124 
00125   __syncthreads();
00126 
00127   if(y_pos < src_h && x_pos < src_w)
00128   {
00129     const int Nx2 = (Nx - 1) >>1;
00130     const int Ny2 = (Ny - 1) >>1;
00131     float sum = 0.0F;
00132     for(int i=0;i<Ny;i++)
00133       {
00134         const int s_y=y_pos-Ny2+i;
00135         const int f_y=Ny-i-1;
00136         if(s_y >= 0 && s_y < src_h)
00137           {
00138             for(int j=0;j<Nx;j++)
00139               {
00140                 const int s_x=x_pos-Nx2+j;
00141                 const int f_x=Nx-j-1;
00142                 if(s_x >=0 && s_x < src_w)
00143                   {
00144                     const int flt_idx = IMUL(f_y,Nx)+f_x;
00145                     const int src_idx = IMUL(s_y,src_w)+s_x;
00146                     //sum += src[src_idx]*data[flt_idx];
00147                     //sum = __fmaf_rn(src[src_idx],data[flt_idx],sum);
00148                     sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum);
00149                   }
00150               }
00151           }
00152       }
00153     res[res_idx] = sum;
00154   }
00155 
00156 }
00157 
00158 __global__ void cuda_global_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h,
00159                                                 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00160 {
00161   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00162   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00163   const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00164   const int res_idx = y_pos*src_w+x_pos;
00165 
00166   // Load the filter into shared memory
00167   float *data = (float *) shared_data; // size of tile_width
00168   const int tile_sz = IMUL(tile_height,tile_width);
00169   const int fil_sz = IMUL(Nx,Ny);
00170 
00171   for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00172   {
00173     const int curld_idx = ld_idx+i;
00174     data[curld_idx] = f[curld_idx];
00175   }
00176 
00177   __syncthreads();
00178 
00179   if(y_pos < src_h && x_pos < src_w)
00180   {
00181     const int Nx2 = (Nx - 1) / 2;
00182     const int Ny2 = (Ny - 1) / 2;
00183     float sum = 0.0F;
00184     float sumf = 0.0F;
00185     float sumw = 0.0F;
00186     for(int i=0;i<Ny;i++)
00187       {
00188         const int s_y=y_pos-Ny2+i;
00189         const int f_y=Ny-i-1;
00190         for(int j=0;j<Nx;j++)
00191           {
00192             const int s_x=x_pos-Nx2+j;
00193             const int f_x=Nx-j-1;
00194             const int flt_idx = IMUL(f_y,Nx)+f_x;
00195             const int src_idx = IMUL(s_y,src_w)+s_x;
00196             if(s_y >= 0 && s_y < src_h  && s_x >=0 && s_x < src_w)
00197               {
00198                 //sum += src[src_idx]*data[flt_idx];
00199                 sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum);
00200                 sumw += data[flt_idx];
00201                 sumf += data[flt_idx];
00202               }
00203             else
00204               {
00205                 sumf += data[flt_idx];
00206               }
00207           }
00208       }
00209 
00210     res[res_idx] = sum*sumf/sumw;
00211   }
00212 
00213 }
00214 
00215 
00216 __global__ void cuda_global_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00217                                                const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00218 {
00219   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00220   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00221   const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00222   const int res_idx = IMUL(y_pos,src_w)+x_pos;
00223 
00224   // Load the filter into shared memory
00225   const int tile_sz = IMUL(tile_height,tile_width);
00226   const int fil_sz = IMUL(Nx,Ny);
00227 
00228   const int Nx2 = (Nx - 1) >>1;
00229   const int Ny2 = (Ny - 1) >>1;
00230 
00231   float *filt = (float *) shared_data; // size of filter: fil_sz
00232   float *data = (float *) &(filt[fil_sz]); // size of tile: (tile_width+Nx)*(tile_height+Ny);
00233   const int dw = tile_width+Nx;
00234   //const int dh = tile_height+Ny; // Not used
00235   const int shr_x = threadIdx.x+Nx2;
00236   const int shr_y = threadIdx.y+Ny2;
00237   const int shr_idx = IMUL(shr_y,dw)+shr_x;
00238 
00239   // Load filter
00240   for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00241     {
00242       const int curld_idx = ld_idx+i;
00243       filt[curld_idx] = f[curld_idx];
00244     }
00245   if(y_pos < src_h && x_pos < src_w)
00246     {
00247       // Load bulk of tile
00248       data[shr_idx]=src[res_idx];
00249       const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2);
00250       const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1);
00251       const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2);
00252       const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1);
00253       // Load top border
00254       if(topBorder)
00255         data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)];
00256       // Load bottom border
00257       if(bottomBorder)
00258         data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)];
00259       // Load left border
00260       if(leftBorder)
00261         data[shr_idx-Nx2] = src[res_idx-Nx2];
00262       // Load right border
00263       if(rightBorder)
00264         data[shr_idx+Nx2] = src[res_idx+Nx2];
00265       // Load corners
00266       if(topBorder && leftBorder)
00267         data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2];
00268       if(topBorder && rightBorder)
00269         data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2];
00270       if(bottomBorder && leftBorder)
00271         data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2];
00272       if(bottomBorder && rightBorder)
00273         data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2];
00274       __syncthreads();
00275 
00276       float sum = 0.0F, sumw = 0.0F;
00277       for(int i=0;i<Ny;i++)
00278         {
00279           const int s_y=shr_y-Ny2+i;
00280           const int y_loc = y_pos-Ny2+i;
00281           const int f_y=Ny-i-1;
00282           if(y_loc >= 0 && y_loc < src_h)
00283             {
00284               for(int j=0;j<Nx;j++)
00285                 {
00286                   const int s_x=shr_x-Nx2+j;
00287                   const int x_loc = x_pos-Nx2+j;
00288                   const int f_x=Nx-j-1;
00289                   if(x_loc >=0 && x_loc < src_w)
00290                     {
00291                       const int flt_idx = IMUL(f_y,Nx)+f_x;
00292                       const int dat_idx = IMUL(s_y,dw)+s_x;
00293                       //sum += im*data[flt_idx];
00294                       sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum);
00295                       //sumw += im*im;
00296                       sumw = __fadd_rn(__fmul_rn(data[dat_idx],data[dat_idx]),sumw);
00297                     }
00298                 }
00299             }
00300         }
00301       if(sumw > 0.0F)
00302         res[res_idx] = fabs(sum)/sqrt(sumw);
00303       else
00304         res[res_idx] = sum;
00305     }
00306 }
00307 
00308 
00309 
00310 __global__ void cuda_global_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00311                                                const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00312 {
00313   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00314   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00315   const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00316   const int res_idx = IMUL(y_pos,src_w)+x_pos;
00317 
00318   // Load the filter into shared memory
00319   const int tile_sz = IMUL(tile_height,tile_width);
00320   const int fil_sz = IMUL(Nx,Ny);
00321 
00322   const int Nx2 = (Nx - 1) >>1;
00323   const int Ny2 = (Ny - 1) >>1;
00324 
00325   float *filt = (float *) shared_data; // size of filter: fil_sz
00326   float *data = (float *) &(filt[fil_sz]); // size of tile: (tile_width+Nx)*(tile_height+Ny);
00327   const int dw = tile_width+Nx;
00328   //const int dh = tile_height+Ny; // Not used
00329   const int shr_x = threadIdx.x+Nx2;
00330   const int shr_y = threadIdx.y+Ny2;
00331   const int shr_idx = IMUL(shr_y,dw)+shr_x;
00332 
00333   // Load filter
00334   for(int i=0;ld_idx+i<fil_sz;i+=tile_sz)
00335     {
00336       const int curld_idx = ld_idx+i;
00337       filt[curld_idx] = f[curld_idx];
00338     }
00339   if(y_pos < src_h && x_pos < src_w)
00340     {
00341       // Load bulk of tile
00342       data[shr_idx]=src[res_idx];
00343       const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2);
00344       const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1);
00345       const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2);
00346       const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1);
00347       // Load top border
00348       if(topBorder)
00349         data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)];
00350       // Load bottom border
00351       if(bottomBorder)
00352         data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)];
00353       // Load left border
00354       if(leftBorder)
00355         data[shr_idx-Nx2] = src[res_idx-Nx2];
00356       // Load right border
00357       if(rightBorder)
00358         data[shr_idx+Nx2] = src[res_idx+Nx2];
00359       // Load corners
00360       if(topBorder && leftBorder)
00361         data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2];
00362       if(topBorder && rightBorder)
00363         data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2];
00364       if(bottomBorder && leftBorder)
00365         data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2];
00366       if(bottomBorder && rightBorder)
00367         data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2];
00368       __syncthreads();
00369 
00370       float sum = 0.0F;
00371       for(int i=0;i<Ny;i++)
00372         {
00373           const int s_y=shr_y-Ny2+i;
00374           const int y_loc = y_pos-Ny2+i;
00375           const int f_y=Ny-i-1;
00376           if(y_loc >= 0 && y_loc < src_h)
00377             {
00378               for(int j=0;j<Nx;j++)
00379                 {
00380                   const int s_x=shr_x-Nx2+j;
00381                   const int x_loc = x_pos-Nx2+j;
00382                   const int f_x=Nx-j-1;
00383                   if(x_loc >=0 && x_loc < src_w)
00384                     {
00385                       const int flt_idx = IMUL(f_y,Nx)+f_x;
00386                       const int dat_idx = IMUL(s_y,dw)+s_x;
00387                       //sum += src[src_idx]*data[flt_idx];
00388                       //sum = __fmaf_rn(src[src_idx],data[flt_idx],sum);
00389                       sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum);
00390                     }
00391                 }
00392             }
00393         }
00394       res[res_idx] = sum;
00395     }
00396 }
00397 
00398 
00399 
00400 
00401 __global__ void cuda_global_optConvolve(float *res, const float *src, const int src_w, const int src_h,
00402                              const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height)
00403 {
00404   const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y;
00405   const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x;
00406   const int ld_idx = threadIdx.y*tile_width + threadIdx.x;
00407   const int res_idx = y_pos*src_w+x_pos;
00408 
00409   // Load the filter into shared memory
00410   float *data = (float *) shared_data; // size of tile_width
00411   const int tile_sz = IMUL(tile_height,tile_width);
00412   const int fil_sz = IMUL(fil_w,fil_h);
00413 
00414   for(int i=0;i<fil_sz;i+=tile_sz)
00415   {
00416     const int curld_idx = ld_idx+IMUL(i,tile_sz);
00417     if(curld_idx < fil_sz)
00418       data[curld_idx] = f[curld_idx];
00419   }
00420   __syncthreads();
00421 
00422   const int Nx2 = (fil_w - 1) / 2;
00423   const int Ny2 = (fil_h - 1) / 2;
00424 
00425   //const int srow_skip = src_w-fil_w;
00426 
00427   //for (int dst_y = 0; dst_y < src_h; ++dst_y)
00428   // Determine if we're safely inside the image in the y-direction:
00429   const bool y_clean = y_pos >= Ny2 && y_pos <  (src_h - Nx2);
00430 
00431   //for (int dst_x = 0; dst_x < src_w; ++dst_x, ++dptr)
00432   // Determine if we're safely inside the image in the x-direction:
00433   const bool x_clean = x_pos >= Nx2 && x_pos <  (src_w - Nx2);
00434 
00435   // Here is where we pick whether we can use the optimized inner
00436   // loop (in cases where the filter and image patch overlap
00437   // completely) or whether we must use the inner loop that can
00438   // handle boundary conditions.
00439 
00440   if (x_clean && y_clean)
00441     {
00442       float dst_val = 0.0f;
00443       for(int i=0;i<fil_h;i++)
00444         {
00445           const int s_y=y_pos-Ny2+i;
00446           const int f_y=fil_h-i-1;
00447         for(int j=0;j<fil_w;j++)
00448           {
00449             const int s_x=x_pos-Nx2+j;
00450             const int f_x=fil_w-j-1;
00451             const int flt_idx = IMUL(f_y,fil_w)+f_x;
00452             const int src_idx = IMUL(s_y,src_w)+s_x;
00453             dst_val += src[src_idx]*data[flt_idx];
00454           }
00455         }
00456       res[res_idx] = dst_val;
00457     }
00458   else if(x_pos < src_w && y_pos < src_h)
00459     {
00460       // OK, we're at an image boundary, so what do we do to make
00461       // up for the missing pixels? The approach here is to
00462       // compute the average value of the non-missing pixels, and
00463       // proceed as if the missing pixels had that value. This
00464       // minimizes the introduction of edge artifacts e.g. when
00465       // convolving with an oriented filter.
00466       float dst_val = 0.0f;
00467       float src_sum = 0.0f;
00468       int src_cnt = 0;
00469       float fil_sum_skipped = 0.0f;
00470 
00471       for(int i=0;i<fil_h;i++)
00472         {
00473           const int s_y=y_pos-Ny2+i;
00474           const int f_y=fil_h-i-1;
00475           if(s_y >= 0 && s_y < src_h)
00476           {
00477             for(int j=0;j<fil_w;j++)
00478             {
00479               const int s_x=x_pos-Nx2+j;
00480               const int f_x=fil_w-j-1;
00481               const int flt_idx = IMUL(f_y,fil_w)+f_x;
00482               const int src_idx = IMUL(s_y,src_w)+s_x;
00483               if(s_x >=0 && s_x < src_w)
00484               {
00485                 float src_val = src[src_idx];
00486                 dst_val += src_val*data[flt_idx];
00487                 src_sum += src_val;
00488                 src_cnt++;
00489               }
00490               else
00491               {
00492                 fil_sum_skipped += data[flt_idx];
00493               }
00494             }
00495           }
00496           else
00497           {
00498               for (int f_x = fil_w-1; f_x >= 0; --f_x)
00499                 fil_sum_skipped += data[IMUL(f_y,fil_w)+f_x];
00500           }
00501         }
00502       const float src_avg = src_sum / src_cnt;
00503       res[res_idx] = dst_val + (fil_sum_skipped*src_avg);
00504 
00505 
00506     }
00507 
00508 }
00509 
00510 
00511 
00512 __global__ void cuda_global_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00513 {
00514   const int hfs2 = (hfs-1) / 2;
00515   const int y_pos = blockIdx.y;
00516   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00517   const int ld_idx = threadIdx.x;
00518   const int res_idx = y_pos*src_w+x_pos;
00519 
00520   // Load the filter into shared memory
00521   float *data = (float *) shared_data; // size of share_len
00522   float sum = 0;
00523   const int numIters = IDIVUP(hfs,shr_len);
00524   // Handle the fact that the filter might be very large, and unable to fit in our shared memory size
00525   for(int iter=0;iter<numIters;iter++)
00526     {
00527       const int filt_beg = IMUL(iter,shr_len);
00528       const int filt_end = filt_beg+shr_len;
00529       const int load_max = min(filt_end,hfs)-filt_beg;
00530       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00531         {
00532           const int filt_idx = data_idx + filt_beg;
00533           data[data_idx] = f[filt_idx];
00534         }
00535       __syncthreads();
00536 
00537       if(y_pos < src_h && x_pos < src_w)
00538         {
00539 
00540           for(int j=0;j<load_max;j++)
00541             {
00542               const int s_x=x_pos+hfs2-(filt_beg+j);
00543               const int f_x=j;
00544               if(s_x >=0 && s_x < src_w)
00545                 {
00546                   const int src_idx = IMUL(y_pos,src_w)+s_x;
00547                   sum += src[src_idx]*data[f_x];
00548                 }
00549             }
00550 
00551         }
00552       // Avoid doing this synch unless we have a very large filter
00553       if(numIters > 1)
00554         __syncthreads();
00555 
00556     }
00557   if(y_pos < src_h && x_pos < src_w)
00558     {
00559       res[res_idx] = sum;
00560     }
00561 
00562 }
00563 
00564 __global__ void cuda_global_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00565 {
00566   const int hfs2 = (hfs - 1) / 2;
00567   const int y_pos = blockIdx.y;
00568   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00569   const int ld_idx = threadIdx.x;
00570   const int res_idx = y_pos*src_w+x_pos;
00571 
00572   // Load the filter into shared memory
00573   float *data = (float *) shared_data; // size of tile_width
00574   float sum = 0, sumw = 0, sumf = 0;
00575   const int numIters = IDIVUP(hfs,shr_len);
00576   // Handle the fact that the filter might be very large, and unable to fit in our shared memory size
00577   for(int iter=0;iter<numIters;iter++)
00578     {
00579       const int filt_beg = IMUL(iter,shr_len);
00580       const int filt_end = filt_beg+shr_len;
00581       const int load_max = min(filt_end,hfs)-filt_beg;
00582       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00583         {
00584           const int filt_idx = data_idx + filt_beg;
00585           data[data_idx] = f[filt_idx];
00586         }
00587       __syncthreads();
00588 
00589       if(y_pos < src_h && x_pos < src_w)
00590         {
00591 
00592           for(int j=0;j<load_max;j++)
00593             {
00594               const int s_x=x_pos+hfs2-(filt_beg+j);
00595               const int f_x=j;
00596               if(s_x >=0 && s_x < src_w)
00597                 {
00598                   const int src_idx = IMUL(y_pos,src_w)+s_x;
00599                   sum += src[src_idx]*data[f_x];
00600                   sumw += data[f_x];
00601                 }
00602               else
00603                 {
00604                   sumf += data[f_x];
00605                 }
00606             }
00607         }
00608       // Avoid doing this synch unless we have a very large filter
00609       if(numIters > 1)
00610         __syncthreads();
00611 
00612     }
00613   if(y_pos < src_h && x_pos < src_w)
00614     {
00615       sumf+=sumw;
00616       res[res_idx] = sum*sumf/sumw;
00617     }
00618 
00619 }
00620 
00621 __global__ void cuda_global_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00622 {
00623   const int hfs2 = (hfs - 1) / 2;
00624   const int y_pos = blockIdx.y;
00625   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00626   const int ld_idx = threadIdx.x;
00627   const int res_idx = y_pos*src_w+x_pos;
00628 
00629   // Load the filter into shared memory
00630   float *data = (float *) shared_data; // size of tile_width
00631   float sum = 0;
00632   const int numIters = IDIVUP(hfs,shr_len);
00633   // Handle the fact that the filter might be very large, and unable to fit in our shared memory size
00634   for(int iter=0;iter<numIters;iter++)
00635     {
00636       const int filt_beg = IMUL(iter,shr_len);
00637       const int filt_end = filt_beg+shr_len;
00638       const int load_max = min(filt_end,hfs)-filt_beg;
00639       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00640         {
00641           const int filt_idx = data_idx + filt_beg;
00642           data[data_idx] = f[filt_idx];
00643         }
00644       __syncthreads();
00645 
00646       if(y_pos < src_h && x_pos < src_w)
00647         {
00648 
00649           for(int j=0;j<load_max;j++)
00650             {
00651               const int s_x=x_pos+hfs2-(filt_beg+j);
00652               const int f_x=j;
00653               if(s_x < 0)
00654                 {
00655                   sum += src[IMUL(y_pos,src_w)]*data[f_x];
00656                 }
00657               else if(s_x >= src_w)
00658                 {
00659                   sum += src[IMUL(y_pos+1,src_w)-1]*data[f_x];
00660                 }
00661               else
00662                 {
00663                   const int src_idx = IMUL(y_pos,src_w)+s_x;
00664                   sum += src[src_idx]*data[f_x];
00665                 }
00666             }
00667         }
00668       // Avoid doing this synch unless we have a very large filter
00669       if(numIters > 1)
00670         __syncthreads();
00671 
00672     }
00673   if(y_pos < src_h && x_pos < src_w)
00674     {
00675       res[res_idx] = sum;
00676     }
00677 
00678 
00679 }
00680 
00681 
00682 __global__ void cuda_global_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00683 {
00684   const int hfs2 = (hfs - 1) / 2;
00685   const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00686   const int x_pos = blockIdx.x;
00687   const int ld_idx = threadIdx.y;
00688   const int res_idx = y_pos*src_w+x_pos;
00689 
00690   // Load the filter into shared memory
00691   float *data = (float *) shared_data; // size of tile_width
00692   float sum = 0;
00693   const int numIters = IDIVUP(hfs,shr_len);
00694   // Handle the fact that the filter might be very large, and unable to fit in our memory size
00695   for(int iter=0;iter<numIters;iter++)
00696     {
00697       const int filt_beg = IMUL((iter),shr_len);
00698       const int filt_end = filt_beg+shr_len;
00699       const int load_max = min(filt_end,hfs)-filt_beg;
00700       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00701         {
00702           const int filt_idx = data_idx + filt_beg;
00703           data[data_idx] = f[filt_idx];
00704         }
00705       __syncthreads();
00706 
00707       if(y_pos < src_h && x_pos < src_w)
00708         {
00709 
00710           for(int j=0;j<load_max;j++)
00711             {
00712               const int s_y=y_pos+hfs2-(filt_beg+j);
00713               const int f_y=j;
00714               if(s_y >=0 && s_y < src_h)
00715                 {
00716                   const int src_idx = IMUL(s_y,src_w)+x_pos;
00717                   sum += src[src_idx]*data[f_y];
00718                 }
00719             }
00720         }
00721       // Avoid doing this synch unless we have a very large filter
00722       if(numIters > 1)
00723         __syncthreads();
00724 
00725     }
00726   if(y_pos < src_h && x_pos < src_w)
00727     {
00728       res[res_idx] = sum;
00729     }
00730 
00731 }
00732 
00733 __global__ void cuda_global_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00734 {
00735   const int hfs2 = (hfs - 1) / 2;
00736   const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00737   const int x_pos = blockIdx.x;
00738   const int ld_idx = threadIdx.y;
00739   const int res_idx = y_pos*src_w+x_pos;
00740 
00741   // Load the filter into shared memory
00742   float *data = (float *) shared_data; // size of tile_width
00743   float sum = 0, sumw = 0, sumf = 0;
00744   const int numIters = IDIVUP(hfs,shr_len);
00745   // Handle the fact that the filter might be very large, and unable to fit in our memory size
00746   for(int iter=0;iter<numIters;iter++)
00747     {
00748       const int filt_beg = IMUL((iter),shr_len);
00749       const int filt_end = filt_beg+shr_len;
00750       const int load_max = min(filt_end,hfs)-filt_beg;
00751       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00752         {
00753           const int filt_idx = data_idx + filt_beg;
00754           data[data_idx] = f[filt_idx];
00755         }
00756       __syncthreads();
00757 
00758       if(y_pos < src_h && x_pos < src_w)
00759         {
00760 
00761           for(int j=0;j<load_max;j++)
00762             {
00763               const int s_y=y_pos+hfs2-(filt_beg+j);
00764               const int f_y=j;
00765               if(s_y >=0 && s_y < src_h)
00766                 {
00767                   const int src_idx = IMUL(s_y,src_w)+x_pos;
00768                   sum += src[src_idx]*data[f_y];
00769                   sumw += data[f_y];
00770                 }
00771               else
00772                 {
00773                   sumf += data[f_y];
00774                 }
00775             }
00776         }
00777       // Avoid doing this synch unless we have a very large filter
00778       if(numIters > 1)
00779         __syncthreads();
00780     }
00781   if(y_pos < src_h && x_pos < src_w)
00782     {
00783       sumf+=sumw;
00784       res[res_idx] = sum*sumf/sumw;
00785     }
00786 
00787 }
00788 
00789 __global__ void cuda_global_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len)
00790 {
00791   const int hfs2 = (hfs - 1) / 2;
00792   const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
00793   const int x_pos = blockIdx.x;
00794   const int ld_idx = threadIdx.y;
00795   const int res_idx = y_pos*src_w+x_pos;
00796 
00797 
00798   // Load the filter into shared memory
00799   float *data = (float *) shared_data; // size of tile_width
00800   float sum = 0;
00801   const int numIters = IDIVUP(hfs,shr_len);
00802   // Handle the fact that the filter might be very large, and unable to fit in our memory size
00803   for(int iter=0;iter<numIters;iter++)
00804     {
00805       const int filt_beg = IMUL((iter),shr_len);
00806       const int filt_end = filt_beg+shr_len;
00807       const int load_max = min(filt_end,hfs)-filt_beg;
00808       for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len)
00809         {
00810           const int filt_idx = data_idx + filt_beg;
00811           data[data_idx] = f[filt_idx];
00812         }
00813       __syncthreads();
00814 
00815       if(y_pos < src_h && x_pos < src_w)
00816         {
00817           for(int j=0;j<load_max;j++)
00818             {
00819               const int s_y=y_pos+hfs2-(filt_beg+j);
00820               const int f_y=j;
00821               if(s_y < 0)
00822                 {
00823                   sum += src[x_pos]*data[f_y];
00824                 }
00825               else if(s_y >= src_h)
00826                 {
00827                   sum += src[IMUL(src_h-1,src_w)+x_pos]*data[f_y];
00828                 }
00829               else
00830                 {
00831                   const int src_idx = IMUL(s_y,src_w)+x_pos;
00832                   sum += src[src_idx]*data[f_y];
00833                 }
00834             }
00835         }
00836       // Avoid doing this synch unless we have a very large filter
00837       if(numIters > 1)
00838         __syncthreads();
00839     }
00840   if(y_pos < src_h && x_pos < src_w)
00841     {
00842       res[res_idx] = sum;
00843     }
00844 
00845 
00846 }
00847 
00848 
00849 __global__ void cuda_global_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00850 {
00851   const int hfs2 = (hfs-1) / 2;
00852   const int y_pos = blockIdx.y;
00853   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00854   const int data_idx = threadIdx.x;
00855   const int res_idx = y_pos*src_w+x_pos;
00856   const int ctr_off = hfs2;
00857 
00858   // Load the filter into shared memory
00859   float *filt = (float *) shared_data; // size of hfs
00860   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
00861 
00862   float sum = 0;
00863 
00864   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00865     {
00866       filt[filt_idx] = f[filt_idx];
00867     }
00868   data[data_idx+ctr_off] = src[res_idx];
00869   // Load early border
00870   if(data_idx < ctr_off && x_pos > ctr_off)
00871     {
00872       data[data_idx] = src[res_idx-ctr_off];
00873     }
00874   // Load late border
00875   if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00876     {
00877       data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00878     }
00879   __syncthreads();
00880 
00881 
00882   if(y_pos < src_h && x_pos < src_w)
00883     {
00884 
00885       for(int j=0;j<hfs;j++)
00886         {
00887           const int s_x=data_idx+ctr_off+hfs2-j;
00888           const int x_loc=x_pos+hfs2-j;
00889           const int f_x=j;
00890           if(x_loc >=0 && x_loc < src_w)
00891             {
00892               sum += data[s_x]*filt[f_x];
00893             }
00894         }
00895 
00896     }
00897   if(y_pos < src_h && x_pos < src_w)
00898     {
00899       res[res_idx] = sum;
00900     }
00901 
00902 }
00903 
00904 
00905 __global__ void cuda_global_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00906 {
00907   const int hfs2 = (hfs - 1) / 2;
00908   const int y_pos = blockIdx.y;
00909   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00910   const int data_idx = threadIdx.x;
00911   const int res_idx = y_pos*src_w+x_pos;
00912   const int ctr_off = hfs2;
00913 
00914   // Load the filter into shared memory
00915   float *filt = (float *) shared_data; // size of hfs
00916   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
00917 
00918   float sum = 0, sumw = 0, sumf = 0;
00919 
00920   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00921     {
00922       filt[filt_idx] = f[filt_idx];
00923     }
00924   if(y_pos < src_h && x_pos < src_w)
00925     {
00926 
00927       data[data_idx+ctr_off] = src[res_idx];
00928       // Load early border
00929       if(data_idx < ctr_off && x_pos > ctr_off)
00930         {
00931           data[data_idx] = src[res_idx-ctr_off];
00932         }
00933       // Load late border
00934       if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00935         {
00936           data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00937         }
00938       __syncthreads();
00939 
00940 
00941       for(int j=0;j<hfs;j++)
00942         {
00943           const int s_x=data_idx+ctr_off+hfs2-j;
00944           const int x_loc=x_pos+hfs2-j;
00945           const int f_x=j;
00946           if(x_loc >=0 && x_loc < src_w)
00947             {
00948               sum += data[s_x]*filt[f_x];
00949               sumw += filt[f_x];
00950             }
00951           else
00952             {
00953               sumf += filt[f_x];
00954             }
00955         }
00956       sumf+=sumw;
00957       res[res_idx] = sum*sumf/sumw;
00958     }
00959 
00960 }
00961 
00962 
00963 __global__ void cuda_global_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00964 {
00965   const int hfs2 = (hfs - 1) / 2;
00966   const int y_pos = blockIdx.y;
00967   const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x;
00968   const int data_idx = threadIdx.x;
00969   const int res_idx = y_pos*src_w+x_pos;
00970   const int ctr_off = hfs2;
00971 
00972   // Load the filter into shared memory
00973   float *filt = (float *) shared_data; // size of hfs
00974   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
00975 
00976   float sum = 0;
00977 
00978   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
00979     {
00980       filt[filt_idx] = f[filt_idx];
00981     }
00982   if(y_pos < src_h && x_pos < src_w)
00983     {
00984 
00985       data[data_idx+ctr_off] = src[res_idx];
00986       // Load early border
00987       if(data_idx < ctr_off && x_pos > ctr_off)
00988         {
00989           data[data_idx] = src[res_idx-ctr_off];
00990         }
00991       // Load late border
00992       if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off)
00993         {
00994           data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off];
00995         }
00996       __syncthreads();
00997 
00998       for(int j=0;j<hfs;j++)
00999         {
01000           const int x_loc=x_pos+hfs2-j;
01001           const int f_x=j;
01002           if(x_loc < 0)
01003             {
01004               // Hold to the leftmost pixel
01005               sum += data[ctr_off]*filt[f_x];
01006             }
01007           else if(x_loc >= src_w)
01008             {
01009               // Hold to the rightmost pixel
01010               const int max_x = src_w-IMUL(blockIdx.x, tile_len)+ctr_off-1;
01011               sum += data[max_x]*filt[f_x];
01012             }
01013           else
01014             {
01015               const int s_x=data_idx+ctr_off+hfs2-j;
01016               sum += data[s_x]*filt[f_x];
01017             }
01018         }
01019 
01020       res[res_idx] = sum;
01021     }
01022 
01023 }
01024 
01025 
01026 __global__ void cuda_global_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01027 {
01028   const int hfs2 = (hfs-1) / 2;
01029   const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
01030   const int x_pos = blockIdx.x;
01031   const int data_idx = threadIdx.y;
01032   const int res_idx = y_pos*src_w+x_pos;
01033   const int ctr_off = hfs2;
01034 
01035   // Load the filter into shared memory
01036   float *filt = (float *) shared_data; // size of hfs
01037   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
01038 
01039   float sum = 0;
01040 
01041   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01042     {
01043       filt[filt_idx] = f[filt_idx];
01044     }
01045   if(y_pos < src_h && x_pos < src_w)
01046     {
01047       data[data_idx+ctr_off] = src[res_idx];
01048       // Load early border
01049       if(data_idx < ctr_off && y_pos > ctr_off)
01050         {
01051           data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01052         }
01053       // Load late border
01054       if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01055         {
01056           data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01057         }
01058       __syncthreads();
01059 
01060 
01061       for(int j=0;j<hfs;j++)
01062         {
01063           const int s_y=data_idx+ctr_off+hfs2-j;
01064           const int y_loc=y_pos+hfs2-j;
01065           const int f_y=j;
01066           if(y_loc >=0 && y_loc < src_h)
01067             {
01068               sum += data[s_y]*filt[f_y];
01069             }
01070         }
01071 
01072       res[res_idx] = sum;
01073     }
01074 
01075 }
01076 
01077 
01078 __global__ void cuda_global_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01079 {
01080   const int hfs2 = (hfs - 1) / 2;
01081   const int y_pos = IMUL(blockIdx.y,tile_len) + threadIdx.y;
01082   const int x_pos = blockIdx.x;
01083   const int data_idx = threadIdx.y;
01084   const int res_idx = y_pos*src_w+x_pos;
01085   const int ctr_off = hfs2;
01086 
01087   // Load the filter into shared memory
01088   float *filt = (float *) shared_data; // size of hfs
01089   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
01090 
01091   float sum = 0, sumw = 0, sumf = 0;
01092 
01093   // Load the filter
01094   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01095     {
01096       filt[filt_idx] = f[filt_idx];
01097     }
01098   if(y_pos < src_h && x_pos < src_w)
01099     {
01100       // Load the bulk of the tile data
01101       data[data_idx+ctr_off] = src[res_idx];
01102       // Load early border
01103       if(data_idx < ctr_off && y_pos > ctr_off)
01104         {
01105           data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01106         }
01107       // Load late border
01108       if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01109         {
01110           data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01111         }
01112       __syncthreads();
01113 
01114       for(int j=0;j<hfs;j++)
01115         {
01116           const int s_y=data_idx+ctr_off+hfs2-j;
01117           const int y_loc=y_pos+hfs2-j;
01118           const int f_y=j;
01119           if(y_loc >=0 && y_loc < src_h)
01120             {
01121               sum += data[s_y]*filt[f_y];
01122               sumw += filt[f_y];
01123             }
01124           else
01125             {
01126               sumf += filt[f_y];
01127             }
01128         }
01129     }
01130   if(y_pos < src_h && x_pos < src_w)
01131     {
01132       sumf+=sumw;
01133       res[res_idx] = sum*sumf/sumw;
01134     }
01135 
01136 }
01137 
01138 __global__ void cuda_global_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
01139 {
01140   const int hfs2 = (hfs - 1) / 2;
01141   const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y;
01142   const int x_pos = blockIdx.x;
01143   const int data_idx = threadIdx.y;
01144   const int res_idx = y_pos*src_w+x_pos;
01145   const int ctr_off = hfs2;
01146 
01147   // Load the filter into shared memory
01148   float *filt = (float *) shared_data; // size of hfs
01149   float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1
01150 
01151   float sum = 0;
01152 
01153   for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len)
01154     {
01155       filt[filt_idx] = f[filt_idx];
01156     }
01157   if(y_pos < src_h && x_pos < src_w)
01158     {
01159       data[data_idx+ctr_off] = src[res_idx];
01160       // Load early border
01161       if(data_idx < ctr_off && y_pos > ctr_off)
01162         {
01163           data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)];
01164         }
01165       // Load late border
01166       if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off)
01167         {
01168           data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)];
01169         }
01170       __syncthreads();
01171 
01172       for(int j=0;j<hfs;j++)
01173         {
01174           const int y_loc=y_pos+hfs2-j;
01175           const int f_y=j;
01176           if(y_loc < 0)
01177             {
01178               // Hold to the topmost pixel
01179               sum += data[ctr_off]*filt[f_y];
01180             }
01181           else if(y_loc >= src_h)
01182             {
01183               // Hold to the bottommost pixel
01184               const int max_y = src_h-IMUL(blockIdx.y, tile_len)+ctr_off-1;
01185               sum += data[max_y]*filt[f_y];
01186             }
01187           else
01188             {
01189               const int s_y=data_idx+ctr_off+hfs2-j;
01190               sum += data[s_y]*filt[f_y];
01191             }
01192         }
01193     }
01194   if(y_pos < src_h && x_pos < src_w)
01195     {
01196       res[res_idx] = sum;
01197     }
01198 
01199 }
01200 
01201 
01202 #endif