cuda-lowpass.cu

Go to the documentation of this file.
00001 /*!@file CUDA/cuda-lowpass.cu CUDA/GPU optimized lowpass code */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL$
00035 // $Id$
00036 //
00037 
00038 #include "CUDA/cuda-lowpass.h"
00039 #include <cuda.h>
00040 #include "CUDA/cutil.h"
00041 // define int as 32 bits on CUDA architecture to satisfy env_config.h
00042 #define INT_IS_32_BITS
00043 #include "Envision/env_types.h"
00044 
00045 // 24-bit multiplication is faster on G80, but we must be sure to
00046 // multiply integers only within [-8M, 8M - 1] range
00047 #define IMUL(a, b) __mul24(a, b)
00048 
00049 #define ROW_TILE_W 128
00050 #define COLUMN_TILE_W 16
00051 #define COLUMN_TILE_H 16 //48
00052 
00053 
00054 
00055 ////////////////////////////////////////////////////////////////////////////////
00056 // Row convolution filter
00057 ////////////////////////////////////////////////////////////////////////////////
00058 __global__ void cudalowpass5xdecx(const int *src,  const unsigned int w, const unsigned int h, int* dst)
00059 {
00060   __shared__ int data[ROW_TILE_W]; // Data cache in shared memory
00061 
00062   const int sx = threadIdx.x;                    // source pixel within source tile
00063   const int dx = (sx >> 1);                      // dest pixel within dest tile (decimated 2x)
00064   const int sts = IMUL(blockIdx.x, ROW_TILE_W);  // tile start for source, relative to row start
00065   const int dts = (sts >>1);                     // tile start for dest, relative to row start
00066   const int srs = IMUL(blockIdx.y, w);           // Row start index in source data
00067   const int drs = IMUL(blockIdx.y, (w >> 1));    // Row start index in dest data
00068 
00069   // Load global memory values into our data cache:
00070   const int loadIdx = sts + sx;  // index of one pixel value to load
00071   if (loadIdx < w) data[sx] = src[srs + loadIdx];
00072 
00073   int border; if (sx == 0 && sts > 0) border = src[srs + sts - 1]; else border = 0;
00074   const int ww = (w & 0xfffe); // evened-down source size
00075 
00076   // Ensure the completness of loading stage because results emitted
00077   // by each thread depend on the data loaded by other threads:
00078   __syncthreads();
00079 
00080   // only process every other pixel
00081   if ( (sx & 1) == 0 && loadIdx < ww) {
00082     const int writeIdx = dts + dx; // write index relative to row start
00083     const int *dptr = data + sx;
00084 
00085     if (loadIdx == 0) dst[drs + writeIdx] = (dptr[1] + ((*dptr) << 1)) / 3;            // first pixel of image
00086     else if (sx == 0) dst[drs + writeIdx] = (border + dptr[1] + ((*dptr) << 1) ) >> 2; // first of tile
00087     else dst[drs + writeIdx] = (dptr[-1] + dptr[1] + ((*dptr) << 1)) >> 2;             // all other pixels
00088   }
00089 }
00090 
00091 ////////////////////////////////////////////////////////////////////////////////
00092 // Column convolution filter
00093 ////////////////////////////////////////////////////////////////////////////////
00094 __global__ void cudalowpass5ydecy(const int *src,  const unsigned int w, const unsigned int h,
00095                                   int* dst, int sms, int gms)
00096 {
00097   // Data cache
00098   __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H];
00099   __shared__ int border[COLUMN_TILE_W];
00100 
00101   const int sy = threadIdx.y; // source pixel row within source tile
00102   const int dy = (sy >> 1);   // dest pixel row within dest tile (decimated 2x)
00103 
00104   const int sts = IMUL(blockIdx.y, COLUMN_TILE_H); // tile start for source, in rows
00105   const int ste = sts + COLUMN_TILE_H; // tile end for source, in rows
00106 
00107   const int dts = (sts >> 1);
00108   const int dte = (ste >> 1);
00109 
00110   // Clamp tile and apron limits by image borders
00111   const int stec = min(ste, h);
00112   const int dtec = min(dte, (h >> 1));
00113 
00114   // Current column index
00115   const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00116   const int dcs = scs;
00117 
00118   // only process columns that are actually within image bounds:
00119   if (scs < w) {
00120     // Shared and global memory indices for current column
00121     int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00122     int gmemPos = IMUL(sts + sy, w) + scs;
00123 
00124     // Cycle through the entire data cache
00125     // Load global memory values, if indices are within the image borders:
00126     for (int y = sts + sy; y < stec; y += blockDim.y) {
00127       data[smemPos] = src[gmemPos];
00128       smemPos += sms; gmemPos += gms;
00129     }
00130 
00131     if (sy == 0 && sts > 0) border[threadIdx.x] = src[IMUL(sts - 1, w) + scs];
00132 
00133     // Ensure the completness of loading stage because results emitted
00134     // by each thread depend on the data loaded by other threads:
00135     __syncthreads();
00136 
00137     // only process every other row
00138     if ((sy & 1) == 0) {
00139       // Shared and global memory indices for current column
00140       smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00141       gmemPos = IMUL(dts + dy, w) + dcs;
00142   
00143       // Cycle through the tile body, clamped by image borders
00144       // Calculate and output the results
00145       int *dptr = data + smemPos;
00146       int dgms = (gms >> 1);  // memory stride for dest
00147 
00148       if (sts + sy == 0) { // top row of image
00149         dst[gmemPos] = (dptr[COLUMN_TILE_W] + ((*dptr) << 1)) / 3;
00150         dptr += sms; gmemPos += dgms;
00151         for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) {
00152           dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00153           dptr += sms; gmemPos += dgms;
00154         }
00155       } else if (sy == 0) { // top row of a tile
00156         dst[gmemPos] = (border[threadIdx.x] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00157         dptr += sms; gmemPos += dgms;
00158         for (int y = sts + sy + blockDim.y; y < stec; y += blockDim.y) {
00159           dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00160           dptr += sms; gmemPos += dgms;
00161         }
00162       } else { // all other rows
00163         for (int y = sts + sy; y < stec; y += blockDim.y) {
00164           dst[gmemPos] = (dptr[-COLUMN_TILE_W] + dptr[COLUMN_TILE_W] + ((*dptr) << 1)) >> 2;
00165           dptr += sms; gmemPos += dgms;
00166         }
00167       }
00168     }
00169   }
00170 }
00171 
00172 //Round a / b to nearest higher integer value
00173 inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
00174 
00175 // ######################################################################
00176 void cuda_lowpass_5_x_dec_x_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst)
00177 {
00178   dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h);
00179   dim3 threadBlockRows(ROW_TILE_W);
00180 
00181   cudalowpass5xdecx<<<blockGridRows, threadBlockRows>>>(src, w, h, dst);
00182 }
00183 
00184 // ######################################################################
00185 void cuda_lowpass_5_y_dec_y_fewbits_optim(const int* src, const unsigned int w, const unsigned int h, int* dst)
00186 {
00187   dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H));
00188   dim3 threadBlockColumns(COLUMN_TILE_W, 8);
00189 
00190   cudalowpass5ydecy<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst, 
00191                                                               COLUMN_TILE_W * threadBlockColumns.y,
00192                                                               w * threadBlockColumns.y);
00193 }
00194 
00195 
00196 __global__ void cudalowpass9x(const int* src, const unsigned int w, const unsigned int h, int* dst)
00197 {
00198   __shared__ int data[ROW_TILE_W]; // Data cache in shared memory
00199   __shared__ int border[6];  // Bordering data flanking this tile
00200   const int sx = threadIdx.x;                   // source pixel within source tile
00201   const int sts = IMUL(blockIdx.x, ROW_TILE_W); // tile start for source, relative to row start
00202   const int srs = IMUL(blockIdx.y,w);           // Row start index in source data
00203 
00204   const int loadIdx = sts + sx; // index of one pixel value to load
00205   const int off = sx - 3; // Offset for the data storing
00206 
00207   // Load border pixels
00208   if (sx < 3 && sts > 0) border[sx] = src[srs + sts - (3-sx)];
00209   if (sx >= ROW_TILE_W-3 && sts+ROW_TILE_W < w-3) border[3+sx-(ROW_TILE_W-3)] = src[srs + sts + sx + 3];
00210  
00211  // Load the row into shared memory among the thread block
00212   if (loadIdx < w) 
00213     data[sx] = src[srs + loadIdx];
00214   else
00215     return; // Threads that are over the edge of the image on the right most tile...
00216 
00217   // Ensure the completness of loading stage because results emitted
00218   // by each thread depend on the data loaded by other threads:
00219   __syncthreads();
00220 
00221   // First part of source row, just reduce sample
00222   if(sts+sx < 3)
00223   {
00224     switch(sx)
00225     {
00226     case 0:
00227       dst[srs + loadIdx] =                 // [ (72^) 56 28 8 ]
00228                            (data[0]* 72 +
00229                             data[1] * 56 +
00230                             data[2] * 28 +
00231                             data[3] *  8
00232                             ) / 164;
00233       break;
00234     case 1:
00235       dst[srs + loadIdx] =                  // [ 56^ (72) 56 28 8 ]
00236                            ((data[0] + data[2]) * 56 +
00237                             data[1] * 72 +
00238                             data[3] * 28 +
00239                             data[4] *  8
00240                             ) / 220;
00241       break;
00242     case 2:
00243       dst[srs + loadIdx] =                  // [ 28^ 56 (72) 56 28 8 ]
00244                            ((data[0] + data[4]) * 28 +
00245                             (data[1] + data[3]) * 56 +
00246                             data[2] * 72 +
00247                             data[5] *  8
00248                             ) / 248;
00249     default:
00250       //LERROR();
00251       break;
00252     }
00253   }
00254   // If not the first part of the soure row, but is the first bit of this tile, use border
00255   else if(sx < 3 && sts+sx < w-3)
00256   {
00257     switch(sx)
00258     {
00259     case 0:
00260       dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00261         ((border[0] + data[off+6]) *  8 +
00262          (border[1] + data[off+5]) * 28 +
00263          (border[2] + data[off+4]) * 56 +
00264          data[off+3] * 72
00265          ) >> 8;
00266       break;
00267     case 1:
00268       dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00269         ((border[1] + data[off+6]) *  8 +
00270          (border[2] + data[off+5]) * 28 +
00271          (data[off+2] + data[off+4]) * 56 +
00272          data[off+3] * 72
00273          ) >> 8;
00274       break;    
00275     case 2:
00276       dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00277         ((border[2] + data[off+6]) *  8 +
00278          (data[off+1] + data[off+5]) * 28 +
00279          (data[off+2] + data[off+4]) * 56 +
00280          data[off+3] * 72
00281          ) >> 8;
00282       break;
00283     }
00284   }
00285   // If we are not near the edge of this tile, do standard way
00286   else if(sx < ROW_TILE_W-3 && sts +sx < w-3)
00287   {
00288     dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00289       ((data[off+0] + data[off+6]) *  8 +
00290        (data[off+1] + data[off+5]) * 28 +
00291        (data[off+2] + data[off+4]) * 56 +
00292        data[off+3] * 72
00293        ) >> 8;
00294   }
00295   // If not the last part of the source row, but in the last bit of this tile, use border
00296   else if(sts+sx < w-3)
00297   {
00298     switch(sx)
00299       {
00300       case ROW_TILE_W-3:
00301         dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00302           ((data[off+0] + border[3]) *  8 +
00303            (data[off+1] + data[off+5]) * 28 +
00304            (data[off+2] + data[off+4]) * 56 +
00305            data[off+3] * 72
00306            ) >> 8;
00307         break;
00308       case ROW_TILE_W-2:
00309         dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00310           ((data[off+0] + border[4]) *  8 +
00311            (data[off+1] + border[3]) * 28 +
00312            (data[off+2] + data[off+4]) * 56 +
00313            data[off+3] * 72
00314            ) >> 8;
00315         break;    
00316       case ROW_TILE_W-1:
00317         dst[srs + loadIdx] =              // [ 8^ 28 56 (72) 56 28 8 ]
00318           ((data[off+0] + border[5]) *  8 +
00319            (data[off+1] + border[4]) * 28 +
00320            (data[off+2] + border[3]) * 56 +
00321            data[off+3] * 72
00322            ) >> 8;
00323         break;
00324   }
00325   }
00326   // If in the last bit of the source row, reduce sample
00327   else if(sts + sx < w)
00328   {
00329 //      dst[srs  + loadIdx] =                  // [ 8^ 28 56 (72) ]
00330 //         (data[off+0] *  8 +
00331 //          data[off+1] * 28 +
00332 //          data[off+2] * 56 +
00333 //          data[off+3] * 72
00334 //          ) / 164;
00335 //   }
00336 //   else if(sx < 0)
00337 //   {
00338     switch(w-(sts+sx))
00339     {
00340     case 3:
00341       dst[srs + loadIdx] =                  // [ 8^ 28 56 (72) 56 28 ]
00342         (data[off+0] *  8 +
00343          (data[off+1] + data[off+5]) * 28 +
00344          (data[off+2] + data[off+4]) * 56 +
00345          data[off+3] * 72
00346          ) / 248;
00347       break;
00348     case 2:
00349       dst[srs + loadIdx] =                  // [ 8^ 28 56 (72) 56 ]
00350         (data[off+0] *  8 +
00351          data[off+1] * 28 +
00352          (data[off+2] + data[off+4]) * 56 +
00353          data[off+3] * 72
00354          ) / 220;
00355       break;
00356     case 1:
00357       dst[srs  + loadIdx] =                  // [ 8^ 28 56 (72) ]
00358         (data[off+0] *  8 +
00359          data[off+1] * 28 +
00360          data[off+2] * 56 +
00361          data[off+3] * 72
00362          ) / 164;
00363       break;
00364     default:
00365       dst[srs + loadIdx] = sx;
00366     }
00367   }
00368 }
00369 
00370 // ######################################################################
00371 __global__ void cudalowpass9y(const int* src,
00372                                      const unsigned int w,
00373                                      const unsigned int h,
00374                                      int* dst, int sms, int gms)
00375 {
00376 
00377   // Data cache
00378   __shared__ int data[COLUMN_TILE_W * COLUMN_TILE_H];
00379   __shared__ int border[COLUMN_TILE_W * 6];
00380 
00381   const int sy = threadIdx.y; // source pixel row within source tile
00382 
00383   const int sts = IMUL(blockIdx.y, COLUMN_TILE_H); // tile start for source, in rows
00384   const int ste = sts + COLUMN_TILE_H; // tile end for source, in rows
00385 
00386 
00387   // Clamp tile and apron limits by image borders
00388   const int stec = min(ste, h);
00389 
00390   // Current column index
00391   const int scs = IMUL(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
00392 
00393   // only process columns that are actually within image bounds:
00394   if (scs < w && sts+sy < stec) 
00395   {
00396     // Shared and global memory indices for current column
00397     int smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00398     int gmemPos = IMUL(sts + sy, w) + scs;
00399 
00400     // Cycle through the entire data cache
00401     // Load global memory values, if indices are within the image borders:
00402 //     for (int y = sts + sy; y < stec; y += blockDim.y) {
00403 //       data[smemPos] = src[gmemPos];
00404 //       smemPos += sms; gmemPos += gms;
00405 //     }
00406     data[smemPos] = src[gmemPos];
00407 
00408     if (sy < 3 && gmemPos > IMUL(3,w)) 
00409       border[smemPos] = src[gmemPos-IMUL(3,w)];
00410       //border[threadIdx.x+IMUL(sy,COLUMN_TILE_W)] = src[IMUL(sts-(3-sy), w) + scs];
00411 
00412     int bordOff = 6+sy-COLUMN_TILE_H;
00413 
00414     if (sy >= COLUMN_TILE_H-3 && ste+3 < h) //blockIdx.y < blockDim.y-1) 
00415       border[threadIdx.x+IMUL(bordOff,COLUMN_TILE_W)] = src[gmemPos+IMUL(3,w)];
00416       //border[threadIdx.x+IMUL(bordOff,COLUMN_TILE_W)] = src[IMUL(ste-1+COLUMN_TILE_H-sy, w) + scs];
00417 
00418     // Ensure the completness of loading stage because results emitted
00419     // by each thread depend on the data loaded by other threads:
00420     __syncthreads();
00421 
00422     // Shared and global memory indices for current column
00423     smemPos = IMUL(sy, COLUMN_TILE_W) + threadIdx.x;
00424     gmemPos = IMUL(sts + sy, w) + scs;
00425   
00426 
00427     // Setup the offsets to get to the correct smem points in the arrays for both the data and the border
00428     int *dptr = data + smemPos;
00429     const int sw = COLUMN_TILE_W, sw2 = sw + sw, sw3 = sw2 + sw;
00430     const int nsw = -sw, nsw2 = nsw - sw, nsw3 = nsw2 - sw;
00431     const int bn3 = threadIdx.x, bn2 = bn3 + COLUMN_TILE_W, bn1 = bn2 + COLUMN_TILE_W;
00432     const int bp1 = bn1+COLUMN_TILE_W, bp2 = bp1 + COLUMN_TILE_W, bp3 = bp2 + COLUMN_TILE_W;
00433 
00434     // Are we in the top 3 rows of the whole image
00435     if(sts + sy < 3)
00436     {
00437       switch(sts+sy)
00438       {
00439       case 0:
00440         dst[gmemPos] =
00441           (dptr[0] * 72 +
00442            dptr[sw] * 56 +
00443            dptr[sw2] * 28 +
00444            dptr[sw3] *  8
00445            ) / 164;
00446         break;
00447       case 1:
00448         dst[gmemPos] =
00449           (dptr[0] * 72 +
00450            (dptr[nsw] + dptr[sw]) * 56 +
00451            dptr[sw2] * 28 +
00452            dptr[sw3] *  8
00453            ) / 220;
00454         break;
00455       case 2:
00456         dst[gmemPos] =
00457           (dptr[0] * 72 +
00458            (dptr[nsw] + dptr[sw]) * 56 +
00459            (dptr[nsw2] + dptr[sw2]) * 28 +
00460            dptr[sw3] *  8
00461            ) / 248;
00462         break;
00463       }
00464     }
00465     else if(sy < 3 && sts+sy<h-3) // If not top 3 in the whole image, are we in the top 3 rows of this tile
00466     {
00467       switch(sy)
00468       {
00469       case 0:
00470         dst[gmemPos] =
00471           (dptr[0] * 72 +
00472            (border[bn1] + dptr[sw]) * 56 +
00473            (border[bn2] + dptr[sw2]) * 28 +
00474            (border[bn3] + dptr[sw3]) *  8
00475            ) >> 8;
00476         break;
00477       case 1:
00478         dst[gmemPos] =
00479           (dptr[0] * 72 +
00480            (dptr[nsw] + dptr[sw]) * 56 +
00481            (border[bn1] + dptr[sw2]) * 28 +
00482            (border[bn2] + dptr[sw3]) *  8
00483            ) >> 8;
00484         break;
00485       case 2:
00486         dst[gmemPos] =
00487           (dptr[0] * 72 +
00488            (dptr[nsw] + dptr[sw]) * 56 +
00489            (dptr[nsw2] + dptr[sw2]) * 28 +
00490            (border[bn1] + dptr[sw3]) *  8
00491            ) >> 8;
00492         break;
00493       }      
00494     }
00495     else if(sy <COLUMN_TILE_H-3 && sts+sy<h-3)//(sy < COLUMN_TILE_H-4 && sts+sy<h-3) // Are we in the middle of the tile
00496     {
00497         dst[gmemPos] =
00498           ((dptr[nsw3] + dptr[sw3]) *  8 +
00499            (dptr[nsw2] + dptr[sw2]) * 28 +
00500            (dptr[nsw] + dptr[sw]) * 56 +
00501            dptr[0] * 72
00502            ) >> 8;
00503     }
00504     else if(sts + sy < h-3) // Are we not at the bottom of the image, but bottom 3 of the tile
00505     {
00506       switch(sy)
00507       {
00508       case COLUMN_TILE_H-3:
00509         dst[gmemPos] =
00510           (dptr[0] * 72 +
00511            (dptr[nsw] + dptr[sw]) * 56 +
00512            (dptr[nsw2] + dptr[sw2]) * 28 +
00513            (dptr[nsw3] + border[bp1]) *  8
00514            ) >> 8;
00515         break;
00516       case COLUMN_TILE_H-2:
00517         dst[gmemPos] =
00518           (dptr[0] * 72 +
00519            (dptr[nsw] + dptr[sw]) * 56 +
00520            (dptr[nsw2] + border[bp1]) * 28 +
00521            (dptr[nsw3] + border[bp2]) *  8
00522            ) >> 8;
00523         break;
00524       case COLUMN_TILE_H-1:
00525         dst[gmemPos] =
00526           (dptr[0] * 72 +
00527            (dptr[nsw] + border[bp1]) * 56 +
00528            (dptr[nsw2] + border[bp2]) * 28 +
00529            (dptr[nsw3] + border[bp3]) *  8
00530            ) >> 8;
00531         break;
00532       }
00533     }
00534     else // We must be at the bottom 3 of the image
00535     {
00536       switch(h-(sts+sy))
00537       {
00538       case 3:
00539         dst[gmemPos] =
00540           (dptr[0] * 72 +
00541            (dptr[nsw] + dptr[sw]) * 56 +
00542            (dptr[nsw2] + dptr[sw2]) * 28 +
00543            dptr[nsw3] *  8 
00544            ) / 248;
00545         break;
00546       case 2:
00547         dst[gmemPos] =
00548           (dptr[0] * 72 +
00549            (dptr[nsw] + dptr[sw]) * 56 +
00550            dptr[nsw2] * 28 +
00551            dptr[nsw3] *  8
00552            ) / 220;
00553         break;
00554       case 1:
00555         dst[gmemPos] =
00556           (dptr[0] * 72 +
00557            dptr[nsw] * 56 +
00558            dptr[nsw2] * 28 +
00559            dptr[nsw3] *  8
00560            ) / 164;
00561         break;
00562       }
00563     }   
00564   }
00565 }
00566 
00567 
00568 // #####################################################################
00569 void cuda_lowpass_9_x_fewbits_optim(const int* src,
00570                                      const unsigned int w,
00571                                      const unsigned int h,
00572                                      int* dst)
00573 {
00574   //ENV_ASSERT(w >= 9);
00575   dim3 blockGridRows(iDivUp(w, ROW_TILE_W), h);
00576   dim3 threadBlockRows(ROW_TILE_W);
00577   cudalowpass9x<<<blockGridRows, threadBlockRows>>>(src, w, h, dst);
00578 }
00579 
00580 void cuda_lowpass_9_y_fewbits_optim(const int* src,
00581                                      const unsigned int w,
00582                                      const unsigned int h,
00583                                      int* dst)
00584 {
00585   //ENV_ASSERT(h >= 9);
00586   dim3 blockGridColumns(iDivUp(w, COLUMN_TILE_W), iDivUp(h, COLUMN_TILE_H));
00587   dim3 threadBlockColumns(COLUMN_TILE_W, COLUMN_TILE_H);
00588 
00589   cudalowpass9y<<<blockGridColumns, threadBlockColumns>>>(src, w, h, dst, COLUMN_TILE_W*threadBlockColumns.y,
00590                                                           w*threadBlockColumns.y);
00591 }
00592 
00593 
00594 
00595 
00596 
00597 // ######################################################################
00598 /* So things look consistent in everyone's emacs... */
00599 /* Local Variables: */
00600 /* mode: c++ */
00601 /* indent-tabs-mode: nil */
00602 /* End: */