cuda_filterops.h

00001 /*!@file CUDA/cuda-filterops.h CUDA/GPU optimized filter operations code */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_filterops.h $
00035 // $Id: cuda_filterops.h 12962 2010-03-06 02:13:53Z irock $
00036 //
00037 
00038 
00039 #ifndef CUDA_FILTEROPS_H_DEFINED
00040 #define CUDA_FILTEROPS_H_DEFINED
00041 
00042 #include <cuda.h>
00043 #include "CUDA/cutil.h"
00044 #include "cudadefs.h"
00045 #include <float.h>
00046 
00047 
00048 __global__ void cuda_global_orientedFilter(const float *src, float *re, float *im, const float kx, const float ky, const float intensity, const int w, const int h, const int tile_width)
00049 {
00050   // Determine index
00051   const int x_idx = IMUL(blockIdx.x,tile_width) + threadIdx.x;
00052   const int y_idx = blockIdx.y;
00053   const int idx = IMUL(y_idx,w) + x_idx;
00054 
00055   const float val = src[idx] * intensity;
00056   // (x,y) = (0,0) at center of image:
00057   const int w2l = w >> 1, w2r = w - w2l;
00058   const int h2l = h >> 1, h2r = h - h2l;
00059 
00060   // Position in coordinate space if the center of the image is (0,0)
00061   //for (int j = -h2l; j < h2r; ++j)
00062   //  for (int i = -w2l; i < w2r; ++i)
00063   const int x_pos = x_idx-w2l;
00064   const int y_pos = y_idx-h2l;
00065   if(x_pos < w2r && y_pos < h2r)
00066   {
00067     const float arg = kx*x_pos + ky*y_pos;
00068     //const float arg = kx * i + ky * j;
00069     // NOTE: sin and cos are single cycle calls on GPU, no need for trig tables
00070     // Since we're using the same arg, sincosf() is what we need
00071     // If we want to be less accurate, we can use __sincosf() directly
00072     float sinarg, cosarg;
00073     sincosf(arg, &sinarg, &cosarg);
00074 
00075 
00076     re[idx] = val * cosarg;
00077     im[idx] = val * sinarg;
00078   }
00079 
00080 }
00081 
00082 
00083 __global__ void cuda_global_centerSurroundAbs(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width )
00084 {
00085   // compute abs(hires - lowres):
00086   const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
00087   const int y_pos = blockIdx.y;
00088   const int cidx = IMUL(y_pos,lw) + x_pos;
00089   // sidx = y_pos/scaley*sw + x_pos/scalex
00090   int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex;
00091   // This seems to lock the indices to one less than the last at the edges of the image...
00092   // don't understand justification, but it was in the cpu version of centerSurround()
00093   if(x_pos > remx) sidx--;
00094   if(y_pos > remy) sidx-=sw;
00095 
00096   if(x_pos < lw && y_pos < lh)
00097     {
00098       float cval = center[cidx];
00099       float sval = surround[sidx];
00100       if(cval > sval)
00101         {
00102           res[cidx] = cval - sval;
00103         }
00104       else
00105         {
00106           res[cidx] = sval - cval;
00107         }
00108     }
00109 }
00110 
00111 __global__ void cuda_global_centerSurroundClamped(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width )
00112 {
00113   // compute hires - lowres, clamped to 0:
00114   const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
00115   const int y_pos = blockIdx.y;
00116   const int cidx = IMUL(y_pos,lw) + x_pos;
00117   // sidx = y_pos/scaley*sw + x_pos/scalex
00118   int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex;
00119   // This seems to lock the indices to one less than the last at the edges of the image...
00120   // don't understand justification, but it was in the cpu version of centerSurround()
00121   if(x_pos > remx) sidx--;
00122   if(y_pos > remy) sidx-=sw;
00123 
00124   if(x_pos < lw && y_pos < lh)
00125     {
00126       float cval = center[cidx];
00127       float sval = surround[sidx];
00128       if(cval > sval)
00129         {
00130           res[cidx] = cval - sval;
00131         }
00132       else
00133         {
00134           res[cidx] = 0;
00135         }
00136     }
00137 }
00138 
00139 __global__ void cuda_global_centerSurroundDirectional(const float *center, const float *surround, float *pos, float *neg, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width )
00140 {
00141   // compute abs(hires - lowres):
00142   const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
00143   const int y_pos = blockIdx.y;
00144   const int cidx = IMUL(y_pos,lw) + x_pos;
00145   // sidx = y_pos/scaley*sw + x_pos/scalex
00146   int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex;
00147   // This seems to lock the indices to one less than the last at the edges of the image...
00148   // don't understand justification, but it was in the cpu version of centerSurround()
00149   if(x_pos > remx) sidx--;
00150   if(y_pos > remy) sidx-=sw;
00151 
00152   if(x_pos < lw && y_pos < lh)
00153     {
00154       float cval = center[cidx];
00155       float sval = surround[sidx];
00156       if(cval > sval)
00157         {
00158           pos[cidx] = cval - sval;
00159           neg[cidx] = 0.0F;
00160         }
00161       else
00162         {
00163           pos[cidx] = 0.0F;
00164           neg[cidx] = sval - cval;
00165         }
00166     }
00167 }
00168 
00169 
00170 
00171 __global__ void cuda_global_centerSurroundAbsAttenuate(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int borderSize, int scalex, int scaley, int remx, int remy, int tile_width, int tile_height)
00172 {
00173   // compute abs(hires - lowres):
00174   const float increment = 1.0 / (float)(borderSize + 1);
00175   const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
00176   const int y_pos = blockIdx.y;
00177   const int cidx = IMUL(y_pos,lw) + x_pos;
00178   float result;
00179   // sidx = y_pos/scaley*sw + x_pos/scalex
00180   int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex;
00181   // This seems to lock the indices to one less than the last at the edges of the image...
00182   // don't understand justification, but it was in the cpu version of centerSurround()
00183   if(x_pos > remx) sidx--;
00184   if(y_pos > remy) sidx-=sw;
00185 
00186   if(x_pos < lw && y_pos < lh)
00187     {
00188       // Perform the center/surround absolute calculation
00189       float cval = center[cidx];
00190       float sval = surround[sidx];
00191       if(cval > sval)
00192         {
00193           result = cval - sval;
00194         }
00195       else
00196         {
00197           result = sval - cval;
00198         }
00199       // Perform the attenuate border calculation
00200       // Y Border: In the top lines of the border
00201       if(y_pos < borderSize)
00202         {
00203           float coeff = increment*(y_pos+1);
00204           result *= coeff;
00205         }
00206       // Y Border: In the bottom lines of the border
00207       else if(y_pos > lh-borderSize-1)
00208         {
00209           float coeff = increment*(borderSize-lh);
00210           result *= coeff;
00211         }
00212       // X Border: In the left lines of the border
00213       if(x_pos < borderSize)
00214         {
00215           float coeff = increment*(x_pos+1);
00216           result *= coeff;
00217         }
00218       // Y Border: In the right lines of the border
00219       else if(x_pos < lw-borderSize-1)
00220         {
00221           float coeff = increment*(borderSize-lw);
00222           result *= coeff;
00223         }
00224     }
00225 
00226 }
00227 
00228 
00229 __global__ void cuda_global_spatialPoolMax(const float *src, float *res, const int src_w, const int src_h, int skip_w, int skip_h, int reg_w, int reg_h, int tile_width, int tile_height)
00230 {
00231   // Determine which region you are working on, and which tile within that region
00232   const int tilesperregion_w = IDIVUP(reg_w,tile_width);
00233   const int tilesperregion_h = IDIVUP(reg_h,tile_height);
00234   // Find out which region we are in
00235   const int reg_x = blockIdx.x / tilesperregion_w;
00236   const int reg_y = blockIdx.y / tilesperregion_h;
00237   // Within the region, which tile are we
00238   const int tile_x = blockIdx.x % tilesperregion_w;
00239   const int tile_y = blockIdx.y % tilesperregion_h;
00240   // What are the src x and y positions for this thread
00241   const int reg_x_pos = IMUL(tile_x,tile_width) + threadIdx.x;
00242   const int reg_y_pos = IMUL(tile_y,tile_height) + threadIdx.y;
00243   const int x_pos = IMUL(reg_x, skip_w) + reg_x_pos;
00244   const int y_pos = IMUL(reg_y, skip_h) + reg_y_pos;
00245   const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x;
00246   const int src_idx = IMUL(y_pos,src_w)+x_pos;
00247   const int src_sz = IMUL(src_w,src_h);
00248 
00249   // Load the spatial pool into shared memory
00250   float *data = (float *) shared_data; // size of tile_width*tile_height
00251   const int tile_sz = IMUL(tile_height,tile_width);
00252 
00253   // Make sure we are in bounds of the region and in bounds of the image
00254   if(y_pos < src_h && x_pos < src_w && reg_x_pos < reg_w && reg_y_pos < reg_h)
00255     data[ld_idx] = src[src_idx];
00256   else
00257     data[ld_idx] = -FLT_MAX;
00258 
00259   __syncthreads();
00260 
00261   if(y_pos < src_h && x_pos < src_w)
00262   {
00263     // Come up with a per block max
00264     int incr = 1;
00265     int mod = 2;
00266     while(incr < tile_sz)
00267     {
00268       if(ld_idx % mod == 0 && ld_idx+incr < tile_sz)
00269       {
00270         // Max between neighbors
00271         if(data[ld_idx] < data[ld_idx+incr])
00272           data[ld_idx] = data[ld_idx+incr];
00273       }
00274       __syncthreads();
00275 
00276       incr *= 2;
00277       mod *= 2;
00278     }
00279   }
00280   // Have the first thread load the output, should be done regardless of whether we are in the actual image or not
00281   if(ld_idx == 0)
00282     {
00283       const int res_idx = IMUL(blockIdx.y,gridDim.x)+blockIdx.x;
00284       res[res_idx] = data[ld_idx];
00285     }
00286 
00287 }
00288 
00289 
00290 
00291 #endif