00001 /*!@file CUDA/cuda-filterops.h CUDA/GPU optimized filter operations code */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_filterops.h $ 00035 // $Id: cuda_filterops.h 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 00039 #ifndef CUDA_FILTEROPS_H_DEFINED 00040 #define CUDA_FILTEROPS_H_DEFINED 00041 00042 #include <cuda.h> 00043 #include "CUDA/cutil.h" 00044 #include "cudadefs.h" 00045 #include <float.h> 00046 00047 00048 __global__ void cuda_global_orientedFilter(const float *src, float *re, float *im, const float kx, const float ky, const float intensity, const int w, const int h, const int tile_width) 00049 { 00050 // Determine index 00051 const int x_idx = IMUL(blockIdx.x,tile_width) + threadIdx.x; 00052 const int y_idx = blockIdx.y; 00053 const int idx = IMUL(y_idx,w) + x_idx; 00054 00055 const float val = src[idx] * intensity; 00056 // (x,y) = (0,0) at center of image: 00057 const int w2l = w >> 1, w2r = w - w2l; 00058 const int h2l = h >> 1, h2r = h - h2l; 00059 00060 // Position in coordinate space if the center of the image is (0,0) 00061 //for (int j = -h2l; j < h2r; ++j) 00062 // for (int i = -w2l; i < w2r; ++i) 00063 const int x_pos = x_idx-w2l; 00064 const int y_pos = y_idx-h2l; 00065 if(x_pos < w2r && y_pos < h2r) 00066 { 00067 const float arg = kx*x_pos + ky*y_pos; 00068 //const float arg = kx * i + ky * j; 00069 // NOTE: sin and cos are single cycle calls on GPU, no need for trig tables 00070 // Since we're using the same arg, sincosf() is what we need 00071 // If we want to be less accurate, we can use __sincosf() directly 00072 float sinarg, cosarg; 00073 sincosf(arg, &sinarg, &cosarg); 00074 00075 00076 re[idx] = val * cosarg; 00077 im[idx] = val * sinarg; 00078 } 00079 00080 } 00081 00082 00083 __global__ void cuda_global_centerSurroundAbs(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width ) 00084 { 00085 // compute abs(hires - lowres): 00086 const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x; 00087 const int y_pos = blockIdx.y; 00088 const int cidx = IMUL(y_pos,lw) + x_pos; 00089 // sidx = y_pos/scaley*sw + x_pos/scalex 00090 int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex; 00091 // This seems to lock the indices to one less than the last at the edges of the image... 00092 // don't understand justification, but it was in the cpu version of centerSurround() 00093 if(x_pos > remx) sidx--; 00094 if(y_pos > remy) sidx-=sw; 00095 00096 if(x_pos < lw && y_pos < lh) 00097 { 00098 float cval = center[cidx]; 00099 float sval = surround[sidx]; 00100 if(cval > sval) 00101 { 00102 res[cidx] = cval - sval; 00103 } 00104 else 00105 { 00106 res[cidx] = sval - cval; 00107 } 00108 } 00109 } 00110 00111 __global__ void cuda_global_centerSurroundClamped(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width ) 00112 { 00113 // compute hires - lowres, clamped to 0: 00114 const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x; 00115 const int y_pos = blockIdx.y; 00116 const int cidx = IMUL(y_pos,lw) + x_pos; 00117 // sidx = y_pos/scaley*sw + x_pos/scalex 00118 int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex; 00119 // This seems to lock the indices to one less than the last at the edges of the image... 00120 // don't understand justification, but it was in the cpu version of centerSurround() 00121 if(x_pos > remx) sidx--; 00122 if(y_pos > remy) sidx-=sw; 00123 00124 if(x_pos < lw && y_pos < lh) 00125 { 00126 float cval = center[cidx]; 00127 float sval = surround[sidx]; 00128 if(cval > sval) 00129 { 00130 res[cidx] = cval - sval; 00131 } 00132 else 00133 { 00134 res[cidx] = 0; 00135 } 00136 } 00137 } 00138 00139 __global__ void cuda_global_centerSurroundDirectional(const float *center, const float *surround, float *pos, float *neg, int lw, int lh, int sw, int sh, int scalex, int scaley, int remx, int remy, int tile_width ) 00140 { 00141 // compute abs(hires - lowres): 00142 const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x; 00143 const int y_pos = blockIdx.y; 00144 const int cidx = IMUL(y_pos,lw) + x_pos; 00145 // sidx = y_pos/scaley*sw + x_pos/scalex 00146 int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex; 00147 // This seems to lock the indices to one less than the last at the edges of the image... 00148 // don't understand justification, but it was in the cpu version of centerSurround() 00149 if(x_pos > remx) sidx--; 00150 if(y_pos > remy) sidx-=sw; 00151 00152 if(x_pos < lw && y_pos < lh) 00153 { 00154 float cval = center[cidx]; 00155 float sval = surround[sidx]; 00156 if(cval > sval) 00157 { 00158 pos[cidx] = cval - sval; 00159 neg[cidx] = 0.0F; 00160 } 00161 else 00162 { 00163 pos[cidx] = 0.0F; 00164 neg[cidx] = sval - cval; 00165 } 00166 } 00167 } 00168 00169 00170 00171 __global__ void cuda_global_centerSurroundAbsAttenuate(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int borderSize, int scalex, int scaley, int remx, int remy, int tile_width, int tile_height) 00172 { 00173 // compute abs(hires - lowres): 00174 const float increment = 1.0 / (float)(borderSize + 1); 00175 const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x; 00176 const int y_pos = blockIdx.y; 00177 const int cidx = IMUL(y_pos,lw) + x_pos; 00178 float result; 00179 // sidx = y_pos/scaley*sw + x_pos/scalex 00180 int sidx = IMUL(y_pos/scaley,sw) + x_pos/scalex; 00181 // This seems to lock the indices to one less than the last at the edges of the image... 00182 // don't understand justification, but it was in the cpu version of centerSurround() 00183 if(x_pos > remx) sidx--; 00184 if(y_pos > remy) sidx-=sw; 00185 00186 if(x_pos < lw && y_pos < lh) 00187 { 00188 // Perform the center/surround absolute calculation 00189 float cval = center[cidx]; 00190 float sval = surround[sidx]; 00191 if(cval > sval) 00192 { 00193 result = cval - sval; 00194 } 00195 else 00196 { 00197 result = sval - cval; 00198 } 00199 // Perform the attenuate border calculation 00200 // Y Border: In the top lines of the border 00201 if(y_pos < borderSize) 00202 { 00203 float coeff = increment*(y_pos+1); 00204 result *= coeff; 00205 } 00206 // Y Border: In the bottom lines of the border 00207 else if(y_pos > lh-borderSize-1) 00208 { 00209 float coeff = increment*(borderSize-lh); 00210 result *= coeff; 00211 } 00212 // X Border: In the left lines of the border 00213 if(x_pos < borderSize) 00214 { 00215 float coeff = increment*(x_pos+1); 00216 result *= coeff; 00217 } 00218 // Y Border: In the right lines of the border 00219 else if(x_pos < lw-borderSize-1) 00220 { 00221 float coeff = increment*(borderSize-lw); 00222 result *= coeff; 00223 } 00224 } 00225 00226 } 00227 00228 00229 __global__ void cuda_global_spatialPoolMax(const float *src, float *res, const int src_w, const int src_h, int skip_w, int skip_h, int reg_w, int reg_h, int tile_width, int tile_height) 00230 { 00231 // Determine which region you are working on, and which tile within that region 00232 const int tilesperregion_w = IDIVUP(reg_w,tile_width); 00233 const int tilesperregion_h = IDIVUP(reg_h,tile_height); 00234 // Find out which region we are in 00235 const int reg_x = blockIdx.x / tilesperregion_w; 00236 const int reg_y = blockIdx.y / tilesperregion_h; 00237 // Within the region, which tile are we 00238 const int tile_x = blockIdx.x % tilesperregion_w; 00239 const int tile_y = blockIdx.y % tilesperregion_h; 00240 // What are the src x and y positions for this thread 00241 const int reg_x_pos = IMUL(tile_x,tile_width) + threadIdx.x; 00242 const int reg_y_pos = IMUL(tile_y,tile_height) + threadIdx.y; 00243 const int x_pos = IMUL(reg_x, skip_w) + reg_x_pos; 00244 const int y_pos = IMUL(reg_y, skip_h) + reg_y_pos; 00245 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x; 00246 const int src_idx = IMUL(y_pos,src_w)+x_pos; 00247 const int src_sz = IMUL(src_w,src_h); 00248 00249 // Load the spatial pool into shared memory 00250 float *data = (float *) shared_data; // size of tile_width*tile_height 00251 const int tile_sz = IMUL(tile_height,tile_width); 00252 00253 // Make sure we are in bounds of the region and in bounds of the image 00254 if(y_pos < src_h && x_pos < src_w && reg_x_pos < reg_w && reg_y_pos < reg_h) 00255 data[ld_idx] = src[src_idx]; 00256 else 00257 data[ld_idx] = -FLT_MAX; 00258 00259 __syncthreads(); 00260 00261 if(y_pos < src_h && x_pos < src_w) 00262 { 00263 // Come up with a per block max 00264 int incr = 1; 00265 int mod = 2; 00266 while(incr < tile_sz) 00267 { 00268 if(ld_idx % mod == 0 && ld_idx+incr < tile_sz) 00269 { 00270 // Max between neighbors 00271 if(data[ld_idx] < data[ld_idx+incr]) 00272 data[ld_idx] = data[ld_idx+incr]; 00273 } 00274 __syncthreads(); 00275 00276 incr *= 2; 00277 mod *= 2; 00278 } 00279 } 00280 // Have the first thread load the output, should be done regardless of whether we are in the actual image or not 00281 if(ld_idx == 0) 00282 { 00283 const int res_idx = IMUL(blockIdx.y,gridDim.x)+blockIdx.x; 00284 res[res_idx] = data[ld_idx]; 00285 } 00286 00287 } 00288 00289 00290 00291 #endif