00001 /*!@file CUDA/cuda-kernels.h CUDA/GPU convolution kernel generation code */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_cutpaste.h $ 00035 // $Id: cuda_cutpaste.h 13228 2010-04-15 01:49:10Z itti $ 00036 // 00037 00038 00039 #ifndef CUDA_CUTPASTE_H_DEFINED 00040 #define CUDA_CUTPASTE_H_DEFINED 00041 00042 #include <cuda.h> 00043 #include "CUDA/cutil.h" 00044 #include "cudadefs.h" 00045 00046 __global__ void cuda_global_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height) 00047 { 00048 int resw = endx-startx; 00049 int resh = endy-starty; 00050 int dy = IMUL(blockIdx.y,tile_height)+threadIdx.y; 00051 int dx = IMUL(blockIdx.x,tile_width)+threadIdx.x; 00052 int res_idx = IMUL(dy,resw) + dx; 00053 int src_idx = IMUL(starty+dy,srcw) + startx+dx; 00054 00055 if(dx < resw && dy < resh && startx+dx < endx && starty+dy < endy) 00056 { 00057 if(startx+dx < maxx && starty+dy < maxy) 00058 { 00059 res[res_idx] = src[src_idx]; 00060 } 00061 else 00062 { 00063 res[res_idx] = 0.0F; 00064 } 00065 } 00066 } 00067 00068 __global__ void cuda_global_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height) 00069 { 00070 // Save bottom row and right column for the border 00071 float *data = (float *) shared_data; //tile_width * tile_height size 00072 float *borderY = (float *) &data[tile_width*tile_height]; // size of (tile_height) 00073 float *borderX = (float *) &data[tile_width*tile_height+tile_height]; // size of (tile_width+1) 00074 const int sy = threadIdx.y; // source pixel column within source tile 00075 const int sx = threadIdx.x; // source pixel row within source tile 00076 const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows 00077 //const int ste = sts + tile_height; // tile end for source, in rows 00078 00079 00080 // Current column index 00081 const int scs = IMUL(blockIdx.x, tile_width) + sx; 00082 00083 int smemPos = IMUL(sy, tile_width) + sx; 00084 int gmemPos = IMUL(sts + sy, w) + scs; 00085 const int ypos=sts+sy, xpos=scs; 00086 float val=0.0F; 00087 00088 // prepare a couple of variable for the x direction 00089 int xt = (int)floor(deltax); 00090 float xfrac = deltax - xt; 00091 int startx = MAX(0,xt); 00092 int endx = MIN(0,xt) + w; 00093 if (fabs(xfrac) < 1.0e-10F) xfrac = 0.0F; 00094 else endx--; 00095 00096 // prepare a couple of variable for the y direction 00097 int yt = (int)floor(deltay); 00098 float yfrac = deltay - yt; 00099 int starty = MAX(0,yt); 00100 int endy = MIN(0,yt) + h; 00101 if (fabs(yfrac) < 1.0e-10F) yfrac = 0.0F; 00102 else endy--; 00103 00104 // dispatch to faster shiftClean() if displacements are roughly integer: 00105 //if (fabs(xfrac) < 1.0e-10 && fabs(yfrac) < 1.0e-10) 00106 // return shiftClean(srcImg, xt, yt); 00107 00108 if (xfrac > 0.0) 00109 { 00110 xfrac = 1.0 - xfrac; 00111 xt++; 00112 } 00113 00114 if (yfrac > 0.0) 00115 { 00116 yfrac = 1.0 - yfrac; 00117 yt++; 00118 } 00119 00120 // prepare the coefficients 00121 float tl = (1.0F - xfrac) * (1.0F - yfrac); 00122 float tr = xfrac * (1.0F - yfrac); 00123 float bl = (1.0F - xfrac) * yfrac; 00124 float br = xfrac * yfrac; 00125 00126 // only process columns that are actually within image bounds: 00127 if (xpos < w && ypos < h) { 00128 // Shared and global (source) memory indices for current column 00129 00130 // Load data 00131 data[smemPos] = src[gmemPos]; 00132 // Bottom of a tile, not of the image 00133 bool bot = (sy == tile_height-1 && ypos+1 < h); 00134 // Bottom of the image 00135 bool tbot = (ypos+1 >= h); 00136 // Right of a tile, not of the image 00137 bool rig = (sx == tile_width-1 && xpos+1 < w); 00138 // Right of the image 00139 bool trig = (xpos+1 >= w); 00140 // Load Y border 00141 if(rig) 00142 borderY[threadIdx.y] = src[gmemPos + 1]; 00143 00144 // Load X border 00145 if(bot) 00146 borderX[threadIdx.x] = src[gmemPos+w]; 00147 00148 // Load corner 00149 if(bot && rig) 00150 borderX[tile_width] = src[gmemPos+w+1]; 00151 00152 // Ensure the completness of loading stage because results emitted 00153 // by each thread depend on the data loaded by other threads: 00154 __syncthreads(); 00155 00156 int dx = xpos+xt; 00157 int dy = ypos+yt; 00158 00159 val=0; 00160 if(dy >= starty && dy < endy && dx >= startx && dx < endx) 00161 { 00162 val += data[smemPos]*tl; 00163 if(bot) 00164 { 00165 if (rig) 00166 { 00167 val += borderY[threadIdx.y]*tr; 00168 val += borderX[threadIdx.x]*bl; 00169 val += borderX[tile_width]*br; 00170 } 00171 else if(trig) 00172 { 00173 val += data[smemPos]*tr; // Duplicate value 00174 val += borderX[threadIdx.x]*bl; 00175 val += borderX[threadIdx.x]*br; // Duplicate value 00176 } 00177 else // Not at the right 00178 { 00179 val += data[smemPos+1]*tr; 00180 val += borderX[threadIdx.x]*bl; 00181 val += borderX[threadIdx.x+1]*br; 00182 } 00183 } 00184 else if(tbot) 00185 { 00186 if (rig) 00187 { 00188 val += borderY[threadIdx.y]*tr; 00189 val += data[smemPos]*bl; // Duplicate value 00190 val += borderY[threadIdx.y]*br; // Duplicate value 00191 } 00192 else if(trig) 00193 { 00194 // Nothing to add 00195 val += data[smemPos]*tr; // Duplicate value 00196 val += data[smemPos]*bl; // Duplicate value 00197 val += data[smemPos]*br; // Duplicate value 00198 } 00199 else // Not at the right 00200 { 00201 val += data[smemPos+1]*tr; 00202 val += data[smemPos]*bl; // Duplicate value 00203 val += data[smemPos+1]*br; // Duplicate value 00204 } 00205 } 00206 else // Not at the bottom 00207 { 00208 if (rig) 00209 { 00210 val += borderY[threadIdx.y]*tr; 00211 val += data[smemPos+tile_width]*bl; 00212 val += borderY[threadIdx.y+1]*br; 00213 } 00214 else if(trig) 00215 { 00216 val += data[smemPos]*tr; // Duplicate value 00217 val += data[smemPos+tile_width]*bl; 00218 val += data[smemPos+tile_width]*br; // Duplicate value 00219 } 00220 else // Not at the right 00221 { 00222 val += data[smemPos+1]*tr; 00223 val += data[smemPos+tile_width]*bl; 00224 val += data[smemPos+tile_width+1]*br; 00225 } 00226 } 00227 } 00228 00229 00230 // Determine new memory location 00231 int dmemPos = IMUL(dy,w) + dx; 00232 if(dx >= 0 && dx < w && dy >= 0 && dy < h) 00233 { 00234 dst[dmemPos] = val; 00235 } 00236 } 00237 } 00238 00239 00240 __global__ void cuda_global_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00241 { 00242 int sx = threadIdx.x; 00243 int stsx = blockIdx.x*tile_width; 00244 int xpos = stsx + sx; 00245 int sy = threadIdx.y; 00246 int stsy = blockIdx.y*tile_height; 00247 int ypos = stsy + sy; 00248 int didx = (ypos+dy)*w + xpos+dx; 00249 int iidx = ypos*iw + xpos; 00250 if(xpos < iw && ypos < ih) 00251 { 00252 dst[didx] = img[iidx]; 00253 } 00254 } 00255 00256 00257 __global__ void cuda_global_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00258 { 00259 int sx = threadIdx.x; 00260 int stsx = blockIdx.x*tile_width; 00261 int xpos = stsx + sx; 00262 int sy = threadIdx.y; 00263 int stsy = blockIdx.y*tile_height; 00264 int ypos = stsy + sy; 00265 int didx = (ypos+dy)*w + xpos+dx; 00266 int iidx = ypos*iw + xpos; 00267 if(xpos < iw && ypos < ih) 00268 { 00269 dst[didx].p[0] = img[iidx].p[0]; 00270 dst[didx].p[1] = img[iidx].p[1]; 00271 dst[didx].p[2] = img[iidx].p[2]; 00272 } 00273 } 00274 00275 __global__ void cuda_global_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00276 { 00277 int sx = threadIdx.x; 00278 int stsx = blockIdx.x*tile_width; 00279 int xpos = stsx + sx; 00280 int sy = threadIdx.y; 00281 int stsy = blockIdx.y*tile_height; 00282 int ypos = stsy + sy; 00283 int didx = (ypos+dy)*w + xpos+dx; 00284 int iidx = ypos*iw + xpos; 00285 if(xpos < iw && ypos < ih) 00286 { 00287 float val = img[iidx]; 00288 if(val > 0.0F) 00289 dst[didx] = val; 00290 } 00291 } 00292 00293 00294 __global__ void cuda_global_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00295 { 00296 int sx = threadIdx.x; 00297 int stsx = blockIdx.x*tile_width; 00298 int xpos = stsx + sx; 00299 int sy = threadIdx.y; 00300 int stsy = blockIdx.y*tile_height; 00301 int ypos = stsy + sy; 00302 int didx = (ypos+dy)*w + xpos+dx; 00303 int iidx = ypos*iw + xpos; 00304 if(xpos < iw && ypos < ih) 00305 { 00306 float p0,p1,p2; 00307 p0 = img[iidx].p[0]; 00308 p1 = img[iidx].p[1]; 00309 p2 = img[iidx].p[2]; 00310 if(p0+p1+p2 > 0.0F) 00311 { 00312 dst[didx].p[0] = p0; 00313 dst[didx].p[1] = p1; 00314 dst[didx].p[2] = p2; 00315 } 00316 } 00317 } 00318 00319 00320 #endif