cuda_cutpaste.h

00001 /*!@file CUDA/cuda-kernels.h CUDA/GPU convolution kernel generation code  */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_cutpaste.h $
00035 // $Id: cuda_cutpaste.h 13228 2010-04-15 01:49:10Z itti $
00036 //
00037 
00038 
00039 #ifndef CUDA_CUTPASTE_H_DEFINED
00040 #define CUDA_CUTPASTE_H_DEFINED
00041 
00042 #include <cuda.h>
00043 #include "CUDA/cutil.h"
00044 #include "cudadefs.h"
00045 
00046 __global__ void cuda_global_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height)
00047 {
00048   int resw = endx-startx;
00049   int resh = endy-starty;
00050   int dy = IMUL(blockIdx.y,tile_height)+threadIdx.y;
00051   int dx = IMUL(blockIdx.x,tile_width)+threadIdx.x;
00052   int res_idx = IMUL(dy,resw) + dx;
00053   int src_idx = IMUL(starty+dy,srcw) + startx+dx;
00054 
00055   if(dx < resw && dy < resh && startx+dx < endx && starty+dy < endy)
00056     {
00057       if(startx+dx < maxx && starty+dy < maxy)
00058         {
00059           res[res_idx] = src[src_idx];
00060         }
00061       else
00062         {
00063           res[res_idx] = 0.0F;
00064         }
00065     }
00066 }
00067 
00068 __global__ void cuda_global_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height)
00069 {
00070   // Save bottom row and right column for the border
00071   float *data = (float *) shared_data; //tile_width * tile_height size
00072   float *borderY = (float *) &data[tile_width*tile_height]; // size of (tile_height)
00073   float *borderX = (float *) &data[tile_width*tile_height+tile_height]; // size of (tile_width+1)
00074   const int sy = threadIdx.y; // source pixel column within source tile
00075   const int sx = threadIdx.x; // source pixel row within source tile
00076   const int sts = IMUL(blockIdx.y, tile_height); // tile start for source, in rows
00077   //const int ste = sts + tile_height; // tile end for source, in rows
00078 
00079 
00080   // Current column index
00081   const int scs = IMUL(blockIdx.x, tile_width) + sx;
00082 
00083   int smemPos = IMUL(sy, tile_width) + sx;
00084   int gmemPos = IMUL(sts + sy, w) + scs;
00085   const int ypos=sts+sy, xpos=scs;
00086   float val=0.0F;
00087 
00088   // prepare a couple of variable for the x direction
00089   int xt = (int)floor(deltax);
00090   float xfrac = deltax - xt;
00091   int startx = MAX(0,xt);
00092   int endx = MIN(0,xt) + w;
00093   if (fabs(xfrac) < 1.0e-10F) xfrac = 0.0F;
00094   else endx--;
00095 
00096   // prepare a couple of variable for the y direction
00097   int yt = (int)floor(deltay);
00098   float yfrac = deltay - yt;
00099   int starty = MAX(0,yt);
00100   int endy = MIN(0,yt) + h;
00101   if (fabs(yfrac) < 1.0e-10F) yfrac = 0.0F;
00102   else endy--;
00103 
00104   // dispatch to faster shiftClean() if displacements are roughly integer:
00105   //if (fabs(xfrac) < 1.0e-10 && fabs(yfrac) < 1.0e-10)
00106   //  return shiftClean(srcImg, xt, yt);
00107 
00108   if (xfrac > 0.0)
00109     {
00110       xfrac = 1.0 - xfrac;
00111       xt++;
00112     }
00113 
00114   if (yfrac > 0.0)
00115     {
00116       yfrac = 1.0 - yfrac;
00117       yt++;
00118     }
00119 
00120   // prepare the coefficients
00121   float tl = (1.0F - xfrac) * (1.0F - yfrac);
00122   float tr = xfrac * (1.0F - yfrac);
00123   float bl = (1.0F - xfrac) * yfrac;
00124   float br = xfrac * yfrac;
00125 
00126   // only process columns that are actually within image bounds:
00127   if (xpos < w && ypos < h) {
00128     // Shared and global (source) memory indices for current column
00129 
00130     // Load data
00131     data[smemPos] = src[gmemPos];
00132     // Bottom of a tile, not of the image
00133     bool bot = (sy == tile_height-1 && ypos+1 < h);
00134     // Bottom of the image
00135     bool tbot = (ypos+1 >= h);
00136     // Right of a tile, not of the image
00137     bool rig = (sx == tile_width-1 && xpos+1 < w);
00138     // Right of the image
00139     bool trig = (xpos+1 >= w);
00140     // Load Y border
00141     if(rig)
00142       borderY[threadIdx.y] = src[gmemPos + 1];
00143 
00144     // Load X border
00145     if(bot)
00146       borderX[threadIdx.x] = src[gmemPos+w];
00147 
00148     // Load corner
00149     if(bot && rig)
00150       borderX[tile_width] = src[gmemPos+w+1];
00151 
00152     // Ensure the completness of loading stage because results emitted
00153     // by each thread depend on the data loaded by other threads:
00154     __syncthreads();
00155 
00156     int dx = xpos+xt;
00157     int dy = ypos+yt;
00158 
00159     val=0;
00160     if(dy >= starty && dy < endy && dx >= startx && dx < endx)
00161       {
00162         val += data[smemPos]*tl;
00163         if(bot)
00164           {
00165             if (rig)
00166               {
00167                 val += borderY[threadIdx.y]*tr;
00168                 val += borderX[threadIdx.x]*bl;
00169                 val += borderX[tile_width]*br;
00170               }
00171             else if(trig)
00172               {
00173                 val += data[smemPos]*tr; // Duplicate value
00174                 val += borderX[threadIdx.x]*bl;
00175                 val += borderX[threadIdx.x]*br; // Duplicate value
00176               }
00177             else // Not at the right
00178               {
00179                 val += data[smemPos+1]*tr;
00180                 val += borderX[threadIdx.x]*bl;
00181                 val += borderX[threadIdx.x+1]*br;
00182               }
00183           }
00184         else if(tbot)
00185           {
00186             if (rig)
00187               {
00188                 val += borderY[threadIdx.y]*tr;
00189                 val += data[smemPos]*bl; // Duplicate value
00190                 val += borderY[threadIdx.y]*br; // Duplicate value
00191               }
00192             else if(trig)
00193               {
00194                 // Nothing to add
00195                 val += data[smemPos]*tr; // Duplicate value
00196                 val += data[smemPos]*bl; // Duplicate value
00197                 val += data[smemPos]*br; // Duplicate value
00198               }
00199             else // Not at the right
00200               {
00201                 val += data[smemPos+1]*tr;
00202                 val += data[smemPos]*bl; // Duplicate value
00203                 val += data[smemPos+1]*br; // Duplicate value
00204               }
00205           }
00206         else // Not at the bottom
00207           {
00208             if (rig)
00209               {
00210                 val += borderY[threadIdx.y]*tr;
00211                 val += data[smemPos+tile_width]*bl;
00212                 val += borderY[threadIdx.y+1]*br;
00213               }
00214             else if(trig)
00215               {
00216                 val += data[smemPos]*tr; // Duplicate value
00217                 val += data[smemPos+tile_width]*bl;
00218                 val += data[smemPos+tile_width]*br; // Duplicate value
00219               }
00220             else // Not at the right
00221               {
00222                 val += data[smemPos+1]*tr;
00223                 val += data[smemPos+tile_width]*bl;
00224                 val += data[smemPos+tile_width+1]*br;
00225               }
00226           }
00227       }
00228 
00229 
00230     // Determine new memory location
00231     int dmemPos = IMUL(dy,w) + dx;
00232     if(dx >= 0 && dx < w && dy >= 0 && dy < h)
00233       {
00234         dst[dmemPos] = val;
00235       }
00236   }
00237 }
00238 
00239 
00240 __global__ void cuda_global_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00241 {
00242   int sx = threadIdx.x;
00243   int stsx = blockIdx.x*tile_width;
00244   int xpos = stsx + sx;
00245   int sy = threadIdx.y;
00246   int stsy = blockIdx.y*tile_height;
00247   int ypos = stsy + sy;
00248   int didx = (ypos+dy)*w + xpos+dx;
00249   int iidx = ypos*iw + xpos;
00250   if(xpos < iw && ypos < ih)
00251     {
00252       dst[didx] = img[iidx];
00253     }
00254 }
00255 
00256 
00257 __global__ void cuda_global_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00258 {
00259   int sx = threadIdx.x;
00260   int stsx = blockIdx.x*tile_width;
00261   int xpos = stsx + sx;
00262   int sy = threadIdx.y;
00263   int stsy = blockIdx.y*tile_height;
00264   int ypos = stsy + sy;
00265   int didx = (ypos+dy)*w + xpos+dx;
00266   int iidx = ypos*iw + xpos;
00267   if(xpos < iw && ypos < ih)
00268     {
00269       dst[didx].p[0] = img[iidx].p[0];
00270       dst[didx].p[1] = img[iidx].p[1];
00271       dst[didx].p[2] = img[iidx].p[2];
00272     }
00273 }
00274 
00275 __global__ void cuda_global_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00276 {
00277   int sx = threadIdx.x;
00278   int stsx = blockIdx.x*tile_width;
00279   int xpos = stsx + sx;
00280   int sy = threadIdx.y;
00281   int stsy = blockIdx.y*tile_height;
00282   int ypos = stsy + sy;
00283   int didx = (ypos+dy)*w + xpos+dx;
00284   int iidx = ypos*iw + xpos;
00285   if(xpos < iw && ypos < ih)
00286     {
00287       float val = img[iidx];
00288       if(val > 0.0F)
00289         dst[didx] = val;
00290     }
00291 }
00292 
00293 
00294 __global__ void cuda_global_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00295 {
00296   int sx = threadIdx.x;
00297   int stsx = blockIdx.x*tile_width;
00298   int xpos = stsx + sx;
00299   int sy = threadIdx.y;
00300   int stsy = blockIdx.y*tile_height;
00301   int ypos = stsy + sy;
00302   int didx = (ypos+dy)*w + xpos+dx;
00303   int iidx = ypos*iw + xpos;
00304   if(xpos < iw && ypos < ih)
00305     {
00306       float p0,p1,p2;
00307       p0 = img[iidx].p[0];
00308       p1 = img[iidx].p[1];
00309       p2 = img[iidx].p[2];
00310       if(p0+p1+p2 > 0.0F)
00311         {
00312           dst[didx].p[0] = p0;
00313           dst[didx].p[1] = p1;
00314           dst[didx].p[2] = p2;
00315         }
00316     }
00317 }
00318 
00319 
00320 #endif