cuda_shapeops.h

00001 /*!@file CUDA/cuda-lowpass.h CUDA/GPU optimized lowpass filter code */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file:
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_shapeops.h $
00035 // $Id: cuda_shapeops.h 12962 2010-03-06 02:13:53Z irock $
00036 //
00037 
00038 #ifndef CUDA_SHAPEOPS_H_DEFINED
00039 #define CUDA_SHAPEOPS_H_DEFINED
00040 
00041 #include <cuda.h>
00042 #include "CUDA/cutil.h"
00043 #include "cudadefs.h"
00044 
00045 __global__ void cuda_global_dec_xy(const float *src,  float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00046 {
00047 
00048   // Destination width, height
00049   const int new_width = w/x_factor;
00050   const int new_height = h/y_factor;
00051   const int new_size = new_width*new_height;
00052 
00053   const int dx = threadIdx.x;                    // dest pixel within dest tile
00054   const int dts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
00055   const int drs = IMUL(blockIdx.y, new_width);   // Row start index in dest data
00056 
00057   const int srx = IMUL((dx+dts),x_factor);             // src pixel in src row
00058   const int srs = IMUL(IMUL(blockIdx.y, w),y_factor);  // Row start index in source data
00059 
00060   const int writeIdx = drs + dts + dx; // write index
00061   const int loadIdx = srs + srx;  // load index
00062 
00063   // only process every so many pixels
00064   if(writeIdx < new_size  && loadIdx < w*h && dts+dx < new_width) {
00065     dst[writeIdx] = src[loadIdx];
00066   }
00067 
00068 }
00069 
00070 __global__ void cuda_global_dec_x(const float *src,  float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width)
00071 {
00072 
00073   // Destination width, height
00074   const int new_width = w/x_factor;
00075   const int new_size = new_width*h;
00076 
00077   const int dx = threadIdx.x;                    // dest pixel within dest tile
00078   const int dts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
00079   const int drs = IMUL(blockIdx.y, new_width);   // Row start index in dest data
00080 
00081   const int srx = IMUL((dx+dts),x_factor);       // src pixel in src row
00082   const int srs = IMUL(blockIdx.y, w);           // Row start index in source data
00083 
00084   const int writeIdx = drs + dts + dx; // write index
00085   const int loadIdx = srs + srx;  // load index
00086 
00087   // only process every so many pixels
00088   if(writeIdx < new_size  && loadIdx < w*h && dts+dx < new_width) {
00089     dst[writeIdx] = src[loadIdx];
00090   }
00091 
00092 }
00093 
00094 
00095 __global__ void cuda_global_dec_y(const float *src,  float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00096 {
00097 
00098   // Destination width, height
00099   const int new_height = h/y_factor;
00100   const int new_size = w*new_height;
00101 
00102   const int dx = threadIdx.x;                    // dest pixel within dest tile
00103   const int dts = IMUL(blockIdx.x, tile_width);  // tile start for source, relative to row start
00104   const int drs = IMUL(blockIdx.y, w);           // Row start index in dest data
00105 
00106   const int srx = dx+dts;                              // src pixel in src row
00107   const int srs = IMUL(IMUL(blockIdx.y, w),y_factor);  // Row start index in source data
00108 
00109   const int writeIdx = drs + dts + dx; // write index
00110   const int loadIdx = srs + srx;  // load index
00111 
00112   // only process every so many pixels
00113   if(writeIdx < new_size  && loadIdx < w*h && dts+dx < w) {
00114     dst[writeIdx] = src[loadIdx];
00115   }
00116 
00117 }
00118 
00119 __global__ void cuda_global_quickLocalAvg(const float *in, float *res, float fac, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00120 {
00121   const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg
00122   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg
00123   int sidx = IMUL(srs,sw) + scs;
00124 
00125   if(scs < sw && srs < sh)
00126   {
00127     res[sidx] = 0;
00128     int offx = 0; int offy=0;
00129     // Remaining input pixels will be taken up by the last averaging pixel in each dimension
00130     if(scs == sw-1) offx+=remx;
00131     if(srs == sh-1) offy+=remy;
00132 
00133     for(int j=0;j<scaley+offy;j++)
00134     {
00135       for(int i=0;i<scalex+offx;i++)
00136         {
00137           const int x_pos = IMUL(scs,scalex)+i;
00138           const int y_pos = IMUL(srs,scaley)+j;
00139           int lidx = IMUL(y_pos,lw) + x_pos;
00140           if(x_pos < lw && y_pos < lh)
00141             res[sidx] += in[lidx];
00142         }
00143     }
00144     // Normalize by the area of the average
00145     res[sidx] *= fac;
00146   }
00147 
00148 }
00149 
00150 
00151 __global__ void cuda_global_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00152 {
00153   const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg
00154   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg
00155   const int sidx = IMUL(srs,sw) + scs;
00156 
00157   if(scs < sw && srs < sh)
00158   {
00159     res[sidx] = 0;
00160     const int x_pos = IMUL(scs,2);
00161     const int y_pos = IMUL(srs,2);
00162     int lidx = IMUL(y_pos,lw) + x_pos;
00163 
00164     if(x_pos+1 < lw && y_pos+1 < lh)
00165     {
00166       res[sidx] = (in[lidx] + in[lidx+1] + in[lidx+lw] + in[lidx+lw+1])*0.25F;
00167     }
00168   }
00169 
00170 }
00171 
00172 __global__ void cuda_global_quickLocalMax(const float *in, float *res, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00173 {
00174   const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg
00175   const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg
00176   int sidx = IMUL(srs,sw) + scs;
00177   float curRes = -10000.0F;
00178   if(scs < sw && srs < sh)
00179   {
00180     int offx = 0; int offy=0;
00181     // Remaining input pixels will be taken up by the last averaging pixel in each dimension
00182     if(scs == sw-1) offx+=remx;
00183     if(srs == sh-1) offy+=remy;
00184 
00185     for(int j=0;j<scaley+offy;j++)
00186     {
00187       for(int i=0;i<scalex+offx;i++)
00188         {
00189           const int x_pos = IMUL(scs,scalex)+i;
00190           const int y_pos = IMUL(srs,scaley)+j;
00191           int lidx = IMUL(y_pos,lw) + x_pos;
00192           if(x_pos < lw && y_pos < lh)
00193             curRes= fmaxf(in[lidx],curRes);
00194         }
00195     }
00196     // Normalize by the area of the average
00197     res[sidx] = curRes;
00198   }
00199 
00200 }
00201 
00202 
00203 
00204 __global__ void cuda_global_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00205 {
00206   // code inspired from one of the Graphics Gems book:
00207   /*
00208     (1) (x,y) are the original coords corresponding to scaled coords (i,j)
00209     (2) (x0,y0) are the greatest lower bound integral coords from (x,y)
00210     (3) (x1,y1) are the least upper bound integral coords from (x,y)
00211     (4) d00, d10, d01, d11 are the values of the original image at the corners
00212         of the rect (x0,y0),(x1,y1)
00213     (5) the value in the scaled image is computed from bilinear interpolation
00214         among d00,d10,d01,d11
00215   */
00216 
00217   // Destination column
00218   const int dest_col = blockIdx.x*tile_width + threadIdx.x;
00219   // Destination row index
00220   const int dest_row = blockIdx.y*tile_height + threadIdx.y;
00221   // Destination index
00222   const int dest_idx = dest_row*new_w + dest_col;
00223 
00224   if(dest_col < new_w && dest_row < new_h)
00225   {
00226     // Src column
00227     const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f);
00228     const int src_row0 = int(y);
00229     const int src_row1 = (int) fminf(src_row0+1,orig_h-1);
00230     const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f);
00231     const int src_col0 = int(x);
00232     const int src_col1 = (int) fminf(src_col0+1,orig_w-1);
00233     const float fy = y - float(src_row0);
00234     const float fx = x - float(src_col0);
00235     const int yw0 = IMUL(src_row0,orig_w);
00236     const int yw1 = IMUL(src_row1,orig_w);
00237 
00238     const float d00 = src[yw0+src_col0];
00239     const float d10 = src[yw0+src_col1];
00240     const float d01 = src[yw1+src_col0];
00241     const float d11 = src[yw1+src_col1];
00242     float dx0 = d00 + (d10 - d00) * fx;
00243     float dx1 = d01 + (d11 - d01) * fx;
00244     res[dest_idx] = dx0 + (dx1 - dx0)*fy;
00245   }
00246 }
00247 
00248 
00249 __global__ void cuda_global_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00250 {
00251   // code inspired from one of the Graphics Gems book:
00252   /*
00253     (1) (x,y) are the original coords corresponding to scaled coords (i,j)
00254     (2) (x0,y0) are the greatest lower bound integral coords from (x,y)
00255     (3) (x1,y1) are the least upper bound integral coords from (x,y)
00256     (4) d00, d10, d01, d11 are the values of the original image at the corners
00257         of the rect (x0,y0),(x1,y1)
00258     (5) the value in the scaled image is computed from bilinear interpolation
00259         among d00,d10,d01,d11
00260   */
00261 
00262   // Destination column
00263   const int dest_col = blockIdx.x*tile_width + threadIdx.x;
00264   // Destination row index
00265   const int dest_row = blockIdx.y*tile_height + threadIdx.y;
00266   // Destination index
00267   const int dest_idx = dest_row*new_w + dest_col;
00268 
00269   if(dest_col < new_w && dest_row < new_h)
00270   {
00271     // Src column
00272     const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f);
00273     const int src_row0 = int(y);
00274     const int src_row1 = (int) fminf(src_row0+1,orig_h-1);
00275     const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f);
00276     const int src_col0 = int(x);
00277     const int src_col1 = (int) fminf(src_col0+1,orig_w-1);
00278     const float fy = y - float(src_row0);
00279     const float fx = x - float(src_col0);
00280     const int yw0 = IMUL(src_row0,orig_w);
00281     const int yw1 = IMUL(src_row1,orig_w);
00282 
00283     float d00,d10,d01,d11;
00284     float dx0,dx1;
00285     for(int i=0;i<3;i++)
00286       {
00287         d00 = src[yw0+src_col0].p[i];
00288         d10 = src[yw0+src_col1].p[i];
00289         d01 = src[yw1+src_col0].p[i];
00290         d11 = src[yw1+src_col1].p[i];
00291         dx0 = d00 + (d10 - d00) * fx;
00292         dx1 = d01 + (d11 - d01) * fx;
00293         res[dest_idx].p[i] = dx0 + (dx1 - dx0)*fy;
00294       }
00295   }
00296 }
00297 
00298 
00299 #endif