00001 /*!@file CUDA/cuda-lowpass.h CUDA/GPU optimized lowpass filter code */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_shapeops.h $ 00035 // $Id: cuda_shapeops.h 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 #ifndef CUDA_SHAPEOPS_H_DEFINED 00039 #define CUDA_SHAPEOPS_H_DEFINED 00040 00041 #include <cuda.h> 00042 #include "CUDA/cutil.h" 00043 #include "cudadefs.h" 00044 00045 __global__ void cuda_global_dec_xy(const float *src, float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width) 00046 { 00047 00048 // Destination width, height 00049 const int new_width = w/x_factor; 00050 const int new_height = h/y_factor; 00051 const int new_size = new_width*new_height; 00052 00053 const int dx = threadIdx.x; // dest pixel within dest tile 00054 const int dts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 00055 const int drs = IMUL(blockIdx.y, new_width); // Row start index in dest data 00056 00057 const int srx = IMUL((dx+dts),x_factor); // src pixel in src row 00058 const int srs = IMUL(IMUL(blockIdx.y, w),y_factor); // Row start index in source data 00059 00060 const int writeIdx = drs + dts + dx; // write index 00061 const int loadIdx = srs + srx; // load index 00062 00063 // only process every so many pixels 00064 if(writeIdx < new_size && loadIdx < w*h && dts+dx < new_width) { 00065 dst[writeIdx] = src[loadIdx]; 00066 } 00067 00068 } 00069 00070 __global__ void cuda_global_dec_x(const float *src, float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width) 00071 { 00072 00073 // Destination width, height 00074 const int new_width = w/x_factor; 00075 const int new_size = new_width*h; 00076 00077 const int dx = threadIdx.x; // dest pixel within dest tile 00078 const int dts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 00079 const int drs = IMUL(blockIdx.y, new_width); // Row start index in dest data 00080 00081 const int srx = IMUL((dx+dts),x_factor); // src pixel in src row 00082 const int srs = IMUL(blockIdx.y, w); // Row start index in source data 00083 00084 const int writeIdx = drs + dts + dx; // write index 00085 const int loadIdx = srs + srx; // load index 00086 00087 // only process every so many pixels 00088 if(writeIdx < new_size && loadIdx < w*h && dts+dx < new_width) { 00089 dst[writeIdx] = src[loadIdx]; 00090 } 00091 00092 } 00093 00094 00095 __global__ void cuda_global_dec_y(const float *src, float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width) 00096 { 00097 00098 // Destination width, height 00099 const int new_height = h/y_factor; 00100 const int new_size = w*new_height; 00101 00102 const int dx = threadIdx.x; // dest pixel within dest tile 00103 const int dts = IMUL(blockIdx.x, tile_width); // tile start for source, relative to row start 00104 const int drs = IMUL(blockIdx.y, w); // Row start index in dest data 00105 00106 const int srx = dx+dts; // src pixel in src row 00107 const int srs = IMUL(IMUL(blockIdx.y, w),y_factor); // Row start index in source data 00108 00109 const int writeIdx = drs + dts + dx; // write index 00110 const int loadIdx = srs + srx; // load index 00111 00112 // only process every so many pixels 00113 if(writeIdx < new_size && loadIdx < w*h && dts+dx < w) { 00114 dst[writeIdx] = src[loadIdx]; 00115 } 00116 00117 } 00118 00119 __global__ void cuda_global_quickLocalAvg(const float *in, float *res, float fac, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00120 { 00121 const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg 00122 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg 00123 int sidx = IMUL(srs,sw) + scs; 00124 00125 if(scs < sw && srs < sh) 00126 { 00127 res[sidx] = 0; 00128 int offx = 0; int offy=0; 00129 // Remaining input pixels will be taken up by the last averaging pixel in each dimension 00130 if(scs == sw-1) offx+=remx; 00131 if(srs == sh-1) offy+=remy; 00132 00133 for(int j=0;j<scaley+offy;j++) 00134 { 00135 for(int i=0;i<scalex+offx;i++) 00136 { 00137 const int x_pos = IMUL(scs,scalex)+i; 00138 const int y_pos = IMUL(srs,scaley)+j; 00139 int lidx = IMUL(y_pos,lw) + x_pos; 00140 if(x_pos < lw && y_pos < lh) 00141 res[sidx] += in[lidx]; 00142 } 00143 } 00144 // Normalize by the area of the average 00145 res[sidx] *= fac; 00146 } 00147 00148 } 00149 00150 00151 __global__ void cuda_global_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00152 { 00153 const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg 00154 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg 00155 const int sidx = IMUL(srs,sw) + scs; 00156 00157 if(scs < sw && srs < sh) 00158 { 00159 res[sidx] = 0; 00160 const int x_pos = IMUL(scs,2); 00161 const int y_pos = IMUL(srs,2); 00162 int lidx = IMUL(y_pos,lw) + x_pos; 00163 00164 if(x_pos+1 < lw && y_pos+1 < lh) 00165 { 00166 res[sidx] = (in[lidx] + in[lidx+1] + in[lidx+lw] + in[lidx+lw+1])*0.25F; 00167 } 00168 } 00169 00170 } 00171 00172 __global__ void cuda_global_quickLocalMax(const float *in, float *res, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00173 { 00174 const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y; // row start for scaled avg 00175 const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x; // column index for scaled avg 00176 int sidx = IMUL(srs,sw) + scs; 00177 float curRes = -10000.0F; 00178 if(scs < sw && srs < sh) 00179 { 00180 int offx = 0; int offy=0; 00181 // Remaining input pixels will be taken up by the last averaging pixel in each dimension 00182 if(scs == sw-1) offx+=remx; 00183 if(srs == sh-1) offy+=remy; 00184 00185 for(int j=0;j<scaley+offy;j++) 00186 { 00187 for(int i=0;i<scalex+offx;i++) 00188 { 00189 const int x_pos = IMUL(scs,scalex)+i; 00190 const int y_pos = IMUL(srs,scaley)+j; 00191 int lidx = IMUL(y_pos,lw) + x_pos; 00192 if(x_pos < lw && y_pos < lh) 00193 curRes= fmaxf(in[lidx],curRes); 00194 } 00195 } 00196 // Normalize by the area of the average 00197 res[sidx] = curRes; 00198 } 00199 00200 } 00201 00202 00203 00204 __global__ void cuda_global_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height) 00205 { 00206 // code inspired from one of the Graphics Gems book: 00207 /* 00208 (1) (x,y) are the original coords corresponding to scaled coords (i,j) 00209 (2) (x0,y0) are the greatest lower bound integral coords from (x,y) 00210 (3) (x1,y1) are the least upper bound integral coords from (x,y) 00211 (4) d00, d10, d01, d11 are the values of the original image at the corners 00212 of the rect (x0,y0),(x1,y1) 00213 (5) the value in the scaled image is computed from bilinear interpolation 00214 among d00,d10,d01,d11 00215 */ 00216 00217 // Destination column 00218 const int dest_col = blockIdx.x*tile_width + threadIdx.x; 00219 // Destination row index 00220 const int dest_row = blockIdx.y*tile_height + threadIdx.y; 00221 // Destination index 00222 const int dest_idx = dest_row*new_w + dest_col; 00223 00224 if(dest_col < new_w && dest_row < new_h) 00225 { 00226 // Src column 00227 const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f); 00228 const int src_row0 = int(y); 00229 const int src_row1 = (int) fminf(src_row0+1,orig_h-1); 00230 const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f); 00231 const int src_col0 = int(x); 00232 const int src_col1 = (int) fminf(src_col0+1,orig_w-1); 00233 const float fy = y - float(src_row0); 00234 const float fx = x - float(src_col0); 00235 const int yw0 = IMUL(src_row0,orig_w); 00236 const int yw1 = IMUL(src_row1,orig_w); 00237 00238 const float d00 = src[yw0+src_col0]; 00239 const float d10 = src[yw0+src_col1]; 00240 const float d01 = src[yw1+src_col0]; 00241 const float d11 = src[yw1+src_col1]; 00242 float dx0 = d00 + (d10 - d00) * fx; 00243 float dx1 = d01 + (d11 - d01) * fx; 00244 res[dest_idx] = dx0 + (dx1 - dx0)*fy; 00245 } 00246 } 00247 00248 00249 __global__ void cuda_global_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height) 00250 { 00251 // code inspired from one of the Graphics Gems book: 00252 /* 00253 (1) (x,y) are the original coords corresponding to scaled coords (i,j) 00254 (2) (x0,y0) are the greatest lower bound integral coords from (x,y) 00255 (3) (x1,y1) are the least upper bound integral coords from (x,y) 00256 (4) d00, d10, d01, d11 are the values of the original image at the corners 00257 of the rect (x0,y0),(x1,y1) 00258 (5) the value in the scaled image is computed from bilinear interpolation 00259 among d00,d10,d01,d11 00260 */ 00261 00262 // Destination column 00263 const int dest_col = blockIdx.x*tile_width + threadIdx.x; 00264 // Destination row index 00265 const int dest_row = blockIdx.y*tile_height + threadIdx.y; 00266 // Destination index 00267 const int dest_idx = dest_row*new_w + dest_col; 00268 00269 if(dest_col < new_w && dest_row < new_h) 00270 { 00271 // Src column 00272 const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f); 00273 const int src_row0 = int(y); 00274 const int src_row1 = (int) fminf(src_row0+1,orig_h-1); 00275 const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f); 00276 const int src_col0 = int(x); 00277 const int src_col1 = (int) fminf(src_col0+1,orig_w-1); 00278 const float fy = y - float(src_row0); 00279 const float fx = x - float(src_col0); 00280 const int yw0 = IMUL(src_row0,orig_w); 00281 const int yw1 = IMUL(src_row1,orig_w); 00282 00283 float d00,d10,d01,d11; 00284 float dx0,dx1; 00285 for(int i=0;i<3;i++) 00286 { 00287 d00 = src[yw0+src_col0].p[i]; 00288 d10 = src[yw0+src_col1].p[i]; 00289 d01 = src[yw1+src_col0].p[i]; 00290 d11 = src[yw1+src_col1].p[i]; 00291 dx0 = d00 + (d10 - d00) * fx; 00292 dx1 = d01 + (d11 - d01) * fx; 00293 res[dest_idx].p[i] = dx0 + (dx1 - dx0)*fy; 00294 } 00295 } 00296 } 00297 00298 00299 #endif