/*!@file CUDA/CudaConvolutions.C C++ wrapper for CUDA convolution methods */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE. See the GNU General Public License for more details.        //
// 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/CudaConvolutions.C $ 00035 // $Id: CudaConvolutions.C 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 #include "CUDA/CudaImage.H" 00039 #include "Util/Assert.H" 00040 #include "CUDA/cudadefs.h" 00041 #include "Image/Convolutions.H" 00042 #include "CudaConvolutions.H" 00043 #include "CudaDevices.H" 00044 #include "wrap_c_cuda.h" 00045 #include <algorithm> 00046 00047 // ###################################################################### 00048 00049 CudaImage<float> cudaOptConvolve(const CudaImage<float>& src, const CudaImage<float>& f) 00050 { 00051 MemoryPolicy mp = src.getMemoryPolicy(); 00052 int dev = src.getMemoryDevice(); 00053 Dims tile = CudaDevices::getDeviceTileSize(dev); 00054 00055 ASSERT(src.initialized()); 00056 ASSERT(mp != HOST_MEMORY); 00057 const int src_w = src.getWidth(); 00058 const int src_h = src.getHeight(); 00059 00060 const int fil_w = f.getWidth(); 00061 const int fil_h = f.getHeight(); 00062 00063 ASSERT((fil_w & 1) && (fil_h & 1)); 00064 CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT,mp,dev); 00065 00066 cuda_c_optConvolve(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,f.getCudaArrayPtr(),fil_w,fil_h,tile.w(),tile.h()); 00067 return result; 00068 } 00069 00070 00071 // ###################################################################### 00072 00073 CudaImage<float> cudaConvolveZeroHelper(const CudaImage<float>& src, const CudaImage<float>& filter, 00074 const int Nx, const int Ny, bool runOptimized) 00075 
{ 00076 MemoryPolicy mp = src.getMemoryPolicy(); 00077 int dev = src.getMemoryDevice(); 00078 Dims tile = CudaDevices::getDeviceTileSize(dev); 00079 00080 ASSERT(src.initialized()); //ASSERT((Nx & 1) && (Ny & 1)); 00081 ASSERT(mp != HOST_MEMORY); 00082 const int src_w = src.getWidth(), src_h = src.getHeight(); 00083 CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT,mp,dev); 00084 00085 int mem_size = CudaDevices::getDeviceSharedMemorySize(dev)/int(sizeof(float)); 00086 // Decide whether we can run optimized versions based on size of filter 00087 if(runOptimized && mem_size < Nx*Ny+(tile.w()+Nx)*(tile.h()+Ny)) 00088 { 00089 printf("Unable to run convolveZeroHelper optimized\n"); 00090 runOptimized = false; 00091 } 00092 00093 if(runOptimized) 00094 cuda_c_convolveZeroHelperOptimized(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,filter.getCudaArrayPtr(),Nx,Ny,tile.w(),tile.h()); 00095 else 00096 cuda_c_convolveZeroHelper(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,filter.getCudaArrayPtr(),Nx,Ny,tile.w(),tile.h()); 00097 00098 return result; 00099 } 00100 00101 // ###################################################################### 00102 00103 CudaImage<float> cudaConvolveCleanHelper(const CudaImage<float>& src, const CudaImage<float>& filter, 00104 const int Nx, const int Ny, bool runOptimized) 00105 { 00106 MemoryPolicy mp = src.getMemoryPolicy(); 00107 int dev = src.getMemoryDevice(); 00108 Dims tile = CudaDevices::getDeviceTileSize(dev); 00109 00110 ASSERT(src.initialized()); //ASSERT((Nx & 1) && (Ny & 1)); 00111 ASSERT(mp != HOST_MEMORY); 00112 const int src_w = src.getWidth(), src_h = src.getHeight(); 00113 CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT,mp,dev); 00114 00115 cuda_c_convolveCleanHelper(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,filter.getCudaArrayPtr(),Nx,Ny,tile.w(),tile.h()); 00116 00117 return result; 00118 } 00119 00120 // 
###################################################################### 00121 CudaImage<float> cudaConvolveHmax(const CudaImage<float>& src, const CudaImage<float>& filter, bool runOptimized) 00122 { 00123 MemoryPolicy mp = src.getMemoryPolicy(); 00124 int dev = src.getMemoryDevice(); 00125 Dims tile = CudaDevices::getDeviceTileSize(dev); 00126 const int Nx = filter.getWidth(), Ny = filter.getHeight(); 00127 ASSERT(src.initialized()); 00128 ASSERT((Nx & 1) && (Ny & 1)); 00129 ASSERT(mp != HOST_MEMORY); 00130 00131 const int src_w = src.getWidth(), src_h = src.getHeight(); 00132 CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT,mp,dev); 00133 00134 int mem_size = CudaDevices::getDeviceSharedMemorySize(dev)/int(sizeof(float)); 00135 // Decide whether we can run optimized versions based on size of filter 00136 if(runOptimized && mem_size < Nx*Ny+(tile.w()+Nx)*(tile.h()+Ny)) 00137 { 00138 //printf("Unable to run convolveHmaxHelper optimized Nx %d Ny %d tw %d th %d\n",Nx,Ny,tile.w(),tile.h()); 00139 runOptimized = false; 00140 } 00141 00142 if(runOptimized) 00143 cuda_c_convolveHmaxHelperOptimized(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,filter.getCudaArrayPtr(),Nx,Ny,tile.w(),tile.h()); 00144 else 00145 cuda_c_convolveHmaxHelper(result.getCudaArrayPtr(),src.getCudaArrayPtr(),src_w,src_h,filter.getCudaArrayPtr(),Nx,Ny,tile.w(),tile.h()); 00146 00147 return result; 00148 00149 } 00150 00151 00152 // ###################################################################### 00153 CudaImage<float> cudaConvolve(const CudaImage<float>& src, const CudaImage<float>& filter, 00154 const int Nx, const int Ny, 00155 ConvolutionBoundaryStrategy boundary, bool runOptimized) 00156 { 00157 switch (boundary) 00158 { 00159 case CONV_BOUNDARY_ZERO: 00160 return cudaConvolveZeroHelper(src, filter, Nx, Ny, runOptimized); 00161 break; 00162 case CONV_BOUNDARY_CLEAN: 00163 return cudaConvolveCleanHelper(src, filter, Nx, Ny, runOptimized); 00164 break; 00165 case 
CONV_BOUNDARY_REPLICATE: 00166 // not implemented yet -- pass through to error 00167 default: 00168 LFATAL("convolution boundary strategy %d not supported", 00169 (int) boundary); 00170 } 00171 /* can't happen */ return CudaImage<float>(); 00172 } 00173 00174 // ###################################################################### 00175 CudaImage<float> cudaXFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs, 00176 ConvolutionBoundaryStrategy boundary, bool runOptimized) 00177 { 00178 MemoryPolicy mp = src.getMemoryPolicy(); 00179 int dev = src.getMemoryDevice(); 00180 ASSERT(src.initialized()); 00181 ASSERT(mp != HOST_MEMORY); 00182 if (hfs == 0) 00183 { 00184 return src; // no filter 00185 } 00186 00187 Dims tile = CudaDevices::getDeviceTileSize(dev); 00188 00189 // Needed for non-optimized functions 00190 int mem_size = CudaDevices::getDeviceSharedMemorySize(dev)/int(sizeof(float)); 00191 int share_len = std::min(mem_size,hfs); 00192 // Decide whether we can run optimized versions based on size of filter 00193 if(runOptimized && mem_size < hfs*2+tile.sz()) 00194 { 00195 Dims tileOpt = Dims(mem_size-hfs*2,1); 00196 if(tileOpt.sz() < 16) 00197 runOptimized = false; 00198 else 00199 tile = tileOpt; 00200 } 00201 const int w = src.getWidth(), h = src.getHeight(); 00202 CudaImage<float> result = CudaImage<float>(w, h, NO_INIT,mp,dev); 00203 00204 // *** horizontal pass *** 00205 if(runOptimized) 00206 { 00207 switch(boundary) 00208 { 00209 case CONV_BOUNDARY_ZERO: 00210 cuda_c_optXFilterZero(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00211 break; 00212 case CONV_BOUNDARY_CLEAN: 00213 cuda_c_optXFilterClean(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00214 break; 00215 case CONV_BOUNDARY_REPLICATE: 00216 cuda_c_optXFilterReplicate(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00217 break; 00218 
default: 00219 LFATAL("convolution boundary strategy %d not supported", 00220 (int) boundary); 00221 break; 00222 } 00223 } 00224 else 00225 { 00226 switch(boundary) 00227 { 00228 case CONV_BOUNDARY_ZERO: 00229 cuda_c_xFilterZero(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00230 break; 00231 case CONV_BOUNDARY_CLEAN: 00232 cuda_c_xFilterClean(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00233 break; 00234 case CONV_BOUNDARY_REPLICATE: 00235 cuda_c_xFilterReplicate(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00236 break; 00237 default: 00238 LFATAL("convolution boundary strategy %d not supported", 00239 (int) boundary); 00240 break; 00241 } 00242 } 00243 return result; 00244 } 00245 00246 // ###################################################################### 00247 CudaImage<float> cudaYFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs, 00248 ConvolutionBoundaryStrategy boundary, bool runOptimized) 00249 { 00250 MemoryPolicy mp = src.getMemoryPolicy(); 00251 int dev = src.getMemoryDevice(); 00252 ASSERT(src.initialized()); 00253 ASSERT(mp != HOST_MEMORY); 00254 if (hfs == 0) 00255 { 00256 return src; // no filter 00257 } 00258 00259 Dims tile = CudaDevices::getDeviceTileSize(dev); 00260 // Needed for non-optimized functions 00261 int mem_size = CudaDevices::getDeviceSharedMemorySize(dev)/int(sizeof(float)); 00262 int share_len = std::min(mem_size,hfs); 00263 00264 // Decide whether we can run optimized versions based on size of filter 00265 if(runOptimized && mem_size < hfs*2+tile.sz()) 00266 { 00267 // Modifying tile size 00268 Dims tileOpt = Dims(mem_size-hfs*2,1); 00269 if(tileOpt.sz() < 16) 00270 runOptimized = false; 00271 else 00272 tile = tileOpt; 00273 } 00274 const int w = src.getWidth(), h = src.getHeight(); 00275 CudaImage<float> result = 
CudaImage<float>(w, h, NO_INIT,mp,dev); 00276 00277 // *** horizontal pass *** 00278 if(runOptimized) 00279 { 00280 switch(boundary) 00281 { 00282 case CONV_BOUNDARY_ZERO: 00283 cuda_c_optYFilterZero(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00284 break; 00285 case CONV_BOUNDARY_CLEAN: 00286 cuda_c_optYFilterClean(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00287 break; 00288 case CONV_BOUNDARY_REPLICATE: 00289 cuda_c_optYFilterReplicate(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,tile.sz()); 00290 break; 00291 default: 00292 LFATAL("convolution boundary strategy %d not supported", 00293 (int) boundary); 00294 break; 00295 } 00296 } 00297 else 00298 { 00299 switch(boundary) 00300 { 00301 case CONV_BOUNDARY_ZERO: 00302 cuda_c_yFilterZero(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00303 break; 00304 case CONV_BOUNDARY_CLEAN: 00305 cuda_c_yFilterClean(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00306 break; 00307 case CONV_BOUNDARY_REPLICATE: 00308 cuda_c_yFilterReplicate(result.getCudaArrayPtr(),src.getCudaArrayPtr(),w,h,hFilt.getCudaArrayPtr(),hfs,share_len,tile.sz()); 00309 break; 00310 default: 00311 LFATAL("convolution boundary strategy %d not supported", 00312 (int) boundary); 00313 break; 00314 } 00315 } 00316 return result; 00317 } 00318 00319 00320 // ###################################################################### 00321 CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilter, 00322 const CudaImage<float>& vFilter, 00323 ConvolutionBoundaryStrategy boundary, bool runOptimized) 00324 { 00325 ASSERT(hFilter.is1D() || hFilter.getSize() == 0); 00326 ASSERT(vFilter.is1D() || vFilter.getSize() == 0); 00327 return cudaSepFilter(src, hFilter, vFilter, 00328 hFilter.getSize(), 
vFilter.getSize(), boundary, runOptimized); 00329 } 00330 00331 // ###################################################################### 00332 CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const CudaImage<float>& vFilt, 00333 const int hfs, const int vfs, 00334 ConvolutionBoundaryStrategy boundary, bool runOptimized) 00335 { 00336 00337 CudaImage<float> res=src; 00338 if (hfs > 0) res = cudaXFilter(src, hFilt, hfs, boundary, runOptimized); 00339 if (vfs > 0) res = cudaYFilter(res, vFilt, vfs, boundary, runOptimized); 00340 return res; 00341 }