#include "CUDA/CudaImage.H"
#include "Util/Assert.H"
#include "CUDA/cudadefs.h"
#include "Image/Convolutions.H"
#include "CudaConvolutions.H"
#include "CudaDevices.H"
#include "wrap_c_cuda.h"
#include <algorithm>

// 2D convolution on the device, calling the optimized CUDA kernel directly.
CudaImage<float> cudaOptConvolve(const CudaImage<float>& src, const CudaImage<float>& f)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth();
  const int src_h = src.getHeight();

  const int fil_w = f.getWidth();
  const int fil_h = f.getHeight();

  // the filter must have odd dimensions so that it has a well-defined center
  ASSERT((fil_w & 1) && (fil_h & 1));
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  cuda_c_optConvolve(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                     f.getCudaArrayPtr(), fil_w, fil_h, tile.w(), tile.h());
  return result;
}
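
// A minimal usage sketch for cudaOptConvolve (the names below are hypothetical;
// both images are assumed to already live in device memory, and the kernel is
// assumed to have odd dimensions, e.g. 3x3):
//
//   CudaImage<float> out = cudaOptConvolve(devImg, devKernel);
//   // "out" inherits the width, height, memory policy and device of "devImg"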


// 2D convolution with zero-padded boundaries. Falls back to the unoptimized
// kernel if the filter plus the padded tile would not fit in shared memory.
CudaImage<float> cudaConvolveZeroHelper(const CudaImage<float>& src, const CudaImage<float>& filter,
                                        const int Nx, const int Ny, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));

  // the optimized kernel needs room for the filter plus a tile padded by the
  // filter size along each axis
  if (runOptimized && mem_size < Nx*Ny + (tile.w()+Nx)*(tile.h()+Ny))
  {
    printf("Unable to run convolveZeroHelper optimized\n");
    runOptimized = false;
  }

  if (runOptimized)
    cuda_c_convolveZeroHelperOptimized(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                                       filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());
  else
    cuda_c_convolveZeroHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                              filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}
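
// To illustrate the shared-memory check above with concrete (assumed) numbers:
// with 16 KB of shared memory (4096 floats), a 16x16 device tile and a 9x9
// filter, the optimized kernel needs 9*9 + (16+9)*(16+9) = 81 + 625 = 706
// floats, which fits, so the optimized path is taken; a much larger filter or
// tile could exceed the 4096-float budget and trigger the fallback.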


// 2D convolution with "clean" boundary handling. Only an unoptimized kernel is
// invoked here; runOptimized is accepted for interface consistency but is
// currently unused.
CudaImage<float> cudaConvolveCleanHelper(const CudaImage<float>& src, const CudaImage<float>& filter,
                                         const int Nx, const int Ny, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  cuda_c_convolveCleanHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                             filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}

// Hmax-style convolution. The filter must have odd dimensions; falls back
// silently to the unoptimized kernel when shared memory is too small.
CudaImage<float> cudaConvolveHmax(const CudaImage<float>& src, const CudaImage<float>& filter, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);
  const int Nx = filter.getWidth(), Ny = filter.getHeight();
  ASSERT(src.initialized());
  ASSERT((Nx & 1) && (Ny & 1));
  ASSERT(mp != HOST_MEMORY);

  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));

  if (runOptimized && mem_size < Nx*Ny + (tile.w()+Nx)*(tile.h()+Ny))
  {
    // filter plus padded tile will not fit in shared memory; use the
    // unoptimized kernel instead
    runOptimized = false;
  }

  if (runOptimized)
    cuda_c_convolveHmaxHelperOptimized(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                                       filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());
  else
    cuda_c_convolveHmaxHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                              filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}


// Dispatch a 2D convolution according to the requested boundary strategy.
CudaImage<float> cudaConvolve(const CudaImage<float>& src, const CudaImage<float>& filter,
                              const int Nx, const int Ny,
                              ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  switch (boundary)
  {
  case CONV_BOUNDARY_ZERO:
    return cudaConvolveZeroHelper(src, filter, Nx, Ny, runOptimized);
  case CONV_BOUNDARY_CLEAN:
    return cudaConvolveCleanHelper(src, filter, Nx, Ny, runOptimized);
  case CONV_BOUNDARY_REPLICATE:
    // replicate boundaries are not implemented for 2D convolution; fall through
  default:
    LFATAL("convolution boundary strategy %d not supported",
           (int) boundary);
  }
  return CudaImage<float>(); // not reached (LFATAL is fatal); keeps the compiler happy
}
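
// A minimal usage sketch for the dispatcher (names are hypothetical,
// device-resident CudaImage<float> objects; Nx/Ny are the filter dimensions):
//
//   CudaImage<float> out = cudaConvolve(devImg, devKernel,
//                                       devKernel.getWidth(), devKernel.getHeight(),
//                                       CONV_BOUNDARY_CLEAN, true);
//
// Note that CONV_BOUNDARY_REPLICATE is only handled by the separable filters
// below (cudaXFilter/cudaYFilter/cudaSepFilter), not by this 2D path.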


// Separable filtering along the x (horizontal) axis with a 1D filter of size
// hfs. If the filter plus tile exceed shared memory, the tile is shrunk; if it
// would become too small, the unoptimized kernel is used instead.
CudaImage<float> cudaXFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs,
                             ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  // an empty filter is a no-op
  if (hfs == 0)
  {
    return src;
  }

  Dims tile = CudaDevices::getDeviceTileSize(dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));
  int share_len = std::min(mem_size, hfs);

  // the optimized kernel needs twice the filter plus one tile in shared memory;
  // if that does not fit, shrink the tile, and give up on the optimized path
  // if the shrunken tile gets too small
  if (runOptimized && mem_size < hfs*2 + tile.sz())
  {
    Dims tileOpt = Dims(mem_size - hfs*2, 1);
    if (tileOpt.sz() < 16)
      runOptimized = false;
    else
      tile = tileOpt;
  }
  const int w = src.getWidth(), h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(w, h, NO_INIT, mp, dev);

  if (runOptimized)
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_optXFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_optXFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_optXFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  else
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_xFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_xFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_xFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  return result;
}
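
// Concretely (with assumed numbers): given 4096 floats of shared memory, a
// 9-tap filter and a default tile of 256 pixels, 2*9 + 256 = 274 floats fit
// easily, so the tile is left alone. With a (hypothetical) 2045-tap filter,
// 2*2045 = 4090 floats leave only 4096 - 4090 = 6 pixels of tile, which is
// below the 16-pixel minimum, so the unoptimized kernel is used instead.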


// Separable filtering along the y (vertical) axis; mirrors cudaXFilter above.
CudaImage<float> cudaYFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs,
                             ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  // an empty filter is a no-op
  if (hfs == 0)
  {
    return src;
  }

  Dims tile = CudaDevices::getDeviceTileSize(dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));
  int share_len = std::min(mem_size, hfs);

  // shrink the tile if the filter plus tile exceed shared memory; fall back to
  // the unoptimized kernel if the shrunken tile gets too small
  if (runOptimized && mem_size < hfs*2 + tile.sz())
  {
    Dims tileOpt = Dims(mem_size - hfs*2, 1);
    if (tileOpt.sz() < 16)
      runOptimized = false;
    else
      tile = tileOpt;
  }
  const int w = src.getWidth(), h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(w, h, NO_INIT, mp, dev);

  if (runOptimized)
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_optYFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_optYFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_optYFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  else
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_yFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_yFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_yFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  return result;
}


// Separable filtering with 1D filters whose sizes are taken from the filter
// images themselves. Each filter must be 1D (or empty, meaning "skip this axis").
CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilter,
                               const CudaImage<float>& vFilter,
                               ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  ASSERT(hFilter.is1D() || hFilter.getSize() == 0);
  ASSERT(vFilter.is1D() || vFilter.getSize() == 0);
  return cudaSepFilter(src, hFilter, vFilter,
                       hFilter.getSize(), vFilter.getSize(), boundary, runOptimized);
}


// Separable filtering with explicit filter sizes: an x pass followed by a y pass.
CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const CudaImage<float>& vFilt,
                               const int hfs, const int vfs,
                               ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  CudaImage<float> res = src;
  if (hfs > 0) res = cudaXFilter(src, hFilt, hfs, boundary, runOptimized);
  if (vfs > 0) res = cudaYFilter(res, vFilt, vfs, boundary, runOptimized);
  return res;
}
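
// A minimal usage sketch for separable smoothing (names are hypothetical;
// "gauss1D" is assumed to be a 1D, device-resident CudaImage<float> holding
// the filter taps, used here for both axes):
//
//   CudaImage<float> smooth = cudaSepFilter(devImg, gauss1D, gauss1D,
//                                           CONV_BOUNDARY_CLEAN, true);
//
// Running the x pass and then the y pass is equivalent (up to boundary
// handling) to convolving with the 2D outer product of the two 1D filters,
// at much lower cost.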