#include "CUDA/CudaImage.H"
#include "Util/Assert.H"
#include "CUDA/cudadefs.h"
#include "Image/Convolutions.H"
#include "CudaConvolutions.H"
#include "CudaDevices.H"
#include "wrap_c_cuda.h"
#include <algorithm>

// 2D convolution on the device, calling the optimized CUDA kernel directly.
CudaImage<float> cudaOptConvolve(const CudaImage<float>& src, const CudaImage<float>& f)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth();
  const int src_h = src.getHeight();

  const int fil_w = f.getWidth();
  const int fil_h = f.getHeight();

  // the filter must have odd dimensions so that it has a well-defined center
  ASSERT((fil_w & 1) && (fil_h & 1));
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  cuda_c_optConvolve(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                     f.getCudaArrayPtr(), fil_w, fil_h, tile.w(), tile.h());
  return result;
}
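
// A minimal usage sketch for cudaOptConvolve (the names below are hypothetical;
// both images are assumed to already live in device memory, and the kernel is
// assumed to have odd dimensions, e.g. 3x3):
//
//   CudaImage<float> out = cudaOptConvolve(devImg, devKernel);
//   // "out" inherits the width, height, memory policy and device of "devImg"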


// 2D convolution with zero-padded boundaries. Falls back to the unoptimized
// kernel if the filter plus the padded tile would not fit in shared memory.
CudaImage<float> cudaConvolveZeroHelper(const CudaImage<float>& src, const CudaImage<float>& filter,
                                        const int Nx, const int Ny, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));

  // the optimized kernel needs room for the filter plus a tile padded by the
  // filter size along each axis
  if (runOptimized && mem_size < Nx*Ny + (tile.w()+Nx)*(tile.h()+Ny))
  {
    printf("Unable to run convolveZeroHelper optimized\n");
    runOptimized = false;
  }

  if (runOptimized)
    cuda_c_convolveZeroHelperOptimized(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                                       filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());
  else
    cuda_c_convolveZeroHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                              filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}
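
// To illustrate the shared-memory check above with concrete (assumed) numbers:
// with 16 KB of shared memory (4096 floats), a 16x16 device tile and a 9x9
// filter, the optimized kernel needs 9*9 + (16+9)*(16+9) = 81 + 625 = 706
// floats, which fits, so the optimized path is taken; a much larger filter or
// tile could exceed the 4096-float budget and trigger the fallback.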


// 2D convolution with "clean" boundary handling. Only an unoptimized kernel is
// invoked here; runOptimized is accepted for interface consistency but is
// currently unused.
CudaImage<float> cudaConvolveCleanHelper(const CudaImage<float>& src, const CudaImage<float>& filter,
                                         const int Nx, const int Ny, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);

  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  cuda_c_convolveCleanHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                             filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}

// Hmax-style convolution. The filter must have odd dimensions; falls back
// silently to the unoptimized kernel when shared memory is too small.
CudaImage<float> cudaConvolveHmax(const CudaImage<float>& src, const CudaImage<float>& filter, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  Dims tile = CudaDevices::getDeviceTileSize(dev);
  const int Nx = filter.getWidth(), Ny = filter.getHeight();
  ASSERT(src.initialized());
  ASSERT((Nx & 1) && (Ny & 1));
  ASSERT(mp != HOST_MEMORY);

  const int src_w = src.getWidth(), src_h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(src_w, src_h, NO_INIT, mp, dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));

  if (runOptimized && mem_size < Nx*Ny + (tile.w()+Nx)*(tile.h()+Ny))
  {
    // filter plus padded tile will not fit in shared memory; use the
    // unoptimized kernel instead
    runOptimized = false;
  }

  if (runOptimized)
    cuda_c_convolveHmaxHelperOptimized(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                                       filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());
  else
    cuda_c_convolveHmaxHelper(result.getCudaArrayPtr(), src.getCudaArrayPtr(), src_w, src_h,
                              filter.getCudaArrayPtr(), Nx, Ny, tile.w(), tile.h());

  return result;
}


// Dispatch a 2D convolution according to the requested boundary strategy.
CudaImage<float> cudaConvolve(const CudaImage<float>& src, const CudaImage<float>& filter,
                              const int Nx, const int Ny,
                              ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  switch (boundary)
  {
  case CONV_BOUNDARY_ZERO:
    return cudaConvolveZeroHelper(src, filter, Nx, Ny, runOptimized);
  case CONV_BOUNDARY_CLEAN:
    return cudaConvolveCleanHelper(src, filter, Nx, Ny, runOptimized);
  case CONV_BOUNDARY_REPLICATE:
    // replicate boundaries are not implemented for 2D convolution; fall through
  default:
    LFATAL("convolution boundary strategy %d not supported",
           (int) boundary);
  }
  return CudaImage<float>(); // not reached (LFATAL is fatal); keeps the compiler happy
}
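
// A minimal usage sketch for the dispatcher (names are hypothetical,
// device-resident CudaImage<float> objects; Nx/Ny are the filter dimensions):
//
//   CudaImage<float> out = cudaConvolve(devImg, devKernel,
//                                       devKernel.getWidth(), devKernel.getHeight(),
//                                       CONV_BOUNDARY_CLEAN, true);
//
// Note that CONV_BOUNDARY_REPLICATE is only handled by the separable filters
// below (cudaXFilter/cudaYFilter/cudaSepFilter), not by this 2D path.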


// Separable filtering along the x (horizontal) axis with a 1D filter of size
// hfs. If the filter plus tile exceed shared memory, the tile is shrunk; if it
// would become too small, the unoptimized kernel is used instead.
CudaImage<float> cudaXFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs,
                             ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  // an empty filter is a no-op
  if (hfs == 0)
  {
    return src;
  }

  Dims tile = CudaDevices::getDeviceTileSize(dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));
  int share_len = std::min(mem_size, hfs);

  // the optimized kernel needs twice the filter plus one tile in shared memory;
  // if that does not fit, shrink the tile, and give up on the optimized path
  // if the shrunken tile gets too small
  if (runOptimized && mem_size < hfs*2 + tile.sz())
  {
    Dims tileOpt = Dims(mem_size - hfs*2, 1);
    if (tileOpt.sz() < 16)
      runOptimized = false;
    else
      tile = tileOpt;
  }
  const int w = src.getWidth(), h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(w, h, NO_INIT, mp, dev);

  if (runOptimized)
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_optXFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_optXFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_optXFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  else
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_xFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_xFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_xFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  return result;
}
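
// Concretely (with assumed numbers): given 4096 floats of shared memory, a
// 9-tap filter and a default tile of 256 pixels, 2*9 + 256 = 274 floats fit
// easily, so the tile is left alone. With a (hypothetical) 2045-tap filter,
// 2*2045 = 4090 floats leave only 4096 - 4090 = 6 pixels of tile, which is
// below the 16-pixel minimum, so the unoptimized kernel is used instead.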


// Separable filtering along the y (vertical) axis; mirrors cudaXFilter above.
CudaImage<float> cudaYFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const int hfs,
                             ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  MemoryPolicy mp = src.getMemoryPolicy();
  int dev = src.getMemoryDevice();
  ASSERT(src.initialized());
  ASSERT(mp != HOST_MEMORY);
  // an empty filter is a no-op
  if (hfs == 0)
  {
    return src;
  }

  Dims tile = CudaDevices::getDeviceTileSize(dev);

  // shared memory capacity of this device, in floats
  int mem_size = CudaDevices::getDeviceSharedMemorySize(dev) / int(sizeof(float));
  int share_len = std::min(mem_size, hfs);

  // shrink the tile if the filter plus tile exceed shared memory; fall back to
  // the unoptimized kernel if the shrunken tile gets too small
  if (runOptimized && mem_size < hfs*2 + tile.sz())
  {
    Dims tileOpt = Dims(mem_size - hfs*2, 1);
    if (tileOpt.sz() < 16)
      runOptimized = false;
    else
      tile = tileOpt;
  }
  const int w = src.getWidth(), h = src.getHeight();
  CudaImage<float> result = CudaImage<float>(w, h, NO_INIT, mp, dev);

  if (runOptimized)
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_optYFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_optYFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_optYFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  else
  {
    switch (boundary)
    {
    case CONV_BOUNDARY_ZERO:
      cuda_c_yFilterZero(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_CLEAN:
      cuda_c_yFilterClean(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    case CONV_BOUNDARY_REPLICATE:
      cuda_c_yFilterReplicate(result.getCudaArrayPtr(), src.getCudaArrayPtr(), w, h, hFilt.getCudaArrayPtr(), hfs, share_len, tile.sz());
      break;
    default:
      LFATAL("convolution boundary strategy %d not supported",
             (int) boundary);
      break;
    }
  }
  return result;
}


// Separable filtering with 1D filters whose sizes are taken from the filter
// images themselves. Each filter must be 1D (or empty, meaning "skip this axis").
CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilter,
                               const CudaImage<float>& vFilter,
                               ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  ASSERT(hFilter.is1D() || hFilter.getSize() == 0);
  ASSERT(vFilter.is1D() || vFilter.getSize() == 0);
  return cudaSepFilter(src, hFilter, vFilter,
                       hFilter.getSize(), vFilter.getSize(), boundary, runOptimized);
}


// Separable filtering with explicit filter sizes: an x pass followed by a y pass.
CudaImage<float> cudaSepFilter(const CudaImage<float>& src, const CudaImage<float>& hFilt, const CudaImage<float>& vFilt,
                               const int hfs, const int vfs,
                               ConvolutionBoundaryStrategy boundary, bool runOptimized)
{
  CudaImage<float> res = src;
  if (hfs > 0) res = cudaXFilter(src, hFilt, hfs, boundary, runOptimized);
  if (vfs > 0) res = cudaYFilter(res, vFilt, vfs, boundary, runOptimized);
  return res;
}
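
// A minimal usage sketch for separable smoothing (names are hypothetical;
// "gauss1D" is assumed to be a 1D, device-resident CudaImage<float> holding
// the filter taps, used here for both axes):
//
//   CudaImage<float> smooth = cudaSepFilter(devImg, gauss1D, gauss1D,
//                                           CONV_BOUNDARY_CLEAN, true);
//
// Running the x pass and then the y pass is equivalent (up to boundary
// handling) to convolving with the 2D outer product of the two 1D filters,
// at much lower cost.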