/*!@file CUDA/test-cuda-lowpass.C test CUDA/GPU optimized lowpass filtering routines */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //

#include "Envision/env_c_math_ops.h" // reference implementation
#include "Envision/env_image_ops.h"  // reference implementation
#include "Envision/env_pyr.h"
#include "CUDA/cuda-lowpass.h"
#include "CUDA/cutil.h"
#include "CUDA/env_cuda.h"
#include "Image/Image.H"
#include "Image/MathOps.H"
#include "Raster/Raster.H"
#include "Util/log.H"
#include "Util/Timer.H"
#include <cuda_runtime_api.h>
#include <stdio.h>

#include "Envision/env_alloc.h"
#include "Envision/env_c_math_ops.h"
#include "Envision/env_image.h"
#include "Envision/env_image_ops.h"
#include "Envision/env_log.h"
#include "Envision/env_mt_visual_cortex.h"
#include "Envision/env_params.h"
#include "Envision/env_stdio_interface.h"

////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
// Round a / b to nearest higher integer value
inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
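// For example, iDivUp(640, 2) == 320 but iDivUp(641, 2) == 321, whereas plain
// integer division truncates to 320 in both cases; this rounding is what sizes
// the half-resolution result images further below.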

// Round a / b to nearest lower integer value
inline int iDivDown(int a, int b) { return a / b; }

// Align a to nearest higher multiple of b
inline int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; }

// Align a to nearest lower multiple of b
inline int iAlignDown(int a, int b) { return a - a % b; }

#define NREP 20

// ######################################################################
// Thunk to convert from env_size_t to size_t
static void* malloc_thunk(env_size_t n)
{
  return malloc(n);
}

// ######################################################################
void compareregions(Image<int> &c, Image<int> &g, const uint rowStart, const uint rowStop,
                    const uint colStart, const uint colStop)
{
  uint w, h;
  w = c.getWidth();
  h = c.getHeight();
  if (w != (uint) g.getWidth() || h != (uint) g.getHeight())
    {
      LINFO("Images are not the same size");
      return;
    }
  if (rowStart > rowStop || colStart > colStop || rowStop > h || colStop > w)
    {
      LINFO("Invalid regions to compare");
      return;
    }
  for (uint i = colStart; i < colStop; i++)
    {
      printf("\nC[%d]: ", i);
      for (uint j = rowStart; j < rowStop; j++)
        {
          printf("%d ", c.getVal(i, j));
        }
      printf("\nG[%d]: ", i);
      for (uint j = rowStart; j < rowStop; j++)
        {
          printf("%d ", g.getVal(i, j));
        }
    }
  printf("\n");
}

// ######################################################################
void imgcompare(const int* cpu, const int* gpu, const uint w, const uint h)
{
  Image<int> c(cpu, w, h), g(gpu, w, h);
  Image<float> diff = g - c;
  float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
  LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
        mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
  //compareregions(c,g,0,30,575,600);
}

// ######################################################################
void imgcompare(const float* cpu, const float* gpu, const uint w, const uint h)
{
  Image<float> c(cpu, w, h), g(gpu, w, h);
  Image<float> diff = g - c;
  float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
  LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
        mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
  //compareregions(c,g,0,30,575,600);
}
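
// ######################################################################
// Illustrative sketch only -- not called by the tests in this file. The
// imgcompare() overloads above report PASS only when the GPU and CPU results
// match exactly (min == max == 0 in the difference image). If one ever wanted
// to tolerate small rounding differences instead, a variant could look like
// the following (the name imgcompare_tol and the default threshold are
// assumptions for illustration, not part of the toolkit API):
static void imgcompare_tol(const int* cpu, const int* gpu,
                           const uint w, const uint h, const float tol = 0.5F)
{
  Image<int> c(cpu, w, h), g(gpu, w, h);
  Image<float> diff = g - c;               // per-pixel GPU - CPU difference
  float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
  LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f] (tol=%f)",
        (mi >= -tol && ma <= tol) ? "PASS" : "FAIL", w, h, av, mi, ma, tol);
}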
"PASS" : "FAIL", w, h, av, mi, ma); 00134 //compareregions(c,g,0,30,575,600); 00135 } 00136 00137 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file); 00138 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file); 00139 00140 //////////////////////////////////////////////////////////////////////////////// 00141 // Main program 00142 //////////////////////////////////////////////////////////////////////////////// 00143 int main(int argc, char **argv) 00144 { 00145 if (argc != 4) LFATAL("USAGE: %s <input.pgm> <outCPU.pgm> <outGPU.pgm>", argv[0]); 00146 00147 CUT_DEVICE_INIT(0); 00148 LINFO("Reading: %s", argv[1]); 00149 Image<byte> img = Raster::ReadGray(argv[1]); 00150 Image<int> iimg = img; // convert to ints 00151 test_lowpass5(iimg,argv[2],argv[3]); 00152 //test_lowpass9(iimg,argv[2],argv[3]); 00153 00154 } 00155 00156 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file) 00157 { 00158 int *dsrc, *ddst, *ddst2; 00159 const uint w = iimg.getWidth(), h = iimg.getHeight(); 00160 const uint siz = w * h * sizeof(int); 00161 LINFO("Processing %ux%u image on lowpass5...", w, h); 00162 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) ); // note: (void*) to avoid compiler warn 00163 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz/2) ); 00164 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz/4) ); 00165 CUDA_SAFE_CALL( cudaThreadSynchronize() ); 00166 CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) ); 00167 Timer tim; 00168 00169 LINFO("GPU go!"); tim.reset(); 00170 for (uint ii = 0; ii < NREP; ++ii) 00171 { 00172 cuda_lowpass_5_x_dec_x_fewbits_optim(dsrc, w, h, ddst); 00173 CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n"); 00174 cuda_lowpass_5_y_dec_y_fewbits_optim(ddst, w/2, h, ddst2); 00175 CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n"); 00176 } 00177 LINFO("GPU done! %fms", tim.getSecs() * 1000.0F); 00178 00179 LINFO("Reading back GPU results... siz/4 %d",siz/4); 00180 Image<int> ires(iDivUp(w,2), iDivUp(h,2), ZEROS); 00181 00182 CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz/4, cudaMemcpyDeviceToHost) ); 00183 CUDA_SAFE_CALL( cudaThreadSynchronize() ); 00184 00185 Image<byte> bres = ires; // will convert and clamp as necessary 00186 00187 Raster::WriteGray(bres, gpu_int_file); 00188 00189 // compare with CPU: 00190 Image<int> ires2(iDivUp(w,2), h, NO_INIT), ires3(iDivUp(w,2), iDivUp(h,2), NO_INIT); 00191 LINFO("CPU go!"); tim.reset(); 00192 for (uint ii = 0; ii < NREP; ++ii) 00193 { 00194 env_c_lowpass_5_x_dec_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h, 00195 (intg32*)ires2.getArrayPtr(), w/2); 00196 env_c_lowpass_5_y_dec_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w/2, h, 00197 (intg32*)ires3.getArrayPtr(), h/2); 00198 } 00199 LINFO("CPU done! 
%fms", tim.getSecs() * 1000.0F); 00200 00201 Raster::WriteGray(Image<byte>(ires3), cpu_file); 00202 00203 imgcompare(ires3.getArrayPtr(), ires.getArrayPtr(), w/2, h/2); 00204 00205 CUDA_SAFE_CALL( cudaFree(ddst2) ); 00206 CUDA_SAFE_CALL( cudaFree(ddst) ); 00207 CUDA_SAFE_CALL( cudaFree(dsrc) ); 00208 00209 // ###################################################################### 00210 LINFO("Moving on to pyramid test..."); 00211 00212 struct env_params envp; 00213 env_params_set_defaults(&envp); 00214 00215 envp.maxnorm_type = ENV_VCXNORM_MAXNORM; 00216 envp.scale_bits = 16; 00217 00218 env_assert_set_handler(&env_stdio_assert_handler); 00219 env_allocation_init(&malloc_thunk, &free); 00220 00221 env_params_validate(&envp); 00222 struct env_math imath; 00223 env_init_integer_math(&imath, &envp); 00224 00225 struct env_image einput; const struct env_dims di = { iimg.getWidth(), iimg.getHeight() }; 00226 env_img_init(&einput, di); 00227 00228 const int d = 8, firstlevel = 0; 00229 memcpy((void*)env_img_pixelsw(&einput), iimg.getArrayPtr(), iimg.getSize() * sizeof(int)); 00230 struct env_pyr gpyr; env_pyr_init(&gpyr, d); 00231 00232 // if firstlevel is zero, copy source image into level 0 of the pyramid: 00233 if (firstlevel == 0) env_img_copy_src_dst(&einput, env_pyr_imgw(&gpyr, 0)); 00234 00235 // allocate device memory: 00236 const env_size_t depth = env_pyr_depth(&gpyr); 00237 const env_size_t wh = w * h; 00238 const env_size_t rsiz = siz / 2; // siz/3 would be enough except for a bunch of small 1x1 levels 00239 int *dres, *dtmp; 00240 CUDA_SAFE_CALL(cudaMalloc((void **)(void *)&dsrc, siz + siz/2 + rsiz)); // (void*) to avoid compiler warn 00241 dtmp = dsrc + wh; dres = dtmp + wh/2; 00242 00243 // copy source image to device memory: 00244 CUDA_SAFE_CALL(cudaThreadSynchronize()); 00245 CUDA_SAFE_CALL(cudaMemcpy(dsrc, env_img_pixels(&einput), siz, cudaMemcpyHostToDevice)); 00246 00247 // run the pyramid in DEVICE memory: 00248 env_size_t outw[depth], outh[depth]; 00249 LINFO("GPU pyramid go!"); tim.reset(); 00250 for (uint ii = 0; ii < NREP; ++ii) 00251 cudacore_pyr_build_lowpass_5(dsrc, dtmp, dres, depth, w, h, outw, outh); 00252 LINFO("GPU pyramid done! %fms", tim.getSecs() * 1000.0F); 00253 00254 // collect the results, starting at firstlevel and ignoring previous 00255 // levels; level 0 (if desired) has been handled already on the CPU 00256 // (simple memcpy) and is not in dres, which starts at level 1: 00257 int *dresptr = dres; 00258 for (env_size_t lev = 1; lev < depth; ++lev) { 00259 // get a pointer to image at that level: 00260 struct env_image *res = env_pyr_imgw(&gpyr, lev); 00261 const env_size_t ww = outw[lev], hh = outh[lev]; 00262 00263 if (lev < firstlevel) 00264 env_img_make_empty(res); // kill that level 00265 else { 00266 const struct env_dims di = { ww, hh }; 00267 env_img_resize_dims(res, di); 00268 CUDA_SAFE_CALL(cudaMemcpy((void *)env_img_pixels(res), dresptr, 00269 ww * hh * sizeof(int), cudaMemcpyDeviceToHost)); 00270 } 00271 00272 // ready for next level: 00273 dresptr += ww * hh; 00274 } 00275 00276 // free allocated memory: 00277 CUDA_SAFE_CALL( cudaFree(dsrc) ); 00278 00279 // compute on CPU: 00280 struct env_pyr cpyr; env_pyr_init(&cpyr, d); 00281 LINFO("CPU pyramid go!"); tim.reset(); 00282 for (uint ii = 0; ii < NREP; ++ii) 00283 env_pyr_build_lowpass_5_cpu(&einput, firstlevel, &imath, &cpyr); 00284 LINFO("CPU pyramid done! 
%fms", tim.getSecs() * 1000.0F); 00285 00286 // compare results: 00287 outw[0] = w; outh[0] = h; 00288 for (uint ii = firstlevel; ii < depth; ++ii) 00289 imgcompare((const int*)env_img_pixels(env_pyr_img(&cpyr, ii)), 00290 (const int*)env_img_pixels(env_pyr_img(&gpyr, ii)), outw[ii], outh[ii]); 00291 } 00292 00293 00294 00295 00296 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file) 00297 { 00298 int *dsrc, *ddst, *ddst2; 00299 const uint w = iimg.getWidth(), h = iimg.getHeight(); 00300 const uint siz = w * h * sizeof(int); 00301 00302 LINFO("Processing %ux%u image on lowpass9...", w, h); 00303 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) ); // note: (void*) to avoid compiler warn 00304 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz) ); 00305 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz) ); 00306 CUDA_SAFE_CALL( cudaThreadSynchronize() ); 00307 CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) ); 00308 Timer tim; 00309 00310 LINFO("GPU int go!"); tim.reset(); 00311 for (uint ii = 0; ii < NREP; ++ii) 00312 { 00313 cuda_lowpass_9_x_fewbits_optim(dsrc, w, h, ddst); 00314 CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n"); 00315 cuda_lowpass_9_y_fewbits_optim(ddst, w, h, ddst2); 00316 CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n"); 00317 } 00318 LINFO("GPU int done! %fms", tim.getSecs() * 1000.0F); 00319 00320 LINFO("Reading back GPU int results... siz %d",siz); 00321 Image<int> ires(w, h, ZEROS); 00322 00323 CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz, cudaMemcpyDeviceToHost) ); 00324 CUDA_SAFE_CALL( cudaFree(ddst2) ); 00325 CUDA_SAFE_CALL( cudaFree(ddst) ); 00326 CUDA_SAFE_CALL( cudaFree(dsrc) ); 00327 CUDA_SAFE_CALL( cudaThreadSynchronize() ); 00328 00329 Image<byte> bres = ires; // will convert and clamp as necessary 00330 Raster::WriteGray(bres, gpu_int_file); 00331 00332 // compare with CPU: 00333 Image<int> ires2(w, h, NO_INIT), ires3(w, h, NO_INIT); 00334 LINFO("CPU go!"); tim.reset(); 00335 for (uint ii = 0; ii < NREP; ++ii) 00336 { 00337 env_c_lowpass_9_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h, 00338 (intg32*)ires2.getArrayPtr()); 00339 env_c_lowpass_9_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w, h, 00340 (intg32*)ires3.getArrayPtr()); 00341 } 00342 LINFO("CPU done! 
%fms", tim.getSecs() * 1000.0F); 00343 00344 Raster::WriteGray(Image<byte>(ires3), cpu_file); 00345 00346 imgcompare((Image<float>(ires3)).getArrayPtr(), (Image<float>(ires)).getArrayPtr(), w, h); 00347 00348 // // ###################################################################### 00349 // LINFO("Moving on to pyramid test..."); 00350 00351 // struct env_params envp; 00352 // env_params_set_defaults(&envp); 00353 00354 // envp.maxnorm_type = ENV_VCXNORM_MAXNORM; 00355 // envp.scale_bits = 16; 00356 00357 // env_assert_set_handler(&env_stdio_assert_handler); 00358 // env_allocation_init(&malloc_thunk, &free); 00359 00360 // env_params_validate(&envp); 00361 // struct env_math imath; 00362 // env_init_integer_math(&imath, &envp); 00363 00364 // struct env_image einput; const struct env_dims di = { iimg.getWidth(), iimg.getHeight() }; 00365 // env_img_init(&einput, di); 00366 00367 // const int d = 8, firstlevel = 0; 00368 // memcpy((void*)env_img_pixelsw(&einput), iimg.getArrayPtr(), iimg.getSize() * sizeof(int)); 00369 // struct env_pyr gpyr; env_pyr_init(&gpyr, d); 00370 00371 // // if firstlevel is zero, copy source image into level 0 of the pyramid: 00372 // if (firstlevel == 0) env_img_copy_src_dst(&einput, env_pyr_imgw(&gpyr, 0)); 00373 00374 // // allocate device memory: 00375 // const env_size_t depth = env_pyr_depth(&gpyr); 00376 // const env_size_t wh = w * h; 00377 // const env_size_t rsiz = siz / 2; // siz/3 would be enough except for a bunch of small 1x1 levels 00378 // int *dres, *dtmp; 00379 // CUDA_SAFE_CALL(cudaMalloc((void **)(void *)&dsrc, siz + siz/2 + rsiz)); // (void*) to avoid compiler warn 00380 // dtmp = dsrc + wh; dres = dtmp + wh/2; 00381 00382 // // copy source image to device memory: 00383 // CUDA_SAFE_CALL(cudaThreadSynchronize()); 00384 // CUDA_SAFE_CALL(cudaMemcpy(dsrc, env_img_pixels(&einput), siz, cudaMemcpyHostToDevice)); 00385 00386 // // run the pyramid in DEVICE memory: 00387 // env_size_t outw[depth], outh[depth]; 00388 // LINFO("GPU pyramid go!"); tim.reset(); 00389 // for (uint ii = 0; ii < NREP; ++ii) 00390 // cudacore_pyr_build_lowpass_5(dsrc, dtmp, dres, depth, w, h, outw, outh); 00391 // LINFO("GPU pyramid done! %fms", tim.getSecs() * 1000.0F); 00392 00393 // // collect the results, starting at firstlevel and ignoring previous 00394 // // levels; level 0 (if desired) has been handled already on the CPU 00395 // // (simple memcpy) and is not in dres, which starts at level 1: 00396 // int *dresptr = dres; 00397 // for (env_size_t lev = 1; lev < depth; ++lev) { 00398 // // get a pointer to image at that level: 00399 // struct env_image *res = env_pyr_imgw(&gpyr, lev); 00400 // const env_size_t ww = outw[lev], hh = outh[lev]; 00401 00402 // if (lev < firstlevel) 00403 // env_img_make_empty(res); // kill that level 00404 // else { 00405 // const struct env_dims di = { ww, hh }; 00406 // env_img_resize_dims(res, di); 00407 // CUDA_SAFE_CALL(cudaMemcpy((void *)env_img_pixels(res), dresptr, 00408 // ww * hh * sizeof(int), cudaMemcpyDeviceToHost)); 00409 // } 00410 00411 // // ready for next level: 00412 // dresptr += ww * hh; 00413 // } 00414 00415 // // free allocated memory: 00416 // CUDA_SAFE_CALL( cudaFree(dsrc) ); 00417 00418 // // compute on CPU: 00419 // struct env_pyr cpyr; env_pyr_init(&cpyr, d); 00420 // LINFO("CPU pyramid go!"); tim.reset(); 00421 // for (uint ii = 0; ii < NREP; ++ii) 00422 // env_pyr_build_lowpass_5_cpu(&einput, firstlevel, &imath, &cpyr); 00423 // LINFO("CPU pyramid done! 
%fms", tim.getSecs() * 1000.0F); 00424 00425 // // compare results: 00426 // outw[0] = w; outh[0] = h; 00427 // for (uint ii = firstlevel; ii < depth; ++ii) 00428 // imgcompare((const int*)env_img_pixels(env_pyr_img(&cpyr, ii)), 00429 // (const int*)env_img_pixels(env_pyr_img(&gpyr, ii)), outw[ii], outh[ii]); 00430 } 00431 00432 00433 00434 00435 00436 00437 00438 00439 // ###################################################################### 00440 /* So things look consistent in everyone's emacs... */ 00441 /* Local Variables: */ 00442 /* mode: c++ */ 00443 /* indent-tabs-mode: nil */ 00444 /* End: */