test-cuda-lowpass.C

Go to the documentation of this file.
00001 /*!@file CUDA/test-cuda-lowpass.C test CUDA/GPU optimized lowpass filtering routines */
00002 
00003 
00004 // //////////////////////////////////////////////////////////////////// //
00005 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00006 // by the University of Southern California (USC) and the iLab at USC.  //
00007 // See http://iLab.usc.edu for information about this project.          //
00008 // //////////////////////////////////////////////////////////////////// //
00009 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00010 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00011 // in Visual Environments, and Applications'' by Christof Koch and      //
00012 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00013 // pending; application number 09/912,225 filed July 23, 2001; see      //
00014 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00015 // //////////////////////////////////////////////////////////////////// //
00016 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00017 //                                                                      //
00018 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00019 // redistribute it and/or modify it under the terms of the GNU General  //
00020 // Public License as published by the Free Software Foundation; either  //
00021 // version 2 of the License, or (at your option) any later version.     //
00022 //                                                                      //
00023 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00024 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00025 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00026 // PURPOSE.  See the GNU General Public License for more details.       //
00027 //                                                                      //
00028 // You should have received a copy of the GNU General Public License    //
00029 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00030 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00031 // Boston, MA 02111-1307 USA.                                           //
00032 // //////////////////////////////////////////////////////////////////// //
00033 
00034 
00035 #include "Envision/env_c_math_ops.h" // reference implementation
00036 #include "Envision/env_image_ops.h" // reference implementation
00037 #include "Envision/env_pyr.h"
00038 #include "CUDA/cuda-lowpass.h"
00039 #include "CUDA/cutil.h"
00040 #include "CUDA/env_cuda.h"
00041 #include "Image/Image.H"
00042 #include "Image/MathOps.H"
00043 #include "Raster/Raster.H"
00044 #include "Util/log.H"
00045 #include "Util/Timer.H"
00046 #include <cuda_runtime_api.h>
00047 #include <stdio.h>
00048 
00049 #include "Envision/env_alloc.h"
00050 #include "Envision/env_c_math_ops.h"
00051 #include "Envision/env_image.h"
00052 #include "Envision/env_image_ops.h"
00053 #include "Envision/env_log.h"
00054 #include "Envision/env_mt_visual_cortex.h"
00055 #include "Envision/env_params.h"
00056 #include "Envision/env_stdio_interface.h"
00057 
00058 ////////////////////////////////////////////////////////////////////////////////
00059 // Common host and device functions
00060 ////////////////////////////////////////////////////////////////////////////////
00061 //Round a / b to nearest higher integer value
00062 inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
00063 
00064 //Round a / b to nearest lower integer value
00065 inline int iDivDown(int a, int b) { return a / b; }
00066 
00067 //Align a to nearest higher multiple of b
00068 inline int iAlignUp(int a, int b) { return (a % b != 0) ?  (a - a % b + b) : a; }
00069 
00070 //Align a to nearest lower multiple of b
00071 inline int iAlignDown(int a, int b) { return a - a % b; }
00072 
00073 #define NREP 20
00074 
00075 // ######################################################################
00076 // Thunk to convert from env_size_t to size_t
00077 static void* malloc_thunk(env_size_t n)
00078 {
00079         return malloc(n);
00080 }
00081 
00082 // ######################################################################
00083 void compareregions(Image<int> &c, Image<int> &g, const uint rowStart, const uint rowStop, const uint colStart, const uint colStop)
00084 {
00085   uint w,h;
00086   w = c.getWidth();
00087   h = c.getHeight();
00088   if(w != (uint) g.getWidth() || h != (uint) g.getHeight())
00089   {
00090     LINFO("Images are not the same size");
00091     return;
00092   }
00093   if(rowStart > rowStop || colStart > colStop || rowStop > h || colStop > w)
00094   {
00095     LINFO("Invalid regions to compare");
00096     return;
00097   }
00098   for(uint i=colStart;i<colStop;i++)
00099   {
00100     printf("\nC[%d]: ",i);
00101     for(uint j=rowStart;j<rowStop;j++)
00102     {
00103       printf("%d ",c.getVal(i,j));
00104     }
00105     printf("\nG[%d]: ",i);
00106     for(uint j=rowStart;j<rowStop;j++)
00107     {
00108       printf("%d ",g.getVal(i,j));
00109     }
00110   }
00111   printf("\n");
00112 
00113 }
00114 
00115 // ######################################################################
00116 void imgcompare(const int* cpu, const int* gpu, const uint w, const uint h)
00117 {
00118   Image<int> c(cpu, w, h), g(gpu, w, h);
00119   Image<float> diff = g - c;
00120   float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
00121   LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
00122         mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
00123   //compareregions(c,g,0,30,575,600);
00124 }
00125 
00126 // ######################################################################
00127 void imgcompare(const float* cpu, const float* gpu, const uint w, const uint h)
00128 {
00129   Image<float> c(cpu, w, h), g(gpu, w, h);
00130   Image<float> diff = g - c;
00131   float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
00132   LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
00133         mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
00134   //compareregions(c,g,0,30,575,600);
00135 }
00136 
00137 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file);
00138 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file);
00139 
00140 ////////////////////////////////////////////////////////////////////////////////
00141 // Main program
00142 ////////////////////////////////////////////////////////////////////////////////
00143 int main(int argc, char **argv)
00144 {
00145   if (argc != 4) LFATAL("USAGE: %s <input.pgm> <outCPU.pgm> <outGPU.pgm>", argv[0]);
00146 
00147   CUT_DEVICE_INIT(0);
00148   LINFO("Reading: %s", argv[1]);
00149   Image<byte> img = Raster::ReadGray(argv[1]);
00150   Image<int> iimg = img; // convert to ints
00151   test_lowpass5(iimg,argv[2],argv[3]);
00152   //test_lowpass9(iimg,argv[2],argv[3]);
00153 
00154 }
00155 
00156 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file)
00157 {
00158   int *dsrc, *ddst, *ddst2;
00159   const uint w = iimg.getWidth(), h = iimg.getHeight();
00160   const uint siz = w * h * sizeof(int);
00161   LINFO("Processing %ux%u image on lowpass5...", w, h);
00162   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) ); // note: (void*) to avoid compiler warn
00163   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz/2) );
00164   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz/4) );
00165   CUDA_SAFE_CALL( cudaThreadSynchronize() );
00166   CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) );
00167   Timer tim;
00168 
00169   LINFO("GPU go!"); tim.reset();
00170   for (uint ii = 0; ii < NREP; ++ii)
00171     {
00172       cuda_lowpass_5_x_dec_x_fewbits_optim(dsrc, w, h, ddst);
00173       CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n");
00174       cuda_lowpass_5_y_dec_y_fewbits_optim(ddst, w/2, h, ddst2);
00175       CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n");
00176     }
00177   LINFO("GPU done! %fms", tim.getSecs() * 1000.0F);
00178 
00179   LINFO("Reading back GPU results... siz/4 %d",siz/4);
00180   Image<int> ires(iDivUp(w,2), iDivUp(h,2), ZEROS);
00181 
00182   CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz/4, cudaMemcpyDeviceToHost) );
00183   CUDA_SAFE_CALL( cudaThreadSynchronize() );
00184 
00185   Image<byte> bres = ires; // will convert and clamp as necessary
00186 
00187   Raster::WriteGray(bres, gpu_int_file);
00188 
00189   // compare with CPU:
00190   Image<int> ires2(iDivUp(w,2), h, NO_INIT), ires3(iDivUp(w,2), iDivUp(h,2), NO_INIT);
00191   LINFO("CPU go!"); tim.reset();
00192   for (uint ii = 0; ii < NREP; ++ii)
00193     {
00194       env_c_lowpass_5_x_dec_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h,
00195                                             (intg32*)ires2.getArrayPtr(), w/2);
00196       env_c_lowpass_5_y_dec_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w/2, h,
00197                                             (intg32*)ires3.getArrayPtr(), h/2);
00198     }
00199   LINFO("CPU done! %fms", tim.getSecs() * 1000.0F);
00200 
00201   Raster::WriteGray(Image<byte>(ires3), cpu_file);
00202 
00203   imgcompare(ires3.getArrayPtr(), ires.getArrayPtr(), w/2, h/2);
00204 
00205   CUDA_SAFE_CALL( cudaFree(ddst2) );
00206   CUDA_SAFE_CALL( cudaFree(ddst) );
00207   CUDA_SAFE_CALL( cudaFree(dsrc) );
00208 
00209   // ######################################################################
00210   LINFO("Moving on to pyramid test...");
00211 
00212   struct env_params envp;
00213   env_params_set_defaults(&envp);
00214 
00215   envp.maxnorm_type = ENV_VCXNORM_MAXNORM;
00216   envp.scale_bits = 16;
00217 
00218   env_assert_set_handler(&env_stdio_assert_handler);
00219   env_allocation_init(&malloc_thunk, &free);
00220 
00221   env_params_validate(&envp);
00222   struct env_math imath;
00223   env_init_integer_math(&imath, &envp);
00224 
00225   struct env_image einput; const struct env_dims di = { iimg.getWidth(), iimg.getHeight() };
00226   env_img_init(&einput, di);
00227 
00228   const int d = 8, firstlevel = 0;
00229   memcpy((void*)env_img_pixelsw(&einput), iimg.getArrayPtr(), iimg.getSize() * sizeof(int));
00230   struct env_pyr gpyr; env_pyr_init(&gpyr, d);
00231 
00232   // if firstlevel is zero, copy source image into level 0 of the pyramid:
00233   if (firstlevel == 0) env_img_copy_src_dst(&einput, env_pyr_imgw(&gpyr, 0));
00234 
00235   // allocate device memory:
00236   const env_size_t depth = env_pyr_depth(&gpyr);
00237   const env_size_t wh = w * h;
00238   const env_size_t rsiz = siz / 2; // siz/3 would be enough except for a bunch of small 1x1 levels
00239   int *dres, *dtmp;
00240   CUDA_SAFE_CALL(cudaMalloc((void **)(void *)&dsrc, siz + siz/2 + rsiz)); // (void*) to avoid compiler warn
00241   dtmp = dsrc + wh; dres = dtmp + wh/2;
00242 
00243   // copy source image to device memory:
00244   CUDA_SAFE_CALL(cudaThreadSynchronize());
00245   CUDA_SAFE_CALL(cudaMemcpy(dsrc, env_img_pixels(&einput), siz, cudaMemcpyHostToDevice));
00246 
00247   // run the pyramid in DEVICE memory:
00248   env_size_t outw[depth], outh[depth];
00249   LINFO("GPU pyramid go!"); tim.reset();
00250   for (uint ii = 0; ii < NREP; ++ii)
00251     cudacore_pyr_build_lowpass_5(dsrc, dtmp, dres, depth, w, h, outw, outh);
00252   LINFO("GPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00253 
00254   // collect the results, starting at firstlevel and ignoring previous
00255   // levels; level 0 (if desired) has been handled already on the CPU
00256   // (simple memcpy) and is not in dres, which starts at level 1:
00257   int *dresptr = dres;
00258   for (env_size_t lev = 1; lev < depth; ++lev) {
00259     // get a pointer to image at that level:
00260     struct env_image *res = env_pyr_imgw(&gpyr, lev);
00261     const env_size_t ww = outw[lev], hh = outh[lev];
00262 
00263     if (lev < firstlevel)
00264       env_img_make_empty(res); // kill that level
00265     else {
00266       const struct env_dims di = { ww, hh };
00267       env_img_resize_dims(res, di);
00268       CUDA_SAFE_CALL(cudaMemcpy((void *)env_img_pixels(res), dresptr,
00269                                 ww * hh * sizeof(int), cudaMemcpyDeviceToHost));
00270     }
00271 
00272     // ready for next level:
00273     dresptr += ww * hh;
00274   }
00275 
00276   // free allocated memory:
00277   CUDA_SAFE_CALL( cudaFree(dsrc) );
00278 
00279   // compute on CPU:
00280   struct env_pyr cpyr; env_pyr_init(&cpyr, d);
00281   LINFO("CPU pyramid go!"); tim.reset();
00282   for (uint ii = 0; ii < NREP; ++ii)
00283     env_pyr_build_lowpass_5_cpu(&einput, firstlevel, &imath, &cpyr);
00284   LINFO("CPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00285 
00286   // compare results:
00287   outw[0] = w; outh[0] = h;
00288   for (uint ii = firstlevel; ii < depth; ++ii)
00289     imgcompare((const int*)env_img_pixels(env_pyr_img(&cpyr, ii)),
00290                (const int*)env_img_pixels(env_pyr_img(&gpyr, ii)), outw[ii], outh[ii]);
00291 }
00292 
00293 
00294 
00295 
00296 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file)
00297 {
00298   int *dsrc, *ddst, *ddst2;
00299   const uint w = iimg.getWidth(), h = iimg.getHeight();
00300   const uint siz = w * h * sizeof(int);
00301 
00302   LINFO("Processing %ux%u image on lowpass9...", w, h);
00303   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) ); // note: (void*) to avoid compiler warn
00304   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz) );
00305   CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz) );
00306   CUDA_SAFE_CALL( cudaThreadSynchronize() );
00307   CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) );
00308   Timer tim;
00309 
00310   LINFO("GPU int go!"); tim.reset();
00311   for (uint ii = 0; ii < NREP; ++ii)
00312     {
00313       cuda_lowpass_9_x_fewbits_optim(dsrc, w, h, ddst);
00314       CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n");
00315       cuda_lowpass_9_y_fewbits_optim(ddst, w, h, ddst2);
00316       CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n");
00317     }
00318   LINFO("GPU int done! %fms", tim.getSecs() * 1000.0F);
00319 
00320   LINFO("Reading back GPU int results... siz %d",siz);
00321   Image<int> ires(w, h, ZEROS);
00322 
00323   CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz, cudaMemcpyDeviceToHost) );
00324   CUDA_SAFE_CALL( cudaFree(ddst2) );
00325   CUDA_SAFE_CALL( cudaFree(ddst) );
00326   CUDA_SAFE_CALL( cudaFree(dsrc) );
00327   CUDA_SAFE_CALL( cudaThreadSynchronize() );
00328 
00329   Image<byte> bres = ires; // will convert and clamp as necessary
00330   Raster::WriteGray(bres, gpu_int_file);
00331 
00332   // compare with CPU:
00333   Image<int> ires2(w, h, NO_INIT), ires3(w, h, NO_INIT);
00334   LINFO("CPU go!"); tim.reset();
00335   for (uint ii = 0; ii < NREP; ++ii)
00336     {
00337       env_c_lowpass_9_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h,
00338                                             (intg32*)ires2.getArrayPtr());
00339       env_c_lowpass_9_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w, h,
00340                                             (intg32*)ires3.getArrayPtr());
00341     }
00342   LINFO("CPU done! %fms", tim.getSecs() * 1000.0F);
00343 
00344   Raster::WriteGray(Image<byte>(ires3), cpu_file);
00345 
00346   imgcompare((Image<float>(ires3)).getArrayPtr(), (Image<float>(ires)).getArrayPtr(), w, h);
00347 
00348 //   // ######################################################################
00349 //   LINFO("Moving on to pyramid test...");
00350 
00351 //   struct env_params envp;
00352 //   env_params_set_defaults(&envp);
00353 
00354 //   envp.maxnorm_type = ENV_VCXNORM_MAXNORM;
00355 //   envp.scale_bits = 16;
00356 
00357 //   env_assert_set_handler(&env_stdio_assert_handler);
00358 //   env_allocation_init(&malloc_thunk, &free);
00359 
00360 //   env_params_validate(&envp);
00361 //   struct env_math imath;
00362 //   env_init_integer_math(&imath, &envp);
00363 
00364 //   struct env_image einput; const struct env_dims di = { iimg.getWidth(), iimg.getHeight() };
00365 //   env_img_init(&einput, di);
00366 
00367 //   const int d = 8, firstlevel = 0;
00368 //   memcpy((void*)env_img_pixelsw(&einput), iimg.getArrayPtr(), iimg.getSize() * sizeof(int));
00369 //   struct env_pyr gpyr; env_pyr_init(&gpyr, d);
00370 
00371 //   // if firstlevel is zero, copy source image into level 0 of the pyramid:
00372 //   if (firstlevel == 0) env_img_copy_src_dst(&einput, env_pyr_imgw(&gpyr, 0));
00373 
00374 //   // allocate device memory:
00375 //   const env_size_t depth = env_pyr_depth(&gpyr);
00376 //   const env_size_t wh = w * h;
00377 //   const env_size_t rsiz = siz / 2; // siz/3 would be enough except for a bunch of small 1x1 levels
00378 //   int *dres, *dtmp;
00379 //   CUDA_SAFE_CALL(cudaMalloc((void **)(void *)&dsrc, siz + siz/2 + rsiz)); // (void*) to avoid compiler warn
00380 //   dtmp = dsrc + wh; dres = dtmp + wh/2;
00381 
00382 //   // copy source image to device memory:
00383 //   CUDA_SAFE_CALL(cudaThreadSynchronize());
00384 //   CUDA_SAFE_CALL(cudaMemcpy(dsrc, env_img_pixels(&einput), siz, cudaMemcpyHostToDevice));
00385 
00386 //   // run the pyramid in DEVICE memory:
00387 //   env_size_t outw[depth], outh[depth];
00388 //   LINFO("GPU pyramid go!"); tim.reset();
00389 //   for (uint ii = 0; ii < NREP; ++ii)
00390 //     cudacore_pyr_build_lowpass_5(dsrc, dtmp, dres, depth, w, h, outw, outh);
00391 //   LINFO("GPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00392 
00393 //   // collect the results, starting at firstlevel and ignoring previous
00394 //   // levels; level 0 (if desired) has been handled already on the CPU
00395 //   // (simple memcpy) and is not in dres, which starts at level 1:
00396 //   int *dresptr = dres;
00397 //   for (env_size_t lev = 1; lev < depth; ++lev) {
00398 //     // get a pointer to image at that level:
00399 //     struct env_image *res = env_pyr_imgw(&gpyr, lev);
00400 //     const env_size_t ww = outw[lev], hh = outh[lev];
00401 
00402 //     if (lev < firstlevel)
00403 //       env_img_make_empty(res); // kill that level
00404 //     else {
00405 //       const struct env_dims di = { ww, hh };
00406 //       env_img_resize_dims(res, di);
00407 //       CUDA_SAFE_CALL(cudaMemcpy((void *)env_img_pixels(res), dresptr,
00408 //                                 ww * hh * sizeof(int), cudaMemcpyDeviceToHost));
00409 //     }
00410 
00411 //     // ready for next level:
00412 //     dresptr += ww * hh;
00413 //   }
00414 
00415 //   // free allocated memory:
00416 //   CUDA_SAFE_CALL( cudaFree(dsrc) );
00417 
00418 //   // compute on CPU:
00419 //   struct env_pyr cpyr; env_pyr_init(&cpyr, d);
00420 //   LINFO("CPU pyramid go!"); tim.reset();
00421 //   for (uint ii = 0; ii < NREP; ++ii)
00422 //     env_pyr_build_lowpass_5_cpu(&einput, firstlevel, &imath, &cpyr);
00423 //   LINFO("CPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00424 
00425 //   // compare results:
00426 //   outw[0] = w; outh[0] = h;
00427 //   for (uint ii = firstlevel; ii < depth; ++ii)
00428 //     imgcompare((const int*)env_img_pixels(env_pyr_img(&cpyr, ii)),
00429 //                (const int*)env_img_pixels(env_pyr_img(&gpyr, ii)), outw[ii], outh[ii]);
00430 }
00431 
00432 
00433 
00434 
00435 
00436 
00437 
00438 
00439 // ######################################################################
00440 /* So things look consistent in everyone's emacs... */
00441 /* Local Variables: */
00442 /* mode: c++ */
00443 /* indent-tabs-mode: nil */
00444 /* End: */