00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include "Envision/env_c_math_ops.h"
00036 #include "Envision/env_image_ops.h"
00037 #include "Envision/env_pyr.h"
00038 #include "CUDA/cuda-lowpass.h"
00039 #include "CUDA/cutil.h"
00040 #include "CUDA/env_cuda.h"
00041 #include "Image/Image.H"
00042 #include "Image/MathOps.H"
00043 #include "Raster/Raster.H"
00044 #include "Util/log.H"
00045 #include "Util/Timer.H"
00046 #include <cuda_runtime_api.h>
00047 #include <stdio.h>
00048
00049 #include "Envision/env_alloc.h"
00050 #include "Envision/env_c_math_ops.h"
00051 #include "Envision/env_image.h"
00052 #include "Envision/env_image_ops.h"
00053 #include "Envision/env_log.h"
00054 #include "Envision/env_mt_visual_cortex.h"
00055 #include "Envision/env_params.h"
00056 #include "Envision/env_stdio_interface.h"
00057
00058
00059
00060
00061
00062 inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }
00063
00064
00065 inline int iDivDown(int a, int b) { return a / b; }
00066
00067
00068 inline int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; }
00069
00070
00071 inline int iAlignDown(int a, int b) { return a - a % b; }
00072
00073 #define NREP 20
00074
00075
00076
00077 static void* malloc_thunk(env_size_t n)
00078 {
00079 return malloc(n);
00080 }
00081
00082
00083 void compareregions(Image<int> &c, Image<int> &g, const uint rowStart, const uint rowStop, const uint colStart, const uint colStop)
00084 {
00085 uint w,h;
00086 w = c.getWidth();
00087 h = c.getHeight();
00088 if(w != (uint) g.getWidth() || h != (uint) g.getHeight())
00089 {
00090 LINFO("Images are not the same size");
00091 return;
00092 }
00093 if(rowStart > rowStop || colStart > colStop || rowStop > h || colStop > w)
00094 {
00095 LINFO("Invalid regions to compare");
00096 return;
00097 }
00098 for(uint i=colStart;i<colStop;i++)
00099 {
00100 printf("\nC[%d]: ",i);
00101 for(uint j=rowStart;j<rowStop;j++)
00102 {
00103 printf("%d ",c.getVal(i,j));
00104 }
00105 printf("\nG[%d]: ",i);
00106 for(uint j=rowStart;j<rowStop;j++)
00107 {
00108 printf("%d ",g.getVal(i,j));
00109 }
00110 }
00111 printf("\n");
00112
00113 }
00114
00115
00116 void imgcompare(const int* cpu, const int* gpu, const uint w, const uint h)
00117 {
00118 Image<int> c(cpu, w, h), g(gpu, w, h);
00119 Image<float> diff = g - c;
00120 float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
00121 LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
00122 mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
00123
00124 }
00125
00126
00127 void imgcompare(const float* cpu, const float* gpu, const uint w, const uint h)
00128 {
00129 Image<float> c(cpu, w, h), g(gpu, w, h);
00130 Image<float> diff = g - c;
00131 float mi, ma, av; getMinMaxAvg(diff, mi, ma, av);
00132 LINFO("%s: %ux%u image, GPU - CPU: avg=%f, diff = [%f .. %f]",
00133 mi == ma && ma == 0.0F ? "PASS" : "FAIL", w, h, av, mi, ma);
00134
00135 }
00136
00137 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file);
00138 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file);
00139
00140
00141
00142
00143 int main(int argc, char **argv)
00144 {
00145 if (argc != 4) LFATAL("USAGE: %s <input.pgm> <outCPU.pgm> <outGPU.pgm>", argv[0]);
00146
00147 CUT_DEVICE_INIT(0);
00148 LINFO("Reading: %s", argv[1]);
00149 Image<byte> img = Raster::ReadGray(argv[1]);
00150 Image<int> iimg = img;
00151 test_lowpass5(iimg,argv[2],argv[3]);
00152
00153
00154 }
00155
00156 void test_lowpass5(Image<int> &iimg, char *cpu_file, char *gpu_int_file)
00157 {
00158 int *dsrc, *ddst, *ddst2;
00159 const uint w = iimg.getWidth(), h = iimg.getHeight();
00160 const uint siz = w * h * sizeof(int);
00161 LINFO("Processing %ux%u image on lowpass5...", w, h);
00162 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) );
00163 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz/2) );
00164 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz/4) );
00165 CUDA_SAFE_CALL( cudaThreadSynchronize() );
00166 CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) );
00167 Timer tim;
00168
00169 LINFO("GPU go!"); tim.reset();
00170 for (uint ii = 0; ii < NREP; ++ii)
00171 {
00172 cuda_lowpass_5_x_dec_x_fewbits_optim(dsrc, w, h, ddst);
00173 CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n");
00174 cuda_lowpass_5_y_dec_y_fewbits_optim(ddst, w/2, h, ddst2);
00175 CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n");
00176 }
00177 LINFO("GPU done! %fms", tim.getSecs() * 1000.0F);
00178
00179 LINFO("Reading back GPU results... siz/4 %d",siz/4);
00180 Image<int> ires(iDivUp(w,2), iDivUp(h,2), ZEROS);
00181
00182 CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz/4, cudaMemcpyDeviceToHost) );
00183 CUDA_SAFE_CALL( cudaThreadSynchronize() );
00184
00185 Image<byte> bres = ires;
00186
00187 Raster::WriteGray(bres, gpu_int_file);
00188
00189
00190 Image<int> ires2(iDivUp(w,2), h, NO_INIT), ires3(iDivUp(w,2), iDivUp(h,2), NO_INIT);
00191 LINFO("CPU go!"); tim.reset();
00192 for (uint ii = 0; ii < NREP; ++ii)
00193 {
00194 env_c_lowpass_5_x_dec_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h,
00195 (intg32*)ires2.getArrayPtr(), w/2);
00196 env_c_lowpass_5_y_dec_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w/2, h,
00197 (intg32*)ires3.getArrayPtr(), h/2);
00198 }
00199 LINFO("CPU done! %fms", tim.getSecs() * 1000.0F);
00200
00201 Raster::WriteGray(Image<byte>(ires3), cpu_file);
00202
00203 imgcompare(ires3.getArrayPtr(), ires.getArrayPtr(), w/2, h/2);
00204
00205 CUDA_SAFE_CALL( cudaFree(ddst2) );
00206 CUDA_SAFE_CALL( cudaFree(ddst) );
00207 CUDA_SAFE_CALL( cudaFree(dsrc) );
00208
00209
00210 LINFO("Moving on to pyramid test...");
00211
00212 struct env_params envp;
00213 env_params_set_defaults(&envp);
00214
00215 envp.maxnorm_type = ENV_VCXNORM_MAXNORM;
00216 envp.scale_bits = 16;
00217
00218 env_assert_set_handler(&env_stdio_assert_handler);
00219 env_allocation_init(&malloc_thunk, &free);
00220
00221 env_params_validate(&envp);
00222 struct env_math imath;
00223 env_init_integer_math(&imath, &envp);
00224
00225 struct env_image einput; const struct env_dims di = { iimg.getWidth(), iimg.getHeight() };
00226 env_img_init(&einput, di);
00227
00228 const int d = 8, firstlevel = 0;
00229 memcpy((void*)env_img_pixelsw(&einput), iimg.getArrayPtr(), iimg.getSize() * sizeof(int));
00230 struct env_pyr gpyr; env_pyr_init(&gpyr, d);
00231
00232
00233 if (firstlevel == 0) env_img_copy_src_dst(&einput, env_pyr_imgw(&gpyr, 0));
00234
00235
00236 const env_size_t depth = env_pyr_depth(&gpyr);
00237 const env_size_t wh = w * h;
00238 const env_size_t rsiz = siz / 2;
00239 int *dres, *dtmp;
00240 CUDA_SAFE_CALL(cudaMalloc((void **)(void *)&dsrc, siz + siz/2 + rsiz));
00241 dtmp = dsrc + wh; dres = dtmp + wh/2;
00242
00243
00244 CUDA_SAFE_CALL(cudaThreadSynchronize());
00245 CUDA_SAFE_CALL(cudaMemcpy(dsrc, env_img_pixels(&einput), siz, cudaMemcpyHostToDevice));
00246
00247
00248 env_size_t outw[depth], outh[depth];
00249 LINFO("GPU pyramid go!"); tim.reset();
00250 for (uint ii = 0; ii < NREP; ++ii)
00251 cudacore_pyr_build_lowpass_5(dsrc, dtmp, dres, depth, w, h, outw, outh);
00252 LINFO("GPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00253
00254
00255
00256
00257 int *dresptr = dres;
00258 for (env_size_t lev = 1; lev < depth; ++lev) {
00259
00260 struct env_image *res = env_pyr_imgw(&gpyr, lev);
00261 const env_size_t ww = outw[lev], hh = outh[lev];
00262
00263 if (lev < firstlevel)
00264 env_img_make_empty(res);
00265 else {
00266 const struct env_dims di = { ww, hh };
00267 env_img_resize_dims(res, di);
00268 CUDA_SAFE_CALL(cudaMemcpy((void *)env_img_pixels(res), dresptr,
00269 ww * hh * sizeof(int), cudaMemcpyDeviceToHost));
00270 }
00271
00272
00273 dresptr += ww * hh;
00274 }
00275
00276
00277 CUDA_SAFE_CALL( cudaFree(dsrc) );
00278
00279
00280 struct env_pyr cpyr; env_pyr_init(&cpyr, d);
00281 LINFO("CPU pyramid go!"); tim.reset();
00282 for (uint ii = 0; ii < NREP; ++ii)
00283 env_pyr_build_lowpass_5_cpu(&einput, firstlevel, &imath, &cpyr);
00284 LINFO("CPU pyramid done! %fms", tim.getSecs() * 1000.0F);
00285
00286
00287 outw[0] = w; outh[0] = h;
00288 for (uint ii = firstlevel; ii < depth; ++ii)
00289 imgcompare((const int*)env_img_pixels(env_pyr_img(&cpyr, ii)),
00290 (const int*)env_img_pixels(env_pyr_img(&gpyr, ii)), outw[ii], outh[ii]);
00291 }
00292
00293
00294
00295
00296 void test_lowpass9(Image<int> &iimg, char *cpu_file, char *gpu_int_file)
00297 {
00298 int *dsrc, *ddst, *ddst2;
00299 const uint w = iimg.getWidth(), h = iimg.getHeight();
00300 const uint siz = w * h * sizeof(int);
00301
00302 LINFO("Processing %ux%u image on lowpass9...", w, h);
00303 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&dsrc, siz) );
00304 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst, siz) );
00305 CUDA_SAFE_CALL( cudaMalloc( (void **)(void *)&ddst2, siz) );
00306 CUDA_SAFE_CALL( cudaThreadSynchronize() );
00307 CUDA_SAFE_CALL( cudaMemcpy(dsrc, iimg.getArrayPtr(), siz, cudaMemcpyHostToDevice) );
00308 Timer tim;
00309
00310 LINFO("GPU int go!"); tim.reset();
00311 for (uint ii = 0; ii < NREP; ++ii)
00312 {
00313 cuda_lowpass_9_x_fewbits_optim(dsrc, w, h, ddst);
00314 CUT_CHECK_ERROR("convolutionRowGPU() execution failed\n");
00315 cuda_lowpass_9_y_fewbits_optim(ddst, w, h, ddst2);
00316 CUT_CHECK_ERROR("convolutionColumnGPU() execution failed\n");
00317 }
00318 LINFO("GPU int done! %fms", tim.getSecs() * 1000.0F);
00319
00320 LINFO("Reading back GPU int results... siz %d",siz);
00321 Image<int> ires(w, h, ZEROS);
00322
00323 CUDA_SAFE_CALL( cudaMemcpy(ires.getArrayPtr(), ddst2, siz, cudaMemcpyDeviceToHost) );
00324 CUDA_SAFE_CALL( cudaFree(ddst2) );
00325 CUDA_SAFE_CALL( cudaFree(ddst) );
00326 CUDA_SAFE_CALL( cudaFree(dsrc) );
00327 CUDA_SAFE_CALL( cudaThreadSynchronize() );
00328
00329 Image<byte> bres = ires;
00330 Raster::WriteGray(bres, gpu_int_file);
00331
00332
00333 Image<int> ires2(w, h, NO_INIT), ires3(w, h, NO_INIT);
00334 LINFO("CPU go!"); tim.reset();
00335 for (uint ii = 0; ii < NREP; ++ii)
00336 {
00337 env_c_lowpass_9_x_fewbits_optim((const intg32*)iimg.getArrayPtr(), w, h,
00338 (intg32*)ires2.getArrayPtr());
00339 env_c_lowpass_9_y_fewbits_optim((const intg32*)ires2.getArrayPtr(), w, h,
00340 (intg32*)ires3.getArrayPtr());
00341 }
00342 LINFO("CPU done! %fms", tim.getSecs() * 1000.0F);
00343
00344 Raster::WriteGray(Image<byte>(ires3), cpu_file);
00345
00346 imgcompare((Image<float>(ires3)).getArrayPtr(), (Image<float>(ires)).getArrayPtr(), w, h);
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430 }
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444