/*!@file CUDA/cuda-mathops.h CUDA/GPU optimized math operations code */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file:
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_mathops.h $
// $Id: cuda_mathops.h 12962 2010-03-06 02:13:53Z irock $
//

#ifndef CUDA_MATHOPS_H_DEFINED
#define CUDA_MATHOPS_H_DEFINED

#include <cuda.h>
#include "CUDA/cutil.h"
#include "cudadefs.h"

// Actual CUDA implementation, set up as a __device__ function so that it
// can be called from other CUDA functions
__device__ void cuda_device_inplaceRectify(float *ptr, const int idx)
{
  if(ptr[idx] < 0.0F)
  {
    ptr[idx] = 0.0F;
  }
}

__global__ void cuda_global_inplaceRectify(float *ptr, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceRectify(ptr,src_idx);
}
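// ------------------------------------------------------------------------
// Illustration (not part of the original file): every elementwise kernel in
// this header shares one launch convention -- one thread per element, with
// tile_len equal to the block width and enough blocks to cover all sz
// elements.  A minimal host-side sketch, assuming a hypothetical helper
// name and a block size of 256:
static void example_launch_inplaceRectify(float *dptr, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;  // ceil(sz / tile_len)
  cuda_global_inplaceRectify<<<n_blocks, tile_len>>>(dptr, tile_len, sz);
}
// ------------------------------------------------------------------------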
__device__ void cuda_device_inplaceClamp(float *ptr, const float cmin, const float cmax, const int idx)
{
  if(ptr[idx] < cmin)
  {
    ptr[idx] = cmin;
  }
  else if(ptr[idx] > cmax)
  {
    ptr[idx] = cmax;
  }
}

__global__ void cuda_global_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceClamp(ptr,cmin,cmax,src_idx);
}

__device__ void cuda_device_inplaceNormalize(float *ptr, const int idx, const float omin, const float nmin, const float nscale)
{
  ptr[idx] = nmin + (ptr[idx] - omin) * nscale;
}

// Remap values from the old range [omin,omax] to the new range [nmin,nmax].
// The old extrema are passed as device pointers (typically the outputs of
// cuda_global_getMin/cuda_global_getMax below), so no host readback is
// needed; note that this assumes omax[0] != omin[0].
__global__ void cuda_global_inplaceNormalize(float *ptr, const float* omin, const float* omax, const float nmin, const float nmax, const int tile_len, const int sz)
{
  const float scale = omax[0] - omin[0];
  const float nscale = (nmax - nmin) / scale;

  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceNormalize(ptr,src_idx,omin[0],nmin,nscale);
}

__device__ void cuda_device_abs(float *ptr, const int idx)
{
  if(ptr[idx] < 0)
    ptr[idx] *= -1;
}

__global__ void cuda_global_abs(float *src, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_abs(src,src_idx);
}

// Set every element to a constant value
__global__ void cuda_global_clear(float *src, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    src[src_idx] = val;
}

__global__ void cuda_global_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += offset[0];
}

__global__ void cuda_global_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= offset[0];
}
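// ------------------------------------------------------------------------
// Illustration (not part of the original file): the *Scalar kernels take
// their scalar operand as a device pointer, so the one-element result of a
// reduction (e.g. cuda_global_getSum below) can feed arithmetic directly,
// with no device-to-host copy in between.  A minimal sketch, with an
// assumed helper name and block size:
static void example_subtractDeviceScalar(float *dimg, const float *dscalar, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  cuda_global_inplaceSubtractScalar<<<n_blocks, tile_len>>>(dimg, dscalar, tile_len, sz);
}
// ------------------------------------------------------------------------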
__global__ void cuda_global_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= offset[0];
}

__global__ void cuda_global_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= offset[0];
}

__global__ void cuda_global_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += val;
}

__global__ void cuda_global_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= val;
}

__global__ void cuda_global_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= val;
}

__global__ void cuda_global_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= val;
}

__global__ void cuda_global_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] += im2[src_idx];
}

__global__ void cuda_global_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] -= im2[src_idx];
}

__global__ void cuda_global_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] *= im2[src_idx];
}

__global__ void cuda_global_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] /= im2[src_idx];
}
__global__ void cuda_global_addScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + offset[0];
}

__global__ void cuda_global_subtractScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - offset[0];
}

__global__ void cuda_global_multiplyScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * offset[0];
}

__global__ void cuda_global_divideScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / offset[0];
}

__global__ void cuda_global_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + val;
}

__global__ void cuda_global_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - val;
}

__global__ void cuda_global_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * val;
}

__global__ void cuda_global_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / val;
}

__global__ void cuda_global_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + im2[src_idx];
}

__global__ void cuda_global_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - im2[src_idx];
}

__global__ void cuda_global_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * im2[src_idx];
}

__global__ void cuda_global_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / im2[src_idx];
}

__global__ void cuda_global_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float f1 = im1[src_idx];
    float f2 = im2[src_idx];
    res[src_idx] = (f1 > f2) ? f1 : f2;
  }
}
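// ------------------------------------------------------------------------
// Illustration (not part of the original file): the out-of-place kernels
// above write into a separate result buffer.  A minimal sketch of taking
// the pointwise maximum of two device images, with an assumed helper name
// and block size:
static void example_takeMax(const float *dim1, const float *dim2, float *dres, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  cuda_global_takeMax<<<n_blocks, tile_len>>>(dim1, dim2, dres, tile_len, sz);
}
// ------------------------------------------------------------------------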
// Block-wide tree reduction for the minimum: each block loads its tile into
// shared memory (shr), halves the number of active elements on every pass,
// and writes its per-block result to buf[blockIdx.x].
__device__ void cuda_device_getMin(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block min
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shr[shr_idx] > shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

// shared_data is the dynamically-allocated shared memory array from the
// included headers; these reduction kernels need tile_len floats of it per block
__global__ void cuda_global_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMin(src,dest,buf,shared_data,tile_len,sz);
}

__device__ void cuda_device_getMax(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block max
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shr[shr_idx] < shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

__global__ void cuda_global_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMax(src,dest,buf,shared_data,tile_len,sz);
}

__device__ void cuda_device_getSum(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block sum
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Sum with neighbor
      shr[shr_idx] += shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

// Note: orig_sz is unused in this kernel; the signature parallels cuda_global_getAvg
__global__ void cuda_global_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);
}

__global__ void cuda_global_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);
  // If the size of the image is smaller than the tile, then we have a complete sum, which we can then get the average from
  if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && sz <= tile_len)
    dest[0] = dest[0]/orig_sz;
}
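// ------------------------------------------------------------------------
// Illustration (not part of the original file): one launch of a reduction
// kernel only produces per-block partial results in buf, so the host must
// launch again on buf until a single tile covers everything; dest[0] then
// holds the final answer.  A minimal two-pass sketch, assuming a
// hypothetical helper name, a block size of 256, n_blocks <= tile_len, and
// that the third launch parameter sizes the dynamic shared memory backing
// shared_data:
static void example_reduceMax(const float *dsrc, float *ddest, float *dbuf, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  const size_t shm = tile_len * sizeof(float);          // shared_data size
  // First pass: one partial max per block, written into dbuf
  cuda_global_getMax<<<n_blocks, tile_len, shm>>>(dsrc, ddest, dbuf, tile_len, sz);
  // Second pass: a single block folds the partials; ddest[0] is the max
  if(n_blocks > 1)
    cuda_global_getMax<<<1, tile_len, shm>>>(dbuf, ddest, dbuf, tile_len, n_blocks);
}
// ------------------------------------------------------------------------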
__global__ void cuda_global_squared(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float in = im[src_idx];
    res[src_idx] = in*in;
  }
}

__global__ void cuda_global_sqrt(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    res[src_idx] = sqrtf(im[src_idx]); // single-precision sqrt
  }
}

// Magnitude of a quadrature pair: out = sqrt(real^2 + imag^2)
__global__ void cuda_global_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz)
{
  const int idx = blockIdx.x*tile_len + threadIdx.x;
  if(idx < sz)
  {
    float re = real[idx];
    float im = imag[idx];
    out[idx] = sqrtf(re*re + im*im);
  }
}

// Linearly attenuate the top and bottom borderSize rows of the image
__global__ void cuda_global_inplaceAttenuateBorders_x(float *im, int borderSize, int tile_width, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
  if(x_pos < w)
  {
    // In the top lines of the border
    if(blockIdx.y < borderSize)
    {
      const int idx = x_pos + IMUL(blockIdx.y,w);
      float coeff = increment*(blockIdx.y+1);
      im[idx] *= coeff;
    }
    // In the bottom lines of the border
    else if(blockIdx.y < IMUL(borderSize,2))
    {
      const int idx = x_pos + IMUL((h-borderSize+blockIdx.y-borderSize),w);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.y);
      im[idx] *= coeff;
    }
  }
}

// Linearly attenuate the left and right borderSize columns of the image
__global__ void cuda_global_inplaceAttenuateBorders_y(float *im, int borderSize, int tile_height, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int y_pos = IMUL(blockIdx.y,tile_height) + threadIdx.y;
  if(y_pos < h)
  {
    // In the left columns of the border
    if(blockIdx.x < borderSize)
    {
      const int idx = IMUL(y_pos,w) + blockIdx.x;
      float coeff = increment*(blockIdx.x+1);
      im[idx] *= coeff;
    }
    // In the right columns of the border
    else if(blockIdx.x < IMUL(borderSize,2))
    {
      const int idx = IMUL(y_pos,w) + (blockIdx.x-borderSize) + (w-borderSize);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.x);
      im[idx] *= coeff;
    }
  }
}
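// ------------------------------------------------------------------------
// Illustration (not part of the original file): the attenuate-borders pair
// uses a 2D grid -- grid.y (for _x) or grid.x (for _y) enumerates the
// 2*borderSize border rows or columns, while the other grid axis tiles the
// image.  A minimal sketch with an assumed helper name and tile size:
static void example_attenuateBorders(float *dimg, const int borderSize, const int w, const int h)
{
  const int tile = 256; // assumed tile width/height
  // Top and bottom border rows
  dim3 blocks_x((w + tile - 1) / tile, 2*borderSize);
  cuda_global_inplaceAttenuateBorders_x<<<blocks_x, dim3(tile,1)>>>(dimg, borderSize, tile, w, h);
  // Left and right border columns
  dim3 blocks_y(2*borderSize, (h + tile - 1) / tile);
  cuda_global_inplaceAttenuateBorders_y<<<blocks_y, dim3(1,tile)>>>(dimg, borderSize, tile, w, h);
}
// ------------------------------------------------------------------------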
// Block-wide max reduction that also tracks the winning element's location.
// Needs 2*tile_len floats of shared memory per block: values in the first
// half, locations in the second.  Note that locations are stored as floats,
// which is exact only for indices below 2^24.
__device__ void cuda_device_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];                    // size of tile_len
  float *shrLoc = (float *) &(shared_data[tile_len]); // size of tile_len

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    // On the first pass srcloc is NULL and the locations are just the raw
    // indices; later passes carry the original locations through
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  // Come up with a per block max
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shrVal[shr_idx+incr] > shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMax(src,srcloc,buf,loc,tile_len,sz);
}

// Same scheme as cuda_device_findMax, but tracking the minimum
__device__ void cuda_device_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];                    // size of tile_len
  float *shrLoc = (float *) &(shared_data[tile_len]); // size of tile_len

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  // Come up with a per block min
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shrVal[shr_idx+incr] < shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMin(src,srcloc,buf,loc,tile_len,sz);
}

#endif
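// ------------------------------------------------------------------------
// Illustration (not part of the original file): like getMin/getMax above,
// findMax/findMin only produce per-block partials, so the host re-launches
// on buf (passing the previous loc as srcloc) until a single block remains;
// buf[0]/loc[0] then hold the global extremum and its index.  A sketch in
// comments, with hypothetical buffer names, a block size of 256, and
// 2*tile_len floats of dynamic shared memory per launch:
//
//   cuda_global_findMax<<<n_blocks, 256, 2*256*sizeof(float)>>>(dsrc, NULL, dbuf, dloc, 256, sz);
//   cuda_global_findMax<<<1, 256, 2*256*sizeof(float)>>>(dbuf, dloc, dbuf, dloc, 256, n_blocks);
// ------------------------------------------------------------------------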