/*!@file CUDA/cuda-mathops.h CUDA/GPU optimized math operations code */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file:
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_mathops.h $
// $Id: cuda_mathops.h 12962 2010-03-06 02:13:53Z irock $
//

#ifndef CUDA_MATHOPS_H_DEFINED
#define CUDA_MATHOPS_H_DEFINED

#include <cuda.h>
#include "CUDA/cutil.h"
#include "cudadefs.h"

// Actual CUDA implementation, set up as a __device__ function so that it
// can be called from other CUDA functions
__device__ void cuda_device_inplaceRectify(float *ptr, const int idx)
{
  if(ptr[idx] < 0.0F)
  {
    ptr[idx] = 0.0F;
  }
}

__global__ void cuda_global_inplaceRectify(float *ptr, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceRectify(ptr,src_idx);
}
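// ------------------------------------------------------------------------
// Illustration (not part of the original file): every elementwise kernel in
// this header shares one launch convention -- one thread per element, with
// tile_len equal to the block width and enough blocks to cover all sz
// elements.  A minimal host-side sketch, assuming a hypothetical helper
// name and a block size of 256:
static void example_launch_inplaceRectify(float *dptr, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;  // ceil(sz / tile_len)
  cuda_global_inplaceRectify<<<n_blocks, tile_len>>>(dptr, tile_len, sz);
}
// ------------------------------------------------------------------------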
__device__ void cuda_device_inplaceClamp(float *ptr, const float cmin, const float cmax, const int idx)
{
  if(ptr[idx] < cmin)
  {
    ptr[idx] = cmin;
  }
  else if(ptr[idx] > cmax)
  {
    ptr[idx] = cmax;
  }
}

__global__ void cuda_global_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceClamp(ptr,cmin,cmax,src_idx);
}

__device__ void cuda_device_inplaceNormalize(float *ptr, const int idx, const float omin, const float nmin, const float nscale)
{
  ptr[idx] = nmin + (ptr[idx] - omin) * nscale;
}

// Remap values from the old range [omin,omax] to the new range [nmin,nmax].
// The old extrema are passed as device pointers (typically the outputs of
// cuda_global_getMin/cuda_global_getMax below), so no host readback is
// needed; note that this assumes omax[0] != omin[0].
__global__ void cuda_global_inplaceNormalize(float *ptr, const float* omin, const float* omax, const float nmin, const float nmax, const int tile_len, const int sz)
{
  const float scale = omax[0] - omin[0];
  const float nscale = (nmax - nmin) / scale;

  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceNormalize(ptr,src_idx,omin[0],nmin,nscale);
}

__device__ void cuda_device_abs(float *ptr, const int idx)
{
  if(ptr[idx] < 0)
    ptr[idx] *= -1;
}

__global__ void cuda_global_abs(float *src, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_abs(src,src_idx);
}

// Set every element to a constant value
__global__ void cuda_global_clear(float *src, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    src[src_idx] = val;
}

__global__ void cuda_global_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += offset[0];
}

__global__ void cuda_global_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= offset[0];
}
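// ------------------------------------------------------------------------
// Illustration (not part of the original file): the *Scalar kernels take
// their scalar operand as a device pointer, so the one-element result of a
// reduction (e.g. cuda_global_getSum below) can feed arithmetic directly,
// with no device-to-host copy in between.  A minimal sketch, with an
// assumed helper name and block size:
static void example_subtractDeviceScalar(float *dimg, const float *dscalar, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  cuda_global_inplaceSubtractScalar<<<n_blocks, tile_len>>>(dimg, dscalar, tile_len, sz);
}
// ------------------------------------------------------------------------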
__global__ void cuda_global_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= offset[0];
}

__global__ void cuda_global_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= offset[0];
}

__global__ void cuda_global_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += val;
}

__global__ void cuda_global_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= val;
}

__global__ void cuda_global_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= val;
}

__global__ void cuda_global_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= val;
}

__global__ void cuda_global_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] += im2[src_idx];
}

__global__ void cuda_global_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] -= im2[src_idx];
}

__global__ void cuda_global_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] *= im2[src_idx];
}

__global__ void cuda_global_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] /= im2[src_idx];
}
__global__ void cuda_global_addScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + offset[0];
}

__global__ void cuda_global_subtractScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - offset[0];
}

__global__ void cuda_global_multiplyScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * offset[0];
}

__global__ void cuda_global_divideScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / offset[0];
}

__global__ void cuda_global_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + val;
}

__global__ void cuda_global_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - val;
}

__global__ void cuda_global_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * val;
}

__global__ void cuda_global_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / val;
}

__global__ void cuda_global_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + im2[src_idx];
}

__global__ void cuda_global_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - im2[src_idx];
}

__global__ void cuda_global_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * im2[src_idx];
}

__global__ void cuda_global_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / im2[src_idx];
}

__global__ void cuda_global_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float f1 = im1[src_idx];
    float f2 = im2[src_idx];
    res[src_idx] = (f1 > f2) ? f1 : f2;
  }
}
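// ------------------------------------------------------------------------
// Illustration (not part of the original file): the out-of-place kernels
// above write into a separate result buffer.  A minimal sketch of taking
// the pointwise maximum of two device images, with an assumed helper name
// and block size:
static void example_takeMax(const float *dim1, const float *dim2, float *dres, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  cuda_global_takeMax<<<n_blocks, tile_len>>>(dim1, dim2, dres, tile_len, sz);
}
// ------------------------------------------------------------------------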
// Block-wide tree reduction for the minimum: each block loads its tile into
// shared memory (shr), halves the number of active elements on every pass,
// and writes its per-block result to buf[blockIdx.x].
__device__ void cuda_device_getMin(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block min
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shr[shr_idx] > shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

// shared_data is the dynamically-allocated shared memory array from the
// included headers; these reduction kernels need tile_len floats of it per block
__global__ void cuda_global_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMin(src,dest,buf,shared_data,tile_len,sz);
}

__device__ void cuda_device_getMax(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block max
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shr[shr_idx] < shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

__global__ void cuda_global_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMax(src,dest,buf,shared_data,tile_len,sz);
}

__device__ void cuda_device_getSum(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Come up with a per block sum
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Sum with neighbor
      shr[shr_idx] += shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];
    // Have the first block put the value in the final destination; this will eventually be the answer
    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

// Note: orig_sz is unused in this kernel; the signature parallels cuda_global_getAvg
__global__ void cuda_global_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);
}

__global__ void cuda_global_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);
  // If the size of the image is smaller than the tile, then we have a complete sum, which we can then get the average from
  if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && sz <= tile_len)
    dest[0] = dest[0]/orig_sz;
}
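// ------------------------------------------------------------------------
// Illustration (not part of the original file): one launch of a reduction
// kernel only produces per-block partial results in buf, so the host must
// launch again on buf until a single tile covers everything; dest[0] then
// holds the final answer.  A minimal two-pass sketch, assuming a
// hypothetical helper name, a block size of 256, n_blocks <= tile_len, and
// that the third launch parameter sizes the dynamic shared memory backing
// shared_data:
static void example_reduceMax(const float *dsrc, float *ddest, float *dbuf, const int sz)
{
  const int tile_len = 256;                             // assumed block size
  const int n_blocks = (sz + tile_len - 1) / tile_len;
  const size_t shm = tile_len * sizeof(float);          // shared_data size
  // First pass: one partial max per block, written into dbuf
  cuda_global_getMax<<<n_blocks, tile_len, shm>>>(dsrc, ddest, dbuf, tile_len, sz);
  // Second pass: a single block folds the partials; ddest[0] is the max
  if(n_blocks > 1)
    cuda_global_getMax<<<1, tile_len, shm>>>(dbuf, ddest, dbuf, tile_len, n_blocks);
}
// ------------------------------------------------------------------------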
__global__ void cuda_global_squared(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float in = im[src_idx];
    res[src_idx] = in*in;
  }
}

__global__ void cuda_global_sqrt(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    res[src_idx] = sqrtf(im[src_idx]); // single-precision sqrt
  }
}

// Magnitude of a quadrature pair: out = sqrt(real^2 + imag^2)
__global__ void cuda_global_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz)
{
  const int idx = blockIdx.x*tile_len + threadIdx.x;
  if(idx < sz)
  {
    float re = real[idx];
    float im = imag[idx];
    out[idx] = sqrtf(re*re + im*im);
  }
}

// Linearly attenuate the top and bottom borderSize rows of the image
__global__ void cuda_global_inplaceAttenuateBorders_x(float *im, int borderSize, int tile_width, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
  if(x_pos < w)
  {
    // In the top lines of the border
    if(blockIdx.y < borderSize)
    {
      const int idx = x_pos + IMUL(blockIdx.y,w);
      float coeff = increment*(blockIdx.y+1);
      im[idx] *= coeff;
    }
    // In the bottom lines of the border
    else if(blockIdx.y < IMUL(borderSize,2))
    {
      const int idx = x_pos + IMUL((h-borderSize+blockIdx.y-borderSize),w);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.y);
      im[idx] *= coeff;
    }
  }
}

// Linearly attenuate the left and right borderSize columns of the image
__global__ void cuda_global_inplaceAttenuateBorders_y(float *im, int borderSize, int tile_height, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int y_pos = IMUL(blockIdx.y,tile_height) + threadIdx.y;
  if(y_pos < h)
  {
    // In the left columns of the border
    if(blockIdx.x < borderSize)
    {
      const int idx = IMUL(y_pos,w) + blockIdx.x;
      float coeff = increment*(blockIdx.x+1);
      im[idx] *= coeff;
    }
    // In the right columns of the border
    else if(blockIdx.x < IMUL(borderSize,2))
    {
      const int idx = IMUL(y_pos,w) + (blockIdx.x-borderSize) + (w-borderSize);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.x);
      im[idx] *= coeff;
    }
  }
}
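// ------------------------------------------------------------------------
// Illustration (not part of the original file): the attenuate-borders pair
// uses a 2D grid -- grid.y (for _x) or grid.x (for _y) enumerates the
// 2*borderSize border rows or columns, while the other grid axis tiles the
// image.  A minimal sketch with an assumed helper name and tile size:
static void example_attenuateBorders(float *dimg, const int borderSize, const int w, const int h)
{
  const int tile = 256; // assumed tile width/height
  // Top and bottom border rows
  dim3 blocks_x((w + tile - 1) / tile, 2*borderSize);
  cuda_global_inplaceAttenuateBorders_x<<<blocks_x, dim3(tile,1)>>>(dimg, borderSize, tile, w, h);
  // Left and right border columns
  dim3 blocks_y(2*borderSize, (h + tile - 1) / tile);
  cuda_global_inplaceAttenuateBorders_y<<<blocks_y, dim3(1,tile)>>>(dimg, borderSize, tile, w, h);
}
// ------------------------------------------------------------------------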
// Block-wide max reduction that also tracks the winning element's location.
// Needs 2*tile_len floats of shared memory per block: values in the first
// half, locations in the second.  Note that locations are stored as floats,
// which is exact only for indices below 2^24.
__device__ void cuda_device_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];                    // size of tile_len
  float *shrLoc = (float *) &(shared_data[tile_len]); // size of tile_len

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    // On the first pass srcloc is NULL and the locations are just the raw
    // indices; later passes carry the original locations through
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  // Come up with a per block max
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shrVal[shr_idx+incr] > shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMax(src,srcloc,buf,loc,tile_len,sz);
}

// Same scheme as cuda_device_findMax, but tracking the minimum
__device__ void cuda_device_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  //ASSERT(blockDim.y == 1 && gridDim.y == 1);

  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];                    // size of tile_len
  float *shrLoc = (float *) &(shared_data[tile_len]); // size of tile_len

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  // Come up with a per block min
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      // Check neighbor
      if(shrVal[shr_idx+incr] < shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }
  // Now load the global output
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMin(src,srcloc,buf,loc,tile_len,sz);
}

#endif
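// ------------------------------------------------------------------------
// Illustration (not part of the original file): like getMin/getMax above,
// findMax/findMin only produce per-block partials, so the host re-launches
// on buf (passing the previous loc as srcloc) until a single block remains;
// buf[0]/loc[0] then hold the global extremum and its index.  A sketch in
// comments, with hypothetical buffer names, a block size of 256, and
// 2*tile_len floats of dynamic shared memory per launch:
//
//   cuda_global_findMax<<<n_blocks, 256, 2*256*sizeof(float)>>>(dsrc, NULL, dbuf, dloc, 256, sz);
//   cuda_global_findMax<<<1, 256, 2*256*sizeof(float)>>>(dbuf, dloc, dbuf, dloc, 256, n_blocks);
// ------------------------------------------------------------------------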