#ifndef CUDA_MATHOPS_H_DEFINED
#define CUDA_MATHOPS_H_DEFINED

#include <cuda.h>
#include "CUDA/cutil.h"
#include "cudadefs.h"

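/* Conventions used throughout this file:
 *
 * Every __global__ kernel maps one thread to one array element:
 * src_idx = blockIdx.x*tile_len + threadIdx.x, where tile_len is the
 * number of threads per block and sz the total element count. The
 * "if(src_idx < sz)" guard handles the final, partially filled block.
 *
 * The reduction kernels below use "shared_data", which is assumed to be
 * the dynamically sized shared-memory array declared in cudadefs.h as
 * "extern __shared__ float shared_data[];". IMUL is likewise assumed to
 * be the 24-bit integer multiply macro from cudadefs.h.
 *
 * A plausible host-side launch, as a sketch only (iDivUp is a
 * hypothetical round-up division helper, not defined in this file):
 *
 *   int tile_len = 256;
 *   dim3 blocks(iDivUp(sz, tile_len)), threads(tile_len);
 *   cuda_global_inplaceRectify<<<blocks, threads>>>(dptr, tile_len, sz);
 */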
// Half-wave rectification: clamp negative values to zero, in place.
__device__ void cuda_device_inplaceRectify(float *ptr, const int idx)
{
  if(ptr[idx] < 0.0F)
  {
    ptr[idx] = 0.0F;
  }
}

__global__ void cuda_global_inplaceRectify(float *ptr, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceRectify(ptr,src_idx);
}

// Clamp each value to the range [cmin, cmax], in place.
__device__ void cuda_device_inplaceClamp(float *ptr, const float cmin, const float cmax, const int idx)
{
  if(ptr[idx] < cmin)
  {
    ptr[idx] = cmin;
  }
  else if(ptr[idx] > cmax)
  {
    ptr[idx] = cmax;
  }
}

__global__ void cuda_global_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceClamp(ptr,cmin,cmax,src_idx);
}

// Affine remap from the range [omin, omax] to [nmin, nmax], in place.
// omin and omax live in device memory (e.g. the outputs of the
// getMin/getMax reductions below), so no host round-trip is needed.
__device__ void cuda_device_inplaceNormalize(float *ptr, const int idx, const float omin, const float nmin, const float nscale)
{
  ptr[idx] = nmin + (ptr[idx] - omin) * nscale;
}

__global__ void cuda_global_inplaceNormalize(float *ptr, const float* omin, const float* omax, const float nmin, const float nmax, const int tile_len, const int sz)
{
  // Note: if omax[0] == omin[0] the scale is zero and nscale becomes
  // inf/NaN; callers must ensure the input is not constant.
  const float scale = omax[0] - omin[0];
  const float nscale = (nmax - nmin) / scale;

  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_inplaceNormalize(ptr,src_idx,omin[0],nmin,nscale);
}
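
/* A sketch of the intended normalization pipeline (dmin, dmax and dbuf
 * are hypothetical device buffers, and each reduction must first be
 * iterated down to a single value as described after cuda_global_getMin
 * below):
 *
 *   cuda_global_getMin<<<blocks, tile_len, tile_len*sizeof(float)>>>
 *     (dptr, dmin, dbuf, tile_len, sz);
 *   cuda_global_getMax<<<blocks, tile_len, tile_len*sizeof(float)>>>
 *     (dptr, dmax, dbuf, tile_len, sz);
 *   cuda_global_inplaceNormalize<<<blocks, tile_len>>>
 *     (dptr, dmin, dmax, 0.0F, 1.0F, tile_len, sz);
 *
 * Keeping the extrema on the device avoids a host round-trip between
 * the reductions and the normalization.
 */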

// Absolute value, in place.
__device__ void cuda_device_abs(float *ptr, const int idx)
{
  if(ptr[idx] < 0.0F)
    ptr[idx] *= -1.0F;
}

__global__ void cuda_global_abs(float *src, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    cuda_device_abs(src,src_idx);
}


// Fill with a constant value.
__global__ void cuda_global_clear(float *src, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    src[src_idx] = val;
}


// In-place arithmetic with a scalar held in device memory (offset[0]),
// e.g. the result of a reduction kernel.
__global__ void cuda_global_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += offset[0];
}

__global__ void cuda_global_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= offset[0];
}

__global__ void cuda_global_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= offset[0];
}

__global__ void cuda_global_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= offset[0];
}


// In-place arithmetic with a scalar passed by value from the host.
__global__ void cuda_global_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] += val;
}

__global__ void cuda_global_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] -= val;
}

__global__ void cuda_global_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] *= val;
}

__global__ void cuda_global_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    ptr[src_idx] /= val;
}


// In-place element-wise arithmetic between two same-sized images.
__global__ void cuda_global_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] += im2[src_idx];
}

__global__ void cuda_global_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] -= im2[src_idx];
}

__global__ void cuda_global_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] *= im2[src_idx];
}

__global__ void cuda_global_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    im1[src_idx] /= im2[src_idx];
}


// Arithmetic with a device-memory scalar (offset[0]), writing into a
// separate result image.
__global__ void cuda_global_addScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + offset[0];
}

__global__ void cuda_global_subtractScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - offset[0];
}

__global__ void cuda_global_multiplyScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * offset[0];
}

__global__ void cuda_global_divideScalar(const float *im1, const float *offset, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / offset[0];
}


// Arithmetic with a host scalar, writing into a separate result image.
__global__ void cuda_global_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + val;
}

__global__ void cuda_global_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - val;
}

__global__ void cuda_global_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * val;
}

__global__ void cuda_global_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / val;
}


// Element-wise arithmetic between two same-sized images, writing into a
// separate result image.
__global__ void cuda_global_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] + im2[src_idx];
}

__global__ void cuda_global_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] - im2[src_idx];
}

__global__ void cuda_global_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] * im2[src_idx];
}

__global__ void cuda_global_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
    res[src_idx] = im1[src_idx] / im2[src_idx];
}


// Element-wise maximum of two images.
__global__ void cuda_global_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float f1 = im1[src_idx];
    float f2 = im2[src_idx];
    res[src_idx] = (f1 > f2) ? f1 : f2;
  }
}


// Tree reduction for the minimum. Each block loads its tile into shared
// memory (tile_len floats), folds pairs over log2 passes, and writes
// its partial minimum to buf[blockIdx.x]. See the note after
// cuda_global_getMin for how the host drives the full reduction.
__device__ void cuda_device_getMin(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  // Stage this block's tile in shared memory.
  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  // Pairwise folding: on each pass, every mod-th thread takes the min
  // of its slot and the slot incr positions away. (Bounding the loop by
  // sz rather than tile_len costs a few idle passes but is harmless.)
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      if(shr[shr_idx] > shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }

  // Thread 0 publishes this block's partial result.
  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];

    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

__global__ void cuda_global_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMin(src,dest,buf,shared_data,tile_len,sz);
}
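
/* Driving a full reduction: each launch shrinks the data from sz
 * elements to one partial result per block, left in buf, and dest[0]
 * holds the final answer only once a single block covers all remaining
 * data. The host is therefore expected to iterate, e.g. (a sketch only,
 * with iDivUp again a hypothetical round-up division helper):
 *
 *   int n = sz;
 *   const float *in = src;
 *   while(n > 1)
 *   {
 *     int blocks = iDivUp(n, tile_len);
 *     cuda_global_getMin<<<blocks, tile_len, tile_len*sizeof(float)>>>
 *       (in, dest, buf, tile_len, n);
 *     in = buf;   // a real driver should ping-pong two buffers so no
 *     n = blocks; // block reads a slot another block is overwriting
 *   }
 *
 * The same pattern drives getMax, getSum, getAvg, findMax and findMin.
 */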

// Tree reduction for the maximum; structure mirrors cuda_device_getMin.
__device__ void cuda_device_getMax(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      if(shr[shr_idx] < shr[shr_idx+incr])
        shr[shr_idx] = shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }

  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];

    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

__global__ void cuda_global_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz)
{
  cuda_device_getMax(src,dest,buf,shared_data,tile_len,sz);
}

// Tree reduction for the sum; structure mirrors cuda_device_getMin.
__device__ void cuda_device_getSum(const float *src, float *dest, float *buf, float *shr, const int tile_len, const int sz)
{
  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;

  if(threadActive)
  {
    shr[shr_idx] = src[src_idx];
  }
  __syncthreads();

  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      shr[shr_idx] += shr[shr_idx+incr];
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }

  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shr[0];

    if(dst_idx == 0)
      dest[0] = shr[0];
  }
  __syncthreads();
}

// orig_sz is unused here; it keeps the signature parallel to
// cuda_global_getAvg so both can be driven by the same host code.
__global__ void cuda_global_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);
}

// Sum reduction followed by division by orig_sz, the element count of
// the original image: sz shrinks across successive reduction passes, so
// the divisor must be carried separately. The division runs only on the
// final pass, when one block (sz <= tile_len) covers the remaining data.
__global__ void cuda_global_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz, const int orig_sz)
{
  cuda_device_getSum(src,dest,buf,shared_data,tile_len,sz);

  if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && sz <= tile_len)
    dest[0] = dest[0]/orig_sz;
}

// Element-wise square.
__global__ void cuda_global_squared(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    float in = im[src_idx];
    res[src_idx] = in*in;
  }
}

// Element-wise square root (sqrtf, the single-precision variant).
__global__ void cuda_global_sqrt(const float *im, float *res, const int tile_len, const int sz)
{
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  if(src_idx < sz)
  {
    res[src_idx] = sqrtf(im[src_idx]);
  }
}


// Energy of a quadrature filter pair: sqrt(re^2 + im^2) per pixel.
__global__ void cuda_global_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz)
{
  const int idx = blockIdx.x*tile_len + threadIdx.x;
  if(idx < sz)
  {
    float re = real[idx];
    float im = imag[idx];
    out[idx] = sqrtf(re*re + im*im);
  }
}
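
/* Equivalently, out[idx] = hypotf(re, im); hypotf avoids intermediate
 * overflow in re*re + im*im at some cost in speed. The straightforward
 * form above is fine as long as filter responses stay well within
 * single-precision range.
 */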

// Linearly attenuate the top/bottom (_x variant) or left/right (_y
// variant) borders of a w-by-h image: the i-th row or column in from
// the edge (i = 1 .. borderSize) is scaled by i/(borderSize+1).
__global__ void cuda_global_inplaceAttenuateBorders_x(float *im, int borderSize, int tile_width, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int x_pos = IMUL(blockIdx.x,tile_width) + threadIdx.x;
  if(x_pos < w)
  {
    // Top border: rows 0 .. borderSize-1.
    if(blockIdx.y < borderSize)
    {
      const int idx = x_pos + IMUL(blockIdx.y,w);
      float coeff = increment*(blockIdx.y+1);
      im[idx] *= coeff;
    }
    // Bottom border: rows h-borderSize .. h-1.
    else if(blockIdx.y < IMUL(borderSize,2))
    {
      const int idx = x_pos + IMUL((h-borderSize+blockIdx.y-borderSize),w);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.y);
      im[idx] *= coeff;
    }
  }
}

__global__ void cuda_global_inplaceAttenuateBorders_y(float *im, int borderSize, int tile_height, int w, int h)
{
  const float increment = 1.0F / (float)(borderSize + 1);

  const int y_pos = IMUL(blockIdx.y,tile_height) + threadIdx.y;
  if(y_pos < h)
  {
    // Left border: columns 0 .. borderSize-1.
    if(blockIdx.x < borderSize)
    {
      const int idx = IMUL(y_pos,w) + blockIdx.x;
      float coeff = increment*(blockIdx.x+1);
      im[idx] *= coeff;
    }
    // Right border: columns w-borderSize .. w-1.
    else if(blockIdx.x < IMUL(borderSize,2))
    {
      const int idx = IMUL(y_pos,w) + (blockIdx.x-borderSize) + (w-borderSize);
      float coeff = increment*(IMUL(borderSize,2) - blockIdx.x);
      im[idx] *= coeff;
    }
  }
}
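
/* Worked example: with borderSize = 2, the outermost row/column is
 * scaled by 1/3 and the next one in by 2/3; pixels deeper than
 * borderSize are left untouched. The _x kernel expects a grid spanning
 * 2*borderSize blocks in y (the top band, then the bottom band), and
 * the _y kernel the mirror-image layout in x.
 */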


// Reduction that tracks the location (argmax) alongside the maximum.
// Shared memory must hold 2*tile_len words: values first, then indices.
// On the first pass srcloc is NULL and locations are seeded from the
// element indices; later passes feed the loc buffer back in so the
// original positions survive the reduction.
__device__ void cuda_device_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];
  // Store indices as ints; a float slot cannot represent indices above
  // 2^24 exactly.
  int *shrLoc = (int *) &(shared_data[tile_len]);

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  // Pairwise tree reduction over values, carrying locations along.
  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      if(shrVal[shr_idx+incr] > shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }

  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMax(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMax(src,srcloc,buf,loc,tile_len,sz);
}
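
/* Host-side iteration mirrors the getMin sketch above, but with the loc
 * buffer fed back as srcloc on every pass after the first, and shared
 * memory sized at 2*tile_len*sizeof(float):
 *
 *   cuda_global_findMax<<<blocks, tile_len, 2*tile_len*sizeof(float)>>>
 *     (src, NULL, buf, loc, tile_len, sz);       // first pass
 *   cuda_global_findMax<<<blocks2, tile_len, 2*tile_len*sizeof(float)>>>
 *     (buf, loc, buf2, loc2, tile_len, blocks);  // subsequent passes
 *
 * (buf2/loc2 are hypothetical second buffers for ping-ponging.)
 */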


// Reduction that tracks the location (argmin) alongside the minimum;
// structure mirrors cuda_device_findMax above.
__device__ void cuda_device_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  const int shr_idx = threadIdx.x;
  const int src_idx = blockIdx.x*tile_len + threadIdx.x;
  const bool threadActive = src_idx < sz && shr_idx < tile_len;
  float *shrVal = &shared_data[0];
  int *shrLoc = (int *) &(shared_data[tile_len]); // indices stored as ints

  if(threadActive)
  {
    shrVal[shr_idx] = src[src_idx];
    if(srcloc == NULL)
      shrLoc[shr_idx] = src_idx;
    else
      shrLoc[shr_idx] = srcloc[src_idx];
  }
  __syncthreads();

  int incr = 1;
  int mod = 2;
  while(incr < sz)
  {
    if(shr_idx % mod == 0 && shr_idx+incr < tile_len && src_idx+incr < sz)
    {
      if(shrVal[shr_idx+incr] < shrVal[shr_idx])
      {
        shrVal[shr_idx] = shrVal[shr_idx+incr];
        shrLoc[shr_idx] = shrLoc[shr_idx+incr];
      }
    }
    __syncthreads();

    incr *= 2;
    mod *= 2;
  }

  if(threadIdx.x == 0 && threadActive)
  {
    int dst_idx = blockIdx.x;
    buf[dst_idx] = shrVal[0];
    loc[dst_idx] = shrLoc[0];
  }
  __syncthreads();
}

__global__ void cuda_global_findMin(const float *src, const int *srcloc, float *buf, int *loc, const int tile_len, const int sz)
{
  cuda_device_findMin(src,srcloc,buf,loc,tile_len,sz);
}


#endif // CUDA_MATHOPS_H_DEFINED