00001
00002 extern __shared__ float shared_data[];
00003
00004 #include "wrap_c_cuda.h"
00005 #include <cuda_runtime.h>
00006 #include "cuda_mersennetwisterkernel.h"
00007 #include "cuda_colorops.h"
00008 #include "cuda_lowpass.h"
00009 #include "cuda_mathops.h"
00010 #include "cuda_kernels.h"
00011 #include "cuda_shapeops.h"
00012 #include "cuda_drawops.h"
00013 #include "cuda_filterops.h"
00014 #include "cuda_transforms.h"
00015 #include "cuda_convolutions.h"
00016 #include "cuda_saliencyops.h"
00017 #include <stdio.h>
00018 #include "cuda_debayer.h"
00019 #include "cuda_cutpaste.h"
00020
00021
00022
00023 void cuda_c_getRGBY(const float3_t *src, float *rgptr, float *byptr, const float thresh,
00024 const float min_range, const float max_range,
00025 const int w, const int h, const int tile_width, const int tile_height)
00026 {
00027 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00028 dim3 threadBlockRows(tile_width,tile_height);
00029 cuda_global_getRGBY<<<blockGridRows,threadBlockRows>>>(src,rgptr,byptr,thresh,min_range,max_range,w,h,tile_width,tile_height);
00030 }
00031
00032 void cuda_c_toRGB(float3_t *dst, const float *src,int sz, const int tile_len)
00033 {
00034 dim3 blockGridRows(iDivUp(sz,tile_len),1);
00035 dim3 threadBlockRows(tile_len,1);
00036 cuda_global_toRGB<<<blockGridRows,threadBlockRows>>>(dst,src,sz,tile_len);
00037 }
00038
00039
00040 void cuda_c_getComponents(const float3_t *srcptr, float *rptr, float *gptr, float *bptr, int w, int h, int tile_width, int tile_height)
00041 {
00042 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00043 dim3 threadBlockRows(tile_width,tile_height);
00044 cuda_global_getComponents<<<blockGridRows,threadBlockRows>>>(srcptr,rptr,gptr,bptr,w,h,tile_width,tile_height);
00045 }
00046
00047
00048 void cuda_c_luminance(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height)
00049 {
00050 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00051 dim3 threadBlockRows(tile_width,tile_height);
00052 cuda_global_luminance<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height);
00053 }
00054
00055
00056 void cuda_c_luminanceNTSC(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height)
00057 {
00058 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00059 dim3 threadBlockRows(tile_width,tile_height);
00060
00061 cuda_global_luminanceNTSC<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height);
00062 }
00063
00064
00065 void cuda_c_drawFilledRect(float *dst, int top, int left, int bottom, int right, const float intensity, const int w, const int h, const int tile_width, const int tile_height)
00066 {
00067 int rect_width = right-left+1;
00068 int rect_height = bottom-top+1;
00069 dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height));
00070 dim3 threadBlockRows(tile_width,tile_height);
00071
00072 cuda_global_drawFilledRect<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,intensity,w,h,tile_width,tile_height);
00073 }
00074
00075
00076 void cuda_c_drawFilledRectRGB(float3_t *dst, int top, int left, int bottom, int right, const float3_t *color, const int w, const int h, const int tile_width, const int tile_height)
00077 {
00078 int rect_width = right-left+1;
00079 int rect_height = bottom-top+1;
00080 dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height));
00081 dim3 threadBlockRows(tile_width,tile_height);
00082 cuda_global_drawFilledRectRGB<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,color->p[0],color->p[1],color->p[2],w,h,tile_width,tile_height);
00083 }
00084
00085
00086 void cuda_c_lowpass_5_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00087 {
00088 dim3 blockGridRows(iDivUp(w, tile_width), h);
00089 dim3 threadBlockRows(tile_width);
00090
00091 cuda_global_lowpass_5_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00092 (src, w, h, dst, tile_width);
00093 }
00094
00095
00096 void cuda_c_lowpass_5_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00097 {
00098 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00099 dim3 threadBlockColumns(tile_width, tile_height);
00100
00101 cuda_global_lowpass_5_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00102 (src, w, h, dst, tile_width, tile_height);
00103 }
00104
00105
00106 void cuda_c_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00107 {
00108 dim3 blockGridRows(iDivUp(w, tile_width), h);
00109 dim3 threadBlockRows(tile_width);
00110 cuda_global_lowpass_9_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, tile_width);
00111 }
00112
00113
00114 void cuda_c_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00115 {
00116 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00117 dim3 threadBlockColumns(tile_width, tile_height);
00118
00119 cuda_global_lowpass_9_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, tile_width, tile_height);
00120 }
00121
00122
00123 void cuda_c_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width)
00124 {
00125 const int dest_tile_width = tile_width>>1;
00126 dim3 blockGridRows(iDivUp(dw, dest_tile_width), dh);
00127 dim3 threadBlockRows(dest_tile_width);
00128 cuda_global_lowpass_9_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, dest_tile_width);
00129 }
00130
00131
00132 void cuda_c_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height)
00133 {
00134 const int dest_tile_height = tile_height>>1;
00135 dim3 blockGridColumns(iDivUp(dw, tile_width), iDivUp(dh, dest_tile_height));
00136 dim3 threadBlockColumns(tile_width, dest_tile_height);
00137 cuda_global_lowpass_9_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, dest_tile_height);
00138 }
00139
00140
00141 void cuda_c_lowpass_5_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00142 {
00143 dim3 blockGridRows(iDivUp(w, tile_width), h);
00144 dim3 threadBlockRows(tile_width);
00145
00146 cuda_global_lowpass_5_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00147 (src, w, h, dst, tile_width);
00148 }
00149
00150
00151 void cuda_c_lowpass_5_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00152 {
00153 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00154 dim3 threadBlockColumns(tile_width, tile_height);
00155
00156 cuda_global_lowpass_5_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00157 (src, w, h, dst, tile_width, tile_height);
00158 }
00159
00160
00161 void cuda_c_lowpass_3_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00162 {
00163 dim3 blockGridRows(iDivUp(w, tile_width), h);
00164 dim3 threadBlockRows(tile_width);
00165
00166 cuda_global_lowpass_3_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00167 (src, w, h, dst, tile_width);
00168 }
00169
00170
00171 void cuda_c_lowpass_3_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00172 {
00173 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00174 dim3 threadBlockColumns(tile_width, tile_height);
00175
00176 cuda_global_lowpass_3_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00177 (src, w, h, dst, tile_width, tile_height);
00178 }
00179
00180
00181 void cuda_c_dec_xy(const float *src, float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00182 {
00183 dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h/y_factor);
00184 dim3 threadBlockRows(tile_width);
00185 cuda_global_dec_xy<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,y_factor,w,h,tile_width);
00186 }
00187
00188
00189 void cuda_c_dec_x(const float *src, float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width)
00190 {
00191 dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h);
00192 dim3 threadBlockRows(tile_width);
00193 cuda_global_dec_x<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,w,h,tile_width);
00194 }
00195
00196
00197 void cuda_c_dec_y(const float *src, float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00198 {
00199 dim3 blockGridRows(iDivUp(w, tile_width), h/y_factor);
00200 dim3 threadBlockRows(tile_width);
00201 cuda_global_dec_y<<<blockGridRows, threadBlockRows>>>(src,dst,y_factor,w,h,tile_width);
00202 }
00203
00204 void cuda_c_quickLocalAvg(const float *in, float *res, float fac, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00205 {
00206 int scalex = lw / sw, scaley = lh / sh;
00207
00208 int remx = (lw % sw), remy = (lh % sh);
00209
00210 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00211 dim3 threadBlockColumns(tile_width, tile_height);
00212 cuda_global_quickLocalAvg<<<blockGridColumns,threadBlockColumns>>>(in,res,fac,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height);
00213 }
00214
00215 void cuda_c_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00216 {
00217 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00218 dim3 threadBlockColumns(tile_width, tile_height);
00219 cuda_global_quickLocalAvg2x2<<<blockGridColumns,threadBlockColumns>>>(in,res,lw,lh,sw,sh,tile_width,tile_height);
00220 }
00221
00222 void cuda_c_quickLocalMax(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00223 {
00224 int scalex = lw / sw, scaley = lh / sh;
00225 int remx = lw - 1 - (lw % sw), remy = lh - 1 - (lh % sh);
00226
00227 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00228 dim3 threadBlockColumns(tile_width, tile_height);
00229 cuda_global_quickLocalMax<<<blockGridColumns,threadBlockColumns>>>(in,res,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height);
00230 }
00231
00232
00233 void cuda_c_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00234 {
00235 dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height));
00236 dim3 threadBlockColumns(tile_width, tile_height);
00237 cuda_global_rescaleBilinear<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height);
00238 }
00239
00240 void cuda_c_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00241 {
00242 dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height));
00243 dim3 threadBlockColumns(tile_width, tile_height);
00244 cuda_global_rescaleBilinearRGB<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height);
00245 }
00246
00247 void cuda_c_clear(float *src, const float val, const int tile_len, const int sz)
00248 {
00249 dim3 blockGridRows(iDivUp(sz,tile_len));
00250 dim3 threadBlockRows(tile_len);
00251 cuda_global_clear<<<blockGridRows,threadBlockRows>>>(src,val,tile_len,sz);
00252 }
00253
00254 void cuda_c_abs(float *src,const int tile_len, const int sz)
00255 {
00256 dim3 blockGridRows(iDivUp(sz,tile_len));
00257 dim3 threadBlockRows(tile_len);
00258 cuda_global_abs<<<blockGridRows,threadBlockRows>>>(src,tile_len,sz);
00259 }
00260
00261 void cuda_c_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00262 {
00263 dim3 blockGridRows(iDivUp(sz,tile_len));
00264 dim3 threadBlockRows(tile_len);
00265 cuda_global_inplaceAddScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);
00266 }
00267
00268 void cuda_c_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00269 {
00270 dim3 blockGridRows(iDivUp(sz,tile_len));
00271 dim3 threadBlockRows(tile_len);
00272 cuda_global_inplaceSubtractScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);
00273 }
00274
00275 void cuda_c_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00276 {
00277 dim3 blockGridRows(iDivUp(sz,tile_len));
00278 dim3 threadBlockRows(tile_len);
00279 cuda_global_inplaceMultiplyScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);
00280 }
00281
00282 void cuda_c_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00283 {
00284 dim3 blockGridRows(iDivUp(sz,tile_len));
00285 dim3 threadBlockRows(tile_len);
00286 cuda_global_inplaceDivideScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);
00287 }
00288
00289 void cuda_c_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00290 {
00291 dim3 blockGridRows(iDivUp(sz,tile_len));
00292 dim3 threadBlockRows(tile_len);
00293 cuda_global_inplaceAddHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);
00294 }
00295
00296 void cuda_c_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00297 {
00298 dim3 blockGridRows(iDivUp(sz,tile_len));
00299 dim3 threadBlockRows(tile_len);
00300 cuda_global_inplaceSubtractHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);
00301 }
00302
00303 void cuda_c_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00304 {
00305 dim3 blockGridRows(iDivUp(sz,tile_len));
00306 dim3 threadBlockRows(tile_len);
00307 cuda_global_inplaceMultiplyHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);
00308 }
00309
00310 void cuda_c_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00311 {
00312 dim3 blockGridRows(iDivUp(sz,tile_len));
00313 dim3 threadBlockRows(tile_len);
00314 cuda_global_inplaceDivideHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);
00315 }
00316
00317
00318 void cuda_c_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz)
00319 {
00320 dim3 blockGridRows(iDivUp(sz,tile_len));
00321 dim3 threadBlockRows(tile_len);
00322 cuda_global_inplaceAddImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);
00323 }
00324
00325 void cuda_c_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz)
00326 {
00327 dim3 blockGridRows(iDivUp(sz,tile_len));
00328 dim3 threadBlockRows(tile_len);
00329 cuda_global_inplaceSubtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);
00330 }
00331
00332 void cuda_c_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz)
00333 {
00334 dim3 blockGridRows(iDivUp(sz,tile_len));
00335 dim3 threadBlockRows(tile_len);
00336 cuda_global_inplaceMultiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);
00337 }
00338
00339 void cuda_c_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz)
00340 {
00341 dim3 blockGridRows(iDivUp(sz,tile_len));
00342 dim3 threadBlockRows(tile_len);
00343 cuda_global_inplaceDivideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);
00344 }
00345
00346 void cuda_c_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00347 {
00348 dim3 blockGridRows(iDivUp(sz,tile_len));
00349 dim3 threadBlockRows(tile_len);
00350 cuda_global_addImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00351 }
00352
00353 void cuda_c_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00354 {
00355 dim3 blockGridRows(iDivUp(sz,tile_len));
00356 dim3 threadBlockRows(tile_len);
00357 cuda_global_subtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00358 }
00359
00360 void cuda_c_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00361 {
00362 dim3 blockGridRows(iDivUp(sz,tile_len));
00363 dim3 threadBlockRows(tile_len);
00364 cuda_global_multiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00365 }
00366
00367 void cuda_c_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00368 {
00369 dim3 blockGridRows(iDivUp(sz,tile_len));
00370 dim3 threadBlockRows(tile_len);
00371 cuda_global_divideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00372 }
00373
00374 void cuda_c_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00375 {
00376 dim3 blockGridRows(iDivUp(sz,tile_len));
00377 dim3 threadBlockRows(tile_len);
00378 cuda_global_takeMax<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00379 }
00380
00381 void cuda_c_addScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00382 {
00383 dim3 blockGridRows(iDivUp(sz,tile_len));
00384 dim3 threadBlockRows(tile_len);
00385 cuda_global_addScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00386 }
00387
00388 void cuda_c_subtractScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00389 {
00390 dim3 blockGridRows(iDivUp(sz,tile_len));
00391 dim3 threadBlockRows(tile_len);
00392 cuda_global_subtractScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00393 }
00394
00395 void cuda_c_multiplyScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00396 {
00397 dim3 blockGridRows(iDivUp(sz,tile_len));
00398 dim3 threadBlockRows(tile_len);
00399 cuda_global_multiplyScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00400 }
00401
00402 void cuda_c_divideScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00403 {
00404 dim3 blockGridRows(iDivUp(sz,tile_len));
00405 dim3 threadBlockRows(tile_len);
00406 cuda_global_divideScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);
00407 }
00408
00409 void cuda_c_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00410 {
00411 dim3 blockGridRows(iDivUp(sz,tile_len));
00412 dim3 threadBlockRows(tile_len);
00413 cuda_global_addHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);
00414 }
00415
00416 void cuda_c_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00417 {
00418 dim3 blockGridRows(iDivUp(sz,tile_len));
00419 dim3 threadBlockRows(tile_len);
00420 cuda_global_subtractHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);
00421 }
00422
00423 void cuda_c_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00424 {
00425 dim3 blockGridRows(iDivUp(sz,tile_len));
00426 dim3 threadBlockRows(tile_len);
00427 cuda_global_multiplyHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);
00428 }
00429
00430 void cuda_c_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00431 {
00432 dim3 blockGridRows(iDivUp(sz,tile_len));
00433 dim3 threadBlockRows(tile_len);
00434 cuda_global_divideHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);
00435 }
00436
00437 void cuda_c_inplaceRectify(float *ptr, const int tile_len, const int sz)
00438 {
00439 dim3 blockGridRows(iDivUp(sz,tile_len));
00440 dim3 threadBlockRows(tile_len);
00441 cuda_global_inplaceRectify<<<blockGridRows,threadBlockRows>>>(ptr,tile_len,sz);
00442 }
00443
00444 void cuda_c_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz)
00445 {
00446 dim3 blockGridRows(iDivUp(sz,tile_len));
00447 dim3 threadBlockRows(tile_len);
00448
00449 cuda_global_inplaceClamp<<<blockGridRows,threadBlockRows>>>(ptr,cmin,cmax,tile_len,sz);
00450 }
00451
00452 void cuda_c_inplaceNormalize(float *src, const float *omin, const float *omax, const float nmin, const float nmax, const int tile_len, const int sz)
00453 {
00454 dim3 blockGridRows(iDivUp(sz,tile_len));
00455 dim3 threadBlockRows(tile_len);
00456 cuda_global_inplaceNormalize<<<blockGridRows,threadBlockRows>>>(src,omin,omax,nmin,nmax,tile_len,sz);
00457 }
00458
00459 void cuda_c_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00460 {
00461 dim3 blockGridRows(iDivUp(sz,tile_len));
00462 dim3 threadBlockRows(tile_len);
00463
00464 const float *in = src;
00465 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00466 {
00467 cuda_global_getMin<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz);
00468 in=buf;
00469 }
00470 }
00471
00472
00473 void cuda_c_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00474 {
00475 dim3 blockGridRows(iDivUp(sz,tile_len));
00476 dim3 threadBlockRows(tile_len);
00477
00478 const float *in = src;
00479 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00480 {
00481 cuda_global_getMax<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz);
00482 in=buf;
00483 }
00484 }
00485
00486 void cuda_c_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00487 {
00488 dim3 blockGridRows(iDivUp(sz,tile_len));
00489 dim3 threadBlockRows(tile_len);
00490
00491 const float *in = src;
00492 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00493 {
00494 cuda_global_getAvg<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz);
00495 in=buf;
00496 }
00497 }
00498
00499 void cuda_c_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00500 {
00501 dim3 blockGridRows(iDivUp(sz,tile_len));
00502 dim3 threadBlockRows(tile_len);
00503
00504 const float *in = src;
00505 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00506 {
00507 cuda_global_getSum<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz);
00508 in=buf;
00509 }
00510 }
00511
00512 void cuda_c_squared(const float *im, float *res, const int tile_len, const int sz)
00513 {
00514 dim3 blockGridRows(iDivUp(sz,tile_len));
00515 dim3 threadBlockRows(tile_len);
00516 cuda_global_squared<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz);
00517 }
00518
00519 void cuda_c_sqrt(const float *im, float *res, const int tile_len, const int sz)
00520 {
00521 dim3 blockGridRows(iDivUp(sz,tile_len));
00522 dim3 threadBlockRows(tile_len);
00523 cuda_global_sqrt<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz);
00524 }
00525
00526
00527 void cuda_c_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz)
00528 {
00529 dim3 blockGridRows(iDivUp(sz,tile_len));
00530 dim3 threadBlockRows(tile_len);
00531 cuda_global_quadEnergy<<<blockGridRows,threadBlockRows>>>(real,imag,out,tile_len,sz);
00532 }
00533
00534 void cuda_c_inplaceAttenuateBorders(float *im, int borderSize, int tile_len, int w, int h)
00535 {
00536 dim3 xBlock(iDivUp(w,tile_len),borderSize*2);
00537 dim3 xThread(tile_len);
00538 cuda_global_inplaceAttenuateBorders_x<<<xBlock,xThread>>>(im,borderSize,tile_len,w,h);
00539 dim3 yBlock(borderSize*2,iDivUp(h,tile_len));
00540 dim3 yThread(1,tile_len);
00541 cuda_global_inplaceAttenuateBorders_y<<<yBlock,yThread>>>(im,borderSize,tile_len,w,h);
00542 }
00543
00544 void cuda_c_findMax(const float *src, float *buf, int *loc, const int tile_len, const int sz)
00545 {
00546 dim3 blockGridRows(iDivUp(sz,tile_len));
00547 dim3 threadBlockRows(tile_len);
00548
00549 const float *in = src;
00550 const int *inLoc = NULL;
00551 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00552 {
00553 cuda_global_findMax<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz);
00554 in=buf;
00555 inLoc=loc;
00556 }
00557 }
00558
00559 void cuda_c_findMin(const float *src, float *buf, int *loc, const int tile_len, const int sz)
00560 {
00561 dim3 blockGridRows(iDivUp(sz,tile_len));
00562 dim3 threadBlockRows(tile_len);
00563
00564 const float *in = src;
00565 const int *inLoc = NULL;
00566 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00567 {
00568 cuda_global_findMin<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz);
00569 in=buf;
00570 inLoc=loc;
00571 }
00572 }
00573
00574
00575
00576 void cuda_c_dogFilterHmax(float *dest, const float theta, const float gamma, const int size, const float div, const int tile_width, const int tile_height)
00577 {
00578 dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height));
00579 dim3 threadBlockRows(tile_width,tile_height);
00580 cuda_global_dogFilterHmax<<<blockGridRows,threadBlockRows>>>(dest,theta,gamma,size,div,tile_width,tile_height);
00581 }
00582
00583 void cuda_c_dogFilter(float *dest, float stddev, float theta, int half_size, int size, int tile_width, int tile_height)
00584 {
00585 dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height));
00586 dim3 threadBlockRows(tile_width,tile_height);
00587 cuda_global_dogFilter<<<blockGridRows,threadBlockRows>>>(dest,stddev,theta,half_size,size,tile_width,tile_height);
00588 }
00589
00590 void cuda_c_gaborFilter3(float *kern, const float major_stddev, const float minor_stddev,
00591 const float period, const float phase,
00592 const float theta, const int size, const int tile_len, const int sz)
00593 {
00594 dim3 blockGridRows(iDivUp(sz,tile_len));
00595 dim3 threadBlockRows(tile_len);
00596 cuda_global_gaborFilter3<<<blockGridRows,threadBlockRows>>>(kern,major_stddev,minor_stddev,period,phase,theta,size,tile_len,sz);
00597 }
00598
00599 void cuda_c_gaussian(float *res, float c, float sig22, int hw, int tile_len, int sz)
00600 {
00601 dim3 blockGridRows(iDivUp(sz,tile_len));
00602 dim3 threadBlockRows(tile_len);
00603 cuda_global_gaussian<<<blockGridRows,threadBlockRows>>>(res,c,sig22,hw,tile_len,sz);
00604 }
00605
00606 void cuda_c_orientedFilter(const float *src, float *re, float *im, const float kx, const float ky, const float intensity, const int w, const int h, const int tile_width)
00607 {
00608 dim3 blockGridRows(iDivUp(w, tile_width), h);
00609 dim3 threadBlockRows(tile_width);
00610 cuda_global_orientedFilter<<<blockGridRows,threadBlockRows>>>(src,re,im,kx,ky,intensity,w,h,tile_width);
00611 }
00612
00613 void cuda_c_centerSurroundAbs(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width )
00614 {
00615 int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00616 int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00617 dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00618 dim3 threadBlockRows(tile_width);
00619 cuda_global_centerSurroundAbs<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00620 }
00621
00622 void cuda_c_centerSurroundClamped(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width )
00623 {
00624 int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00625 int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00626 dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00627 dim3 threadBlockRows(tile_width);
00628 cuda_global_centerSurroundClamped<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00629 }
00630
00631 void cuda_c_centerSurroundDirectional(const float *center, const float *surround, float *pos, float *neg, int lw, int lh, int sw, int sh, int tile_width )
00632 {
00633 int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00634 int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00635 dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00636 dim3 threadBlockRows(tile_width);
00637 cuda_global_centerSurroundDirectional<<<blockGridRows,threadBlockRows>>>(center,surround,pos,neg,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00638 }
00639
00640 void cuda_c_centerSurroundAbsAttenuate(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int attBorder, int tile_width, int tile_height)
00641 {
00642 int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00643 int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00644 dim3 blockGridRows(iDivUp(lw, tile_width), iDivUp(lh,tile_height));
00645 dim3 threadBlockRows(tile_width,tile_height);
00646 cuda_global_centerSurroundAbsAttenuate<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,attBorder,scalex,scaley,remx,remy,tile_width,tile_height);
00647 }
00648
00649 void cuda_c_spatialPoolMax(const float *src, float *res, float *buf1, float *buf2, const int src_w_in, const int src_h_in, const int skip_w_in, const int skip_h_in,
00650 const int reg_w_in, const int reg_h_in, int tile_width_in, int tile_height_in)
00651 {
00652 int reg_w = reg_w_in; int reg_h = reg_h_in;
00653 int skip_w = skip_w_in; int skip_h = skip_h_in;
00654 int src_w = src_w_in; int src_h = src_h_in;
00655 int tilesperregion_w, tilesperregion_h;
00656 int tile_width, tile_height;
00657 const int orig_tile_size = tile_width_in*tile_height_in;
00658 const float *in = src;
00659 float *out = buf1;
00660 float *next = buf2;
00661
00662 do
00663 {
00664
00665 if(reg_w < orig_tile_size)
00666 {
00667 tile_width = reg_w;
00668
00669 tile_height = MIN(orig_tile_size/tile_width,reg_h);
00670 }
00671 else if(reg_h < orig_tile_size)
00672 {
00673 tile_height = reg_h;
00674
00675 tile_width = MIN(orig_tile_size/tile_height,reg_w);
00676 }
00677 else
00678 {
00679 tile_width = tile_width_in;
00680 tile_height = tile_height_in;
00681 }
00682 tilesperregion_w = iDivUp(reg_w,tile_width);
00683 tilesperregion_h = iDivUp(reg_h,tile_height);
00684
00685 if(tilesperregion_w == 1 && tilesperregion_h == 1)
00686 out = res;
00687 int num_blocks_w = iDivUp(src_w, skip_w)*tilesperregion_w;
00688 int num_blocks_h = iDivUp(src_h, skip_h)*tilesperregion_h;
00689 dim3 blockGridRows(num_blocks_w,num_blocks_h);
00690 dim3 threadBlockRows(tile_width,tile_height);
00691
00692
00693 cuda_global_spatialPoolMax<<<blockGridRows,threadBlockRows,tile_width*tile_height*sizeof(float)>>>(in,out,src_w,src_h,skip_w,skip_h,reg_w,reg_h,tile_width,tile_height);
00694 src_w = num_blocks_w;
00695 src_h = num_blocks_h;
00696 reg_w = tilesperregion_w;
00697 reg_h = tilesperregion_h;
00698 skip_w = reg_w;
00699 skip_h = reg_h;
00700
00701 in = out;
00702 out = next;
00703 next = (buf1 == in) ? buf1 : buf2;
00704 } while(tilesperregion_w > 1 || tilesperregion_h > 1);
00705
00706 }
00707
00708
00709
00710 void cuda_c_randomMT(float *d_Random, int numVals, int tile_len)
00711 {
00712 if(numVals % MT_RNG_COUNT != 0)
00713 {
00714 printf("ERROR!!!: Cannot request a set of random numbers that is not in MT_RNG_COUNT [%d]units\n",MT_RNG_COUNT);
00715 return;
00716 }
00717 int NPerRng = numVals/MT_RNG_COUNT;
00718 dim3 blockGridColumns(NPerRng,1);
00719 dim3 threadBlockColumns(tile_len,1);
00720 cuda_global_randomMT<<<blockGridColumns,threadBlockColumns>>>(d_Random,NPerRng);
00721 }
00722
00723 void cuda_c_inplaceAddBGnoise2(float *in, float *rnd, const int brd_siz, const float range, int w, int h, int tile_len)
00724 {
00725 dim3 blockGridColumns(iDivUp(w,tile_len),h);
00726 dim3 threadBlockColumns(tile_len,1);
00727 cuda_global_inplaceAddBGnoise2<<<blockGridColumns,threadBlockColumns>>>(in,rnd,brd_siz,range,w,h,tile_len);
00728 }
00729
00730 void cuda_c_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h,
00731 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00732 {
00733 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00734 dim3 threadBlockColumns(tile_width,tile_height);
00735 cuda_global_convolveHmaxHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00736 }
00737
00738 void cuda_c_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h,
00739 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00740 {
00741 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00742 dim3 threadBlockColumns(tile_width,tile_height);
00743 cuda_global_convolveZeroHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00744 }
00745
00746 void cuda_c_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h,
00747 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00748 {
00749 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00750 dim3 threadBlockColumns(tile_width,tile_height);
00751 cuda_global_convolveCleanHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00752 }
00753
00754 void cuda_c_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00755 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00756 {
00757 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00758 dim3 threadBlockColumns(tile_width,tile_height);
00759 cuda_global_convolveHmaxHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00760 }
00761
00762 void cuda_c_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h,
00763 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00764 {
00765 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00766 dim3 threadBlockColumns(tile_width,tile_height);
00767 cuda_global_convolveZeroHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00768 }
00769
00770 void cuda_c_optConvolve(float *res, const float *src, const int src_w, const int src_h,
00771 const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height)
00772 {
00773 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00774 dim3 threadBlockColumns(tile_width,tile_height);
00775 cuda_global_optConvolve<<<blockGridColumns,threadBlockColumns,fil_w*fil_h*sizeof(float)>>>(res,src,src_w,src_h,f,fil_w,fil_h,tile_width,tile_height);
00776 }
00777
00778 void cuda_c_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00779 {
00780 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00781 dim3 threadBlockColumns(tile_len,1);
00782 cuda_global_xFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00783 }
00784
00785 void cuda_c_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00786 {
00787 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00788 dim3 threadBlockColumns(tile_len,1);
00789 cuda_global_xFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00790 }
00791
00792 void cuda_c_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00793 {
00794 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00795 dim3 threadBlockColumns(tile_len,1);
00796 cuda_global_xFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00797 }
00798
00799 void cuda_c_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00800 {
00801 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00802 dim3 threadBlockColumns(1,tile_len);
00803 cuda_global_yFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00804 }
00805
00806 void cuda_c_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00807 {
00808 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00809 dim3 threadBlockColumns(1,tile_len);
00810 cuda_global_yFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00811 }
00812
00813 void cuda_c_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00814 {
00815 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00816 dim3 threadBlockColumns(1,tile_len);
00817 cuda_global_yFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00818 }
00819
00820 void cuda_c_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00821 {
00822 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00823 dim3 threadBlockColumns(tile_len,1);
00824 cuda_global_optXFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00825 }
00826
00827 void cuda_c_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00828 {
00829 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00830 dim3 threadBlockColumns(1,tile_len);
00831 cuda_global_optYFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00832 }
00833
00834
00835 void cuda_c_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00836 {
00837 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00838 dim3 threadBlockColumns(tile_len,1);
00839 cuda_global_optXFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00840 }
00841
00842 void cuda_c_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00843 {
00844 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00845 dim3 threadBlockColumns(1,tile_len);
00846 cuda_global_optYFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00847 }
00848
00849 void cuda_c_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00850 {
00851 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00852 dim3 threadBlockColumns(tile_len,1);
00853 cuda_global_optXFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00854 }
00855
00856 void cuda_c_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00857 {
00858 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00859 dim3 threadBlockColumns(1,tile_len);
00860 cuda_global_optYFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00861 }
00862
00863
00864 void cuda_2_debayer(float *src,float3_t *dptr,int w, int h, int tile_width, int tile_height)
00865 {
00866 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00867 dim3 threadBlockRows(tile_width,tile_height);
00868 cuda_kernel_debayer<<<blockGridRows,threadBlockRows>>>(src,dptr,w,h,tile_width,tile_height);
00869 }
00870
00871 void cuda_c_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height)
00872 {
00873 dim3 blockGridRows(iDivUp(srcw,tile_width),iDivUp(srch,tile_height));
00874 dim3 threadBlockRows(tile_width,tile_height);
00875 cuda_global_crop<<<blockGridRows,threadBlockRows>>>(src,res,srcw,srch,startx,starty,endx,endy,maxx,maxy,tile_width,tile_height);
00876 }
00877
00878 void cuda_c_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height)
00879 {
00880 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00881 dim3 threadBlockRows(tile_width,tile_height);
00882 cuda_global_shiftImage<<<blockGridRows,threadBlockRows,(tile_width*tile_height+tile_height+tile_width+1)*sizeof(float)>>>(src,dst,w,h,deltax,deltay,tile_width,tile_height);
00883 }
00884
00885 void cuda_c_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00886 {
00887 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00888 dim3 threadBlockRows(tile_width,tile_height);
00889 cuda_global_inplacePaste<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00890 }
00891
00892 void cuda_c_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00893 {
00894 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00895 dim3 threadBlockRows(tile_width,tile_height);
00896 cuda_global_inplacePasteRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00897 }
00898
00899 void cuda_c_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00900 {
00901 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00902 dim3 threadBlockRows(tile_width,tile_height);
00903 cuda_global_inplaceOverlay<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00904 }
00905
00906 void cuda_c_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00907 {
00908 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00909 dim3 threadBlockRows(tile_width,tile_height);
00910 cuda_global_inplaceOverlayRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00911 }
00912
00913 void cuda_c_inertiaMap(float_t *dst, float s, float r_inv, int px, int py, int tile_width, int tile_height, int w, int h)
00914 {
00915 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00916 dim3 threadBlockRows(tile_width,tile_height);
00917 cuda_global_inertiaMap<<<blockGridRows,threadBlockRows>>>(dst,s,r_inv,px,py,tile_width,tile_height,w,h);
00918 }
00919
00920 void cuda_c_inhibitionMap(float *dst, float factorOld, float factorNew, float radius, int px, int py, int tile_width, int tile_height, int w, int h)
00921 {
00922 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00923 dim3 threadBlockRows(tile_width,tile_height);
00924 cuda_global_inhibitionMap<<<blockGridRows,threadBlockRows>>>(dst,factorOld,factorNew,radius,px,py,tile_width,tile_height,w,h);
00925 }
00926
00927
00928