00001 // Serves as a link to shared memory location on CUDA device 00002 extern __shared__ float shared_data[]; 00003 00004 #include "wrap_c_cuda.h" 00005 #include <cuda_runtime.h> 00006 #include "cuda_mersennetwisterkernel.h" 00007 #include "cuda_colorops.h" 00008 #include "cuda_lowpass.h" 00009 #include "cuda_mathops.h" 00010 #include "cuda_kernels.h" 00011 #include "cuda_shapeops.h" 00012 #include "cuda_drawops.h" 00013 #include "cuda_filterops.h" 00014 #include "cuda_transforms.h" 00015 #include "cuda_convolutions.h" 00016 #include "cuda_saliencyops.h" 00017 #include <stdio.h> 00018 #include "cuda_debayer.h" 00019 #include "cuda_cutpaste.h" 00020 // Externally callable host functions that instructs the device to call the appropriate CUDA function 00021 00022 // ###################################################################### 00023 void cuda_c_getRGBY(const float3_t *src, float *rgptr, float *byptr, const float thresh, 00024 const float min_range, const float max_range, 00025 const int w, const int h, const int tile_width, const int tile_height) 00026 { 00027 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00028 dim3 threadBlockRows(tile_width,tile_height); 00029 cuda_global_getRGBY<<<blockGridRows,threadBlockRows>>>(src,rgptr,byptr,thresh,min_range,max_range,w,h,tile_width,tile_height); 00030 } 00031 00032 void cuda_c_toRGB(float3_t *dst, const float *src,int sz, const int tile_len) 00033 { 00034 dim3 blockGridRows(iDivUp(sz,tile_len),1); 00035 dim3 threadBlockRows(tile_len,1); 00036 cuda_global_toRGB<<<blockGridRows,threadBlockRows>>>(dst,src,sz,tile_len); 00037 } 00038 00039 // ###################################################################### 00040 void cuda_c_getComponents(const float3_t *srcptr, float *rptr, float *gptr, float *bptr, int w, int h, int tile_width, int tile_height) 00041 { 00042 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00043 dim3 threadBlockRows(tile_width,tile_height); 00044 cuda_global_getComponents<<<blockGridRows,threadBlockRows>>>(srcptr,rptr,gptr,bptr,w,h,tile_width,tile_height); 00045 } 00046 00047 // ###################################################################### 00048 void cuda_c_luminance(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height) 00049 { 00050 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00051 dim3 threadBlockRows(tile_width,tile_height); 00052 cuda_global_luminance<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height); 00053 } 00054 00055 // ###################################################################### 00056 void cuda_c_luminanceNTSC(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height) 00057 { 00058 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00059 dim3 threadBlockRows(tile_width,tile_height); 00060 00061 cuda_global_luminanceNTSC<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height); 00062 } 00063 00064 // ###################################################################### 00065 void cuda_c_drawFilledRect(float *dst, int top, int left, int bottom, int right, const float intensity, const int w, const int h, const int tile_width, const int tile_height) 00066 { 00067 int rect_width = right-left+1; 00068 int rect_height = bottom-top+1; 00069 dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height)); 00070 dim3 threadBlockRows(tile_width,tile_height); 00071 00072 cuda_global_drawFilledRect<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,intensity,w,h,tile_width,tile_height); 00073 } 00074 00075 // ###################################################################### 00076 void cuda_c_drawFilledRectRGB(float3_t *dst, int top, int left, int bottom, int right, const float3_t *color, const int w, const int h, const int tile_width, const int tile_height) 00077 { 00078 int rect_width = right-left+1; 00079 int rect_height = bottom-top+1; 00080 dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height)); 00081 dim3 threadBlockRows(tile_width,tile_height); 00082 cuda_global_drawFilledRectRGB<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,color->p[0],color->p[1],color->p[2],w,h,tile_width,tile_height); 00083 } 00084 00085 // ###################################################################### 00086 void cuda_c_lowpass_5_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 00087 { 00088 dim3 blockGridRows(iDivUp(w, tile_width), h); 00089 dim3 threadBlockRows(tile_width); 00090 00091 cuda_global_lowpass_5_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>> 00092 (src, w, h, dst, tile_width); 00093 } 00094 00095 // ###################################################################### 00096 void cuda_c_lowpass_5_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 00097 { 00098 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height)); 00099 dim3 threadBlockColumns(tile_width, tile_height); 00100 00101 cuda_global_lowpass_5_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>> 00102 (src, w, h, dst, tile_width, tile_height); 00103 } 00104 00105 // ##################################################################### 00106 void cuda_c_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 00107 { 00108 dim3 blockGridRows(iDivUp(w, tile_width), h); 00109 dim3 threadBlockRows(tile_width); 00110 cuda_global_lowpass_9_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, tile_width); 00111 } 00112 00113 // ##################################################################### 00114 void cuda_c_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 00115 { 00116 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height)); 00117 dim3 threadBlockColumns(tile_width, tile_height); 00118 00119 cuda_global_lowpass_9_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, tile_width, tile_height); 00120 } 00121 00122 // ##################################################################### 00123 void cuda_c_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width) 00124 { 00125 const int dest_tile_width = tile_width>>1; // Need to reduce, so source can be stored 00126 dim3 blockGridRows(iDivUp(dw, dest_tile_width), dh); 00127 dim3 threadBlockRows(dest_tile_width); 00128 cuda_global_lowpass_9_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, dest_tile_width); 00129 } 00130 00131 // ##################################################################### 00132 void cuda_c_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height) 00133 { 00134 const int dest_tile_height = tile_height>>1; // Need to reduce, so source can be stored 00135 dim3 blockGridColumns(iDivUp(dw, tile_width), iDivUp(dh, dest_tile_height)); 00136 dim3 threadBlockColumns(tile_width, dest_tile_height); 00137 cuda_global_lowpass_9_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, dest_tile_height); 00138 } 00139 00140 // ###################################################################### 00141 void cuda_c_lowpass_5_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 00142 { 00143 dim3 blockGridRows(iDivUp(w, tile_width), h); 00144 dim3 threadBlockRows(tile_width); 00145 00146 cuda_global_lowpass_5_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>> 00147 (src, w, h, dst, tile_width); 00148 } 00149 00150 // ###################################################################### 00151 void cuda_c_lowpass_5_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 00152 { 00153 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height)); 00154 dim3 threadBlockColumns(tile_width, tile_height); 00155 00156 cuda_global_lowpass_5_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>> 00157 (src, w, h, dst, tile_width, tile_height); 00158 } 00159 00160 // ###################################################################### 00161 void cuda_c_lowpass_3_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width) 00162 { 00163 dim3 blockGridRows(iDivUp(w, tile_width), h); 00164 dim3 threadBlockRows(tile_width); 00165 00166 cuda_global_lowpass_3_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>> 00167 (src, w, h, dst, tile_width); 00168 } 00169 00170 // ###################################################################### 00171 void cuda_c_lowpass_3_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height) 00172 { 00173 dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height)); 00174 dim3 threadBlockColumns(tile_width, tile_height); 00175 00176 cuda_global_lowpass_3_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>> 00177 (src, w, h, dst, tile_width, tile_height); 00178 } 00179 00180 // ###################################################################### 00181 void cuda_c_dec_xy(const float *src, float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width) 00182 { 00183 dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h/y_factor); 00184 dim3 threadBlockRows(tile_width); 00185 cuda_global_dec_xy<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,y_factor,w,h,tile_width); 00186 } 00187 00188 // ###################################################################### 00189 void cuda_c_dec_x(const float *src, float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width) 00190 { 00191 dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h); 00192 dim3 threadBlockRows(tile_width); 00193 cuda_global_dec_x<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,w,h,tile_width); 00194 } 00195 00196 // ###################################################################### 00197 void cuda_c_dec_y(const float *src, float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width) 00198 { 00199 dim3 blockGridRows(iDivUp(w, tile_width), h/y_factor); 00200 dim3 threadBlockRows(tile_width); 00201 cuda_global_dec_y<<<blockGridRows, threadBlockRows>>>(src,dst,y_factor,w,h,tile_width); 00202 } 00203 00204 void cuda_c_quickLocalAvg(const float *in, float *res, float fac, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00205 { 00206 int scalex = lw / sw, scaley = lh / sh; // in case the image is very small 00207 // IMPORTANT! remx is JUST the remainder, unlike the CPU version, which is remx=lw-1-(lh%sh) 00208 int remx = (lw % sw), remy = (lh % sh); 00209 00210 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height)); 00211 dim3 threadBlockColumns(tile_width, tile_height); 00212 cuda_global_quickLocalAvg<<<blockGridColumns,threadBlockColumns>>>(in,res,fac,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height); 00213 } 00214 00215 void cuda_c_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00216 { 00217 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height)); 00218 dim3 threadBlockColumns(tile_width, tile_height); 00219 cuda_global_quickLocalAvg2x2<<<blockGridColumns,threadBlockColumns>>>(in,res,lw,lh,sw,sh,tile_width,tile_height); 00220 } 00221 00222 void cuda_c_quickLocalMax(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height) 00223 { 00224 int scalex = lw / sw, scaley = lh / sh; // in case the image is very small 00225 int remx = lw - 1 - (lw % sw), remy = lh - 1 - (lh % sh); 00226 00227 dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height)); 00228 dim3 threadBlockColumns(tile_width, tile_height); 00229 cuda_global_quickLocalMax<<<blockGridColumns,threadBlockColumns>>>(in,res,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height); 00230 } 00231 00232 00233 void cuda_c_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height) 00234 { 00235 dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height)); 00236 dim3 threadBlockColumns(tile_width, tile_height); 00237 cuda_global_rescaleBilinear<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height); 00238 } 00239 00240 void cuda_c_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height) 00241 { 00242 dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height)); 00243 dim3 threadBlockColumns(tile_width, tile_height); 00244 cuda_global_rescaleBilinearRGB<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height); 00245 } 00246 00247 void cuda_c_clear(float *src, const float val, const int tile_len, const int sz) 00248 { 00249 dim3 blockGridRows(iDivUp(sz,tile_len)); 00250 dim3 threadBlockRows(tile_len); 00251 cuda_global_clear<<<blockGridRows,threadBlockRows>>>(src,val,tile_len,sz); 00252 } 00253 00254 void cuda_c_abs(float *src,const int tile_len, const int sz) 00255 { 00256 dim3 blockGridRows(iDivUp(sz,tile_len)); 00257 dim3 threadBlockRows(tile_len); 00258 cuda_global_abs<<<blockGridRows,threadBlockRows>>>(src,tile_len,sz); 00259 } 00260 00261 void cuda_c_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz) 00262 { 00263 dim3 blockGridRows(iDivUp(sz,tile_len)); 00264 dim3 threadBlockRows(tile_len); 00265 cuda_global_inplaceAddScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz); 00266 } 00267 00268 void cuda_c_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz) 00269 { 00270 dim3 blockGridRows(iDivUp(sz,tile_len)); 00271 dim3 threadBlockRows(tile_len); 00272 cuda_global_inplaceSubtractScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz); 00273 } 00274 00275 void cuda_c_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz) 00276 { 00277 dim3 blockGridRows(iDivUp(sz,tile_len)); 00278 dim3 threadBlockRows(tile_len); 00279 cuda_global_inplaceMultiplyScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz); 00280 } 00281 00282 void cuda_c_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz) 00283 { 00284 dim3 blockGridRows(iDivUp(sz,tile_len)); 00285 dim3 threadBlockRows(tile_len); 00286 cuda_global_inplaceDivideScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz); 00287 } 00288 00289 void cuda_c_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz) 00290 { 00291 dim3 blockGridRows(iDivUp(sz,tile_len)); 00292 dim3 threadBlockRows(tile_len); 00293 cuda_global_inplaceAddHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz); 00294 } 00295 00296 void cuda_c_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz) 00297 { 00298 dim3 blockGridRows(iDivUp(sz,tile_len)); 00299 dim3 threadBlockRows(tile_len); 00300 cuda_global_inplaceSubtractHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz); 00301 } 00302 00303 void cuda_c_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz) 00304 { 00305 dim3 blockGridRows(iDivUp(sz,tile_len)); 00306 dim3 threadBlockRows(tile_len); 00307 cuda_global_inplaceMultiplyHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz); 00308 } 00309 00310 void cuda_c_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz) 00311 { 00312 dim3 blockGridRows(iDivUp(sz,tile_len)); 00313 dim3 threadBlockRows(tile_len); 00314 cuda_global_inplaceDivideHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz); 00315 } 00316 00317 00318 void cuda_c_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz) 00319 { 00320 dim3 blockGridRows(iDivUp(sz,tile_len)); 00321 dim3 threadBlockRows(tile_len); 00322 cuda_global_inplaceAddImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz); 00323 } 00324 00325 void cuda_c_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz) 00326 { 00327 dim3 blockGridRows(iDivUp(sz,tile_len)); 00328 dim3 threadBlockRows(tile_len); 00329 cuda_global_inplaceSubtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz); 00330 } 00331 00332 void cuda_c_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz) 00333 { 00334 dim3 blockGridRows(iDivUp(sz,tile_len)); 00335 dim3 threadBlockRows(tile_len); 00336 cuda_global_inplaceMultiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz); 00337 } 00338 00339 void cuda_c_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz) 00340 { 00341 dim3 blockGridRows(iDivUp(sz,tile_len)); 00342 dim3 threadBlockRows(tile_len); 00343 cuda_global_inplaceDivideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz); 00344 } 00345 00346 void cuda_c_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00347 { 00348 dim3 blockGridRows(iDivUp(sz,tile_len)); 00349 dim3 threadBlockRows(tile_len); 00350 cuda_global_addImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00351 } 00352 00353 void cuda_c_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00354 { 00355 dim3 blockGridRows(iDivUp(sz,tile_len)); 00356 dim3 threadBlockRows(tile_len); 00357 cuda_global_subtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00358 } 00359 00360 void cuda_c_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00361 { 00362 dim3 blockGridRows(iDivUp(sz,tile_len)); 00363 dim3 threadBlockRows(tile_len); 00364 cuda_global_multiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00365 } 00366 00367 void cuda_c_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00368 { 00369 dim3 blockGridRows(iDivUp(sz,tile_len)); 00370 dim3 threadBlockRows(tile_len); 00371 cuda_global_divideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00372 } 00373 00374 void cuda_c_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00375 { 00376 dim3 blockGridRows(iDivUp(sz,tile_len)); 00377 dim3 threadBlockRows(tile_len); 00378 cuda_global_takeMax<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00379 } 00380 00381 void cuda_c_addScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00382 { 00383 dim3 blockGridRows(iDivUp(sz,tile_len)); 00384 dim3 threadBlockRows(tile_len); 00385 cuda_global_addScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00386 } 00387 00388 void cuda_c_subtractScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00389 { 00390 dim3 blockGridRows(iDivUp(sz,tile_len)); 00391 dim3 threadBlockRows(tile_len); 00392 cuda_global_subtractScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00393 } 00394 00395 void cuda_c_multiplyScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00396 { 00397 dim3 blockGridRows(iDivUp(sz,tile_len)); 00398 dim3 threadBlockRows(tile_len); 00399 cuda_global_multiplyScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00400 } 00401 00402 void cuda_c_divideScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz) 00403 { 00404 dim3 blockGridRows(iDivUp(sz,tile_len)); 00405 dim3 threadBlockRows(tile_len); 00406 cuda_global_divideScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz); 00407 } 00408 00409 void cuda_c_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz) 00410 { 00411 dim3 blockGridRows(iDivUp(sz,tile_len)); 00412 dim3 threadBlockRows(tile_len); 00413 cuda_global_addHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz); 00414 } 00415 00416 void cuda_c_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz) 00417 { 00418 dim3 blockGridRows(iDivUp(sz,tile_len)); 00419 dim3 threadBlockRows(tile_len); 00420 cuda_global_subtractHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz); 00421 } 00422 00423 void cuda_c_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz) 00424 { 00425 dim3 blockGridRows(iDivUp(sz,tile_len)); 00426 dim3 threadBlockRows(tile_len); 00427 cuda_global_multiplyHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz); 00428 } 00429 00430 void cuda_c_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz) 00431 { 00432 dim3 blockGridRows(iDivUp(sz,tile_len)); 00433 dim3 threadBlockRows(tile_len); 00434 cuda_global_divideHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz); 00435 } 00436 00437 void cuda_c_inplaceRectify(float *ptr, const int tile_len, const int sz) 00438 { 00439 dim3 blockGridRows(iDivUp(sz,tile_len)); 00440 dim3 threadBlockRows(tile_len); 00441 cuda_global_inplaceRectify<<<blockGridRows,threadBlockRows>>>(ptr,tile_len,sz); 00442 } 00443 00444 void cuda_c_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz) 00445 { 00446 dim3 blockGridRows(iDivUp(sz,tile_len)); 00447 dim3 threadBlockRows(tile_len); 00448 00449 cuda_global_inplaceClamp<<<blockGridRows,threadBlockRows>>>(ptr,cmin,cmax,tile_len,sz); 00450 } 00451 00452 void cuda_c_inplaceNormalize(float *src, const float *omin, const float *omax, const float nmin, const float nmax, const int tile_len, const int sz) 00453 { 00454 dim3 blockGridRows(iDivUp(sz,tile_len)); 00455 dim3 threadBlockRows(tile_len); 00456 cuda_global_inplaceNormalize<<<blockGridRows,threadBlockRows>>>(src,omin,omax,nmin,nmax,tile_len,sz); 00457 } 00458 00459 void cuda_c_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz) 00460 { 00461 dim3 blockGridRows(iDivUp(sz,tile_len)); 00462 dim3 threadBlockRows(tile_len); 00463 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00464 const float *in = src; 00465 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00466 { 00467 cuda_global_getMin<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz); 00468 in=buf; 00469 } 00470 } 00471 00472 00473 void cuda_c_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz) 00474 { 00475 dim3 blockGridRows(iDivUp(sz,tile_len)); 00476 dim3 threadBlockRows(tile_len); 00477 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00478 const float *in = src; 00479 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00480 { 00481 cuda_global_getMax<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz); 00482 in=buf; 00483 } 00484 } 00485 00486 void cuda_c_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz) 00487 { 00488 dim3 blockGridRows(iDivUp(sz,tile_len)); 00489 dim3 threadBlockRows(tile_len); 00490 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00491 const float *in = src; 00492 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00493 { 00494 cuda_global_getAvg<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz); 00495 in=buf; 00496 } 00497 } 00498 00499 void cuda_c_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz) 00500 { 00501 dim3 blockGridRows(iDivUp(sz,tile_len)); 00502 dim3 threadBlockRows(tile_len); 00503 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00504 const float *in = src; 00505 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00506 { 00507 cuda_global_getSum<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz); 00508 in=buf; 00509 } 00510 } 00511 00512 void cuda_c_squared(const float *im, float *res, const int tile_len, const int sz) 00513 { 00514 dim3 blockGridRows(iDivUp(sz,tile_len)); 00515 dim3 threadBlockRows(tile_len); 00516 cuda_global_squared<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz); 00517 } 00518 00519 void cuda_c_sqrt(const float *im, float *res, const int tile_len, const int sz) 00520 { 00521 dim3 blockGridRows(iDivUp(sz,tile_len)); 00522 dim3 threadBlockRows(tile_len); 00523 cuda_global_sqrt<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz); 00524 } 00525 00526 00527 void cuda_c_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz) 00528 { 00529 dim3 blockGridRows(iDivUp(sz,tile_len)); 00530 dim3 threadBlockRows(tile_len); 00531 cuda_global_quadEnergy<<<blockGridRows,threadBlockRows>>>(real,imag,out,tile_len,sz); 00532 } 00533 00534 void cuda_c_inplaceAttenuateBorders(float *im, int borderSize, int tile_len, int w, int h) 00535 { 00536 dim3 xBlock(iDivUp(w,tile_len),borderSize*2); 00537 dim3 xThread(tile_len); 00538 cuda_global_inplaceAttenuateBorders_x<<<xBlock,xThread>>>(im,borderSize,tile_len,w,h); 00539 dim3 yBlock(borderSize*2,iDivUp(h,tile_len)); 00540 dim3 yThread(1,tile_len); 00541 cuda_global_inplaceAttenuateBorders_y<<<yBlock,yThread>>>(im,borderSize,tile_len,w,h); 00542 } 00543 00544 void cuda_c_findMax(const float *src, float *buf, int *loc, const int tile_len, const int sz) 00545 { 00546 dim3 blockGridRows(iDivUp(sz,tile_len)); 00547 dim3 threadBlockRows(tile_len); 00548 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00549 const float *in = src; 00550 const int *inLoc = NULL; 00551 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00552 { 00553 cuda_global_findMax<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz); 00554 in=buf; 00555 inLoc=loc; 00556 } 00557 } 00558 00559 void cuda_c_findMin(const float *src, float *buf, int *loc, const int tile_len, const int sz) 00560 { 00561 dim3 blockGridRows(iDivUp(sz,tile_len)); 00562 dim3 threadBlockRows(tile_len); 00563 // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls) 00564 const float *in = src; 00565 const int *inLoc = NULL; 00566 for(int cur_sz=sz; cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len)) 00567 { 00568 cuda_global_findMin<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz); 00569 in=buf; 00570 inLoc=loc; 00571 } 00572 } 00573 00574 00575 00576 void cuda_c_dogFilterHmax(float *dest, const float theta, const float gamma, const int size, const float div, const int tile_width, const int tile_height) 00577 { 00578 dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height)); 00579 dim3 threadBlockRows(tile_width,tile_height); 00580 cuda_global_dogFilterHmax<<<blockGridRows,threadBlockRows>>>(dest,theta,gamma,size,div,tile_width,tile_height); 00581 } 00582 00583 void cuda_c_dogFilter(float *dest, float stddev, float theta, int half_size, int size, int tile_width, int tile_height) 00584 { 00585 dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height)); 00586 dim3 threadBlockRows(tile_width,tile_height); 00587 cuda_global_dogFilter<<<blockGridRows,threadBlockRows>>>(dest,stddev,theta,half_size,size,tile_width,tile_height); 00588 } 00589 00590 void cuda_c_gaborFilter3(float *kern, const float major_stddev, const float minor_stddev, 00591 const float period, const float phase, 00592 const float theta, const int size, const int tile_len, const int sz) 00593 { 00594 dim3 blockGridRows(iDivUp(sz,tile_len)); 00595 dim3 threadBlockRows(tile_len); 00596 cuda_global_gaborFilter3<<<blockGridRows,threadBlockRows>>>(kern,major_stddev,minor_stddev,period,phase,theta,size,tile_len,sz); 00597 } 00598 00599 void cuda_c_gaussian(float *res, float c, float sig22, int hw, int tile_len, int sz) 00600 { 00601 dim3 blockGridRows(iDivUp(sz,tile_len)); 00602 dim3 threadBlockRows(tile_len); 00603 cuda_global_gaussian<<<blockGridRows,threadBlockRows>>>(res,c,sig22,hw,tile_len,sz); 00604 } 00605 00606 void cuda_c_orientedFilter(const float *src, float *re, float *im, const float kx, const float ky, const float intensity, const int w, const int h, const int tile_width) 00607 { 00608 dim3 blockGridRows(iDivUp(w, tile_width), h); 00609 dim3 threadBlockRows(tile_width); 00610 cuda_global_orientedFilter<<<blockGridRows,threadBlockRows>>>(src,re,im,kx,ky,intensity,w,h,tile_width); 00611 } 00612 00613 void cuda_c_centerSurroundAbs(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width ) 00614 { 00615 int scalex = lw / sw, remx = lw - 1 - (lw % sw); 00616 int scaley = lh / sh, remy = lh - 1 - (lh % sh); 00617 dim3 blockGridRows(iDivUp(lw, tile_width), lh); 00618 dim3 threadBlockRows(tile_width); 00619 cuda_global_centerSurroundAbs<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width); 00620 } 00621 00622 void cuda_c_centerSurroundClamped(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width ) 00623 { 00624 int scalex = lw / sw, remx = lw - 1 - (lw % sw); 00625 int scaley = lh / sh, remy = lh - 1 - (lh % sh); 00626 dim3 blockGridRows(iDivUp(lw, tile_width), lh); 00627 dim3 threadBlockRows(tile_width); 00628 cuda_global_centerSurroundClamped<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width); 00629 } 00630 00631 void cuda_c_centerSurroundDirectional(const float *center, const float *surround, float *pos, float *neg, int lw, int lh, int sw, int sh, int tile_width ) 00632 { 00633 int scalex = lw / sw, remx = lw - 1 - (lw % sw); 00634 int scaley = lh / sh, remy = lh - 1 - (lh % sh); 00635 dim3 blockGridRows(iDivUp(lw, tile_width), lh); 00636 dim3 threadBlockRows(tile_width); 00637 cuda_global_centerSurroundDirectional<<<blockGridRows,threadBlockRows>>>(center,surround,pos,neg,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width); 00638 } 00639 00640 void cuda_c_centerSurroundAbsAttenuate(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int attBorder, int tile_width, int tile_height) 00641 { 00642 int scalex = lw / sw, remx = lw - 1 - (lw % sw); 00643 int scaley = lh / sh, remy = lh - 1 - (lh % sh); 00644 dim3 blockGridRows(iDivUp(lw, tile_width), iDivUp(lh,tile_height)); 00645 dim3 threadBlockRows(tile_width,tile_height); 00646 cuda_global_centerSurroundAbsAttenuate<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,attBorder,scalex,scaley,remx,remy,tile_width,tile_height); 00647 } 00648 00649 void cuda_c_spatialPoolMax(const float *src, float *res, float *buf1, float *buf2, const int src_w_in, const int src_h_in, const int skip_w_in, const int skip_h_in, 00650 const int reg_w_in, const int reg_h_in, int tile_width_in, int tile_height_in) 00651 { 00652 int reg_w = reg_w_in; int reg_h = reg_h_in; 00653 int skip_w = skip_w_in; int skip_h = skip_h_in; 00654 int src_w = src_w_in; int src_h = src_h_in; 00655 int tilesperregion_w, tilesperregion_h; 00656 int tile_width, tile_height; 00657 const int orig_tile_size = tile_width_in*tile_height_in; 00658 const float *in = src; 00659 float *out = buf1; 00660 float *next = buf2; 00661 // Run the kernel recursively until the region is smaller than the tile 00662 do 00663 { 00664 // Modify the tile size to optimize based on the region 00665 if(reg_w < orig_tile_size) 00666 { 00667 tile_width = reg_w; 00668 // Minimum of the region width or the remaining tile dimension 00669 tile_height = MIN(orig_tile_size/tile_width,reg_h); 00670 } 00671 else if(reg_h < orig_tile_size) 00672 { 00673 tile_height = reg_h; 00674 // Minimum of the region width or the remaining tile dimension 00675 tile_width = MIN(orig_tile_size/tile_height,reg_w); 00676 } 00677 else 00678 { 00679 tile_width = tile_width_in; 00680 tile_height = tile_height_in; 00681 } 00682 tilesperregion_w = iDivUp(reg_w,tile_width); 00683 tilesperregion_h = iDivUp(reg_h,tile_height); 00684 // If this is the last time through, set the output to be the result 00685 if(tilesperregion_w == 1 && tilesperregion_h == 1) 00686 out = res; 00687 int num_blocks_w = iDivUp(src_w, skip_w)*tilesperregion_w; 00688 int num_blocks_h = iDivUp(src_h, skip_h)*tilesperregion_h; 00689 dim3 blockGridRows(num_blocks_w,num_blocks_h); 00690 dim3 threadBlockRows(tile_width,tile_height); 00691 //printf("tile_w:%d _h:%d, reg_w:%d _h:%d, skip_w:%d _h:%d, src_w:%d _h:%d\n", 00692 // tile_width,tile_height,reg_w,reg_h,skip_w,skip_h,src_w,src_h); 00693 cuda_global_spatialPoolMax<<<blockGridRows,threadBlockRows,tile_width*tile_height*sizeof(float)>>>(in,out,src_w,src_h,skip_w,skip_h,reg_w,reg_h,tile_width,tile_height); 00694 src_w = num_blocks_w; 00695 src_h = num_blocks_h; 00696 reg_w = tilesperregion_w; 00697 reg_h = tilesperregion_h; 00698 skip_w = reg_w; 00699 skip_h = reg_h; 00700 // Update the location of the input 00701 in = out; 00702 out = next; 00703 next = (buf1 == in) ? buf1 : buf2; 00704 } while(tilesperregion_w > 1 || tilesperregion_h > 1); 00705 00706 } 00707 00708 // Number of values requested must be divisible by MT_RNG_COUNT 00709 00710 void cuda_c_randomMT(float *d_Random, int numVals, int tile_len) 00711 { 00712 if(numVals % MT_RNG_COUNT != 0) 00713 { 00714 printf("ERROR!!!: Cannot request a set of random numbers that is not in MT_RNG_COUNT [%d]units\n",MT_RNG_COUNT); 00715 return; 00716 } 00717 int NPerRng = numVals/MT_RNG_COUNT; 00718 dim3 blockGridColumns(NPerRng,1); 00719 dim3 threadBlockColumns(tile_len,1); 00720 cuda_global_randomMT<<<blockGridColumns,threadBlockColumns>>>(d_Random,NPerRng); 00721 } 00722 00723 void cuda_c_inplaceAddBGnoise2(float *in, float *rnd, const int brd_siz, const float range, int w, int h, int tile_len) 00724 { 00725 dim3 blockGridColumns(iDivUp(w,tile_len),h); 00726 dim3 threadBlockColumns(tile_len,1); 00727 cuda_global_inplaceAddBGnoise2<<<blockGridColumns,threadBlockColumns>>>(in,rnd,brd_siz,range,w,h,tile_len); 00728 } 00729 00730 void cuda_c_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h, 00731 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00732 { 00733 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00734 dim3 threadBlockColumns(tile_width,tile_height); 00735 cuda_global_convolveHmaxHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height); 00736 } 00737 00738 void cuda_c_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h, 00739 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00740 { 00741 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00742 dim3 threadBlockColumns(tile_width,tile_height); 00743 cuda_global_convolveZeroHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height); 00744 } 00745 00746 void cuda_c_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h, 00747 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00748 { 00749 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00750 dim3 threadBlockColumns(tile_width,tile_height); 00751 cuda_global_convolveCleanHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height); 00752 } 00753 00754 void cuda_c_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 00755 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00756 { 00757 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00758 dim3 threadBlockColumns(tile_width,tile_height); 00759 cuda_global_convolveHmaxHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height); 00760 } 00761 00762 void cuda_c_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 00763 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00764 { 00765 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00766 dim3 threadBlockColumns(tile_width,tile_height); 00767 cuda_global_convolveZeroHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height); 00768 } 00769 00770 void cuda_c_optConvolve(float *res, const float *src, const int src_w, const int src_h, 00771 const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height) 00772 { 00773 dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height)); 00774 dim3 threadBlockColumns(tile_width,tile_height); 00775 cuda_global_optConvolve<<<blockGridColumns,threadBlockColumns,fil_w*fil_h*sizeof(float)>>>(res,src,src_w,src_h,f,fil_w,fil_h,tile_width,tile_height); 00776 } 00777 00778 void cuda_c_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00779 { 00780 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00781 dim3 threadBlockColumns(tile_len,1); 00782 cuda_global_xFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00783 } 00784 00785 void cuda_c_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00786 { 00787 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00788 dim3 threadBlockColumns(tile_len,1); 00789 cuda_global_xFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00790 } 00791 00792 void cuda_c_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00793 { 00794 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00795 dim3 threadBlockColumns(tile_len,1); 00796 cuda_global_xFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00797 } 00798 00799 void cuda_c_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00800 { 00801 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00802 dim3 threadBlockColumns(1,tile_len); 00803 cuda_global_yFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00804 } 00805 00806 void cuda_c_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00807 { 00808 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00809 dim3 threadBlockColumns(1,tile_len); 00810 cuda_global_yFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00811 } 00812 00813 void cuda_c_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len) 00814 { 00815 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00816 dim3 threadBlockColumns(1,tile_len); 00817 cuda_global_yFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len); 00818 } 00819 00820 void cuda_c_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00821 { 00822 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00823 dim3 threadBlockColumns(tile_len,1); 00824 cuda_global_optXFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00825 } 00826 00827 void cuda_c_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00828 { 00829 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00830 dim3 threadBlockColumns(1,tile_len); 00831 cuda_global_optYFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00832 } 00833 00834 00835 void cuda_c_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00836 { 00837 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00838 dim3 threadBlockColumns(tile_len,1); 00839 cuda_global_optXFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00840 } 00841 00842 void cuda_c_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00843 { 00844 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00845 dim3 threadBlockColumns(1,tile_len); 00846 cuda_global_optYFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00847 } 00848 00849 void cuda_c_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00850 { 00851 dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h); 00852 dim3 threadBlockColumns(tile_len,1); 00853 cuda_global_optXFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00854 } 00855 00856 void cuda_c_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00857 { 00858 dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len)); 00859 dim3 threadBlockColumns(1,tile_len); 00860 cuda_global_optYFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len); 00861 } 00862 00863 00864 void cuda_2_debayer(float *src,float3_t *dptr,int w, int h, int tile_width, int tile_height) 00865 { 00866 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00867 dim3 threadBlockRows(tile_width,tile_height); 00868 cuda_kernel_debayer<<<blockGridRows,threadBlockRows>>>(src,dptr,w,h,tile_width,tile_height); 00869 } 00870 00871 void cuda_c_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height) 00872 { 00873 dim3 blockGridRows(iDivUp(srcw,tile_width),iDivUp(srch,tile_height)); 00874 dim3 threadBlockRows(tile_width,tile_height); 00875 cuda_global_crop<<<blockGridRows,threadBlockRows>>>(src,res,srcw,srch,startx,starty,endx,endy,maxx,maxy,tile_width,tile_height); 00876 } 00877 00878 void cuda_c_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height) 00879 { 00880 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00881 dim3 threadBlockRows(tile_width,tile_height); 00882 cuda_global_shiftImage<<<blockGridRows,threadBlockRows,(tile_width*tile_height+tile_height+tile_width+1)*sizeof(float)>>>(src,dst,w,h,deltax,deltay,tile_width,tile_height); 00883 } 00884 00885 void cuda_c_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00886 { 00887 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height)); 00888 dim3 threadBlockRows(tile_width,tile_height); 00889 cuda_global_inplacePaste<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height); 00890 } 00891 00892 void cuda_c_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00893 { 00894 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height)); 00895 dim3 threadBlockRows(tile_width,tile_height); 00896 cuda_global_inplacePasteRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height); 00897 } 00898 00899 void cuda_c_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00900 { 00901 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height)); 00902 dim3 threadBlockRows(tile_width,tile_height); 00903 cuda_global_inplaceOverlay<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height); 00904 } 00905 00906 void cuda_c_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height) 00907 { 00908 dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height)); 00909 dim3 threadBlockRows(tile_width,tile_height); 00910 cuda_global_inplaceOverlayRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height); 00911 } 00912 00913 void cuda_c_inertiaMap(float_t *dst, float s, float r_inv, int px, int py, int tile_width, int tile_height, int w, int h) 00914 { 00915 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00916 dim3 threadBlockRows(tile_width,tile_height); 00917 cuda_global_inertiaMap<<<blockGridRows,threadBlockRows>>>(dst,s,r_inv,px,py,tile_width,tile_height,w,h); 00918 } 00919 00920 void cuda_c_inhibitionMap(float *dst, float factorOld, float factorNew, float radius, int px, int py, int tile_width, int tile_height, int w, int h) 00921 { 00922 dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height)); 00923 dim3 threadBlockRows(tile_width,tile_height); 00924 cuda_global_inhibitionMap<<<blockGridRows,threadBlockRows>>>(dst,factorOld,factorNew,radius,px,py,tile_width,tile_height,w,h); 00925 } 00926 00927 00928