wrap_c_cuda.cu

00001 // Serves as a link to shared memory location on CUDA device
00002 extern __shared__ float shared_data[];
00003 
00004 #include "wrap_c_cuda.h"
00005 #include <cuda_runtime.h>
00006 #include "cuda_mersennetwisterkernel.h"
00007 #include "cuda_colorops.h"
00008 #include "cuda_lowpass.h"
00009 #include "cuda_mathops.h"
00010 #include "cuda_kernels.h"
00011 #include "cuda_shapeops.h"
00012 #include "cuda_drawops.h"
00013 #include "cuda_filterops.h"
00014 #include "cuda_transforms.h"
00015 #include "cuda_convolutions.h"
00016 #include "cuda_saliencyops.h"
00017 #include <stdio.h>
00018 #include "cuda_debayer.h"
00019 #include "cuda_cutpaste.h"
00020 // Externally callable host functions that instructs the device to call the appropriate CUDA function
00021 
00022 // ######################################################################
00023 void cuda_c_getRGBY(const float3_t *src, float *rgptr, float *byptr, const float thresh, 
00024                     const float min_range, const float max_range,
00025                     const int w, const int h, const int tile_width, const int tile_height)
00026 {
00027   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00028   dim3 threadBlockRows(tile_width,tile_height);
00029   cuda_global_getRGBY<<<blockGridRows,threadBlockRows>>>(src,rgptr,byptr,thresh,min_range,max_range,w,h,tile_width,tile_height);  
00030 }
00031 
00032 void cuda_c_toRGB(float3_t *dst, const float *src,int sz, const int tile_len)
00033 {
00034   dim3 blockGridRows(iDivUp(sz,tile_len),1);
00035   dim3 threadBlockRows(tile_len,1);
00036   cuda_global_toRGB<<<blockGridRows,threadBlockRows>>>(dst,src,sz,tile_len);
00037 }
00038 
00039 // ######################################################################
00040 void cuda_c_getComponents(const float3_t *srcptr, float *rptr, float *gptr, float *bptr, int w, int h, int tile_width, int tile_height)
00041 {
00042   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00043   dim3 threadBlockRows(tile_width,tile_height);
00044   cuda_global_getComponents<<<blockGridRows,threadBlockRows>>>(srcptr,rptr,gptr,bptr,w,h,tile_width,tile_height);  
00045 }
00046 
00047 // ######################################################################
00048 void cuda_c_luminance(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height)
00049 {
00050   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00051   dim3 threadBlockRows(tile_width,tile_height);
00052   cuda_global_luminance<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height);
00053 }
00054 
00055 // ######################################################################
00056 void cuda_c_luminanceNTSC(float3_t *aptr, float *dptr, int w, int h, int tile_width, int tile_height)
00057 {
00058   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00059   dim3 threadBlockRows(tile_width,tile_height);
00060 
00061   cuda_global_luminanceNTSC<<<blockGridRows,threadBlockRows>>>(aptr,dptr,w,h,tile_width,tile_height);
00062 }
00063 
00064 // ######################################################################
00065 void cuda_c_drawFilledRect(float *dst, int top, int left, int bottom, int right, const float intensity, const int w, const int h, const int tile_width, const int tile_height)
00066 {
00067   int rect_width = right-left+1;
00068   int rect_height = bottom-top+1;
00069   dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height));
00070   dim3 threadBlockRows(tile_width,tile_height);
00071 
00072   cuda_global_drawFilledRect<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,intensity,w,h,tile_width,tile_height);
00073 }
00074 
00075 // ######################################################################
00076 void cuda_c_drawFilledRectRGB(float3_t *dst, int top, int left, int bottom, int right, const float3_t *color, const int w, const int h, const int tile_width, const int tile_height)
00077 {
00078   int rect_width = right-left+1;
00079   int rect_height = bottom-top+1;
00080   dim3 blockGridRows(iDivUp(rect_width,tile_width),iDivUp(rect_height,tile_height));
00081   dim3 threadBlockRows(tile_width,tile_height);
00082   cuda_global_drawFilledRectRGB<<<blockGridRows,threadBlockRows>>>(dst,top,left,bottom,right,color->p[0],color->p[1],color->p[2],w,h,tile_width,tile_height);
00083 }
00084 
00085 // ######################################################################
00086 void cuda_c_lowpass_5_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00087 {
00088   dim3 blockGridRows(iDivUp(w, tile_width), h);
00089   dim3 threadBlockRows(tile_width);
00090 
00091   cuda_global_lowpass_5_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00092     (src, w, h, dst, tile_width);
00093 }
00094 
00095 // ######################################################################
00096 void cuda_c_lowpass_5_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00097 {
00098   dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00099   dim3 threadBlockColumns(tile_width, tile_height);
00100 
00101   cuda_global_lowpass_5_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00102     (src, w, h, dst, tile_width, tile_height); 
00103 }
00104 
00105 // #####################################################################
00106 void cuda_c_lowpass_9_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00107 {
00108   dim3 blockGridRows(iDivUp(w, tile_width), h);
00109   dim3 threadBlockRows(tile_width);
00110   cuda_global_lowpass_9_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, tile_width);
00111 }
00112 
00113 // #####################################################################
00114 void cuda_c_lowpass_9_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00115 {
00116   dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00117   dim3 threadBlockColumns(tile_width, tile_height);
00118 
00119   cuda_global_lowpass_9_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, tile_width, tile_height);
00120 }
00121 
00122 // #####################################################################
00123 void cuda_c_lowpass_9_x_dec_x(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width)
00124 {
00125   const int dest_tile_width = tile_width>>1; // Need to reduce, so source can be stored
00126   dim3 blockGridRows(iDivUp(dw, dest_tile_width), dh);
00127   dim3 threadBlockRows(dest_tile_width);
00128   cuda_global_lowpass_9_x_dec_x<<<blockGridRows, threadBlockRows,(tile_width+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, dest_tile_width);
00129 }
00130 
00131 // #####################################################################
00132 void cuda_c_lowpass_9_y_dec_y(const float* src, const unsigned int w, const unsigned int h, float* dst, const int dw, const int dh, int tile_width, int tile_height)
00133 {
00134   const int dest_tile_height = tile_height>>1; // Need to reduce, so source can be stored
00135   dim3 blockGridColumns(iDivUp(dw, tile_width), iDivUp(dh, dest_tile_height));
00136   dim3 threadBlockColumns(tile_width, dest_tile_height);
00137   cuda_global_lowpass_9_y_dec_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+8)*sizeof(float)>>>(src, w, h, dst, dw, dh, tile_width, dest_tile_height);
00138 }
00139 
00140 // ######################################################################
00141 void cuda_c_lowpass_5_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00142 {
00143   dim3 blockGridRows(iDivUp(w, tile_width), h);
00144   dim3 threadBlockRows(tile_width);
00145 
00146   cuda_global_lowpass_5_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00147     (src, w, h, dst, tile_width);
00148 }
00149 
00150 // ######################################################################
00151 void cuda_c_lowpass_5_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00152 {
00153   dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00154   dim3 threadBlockColumns(tile_width, tile_height);
00155 
00156   cuda_global_lowpass_5_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00157     (src, w, h, dst, tile_width, tile_height); 
00158 }
00159 
00160 // ######################################################################
00161 void cuda_c_lowpass_3_x(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width)
00162 {
00163   dim3 blockGridRows(iDivUp(w, tile_width), h);
00164   dim3 threadBlockRows(tile_width);
00165 
00166   cuda_global_lowpass_3_x<<<blockGridRows, threadBlockRows,(tile_width+4)*sizeof(float)>>>
00167     (src, w, h, dst, tile_width);
00168 }
00169 
00170 // ######################################################################
00171 void cuda_c_lowpass_3_y(const float* src, const unsigned int w, const unsigned int h, float* dst, int tile_width, int tile_height)
00172 {
00173   dim3 blockGridColumns(iDivUp(w, tile_width), iDivUp(h, tile_height));
00174   dim3 threadBlockColumns(tile_width, tile_height);
00175 
00176   cuda_global_lowpass_3_y<<<blockGridColumns, threadBlockColumns,tile_width*(tile_height+4)*sizeof(float)>>>
00177     (src, w, h, dst, tile_width, tile_height); 
00178 }
00179 
00180 // ######################################################################
00181 void cuda_c_dec_xy(const float *src,  float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00182 {
00183   dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h/y_factor);
00184   dim3 threadBlockRows(tile_width);
00185   cuda_global_dec_xy<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,y_factor,w,h,tile_width);
00186 }
00187 
00188 // ######################################################################
00189 void cuda_c_dec_x(const float *src,  float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width)
00190 {
00191   dim3 blockGridRows(iDivUp(w/x_factor, tile_width), h);
00192   dim3 threadBlockRows(tile_width);
00193   cuda_global_dec_x<<<blockGridRows, threadBlockRows>>>(src,dst,x_factor,w,h,tile_width);
00194 }
00195 
00196 // ######################################################################
00197 void cuda_c_dec_y(const float *src,  float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
00198 {
00199   dim3 blockGridRows(iDivUp(w, tile_width), h/y_factor);
00200   dim3 threadBlockRows(tile_width);
00201   cuda_global_dec_y<<<blockGridRows, threadBlockRows>>>(src,dst,y_factor,w,h,tile_width);
00202 }
00203 
00204 void cuda_c_quickLocalAvg(const float *in, float *res, float fac, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00205 {
00206   int scalex = lw / sw, scaley = lh / sh; // in case the image is very small
00207   // IMPORTANT!  remx is JUST the remainder, unlike the CPU version, which is remx=lw-1-(lh%sh)
00208   int remx = (lw % sw), remy = (lh % sh);
00209 
00210   dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00211   dim3 threadBlockColumns(tile_width, tile_height);  
00212   cuda_global_quickLocalAvg<<<blockGridColumns,threadBlockColumns>>>(in,res,fac,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height);
00213 }
00214 
00215 void cuda_c_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00216 {
00217   dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00218   dim3 threadBlockColumns(tile_width, tile_height);  
00219   cuda_global_quickLocalAvg2x2<<<blockGridColumns,threadBlockColumns>>>(in,res,lw,lh,sw,sh,tile_width,tile_height);
00220 }
00221 
00222 void cuda_c_quickLocalMax(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
00223 {
00224   int scalex = lw / sw, scaley = lh / sh;  // in case the image is very small
00225   int remx = lw - 1 - (lw % sw), remy = lh - 1 - (lh % sh);
00226 
00227   dim3 blockGridColumns(iDivUp(sw, tile_width), iDivUp(sh, tile_height));
00228   dim3 threadBlockColumns(tile_width, tile_height);  
00229   cuda_global_quickLocalMax<<<blockGridColumns,threadBlockColumns>>>(in,res,scalex,scaley,remx,remy,lw,lh,sw,sh,tile_width,tile_height);
00230 }
00231 
00232 
00233 void cuda_c_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00234 {
00235   dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height));
00236   dim3 threadBlockColumns(tile_width, tile_height);  
00237   cuda_global_rescaleBilinear<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height);
00238 }
00239 
00240 void cuda_c_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
00241 {
00242   dim3 blockGridColumns(iDivUp(new_w, tile_width), iDivUp(new_h, tile_height));
00243   dim3 threadBlockColumns(tile_width, tile_height);  
00244   cuda_global_rescaleBilinearRGB<<<blockGridColumns,threadBlockColumns>>>(src,res,sw,sh,orig_w,orig_h,new_w,new_h,tile_width,tile_height);
00245 }
00246 
00247 void cuda_c_clear(float *src, const float val, const int tile_len, const int sz)
00248 {
00249   dim3 blockGridRows(iDivUp(sz,tile_len));
00250   dim3 threadBlockRows(tile_len);
00251   cuda_global_clear<<<blockGridRows,threadBlockRows>>>(src,val,tile_len,sz);  
00252 }
00253 
00254 void cuda_c_abs(float *src,const int tile_len, const int sz)
00255 {
00256   dim3 blockGridRows(iDivUp(sz,tile_len));
00257   dim3 threadBlockRows(tile_len);
00258   cuda_global_abs<<<blockGridRows,threadBlockRows>>>(src,tile_len,sz);  
00259 }
00260 
00261 void cuda_c_inplaceAddScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00262 {
00263   dim3 blockGridRows(iDivUp(sz,tile_len));
00264   dim3 threadBlockRows(tile_len);
00265   cuda_global_inplaceAddScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);  
00266 }
00267 
00268 void cuda_c_inplaceSubtractScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00269 {
00270   dim3 blockGridRows(iDivUp(sz,tile_len));
00271   dim3 threadBlockRows(tile_len);
00272   cuda_global_inplaceSubtractScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);  
00273 }
00274 
00275 void cuda_c_inplaceMultiplyScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00276 {
00277   dim3 blockGridRows(iDivUp(sz,tile_len));
00278   dim3 threadBlockRows(tile_len);
00279   cuda_global_inplaceMultiplyScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);  
00280 }
00281 
00282 void cuda_c_inplaceDivideScalar(float *ptr, const float *offset, const int tile_len, const int sz)
00283 {
00284   dim3 blockGridRows(iDivUp(sz,tile_len));
00285   dim3 threadBlockRows(tile_len);
00286   cuda_global_inplaceDivideScalar<<<blockGridRows,threadBlockRows>>>(ptr,offset,tile_len,sz);  
00287 }
00288 
00289 void cuda_c_inplaceAddHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00290 {
00291   dim3 blockGridRows(iDivUp(sz,tile_len));
00292   dim3 threadBlockRows(tile_len);
00293   cuda_global_inplaceAddHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);  
00294 }
00295 
00296 void cuda_c_inplaceSubtractHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00297 {
00298   dim3 blockGridRows(iDivUp(sz,tile_len));
00299   dim3 threadBlockRows(tile_len);
00300   cuda_global_inplaceSubtractHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);  
00301 }
00302 
00303 void cuda_c_inplaceMultiplyHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00304 {
00305   dim3 blockGridRows(iDivUp(sz,tile_len));
00306   dim3 threadBlockRows(tile_len);
00307   cuda_global_inplaceMultiplyHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);  
00308 }
00309 
00310 void cuda_c_inplaceDivideHostScalar(float *ptr, const float val, const int tile_len, const int sz)
00311 {
00312   dim3 blockGridRows(iDivUp(sz,tile_len));
00313   dim3 threadBlockRows(tile_len);
00314   cuda_global_inplaceDivideHostScalar<<<blockGridRows,threadBlockRows>>>(ptr,val,tile_len,sz);  
00315 }
00316 
00317 
00318 void cuda_c_inplaceAddImages(float *im1, const float *im2, const int tile_len, const int sz)
00319 {
00320   dim3 blockGridRows(iDivUp(sz,tile_len));
00321   dim3 threadBlockRows(tile_len);
00322   cuda_global_inplaceAddImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);  
00323 }
00324 
00325 void cuda_c_inplaceSubtractImages(float *im1, const float *im2, const int tile_len, const int sz)
00326 {
00327   dim3 blockGridRows(iDivUp(sz,tile_len));
00328   dim3 threadBlockRows(tile_len);
00329   cuda_global_inplaceSubtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);  
00330 }
00331 
00332 void cuda_c_inplaceMultiplyImages(float *im1, const float *im2, const int tile_len, const int sz)
00333 {
00334   dim3 blockGridRows(iDivUp(sz,tile_len));
00335   dim3 threadBlockRows(tile_len);
00336   cuda_global_inplaceMultiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);  
00337 }
00338 
00339 void cuda_c_inplaceDivideImages(float *im1, const float *im2, const int tile_len, const int sz)
00340 {
00341   dim3 blockGridRows(iDivUp(sz,tile_len));
00342   dim3 threadBlockRows(tile_len);
00343   cuda_global_inplaceDivideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,tile_len,sz);  
00344 }
00345 
00346 void cuda_c_addImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00347 {
00348   dim3 blockGridRows(iDivUp(sz,tile_len));
00349   dim3 threadBlockRows(tile_len);
00350   cuda_global_addImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00351 }
00352 
00353 void cuda_c_subtractImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00354 {
00355   dim3 blockGridRows(iDivUp(sz,tile_len));
00356   dim3 threadBlockRows(tile_len);
00357   cuda_global_subtractImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00358 }
00359 
00360 void cuda_c_multiplyImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00361 {
00362   dim3 blockGridRows(iDivUp(sz,tile_len));
00363   dim3 threadBlockRows(tile_len);
00364   cuda_global_multiplyImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00365 }
00366 
00367 void cuda_c_divideImages(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00368 {
00369   dim3 blockGridRows(iDivUp(sz,tile_len));
00370   dim3 threadBlockRows(tile_len);
00371   cuda_global_divideImages<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00372 }
00373 
00374 void cuda_c_takeMax(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00375 {
00376   dim3 blockGridRows(iDivUp(sz,tile_len));
00377   dim3 threadBlockRows(tile_len);
00378   cuda_global_takeMax<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00379 }
00380 
00381 void cuda_c_addScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00382 {
00383   dim3 blockGridRows(iDivUp(sz,tile_len));
00384   dim3 threadBlockRows(tile_len);
00385   cuda_global_addScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00386 }
00387 
00388 void cuda_c_subtractScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00389 {
00390   dim3 blockGridRows(iDivUp(sz,tile_len));
00391   dim3 threadBlockRows(tile_len);
00392   cuda_global_subtractScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00393 }
00394 
00395 void cuda_c_multiplyScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00396 {
00397   dim3 blockGridRows(iDivUp(sz,tile_len));
00398   dim3 threadBlockRows(tile_len);
00399   cuda_global_multiplyScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00400 }
00401 
00402 void cuda_c_divideScalar(const float *im1, const float *im2, float *res, const int tile_len, const int sz)
00403 {
00404   dim3 blockGridRows(iDivUp(sz,tile_len));
00405   dim3 threadBlockRows(tile_len);
00406   cuda_global_divideScalar<<<blockGridRows,threadBlockRows>>>(im1,im2,res,tile_len,sz);  
00407 }
00408 
00409 void cuda_c_addHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00410 {
00411   dim3 blockGridRows(iDivUp(sz,tile_len));
00412   dim3 threadBlockRows(tile_len);
00413   cuda_global_addHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);  
00414 }
00415 
00416 void cuda_c_subtractHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00417 {
00418   dim3 blockGridRows(iDivUp(sz,tile_len));
00419   dim3 threadBlockRows(tile_len);
00420   cuda_global_subtractHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);  
00421 }
00422 
00423 void cuda_c_multiplyHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00424 {
00425   dim3 blockGridRows(iDivUp(sz,tile_len));
00426   dim3 threadBlockRows(tile_len);
00427   cuda_global_multiplyHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);  
00428 }
00429 
00430 void cuda_c_divideHostScalar(const float *im1, const float val, float *res, const int tile_len, const int sz)
00431 {
00432   dim3 blockGridRows(iDivUp(sz,tile_len));
00433   dim3 threadBlockRows(tile_len);
00434   cuda_global_divideHostScalar<<<blockGridRows,threadBlockRows>>>(im1,val,res,tile_len,sz);  
00435 }
00436 
00437 void cuda_c_inplaceRectify(float *ptr, const int tile_len, const int sz)
00438 {
00439   dim3 blockGridRows(iDivUp(sz,tile_len));
00440   dim3 threadBlockRows(tile_len);
00441   cuda_global_inplaceRectify<<<blockGridRows,threadBlockRows>>>(ptr,tile_len,sz);
00442 }
00443 
00444 void cuda_c_inplaceClamp(float *ptr, const float cmin, const float cmax, const int tile_len, const int sz)
00445 {
00446   dim3 blockGridRows(iDivUp(sz,tile_len));
00447   dim3 threadBlockRows(tile_len);
00448 
00449   cuda_global_inplaceClamp<<<blockGridRows,threadBlockRows>>>(ptr,cmin,cmax,tile_len,sz);
00450 }
00451 
00452 void cuda_c_inplaceNormalize(float *src, const float *omin, const float *omax, const float nmin, const float nmax, const int tile_len, const int sz)
00453 {
00454   dim3 blockGridRows(iDivUp(sz,tile_len));
00455   dim3 threadBlockRows(tile_len);
00456   cuda_global_inplaceNormalize<<<blockGridRows,threadBlockRows>>>(src,omin,omax,nmin,nmax,tile_len,sz);
00457 }
00458 
00459 void cuda_c_getMin(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00460 {
00461   dim3 blockGridRows(iDivUp(sz,tile_len));
00462   dim3 threadBlockRows(tile_len);
00463   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00464   const float *in = src;
00465   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00466   {
00467     cuda_global_getMin<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz);
00468     in=buf;
00469   }
00470 }
00471 
00472 
00473 void cuda_c_getMax(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00474 {
00475   dim3 blockGridRows(iDivUp(sz,tile_len));
00476   dim3 threadBlockRows(tile_len);
00477   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00478   const float *in = src;
00479   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00480   {
00481     cuda_global_getMax<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz);
00482     in=buf;
00483   }
00484 }
00485 
00486 void cuda_c_getAvg(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00487 {
00488   dim3 blockGridRows(iDivUp(sz,tile_len));
00489   dim3 threadBlockRows(tile_len);
00490   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00491   const float *in = src;
00492   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00493   {
00494     cuda_global_getAvg<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz);
00495     in=buf;
00496   }
00497 }
00498 
00499 void cuda_c_getSum(const float *src, float *dest, float *buf, const int tile_len, const int sz)
00500 {
00501   dim3 blockGridRows(iDivUp(sz,tile_len));
00502   dim3 threadBlockRows(tile_len);
00503   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00504   const float *in = src;
00505   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00506   {
00507     cuda_global_getSum<<<blockGridRows,threadBlockRows,tile_len*sizeof(float)>>>(in,dest,buf,tile_len,cur_sz,sz);
00508     in=buf;
00509   }
00510 }
00511 
00512 void cuda_c_squared(const float *im, float *res, const int tile_len, const int sz)
00513 {
00514   dim3 blockGridRows(iDivUp(sz,tile_len));
00515   dim3 threadBlockRows(tile_len);
00516   cuda_global_squared<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz);
00517 }
00518 
00519 void cuda_c_sqrt(const float *im, float *res, const int tile_len, const int sz)
00520 {
00521   dim3 blockGridRows(iDivUp(sz,tile_len));
00522   dim3 threadBlockRows(tile_len);
00523   cuda_global_sqrt<<<blockGridRows,threadBlockRows>>>(im,res,tile_len,sz);
00524 }
00525 
00526 
00527 void cuda_c_quadEnergy(const float *real, const float *imag, float *out, int tile_len, int sz)
00528 {
00529   dim3 blockGridRows(iDivUp(sz,tile_len));
00530   dim3 threadBlockRows(tile_len);
00531   cuda_global_quadEnergy<<<blockGridRows,threadBlockRows>>>(real,imag,out,tile_len,sz);
00532 }
00533 
00534 void cuda_c_inplaceAttenuateBorders(float *im, int borderSize, int tile_len, int w, int h)
00535 {
00536   dim3 xBlock(iDivUp(w,tile_len),borderSize*2);
00537   dim3 xThread(tile_len);
00538   cuda_global_inplaceAttenuateBorders_x<<<xBlock,xThread>>>(im,borderSize,tile_len,w,h);
00539   dim3 yBlock(borderSize*2,iDivUp(h,tile_len));
00540   dim3 yThread(1,tile_len);
00541   cuda_global_inplaceAttenuateBorders_y<<<yBlock,yThread>>>(im,borderSize,tile_len,w,h);
00542 }
00543 
00544 void cuda_c_findMax(const float *src, float *buf, int *loc, const int tile_len, const int sz)
00545 {
00546   dim3 blockGridRows(iDivUp(sz,tile_len));
00547   dim3 threadBlockRows(tile_len);
00548   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00549   const float *in = src;
00550   const int *inLoc = NULL;
00551   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00552   {
00553     cuda_global_findMax<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz);
00554     in=buf;
00555     inLoc=loc;
00556   }
00557 }
00558 
00559 void cuda_c_findMin(const float *src, float *buf, int *loc, const int tile_len, const int sz)
00560 {
00561   dim3 blockGridRows(iDivUp(sz,tile_len));
00562   dim3 threadBlockRows(tile_len);
00563   // This must be iteratively called, because we need to synchronize the whole grid (and that is guaranteed between kernel calls)
00564   const float *in = src;
00565   const int *inLoc = NULL;
00566   for(int cur_sz=sz;   cur_sz>1; cur_sz=IDIVUP(cur_sz,tile_len))
00567   {
00568     cuda_global_findMin<<<blockGridRows,threadBlockRows,2*tile_len*sizeof(float)>>>(in,inLoc,buf,loc,tile_len,cur_sz);
00569     in=buf;
00570     inLoc=loc;
00571   }
00572 }
00573 
00574 
00575 
00576 void cuda_c_dogFilterHmax(float *dest, const float theta, const float gamma, const int size, const float div, const int tile_width, const int tile_height)
00577 {
00578   dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height));
00579   dim3 threadBlockRows(tile_width,tile_height);
00580   cuda_global_dogFilterHmax<<<blockGridRows,threadBlockRows>>>(dest,theta,gamma,size,div,tile_width,tile_height);
00581 }
00582 
00583 void cuda_c_dogFilter(float *dest, float stddev, float theta, int half_size, int size, int tile_width, int tile_height)
00584 {
00585   dim3 blockGridRows(iDivUp(size,tile_width),iDivUp(size,tile_height));
00586   dim3 threadBlockRows(tile_width,tile_height);
00587   cuda_global_dogFilter<<<blockGridRows,threadBlockRows>>>(dest,stddev,theta,half_size,size,tile_width,tile_height);
00588 }
00589 
00590 void cuda_c_gaborFilter3(float *kern, const float major_stddev, const float minor_stddev,
00591                           const float period, const float phase,
00592                          const float theta, const int size, const int tile_len, const int sz)
00593 {
00594   dim3 blockGridRows(iDivUp(sz,tile_len));
00595   dim3 threadBlockRows(tile_len);
00596   cuda_global_gaborFilter3<<<blockGridRows,threadBlockRows>>>(kern,major_stddev,minor_stddev,period,phase,theta,size,tile_len,sz);
00597 }
00598 
00599 void cuda_c_gaussian(float *res, float c, float sig22, int hw, int tile_len, int sz)
00600 {
00601   dim3 blockGridRows(iDivUp(sz,tile_len));
00602   dim3 threadBlockRows(tile_len);
00603   cuda_global_gaussian<<<blockGridRows,threadBlockRows>>>(res,c,sig22,hw,tile_len,sz);
00604 }
00605 
00606 void cuda_c_orientedFilter(const float *src, float *re, float *im, const float kx, const float ky, const float intensity, const int w, const int h, const int tile_width)
00607 {
00608   dim3 blockGridRows(iDivUp(w, tile_width), h);
00609   dim3 threadBlockRows(tile_width);
00610   cuda_global_orientedFilter<<<blockGridRows,threadBlockRows>>>(src,re,im,kx,ky,intensity,w,h,tile_width);
00611 }
00612 
00613 void cuda_c_centerSurroundAbs(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width )
00614 {
00615   int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00616   int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00617   dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00618   dim3 threadBlockRows(tile_width);
00619   cuda_global_centerSurroundAbs<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00620 }
00621 
00622 void cuda_c_centerSurroundClamped(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int tile_width )
00623 {
00624   int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00625   int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00626   dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00627   dim3 threadBlockRows(tile_width);
00628   cuda_global_centerSurroundClamped<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00629 }
00630 
00631 void cuda_c_centerSurroundDirectional(const float *center, const float *surround, float *pos, float *neg, int lw, int lh, int sw, int sh, int tile_width )
00632 {
00633   int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00634   int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00635   dim3 blockGridRows(iDivUp(lw, tile_width), lh);
00636   dim3 threadBlockRows(tile_width);
00637   cuda_global_centerSurroundDirectional<<<blockGridRows,threadBlockRows>>>(center,surround,pos,neg,lw,lh,sw,sh,scalex,scaley,remx,remy,tile_width);
00638 }
00639 
00640 void cuda_c_centerSurroundAbsAttenuate(const float *center, const float *surround, float *res, int lw, int lh, int sw, int sh, int attBorder, int tile_width, int tile_height)
00641 {
00642   int scalex = lw / sw, remx = lw - 1 - (lw % sw);
00643   int scaley = lh / sh, remy = lh - 1 - (lh % sh);
00644   dim3 blockGridRows(iDivUp(lw, tile_width), iDivUp(lh,tile_height));
00645   dim3 threadBlockRows(tile_width,tile_height);
00646   cuda_global_centerSurroundAbsAttenuate<<<blockGridRows,threadBlockRows>>>(center,surround,res,lw,lh,sw,sh,attBorder,scalex,scaley,remx,remy,tile_width,tile_height);
00647 }
00648 
00649 void cuda_c_spatialPoolMax(const float *src, float *res, float *buf1, float *buf2, const int src_w_in, const int src_h_in, const int skip_w_in, const int skip_h_in,  
00650                            const int reg_w_in, const int reg_h_in, int tile_width_in, int tile_height_in)
00651 {
00652   int reg_w = reg_w_in;  int reg_h = reg_h_in;
00653   int skip_w = skip_w_in;  int skip_h = skip_h_in;
00654   int src_w = src_w_in; int src_h = src_h_in;
00655   int tilesperregion_w, tilesperregion_h;
00656   int tile_width, tile_height;
00657   const int orig_tile_size = tile_width_in*tile_height_in;
00658   const float *in = src;
00659   float *out = buf1;
00660   float *next = buf2;
00661   // Run the kernel recursively until the region is smaller than the tile
00662   do
00663     {
00664       // Modify the tile size to optimize based on the region
00665       if(reg_w < orig_tile_size)
00666         {
00667           tile_width = reg_w;
00668           // Minimum of the region width or the remaining tile dimension
00669           tile_height = MIN(orig_tile_size/tile_width,reg_h);
00670         }
00671       else if(reg_h < orig_tile_size)
00672         {
00673           tile_height = reg_h;
00674           // Minimum of the region width or the remaining tile dimension
00675           tile_width = MIN(orig_tile_size/tile_height,reg_w);
00676         }
00677       else
00678         {
00679           tile_width = tile_width_in;
00680           tile_height = tile_height_in;
00681         }
00682       tilesperregion_w = iDivUp(reg_w,tile_width);
00683       tilesperregion_h = iDivUp(reg_h,tile_height);
00684       // If this is the last time through, set the output to be the result
00685       if(tilesperregion_w == 1 && tilesperregion_h == 1)
00686         out = res;
00687       int num_blocks_w = iDivUp(src_w, skip_w)*tilesperregion_w;
00688       int num_blocks_h = iDivUp(src_h, skip_h)*tilesperregion_h;
00689       dim3 blockGridRows(num_blocks_w,num_blocks_h);
00690       dim3 threadBlockRows(tile_width,tile_height);
00691       //printf("tile_w:%d _h:%d, reg_w:%d _h:%d, skip_w:%d _h:%d, src_w:%d _h:%d\n",
00692       //     tile_width,tile_height,reg_w,reg_h,skip_w,skip_h,src_w,src_h);
00693       cuda_global_spatialPoolMax<<<blockGridRows,threadBlockRows,tile_width*tile_height*sizeof(float)>>>(in,out,src_w,src_h,skip_w,skip_h,reg_w,reg_h,tile_width,tile_height);
00694       src_w = num_blocks_w;
00695       src_h = num_blocks_h;
00696       reg_w = tilesperregion_w;
00697       reg_h = tilesperregion_h;
00698       skip_w = reg_w;
00699       skip_h = reg_h;
00700       // Update the location of the input
00701       in = out;
00702       out = next;
00703       next = (buf1 == in) ? buf1 : buf2;
00704     } while(tilesperregion_w > 1 || tilesperregion_h > 1);
00705 
00706 }
00707 
00708 // Number of values requested must be divisible by MT_RNG_COUNT
00709 
00710 void cuda_c_randomMT(float *d_Random, int numVals, int tile_len)
00711 {
00712   if(numVals % MT_RNG_COUNT != 0)
00713   {
00714     printf("ERROR!!!: Cannot request a set of random numbers that is not in MT_RNG_COUNT [%d]units\n",MT_RNG_COUNT);
00715     return;
00716   }
00717   int NPerRng = numVals/MT_RNG_COUNT;
00718   dim3 blockGridColumns(NPerRng,1);
00719   dim3 threadBlockColumns(tile_len,1);
00720   cuda_global_randomMT<<<blockGridColumns,threadBlockColumns>>>(d_Random,NPerRng);
00721 }
00722 
00723 void cuda_c_inplaceAddBGnoise2(float *in, float *rnd, const int brd_siz, const float range, int w, int h, int tile_len)
00724 {
00725   dim3 blockGridColumns(iDivUp(w,tile_len),h);
00726   dim3 threadBlockColumns(tile_len,1);
00727   cuda_global_inplaceAddBGnoise2<<<blockGridColumns,threadBlockColumns>>>(in,rnd,brd_siz,range,w,h,tile_len);
00728 }
00729 
00730 void cuda_c_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h, 
00731                              const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00732 {
00733   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00734   dim3 threadBlockColumns(tile_width,tile_height);
00735   cuda_global_convolveHmaxHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00736 }
00737 
00738 void cuda_c_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h, 
00739                              const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00740 {
00741   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00742   dim3 threadBlockColumns(tile_width,tile_height);
00743   cuda_global_convolveZeroHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00744 }
00745 
00746 void cuda_c_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h, 
00747                              const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00748 {
00749   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00750   dim3 threadBlockColumns(tile_width,tile_height);
00751   cuda_global_convolveCleanHelper<<<blockGridColumns,threadBlockColumns,Nx*Ny*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00752 }
00753 
00754 void cuda_c_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 
00755                              const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00756 {
00757   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00758   dim3 threadBlockColumns(tile_width,tile_height);
00759   cuda_global_convolveHmaxHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00760 }
00761 
00762 void cuda_c_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 
00763                              const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height)
00764 {
00765   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00766   dim3 threadBlockColumns(tile_width,tile_height);
00767   cuda_global_convolveZeroHelperOptimized<<<blockGridColumns,threadBlockColumns,(Nx*Ny+(Nx+tile_width)*(Ny+tile_height))*sizeof(float)>>>(res,src,src_w,src_h,f,Nx,Ny,tile_width,tile_height);
00768 }
00769 
00770 void cuda_c_optConvolve(float *res, const float *src, const int src_w, const int src_h, 
00771                              const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height)
00772 {
00773   dim3 blockGridColumns(iDivUp(src_w,tile_width),iDivUp(src_h,tile_height));
00774   dim3 threadBlockColumns(tile_width,tile_height);
00775   cuda_global_optConvolve<<<blockGridColumns,threadBlockColumns,fil_w*fil_h*sizeof(float)>>>(res,src,src_w,src_h,f,fil_w,fil_h,tile_width,tile_height);
00776 }
00777 
00778 void cuda_c_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00779 {
00780   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00781   dim3 threadBlockColumns(tile_len,1);
00782   cuda_global_xFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00783 }
00784 
00785 void cuda_c_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00786 {
00787   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00788   dim3 threadBlockColumns(tile_len,1);
00789   cuda_global_xFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00790 }
00791 
00792 void cuda_c_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00793 {
00794   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00795   dim3 threadBlockColumns(tile_len,1);
00796   cuda_global_xFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00797 }
00798 
00799 void cuda_c_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00800 {
00801   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00802   dim3 threadBlockColumns(1,tile_len);
00803   cuda_global_yFilterZero<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00804 }
00805 
00806 void cuda_c_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00807 {
00808   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00809   dim3 threadBlockColumns(1,tile_len);
00810   cuda_global_yFilterClean<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00811 }
00812 
00813 void cuda_c_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int share_len, const int tile_len)
00814 {
00815   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00816   dim3 threadBlockColumns(1,tile_len);
00817   cuda_global_yFilterReplicate<<<blockGridColumns,threadBlockColumns,share_len*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,share_len,tile_len);
00818 }
00819 
00820 void cuda_c_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00821 {
00822   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00823   dim3 threadBlockColumns(tile_len,1);
00824   cuda_global_optXFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00825 }
00826 
00827 void cuda_c_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00828 {
00829   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00830   dim3 threadBlockColumns(1,tile_len);
00831   cuda_global_optYFilterZero<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00832 }
00833 
00834 
00835 void cuda_c_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00836 {
00837   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00838   dim3 threadBlockColumns(tile_len,1);
00839   cuda_global_optXFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00840 }
00841 
00842 void cuda_c_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00843 {
00844   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00845   dim3 threadBlockColumns(1,tile_len);
00846   cuda_global_optYFilterClean<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00847 }
00848 
00849 void cuda_c_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00850 {
00851   dim3 blockGridColumns(iDivUp(src_w,tile_len),src_h);
00852   dim3 threadBlockColumns(tile_len,1);
00853   cuda_global_optXFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00854 }
00855 
00856 void cuda_c_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len)
00857 {
00858   dim3 blockGridColumns(src_w,iDivUp(src_h,tile_len));
00859   dim3 threadBlockColumns(1,tile_len);
00860   cuda_global_optYFilterReplicate<<<blockGridColumns,threadBlockColumns,(tile_len+hfs*2)*sizeof(float)>>>(res,src,src_w,src_h,f,hfs,tile_len);
00861 }
00862 
00863 
00864 void cuda_2_debayer(float *src,float3_t *dptr,int w, int h, int tile_width, int tile_height)
00865 {
00866   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00867   dim3 threadBlockRows(tile_width,tile_height);
00868   cuda_kernel_debayer<<<blockGridRows,threadBlockRows>>>(src,dptr,w,h,tile_width,tile_height);
00869 }
00870 
00871 void cuda_c_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height)
00872 {
00873   dim3 blockGridRows(iDivUp(srcw,tile_width),iDivUp(srch,tile_height));
00874   dim3 threadBlockRows(tile_width,tile_height);
00875   cuda_global_crop<<<blockGridRows,threadBlockRows>>>(src,res,srcw,srch,startx,starty,endx,endy,maxx,maxy,tile_width,tile_height);
00876 }
00877 
00878 void cuda_c_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height)
00879 {
00880   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00881   dim3 threadBlockRows(tile_width,tile_height);
00882   cuda_global_shiftImage<<<blockGridRows,threadBlockRows,(tile_width*tile_height+tile_height+tile_width+1)*sizeof(float)>>>(src,dst,w,h,deltax,deltay,tile_width,tile_height);
00883 }
00884 
00885 void cuda_c_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00886 {
00887   dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00888   dim3 threadBlockRows(tile_width,tile_height);
00889   cuda_global_inplacePaste<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00890 }
00891 
00892 void cuda_c_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00893 {
00894   dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00895   dim3 threadBlockRows(tile_width,tile_height);
00896   cuda_global_inplacePasteRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00897 }
00898 
00899 void cuda_c_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00900 {
00901   dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00902   dim3 threadBlockRows(tile_width,tile_height);
00903   cuda_global_inplaceOverlay<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00904 }
00905 
00906 void cuda_c_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00907 {
00908   dim3 blockGridRows(iDivUp(iw,tile_width),iDivUp(ih,tile_height));
00909   dim3 threadBlockRows(tile_width,tile_height);
00910   cuda_global_inplaceOverlayRGB<<<blockGridRows,threadBlockRows>>>(dst,img, w,h,iw,ih,dx,dy,tile_width,tile_height);
00911 }
00912 
00913 void cuda_c_inertiaMap(float_t *dst, float s, float r_inv, int px, int py, int tile_width, int tile_height, int w, int h)
00914 {
00915   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00916   dim3 threadBlockRows(tile_width,tile_height);
00917   cuda_global_inertiaMap<<<blockGridRows,threadBlockRows>>>(dst,s,r_inv,px,py,tile_width,tile_height,w,h);
00918 }
00919 
00920 void cuda_c_inhibitionMap(float *dst, float factorOld, float factorNew, float radius, int px, int py, int tile_width, int tile_height, int w, int h)
00921 {
00922   dim3 blockGridRows(iDivUp(w,tile_width),iDivUp(h,tile_height));
00923   dim3 threadBlockRows(tile_width,tile_height);
00924   cuda_global_inhibitionMap<<<blockGridRows,threadBlockRows>>>(dst,factorOld,factorNew,radius,px,py,tile_width,tile_height,w,h);
00925 }
00926 
00927 
00928