00001 /*!@file CUDA/cuda-convolutions.h CUDA/GPU convolution methods */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/CUDA/cuda_convolutions.h $ 00035 // $Id: cuda_convolutions.h 12962 2010-03-06 02:13:53Z irock $ 00036 // 00037 00038 #ifndef CUDA_CONVOLUTIONS_H_DEFINED 00039 #define CUDA_CONVOLUTIONS_H_DEFINED 00040 00041 #include <cuda.h> 00042 #include "CUDA/cutil.h" 00043 #include "cudadefs.h" 00044 00045 00046 00047 __global__ void cuda_global_convolveHmaxHelper(float *res, const float *src, const int src_w, const int src_h, 00048 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00049 { 00050 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00051 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00052 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x; 00053 const int res_idx = IMUL(y_pos,src_w)+x_pos; 00054 00055 // Load the filter into shared memory 00056 float *data = (float *) shared_data; // size of tile_width 00057 const int tile_sz = IMUL(tile_height,tile_width); 00058 const int fil_sz = IMUL(Nx,Ny); 00059 00060 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz) 00061 { 00062 const int curld_idx = ld_idx+i; 00063 data[curld_idx] = f[curld_idx]; 00064 } 00065 __syncthreads(); 00066 00067 if(y_pos < src_h && x_pos < src_w) 00068 { 00069 const int Nx2 = (Nx - 1) / 2; 00070 const int Ny2 = (Ny - 1) / 2; 00071 float sum = 0.0F; 00072 float sumw = 0.0F; 00073 float im; 00074 for(int i=0;i<Ny;i++) 00075 { 00076 const int s_y=y_pos-Ny2+i; 00077 const int f_y=Ny-i-1; 00078 if(s_y >= 0 && s_y < src_h) 00079 { 00080 for(int j=0;j<Nx;j++) 00081 { 00082 const int s_x=x_pos-Nx2+j; 00083 const int f_x=Nx-j-1; 00084 const int flt_idx = IMUL(f_y,Nx)+f_x; 00085 const int src_idx = IMUL(s_y,src_w)+s_x; 00086 if(s_x >=0 && s_x < src_w) 00087 { 00088 im = src[src_idx]; 00089 //sum += im*data[flt_idx]; 00090 sum = __fadd_rn(__fmul_rn(im,data[flt_idx]),sum); 00091 //sumw += im*im; 00092 sumw = __fadd_rn(__fmul_rn(im,im),sumw); 00093 } 00094 } 00095 } 00096 } 00097 if(sumw > 0.0F) 00098 res[res_idx] = fabs(sum)/sqrt(sumw); 00099 else 00100 res[res_idx] = sum; 00101 } 00102 00103 } 00104 00105 00106 __global__ void cuda_global_convolveZeroHelper(float *res, const float *src, const int src_w, const int src_h, 00107 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00108 { 00109 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00110 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00111 const int ld_idx = threadIdx.y*tile_width + threadIdx.x; 00112 const int res_idx = y_pos*src_w+x_pos; 00113 00114 // Load the filter into shared memory 00115 float *data = (float *) shared_data; // size of tile_width 00116 const int tile_sz = IMUL(tile_height,tile_width); 00117 const int fil_sz = IMUL(Nx,Ny); 00118 00119 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz) 00120 { 00121 const int curld_idx = ld_idx+i; 00122 data[curld_idx] = f[curld_idx]; 00123 } 00124 00125 __syncthreads(); 00126 00127 if(y_pos < src_h && x_pos < src_w) 00128 { 00129 const int Nx2 = (Nx - 1) >>1; 00130 const int Ny2 = (Ny - 1) >>1; 00131 float sum = 0.0F; 00132 for(int i=0;i<Ny;i++) 00133 { 00134 const int s_y=y_pos-Ny2+i; 00135 const int f_y=Ny-i-1; 00136 if(s_y >= 0 && s_y < src_h) 00137 { 00138 for(int j=0;j<Nx;j++) 00139 { 00140 const int s_x=x_pos-Nx2+j; 00141 const int f_x=Nx-j-1; 00142 if(s_x >=0 && s_x < src_w) 00143 { 00144 const int flt_idx = IMUL(f_y,Nx)+f_x; 00145 const int src_idx = IMUL(s_y,src_w)+s_x; 00146 //sum += src[src_idx]*data[flt_idx]; 00147 //sum = __fmaf_rn(src[src_idx],data[flt_idx],sum); 00148 sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum); 00149 } 00150 } 00151 } 00152 } 00153 res[res_idx] = sum; 00154 } 00155 00156 } 00157 00158 __global__ void cuda_global_convolveCleanHelper(float *res, const float *src, const int src_w, const int src_h, 00159 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00160 { 00161 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00162 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00163 const int ld_idx = threadIdx.y*tile_width + threadIdx.x; 00164 const int res_idx = y_pos*src_w+x_pos; 00165 00166 // Load the filter into shared memory 00167 float *data = (float *) shared_data; // size of tile_width 00168 const int tile_sz = IMUL(tile_height,tile_width); 00169 const int fil_sz = IMUL(Nx,Ny); 00170 00171 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz) 00172 { 00173 const int curld_idx = ld_idx+i; 00174 data[curld_idx] = f[curld_idx]; 00175 } 00176 00177 __syncthreads(); 00178 00179 if(y_pos < src_h && x_pos < src_w) 00180 { 00181 const int Nx2 = (Nx - 1) / 2; 00182 const int Ny2 = (Ny - 1) / 2; 00183 float sum = 0.0F; 00184 float sumf = 0.0F; 00185 float sumw = 0.0F; 00186 for(int i=0;i<Ny;i++) 00187 { 00188 const int s_y=y_pos-Ny2+i; 00189 const int f_y=Ny-i-1; 00190 for(int j=0;j<Nx;j++) 00191 { 00192 const int s_x=x_pos-Nx2+j; 00193 const int f_x=Nx-j-1; 00194 const int flt_idx = IMUL(f_y,Nx)+f_x; 00195 const int src_idx = IMUL(s_y,src_w)+s_x; 00196 if(s_y >= 0 && s_y < src_h && s_x >=0 && s_x < src_w) 00197 { 00198 //sum += src[src_idx]*data[flt_idx]; 00199 sum = __fadd_rn(__fmul_rn(src[src_idx],data[flt_idx]),sum); 00200 sumw += data[flt_idx]; 00201 sumf += data[flt_idx]; 00202 } 00203 else 00204 { 00205 sumf += data[flt_idx]; 00206 } 00207 } 00208 } 00209 00210 res[res_idx] = sum*sumf/sumw; 00211 } 00212 00213 } 00214 00215 00216 __global__ void cuda_global_convolveHmaxHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 00217 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00218 { 00219 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00220 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00221 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x; 00222 const int res_idx = IMUL(y_pos,src_w)+x_pos; 00223 00224 // Load the filter into shared memory 00225 const int tile_sz = IMUL(tile_height,tile_width); 00226 const int fil_sz = IMUL(Nx,Ny); 00227 00228 const int Nx2 = (Nx - 1) >>1; 00229 const int Ny2 = (Ny - 1) >>1; 00230 00231 float *filt = (float *) shared_data; // size of filter: fil_sz 00232 float *data = (float *) &(filt[fil_sz]); // size of tile: (tile_width+Nx)*(tile_height+Ny); 00233 const int dw = tile_width+Nx; 00234 //const int dh = tile_height+Ny; // Not used 00235 const int shr_x = threadIdx.x+Nx2; 00236 const int shr_y = threadIdx.y+Ny2; 00237 const int shr_idx = IMUL(shr_y,dw)+shr_x; 00238 00239 // Load filter 00240 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz) 00241 { 00242 const int curld_idx = ld_idx+i; 00243 filt[curld_idx] = f[curld_idx]; 00244 } 00245 if(y_pos < src_h && x_pos < src_w) 00246 { 00247 // Load bulk of tile 00248 data[shr_idx]=src[res_idx]; 00249 const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2); 00250 const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1); 00251 const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2); 00252 const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1); 00253 // Load top border 00254 if(topBorder) 00255 data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)]; 00256 // Load bottom border 00257 if(bottomBorder) 00258 data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)]; 00259 // Load left border 00260 if(leftBorder) 00261 data[shr_idx-Nx2] = src[res_idx-Nx2]; 00262 // Load right border 00263 if(rightBorder) 00264 data[shr_idx+Nx2] = src[res_idx+Nx2]; 00265 // Load corners 00266 if(topBorder && leftBorder) 00267 data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2]; 00268 if(topBorder && rightBorder) 00269 data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2]; 00270 if(bottomBorder && leftBorder) 00271 data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2]; 00272 if(bottomBorder && rightBorder) 00273 data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2]; 00274 __syncthreads(); 00275 00276 float sum = 0.0F, sumw = 0.0F; 00277 for(int i=0;i<Ny;i++) 00278 { 00279 const int s_y=shr_y-Ny2+i; 00280 const int y_loc = y_pos-Ny2+i; 00281 const int f_y=Ny-i-1; 00282 if(y_loc >= 0 && y_loc < src_h) 00283 { 00284 for(int j=0;j<Nx;j++) 00285 { 00286 const int s_x=shr_x-Nx2+j; 00287 const int x_loc = x_pos-Nx2+j; 00288 const int f_x=Nx-j-1; 00289 if(x_loc >=0 && x_loc < src_w) 00290 { 00291 const int flt_idx = IMUL(f_y,Nx)+f_x; 00292 const int dat_idx = IMUL(s_y,dw)+s_x; 00293 //sum += im*data[flt_idx]; 00294 sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum); 00295 //sumw += im*im; 00296 sumw = __fadd_rn(__fmul_rn(data[dat_idx],data[dat_idx]),sumw); 00297 } 00298 } 00299 } 00300 } 00301 if(sumw > 0.0F) 00302 res[res_idx] = fabs(sum)/sqrt(sumw); 00303 else 00304 res[res_idx] = sum; 00305 } 00306 } 00307 00308 00309 00310 __global__ void cuda_global_convolveZeroHelperOptimized(float *res, const float *src, const int src_w, const int src_h, 00311 const float *f, const int Nx, const int Ny, const int tile_width, const int tile_height) 00312 { 00313 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00314 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00315 const int ld_idx = IMUL(threadIdx.y,tile_width) + threadIdx.x; 00316 const int res_idx = IMUL(y_pos,src_w)+x_pos; 00317 00318 // Load the filter into shared memory 00319 const int tile_sz = IMUL(tile_height,tile_width); 00320 const int fil_sz = IMUL(Nx,Ny); 00321 00322 const int Nx2 = (Nx - 1) >>1; 00323 const int Ny2 = (Ny - 1) >>1; 00324 00325 float *filt = (float *) shared_data; // size of filter: fil_sz 00326 float *data = (float *) &(filt[fil_sz]); // size of tile: (tile_width+Nx)*(tile_height+Ny); 00327 const int dw = tile_width+Nx; 00328 //const int dh = tile_height+Ny; // Not used 00329 const int shr_x = threadIdx.x+Nx2; 00330 const int shr_y = threadIdx.y+Ny2; 00331 const int shr_idx = IMUL(shr_y,dw)+shr_x; 00332 00333 // Load filter 00334 for(int i=0;ld_idx+i<fil_sz;i+=tile_sz) 00335 { 00336 const int curld_idx = ld_idx+i; 00337 filt[curld_idx] = f[curld_idx]; 00338 } 00339 if(y_pos < src_h && x_pos < src_w) 00340 { 00341 // Load bulk of tile 00342 data[shr_idx]=src[res_idx]; 00343 const bool topBorder = (y_pos > Ny2 && threadIdx.y < Ny2); 00344 const bool bottomBorder = (y_pos < src_h-Ny2 && threadIdx.y > tile_height-Ny2-1); 00345 const bool leftBorder = (x_pos > Nx2 && threadIdx.x < Nx2); 00346 const bool rightBorder = (x_pos < src_w-Nx2 && threadIdx.x > tile_width-Nx2-1); 00347 // Load top border 00348 if(topBorder) 00349 data[shr_idx-IMUL(Ny2,dw)] = src[res_idx-IMUL(Ny2,src_w)]; 00350 // Load bottom border 00351 if(bottomBorder) 00352 data[shr_idx+IMUL(Ny2,dw)] = src[res_idx+IMUL(Ny2,src_w)]; 00353 // Load left border 00354 if(leftBorder) 00355 data[shr_idx-Nx2] = src[res_idx-Nx2]; 00356 // Load right border 00357 if(rightBorder) 00358 data[shr_idx+Nx2] = src[res_idx+Nx2]; 00359 // Load corners 00360 if(topBorder && leftBorder) 00361 data[shr_idx-IMUL(Ny2,dw)-Nx2] = src[res_idx-IMUL(Ny2,src_w)-Nx2]; 00362 if(topBorder && rightBorder) 00363 data[shr_idx-IMUL(Ny2,dw)+Nx2] = src[res_idx-IMUL(Ny2,src_w)+Nx2]; 00364 if(bottomBorder && leftBorder) 00365 data[shr_idx+IMUL(Ny2,dw)-Nx2] = src[res_idx+IMUL(Ny2,src_w)-Nx2]; 00366 if(bottomBorder && rightBorder) 00367 data[shr_idx+IMUL(Ny2,dw)+Nx2] = src[res_idx+IMUL(Ny2,src_w)+Nx2]; 00368 __syncthreads(); 00369 00370 float sum = 0.0F; 00371 for(int i=0;i<Ny;i++) 00372 { 00373 const int s_y=shr_y-Ny2+i; 00374 const int y_loc = y_pos-Ny2+i; 00375 const int f_y=Ny-i-1; 00376 if(y_loc >= 0 && y_loc < src_h) 00377 { 00378 for(int j=0;j<Nx;j++) 00379 { 00380 const int s_x=shr_x-Nx2+j; 00381 const int x_loc = x_pos-Nx2+j; 00382 const int f_x=Nx-j-1; 00383 if(x_loc >=0 && x_loc < src_w) 00384 { 00385 const int flt_idx = IMUL(f_y,Nx)+f_x; 00386 const int dat_idx = IMUL(s_y,dw)+s_x; 00387 //sum += src[src_idx]*data[flt_idx]; 00388 //sum = __fmaf_rn(src[src_idx],data[flt_idx],sum); 00389 sum = __fadd_rn(__fmul_rn(data[dat_idx],filt[flt_idx]),sum); 00390 } 00391 } 00392 } 00393 } 00394 res[res_idx] = sum; 00395 } 00396 } 00397 00398 00399 00400 00401 __global__ void cuda_global_optConvolve(float *res, const float *src, const int src_w, const int src_h, 00402 const float *f, const int fil_w, const int fil_h, const int tile_width, const int tile_height) 00403 { 00404 const int y_pos = IMUL(blockIdx.y, tile_height) + threadIdx.y; 00405 const int x_pos = IMUL(blockIdx.x, tile_width) + threadIdx.x; 00406 const int ld_idx = threadIdx.y*tile_width + threadIdx.x; 00407 const int res_idx = y_pos*src_w+x_pos; 00408 00409 // Load the filter into shared memory 00410 float *data = (float *) shared_data; // size of tile_width 00411 const int tile_sz = IMUL(tile_height,tile_width); 00412 const int fil_sz = IMUL(fil_w,fil_h); 00413 00414 for(int i=0;i<fil_sz;i+=tile_sz) 00415 { 00416 const int curld_idx = ld_idx+IMUL(i,tile_sz); 00417 if(curld_idx < fil_sz) 00418 data[curld_idx] = f[curld_idx]; 00419 } 00420 __syncthreads(); 00421 00422 const int Nx2 = (fil_w - 1) / 2; 00423 const int Ny2 = (fil_h - 1) / 2; 00424 00425 //const int srow_skip = src_w-fil_w; 00426 00427 //for (int dst_y = 0; dst_y < src_h; ++dst_y) 00428 // Determine if we're safely inside the image in the y-direction: 00429 const bool y_clean = y_pos >= Ny2 && y_pos < (src_h - Nx2); 00430 00431 //for (int dst_x = 0; dst_x < src_w; ++dst_x, ++dptr) 00432 // Determine if we're safely inside the image in the x-direction: 00433 const bool x_clean = x_pos >= Nx2 && x_pos < (src_w - Nx2); 00434 00435 // Here is where we pick whether we can use the optimized inner 00436 // loop (in cases where the filter and image patch overlap 00437 // completely) or whether we must use the inner loop that can 00438 // handle boundary conditions. 00439 00440 if (x_clean && y_clean) 00441 { 00442 float dst_val = 0.0f; 00443 for(int i=0;i<fil_h;i++) 00444 { 00445 const int s_y=y_pos-Ny2+i; 00446 const int f_y=fil_h-i-1; 00447 for(int j=0;j<fil_w;j++) 00448 { 00449 const int s_x=x_pos-Nx2+j; 00450 const int f_x=fil_w-j-1; 00451 const int flt_idx = IMUL(f_y,fil_w)+f_x; 00452 const int src_idx = IMUL(s_y,src_w)+s_x; 00453 dst_val += src[src_idx]*data[flt_idx]; 00454 } 00455 } 00456 res[res_idx] = dst_val; 00457 } 00458 else if(x_pos < src_w && y_pos < src_h) 00459 { 00460 // OK, we're at an image boundary, so what do we do to make 00461 // up for the missing pixels? The approach here is to 00462 // compute the average value of the non-missing pixels, and 00463 // proceed as if the missing pixels had that value. This 00464 // minimizes the introduction of edge artifacts e.g. when 00465 // convolving with an oriented filter. 00466 float dst_val = 0.0f; 00467 float src_sum = 0.0f; 00468 int src_cnt = 0; 00469 float fil_sum_skipped = 0.0f; 00470 00471 for(int i=0;i<fil_h;i++) 00472 { 00473 const int s_y=y_pos-Ny2+i; 00474 const int f_y=fil_h-i-1; 00475 if(s_y >= 0 && s_y < src_h) 00476 { 00477 for(int j=0;j<fil_w;j++) 00478 { 00479 const int s_x=x_pos-Nx2+j; 00480 const int f_x=fil_w-j-1; 00481 const int flt_idx = IMUL(f_y,fil_w)+f_x; 00482 const int src_idx = IMUL(s_y,src_w)+s_x; 00483 if(s_x >=0 && s_x < src_w) 00484 { 00485 float src_val = src[src_idx]; 00486 dst_val += src_val*data[flt_idx]; 00487 src_sum += src_val; 00488 src_cnt++; 00489 } 00490 else 00491 { 00492 fil_sum_skipped += data[flt_idx]; 00493 } 00494 } 00495 } 00496 else 00497 { 00498 for (int f_x = fil_w-1; f_x >= 0; --f_x) 00499 fil_sum_skipped += data[IMUL(f_y,fil_w)+f_x]; 00500 } 00501 } 00502 const float src_avg = src_sum / src_cnt; 00503 res[res_idx] = dst_val + (fil_sum_skipped*src_avg); 00504 00505 00506 } 00507 00508 } 00509 00510 00511 00512 __global__ void cuda_global_xFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00513 { 00514 const int hfs2 = (hfs-1) / 2; 00515 const int y_pos = blockIdx.y; 00516 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00517 const int ld_idx = threadIdx.x; 00518 const int res_idx = y_pos*src_w+x_pos; 00519 00520 // Load the filter into shared memory 00521 float *data = (float *) shared_data; // size of share_len 00522 float sum = 0; 00523 const int numIters = IDIVUP(hfs,shr_len); 00524 // Handle the fact that the filter might be very large, and unable to fit in our shared memory size 00525 for(int iter=0;iter<numIters;iter++) 00526 { 00527 const int filt_beg = IMUL(iter,shr_len); 00528 const int filt_end = filt_beg+shr_len; 00529 const int load_max = min(filt_end,hfs)-filt_beg; 00530 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00531 { 00532 const int filt_idx = data_idx + filt_beg; 00533 data[data_idx] = f[filt_idx]; 00534 } 00535 __syncthreads(); 00536 00537 if(y_pos < src_h && x_pos < src_w) 00538 { 00539 00540 for(int j=0;j<load_max;j++) 00541 { 00542 const int s_x=x_pos+hfs2-(filt_beg+j); 00543 const int f_x=j; 00544 if(s_x >=0 && s_x < src_w) 00545 { 00546 const int src_idx = IMUL(y_pos,src_w)+s_x; 00547 sum += src[src_idx]*data[f_x]; 00548 } 00549 } 00550 00551 } 00552 // Avoid doing this synch unless we have a very large filter 00553 if(numIters > 1) 00554 __syncthreads(); 00555 00556 } 00557 if(y_pos < src_h && x_pos < src_w) 00558 { 00559 res[res_idx] = sum; 00560 } 00561 00562 } 00563 00564 __global__ void cuda_global_xFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00565 { 00566 const int hfs2 = (hfs - 1) / 2; 00567 const int y_pos = blockIdx.y; 00568 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00569 const int ld_idx = threadIdx.x; 00570 const int res_idx = y_pos*src_w+x_pos; 00571 00572 // Load the filter into shared memory 00573 float *data = (float *) shared_data; // size of tile_width 00574 float sum = 0, sumw = 0, sumf = 0; 00575 const int numIters = IDIVUP(hfs,shr_len); 00576 // Handle the fact that the filter might be very large, and unable to fit in our shared memory size 00577 for(int iter=0;iter<numIters;iter++) 00578 { 00579 const int filt_beg = IMUL(iter,shr_len); 00580 const int filt_end = filt_beg+shr_len; 00581 const int load_max = min(filt_end,hfs)-filt_beg; 00582 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00583 { 00584 const int filt_idx = data_idx + filt_beg; 00585 data[data_idx] = f[filt_idx]; 00586 } 00587 __syncthreads(); 00588 00589 if(y_pos < src_h && x_pos < src_w) 00590 { 00591 00592 for(int j=0;j<load_max;j++) 00593 { 00594 const int s_x=x_pos+hfs2-(filt_beg+j); 00595 const int f_x=j; 00596 if(s_x >=0 && s_x < src_w) 00597 { 00598 const int src_idx = IMUL(y_pos,src_w)+s_x; 00599 sum += src[src_idx]*data[f_x]; 00600 sumw += data[f_x]; 00601 } 00602 else 00603 { 00604 sumf += data[f_x]; 00605 } 00606 } 00607 } 00608 // Avoid doing this synch unless we have a very large filter 00609 if(numIters > 1) 00610 __syncthreads(); 00611 00612 } 00613 if(y_pos < src_h && x_pos < src_w) 00614 { 00615 sumf+=sumw; 00616 res[res_idx] = sum*sumf/sumw; 00617 } 00618 00619 } 00620 00621 __global__ void cuda_global_xFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00622 { 00623 const int hfs2 = (hfs - 1) / 2; 00624 const int y_pos = blockIdx.y; 00625 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00626 const int ld_idx = threadIdx.x; 00627 const int res_idx = y_pos*src_w+x_pos; 00628 00629 // Load the filter into shared memory 00630 float *data = (float *) shared_data; // size of tile_width 00631 float sum = 0; 00632 const int numIters = IDIVUP(hfs,shr_len); 00633 // Handle the fact that the filter might be very large, and unable to fit in our shared memory size 00634 for(int iter=0;iter<numIters;iter++) 00635 { 00636 const int filt_beg = IMUL(iter,shr_len); 00637 const int filt_end = filt_beg+shr_len; 00638 const int load_max = min(filt_end,hfs)-filt_beg; 00639 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00640 { 00641 const int filt_idx = data_idx + filt_beg; 00642 data[data_idx] = f[filt_idx]; 00643 } 00644 __syncthreads(); 00645 00646 if(y_pos < src_h && x_pos < src_w) 00647 { 00648 00649 for(int j=0;j<load_max;j++) 00650 { 00651 const int s_x=x_pos+hfs2-(filt_beg+j); 00652 const int f_x=j; 00653 if(s_x < 0) 00654 { 00655 sum += src[IMUL(y_pos,src_w)]*data[f_x]; 00656 } 00657 else if(s_x >= src_w) 00658 { 00659 sum += src[IMUL(y_pos+1,src_w)-1]*data[f_x]; 00660 } 00661 else 00662 { 00663 const int src_idx = IMUL(y_pos,src_w)+s_x; 00664 sum += src[src_idx]*data[f_x]; 00665 } 00666 } 00667 } 00668 // Avoid doing this synch unless we have a very large filter 00669 if(numIters > 1) 00670 __syncthreads(); 00671 00672 } 00673 if(y_pos < src_h && x_pos < src_w) 00674 { 00675 res[res_idx] = sum; 00676 } 00677 00678 00679 } 00680 00681 00682 __global__ void cuda_global_yFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00683 { 00684 const int hfs2 = (hfs - 1) / 2; 00685 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y; 00686 const int x_pos = blockIdx.x; 00687 const int ld_idx = threadIdx.y; 00688 const int res_idx = y_pos*src_w+x_pos; 00689 00690 // Load the filter into shared memory 00691 float *data = (float *) shared_data; // size of tile_width 00692 float sum = 0; 00693 const int numIters = IDIVUP(hfs,shr_len); 00694 // Handle the fact that the filter might be very large, and unable to fit in our memory size 00695 for(int iter=0;iter<numIters;iter++) 00696 { 00697 const int filt_beg = IMUL((iter),shr_len); 00698 const int filt_end = filt_beg+shr_len; 00699 const int load_max = min(filt_end,hfs)-filt_beg; 00700 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00701 { 00702 const int filt_idx = data_idx + filt_beg; 00703 data[data_idx] = f[filt_idx]; 00704 } 00705 __syncthreads(); 00706 00707 if(y_pos < src_h && x_pos < src_w) 00708 { 00709 00710 for(int j=0;j<load_max;j++) 00711 { 00712 const int s_y=y_pos+hfs2-(filt_beg+j); 00713 const int f_y=j; 00714 if(s_y >=0 && s_y < src_h) 00715 { 00716 const int src_idx = IMUL(s_y,src_w)+x_pos; 00717 sum += src[src_idx]*data[f_y]; 00718 } 00719 } 00720 } 00721 // Avoid doing this synch unless we have a very large filter 00722 if(numIters > 1) 00723 __syncthreads(); 00724 00725 } 00726 if(y_pos < src_h && x_pos < src_w) 00727 { 00728 res[res_idx] = sum; 00729 } 00730 00731 } 00732 00733 __global__ void cuda_global_yFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00734 { 00735 const int hfs2 = (hfs - 1) / 2; 00736 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y; 00737 const int x_pos = blockIdx.x; 00738 const int ld_idx = threadIdx.y; 00739 const int res_idx = y_pos*src_w+x_pos; 00740 00741 // Load the filter into shared memory 00742 float *data = (float *) shared_data; // size of tile_width 00743 float sum = 0, sumw = 0, sumf = 0; 00744 const int numIters = IDIVUP(hfs,shr_len); 00745 // Handle the fact that the filter might be very large, and unable to fit in our memory size 00746 for(int iter=0;iter<numIters;iter++) 00747 { 00748 const int filt_beg = IMUL((iter),shr_len); 00749 const int filt_end = filt_beg+shr_len; 00750 const int load_max = min(filt_end,hfs)-filt_beg; 00751 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00752 { 00753 const int filt_idx = data_idx + filt_beg; 00754 data[data_idx] = f[filt_idx]; 00755 } 00756 __syncthreads(); 00757 00758 if(y_pos < src_h && x_pos < src_w) 00759 { 00760 00761 for(int j=0;j<load_max;j++) 00762 { 00763 const int s_y=y_pos+hfs2-(filt_beg+j); 00764 const int f_y=j; 00765 if(s_y >=0 && s_y < src_h) 00766 { 00767 const int src_idx = IMUL(s_y,src_w)+x_pos; 00768 sum += src[src_idx]*data[f_y]; 00769 sumw += data[f_y]; 00770 } 00771 else 00772 { 00773 sumf += data[f_y]; 00774 } 00775 } 00776 } 00777 // Avoid doing this synch unless we have a very large filter 00778 if(numIters > 1) 00779 __syncthreads(); 00780 } 00781 if(y_pos < src_h && x_pos < src_w) 00782 { 00783 sumf+=sumw; 00784 res[res_idx] = sum*sumf/sumw; 00785 } 00786 00787 } 00788 00789 __global__ void cuda_global_yFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int shr_len, const int tile_len) 00790 { 00791 const int hfs2 = (hfs - 1) / 2; 00792 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y; 00793 const int x_pos = blockIdx.x; 00794 const int ld_idx = threadIdx.y; 00795 const int res_idx = y_pos*src_w+x_pos; 00796 00797 00798 // Load the filter into shared memory 00799 float *data = (float *) shared_data; // size of tile_width 00800 float sum = 0; 00801 const int numIters = IDIVUP(hfs,shr_len); 00802 // Handle the fact that the filter might be very large, and unable to fit in our memory size 00803 for(int iter=0;iter<numIters;iter++) 00804 { 00805 const int filt_beg = IMUL((iter),shr_len); 00806 const int filt_end = filt_beg+shr_len; 00807 const int load_max = min(filt_end,hfs)-filt_beg; 00808 for(int data_idx=ld_idx;data_idx<load_max;data_idx+=tile_len) 00809 { 00810 const int filt_idx = data_idx + filt_beg; 00811 data[data_idx] = f[filt_idx]; 00812 } 00813 __syncthreads(); 00814 00815 if(y_pos < src_h && x_pos < src_w) 00816 { 00817 for(int j=0;j<load_max;j++) 00818 { 00819 const int s_y=y_pos+hfs2-(filt_beg+j); 00820 const int f_y=j; 00821 if(s_y < 0) 00822 { 00823 sum += src[x_pos]*data[f_y]; 00824 } 00825 else if(s_y >= src_h) 00826 { 00827 sum += src[IMUL(src_h-1,src_w)+x_pos]*data[f_y]; 00828 } 00829 else 00830 { 00831 const int src_idx = IMUL(s_y,src_w)+x_pos; 00832 sum += src[src_idx]*data[f_y]; 00833 } 00834 } 00835 } 00836 // Avoid doing this synch unless we have a very large filter 00837 if(numIters > 1) 00838 __syncthreads(); 00839 } 00840 if(y_pos < src_h && x_pos < src_w) 00841 { 00842 res[res_idx] = sum; 00843 } 00844 00845 00846 } 00847 00848 00849 __global__ void cuda_global_optXFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00850 { 00851 const int hfs2 = (hfs-1) / 2; 00852 const int y_pos = blockIdx.y; 00853 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00854 const int data_idx = threadIdx.x; 00855 const int res_idx = y_pos*src_w+x_pos; 00856 const int ctr_off = hfs2; 00857 00858 // Load the filter into shared memory 00859 float *filt = (float *) shared_data; // size of hfs 00860 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 00861 00862 float sum = 0; 00863 00864 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 00865 { 00866 filt[filt_idx] = f[filt_idx]; 00867 } 00868 data[data_idx+ctr_off] = src[res_idx]; 00869 // Load early border 00870 if(data_idx < ctr_off && x_pos > ctr_off) 00871 { 00872 data[data_idx] = src[res_idx-ctr_off]; 00873 } 00874 // Load late border 00875 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off) 00876 { 00877 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off]; 00878 } 00879 __syncthreads(); 00880 00881 00882 if(y_pos < src_h && x_pos < src_w) 00883 { 00884 00885 for(int j=0;j<hfs;j++) 00886 { 00887 const int s_x=data_idx+ctr_off+hfs2-j; 00888 const int x_loc=x_pos+hfs2-j; 00889 const int f_x=j; 00890 if(x_loc >=0 && x_loc < src_w) 00891 { 00892 sum += data[s_x]*filt[f_x]; 00893 } 00894 } 00895 00896 } 00897 if(y_pos < src_h && x_pos < src_w) 00898 { 00899 res[res_idx] = sum; 00900 } 00901 00902 } 00903 00904 00905 __global__ void cuda_global_optXFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00906 { 00907 const int hfs2 = (hfs - 1) / 2; 00908 const int y_pos = blockIdx.y; 00909 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00910 const int data_idx = threadIdx.x; 00911 const int res_idx = y_pos*src_w+x_pos; 00912 const int ctr_off = hfs2; 00913 00914 // Load the filter into shared memory 00915 float *filt = (float *) shared_data; // size of hfs 00916 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 00917 00918 float sum = 0, sumw = 0, sumf = 0; 00919 00920 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 00921 { 00922 filt[filt_idx] = f[filt_idx]; 00923 } 00924 if(y_pos < src_h && x_pos < src_w) 00925 { 00926 00927 data[data_idx+ctr_off] = src[res_idx]; 00928 // Load early border 00929 if(data_idx < ctr_off && x_pos > ctr_off) 00930 { 00931 data[data_idx] = src[res_idx-ctr_off]; 00932 } 00933 // Load late border 00934 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off) 00935 { 00936 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off]; 00937 } 00938 __syncthreads(); 00939 00940 00941 for(int j=0;j<hfs;j++) 00942 { 00943 const int s_x=data_idx+ctr_off+hfs2-j; 00944 const int x_loc=x_pos+hfs2-j; 00945 const int f_x=j; 00946 if(x_loc >=0 && x_loc < src_w) 00947 { 00948 sum += data[s_x]*filt[f_x]; 00949 sumw += filt[f_x]; 00950 } 00951 else 00952 { 00953 sumf += filt[f_x]; 00954 } 00955 } 00956 sumf+=sumw; 00957 res[res_idx] = sum*sumf/sumw; 00958 } 00959 00960 } 00961 00962 00963 __global__ void cuda_global_optXFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 00964 { 00965 const int hfs2 = (hfs - 1) / 2; 00966 const int y_pos = blockIdx.y; 00967 const int x_pos = IMUL(blockIdx.x, tile_len) + threadIdx.x; 00968 const int data_idx = threadIdx.x; 00969 const int res_idx = y_pos*src_w+x_pos; 00970 const int ctr_off = hfs2; 00971 00972 // Load the filter into shared memory 00973 float *filt = (float *) shared_data; // size of hfs 00974 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 00975 00976 float sum = 0; 00977 00978 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 00979 { 00980 filt[filt_idx] = f[filt_idx]; 00981 } 00982 if(y_pos < src_h && x_pos < src_w) 00983 { 00984 00985 data[data_idx+ctr_off] = src[res_idx]; 00986 // Load early border 00987 if(data_idx < ctr_off && x_pos > ctr_off) 00988 { 00989 data[data_idx] = src[res_idx-ctr_off]; 00990 } 00991 // Load late border 00992 if(data_idx > tile_len-ctr_off-1 && x_pos < src_w-ctr_off) 00993 { 00994 data[data_idx+ctr_off+ctr_off] = src[res_idx+ctr_off]; 00995 } 00996 __syncthreads(); 00997 00998 for(int j=0;j<hfs;j++) 00999 { 01000 const int x_loc=x_pos+hfs2-j; 01001 const int f_x=j; 01002 if(x_loc < 0) 01003 { 01004 // Hold to the leftmost pixel 01005 sum += data[ctr_off]*filt[f_x]; 01006 } 01007 else if(x_loc >= src_w) 01008 { 01009 // Hold to the rightmost pixel 01010 const int max_x = src_w-IMUL(blockIdx.x, tile_len)+ctr_off-1; 01011 sum += data[max_x]*filt[f_x]; 01012 } 01013 else 01014 { 01015 const int s_x=data_idx+ctr_off+hfs2-j; 01016 sum += data[s_x]*filt[f_x]; 01017 } 01018 } 01019 01020 res[res_idx] = sum; 01021 } 01022 01023 } 01024 01025 01026 __global__ void cuda_global_optYFilterZero(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 01027 { 01028 const int hfs2 = (hfs-1) / 2; 01029 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y; 01030 const int x_pos = blockIdx.x; 01031 const int data_idx = threadIdx.y; 01032 const int res_idx = y_pos*src_w+x_pos; 01033 const int ctr_off = hfs2; 01034 01035 // Load the filter into shared memory 01036 float *filt = (float *) shared_data; // size of hfs 01037 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 01038 01039 float sum = 0; 01040 01041 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 01042 { 01043 filt[filt_idx] = f[filt_idx]; 01044 } 01045 if(y_pos < src_h && x_pos < src_w) 01046 { 01047 data[data_idx+ctr_off] = src[res_idx]; 01048 // Load early border 01049 if(data_idx < ctr_off && y_pos > ctr_off) 01050 { 01051 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)]; 01052 } 01053 // Load late border 01054 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off) 01055 { 01056 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)]; 01057 } 01058 __syncthreads(); 01059 01060 01061 for(int j=0;j<hfs;j++) 01062 { 01063 const int s_y=data_idx+ctr_off+hfs2-j; 01064 const int y_loc=y_pos+hfs2-j; 01065 const int f_y=j; 01066 if(y_loc >=0 && y_loc < src_h) 01067 { 01068 sum += data[s_y]*filt[f_y]; 01069 } 01070 } 01071 01072 res[res_idx] = sum; 01073 } 01074 01075 } 01076 01077 01078 __global__ void cuda_global_optYFilterClean(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 01079 { 01080 const int hfs2 = (hfs - 1) / 2; 01081 const int y_pos = IMUL(blockIdx.y,tile_len) + threadIdx.y; 01082 const int x_pos = blockIdx.x; 01083 const int data_idx = threadIdx.y; 01084 const int res_idx = y_pos*src_w+x_pos; 01085 const int ctr_off = hfs2; 01086 01087 // Load the filter into shared memory 01088 float *filt = (float *) shared_data; // size of hfs 01089 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 01090 01091 float sum = 0, sumw = 0, sumf = 0; 01092 01093 // Load the filter 01094 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 01095 { 01096 filt[filt_idx] = f[filt_idx]; 01097 } 01098 if(y_pos < src_h && x_pos < src_w) 01099 { 01100 // Load the bulk of the tile data 01101 data[data_idx+ctr_off] = src[res_idx]; 01102 // Load early border 01103 if(data_idx < ctr_off && y_pos > ctr_off) 01104 { 01105 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)]; 01106 } 01107 // Load late border 01108 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off) 01109 { 01110 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)]; 01111 } 01112 __syncthreads(); 01113 01114 for(int j=0;j<hfs;j++) 01115 { 01116 const int s_y=data_idx+ctr_off+hfs2-j; 01117 const int y_loc=y_pos+hfs2-j; 01118 const int f_y=j; 01119 if(y_loc >=0 && y_loc < src_h) 01120 { 01121 sum += data[s_y]*filt[f_y]; 01122 sumw += filt[f_y]; 01123 } 01124 else 01125 { 01126 sumf += filt[f_y]; 01127 } 01128 } 01129 } 01130 if(y_pos < src_h && x_pos < src_w) 01131 { 01132 sumf+=sumw; 01133 res[res_idx] = sum*sumf/sumw; 01134 } 01135 01136 } 01137 01138 __global__ void cuda_global_optYFilterReplicate(float *res, const float *src, const int src_w, const int src_h, const float *f, const int hfs, const int tile_len) 01139 { 01140 const int hfs2 = (hfs - 1) / 2; 01141 const int y_pos = IMUL(blockIdx.y, tile_len) + threadIdx.y; 01142 const int x_pos = blockIdx.x; 01143 const int data_idx = threadIdx.y; 01144 const int res_idx = y_pos*src_w+x_pos; 01145 const int ctr_off = hfs2; 01146 01147 // Load the filter into shared memory 01148 float *filt = (float *) shared_data; // size of hfs 01149 float *data = (float *) &(shared_data[hfs]); // size of tile_len+hfs-1 01150 01151 float sum = 0; 01152 01153 for(int filt_idx=data_idx;filt_idx<hfs;filt_idx+=tile_len) 01154 { 01155 filt[filt_idx] = f[filt_idx]; 01156 } 01157 if(y_pos < src_h && x_pos < src_w) 01158 { 01159 data[data_idx+ctr_off] = src[res_idx]; 01160 // Load early border 01161 if(data_idx < ctr_off && y_pos > ctr_off) 01162 { 01163 data[data_idx] = src[res_idx-IMUL(ctr_off,src_w)]; 01164 } 01165 // Load late border 01166 if(data_idx > tile_len-ctr_off-1 && y_pos < src_h-ctr_off) 01167 { 01168 data[data_idx+ctr_off+ctr_off] = src[res_idx+IMUL(ctr_off,src_w)]; 01169 } 01170 __syncthreads(); 01171 01172 for(int j=0;j<hfs;j++) 01173 { 01174 const int y_loc=y_pos+hfs2-j; 01175 const int f_y=j; 01176 if(y_loc < 0) 01177 { 01178 // Hold to the topmost pixel 01179 sum += data[ctr_off]*filt[f_y]; 01180 } 01181 else if(y_loc >= src_h) 01182 { 01183 // Hold to the bottommost pixel 01184 const int max_y = src_h-IMUL(blockIdx.y, tile_len)+ctr_off-1; 01185 sum += data[max_y]*filt[f_y]; 01186 } 01187 else 01188 { 01189 const int s_y=data_idx+ctr_off+hfs2-j; 01190 sum += data[s_y]*filt[f_y]; 01191 } 01192 } 01193 } 01194 if(y_pos < src_h && x_pos < src_w) 01195 { 01196 res[res_idx] = sum; 01197 } 01198 01199 } 01200 01201 01202 #endif