00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #ifndef CUDA_CUTPASTE_H_DEFINED
00040 #define CUDA_CUTPASTE_H_DEFINED
00041
00042 #include <cuda.h>
00043 #include "CUDA/cutil.h"
00044 #include "cudadefs.h"
00045
00046 __global__ void cuda_global_crop(const float *src, float *res, int srcw, int srch, int startx, int starty, int endx, int endy, int maxx,int maxy, int tile_width, int tile_height)
00047 {
00048 int resw = endx-startx;
00049 int resh = endy-starty;
00050 int dy = IMUL(blockIdx.y,tile_height)+threadIdx.y;
00051 int dx = IMUL(blockIdx.x,tile_width)+threadIdx.x;
00052 int res_idx = IMUL(dy,resw) + dx;
00053 int src_idx = IMUL(starty+dy,srcw) + startx+dx;
00054
00055 if(dx < resw && dy < resh && startx+dx < endx && starty+dy < endy)
00056 {
00057 if(startx+dx < maxx && starty+dy < maxy)
00058 {
00059 res[res_idx] = src[src_idx];
00060 }
00061 else
00062 {
00063 res[res_idx] = 0.0F;
00064 }
00065 }
00066 }
00067
00068 __global__ void cuda_global_shiftImage(const float *src, float *dst, int w, int h, float deltax, float deltay, int tile_width, int tile_height)
00069 {
00070
00071 float *data = (float *) shared_data;
00072 float *borderY = (float *) &data[tile_width*tile_height];
00073 float *borderX = (float *) &data[tile_width*tile_height+tile_height];
00074 const int sy = threadIdx.y;
00075 const int sx = threadIdx.x;
00076 const int sts = IMUL(blockIdx.y, tile_height);
00077
00078
00079
00080
00081 const int scs = IMUL(blockIdx.x, tile_width) + sx;
00082
00083 int smemPos = IMUL(sy, tile_width) + sx;
00084 int gmemPos = IMUL(sts + sy, w) + scs;
00085 const int ypos=sts+sy, xpos=scs;
00086 float val=0.0F;
00087
00088
00089 int xt = (int)floor(deltax);
00090 float xfrac = deltax - xt;
00091 int startx = MAX(0,xt);
00092 int endx = MIN(0,xt) + w;
00093 if (fabs(xfrac) < 1.0e-10F) xfrac = 0.0F;
00094 else endx--;
00095
00096
00097 int yt = (int)floor(deltay);
00098 float yfrac = deltay - yt;
00099 int starty = MAX(0,yt);
00100 int endy = MIN(0,yt) + h;
00101 if (fabs(yfrac) < 1.0e-10F) yfrac = 0.0F;
00102 else endy--;
00103
00104
00105
00106
00107
00108 if (xfrac > 0.0)
00109 {
00110 xfrac = 1.0 - xfrac;
00111 xt++;
00112 }
00113
00114 if (yfrac > 0.0)
00115 {
00116 yfrac = 1.0 - yfrac;
00117 yt++;
00118 }
00119
00120
00121 float tl = (1.0F - xfrac) * (1.0F - yfrac);
00122 float tr = xfrac * (1.0F - yfrac);
00123 float bl = (1.0F - xfrac) * yfrac;
00124 float br = xfrac * yfrac;
00125
00126
00127 if (xpos < w && ypos < h) {
00128
00129
00130
00131 data[smemPos] = src[gmemPos];
00132
00133 bool bot = (sy == tile_height-1 && ypos+1 < h);
00134
00135 bool tbot = (ypos+1 >= h);
00136
00137 bool rig = (sx == tile_width-1 && xpos+1 < w);
00138
00139 bool trig = (xpos+1 >= w);
00140
00141 if(rig)
00142 borderY[threadIdx.y] = src[gmemPos + 1];
00143
00144
00145 if(bot)
00146 borderX[threadIdx.x] = src[gmemPos+w];
00147
00148
00149 if(bot && rig)
00150 borderX[tile_width] = src[gmemPos+w+1];
00151
00152
00153
00154 __syncthreads();
00155
00156 int dx = xpos+xt;
00157 int dy = ypos+yt;
00158
00159 val=0;
00160 if(dy >= starty && dy < endy && dx >= startx && dx < endx)
00161 {
00162 val += data[smemPos]*tl;
00163 if(bot)
00164 {
00165 if (rig)
00166 {
00167 val += borderY[threadIdx.y]*tr;
00168 val += borderX[threadIdx.x]*bl;
00169 val += borderX[tile_width]*br;
00170 }
00171 else if(trig)
00172 {
00173 val += data[smemPos]*tr;
00174 val += borderX[threadIdx.x]*bl;
00175 val += borderX[threadIdx.x]*br;
00176 }
00177 else
00178 {
00179 val += data[smemPos+1]*tr;
00180 val += borderX[threadIdx.x]*bl;
00181 val += borderX[threadIdx.x+1]*br;
00182 }
00183 }
00184 else if(tbot)
00185 {
00186 if (rig)
00187 {
00188 val += borderY[threadIdx.y]*tr;
00189 val += data[smemPos]*bl;
00190 val += borderY[threadIdx.y]*br;
00191 }
00192 else if(trig)
00193 {
00194
00195 val += data[smemPos]*tr;
00196 val += data[smemPos]*bl;
00197 val += data[smemPos]*br;
00198 }
00199 else
00200 {
00201 val += data[smemPos+1]*tr;
00202 val += data[smemPos]*bl;
00203 val += data[smemPos+1]*br;
00204 }
00205 }
00206 else
00207 {
00208 if (rig)
00209 {
00210 val += borderY[threadIdx.y]*tr;
00211 val += data[smemPos+tile_width]*bl;
00212 val += borderY[threadIdx.y+1]*br;
00213 }
00214 else if(trig)
00215 {
00216 val += data[smemPos]*tr;
00217 val += data[smemPos+tile_width]*bl;
00218 val += data[smemPos+tile_width]*br;
00219 }
00220 else
00221 {
00222 val += data[smemPos+1]*tr;
00223 val += data[smemPos+tile_width]*bl;
00224 val += data[smemPos+tile_width+1]*br;
00225 }
00226 }
00227 }
00228
00229
00230
00231 int dmemPos = IMUL(dy,w) + dx;
00232 if(dx >= 0 && dx < w && dy >= 0 && dy < h)
00233 {
00234 dst[dmemPos] = val;
00235 }
00236 }
00237 }
00238
00239
00240 __global__ void cuda_global_inplacePaste(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00241 {
00242 int sx = threadIdx.x;
00243 int stsx = blockIdx.x*tile_width;
00244 int xpos = stsx + sx;
00245 int sy = threadIdx.y;
00246 int stsy = blockIdx.y*tile_height;
00247 int ypos = stsy + sy;
00248 int didx = (ypos+dy)*w + xpos+dx;
00249 int iidx = ypos*iw + xpos;
00250 if(xpos < iw && ypos < ih)
00251 {
00252 dst[didx] = img[iidx];
00253 }
00254 }
00255
00256
00257 __global__ void cuda_global_inplacePasteRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00258 {
00259 int sx = threadIdx.x;
00260 int stsx = blockIdx.x*tile_width;
00261 int xpos = stsx + sx;
00262 int sy = threadIdx.y;
00263 int stsy = blockIdx.y*tile_height;
00264 int ypos = stsy + sy;
00265 int didx = (ypos+dy)*w + xpos+dx;
00266 int iidx = ypos*iw + xpos;
00267 if(xpos < iw && ypos < ih)
00268 {
00269 dst[didx].p[0] = img[iidx].p[0];
00270 dst[didx].p[1] = img[iidx].p[1];
00271 dst[didx].p[2] = img[iidx].p[2];
00272 }
00273 }
00274
00275 __global__ void cuda_global_inplaceOverlay(float *dst, const float *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00276 {
00277 int sx = threadIdx.x;
00278 int stsx = blockIdx.x*tile_width;
00279 int xpos = stsx + sx;
00280 int sy = threadIdx.y;
00281 int stsy = blockIdx.y*tile_height;
00282 int ypos = stsy + sy;
00283 int didx = (ypos+dy)*w + xpos+dx;
00284 int iidx = ypos*iw + xpos;
00285 if(xpos < iw && ypos < ih)
00286 {
00287 float val = img[iidx];
00288 if(val > 0.0F)
00289 dst[didx] = val;
00290 }
00291 }
00292
00293
00294 __global__ void cuda_global_inplaceOverlayRGB(float3_t *dst, const float3_t *img, int w, int h, int iw, int ih, int dx, int dy, int tile_width, int tile_height)
00295 {
00296 int sx = threadIdx.x;
00297 int stsx = blockIdx.x*tile_width;
00298 int xpos = stsx + sx;
00299 int sy = threadIdx.y;
00300 int stsy = blockIdx.y*tile_height;
00301 int ypos = stsy + sy;
00302 int didx = (ypos+dy)*w + xpos+dx;
00303 int iidx = ypos*iw + xpos;
00304 if(xpos < iw && ypos < ih)
00305 {
00306 float p0,p1,p2;
00307 p0 = img[iidx].p[0];
00308 p1 = img[iidx].p[1];
00309 p2 = img[iidx].p[2];
00310 if(p0+p1+p2 > 0.0F)
00311 {
00312 dst[didx].p[0] = p0;
00313 dst[didx].p[1] = p1;
00314 dst[didx].p[2] = p2;
00315 }
00316 }
00317 }
00318
00319
00320 #endif