00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #ifndef CUDA_SHAPEOPS_H_DEFINED
00039 #define CUDA_SHAPEOPS_H_DEFINED
00040
#include <cuda.h>
#include <float.h>
#include "CUDA/cutil.h"
#include "cudadefs.h"
00044
/*
 * Decimate an image along both axes by plain subsampling (no filtering):
 * destination element (row, col) receives source element
 * (row * y_factor, col * x_factor).
 *
 * Indexing used by this kernel:
 *   - blockIdx.y is the destination row,
 *   - blockIdx.x * tile_width + threadIdx.x is the destination column.
 * NOTE(review): this presumes the launcher uses tile_width as the x extent
 * of a block and one grid row per destination row -- confirm at call site.
 *
 * src        source image, read as w * h floats, row stride w
 * dst        destination image, written as (w/x_factor) * (h/y_factor) floats
 * x_factor   horizontal decimation step
 * y_factor   vertical decimation step
 * w, h       source width and height
 * tile_width number of destination columns covered by one block in x
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_dec_xy(const float *src, float* dst, const int x_factor, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
{
  // Destination geometry; integer division drops any partial trailing cell.
  const int out_w = w/x_factor;
  const int out_h = h/y_factor;
  const int out_count = out_w*out_h;

  // Column handled by this thread.
  const int lane = threadIdx.x;
  const int tile_origin = IMUL(blockIdx.x, tile_width);

  // Start offsets of the destination row and of the matching source row.
  const int out_row_base = IMUL(blockIdx.y, out_w);
  const int in_row_base = IMUL(IMUL(blockIdx.y, w),y_factor);

  // Source column is the destination column scaled by the x step.
  const int in_col = IMUL((lane+tile_origin),x_factor);

  const int out_idx = out_row_base + tile_origin + lane;
  const int in_idx = in_row_base + in_col;

  // Threads past the right edge, past the destination, or past the source
  // have nothing to do.
  if(tile_origin+lane >= out_w) return;
  if(out_idx >= out_count) return;
  if(in_idx >= w*h) return;

  dst[out_idx] = src[in_idx];
}
00069
/*
 * Decimate an image horizontally by plain subsampling (no filtering):
 * destination element (row, col) receives source element
 * (row, col * x_factor). The number of rows is unchanged.
 *
 * Indexing used by this kernel:
 *   - blockIdx.y is the row (same in source and destination),
 *   - blockIdx.x * tile_width + threadIdx.x is the destination column.
 * NOTE(review): this presumes the launcher uses tile_width as the x extent
 * of a block and one grid row per image row -- confirm at call site.
 *
 * src        source image, read as w * h floats, row stride w
 * dst        destination image, written as (w/x_factor) * h floats
 * x_factor   horizontal decimation step
 * w, h       source width and height
 * tile_width number of destination columns covered by one block in x
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_dec_x(const float *src, float* dst, const int x_factor, const unsigned int w, const unsigned int h, int tile_width)
{
  // Destination geometry; integer division drops any partial trailing cell.
  const int out_w = w/x_factor;
  const int out_count = out_w*h;

  // Column handled by this thread.
  const int lane = threadIdx.x;
  const int tile_origin = IMUL(blockIdx.x, tile_width);

  // Start offsets of this row in the destination and in the source.
  const int out_row_base = IMUL(blockIdx.y, out_w);
  const int in_row_base = IMUL(blockIdx.y, w);

  // Source column is the destination column scaled by the x step.
  const int in_col = IMUL((lane+tile_origin),x_factor);

  const int out_idx = out_row_base + tile_origin + lane;
  const int in_idx = in_row_base + in_col;

  // Threads past the right edge, past the destination, or past the source
  // have nothing to do.
  if(tile_origin+lane >= out_w) return;
  if(out_idx >= out_count) return;
  if(in_idx >= w*h) return;

  dst[out_idx] = src[in_idx];
}
00093
00094
/*
 * Decimate an image vertically by plain subsampling (no filtering):
 * destination element (row, col) receives source element
 * (row * y_factor, col). The width is unchanged.
 *
 * Indexing used by this kernel:
 *   - blockIdx.y is the destination row,
 *   - blockIdx.x * tile_width + threadIdx.x is the column.
 * NOTE(review): this presumes the launcher uses tile_width as the x extent
 * of a block and one grid row per destination row -- confirm at call site.
 *
 * src        source image, read as w * h floats, row stride w
 * dst        destination image, written as w * (h/y_factor) floats
 * y_factor   vertical decimation step
 * w, h       source width and height
 * tile_width number of columns covered by one block in x
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_dec_y(const float *src, float* dst, const int y_factor, const unsigned int w, const unsigned int h, int tile_width)
{
  // Destination geometry; integer division drops any partial trailing rows.
  const int out_h = h/y_factor;
  const int out_count = w*out_h;

  // Column handled by this thread (identical in source and destination).
  const int lane = threadIdx.x;
  const int tile_origin = IMUL(blockIdx.x, tile_width);
  const int in_col = lane+tile_origin;

  // Start offsets of the destination row and of the matching source row.
  const int out_row_base = IMUL(blockIdx.y, w);
  const int in_row_base = IMUL(IMUL(blockIdx.y, w),y_factor);

  const int out_idx = out_row_base + tile_origin + lane;
  const int in_idx = in_row_base + in_col;

  // Threads past the right edge, past the destination, or past the source
  // have nothing to do.
  if(tile_origin+lane >= w) return;
  if(out_idx >= out_count) return;
  if(in_idx >= w*h) return;

  dst[out_idx] = src[in_idx];
}
00118
/*
 * Box-sum downscale: each output element is the sum of a
 * scalex-by-scaley window of the input, multiplied by fac.
 *
 * One thread produces one output element at
 *   column = blockIdx.x * tile_width  + threadIdx.x
 *   row    = blockIdx.y * tile_height + threadIdx.y
 * Threads outside the sw-by-sh output do nothing.
 *
 * The last output column widens its window by remx input columns and the
 * last output row by remy input rows, so leftover input at the right and
 * bottom borders is folded into the border cells. The same fac is applied
 * to those enlarged windows. Samples outside the lw-by-lh input are
 * skipped.
 *
 * in            input image, lw * lh floats, row stride lw
 * res           output image, sw * sh floats, row stride sw
 * fac           scale applied to each window sum
 *               (presumably 1 / window area -- confirm with the caller)
 * scalex/scaley nominal window size in input pixels
 * remx/remy     extra input columns/rows given to the last output
 *               column/row
 * lw, lh        input width and height
 * sw, sh        output width and height
 * tile_width, tile_height  output elements covered per block in x / y
 *               (assumed to match the block dimensions -- confirm)
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_quickLocalAvg(const float *in, float *res, float fac, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
{
  const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y;
  const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;

  if(scs >= sw || srs >= sh) return;

  // Window extent; the last column/row also absorbs the remainder.
  const int win_w = scalex + ((scs == sw-1) ? remx : 0);
  const int win_h = scaley + ((srs == sh-1) ? remy : 0);

  // Top-left input sample of the window, and the exclusive end of the
  // window clipped to the input image.
  const int x_begin = IMUL(scs,scalex);
  const int y_begin = IMUL(srs,scaley);
  const int x_end = min(x_begin + win_w, lw);
  const int y_end = min(y_begin + win_h, lh);

  // Accumulate in a register and store once, instead of doing a global
  // read-modify-write on res[] for every sample. The summation order
  // (row by row, left to right) is the same as before.
  float sum = 0.0f;
  for(int y_pos = y_begin; y_pos < y_end; y_pos++)
    {
      const int row_base = IMUL(y_pos,lw);
      for(int x_pos = x_begin; x_pos < x_end; x_pos++)
        sum += in[row_base + x_pos];
    }

  res[IMUL(srs,sw) + scs] = sum * fac;
}
00149
00150
/*
 * 2x2 box-average downscale: output element (row, col) is the mean of the
 * four input samples at rows 2*row, 2*row+1 and columns 2*col, 2*col+1.
 *
 * One thread produces one output element at
 *   column = blockIdx.x * tile_width  + threadIdx.x
 *   row    = blockIdx.y * tile_height + threadIdx.y
 * Threads outside the sw-by-sh output do nothing. If the 2x2 window does
 * not fit entirely inside the lw-by-lh input, the output element is left
 * at 0.
 *
 * in      input image, lw * lh floats, row stride lw
 * res     output image, sw * sh floats, row stride sw
 * lw, lh  input width and height
 * sw, sh  output width and height
 * tile_width, tile_height  output elements covered per block in x / y
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_quickLocalAvg2x2(const float *in, float *res, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
{
  const int out_row = IMUL(blockIdx.y, tile_height) + threadIdx.y;
  const int out_col = IMUL(blockIdx.x, tile_width) + threadIdx.x;
  const int out_idx = IMUL(out_row,sw) + out_col;

  // Outside the output image: nothing to write.
  if(out_col >= sw || out_row >= sh) return;

  // Default value, kept when the window would cross the input border.
  res[out_idx] = 0;

  const int in_col = IMUL(out_col,2);
  const int in_row = IMUL(out_row,2);
  if(in_col+1 >= lw || in_row+1 >= lh) return;

  // Top-left sample of the window; the other three are at +1, +lw, +lw+1.
  const int top_left = IMUL(in_row,lw) + in_col;
  res[out_idx] = (in[top_left] + in[top_left+1] + in[top_left+lw] + in[top_left+lw+1])*0.25F;
}
00171
/*
 * Max-pool downscale: each output element is the maximum over a
 * scalex-by-scaley window of the input.
 *
 * One thread produces one output element at
 *   column = blockIdx.x * tile_width  + threadIdx.x
 *   row    = blockIdx.y * tile_height + threadIdx.y
 * Threads outside the sw-by-sh output do nothing.
 *
 * The last output column widens its window by remx input columns and the
 * last output row by remy input rows. Samples outside the lw-by-lh input
 * are skipped.
 *
 * The running maximum starts at -FLT_MAX, the lowest finite float, so
 * windows whose samples are all very negative still give the true maximum
 * (a fixed seed such as -10000 would cap the result from below). A window
 * that contains no valid sample yields -FLT_MAX.
 *
 * in            input image, lw * lh floats, row stride lw
 * res           output image, sw * sh floats, row stride sw
 * scalex/scaley nominal window size in input pixels
 * remx/remy     extra input columns/rows given to the last output
 *               column/row
 * lw, lh        input width and height
 * sw, sh        output width and height
 * tile_width, tile_height  output elements covered per block in x / y
 *               (assumed to match the block dimensions -- confirm)
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_quickLocalMax(const float *in, float *res, int scalex, int scaley, int remx, int remy, int lw, int lh, int sw, int sh, int tile_width, int tile_height)
{
  const int srs = IMUL(blockIdx.y, tile_height) + threadIdx.y;
  const int scs = IMUL(blockIdx.x, tile_width) + threadIdx.x;

  if(scs >= sw || srs >= sh) return;

  // Window extent; the last column/row also absorbs the remainder.
  const int win_w = scalex + ((scs == sw-1) ? remx : 0);
  const int win_h = scaley + ((srs == sh-1) ? remy : 0);

  // Top-left input sample of the window, and the exclusive end of the
  // window clipped to the input image.
  const int x_begin = IMUL(scs,scalex);
  const int y_begin = IMUL(srs,scaley);
  const int x_end = min(x_begin + win_w, lw);
  const int y_end = min(y_begin + win_h, lh);

  float curRes = -FLT_MAX;
  for(int y_pos = y_begin; y_pos < y_end; y_pos++)
    {
      const int row_base = IMUL(y_pos,lw);
      for(int x_pos = x_begin; x_pos < x_end; x_pos++)
        curRes = fmaxf(in[row_base + x_pos], curRes);
    }

  res[IMUL(srs,sw) + scs] = curRes;
}
00201
00202
00203
/*
 * Bilinear rescale of a single-channel float image.
 *
 * One thread produces one destination pixel at
 *   column = blockIdx.x * tile_width  + threadIdx.x
 *   row    = blockIdx.y * tile_height + threadIdx.y
 * Threads outside the new_w-by-new_h destination do nothing.
 *
 * The destination pixel centre is mapped into source coordinates with
 *   x = (dest_col + 0.5) * sw - 0.5,   y = (dest_row + 0.5) * sh - 0.5
 * clamped below at 0. The four surrounding source samples are then blended
 * with the fractional parts of x and y. All four sample coordinates are
 * clamped to the last source row/column, so a slightly too large sw/sh
 * cannot cause an out-of-bounds read.
 *
 * src     source image, orig_w * orig_h floats, row stride orig_w
 * res     destination image, new_w * new_h floats, row stride new_w
 * sw, sh  source pixels per destination pixel in x / y
 *         (presumably orig_w/new_w and orig_h/new_h -- confirm with caller)
 * orig_w, orig_h  source size
 * new_w, new_h    destination size
 * tile_width, tile_height  destination pixels covered per block in x / y
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_rescaleBilinear(const float *src, float *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
{
  // Destination column and row handled by this thread.
  const int dest_col = blockIdx.x*tile_width + threadIdx.x;
  const int dest_row = blockIdx.y*tile_height + threadIdx.y;

  if(dest_col >= new_w || dest_row >= new_h) return;

  const int dest_idx = dest_row*new_w + dest_col;

  // Source-space position of this destination pixel centre.
  const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f);
  const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f);

  // Integer neighbours, clamped to the image with integer min (no
  // int -> float -> int round trip).
  const int src_row0 = min(int(y), orig_h-1);
  const int src_row1 = min(src_row0+1, orig_h-1);
  const int src_col0 = min(int(x), orig_w-1);
  const int src_col1 = min(src_col0+1, orig_w-1);

  // Interpolation weights.
  const float fy = y - float(src_row0);
  const float fx = x - float(src_col0);

  // Row start offsets in the source.
  const int yw0 = IMUL(src_row0,orig_w);
  const int yw1 = IMUL(src_row1,orig_w);

  const float d00 = src[yw0+src_col0];
  const float d10 = src[yw0+src_col1];
  const float d01 = src[yw1+src_col0];
  const float d11 = src[yw1+src_col1];

  // Blend along x on both rows, then along y.
  const float dx0 = d00 + (d10 - d00) * fx;
  const float dx1 = d01 + (d11 - d01) * fx;
  res[dest_idx] = dx0 + (dx1 - dx0)*fy;
}
00247
00248
/*
 * Bilinear rescale of a three-channel image stored as float3_t pixels
 * (channels accessed as .p[0..2]).
 *
 * One thread produces one destination pixel at
 *   column = blockIdx.x * tile_width  + threadIdx.x
 *   row    = blockIdx.y * tile_height + threadIdx.y
 * Threads outside the new_w-by-new_h destination do nothing.
 *
 * The destination pixel centre is mapped into source coordinates with
 *   x = (dest_col + 0.5) * sw - 0.5,   y = (dest_row + 0.5) * sh - 0.5
 * clamped below at 0. The four surrounding source pixels are then blended
 * per channel with the fractional parts of x and y. All four sample
 * coordinates are clamped to the last source row/column, so a slightly too
 * large sw/sh cannot cause an out-of-bounds read.
 *
 * src     source image, orig_w * orig_h pixels, row stride orig_w
 * res     destination image, new_w * new_h pixels, row stride new_w
 * sw, sh  source pixels per destination pixel in x / y
 *         (presumably orig_w/new_w and orig_h/new_h -- confirm with caller)
 * orig_w, orig_h  source size
 * new_w, new_h    destination size
 * tile_width, tile_height  destination pixels covered per block in x / y
 *
 * IMUL is a macro defined outside this file (used here as an integer
 * multiply).
 */
__global__ void cuda_global_rescaleBilinearRGB(const float3_t *src, float3_t *res, float sw, float sh, int orig_w, int orig_h, int new_w, int new_h, int tile_width, int tile_height)
{
  // Destination column and row handled by this thread.
  const int dest_col = blockIdx.x*tile_width + threadIdx.x;
  const int dest_row = blockIdx.y*tile_height + threadIdx.y;

  if(dest_col >= new_w || dest_row >= new_h) return;

  const int dest_idx = dest_row*new_w + dest_col;

  // Source-space position of this destination pixel centre.
  const float y = fmaxf(0.0f,(dest_row+0.5f)*sh - 0.5f);
  const float x = fmaxf(0.0f,(dest_col+0.5f)*sw - 0.5f);

  // Integer neighbours, clamped to the image with integer min (no
  // int -> float -> int round trip).
  const int src_row0 = min(int(y), orig_h-1);
  const int src_row1 = min(src_row0+1, orig_h-1);
  const int src_col0 = min(int(x), orig_w-1);
  const int src_col1 = min(src_col0+1, orig_w-1);

  // Interpolation weights.
  const float fy = y - float(src_row0);
  const float fx = x - float(src_col0);

  // Row start offsets in the source.
  const int yw0 = IMUL(src_row0,orig_w);
  const int yw1 = IMUL(src_row1,orig_w);

  // Indices of the four neighbouring pixels, shared by all channels.
  const int i00 = yw0+src_col0;
  const int i10 = yw0+src_col1;
  const int i01 = yw1+src_col0;
  const int i11 = yw1+src_col1;

  for(int i=0;i<3;i++)
    {
      const float d00 = src[i00].p[i];
      const float d10 = src[i10].p[i];
      const float d01 = src[i01].p[i];
      const float d11 = src[i11].p[i];

      // Blend along x on both rows, then along y.
      const float dx0 = d00 + (d10 - d00) * fx;
      const float dx1 = d01 + (d11 - d01) * fx;
      res[dest_idx].p[i] = dx0 + (dx1 - dx0)*fy;
    }
}
00297
00298
00299 #endif