00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include <stdio.h>
00038 #include <stdint.h>
00039 #include <emmintrin.h>
00040 #include <stdlib.h>
00041
00042
00043
00044
00045 #ifndef MACHINE_OS_DARWIN
00046 #include <malloc.h>
00047 #endif
00048
00049 #include "Image/Image.H"
00050 #include "Image/CutPaste.H"
00051 #include "Raster/DeBayerSSE2.H"
00052
00053 using namespace std;
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
// BOX_FILT(v1, v2, ptr, str, off):
//   2x2 box *sum* (no averaging) of 8-bit samples, widened to 16 bits.
//   Loads the 16 bytes at ptr, ptr+str, ptr+off and ptr+str+off,
//   zero-extends each byte to a 16-bit lane and accumulates:
//     v1 = sums for the low 8 bytes, v2 = sums for the high 8 bytes.
//   Callers scale/shift the result themselves.  Requires an all-zero
//   __m128i named 'z' in the enclosing scope.  The ptr and ptr+str loads
//   must be 16-byte aligned; the +off loads are unaligned (_mm_loadu).
//   BUGFIX: 'off' is now parenthesized at each use site so expression
//   arguments expand with the intended precedence.
#define BOX_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (str) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
// CROSS_FILT_VERT(v1, v2, ptr, str):
//   Cross-shaped filter weighted toward the vertical neighbours.  Per
//   16-bit lane it computes
//     ((10*center + up + down) >> 1) - left - right
//   where up/down are at ptr -/+ str and left/right at ptr -/+ 1, with the
//   two subtractions done as signed *saturating* subtracts (_mm_subs_epi16).
//   v1 holds the low 8 lanes, v2 the high 8.  Requires an all-zero __m128i
//   'z' in scope.  Center/up/down loads must be 16-byte aligned; the +/-1
//   horizontal loads are unaligned.
//   NOTE(review): the 10/1/1 weighting looks like part of a
//   Malvar-He-Cutler-style linear demosaicing kernel (callers divide by 8
//   afterwards) — confirm against the original derivation.
#define CROSS_FILT_VERT(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3, c10; \
    c10 = _mm_set1_epi16 (10); \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_mullo_epi16 (v1, c10); \
    v2 = _mm_mullo_epi16 (v2, c10); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00130
00131
00132
00133
00134
00135
00136
00137
00138
// HORIZ2_FILT(v1, v2, ptr, str, off):
//   Two-tap horizontal sum: the 16 bytes at ptr plus the 16 bytes at
//   ptr+off, zero-extended to 16-bit lanes (v1 = low 8 lanes, v2 = high 8).
//   Requires an all-zero __m128i 'z' in scope.  The load at ptr must be
//   16-byte aligned; the +off load is unaligned.  'str' is unused but kept
//   so all the filter macros share one parameter shape.
//   BUGFIX: 'off' is now parenthesized so expression arguments expand with
//   the intended precedence.
#define HORIZ2_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
// VERT2_FILT(v1, v2, ptr, str):
//   Two-tap vertical sum: the 16 bytes at ptr plus the 16 bytes one plane
//   row away at ptr+str, zero-extended to 16-bit lanes (v1 = low 8 lanes,
//   v2 = high 8 lanes).  Requires an all-zero __m128i 'z' in the caller's
//   scope; both loads must be 16-byte aligned.
#define VERT2_FILT(v1,v2,ptr,str) do { \
    __m128i row, row_lo, row_hi; \
    row = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (row, z); \
    v2 = _mm_unpackhi_epi8 (row, z); \
    row = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    row_lo = _mm_unpacklo_epi8 (row, z); \
    row_hi = _mm_unpackhi_epi8 (row, z); \
    v1 = _mm_add_epi16 (v1, row_lo); \
    v2 = _mm_add_epi16 (v2, row_hi); \
} while (0)
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
// CROSS_FILT_SYM(v1, v2, ptr, str):
//   Symmetric cross high-pass.  Per 16-bit lane it computes
//     4*center - up - down - left - right
//   with each subtraction done as a signed *saturating* subtract
//   (_mm_subs_epi16).  up/down are at ptr -/+ str, left/right at ptr -/+ 1.
//   v1 = low 8 lanes, v2 = high 8 lanes.  Requires zero vector 'z' in
//   scope; center/up/down loads must be 16-byte aligned, the +/-1 loads
//   are unaligned.
#define CROSS_FILT_SYM(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_slli_epi16 (v1, 2); \
    v2 = _mm_slli_epi16 (v2, 2); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
// CROSS_FILT_HORIZ(v1, v2, ptr, str):
//   Horizontal mirror of CROSS_FILT_VERT.  Per 16-bit lane it computes
//     ((10*center + left + right) >> 1) - up - down
//   with left/right at ptr -/+ 1 (unaligned loads) and up/down at
//   ptr -/+ str (aligned loads); the subtractions are signed saturating
//   (_mm_subs_epi16).  v1 = low 8 lanes, v2 = high 8 lanes.  Requires an
//   all-zero __m128i 'z' in scope.
#define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3, c10; \
    c10 = _mm_set1_epi16 (10); \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_mullo_epi16 (v1, c10); \
    v2 = _mm_mullo_epi16 (v2, c10); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00250
// INTERPOLATE_GB_ROW(kstride, off):
//   Demosaic one 16-pixel-wide strip of the green/blue Bayer output row.
//   Relies on many names from the caller's scope: plane pointers
//   gb_plane / b_plane / r_plane / gr_plane, row index j, plane stride
//   sstride, zero vector z, constant c3 = 3, and the __m128i temporaries
//   v1,v2,w1,w2 plus the outputs listed below.  Produces four packed-u8
//   vectors (each via a >>3 or >>4 weighted sum, i.e. /8 or /16 kernels):
//     bg = blue  interpolated at the green-in-blue-row sites
//     gb = green interpolated at the blue sites
//     rg = red   interpolated at the green sites
//     rb = red   interpolated at the blue sites
//   then interleaves them with the native gb_plane/b_plane samples and an
//   0xff alpha byte into bgral1..4 (left/even pixels) and bgrar1..4
//   (right/odd pixels), ready for the caller's 32-bit BGRA stores.
//   'kstride' flips the vertical kernel direction and 'off' the horizontal
//   one so the same code serves the mirrored Bayer layouts.
//   NOTE(review): the channel-role naming above follows the identifier
//   names (bg, gb, ...); confirm against the plane assignment in
//   bayerInterpolateTo_8u_bgra_sse2 for each BayerFormat.
#define INTERPOLATE_GB_ROW(kstride, off) do { \
    CROSS_FILT_VERT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    VERT2_FILT (v1, v2, gr_plane + j*sstride, -kstride); \
    HORIZ2_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, b_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gb = _mm_packus_epi16 (v1, v2); \
    \
    CROSS_FILT_HORIZ (v1, v2, gb_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, r_plane + j*sstride, -kstride); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    CROSS_FILT_SYM (v1, v2, b_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, r_plane + j*sstride, -kstride, off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    rb = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave B,G then R,A bytes, then pairs, for the BGRA stores */ \
    gg = _mm_load_si128 ((__m128i *)(gb_plane + j*sstride)); \
    bgl1 = _mm_unpacklo_epi8 (bg, gg); \
    bgl2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rg, a); \
    ral2 = _mm_unpackhi_epi8 (rg, a); \
    \
    bb = _mm_load_si128 ((__m128i *)(b_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bb, gb); \
    bgr2 = _mm_unpackhi_epi8 (bb, gb); \
    \
    rar1 = _mm_unpacklo_epi8 (rb, a); \
    rar2 = _mm_unpackhi_epi8 (rb, a); \
    \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00328
// INTERPOLATE_RG_ROW(kstride, off):
//   Companion of INTERPOLATE_GB_ROW for the red/green Bayer output row.
//   Same scope requirements (plane pointers, j, sstride, z, c3 and the
//   __m128i temporaries).  Produces:
//     br = blue  interpolated at the red sites        (3/16-weight kernel)
//     gr = green interpolated at the red sites
//     bg = blue  interpolated at the green-in-red-row sites
//     rg = red   interpolated at the green sites
//   and interleaves them with the native r_plane/gr_plane samples plus
//   0xff alpha into bgral1..4 / bgrar1..4 for the caller's BGRA stores.
//   NOTE(review): the channel-role naming follows the identifier names;
//   confirm against the plane assignment for each BayerFormat.
#define INTERPOLATE_RG_ROW(kstride,off) do { \
    CROSS_FILT_SYM (v1, v2, r_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    br = _mm_packus_epi16 (v1, v2); \
    \
    VERT2_FILT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, gr_plane + j*sstride, kstride, -off); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, r_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gr = _mm_packus_epi16 (v1, v2); \
    \
    CROSS_FILT_HORIZ (v1, v2, gr_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, b_plane + j*sstride, kstride); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    CROSS_FILT_VERT (v1, v2, gr_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, r_plane + j*sstride, kstride, off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave B,G then R,A bytes, then pairs, for the BGRA stores */ \
    bgl1 = _mm_unpacklo_epi8 (br, gr); \
    bgl2 = _mm_unpackhi_epi8 (br, gr); \
    \
    rr = _mm_load_si128 ((__m128i *)(r_plane + j*sstride)); \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rr, a); \
    ral2 = _mm_unpackhi_epi8 (rr, a); \
    \
    gg = _mm_load_si128 ((__m128i *)(gr_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bg, gg); \
    bgr2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    rar1 = _mm_unpacklo_epi8 (rg, a); \
    rar2 = _mm_unpackhi_epi8 (rg, a); \
    \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00406
00407 template <class T> Image<PixRGB<T> >
00408 debayerSSE2 (const Image<T>& src1, BayerFormat format)
00409 {
00410 #ifndef INVT_USE_SSEDB
00411 LFATAL("you must have SSE2 support");
00412 return Image<PixRGB<T> >();
00413 #else
00414
00415
00416
00417 bool isAligned32 = true;
00418 int patchWidth = 0;
00419 Image<T> src;
00420 if ((src1.getWidth() % 32) != 0)
00421 {
00422 patchWidth = 32 - (src1.getWidth() % 16);
00423 src = concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS));
00424 isAligned32 = false;
00425 }
00426 else
00427 src = src1;
00428
00429 int width = src.getWidth();
00430 int height = src.getHeight();
00431 ASSERT(width % 2 == 0);
00432 ASSERT(height % 2 == 0);
00433 int dstride = width * 4;
00434 int sstride = width;
00435
00436
00437
00438 uint8_t *bayer_planes[4];
00439 int plane_stride = ((width + 0xf)&(~0xf)) + 32;
00440 for (int i = 0; i < 4; i++) {
00441 bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2));
00442 }
00443
00444
00445 int bgra_stride = width*4;
00446 uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride);
00447
00448
00449 int bayer_stride = width;
00450 uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride);
00451
00452
00453 copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride,
00454 bayer_img, bayer_stride,
00455 0, 0, 0, 0, width, height, 8);
00456
00457
00458 uint8_t * planes[4] = {
00459 bayer_planes[0] + plane_stride + 16,
00460 bayer_planes[1] + plane_stride + 16,
00461 bayer_planes[2] + plane_stride + 16,
00462 bayer_planes[3] + plane_stride + 16,
00463 };
00464 int p_width = width / 2;
00465 int p_height = height / 2;
00466
00467 splitBayerPlanes_8u (planes, plane_stride,
00468 bayer_img, bayer_stride, p_width, p_height);
00469 for (int j = 0; j < 4; j++)
00470 replicateBorder_8u (planes[j], plane_stride, p_width, p_height);
00471
00472
00473 if(bayerInterpolateTo_8u_bgra_sse2 (planes,plane_stride,
00474 bgra_img, bgra_stride,
00475 width, height, format) < 0)
00476 LFATAL("error in debayer with sse2");
00477
00478
00479 uint8_t * dest = (uint8_t*)memalign(16, dstride*height);
00480 copy_8u_generic (bgra_img, bgra_stride,
00481 dest, dstride, 0, 0, 0, 0, width, height, 8 * 4);
00482
00483 Image<PixRGB<T> > res(width, height, NO_INIT);
00484 typename Image<PixRGB<T> >::iterator dptr = res.beginw();
00485 T* sptr = (T*)dest;
00486
00487 for(int y =0; y < height; y++)
00488 {
00489 for(int x =0; x < width; x++)
00490 {
00491 dptr[0].p[2] = *sptr++;
00492 dptr[0].p[1] = *sptr++;
00493 dptr[0].p[0] = *sptr++;
00494 dptr++;
00495 sptr++;
00496 }
00497 }
00498
00499 for (int i=0; i<4; i++) {
00500 free (bayer_planes[i]);
00501 }
00502 free(dest);
00503 free(bayer_img);
00504 free (bgra_img);
00505
00506 if(!isAligned32)
00507 res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height));
00508
00509 return res;
00510 #endif //INVT_USE_SSEDB
00511 }
00512
// Interpolate four quarter-resolution Bayer site planes into a packed
// 8-bit BGRA image using the SIMD row kernels above.
//
// @param src      four planes as produced by splitBayerPlanes_8u, each with
//                 16-byte-aligned rows and replicated borders
// @param sstride  plane row stride in bytes (must be 16-byte aligned)
// @param dst      BGRA output, 16-byte aligned with 128-byte-aligned stride
// @param dstride  output row stride in bytes
// @param width    full-resolution output width  (processed 32 px at a time)
// @param height   full-resolution output height (processed 2 rows at a time)
// @param format   Bayer layout; selects plane roles and output row order
// @return 0 on success, -1 on bad alignment or missing SSE3 support
int
bayerInterpolateTo_8u_bgra_sse2 (uint8_t ** src, int sstride,
    uint8_t * dst, int dstride, int width, int height,
    BayerFormat format)
{
# ifndef INVT_USE_SSE3
  LFATAL("you must have sse3 support");
  return -1;
#else
  int i, j;
  // The kernels use aligned loads on plane rows and aligned 128-byte
  // bursts of stores on the destination, so check alignment up front.
  for (i = 0; i < 4; i++) {
    if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) {
      LERROR("%s: src[%d] is not 16-byte aligned", __FUNCTION__, i);
      return -1;
    }
  }
  if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) {
    LERROR("%s: dst is not 16-byte aligned or 128-byte stride aligned", __FUNCTION__);
    return -1;
  }

  __m128i z = _mm_set1_epi32 (0);     // zero vector required by the filter macros
  __m128i c3 = _mm_set1_epi16 (3);    // weight used by the /16 kernels
  __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr;
  __m128i bgl1, bgl2, ral1, ral2;
  __m128i bgr1, bgr2, rar1, rar2;
  __m128i bgral1, bgral2, bgral3, bgral4;
  __m128i bgrar1, bgrar2, bgrar3, bgrar4;
  __m128i v1, v2, w1, w2;

  // The two format pairs below share kernel code: the second member of
  // each pair is handled by swapping the plane roles, negating the kernel
  // stride (kernels walk upside down) and swapping the two output rows.
  if (format == BAYER_GBRG ||
      format == BAYER_RGGB) {
    int drow_offset1 = 0;
    int drow_offset2 = dstride;
    int kernel_stride = sstride;
    uint8_t * gb_plane = src[0];
    uint8_t * b_plane = src[1];
    uint8_t * r_plane = src[2];
    uint8_t * gr_plane = src[3];
    if (format == BAYER_RGGB) {
      drow_offset1 = dstride;
      drow_offset2 = 0;
      kernel_stride = -sstride;
      r_plane = src[0];
      gr_plane = src[1];
      gb_plane = src[2];
      b_plane = src[3];
    }

    // Column-major tiling: each i handles a 16-byte-wide strip of every
    // plane (32 output pixels), each j emits two full-res output rows.
    for (i = 0; i < width/2; i += 16) {
      uint8_t * dcol = dst + i*8;

      for (j = 0; j < height/2; j++) {
        INTERPOLATE_GB_ROW (kernel_stride, 1);

        // 128 bytes = 32 BGRA pixels per row iteration.
        uint8_t * drow = dcol + j*2*dstride + drow_offset1;
        _mm_store_si128 ((__m128i *)drow,
            _mm_unpacklo_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+16),
            _mm_unpackhi_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+32),
            _mm_unpacklo_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+48),
            _mm_unpackhi_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+64),
            _mm_unpacklo_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+80),
            _mm_unpackhi_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+96),
            _mm_unpacklo_epi32 (bgral4, bgrar4));
        _mm_store_si128 ((__m128i *)(drow+112),
            _mm_unpackhi_epi32 (bgral4, bgrar4));

        INTERPOLATE_RG_ROW (kernel_stride, 1);

        drow = dcol + j*2*dstride + drow_offset2;
        _mm_store_si128 ((__m128i *)drow,
            _mm_unpacklo_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+16),
            _mm_unpackhi_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+32),
            _mm_unpacklo_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+48),
            _mm_unpackhi_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+64),
            _mm_unpacklo_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+80),
            _mm_unpackhi_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+96),
            _mm_unpacklo_epi32 (bgral4, bgrar4));
        _mm_store_si128 ((__m128i *)(drow+112),
            _mm_unpackhi_epi32 (bgral4, bgrar4));

      }
      gb_plane += 16;
      b_plane += 16;
      r_plane += 16;
      gr_plane += 16;
    }
  }
  else {
    // BAYER_BGGR / BAYER_GRBG: same strategy, but the even/odd pixel
    // halves come out swapped, hence off = -1 and the bgrar*/bgral* store
    // order below is reversed relative to the branch above.
    int drow_offset1 = 0;
    int drow_offset2 = dstride;
    int kernel_stride = sstride;
    uint8_t * b_plane = src[0];
    uint8_t * gb_plane = src[1];
    uint8_t * gr_plane = src[2];
    uint8_t * r_plane = src[3];
    if (format == BAYER_GRBG) {
      drow_offset1 = dstride;
      drow_offset2 = 0;
      kernel_stride = -sstride;
      gr_plane = src[0];
      r_plane = src[1];
      b_plane = src[2];
      gb_plane = src[3];
    }

    for (i = 0; i < width/2; i += 16) {
      uint8_t * dcol = dst + i*8;

      for (j = 0; j < height/2; j++) {
        INTERPOLATE_GB_ROW (kernel_stride, -1);

        uint8_t * drow = dcol + j*2*dstride + drow_offset1;
        _mm_store_si128 ((__m128i *)drow,
            _mm_unpacklo_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+16),
            _mm_unpackhi_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+32),
            _mm_unpacklo_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+48),
            _mm_unpackhi_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+64),
            _mm_unpacklo_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+80),
            _mm_unpackhi_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+96),
            _mm_unpacklo_epi32 (bgrar4, bgral4));
        _mm_store_si128 ((__m128i *)(drow+112),
            _mm_unpackhi_epi32 (bgrar4, bgral4));

        INTERPOLATE_RG_ROW (kernel_stride, -1);

        drow = dcol + j*2*dstride + drow_offset2;
        _mm_store_si128 ((__m128i *)drow,
            _mm_unpacklo_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+16),
            _mm_unpackhi_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+32),
            _mm_unpacklo_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+48),
            _mm_unpackhi_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+64),
            _mm_unpacklo_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+80),
            _mm_unpackhi_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+96),
            _mm_unpacklo_epi32 (bgrar4, bgral4));
        _mm_store_si128 ((__m128i *)(drow+112),
            _mm_unpackhi_epi32 (bgrar4, bgral4));

      }
      gb_plane += 16;
      b_plane += 16;
      r_plane += 16;
      gr_plane += 16;
    }
  }
  return 0;
#endif
}
00685
// Explicit template instantiations for the pixel depths used by the
// Raster code.
// NOTE(review): debayerSSE2's final repack steps through the 8-bit BGRA
// buffer with a T* pointer, which looks byte-oriented; confirm the uint16
// instantiation is actually exercised and correct.
template Image<PixRGB<byte> > debayerSSE2(const Image<byte>& src, BayerFormat format);
template Image<PixRGB<uint16> > debayerSSE2(const Image<uint16>& src, BayerFormat format);
00688
00689
00690
00691
00692
00693