00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include <stdio.h>
00038 #include <stdint.h>
00039 #include <emmintrin.h>
00040 #include <stdlib.h>
00041
00042
00043
00044
00045 #ifndef MACHINE_OS_DARWIN
00046 #include <malloc.h>
00047 #endif
00048
00049 #include "Image/Image.H"
00050 #include "Image/CutPaste.H"
00051 #include "Raster/DeBayerSSE3.H"
00052 #include "Raster/DeBayerSSE2.H"
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
/* BOX_FILT: unnormalized 2x2 box sum of 8-bit samples, widened to 16 bits.
 * Per lane: v = p[0] + p[str] + p[off] + p[str+off].
 * Results for the low 8 pixels land in v1, the high 8 in v2, as 16-bit
 * values (via unpack against the zero register `z`, which must be in scope).
 * `ptr` and `ptr+str` must be 16-byte aligned (aligned loads); the
 * off-shifted loads use lddqu and may be unaligned. */
#define BOX_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); /* aligned load, 16 px */ \
    v1 = _mm_unpacklo_epi8 (t1, z); /* widen low 8 bytes to u16 */ \
    v2 = _mm_unpackhi_epi8 (t1, z); /* widen high 8 bytes to u16 */ \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + off)); /* unaligned */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + (str) + off)); /* unaligned */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
/* CROSS_FILT_VERT: gradient-corrected vertical filter, 16-bit lanes.
 * Per lane: v = ((10*center + up + down) >> 1) - left - right,
 * with saturating subtraction (_mm_subs_epi16), so v cannot wrap.
 * Low 8 pixels -> v1, high 8 -> v2.  Vertical neighbor loads are aligned;
 * the +/-1 horizontal loads use lddqu (unaligned).  Requires zero
 * register `z` in scope. */
#define CROSS_FILT_VERT(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3, c10; \
    c10 = _mm_set1_epi16 (10); \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_mullo_epi16 (v1, c10); /* 10 * center */ \
    v2 = _mm_mullo_epi16 (v2, c10); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); /* row above */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); /* row below */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    v1 = _mm_srli_epi16 (v1, 1); /* halve the (10c + up + down) term */ \
    v2 = _mm_srli_epi16 (v2, 1); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); /* left neighbors */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); /* right neighbors */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00131
00132
00133
00134
00135
00136
00137
00138
00139
/* HORIZ2_FILT: two-tap horizontal sum, widened to 16-bit lanes.
 * Per lane: v = p[0] + p[off].  Low 8 pixels -> v1, high 8 -> v2.
 * `ptr` must be 16-byte aligned; the off-shifted load is unaligned.
 * NOTE(review): the `str` parameter is accepted but never used here —
 * kept only so call sites match the other *_FILT macros. */
#define HORIZ2_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + off)); /* unaligned */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
/* VERT2_FILT: two-tap vertical sum, widened to 16-bit lanes.
 * Per lane: v = p[0] + p[str].  Low 8 pixels -> v1, high 8 -> v2.
 * Both loads are aligned, so `ptr` and `ptr+str` must be 16-byte
 * aligned.  Requires zero register `z` in scope. */
#define VERT2_FILT(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
/* CROSS_FILT_SYM: symmetric cross (Laplacian-like) filter, 16-bit lanes.
 * Per lane: v = 4*center - up - down - left - right, using saturating
 * subtraction.  Low 8 pixels -> v1, high 8 -> v2.  Vertical loads are
 * aligned; +/-1 horizontal loads use lddqu (unaligned). */
#define CROSS_FILT_SYM(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3; \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_slli_epi16 (v1, 2); /* 4 * center */ \
    v2 = _mm_slli_epi16 (v2, 2); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); /* up */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); /* down */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); /* left */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); /* right */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
/* CROSS_FILT_HORIZ: gradient-corrected horizontal filter, 16-bit lanes.
 * Per lane: v = ((10*center + left + right) >> 1) - up - down,
 * with saturating subtraction.  Transpose of CROSS_FILT_VERT.
 * Low 8 pixels -> v1, high 8 -> v2. */
#define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \
    __m128i t1, t2, t3, c10; \
    c10 = _mm_set1_epi16 (10); \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_mullo_epi16 (v1, c10); /* 10 * center */ \
    v2 = _mm_mullo_epi16 (v2, c10); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); /* left */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); /* right */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    v1 = _mm_srli_epi16 (v1, 1); /* halve the (10c + left + right) term */ \
    v2 = _mm_srli_epi16 (v2, 1); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); /* up */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); /* down */ \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_subs_epi16 (v1, t2); \
    v2 = _mm_subs_epi16 (v2, t3); \
} while (0)
00251
/* INTERPOLATE_GB_ROW: interpolate one green/blue Bayer row (16 output
 * pixel pairs) and interleave the results into eight BGRA vectors
 * bgral1..4 (green-site pixels) and bgrar1..4 (blue-site pixels).
 * Expects the caller's scope to provide: j, sstride, the four planes
 * (gb/b/r/gr), zero register z, constant c3, and all the __m128i
 * temporaries used below.  The >>3 and >>4 normalizations together
 * with the 10x / 4x / 3x weights match a gradient-corrected bilinear
 * (Malvar-He-Cutler-style) demosaic kernel — presumed; confirm against
 * the original reference implementation. */
#define INTERPOLATE_GB_ROW(kstride, off) do { \
    /* blue at green(B-row) site: (vert-corrected G + 4*horiz B pair \
       - 2x2 box of GR) >> 3 */ \
    CROSS_FILT_VERT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); /* saturate back to u8 */ \
    \
    /* green at blue site: (2*(vert GR + horiz GB) + sym-cross B) >> 3 */ \
    VERT2_FILT (v1, v2, gr_plane + j*sstride, -kstride); \
    HORIZ2_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, b_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gb = _mm_packus_epi16 (v1, v2); \
    \
    /* red at green(B-row) site: (horiz-corrected G + 4*vert R pair \
       - 2x2 box of GR) >> 3 */ \
    CROSS_FILT_HORIZ (v1, v2, gb_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, r_plane + j*sstride, -kstride); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* red at blue site: (3*sym-cross B + 4*2x2 box of R) >> 4 */ \
    CROSS_FILT_SYM (v1, v2, b_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, r_plane + j*sstride, -kstride, off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    rb = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave: green-site pixels use measured G, interpolated B/R */ \
    gg = _mm_load_si128 ((__m128i *)(gb_plane + j*sstride)); \
    bgl1 = _mm_unpacklo_epi8 (bg, gg); \
    bgl2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    a = _mm_set1_epi8 (0xff); /* opaque alpha */ \
    ral1 = _mm_unpacklo_epi8 (rg, a); \
    ral2 = _mm_unpackhi_epi8 (rg, a); \
    \
    /* blue-site pixels use measured B, interpolated G/R */ \
    bb = _mm_load_si128 ((__m128i *)(b_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bb, gb); \
    bgr2 = _mm_unpackhi_epi8 (bb, gb); \
    \
    rar1 = _mm_unpacklo_epi8 (rb, a); \
    rar2 = _mm_unpackhi_epi8 (rb, a); \
    \
    /* combine B,G and R,A halves into BGRA 16-bit pairs */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00329
/* INTERPOLATE_RG_ROW: interpolate one red/green Bayer row and interleave
 * into eight BGRA vectors bgral1..4 (red-site pixels) and bgrar1..4
 * (green(R-row)-site pixels).  Mirror of INTERPOLATE_GB_ROW with the
 * roles of R/B and GR/GB swapped; see that macro for the required
 * caller-scope variables and the filter-weight rationale. */
#define INTERPOLATE_RG_ROW(kstride,off) do { \
    /* blue at red site: (3*sym-cross R + 4*2x2 box of B) >> 4 */ \
    CROSS_FILT_SYM (v1, v2, r_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    br = _mm_packus_epi16 (v1, v2); /* saturate back to u8 */ \
    \
    /* green at red site: (2*(vert GB + horiz GR) + sym-cross R) >> 3 */ \
    VERT2_FILT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, gr_plane + j*sstride, kstride, -off); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, r_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gr = _mm_packus_epi16 (v1, v2); \
    \
    /* blue at green(R-row) site: (horiz-corrected G + 4*vert B pair \
       - 2x2 box of GB) >> 3 */ \
    CROSS_FILT_HORIZ (v1, v2, gr_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, b_plane + j*sstride, kstride); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    /* red at green(R-row) site: (vert-corrected G + 4*horiz R pair \
       - 2x2 box of GB) >> 3 */ \
    CROSS_FILT_VERT (v1, v2, gr_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, r_plane + j*sstride, kstride, off); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave: red-site pixels use measured R, interpolated B/G */ \
    bgl1 = _mm_unpacklo_epi8 (br, gr); \
    bgl2 = _mm_unpackhi_epi8 (br, gr); \
    \
    rr = _mm_load_si128 ((__m128i *)(r_plane + j*sstride)); \
    a = _mm_set1_epi8 (0xff); /* opaque alpha */ \
    ral1 = _mm_unpacklo_epi8 (rr, a); \
    ral2 = _mm_unpackhi_epi8 (rr, a); \
    \
    /* green-site pixels use measured G, interpolated B/R */ \
    gg = _mm_load_si128 ((__m128i *)(gr_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bg, gg); \
    bgr2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    rar1 = _mm_unpacklo_epi8 (rg, a); \
    rar2 = _mm_unpackhi_epi8 (rg, a); \
    \
    /* combine B,G and R,A halves into BGRA 16-bit pairs */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00407
00408 template <class T> Image<PixRGB<T> >
00409 debayerSSE3 (const Image<T>& src1,
00410 BayerFormat format)
00411 {
00412 # ifndef INVT_USE_SSE3
00413 LFATAL("you must have sse3 support");
00414 return Image<PixRGB<T> >();
00415 #else
00416
00417
00418 bool isAligned32 = true;
00419 int patchWidth = 0;
00420 Image<T> src;
00421 if ((src1.getWidth() % 32) != 0)
00422 {
00423 patchWidth = 32 - (src1.getWidth() % 32);
00424 src = concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS));
00425 isAligned32 = false;
00426 }
00427 else
00428 src = src1;
00429
00430 int width = src.getWidth();
00431 int height = src.getHeight();
00432 ASSERT(width % 2 == 0);
00433 ASSERT(height % 2 == 0);
00434 int dstride = width * 4;
00435 int sstride = width;
00436
00437
00438
00439 uint8_t *bayer_planes[4];
00440 int plane_stride = ((width + 0xf)&(~0xf)) + 32;
00441 for (int i = 0; i < 4; i++) {
00442 bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2));
00443 }
00444
00445
00446 int bgra_stride = width*4;
00447 uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride);
00448
00449
00450 int bayer_stride = width;
00451 uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride);
00452
00453
00454 copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride,
00455 bayer_img, bayer_stride,
00456 0, 0, 0, 0, width, height, 8);
00457
00458
00459 uint8_t * planes[4] = {
00460 bayer_planes[0] + plane_stride + 16,
00461 bayer_planes[1] + plane_stride + 16,
00462 bayer_planes[2] + plane_stride + 16,
00463 bayer_planes[3] + plane_stride + 16,
00464 };
00465 int p_width = width / 2;
00466 int p_height = height / 2;
00467
00468 splitBayerPlanes_8u (planes, plane_stride,
00469 bayer_img, bayer_stride, p_width, p_height);
00470 for (int j = 0; j < 4; j++)
00471 replicateBorder_8u (planes[j], plane_stride, p_width, p_height);
00472
00473
00474 if( bayerInterpolateTo_8u_bgra_sse3 (planes,plane_stride,
00475 bgra_img, bgra_stride,
00476 width, height, format) < 0)
00477 LFATAL("error in debayer with sse3");
00478
00479 uint8_t * dest = (uint8_t*)memalign(16, dstride*height);
00480 copy_8u_generic (bgra_img, bgra_stride,
00481 dest, dstride, 0, 0, 0, 0, width, height, 8 * 4);
00482
00483 Image<PixRGB<T> > res(width, height, NO_INIT);
00484 typename Image<PixRGB<T> >::iterator dptr = res.beginw();
00485 T* sptr = (T*)dest;
00486
00487 for(int y =0; y < height; y++)
00488 {
00489 for(int x =0; x < width; x++)
00490 {
00491 dptr[0].p[2] = *sptr++;
00492 dptr[0].p[1] = *sptr++;
00493 dptr[0].p[0] = *sptr++;
00494 dptr++;
00495 sptr++;
00496 }
00497 }
00498
00499 for (int i=0; i<4; i++) {
00500 free (bayer_planes[i]);
00501 }
00502 free(dest);
00503 free(bayer_img);
00504 free (bgra_img);
00505
00506 if(!isAligned32)
00507 res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height));
00508 return res;
00509 #endif //INVT_USE_SSE3
00510 }
00511
// Core SSE3 demosaic kernel: interpolate four half-resolution Bayer
// planes into an interleaved 8-bit BGRA image.
//
// src     : four 16-byte-aligned planes, one per Bayer phase, each with
//           a valid one-pixel replicated border (see debayerSSE3).
// sstride : plane row stride in bytes; must be 16-byte aligned.
// dst     : BGRA output; must be 16-byte aligned with 128-byte-aligned
//           stride (each iteration stores 8 x 16 bytes per row).
// returns : 0 on success, -1 on alignment violation (or missing SSE3).
//
// The two format pairs share one code path each: the second format of a
// pair is handled by swapping plane roles, flipping the kernel stride
// sign, and swapping the two output row offsets.
int
bayerInterpolateTo_8u_bgra_sse3 (uint8_t ** src, int sstride,
                                 uint8_t * dst, int dstride, int width, int height,
                                 BayerFormat format)
{
# ifndef INVT_USE_SSE3
  LFATAL("you must have sse3 support");
  return -1;
#else
  int i, j;
  // All SIMD loads below assume 16-byte alignment; fail fast otherwise.
  for (i = 0; i < 4; i++) {
    if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) {
      LERROR("%s: src[%d] is not 16-byte aligned\n",
             __FUNCTION__, i);
      return -1;
    }
  }
  if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) {
    LERROR("%s: dst is not 16-byte aligned or 128-byte stride "
           "aligned\n", __FUNCTION__);
    return -1;
  }

  // Registers shared by the INTERPOLATE_*_ROW macros (expanded below);
  // z is the zero register used to widen u8 lanes to u16, c3 the
  // constant-3 weight used for the diagonal interpolation.
  __m128i z = _mm_set1_epi32 (0);
  __m128i c3 = _mm_set1_epi16 (3);
  __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr;
  __m128i bgl1, bgl2, ral1, ral2;
  __m128i bgr1, bgr2, rar1, rar2;
  __m128i bgral1, bgral2, bgral3, bgral4;
  __m128i bgrar1, bgrar2, bgrar3, bgrar4;
  __m128i v1, v2, w1, w2;

  if (format == BAYER_GBRG ||
      format == BAYER_RGGB) {
    int drow_offset1 = 0;
    int drow_offset2 = dstride;
    int kernel_stride = sstride;
    uint8_t * gb_plane = src[0];
    uint8_t * b_plane = src[1];
    uint8_t * r_plane = src[2];
    uint8_t * gr_plane = src[3];
    if (format == BAYER_RGGB) {
      // RGGB is GBRG with rows swapped: remap planes, negate the
      // vertical kernel stride, and swap the two output rows.
      drow_offset1 = dstride;
      drow_offset2 = 0;
      kernel_stride = -sstride;
      r_plane = src[0];
      gr_plane = src[1];
      gb_plane = src[2];
      b_plane = src[3];
    }

    // Process 16 plane columns (32 output pixels wide) per outer pass.
    for (i = 0; i < width/2; i += 16) {
      uint8_t * dcol = dst + i*8;

      for (j = 0; j < height/2; j++) {
        // Green/blue source row -> one full output row of 32 BGRA px.
        INTERPOLATE_GB_ROW (kernel_stride, 1);

        uint8_t * drow = dcol + j*2*dstride + drow_offset1;
        // Interleave green-site (bgral*) and blue-site (bgrar*)
        // pixel vectors into the final BGRA order.
        _mm_store_si128 ((__m128i *)drow,
                         _mm_unpacklo_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+16),
                         _mm_unpackhi_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+32),
                         _mm_unpacklo_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+48),
                         _mm_unpackhi_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+64),
                         _mm_unpacklo_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+80),
                         _mm_unpackhi_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+96),
                         _mm_unpacklo_epi32 (bgral4, bgrar4));
        _mm_store_si128 ((__m128i *)(drow+112),
                         _mm_unpackhi_epi32 (bgral4, bgrar4));

        // Red/green source row -> the other output row of the pair.
        INTERPOLATE_RG_ROW (kernel_stride, 1);

        drow = dcol + j*2*dstride + drow_offset2;
        _mm_store_si128 ((__m128i *)drow,
                         _mm_unpacklo_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+16),
                         _mm_unpackhi_epi32 (bgral1, bgrar1));
        _mm_store_si128 ((__m128i *)(drow+32),
                         _mm_unpacklo_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+48),
                         _mm_unpackhi_epi32 (bgral2, bgrar2));
        _mm_store_si128 ((__m128i *)(drow+64),
                         _mm_unpacklo_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+80),
                         _mm_unpackhi_epi32 (bgral3, bgrar3));
        _mm_store_si128 ((__m128i *)(drow+96),
                         _mm_unpacklo_epi32 (bgral4, bgrar4));
        _mm_store_si128 ((__m128i *)(drow+112),
                         _mm_unpackhi_epi32 (bgral4, bgrar4));

      }
      // Advance all planes to the next 16-column strip.
      gb_plane += 16;
      b_plane += 16;
      r_plane += 16;
      gr_plane += 16;
    }
  }
  else {
    // BAYER_BGGR / BAYER_GRBG pair: same structure as above but with
    // opposite horizontal phase (off = -1 and site order swapped in
    // the stores below).
    int drow_offset1 = 0;
    int drow_offset2 = dstride;
    int kernel_stride = sstride;
    uint8_t * b_plane = src[0];
    uint8_t * gb_plane = src[1];
    uint8_t * gr_plane = src[2];
    uint8_t * r_plane = src[3];
    if (format == BAYER_GRBG) {
      // GRBG is BGGR with rows swapped.
      drow_offset1 = dstride;
      drow_offset2 = 0;
      kernel_stride = -sstride;
      gr_plane = src[0];
      r_plane = src[1];
      b_plane = src[2];
      gb_plane = src[3];
    }

    for (i = 0; i < width/2; i += 16) {
      uint8_t * dcol = dst + i*8;

      for (j = 0; j < height/2; j++) {
        INTERPOLATE_GB_ROW (kernel_stride, -1);

        uint8_t * drow = dcol + j*2*dstride + drow_offset1;
        // Note bgrar*/bgral* argument order is reversed relative to
        // the GBRG/RGGB branch: the blue-site pixel comes first here.
        _mm_store_si128 ((__m128i *)drow,
                         _mm_unpacklo_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+16),
                         _mm_unpackhi_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+32),
                         _mm_unpacklo_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+48),
                         _mm_unpackhi_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+64),
                         _mm_unpacklo_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+80),
                         _mm_unpackhi_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+96),
                         _mm_unpacklo_epi32 (bgrar4, bgral4));
        _mm_store_si128 ((__m128i *)(drow+112),
                         _mm_unpackhi_epi32 (bgrar4, bgral4));

        INTERPOLATE_RG_ROW (kernel_stride, -1);

        drow = dcol + j*2*dstride + drow_offset2;
        _mm_store_si128 ((__m128i *)drow,
                         _mm_unpacklo_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+16),
                         _mm_unpackhi_epi32 (bgrar1, bgral1));
        _mm_store_si128 ((__m128i *)(drow+32),
                         _mm_unpacklo_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+48),
                         _mm_unpackhi_epi32 (bgrar2, bgral2));
        _mm_store_si128 ((__m128i *)(drow+64),
                         _mm_unpacklo_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+80),
                         _mm_unpackhi_epi32 (bgrar3, bgral3));
        _mm_store_si128 ((__m128i *)(drow+96),
                         _mm_unpacklo_epi32 (bgrar4, bgral4));
        _mm_store_si128 ((__m128i *)(drow+112),
                         _mm_unpackhi_epi32 (bgrar4, bgral4));

      }
      gb_plane += 16;
      b_plane += 16;
      r_plane += 16;
      gr_plane += 16;
    }
  }
  return 0;
#endif
}
00686
// Explicit template instantiations for the pixel types used by callers.
// NOTE(review): the SSE3 kernel operates on 8-bit samples; whether the
// uint16 instantiation produces correct results is doubtful — see the
// note in debayerSSE3.
template Image<PixRGB<byte> > debayerSSE3(const Image<byte>& src, BayerFormat format);
template Image<PixRGB<uint16> > debayerSSE3(const Image<uint16>& src, BayerFormat format);
00689
00690
00691
00692
00693