00001 /*!@file Raster/DebayerSSE3.C is the debayer class with sse3 */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the // 00005 // University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. 
//                                                                      //
// You should have received a copy of the GNU General Public License    //
// along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
// to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
// Boston, MA 02111-1307 USA.                                           //
// //////////////////////////////////////////////////////////////////// //
//
// Primary maintainer for this file: Zhicheng Li <zhicheng@usc.edu>
// $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Raster/DeBayerSSE3.C $
// $Id: DeBayerSSE3.C 10794 2009-02-08 06:21:09Z itti $
//
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <emmintrin.h>
#include <pmmintrin.h> // SSE3: declares _mm_lddqu_si128 used by the kernels below

// on some platforms, memalign is defined in <malloc.h>, but that file
// does not exist on Darwin. On Darwin, including stdlib.h is sufficient.
// Let's here also include malloc.h unless we are on Darwin:
#ifndef MACHINE_OS_DARWIN
#include <malloc.h>
#endif

#include "Image/Image.H"
#include "Image/CutPaste.H"
#include "Raster/DeBayerSSE3.H"
#include "Raster/DeBayerSSE2.H"

// ######################## debayer with SSE3 accelerate ##############//
// #######################################################################//


/* BOX_FILT evaluates this kernel:
 *    1 1
 *    1 1
 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of
 * the computation (stored as 16s). ptr points to the first pixel of the
 * strip, and must be 16-byte aligned. str is the stride of image rows in
 * bytes. If stride is positive, the origin of the kernel is in the top
 * row, if negative, the origin is in the bottom row. off is 1 to put
 * the origin in the left column of the kernel, or -1 to put the origin
 * in the right column.
00068 */ 00069 #define BOX_FILT(v1,v2,ptr,str,off) do { \ 00070 __m128i t1, t2, t3; \ 00071 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00072 v1 = _mm_unpacklo_epi8 (t1, z); \ 00073 v2 = _mm_unpackhi_epi8 (t1, z); \ 00074 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00075 t2 = _mm_unpacklo_epi8 (t1, z); \ 00076 t3 = _mm_unpackhi_epi8 (t1, z); \ 00077 v1 = _mm_add_epi16 (v1, t2); \ 00078 v2 = _mm_add_epi16 (v2, t3); \ 00079 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + off)); \ 00080 t2 = _mm_unpacklo_epi8 (t1, z); \ 00081 t3 = _mm_unpackhi_epi8 (t1, z); \ 00082 v1 = _mm_add_epi16 (v1, t2); \ 00083 v2 = _mm_add_epi16 (v2, t3); \ 00084 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + (str) + off)); \ 00085 t2 = _mm_unpacklo_epi8 (t1, z); \ 00086 t3 = _mm_unpackhi_epi8 (t1, z); \ 00087 v1 = _mm_add_epi16 (v1, t2); \ 00088 v2 = _mm_add_epi16 (v2, t3); \ 00089 } while (0) 00090 00091 /* CROSS_FILT_VERT evaluates this kernel: 00092 * 1/2 00093 * -1 5 -1 00094 * 1/2 00095 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00096 * the computation (stored as 16s). ptr points to the first pixel of the 00097 * strip, and must be 16-byte aligned. str is the stride of image rows in 00098 * bytes. The origin of the kernel is at the center. 
00099 */ 00100 #define CROSS_FILT_VERT(v1,v2,ptr,str) do { \ 00101 __m128i t1, t2, t3, c10; \ 00102 c10 = _mm_set1_epi16 (10); \ 00103 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00104 v1 = _mm_unpacklo_epi8 (t1, z); \ 00105 v2 = _mm_unpackhi_epi8 (t1, z); \ 00106 v1 = _mm_mullo_epi16 (v1, c10); \ 00107 v2 = _mm_mullo_epi16 (v2, c10); \ 00108 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00109 t2 = _mm_unpacklo_epi8 (t1, z); \ 00110 t3 = _mm_unpackhi_epi8 (t1, z); \ 00111 v1 = _mm_add_epi16 (v1, t2); \ 00112 v2 = _mm_add_epi16 (v2, t3); \ 00113 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00114 t2 = _mm_unpacklo_epi8 (t1, z); \ 00115 t3 = _mm_unpackhi_epi8 (t1, z); \ 00116 v1 = _mm_add_epi16 (v1, t2); \ 00117 v2 = _mm_add_epi16 (v2, t3); \ 00118 v1 = _mm_srli_epi16 (v1, 1); \ 00119 v2 = _mm_srli_epi16 (v2, 1); \ 00120 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \ 00121 t2 = _mm_unpacklo_epi8 (t1, z); \ 00122 t3 = _mm_unpackhi_epi8 (t1, z); \ 00123 v1 = _mm_subs_epi16 (v1, t2); \ 00124 v2 = _mm_subs_epi16 (v2, t3); \ 00125 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \ 00126 t2 = _mm_unpacklo_epi8 (t1, z); \ 00127 t3 = _mm_unpackhi_epi8 (t1, z); \ 00128 v1 = _mm_subs_epi16 (v1, t2); \ 00129 v2 = _mm_subs_epi16 (v2, t3); \ 00130 } while (0) 00131 00132 /* HORIZ2_FILT evaluates this kernel: 00133 * 1 1 00134 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00135 * the computation (stored as 16s). ptr points to the first pixel of the 00136 * strip, and must be 16-byte aligned. str is the stride of image rows in 00137 * bytes (unused). off is 1 to put the origin in the left column of the 00138 * kernel, or -1 to put the origin in the right column. 
00139 */ 00140 #define HORIZ2_FILT(v1,v2,ptr,str,off) do { \ 00141 __m128i t1, t2, t3; \ 00142 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00143 v1 = _mm_unpacklo_epi8 (t1, z); \ 00144 v2 = _mm_unpackhi_epi8 (t1, z); \ 00145 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + off)); \ 00146 t2 = _mm_unpacklo_epi8 (t1, z); \ 00147 t3 = _mm_unpackhi_epi8 (t1, z); \ 00148 v1 = _mm_add_epi16 (v1, t2); \ 00149 v2 = _mm_add_epi16 (v2, t3); \ 00150 } while (0) 00151 00152 /* VERT2_FILT evaluates this kernel: 00153 * 1 00154 * 1 00155 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00156 * the computation (stored as 16s). ptr points to the first pixel of the 00157 * strip, and must be 16-byte aligned. str is the stride of image rows in 00158 * bytes. If stride is positive, the origin of the kernel is in the top 00159 * row, if negative, the origin is in the bottom row. 00160 */ 00161 #define VERT2_FILT(v1,v2,ptr,str) do { \ 00162 __m128i t1, t2, t3; \ 00163 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00164 v1 = _mm_unpacklo_epi8 (t1, z); \ 00165 v2 = _mm_unpackhi_epi8 (t1, z); \ 00166 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00167 t2 = _mm_unpacklo_epi8 (t1, z); \ 00168 t3 = _mm_unpackhi_epi8 (t1, z); \ 00169 v1 = _mm_add_epi16 (v1, t2); \ 00170 v2 = _mm_add_epi16 (v2, t3); \ 00171 } while (0) 00172 00173 /* CROSS_FILT_SYM evaluates this kernel: 00174 * -1 00175 * -1 4 -1 00176 * -1 00177 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00178 * the computation (stored as 16s). ptr points to the first pixel of the 00179 * strip, and must be 16-byte aligned. str is the stride of image rows in 00180 * bytes. The origin of the kernel is at the center. 
00181 */ 00182 #define CROSS_FILT_SYM(v1,v2,ptr,str) do { \ 00183 __m128i t1, t2, t3; \ 00184 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00185 v1 = _mm_unpacklo_epi8 (t1, z); \ 00186 v2 = _mm_unpackhi_epi8 (t1, z); \ 00187 v1 = _mm_slli_epi16 (v1, 2); \ 00188 v2 = _mm_slli_epi16 (v2, 2); \ 00189 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00190 t2 = _mm_unpacklo_epi8 (t1, z); \ 00191 t3 = _mm_unpackhi_epi8 (t1, z); \ 00192 v1 = _mm_subs_epi16 (v1, t2); \ 00193 v2 = _mm_subs_epi16 (v2, t3); \ 00194 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00195 t2 = _mm_unpacklo_epi8 (t1, z); \ 00196 t3 = _mm_unpackhi_epi8 (t1, z); \ 00197 v1 = _mm_subs_epi16 (v1, t2); \ 00198 v2 = _mm_subs_epi16 (v2, t3); \ 00199 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \ 00200 t2 = _mm_unpacklo_epi8 (t1, z); \ 00201 t3 = _mm_unpackhi_epi8 (t1, z); \ 00202 v1 = _mm_subs_epi16 (v1, t2); \ 00203 v2 = _mm_subs_epi16 (v2, t3); \ 00204 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \ 00205 t2 = _mm_unpacklo_epi8 (t1, z); \ 00206 t3 = _mm_unpackhi_epi8 (t1, z); \ 00207 v1 = _mm_subs_epi16 (v1, t2); \ 00208 v2 = _mm_subs_epi16 (v2, t3); \ 00209 } while (0) 00210 00211 /* CROSS_FILT_HORIZ evaluates this kernel: 00212 * -1 00213 * 1/2 5 1/2 00214 * -1 00215 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00216 * the computation (stored as 16s). ptr points to the first pixel of the 00217 * strip, and must be 16-byte aligned. str is the stride of image rows in 00218 * bytes. The origin of the kernel is at the center. 
00219 */ 00220 #define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \ 00221 __m128i t1, t2, t3, c10; \ 00222 c10 = _mm_set1_epi16 (10); \ 00223 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00224 v1 = _mm_unpacklo_epi8 (t1, z); \ 00225 v2 = _mm_unpackhi_epi8 (t1, z); \ 00226 v1 = _mm_mullo_epi16 (v1, c10); \ 00227 v2 = _mm_mullo_epi16 (v2, c10); \ 00228 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \ 00229 t2 = _mm_unpacklo_epi8 (t1, z); \ 00230 t3 = _mm_unpackhi_epi8 (t1, z); \ 00231 v1 = _mm_add_epi16 (v1, t2); \ 00232 v2 = _mm_add_epi16 (v2, t3); \ 00233 t1 = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \ 00234 t2 = _mm_unpacklo_epi8 (t1, z); \ 00235 t3 = _mm_unpackhi_epi8 (t1, z); \ 00236 v1 = _mm_add_epi16 (v1, t2); \ 00237 v2 = _mm_add_epi16 (v2, t3); \ 00238 v1 = _mm_srli_epi16 (v1, 1); \ 00239 v2 = _mm_srli_epi16 (v2, 1); \ 00240 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00241 t2 = _mm_unpacklo_epi8 (t1, z); \ 00242 t3 = _mm_unpackhi_epi8 (t1, z); \ 00243 v1 = _mm_subs_epi16 (v1, t2); \ 00244 v2 = _mm_subs_epi16 (v2, t3); \ 00245 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00246 t2 = _mm_unpacklo_epi8 (t1, z); \ 00247 t3 = _mm_unpackhi_epi8 (t1, z); \ 00248 v1 = _mm_subs_epi16 (v1, t2); \ 00249 v2 = _mm_subs_epi16 (v2, t3); \ 00250 } while (0) 00251 00252 #define INTERPOLATE_GB_ROW(kstride, off) do { \ 00253 CROSS_FILT_VERT (v1, v2, gb_plane + j*sstride, kstride); \ 00254 HORIZ2_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \ 00255 w1 = _mm_slli_epi16 (w1, 2); \ 00256 w2 = _mm_slli_epi16 (w2, 2); \ 00257 v1 = _mm_add_epi16 (v1, w1); \ 00258 v2 = _mm_add_epi16 (v2, w2); \ 00259 BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \ 00260 v1 = _mm_subs_epi16 (v1, w1); \ 00261 v2 = _mm_subs_epi16 (v2, w2); \ 00262 v1 = _mm_srai_epi16 (v1, 3); \ 00263 v2 = _mm_srai_epi16 (v2, 3); \ 00264 bg = _mm_packus_epi16 (v1, v2); \ 00265 \ 00266 VERT2_FILT (v1, v2, gr_plane + j*sstride, -kstride); \ 00267 HORIZ2_FILT (w1, w2, gb_plane + j*sstride, kstride, 
off); \ 00268 v1 = _mm_add_epi16 (v1, w1); \ 00269 v2 = _mm_add_epi16 (v2, w2); \ 00270 v1 = _mm_slli_epi16 (v1, 1); \ 00271 v2 = _mm_slli_epi16 (v2, 1); \ 00272 CROSS_FILT_SYM (w1, w2, b_plane + j*sstride, kstride); \ 00273 v1 = _mm_add_epi16 (v1, w1); \ 00274 v2 = _mm_add_epi16 (v2, w2); \ 00275 v1 = _mm_srai_epi16 (v1, 3); \ 00276 v2 = _mm_srai_epi16 (v2, 3); \ 00277 gb = _mm_packus_epi16 (v1, v2); \ 00278 \ 00279 CROSS_FILT_HORIZ (v1, v2, gb_plane + j*sstride, kstride); \ 00280 VERT2_FILT (w1, w2, r_plane + j*sstride, -kstride); \ 00281 w1 = _mm_slli_epi16 (w1, 2); \ 00282 w2 = _mm_slli_epi16 (w2, 2); \ 00283 v1 = _mm_add_epi16 (v1, w1); \ 00284 v2 = _mm_add_epi16 (v2, w2); \ 00285 BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \ 00286 v1 = _mm_subs_epi16 (v1, w1); \ 00287 v2 = _mm_subs_epi16 (v2, w2); \ 00288 v1 = _mm_srai_epi16 (v1, 3); \ 00289 v2 = _mm_srai_epi16 (v2, 3); \ 00290 rg = _mm_packus_epi16 (v1, v2); \ 00291 \ 00292 CROSS_FILT_SYM (v1, v2, b_plane + j*sstride, kstride); \ 00293 v1 = _mm_mullo_epi16 (v1, c3); \ 00294 v2 = _mm_mullo_epi16 (v2, c3); \ 00295 BOX_FILT (w1, w2, r_plane + j*sstride, -kstride, off); \ 00296 w1 = _mm_slli_epi16 (w1, 2); \ 00297 w2 = _mm_slli_epi16 (w2, 2); \ 00298 v1 = _mm_add_epi16 (v1, w1); \ 00299 v2 = _mm_add_epi16 (v2, w2); \ 00300 v1 = _mm_srai_epi16 (v1, 4); \ 00301 v2 = _mm_srai_epi16 (v2, 4); \ 00302 rb = _mm_packus_epi16 (v1, v2); \ 00303 \ 00304 gg = _mm_load_si128 ((__m128i *)(gb_plane + j*sstride)); \ 00305 bgl1 = _mm_unpacklo_epi8 (bg, gg); \ 00306 bgl2 = _mm_unpackhi_epi8 (bg, gg); \ 00307 \ 00308 a = _mm_set1_epi8 (0xff); \ 00309 ral1 = _mm_unpacklo_epi8 (rg, a); \ 00310 ral2 = _mm_unpackhi_epi8 (rg, a); \ 00311 \ 00312 bb = _mm_load_si128 ((__m128i *)(b_plane + j*sstride)); \ 00313 bgr1 = _mm_unpacklo_epi8 (bb, gb); \ 00314 bgr2 = _mm_unpackhi_epi8 (bb, gb); \ 00315 \ 00316 rar1 = _mm_unpacklo_epi8 (rb, a); \ 00317 rar2 = _mm_unpackhi_epi8 (rb, a); \ 00318 \ 00319 bgral1 = _mm_unpacklo_epi16 
(bgl1, ral1); \ 00320 bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \ 00321 bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \ 00322 bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \ 00323 \ 00324 bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \ 00325 bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \ 00326 bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \ 00327 bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \ 00328 } while (0) 00329 00330 #define INTERPOLATE_RG_ROW(kstride,off) do { \ 00331 CROSS_FILT_SYM (v1, v2, r_plane + j*sstride, kstride); \ 00332 v1 = _mm_mullo_epi16 (v1, c3); \ 00333 v2 = _mm_mullo_epi16 (v2, c3); \ 00334 BOX_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \ 00335 w1 = _mm_slli_epi16 (w1, 2); \ 00336 w2 = _mm_slli_epi16 (w2, 2); \ 00337 v1 = _mm_add_epi16 (v1, w1); \ 00338 v2 = _mm_add_epi16 (v2, w2); \ 00339 v1 = _mm_srai_epi16 (v1, 4); \ 00340 v2 = _mm_srai_epi16 (v2, 4); \ 00341 br = _mm_packus_epi16 (v1, v2); \ 00342 \ 00343 VERT2_FILT (v1, v2, gb_plane + j*sstride, kstride); \ 00344 HORIZ2_FILT (w1, w2, gr_plane + j*sstride, kstride, -off); \ 00345 v1 = _mm_add_epi16 (v1, w1); \ 00346 v2 = _mm_add_epi16 (v2, w2); \ 00347 v1 = _mm_slli_epi16 (v1, 1); \ 00348 v2 = _mm_slli_epi16 (v2, 1); \ 00349 CROSS_FILT_SYM (w1, w2, r_plane + j*sstride, kstride); \ 00350 v1 = _mm_add_epi16 (v1, w1); \ 00351 v2 = _mm_add_epi16 (v2, w2); \ 00352 v1 = _mm_srai_epi16 (v1, 3); \ 00353 v2 = _mm_srai_epi16 (v2, 3); \ 00354 gr = _mm_packus_epi16 (v1, v2); \ 00355 \ 00356 CROSS_FILT_HORIZ (v1, v2, gr_plane + j*sstride, kstride); \ 00357 VERT2_FILT (w1, w2, b_plane + j*sstride, kstride); \ 00358 w1 = _mm_slli_epi16 (w1, 2); \ 00359 w2 = _mm_slli_epi16 (w2, 2); \ 00360 v1 = _mm_add_epi16 (v1, w1); \ 00361 v2 = _mm_add_epi16 (v2, w2); \ 00362 BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \ 00363 v1 = _mm_subs_epi16 (v1, w1); \ 00364 v2 = _mm_subs_epi16 (v2, w2); \ 00365 v1 = _mm_srai_epi16 (v1, 3); \ 00366 v2 = _mm_srai_epi16 (v2, 3); \ 00367 bg = _mm_packus_epi16 (v1, v2); \ 00368 \ 00369 
CROSS_FILT_VERT (v1, v2, gr_plane + j*sstride, kstride); \ 00370 HORIZ2_FILT (w1, w2, r_plane + j*sstride, kstride, off); \ 00371 w1 = _mm_slli_epi16 (w1, 2); \ 00372 w2 = _mm_slli_epi16 (w2, 2); \ 00373 v1 = _mm_add_epi16 (v1, w1); \ 00374 v2 = _mm_add_epi16 (v2, w2); \ 00375 BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \ 00376 v1 = _mm_subs_epi16 (v1, w1); \ 00377 v2 = _mm_subs_epi16 (v2, w2); \ 00378 v1 = _mm_srai_epi16 (v1, 3); \ 00379 v2 = _mm_srai_epi16 (v2, 3); \ 00380 rg = _mm_packus_epi16 (v1, v2); \ 00381 \ 00382 bgl1 = _mm_unpacklo_epi8 (br, gr); \ 00383 bgl2 = _mm_unpackhi_epi8 (br, gr); \ 00384 \ 00385 rr = _mm_load_si128 ((__m128i *)(r_plane + j*sstride)); \ 00386 a = _mm_set1_epi8 (0xff); \ 00387 ral1 = _mm_unpacklo_epi8 (rr, a); \ 00388 ral2 = _mm_unpackhi_epi8 (rr, a); \ 00389 \ 00390 gg = _mm_load_si128 ((__m128i *)(gr_plane + j*sstride)); \ 00391 bgr1 = _mm_unpacklo_epi8 (bg, gg); \ 00392 bgr2 = _mm_unpackhi_epi8 (bg, gg); \ 00393 \ 00394 rar1 = _mm_unpacklo_epi8 (rg, a); \ 00395 rar2 = _mm_unpackhi_epi8 (rg, a); \ 00396 \ 00397 bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \ 00398 bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \ 00399 bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \ 00400 bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \ 00401 \ 00402 bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \ 00403 bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \ 00404 bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \ 00405 bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \ 00406 } while (0) 00407 00408 template <class T> Image<PixRGB<T> > 00409 debayerSSE3 (const Image<T>& src1, 00410 BayerFormat format) 00411 { 00412 # ifndef INVT_USE_SSE3 00413 LFATAL("you must have sse3 support"); 00414 return Image<PixRGB<T> >(); 00415 #else 00416 00417 /* make sure that the source image stride can be divied by 32 */ 00418 bool isAligned32 = true; 00419 int patchWidth = 0; 00420 Image<T> src; 00421 if ((src1.getWidth() % 32) != 0) 00422 { 00423 patchWidth = 32 - (src1.getWidth() % 32); 00424 src = 
concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS)); 00425 isAligned32 = false; 00426 } 00427 else 00428 src = src1; 00429 00430 int width = src.getWidth(); 00431 int height = src.getHeight(); 00432 ASSERT(width % 2 == 0); 00433 ASSERT(height % 2 == 0); 00434 int dstride = width * 4; 00435 int sstride = width; 00436 00437 /* ensure stride is 16-byte aligned and add 32 extra bytes for the 00438 * border padding */ 00439 uint8_t *bayer_planes[4]; 00440 int plane_stride = ((width + 0xf)&(~0xf)) + 32; 00441 for (int i = 0; i < 4; i++) { 00442 bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2)); 00443 } 00444 00445 // alocate a 16-byte aligned buffer for the interpolated image 00446 int bgra_stride = width*4; 00447 uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride); 00448 00449 // allocate a 16-byte aligned buffer for the source image 00450 int bayer_stride = width; 00451 uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride); 00452 00453 // copy the source image into the 16-byte aligned buffer 00454 copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride, 00455 bayer_img, bayer_stride, 00456 0, 0, 0, 0, width, height, 8); 00457 00458 // split the bayer image 00459 uint8_t * planes[4] = { 00460 bayer_planes[0] + plane_stride + 16, 00461 bayer_planes[1] + plane_stride + 16, 00462 bayer_planes[2] + plane_stride + 16, 00463 bayer_planes[3] + plane_stride + 16, 00464 }; 00465 int p_width = width / 2; 00466 int p_height = height / 2; 00467 00468 splitBayerPlanes_8u (planes, plane_stride, 00469 bayer_img, bayer_stride, p_width, p_height); 00470 for (int j = 0; j < 4; j++) 00471 replicateBorder_8u (planes[j], plane_stride, p_width, p_height); 00472 00473 00474 if( bayerInterpolateTo_8u_bgra_sse3 (planes,plane_stride, 00475 bgra_img, bgra_stride, 00476 width, height, format) < 0) 00477 LFATAL("error in debayer with sse3"); 00478 // copy to destination 00479 uint8_t * dest = (uint8_t*)memalign(16, dstride*height); 00480 
copy_8u_generic (bgra_img, bgra_stride, 00481 dest, dstride, 0, 0, 0, 0, width, height, 8 * 4); 00482 00483 Image<PixRGB<T> > res(width, height, NO_INIT); 00484 typename Image<PixRGB<T> >::iterator dptr = res.beginw(); 00485 T* sptr = (T*)dest; 00486 00487 for(int y =0; y < height; y++) 00488 { 00489 for(int x =0; x < width; x++) 00490 { 00491 dptr[0].p[2] = *sptr++; 00492 dptr[0].p[1] = *sptr++; 00493 dptr[0].p[0] = *sptr++; 00494 dptr++; 00495 sptr++; // for the A channel 00496 } 00497 } 00498 00499 for (int i=0; i<4; i++) { 00500 free (bayer_planes[i]); 00501 } 00502 free(dest); 00503 free(bayer_img); 00504 free (bgra_img); 00505 00506 if(!isAligned32) 00507 res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height)); 00508 return res; 00509 #endif //INVT_USE_SSE3 00510 } 00511 00512 int 00513 bayerInterpolateTo_8u_bgra_sse3 (uint8_t ** src, int sstride, 00514 uint8_t * dst, int dstride, int width, int height, 00515 BayerFormat format) 00516 { 00517 # ifndef INVT_USE_SSE3 00518 LFATAL("you must have sse3 support"); 00519 return -1; 00520 #else 00521 int i, j; 00522 for (i = 0; i < 4; i++) { 00523 if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) { 00524 LERROR("%s: src[%d] is not 16-byte aligned\n", 00525 __FUNCTION__, i); 00526 return -1; 00527 } 00528 } 00529 if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) { 00530 LERROR("%s: dst is not 16-byte aligned or 128-byte stride " 00531 "aligned\n", __FUNCTION__); 00532 return -1; 00533 } 00534 00535 __m128i z = _mm_set1_epi32 (0); 00536 __m128i c3 = _mm_set1_epi16 (3); 00537 __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr; 00538 __m128i bgl1, bgl2, ral1, ral2; 00539 __m128i bgr1, bgr2, rar1, rar2; 00540 __m128i bgral1, bgral2, bgral3, bgral4; 00541 __m128i bgrar1, bgrar2, bgrar3, bgrar4; 00542 __m128i v1, v2, w1, w2; 00543 00544 if (format == BAYER_GBRG || 00545 format == BAYER_RGGB) { 00546 int drow_offset1 = 0; 00547 int drow_offset2 = dstride; 00548 int kernel_stride = sstride; 00549 uint8_t * gb_plane 
= src[0]; 00550 uint8_t * b_plane = src[1]; 00551 uint8_t * r_plane = src[2]; 00552 uint8_t * gr_plane = src[3]; 00553 if (format == BAYER_RGGB) { 00554 drow_offset1 = dstride; 00555 drow_offset2 = 0; 00556 kernel_stride = -sstride; 00557 r_plane = src[0]; 00558 gr_plane = src[1]; 00559 gb_plane = src[2]; 00560 b_plane = src[3]; 00561 } 00562 00563 for (i = 0; i < width/2; i += 16) { 00564 uint8_t * dcol = dst + i*8; 00565 00566 for (j = 0; j < height/2; j++) { 00567 INTERPOLATE_GB_ROW (kernel_stride, 1); 00568 00569 uint8_t * drow = dcol + j*2*dstride + drow_offset1; 00570 _mm_store_si128 ((__m128i *)drow, 00571 _mm_unpacklo_epi32 (bgral1, bgrar1)); 00572 _mm_store_si128 ((__m128i *)(drow+16), 00573 _mm_unpackhi_epi32 (bgral1, bgrar1)); 00574 _mm_store_si128 ((__m128i *)(drow+32), 00575 _mm_unpacklo_epi32 (bgral2, bgrar2)); 00576 _mm_store_si128 ((__m128i *)(drow+48), 00577 _mm_unpackhi_epi32 (bgral2, bgrar2)); 00578 _mm_store_si128 ((__m128i *)(drow+64), 00579 _mm_unpacklo_epi32 (bgral3, bgrar3)); 00580 _mm_store_si128 ((__m128i *)(drow+80), 00581 _mm_unpackhi_epi32 (bgral3, bgrar3)); 00582 _mm_store_si128 ((__m128i *)(drow+96), 00583 _mm_unpacklo_epi32 (bgral4, bgrar4)); 00584 _mm_store_si128 ((__m128i *)(drow+112), 00585 _mm_unpackhi_epi32 (bgral4, bgrar4)); 00586 00587 INTERPOLATE_RG_ROW (kernel_stride, 1); 00588 00589 drow = dcol + j*2*dstride + drow_offset2; 00590 _mm_store_si128 ((__m128i *)drow, 00591 _mm_unpacklo_epi32 (bgral1, bgrar1)); 00592 _mm_store_si128 ((__m128i *)(drow+16), 00593 _mm_unpackhi_epi32 (bgral1, bgrar1)); 00594 _mm_store_si128 ((__m128i *)(drow+32), 00595 _mm_unpacklo_epi32 (bgral2, bgrar2)); 00596 _mm_store_si128 ((__m128i *)(drow+48), 00597 _mm_unpackhi_epi32 (bgral2, bgrar2)); 00598 _mm_store_si128 ((__m128i *)(drow+64), 00599 _mm_unpacklo_epi32 (bgral3, bgrar3)); 00600 _mm_store_si128 ((__m128i *)(drow+80), 00601 _mm_unpackhi_epi32 (bgral3, bgrar3)); 00602 _mm_store_si128 ((__m128i *)(drow+96), 00603 _mm_unpacklo_epi32 (bgral4, 
bgrar4)); 00604 _mm_store_si128 ((__m128i *)(drow+112), 00605 _mm_unpackhi_epi32 (bgral4, bgrar4)); 00606 00607 } 00608 gb_plane += 16; 00609 b_plane += 16; 00610 r_plane += 16; 00611 gr_plane += 16; 00612 } 00613 } 00614 else { 00615 int drow_offset1 = 0; 00616 int drow_offset2 = dstride; 00617 int kernel_stride = sstride; 00618 uint8_t * b_plane = src[0]; 00619 uint8_t * gb_plane = src[1]; 00620 uint8_t * gr_plane = src[2]; 00621 uint8_t * r_plane = src[3]; 00622 if (format == BAYER_GRBG) { 00623 drow_offset1 = dstride; 00624 drow_offset2 = 0; 00625 kernel_stride = -sstride; 00626 gr_plane = src[0]; 00627 r_plane = src[1]; 00628 b_plane = src[2]; 00629 gb_plane = src[3]; 00630 } 00631 00632 for (i = 0; i < width/2; i += 16) { 00633 uint8_t * dcol = dst + i*8; 00634 00635 for (j = 0; j < height/2; j++) { 00636 INTERPOLATE_GB_ROW (kernel_stride, -1); 00637 00638 uint8_t * drow = dcol + j*2*dstride + drow_offset1; 00639 _mm_store_si128 ((__m128i *)drow, 00640 _mm_unpacklo_epi32 (bgrar1, bgral1)); 00641 _mm_store_si128 ((__m128i *)(drow+16), 00642 _mm_unpackhi_epi32 (bgrar1, bgral1)); 00643 _mm_store_si128 ((__m128i *)(drow+32), 00644 _mm_unpacklo_epi32 (bgrar2, bgral2)); 00645 _mm_store_si128 ((__m128i *)(drow+48), 00646 _mm_unpackhi_epi32 (bgrar2, bgral2)); 00647 _mm_store_si128 ((__m128i *)(drow+64), 00648 _mm_unpacklo_epi32 (bgrar3, bgral3)); 00649 _mm_store_si128 ((__m128i *)(drow+80), 00650 _mm_unpackhi_epi32 (bgrar3, bgral3)); 00651 _mm_store_si128 ((__m128i *)(drow+96), 00652 _mm_unpacklo_epi32 (bgrar4, bgral4)); 00653 _mm_store_si128 ((__m128i *)(drow+112), 00654 _mm_unpackhi_epi32 (bgrar4, bgral4)); 00655 00656 INTERPOLATE_RG_ROW (kernel_stride, -1); 00657 00658 drow = dcol + j*2*dstride + drow_offset2; 00659 _mm_store_si128 ((__m128i *)drow, 00660 _mm_unpacklo_epi32 (bgrar1, bgral1)); 00661 _mm_store_si128 ((__m128i *)(drow+16), 00662 _mm_unpackhi_epi32 (bgrar1, bgral1)); 00663 _mm_store_si128 ((__m128i *)(drow+32), 00664 _mm_unpacklo_epi32 (bgrar2, 
bgral2)); 00665 _mm_store_si128 ((__m128i *)(drow+48), 00666 _mm_unpackhi_epi32 (bgrar2, bgral2)); 00667 _mm_store_si128 ((__m128i *)(drow+64), 00668 _mm_unpacklo_epi32 (bgrar3, bgral3)); 00669 _mm_store_si128 ((__m128i *)(drow+80), 00670 _mm_unpackhi_epi32 (bgrar3, bgral3)); 00671 _mm_store_si128 ((__m128i *)(drow+96), 00672 _mm_unpacklo_epi32 (bgrar4, bgral4)); 00673 _mm_store_si128 ((__m128i *)(drow+112), 00674 _mm_unpackhi_epi32 (bgrar4, bgral4)); 00675 00676 } 00677 gb_plane += 16; 00678 b_plane += 16; 00679 r_plane += 16; 00680 gr_plane += 16; 00681 } 00682 } 00683 return 0; 00684 #endif 00685 } 00686 00687 template Image<PixRGB<byte> > debayerSSE3(const Image<byte>& src, BayerFormat format); 00688 template Image<PixRGB<uint16> > debayerSSE3(const Image<uint16>& src, BayerFormat format); 00689 // ###################################################################### 00690 /* So things look consistent in everyone's emacs... */ 00691 /* Local Variables: */ 00692 /* indent-tabs-mode: nil */ 00693 /* End: */