00001 /*!@file Raster/DebayerSSE2.C is the debayer class with sse2 */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the // 00005 // University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: Zhicheng Li <zhicheng@usc.edu> 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Raster/DeBayerSSE2.C $ 00035 // $Id: DeBayerSSE2.C 10794 2009-02-08 06:21:09Z itti $ 00036 // 00037 #include <stdio.h> 00038 #include <stdint.h> 00039 #include <emmintrin.h> 00040 #include <stdlib.h> 00041 00042 // on some platforms, memalign is defined in <malloc.h>, but that file 00043 // does not exist on Darwin. On Darwin, including stdlib.h is sufficient. 00044 // Let's here also include malloc.h unless we are on Darwin: 00045 #ifndef MACHINE_OS_DARWIN 00046 #include <malloc.h> 00047 #endif 00048 00049 #include "Image/Image.H" 00050 #include "Image/CutPaste.H" 00051 #include "Raster/DeBayerSSE2.H" 00052 00053 using namespace std; 00054 00055 // ######################## debayer with SSE2 accelerate ##############// 00056 // #######################################################################// 00057 /* BOX_FILT evaluates this kernel: 00058 * 1 1 00059 * 1 1 00060 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00061 * the computation (stored as 16s). ptr points to the first pixel of the 00062 * strip, and must be 16-byte aligned. str is the stride of image rows in 00063 * bytes. If stride is positive, the origin of the kernel is in the top 00064 * row, if negative, the origin is in the bottom row. off is 1 to put 00065 * the origin in the left column of the kernel, or -1 to put the origin 00066 * in the right column. 00067 */ 00068 #define BOX_FILT(v1,v2,ptr,str,off) do { \ 00069 __m128i t1, t2, t3; \ 00070 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00071 v1 = _mm_unpacklo_epi8 (t1, z); \ 00072 v2 = _mm_unpackhi_epi8 (t1, z); \ 00073 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00074 t2 = _mm_unpacklo_epi8 (t1, z); \ 00075 t3 = _mm_unpackhi_epi8 (t1, z); \ 00076 v1 = _mm_add_epi16 (v1, t2); \ 00077 v2 = _mm_add_epi16 (v2, t3); \ 00078 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + off)); \ 00079 t2 = _mm_unpacklo_epi8 (t1, z); \ 00080 t3 = _mm_unpackhi_epi8 (t1, z); \ 00081 v1 = _mm_add_epi16 (v1, t2); \ 00082 v2 = _mm_add_epi16 (v2, t3); \ 00083 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (str) + off)); \ 00084 t2 = _mm_unpacklo_epi8 (t1, z); \ 00085 t3 = _mm_unpackhi_epi8 (t1, z); \ 00086 v1 = _mm_add_epi16 (v1, t2); \ 00087 v2 = _mm_add_epi16 (v2, t3); \ 00088 } while (0) 00089 00090 /* CROSS_FILT_VERT evaluates this kernel: 00091 * 1/2 00092 * -1 5 -1 00093 * 1/2 00094 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00095 * the computation (stored as 16s). ptr points to the first pixel of the 00096 * strip, and must be 16-byte aligned. str is the stride of image rows in 00097 * bytes. The origin of the kernel is at the center. 00098 */ 00099 #define CROSS_FILT_VERT(v1,v2,ptr,str) do { \ 00100 __m128i t1, t2, t3, c10; \ 00101 c10 = _mm_set1_epi16 (10); \ 00102 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00103 v1 = _mm_unpacklo_epi8 (t1, z); \ 00104 v2 = _mm_unpackhi_epi8 (t1, z); \ 00105 v1 = _mm_mullo_epi16 (v1, c10); \ 00106 v2 = _mm_mullo_epi16 (v2, c10); \ 00107 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00108 t2 = _mm_unpacklo_epi8 (t1, z); \ 00109 t3 = _mm_unpackhi_epi8 (t1, z); \ 00110 v1 = _mm_add_epi16 (v1, t2); \ 00111 v2 = _mm_add_epi16 (v2, t3); \ 00112 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00113 t2 = _mm_unpacklo_epi8 (t1, z); \ 00114 t3 = _mm_unpackhi_epi8 (t1, z); \ 00115 v1 = _mm_add_epi16 (v1, t2); \ 00116 v2 = _mm_add_epi16 (v2, t3); \ 00117 v1 = _mm_srli_epi16 (v1, 1); \ 00118 v2 = _mm_srli_epi16 (v2, 1); \ 00119 t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \ 00120 t2 = _mm_unpacklo_epi8 (t1, z); \ 00121 t3 = _mm_unpackhi_epi8 (t1, z); \ 00122 v1 = _mm_subs_epi16 (v1, t2); \ 00123 v2 = _mm_subs_epi16 (v2, t3); \ 00124 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \ 00125 t2 = _mm_unpacklo_epi8 (t1, z); \ 00126 t3 = _mm_unpackhi_epi8 (t1, z); \ 00127 v1 = _mm_subs_epi16 (v1, t2); \ 00128 v2 = _mm_subs_epi16 (v2, t3); \ 00129 } while (0) 00130 00131 /* HORIZ2_FILT evaluates this kernel: 00132 * 1 1 00133 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00134 * the computation (stored as 16s). ptr points to the first pixel of the 00135 * strip, and must be 16-byte aligned. str is the stride of image rows in 00136 * bytes (unused). off is 1 to put the origin in the left column of the 00137 * kernel, or -1 to put the origin in the right column. 00138 */ 00139 #define HORIZ2_FILT(v1,v2,ptr,str,off) do { \ 00140 __m128i t1, t2, t3; \ 00141 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00142 v1 = _mm_unpacklo_epi8 (t1, z); \ 00143 v2 = _mm_unpackhi_epi8 (t1, z); \ 00144 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + off)); \ 00145 t2 = _mm_unpacklo_epi8 (t1, z); \ 00146 t3 = _mm_unpackhi_epi8 (t1, z); \ 00147 v1 = _mm_add_epi16 (v1, t2); \ 00148 v2 = _mm_add_epi16 (v2, t3); \ 00149 } while (0) 00150 00151 /* VERT2_FILT evaluates this kernel: 00152 * 1 00153 * 1 00154 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00155 * the computation (stored as 16s). ptr points to the first pixel of the 00156 * strip, and must be 16-byte aligned. str is the stride of image rows in 00157 * bytes. If stride is positive, the origin of the kernel is in the top 00158 * row, if negative, the origin is in the bottom row. 00159 */ 00160 #define VERT2_FILT(v1,v2,ptr,str) do { \ 00161 __m128i t1, t2, t3; \ 00162 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00163 v1 = _mm_unpacklo_epi8 (t1, z); \ 00164 v2 = _mm_unpackhi_epi8 (t1, z); \ 00165 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00166 t2 = _mm_unpacklo_epi8 (t1, z); \ 00167 t3 = _mm_unpackhi_epi8 (t1, z); \ 00168 v1 = _mm_add_epi16 (v1, t2); \ 00169 v2 = _mm_add_epi16 (v2, t3); \ 00170 } while (0) 00171 00172 /* CROSS_FILT_SYM evaluates this kernel: 00173 * -1 00174 * -1 4 -1 00175 * -1 00176 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00177 * the computation (stored as 16s). ptr points to the first pixel of the 00178 * strip, and must be 16-byte aligned. str is the stride of image rows in 00179 * bytes. The origin of the kernel is at the center. 00180 */ 00181 #define CROSS_FILT_SYM(v1,v2,ptr,str) do { \ 00182 __m128i t1, t2, t3; \ 00183 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00184 v1 = _mm_unpacklo_epi8 (t1, z); \ 00185 v2 = _mm_unpackhi_epi8 (t1, z); \ 00186 v1 = _mm_slli_epi16 (v1, 2); \ 00187 v2 = _mm_slli_epi16 (v2, 2); \ 00188 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00189 t2 = _mm_unpacklo_epi8 (t1, z); \ 00190 t3 = _mm_unpackhi_epi8 (t1, z); \ 00191 v1 = _mm_subs_epi16 (v1, t2); \ 00192 v2 = _mm_subs_epi16 (v2, t3); \ 00193 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00194 t2 = _mm_unpacklo_epi8 (t1, z); \ 00195 t3 = _mm_unpackhi_epi8 (t1, z); \ 00196 v1 = _mm_subs_epi16 (v1, t2); \ 00197 v2 = _mm_subs_epi16 (v2, t3); \ 00198 t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \ 00199 t2 = _mm_unpacklo_epi8 (t1, z); \ 00200 t3 = _mm_unpackhi_epi8 (t1, z); \ 00201 v1 = _mm_subs_epi16 (v1, t2); \ 00202 v2 = _mm_subs_epi16 (v2, t3); \ 00203 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \ 00204 t2 = _mm_unpacklo_epi8 (t1, z); \ 00205 t3 = _mm_unpackhi_epi8 (t1, z); \ 00206 v1 = _mm_subs_epi16 (v1, t2); \ 00207 v2 = _mm_subs_epi16 (v2, t3); \ 00208 } while (0) 00209 00210 /* CROSS_FILT_HORIZ evaluates this kernel: 00211 * -1 00212 * 1/2 5 1/2 00213 * -1 00214 * For a 1x16 strip of pixels of an 8u image. v1 and v2 hold the result of 00215 * the computation (stored as 16s). ptr points to the first pixel of the 00216 * strip, and must be 16-byte aligned. str is the stride of image rows in 00217 * bytes. The origin of the kernel is at the center. 00218 */ 00219 #define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \ 00220 __m128i t1, t2, t3, c10; \ 00221 c10 = _mm_set1_epi16 (10); \ 00222 t1 = _mm_load_si128 ((__m128i *)(ptr)); \ 00223 v1 = _mm_unpacklo_epi8 (t1, z); \ 00224 v2 = _mm_unpackhi_epi8 (t1, z); \ 00225 v1 = _mm_mullo_epi16 (v1, c10); \ 00226 v2 = _mm_mullo_epi16 (v2, c10); \ 00227 t1 = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \ 00228 t2 = _mm_unpacklo_epi8 (t1, z); \ 00229 t3 = _mm_unpackhi_epi8 (t1, z); \ 00230 v1 = _mm_add_epi16 (v1, t2); \ 00231 v2 = _mm_add_epi16 (v2, t3); \ 00232 t1 = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \ 00233 t2 = _mm_unpacklo_epi8 (t1, z); \ 00234 t3 = _mm_unpackhi_epi8 (t1, z); \ 00235 v1 = _mm_add_epi16 (v1, t2); \ 00236 v2 = _mm_add_epi16 (v2, t3); \ 00237 v1 = _mm_srli_epi16 (v1, 1); \ 00238 v2 = _mm_srli_epi16 (v2, 1); \ 00239 t1 = _mm_load_si128 ((__m128i *)((ptr) - (str))); \ 00240 t2 = _mm_unpacklo_epi8 (t1, z); \ 00241 t3 = _mm_unpackhi_epi8 (t1, z); \ 00242 v1 = _mm_subs_epi16 (v1, t2); \ 00243 v2 = _mm_subs_epi16 (v2, t3); \ 00244 t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \ 00245 t2 = _mm_unpacklo_epi8 (t1, z); \ 00246 t3 = _mm_unpackhi_epi8 (t1, z); \ 00247 v1 = _mm_subs_epi16 (v1, t2); \ 00248 v2 = _mm_subs_epi16 (v2, t3); \ 00249 } while (0) 00250 00251 #define INTERPOLATE_GB_ROW(kstride, off) do { \ 00252 CROSS_FILT_VERT (v1, v2, gb_plane + j*sstride, kstride); \ 00253 HORIZ2_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \ 00254 w1 = _mm_slli_epi16 (w1, 2); \ 00255 w2 = _mm_slli_epi16 (w2, 2); \ 00256 v1 = _mm_add_epi16 (v1, w1); \ 00257 v2 = _mm_add_epi16 (v2, w2); \ 00258 BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \ 00259 v1 = _mm_subs_epi16 (v1, w1); \ 00260 v2 = _mm_subs_epi16 (v2, w2); \ 00261 v1 = _mm_srai_epi16 (v1, 3); \ 00262 v2 = _mm_srai_epi16 (v2, 3); \ 00263 bg = _mm_packus_epi16 (v1, v2); \ 00264 \ 00265 VERT2_FILT (v1, v2, gr_plane + j*sstride, -kstride); \ 00266 HORIZ2_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \ 00267 v1 = _mm_add_epi16 (v1, w1); \ 00268 v2 = _mm_add_epi16 (v2, w2); \ 00269 v1 = _mm_slli_epi16 (v1, 1); \ 00270 v2 = _mm_slli_epi16 (v2, 1); \ 00271 CROSS_FILT_SYM (w1, w2, b_plane + j*sstride, kstride); \ 00272 v1 = _mm_add_epi16 (v1, w1); \ 00273 v2 = _mm_add_epi16 (v2, w2); \ 00274 v1 = _mm_srai_epi16 (v1, 3); \ 00275 v2 = _mm_srai_epi16 (v2, 3); \ 00276 gb = _mm_packus_epi16 (v1, v2); \ 00277 \ 00278 CROSS_FILT_HORIZ (v1, v2, gb_plane + j*sstride, kstride); \ 00279 VERT2_FILT (w1, w2, r_plane + j*sstride, -kstride); \ 00280 w1 = _mm_slli_epi16 (w1, 2); \ 00281 w2 = _mm_slli_epi16 (w2, 2); \ 00282 v1 = _mm_add_epi16 (v1, w1); \ 00283 v2 = _mm_add_epi16 (v2, w2); \ 00284 BOX_FILT (w1, w2, gr_plane + j*sstride, -kstride, -off); \ 00285 v1 = _mm_subs_epi16 (v1, w1); \ 00286 v2 = _mm_subs_epi16 (v2, w2); \ 00287 v1 = _mm_srai_epi16 (v1, 3); \ 00288 v2 = _mm_srai_epi16 (v2, 3); \ 00289 rg = _mm_packus_epi16 (v1, v2); \ 00290 \ 00291 CROSS_FILT_SYM (v1, v2, b_plane + j*sstride, kstride); \ 00292 v1 = _mm_mullo_epi16 (v1, c3); \ 00293 v2 = _mm_mullo_epi16 (v2, c3); \ 00294 BOX_FILT (w1, w2, r_plane + j*sstride, -kstride, off); \ 00295 w1 = _mm_slli_epi16 (w1, 2); \ 00296 w2 = _mm_slli_epi16 (w2, 2); \ 00297 v1 = _mm_add_epi16 (v1, w1); \ 00298 v2 = _mm_add_epi16 (v2, w2); \ 00299 v1 = _mm_srai_epi16 (v1, 4); \ 00300 v2 = _mm_srai_epi16 (v2, 4); \ 00301 rb = _mm_packus_epi16 (v1, v2); \ 00302 \ 00303 gg = _mm_load_si128 ((__m128i *)(gb_plane + j*sstride)); \ 00304 bgl1 = _mm_unpacklo_epi8 (bg, gg); \ 00305 bgl2 = _mm_unpackhi_epi8 (bg, gg); \ 00306 \ 00307 a = _mm_set1_epi8 (0xff); \ 00308 ral1 = _mm_unpacklo_epi8 (rg, a); \ 00309 ral2 = _mm_unpackhi_epi8 (rg, a); \ 00310 \ 00311 bb = _mm_load_si128 ((__m128i *)(b_plane + j*sstride)); \ 00312 bgr1 = _mm_unpacklo_epi8 (bb, gb); \ 00313 bgr2 = _mm_unpackhi_epi8 (bb, gb); \ 00314 \ 00315 rar1 = _mm_unpacklo_epi8 (rb, a); \ 00316 rar2 = _mm_unpackhi_epi8 (rb, a); \ 00317 \ 00318 bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \ 00319 bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \ 00320 bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \ 00321 bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \ 00322 \ 00323 bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \ 00324 bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \ 00325 bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \ 00326 bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \ 00327 } while (0) 00328 00329 #define INTERPOLATE_RG_ROW(kstride,off) do { \ 00330 CROSS_FILT_SYM (v1, v2, r_plane + j*sstride, kstride); \ 00331 v1 = _mm_mullo_epi16 (v1, c3); \ 00332 v2 = _mm_mullo_epi16 (v2, c3); \ 00333 BOX_FILT (w1, w2, b_plane + j*sstride, kstride, -off); \ 00334 w1 = _mm_slli_epi16 (w1, 2); \ 00335 w2 = _mm_slli_epi16 (w2, 2); \ 00336 v1 = _mm_add_epi16 (v1, w1); \ 00337 v2 = _mm_add_epi16 (v2, w2); \ 00338 v1 = _mm_srai_epi16 (v1, 4); \ 00339 v2 = _mm_srai_epi16 (v2, 4); \ 00340 br = _mm_packus_epi16 (v1, v2); \ 00341 \ 00342 VERT2_FILT (v1, v2, gb_plane + j*sstride, kstride); \ 00343 HORIZ2_FILT (w1, w2, gr_plane + j*sstride, kstride, -off); \ 00344 v1 = _mm_add_epi16 (v1, w1); \ 00345 v2 = _mm_add_epi16 (v2, w2); \ 00346 v1 = _mm_slli_epi16 (v1, 1); \ 00347 v2 = _mm_slli_epi16 (v2, 1); \ 00348 CROSS_FILT_SYM (w1, w2, r_plane + j*sstride, kstride); \ 00349 v1 = _mm_add_epi16 (v1, w1); \ 00350 v2 = _mm_add_epi16 (v2, w2); \ 00351 v1 = _mm_srai_epi16 (v1, 3); \ 00352 v2 = _mm_srai_epi16 (v2, 3); \ 00353 gr = _mm_packus_epi16 (v1, v2); \ 00354 \ 00355 CROSS_FILT_HORIZ (v1, v2, gr_plane + j*sstride, kstride); \ 00356 VERT2_FILT (w1, w2, b_plane + j*sstride, kstride); \ 00357 w1 = _mm_slli_epi16 (w1, 2); \ 00358 w2 = _mm_slli_epi16 (w2, 2); \ 00359 v1 = _mm_add_epi16 (v1, w1); \ 00360 v2 = _mm_add_epi16 (v2, w2); \ 00361 BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \ 00362 v1 = _mm_subs_epi16 (v1, w1); \ 00363 v2 = _mm_subs_epi16 (v2, w2); \ 00364 v1 = _mm_srai_epi16 (v1, 3); \ 00365 v2 = _mm_srai_epi16 (v2, 3); \ 00366 bg = _mm_packus_epi16 (v1, v2); \ 00367 \ 00368 CROSS_FILT_VERT (v1, v2, gr_plane + j*sstride, kstride); \ 00369 HORIZ2_FILT (w1, w2, r_plane + j*sstride, kstride, off); \ 00370 w1 = _mm_slli_epi16 (w1, 2); \ 00371 w2 = _mm_slli_epi16 (w2, 2); \ 00372 v1 = _mm_add_epi16 (v1, w1); \ 00373 v2 = _mm_add_epi16 (v2, w2); \ 00374 BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, off); \ 00375 v1 = _mm_subs_epi16 (v1, w1); \ 00376 v2 = _mm_subs_epi16 (v2, w2); \ 00377 v1 = _mm_srai_epi16 (v1, 3); \ 00378 v2 = _mm_srai_epi16 (v2, 3); \ 00379 rg = _mm_packus_epi16 (v1, v2); \ 00380 \ 00381 bgl1 = _mm_unpacklo_epi8 (br, gr); \ 00382 bgl2 = _mm_unpackhi_epi8 (br, gr); \ 00383 \ 00384 rr = _mm_load_si128 ((__m128i *)(r_plane + j*sstride)); \ 00385 a = _mm_set1_epi8 (0xff); \ 00386 ral1 = _mm_unpacklo_epi8 (rr, a); \ 00387 ral2 = _mm_unpackhi_epi8 (rr, a); \ 00388 \ 00389 gg = _mm_load_si128 ((__m128i *)(gr_plane + j*sstride)); \ 00390 bgr1 = _mm_unpacklo_epi8 (bg, gg); \ 00391 bgr2 = _mm_unpackhi_epi8 (bg, gg); \ 00392 \ 00393 rar1 = _mm_unpacklo_epi8 (rg, a); \ 00394 rar2 = _mm_unpackhi_epi8 (rg, a); \ 00395 \ 00396 bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \ 00397 bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \ 00398 bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \ 00399 bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \ 00400 \ 00401 bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \ 00402 bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \ 00403 bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \ 00404 bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \ 00405 } while (0) 00406 00407 template <class T> Image<PixRGB<T> > 00408 debayerSSE2 (const Image<T>& src1, BayerFormat format) 00409 { 00410 #ifndef INVT_USE_SSEDB 00411 LFATAL("you must have SSE2 support"); 00412 return Image<PixRGB<T> >(); 00413 #else 00414 //! althogh we use a template here but actuaaly we only support the 8 bit depth mode 00415 00416 /* make sure that the source image stride can be divied by 32 */ 00417 bool isAligned32 = true; 00418 int patchWidth = 0; 00419 Image<T> src; 00420 if ((src1.getWidth() % 32) != 0) 00421 { 00422 patchWidth = 32 - (src1.getWidth() % 16); 00423 src = concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS)); 00424 isAligned32 = false; 00425 } 00426 else 00427 src = src1; 00428 00429 int width = src.getWidth(); 00430 int height = src.getHeight(); 00431 ASSERT(width % 2 == 0); 00432 ASSERT(height % 2 == 0); 00433 int dstride = width * 4; 00434 int sstride = width; 00435 00436 /* ensure stride is 16-byte aligned and add 32 extra bytes for the 00437 * border padding */ 00438 uint8_t *bayer_planes[4]; 00439 int plane_stride = ((width + 0xf)&(~0xf)) + 32; 00440 for (int i = 0; i < 4; i++) { 00441 bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2)); 00442 } 00443 00444 // alocate a 16-byte aligned buffer for the interpolated image 00445 int bgra_stride = width*4; 00446 uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride); 00447 00448 // allocate a 16-byte aligned buffer for the source image 00449 int bayer_stride = width; 00450 uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride); 00451 00452 // copy the source image into the 16-byte aligned buffer 00453 copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride, 00454 bayer_img, bayer_stride, 00455 0, 0, 0, 0, width, height, 8); 00456 00457 // split the bayer image 00458 uint8_t * planes[4] = { 00459 bayer_planes[0] + plane_stride + 16, 00460 bayer_planes[1] + plane_stride + 16, 00461 bayer_planes[2] + plane_stride + 16, 00462 bayer_planes[3] + plane_stride + 16, 00463 }; 00464 int p_width = width / 2; 00465 int p_height = height / 2; 00466 00467 splitBayerPlanes_8u (planes, plane_stride, 00468 bayer_img, bayer_stride, p_width, p_height); 00469 for (int j = 0; j < 4; j++) 00470 replicateBorder_8u (planes[j], plane_stride, p_width, p_height); 00471 00472 00473 if(bayerInterpolateTo_8u_bgra_sse2 (planes,plane_stride, 00474 bgra_img, bgra_stride, 00475 width, height, format) < 0) 00476 LFATAL("error in debayer with sse2"); 00477 00478 // copy to destination 00479 uint8_t * dest = (uint8_t*)memalign(16, dstride*height); 00480 copy_8u_generic (bgra_img, bgra_stride, 00481 dest, dstride, 0, 0, 0, 0, width, height, 8 * 4); 00482 00483 Image<PixRGB<T> > res(width, height, NO_INIT); 00484 typename Image<PixRGB<T> >::iterator dptr = res.beginw(); 00485 T* sptr = (T*)dest; 00486 00487 for(int y =0; y < height; y++) 00488 { 00489 for(int x =0; x < width; x++) 00490 { 00491 dptr[0].p[2] = *sptr++; 00492 dptr[0].p[1] = *sptr++; 00493 dptr[0].p[0] = *sptr++; 00494 dptr++; 00495 sptr++; // for the A channel 00496 } 00497 } 00498 00499 for (int i=0; i<4; i++) { 00500 free (bayer_planes[i]); 00501 } 00502 free(dest); 00503 free(bayer_img); 00504 free (bgra_img); 00505 00506 if(!isAligned32) 00507 res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height)); 00508 00509 return res; 00510 #endif //INVT_USE_SSEDB 00511 } 00512 00513 int 00514 bayerInterpolateTo_8u_bgra_sse2 (uint8_t ** src, int sstride, 00515 uint8_t * dst, int dstride, int width, int height, 00516 BayerFormat format) 00517 { 00518 # ifndef INVT_USE_SSE3 00519 LFATAL("you must have sse3 support"); 00520 return -1; 00521 #else 00522 int i, j; 00523 for (i = 0; i < 4; i++) { 00524 if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) { 00525 LERROR("%s: src[%d] is not 16-byte aligned", __FUNCTION__, i); 00526 return -1; 00527 } 00528 } 00529 if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) { 00530 LERROR("%s: dst is not 16-byte aligned or 128-byte stride aligned", __FUNCTION__); 00531 return -1; 00532 } 00533 00534 __m128i z = _mm_set1_epi32 (0); 00535 __m128i c3 = _mm_set1_epi16 (3); 00536 __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr; 00537 __m128i bgl1, bgl2, ral1, ral2; 00538 __m128i bgr1, bgr2, rar1, rar2; 00539 __m128i bgral1, bgral2, bgral3, bgral4; 00540 __m128i bgrar1, bgrar2, bgrar3, bgrar4; 00541 __m128i v1, v2, w1, w2; 00542 00543 if (format == BAYER_GBRG || 00544 format == BAYER_RGGB) { 00545 int drow_offset1 = 0; 00546 int drow_offset2 = dstride; 00547 int kernel_stride = sstride; 00548 uint8_t * gb_plane = src[0]; 00549 uint8_t * b_plane = src[1]; 00550 uint8_t * r_plane = src[2]; 00551 uint8_t * gr_plane = src[3]; 00552 if (format == BAYER_RGGB) { 00553 drow_offset1 = dstride; 00554 drow_offset2 = 0; 00555 kernel_stride = -sstride; 00556 r_plane = src[0]; 00557 gr_plane = src[1]; 00558 gb_plane = src[2]; 00559 b_plane = src[3]; 00560 } 00561 00562 for (i = 0; i < width/2; i += 16) { 00563 uint8_t * dcol = dst + i*8; 00564 00565 for (j = 0; j < height/2; j++) { 00566 INTERPOLATE_GB_ROW (kernel_stride, 1); 00567 00568 uint8_t * drow = dcol + j*2*dstride + drow_offset1; 00569 _mm_store_si128 ((__m128i *)drow, 00570 _mm_unpacklo_epi32 (bgral1, bgrar1)); 00571 _mm_store_si128 ((__m128i *)(drow+16), 00572 _mm_unpackhi_epi32 (bgral1, bgrar1)); 00573 _mm_store_si128 ((__m128i *)(drow+32), 00574 _mm_unpacklo_epi32 (bgral2, bgrar2)); 00575 _mm_store_si128 ((__m128i *)(drow+48), 00576 _mm_unpackhi_epi32 (bgral2, bgrar2)); 00577 _mm_store_si128 ((__m128i *)(drow+64), 00578 _mm_unpacklo_epi32 (bgral3, bgrar3)); 00579 _mm_store_si128 ((__m128i *)(drow+80), 00580 _mm_unpackhi_epi32 (bgral3, bgrar3)); 00581 _mm_store_si128 ((__m128i *)(drow+96), 00582 _mm_unpacklo_epi32 (bgral4, bgrar4)); 00583 _mm_store_si128 ((__m128i *)(drow+112), 00584 _mm_unpackhi_epi32 (bgral4, bgrar4)); 00585 00586 INTERPOLATE_RG_ROW (kernel_stride, 1); 00587 00588 drow = dcol + j*2*dstride + drow_offset2; 00589 _mm_store_si128 ((__m128i *)drow, 00590 _mm_unpacklo_epi32 (bgral1, bgrar1)); 00591 _mm_store_si128 ((__m128i *)(drow+16), 00592 _mm_unpackhi_epi32 (bgral1, bgrar1)); 00593 _mm_store_si128 ((__m128i *)(drow+32), 00594 _mm_unpacklo_epi32 (bgral2, bgrar2)); 00595 _mm_store_si128 ((__m128i *)(drow+48), 00596 _mm_unpackhi_epi32 (bgral2, bgrar2)); 00597 _mm_store_si128 ((__m128i *)(drow+64), 00598 _mm_unpacklo_epi32 (bgral3, bgrar3)); 00599 _mm_store_si128 ((__m128i *)(drow+80), 00600 _mm_unpackhi_epi32 (bgral3, bgrar3)); 00601 _mm_store_si128 ((__m128i *)(drow+96), 00602 _mm_unpacklo_epi32 (bgral4, bgrar4)); 00603 _mm_store_si128 ((__m128i *)(drow+112), 00604 _mm_unpackhi_epi32 (bgral4, bgrar4)); 00605 00606 } 00607 gb_plane += 16; 00608 b_plane += 16; 00609 r_plane += 16; 00610 gr_plane += 16; 00611 } 00612 } 00613 else { 00614 int drow_offset1 = 0; 00615 int drow_offset2 = dstride; 00616 int kernel_stride = sstride; 00617 uint8_t * b_plane = src[0]; 00618 uint8_t * gb_plane = src[1]; 00619 uint8_t * gr_plane = src[2]; 00620 uint8_t * r_plane = src[3]; 00621 if (format == BAYER_GRBG) { 00622 drow_offset1 = dstride; 00623 drow_offset2 = 0; 00624 kernel_stride = -sstride; 00625 gr_plane = src[0]; 00626 r_plane = src[1]; 00627 b_plane = src[2]; 00628 gb_plane = src[3]; 00629 } 00630 00631 for (i = 0; i < width/2; i += 16) { 00632 uint8_t * dcol = dst + i*8; 00633 00634 for (j = 0; j < height/2; j++) { 00635 INTERPOLATE_GB_ROW (kernel_stride, -1); 00636 00637 uint8_t * drow = dcol + j*2*dstride + drow_offset1; 00638 _mm_store_si128 ((__m128i *)drow, 00639 _mm_unpacklo_epi32 (bgrar1, bgral1)); 00640 _mm_store_si128 ((__m128i *)(drow+16), 00641 _mm_unpackhi_epi32 (bgrar1, bgral1)); 00642 _mm_store_si128 ((__m128i *)(drow+32), 00643 _mm_unpacklo_epi32 (bgrar2, bgral2)); 00644 _mm_store_si128 ((__m128i *)(drow+48), 00645 _mm_unpackhi_epi32 (bgrar2, bgral2)); 00646 _mm_store_si128 ((__m128i *)(drow+64), 00647 _mm_unpacklo_epi32 (bgrar3, bgral3)); 00648 _mm_store_si128 ((__m128i *)(drow+80), 00649 _mm_unpackhi_epi32 (bgrar3, bgral3)); 00650 _mm_store_si128 ((__m128i *)(drow+96), 00651 _mm_unpacklo_epi32 (bgrar4, bgral4)); 00652 _mm_store_si128 ((__m128i *)(drow+112), 00653 _mm_unpackhi_epi32 (bgrar4, bgral4)); 00654 00655 INTERPOLATE_RG_ROW (kernel_stride, -1); 00656 00657 drow = dcol + j*2*dstride + drow_offset2; 00658 _mm_store_si128 ((__m128i *)drow, 00659 _mm_unpacklo_epi32 (bgrar1, bgral1)); 00660 _mm_store_si128 ((__m128i *)(drow+16), 00661 _mm_unpackhi_epi32 (bgrar1, bgral1)); 00662 _mm_store_si128 ((__m128i *)(drow+32), 00663 _mm_unpacklo_epi32 (bgrar2, bgral2)); 00664 _mm_store_si128 ((__m128i *)(drow+48), 00665 _mm_unpackhi_epi32 (bgrar2, bgral2)); 00666 _mm_store_si128 ((__m128i *)(drow+64), 00667 _mm_unpacklo_epi32 (bgrar3, bgral3)); 00668 _mm_store_si128 ((__m128i *)(drow+80), 00669 _mm_unpackhi_epi32 (bgrar3, bgral3)); 00670 _mm_store_si128 ((__m128i *)(drow+96), 00671 _mm_unpacklo_epi32 (bgrar4, bgral4)); 00672 _mm_store_si128 ((__m128i *)(drow+112), 00673 _mm_unpackhi_epi32 (bgrar4, bgral4)); 00674 00675 } 00676 gb_plane += 16; 00677 b_plane += 16; 00678 r_plane += 16; 00679 gr_plane += 16; 00680 } 00681 } 00682 return 0; 00683 #endif 00684 } 00685 00686 template Image<PixRGB<byte> > debayerSSE2(const Image<byte>& src, BayerFormat format); 00687 template Image<PixRGB<uint16> > debayerSSE2(const Image<uint16>& src, BayerFormat format); 00688 00689 // ###################################################################### 00690 /* So things look consistent in everyone's emacs... */ 00691 /* Local Variables: */ 00692 /* indent-tabs-mode: nil */ 00693 /* End: */