DeBayerSSE3.C

00001 /*!@file Raster/DeBayerSSE3.C Bayer demosaicing (debayer) routines accelerated with SSE3 */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the //
00005 // University of Southern California (USC) and the iLab at USC.         //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Zhicheng Li <zhicheng@usc.edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Raster/DeBayerSSE3.C $
00035 // $Id: DeBayerSSE3.C 10794 2009-02-08 06:21:09Z itti $
00036 //
00037 #include <stdio.h>
00038 #include <stdint.h>
00039 #include <emmintrin.h>
00040 #include <stdlib.h>
00041 
00042 // on some platforms, memalign is defined in <malloc.h>, but that file
00043 // does not exist on Darwin. On Darwin, including stdlib.h is sufficient.
00044 // Let's here also include malloc.h unless we are on Darwin:
00045 #ifndef MACHINE_OS_DARWIN
00046 #include <malloc.h>
00047 #endif
00048 
00049 #include "Image/Image.H"
00050 #include "Image/CutPaste.H"
00051 #include "Raster/DeBayerSSE3.H"
00052 #include "Raster/DeBayerSSE2.H"
00053 
00054 // ######################  debayer with SSE3 acceleration  ##############//
00055 // #######################################################################//
00056 
00057 
/* BOX_FILT: 2x2 box sum
 *     1  1
 *     1  1
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image.
 *
 *   v1, v2  receive the sums as 16-bit lanes: v1 for pixels 0..7 of the
 *           strip, v2 for pixels 8..15.
 *   ptr     address of the first pixel of the strip; it is read with an
 *           aligned load, so it must be 16-byte aligned (and so must
 *           ptr + str).
 *   str     row stride in bytes.  A positive stride puts the kernel
 *           origin in the top row, a negative one in the bottom row.
 *   off     +1 puts the origin in the left column of the kernel, -1 puts
 *           it in the right column.
 *
 * An all-zero __m128i named z must be in scope where the macro is
 * expanded; it is used to widen the bytes to 16 bits.
 */
#define BOX_FILT(v1,v2,ptr,str,off) do { \
    /* the four taps: origin, vertical, horizontal and diagonal neighbour */ \
    const __m128i bx_org_  = _mm_load_si128  ((__m128i *)(ptr)); \
    const __m128i bx_vert_ = _mm_load_si128  ((__m128i *)((ptr) + (str))); \
    const __m128i bx_horz_ = _mm_lddqu_si128 ((__m128i *)((ptr) + (off))); \
    const __m128i bx_diag_ = _mm_lddqu_si128 ((__m128i *)((ptr) + (str) + (off))); \
    v1 = _mm_add_epi16 (_mm_unpacklo_epi8 (bx_org_, z), \
                        _mm_unpacklo_epi8 (bx_vert_, z)); \
    v2 = _mm_add_epi16 (_mm_unpackhi_epi8 (bx_org_, z), \
                        _mm_unpackhi_epi8 (bx_vert_, z)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (bx_horz_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (bx_horz_, z)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (bx_diag_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (bx_diag_, z)); \
} while (0)
00090 
/* CROSS_FILT_VERT: cross-shaped kernel
 *         1/2
 *     -1   5  -1
 *         1/2
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image,
 * with the kernel origin at its centre.
 *
 *   v1, v2  receive the result as 16-bit lanes (pixels 0..7 and 8..15).
 *   ptr     address of the first pixel of the strip; must be 16-byte
 *           aligned, as must the rows at ptr - str and ptr + str.
 *   str     row stride in bytes.
 *
 * The weights 5 and 1/2 are obtained by forming 10*centre plus the two
 * vertical neighbours and shifting the sum right by one bit.  An all-zero
 * __m128i named z must be in scope where the macro is expanded.
 */
#define CROSS_FILT_VERT(v1,v2,ptr,str) do { \
    const __m128i cv_ten_   = _mm_set1_epi16 (10); \
    const __m128i cv_mid_   = _mm_load_si128  ((__m128i *)(ptr)); \
    const __m128i cv_prev_  = _mm_load_si128  ((__m128i *)((ptr) - (str))); \
    const __m128i cv_next_  = _mm_load_si128  ((__m128i *)((ptr) + (str))); \
    const __m128i cv_left_  = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \
    const __m128i cv_right_ = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \
    /* (10*centre + vertical neighbours) / 2 */ \
    v1 = _mm_mullo_epi16 (_mm_unpacklo_epi8 (cv_mid_, z), cv_ten_); \
    v2 = _mm_mullo_epi16 (_mm_unpackhi_epi8 (cv_mid_, z), cv_ten_); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (cv_prev_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (cv_prev_, z)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (cv_next_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (cv_next_, z)); \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    /* minus the two horizontal neighbours */ \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cv_left_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cv_left_, z)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cv_right_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cv_right_, z)); \
} while (0)
00131 
/* HORIZ2_FILT: horizontal two-tap sum
 *     1  1
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image.
 *
 *   v1, v2  receive the sums as 16-bit lanes (pixels 0..7 and 8..15).
 *   ptr     address of the first pixel of the strip; must be 16-byte
 *           aligned.
 *   str     row stride in bytes; accepted for symmetry with the other
 *           filters but not used.
 *   off     +1 puts the origin in the left column of the kernel, -1 puts
 *           it in the right column.
 *
 * An all-zero __m128i named z must be in scope where the macro is
 * expanded.
 */
#define HORIZ2_FILT(v1,v2,ptr,str,off) do { \
    const __m128i hz_org_ = _mm_load_si128  ((__m128i *)(ptr)); \
    const __m128i hz_nbr_ = _mm_lddqu_si128 ((__m128i *)((ptr) + (off))); \
    v1 = _mm_add_epi16 (_mm_unpacklo_epi8 (hz_org_, z), \
                        _mm_unpacklo_epi8 (hz_nbr_, z)); \
    v2 = _mm_add_epi16 (_mm_unpackhi_epi8 (hz_org_, z), \
                        _mm_unpackhi_epi8 (hz_nbr_, z)); \
} while (0)
00151 
/* VERT2_FILT: vertical two-tap sum
 *     1
 *     1
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image.
 *
 *   v1, v2  receive the sums as 16-bit lanes (pixels 0..7 and 8..15).
 *   ptr     address of the first pixel of the strip; must be 16-byte
 *           aligned, as must ptr + str.
 *   str     row stride in bytes.  A positive stride puts the kernel
 *           origin in the top row, a negative one in the bottom row.
 *
 * An all-zero __m128i named z must be in scope where the macro is
 * expanded.
 */
#define VERT2_FILT(v1,v2,ptr,str) do { \
    const __m128i vt_org_ = _mm_load_si128 ((__m128i *)(ptr)); \
    const __m128i vt_nbr_ = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    v1 = _mm_add_epi16 (_mm_unpacklo_epi8 (vt_org_, z), \
                        _mm_unpacklo_epi8 (vt_nbr_, z)); \
    v2 = _mm_add_epi16 (_mm_unpackhi_epi8 (vt_org_, z), \
                        _mm_unpackhi_epi8 (vt_nbr_, z)); \
} while (0)
00172 
/* CROSS_FILT_SYM: symmetric cross-shaped kernel
 *         -1
 *     -1   4  -1
 *         -1
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image,
 * with the kernel origin at its centre.
 *
 *   v1, v2  receive the result as 16-bit lanes (pixels 0..7 and 8..15).
 *   ptr     address of the first pixel of the strip; must be 16-byte
 *           aligned, as must the rows at ptr - str and ptr + str.
 *   str     row stride in bytes.
 *
 * An all-zero __m128i named z must be in scope where the macro is
 * expanded.
 */
#define CROSS_FILT_SYM(v1,v2,ptr,str) do { \
    const __m128i cs_mid_   = _mm_load_si128  ((__m128i *)(ptr)); \
    const __m128i cs_prev_  = _mm_load_si128  ((__m128i *)((ptr) - (str))); \
    const __m128i cs_next_  = _mm_load_si128  ((__m128i *)((ptr) + (str))); \
    const __m128i cs_left_  = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \
    const __m128i cs_right_ = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \
    /* 4*centre */ \
    v1 = _mm_slli_epi16 (_mm_unpacklo_epi8 (cs_mid_, z), 2); \
    v2 = _mm_slli_epi16 (_mm_unpackhi_epi8 (cs_mid_, z), 2); \
    /* minus the four direct neighbours */ \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cs_prev_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cs_prev_, z)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cs_next_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cs_next_, z)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cs_left_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cs_left_, z)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (cs_right_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (cs_right_, z)); \
} while (0)
00210 
/* CROSS_FILT_HORIZ: cross-shaped kernel
 *         -1
 *     1/2  5  1/2
 *         -1
 * applied to a strip of 16 horizontally adjacent pixels of an 8u image,
 * with the kernel origin at its centre.
 *
 *   v1, v2  receive the result as 16-bit lanes (pixels 0..7 and 8..15).
 *   ptr     address of the first pixel of the strip; must be 16-byte
 *           aligned, as must the rows at ptr - str and ptr + str.
 *   str     row stride in bytes.
 *
 * The weights 5 and 1/2 are obtained by forming 10*centre plus the two
 * horizontal neighbours and shifting the sum right by one bit.  An
 * all-zero __m128i named z must be in scope where the macro is expanded.
 */
#define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \
    const __m128i ch_ten_   = _mm_set1_epi16 (10); \
    const __m128i ch_mid_   = _mm_load_si128  ((__m128i *)(ptr)); \
    const __m128i ch_left_  = _mm_lddqu_si128 ((__m128i *)((ptr) - 1)); \
    const __m128i ch_right_ = _mm_lddqu_si128 ((__m128i *)((ptr) + 1)); \
    const __m128i ch_prev_  = _mm_load_si128  ((__m128i *)((ptr) - (str))); \
    const __m128i ch_next_  = _mm_load_si128  ((__m128i *)((ptr) + (str))); \
    /* (10*centre + horizontal neighbours) / 2 */ \
    v1 = _mm_mullo_epi16 (_mm_unpacklo_epi8 (ch_mid_, z), ch_ten_); \
    v2 = _mm_mullo_epi16 (_mm_unpackhi_epi8 (ch_mid_, z), ch_ten_); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (ch_left_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (ch_left_, z)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (ch_right_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (ch_right_, z)); \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    /* minus the two vertical neighbours */ \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (ch_prev_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (ch_prev_, z)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (ch_next_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (ch_next_, z)); \
} while (0)
00251 
/* INTERPOLATE_GB_ROW: interpolate one image row that holds the gb-green
 * and blue samples, for a strip of 16 plane pixels (32 image pixels).
 *
 *   kstride  row stride (bytes) handed to the filter kernels; its sign
 *            selects the vertical orientation of the one-sided kernels.
 *   off      +1 or -1; selects the horizontal orientation of the
 *            one-sided kernels.
 *
 * The macro works on variables of the expansion site:
 *   reads   gb_plane, b_plane, gr_plane, r_plane, j, sstride, z, c3
 *   scratch v1, v2, w1, w2, bg, gb, rg, rb, gg, bb, a,
 *           bgl1, bgl2, ral1, ral2, bgr1, bgr2, rar1, rar2
 *   writes  bgral1..bgral4  16 BGRA pixels for the gb-green sites
 *           bgrar1..bgrar4  16 BGRA pixels for the blue sites
 * Alpha is set to 0xff.
 */
#define INTERPOLATE_GB_ROW(kstride, off) do { \
    /* first pixel of row j in each of the four planes */ \
    uint8_t * const gb_row_ = gb_plane + j*sstride; \
    uint8_t * const b_row_  = b_plane  + j*sstride; \
    uint8_t * const gr_row_ = gr_plane + j*sstride; \
    uint8_t * const r_row_  = r_plane  + j*sstride; \
    \
    /* bg: blue at the gb-green sites */ \
    CROSS_FILT_VERT (v1, v2, gb_row_, (kstride)); \
    HORIZ2_FILT (w1, w2, b_row_, (kstride), -(off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_row_, -(kstride), -(off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    /* gb: green at the blue sites */ \
    VERT2_FILT (v1, v2, gr_row_, -(kstride)); \
    HORIZ2_FILT (w1, w2, gb_row_, (kstride), (off)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, b_row_, (kstride)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gb = _mm_packus_epi16 (v1, v2); \
    \
    /* rg: red at the gb-green sites */ \
    CROSS_FILT_HORIZ (v1, v2, gb_row_, (kstride)); \
    VERT2_FILT (w1, w2, r_row_, -(kstride)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_row_, -(kstride), -(off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* rb: red at the blue sites */ \
    CROSS_FILT_SYM (v1, v2, b_row_, (kstride)); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, r_row_, -(kstride), (off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    rb = _mm_packus_epi16 (v1, v2); \
    \
    /* gb-green sites: interpolated blue + measured green, then red + alpha */ \
    gg = _mm_load_si128 ((__m128i *)gb_row_); \
    bgl1 = _mm_unpacklo_epi8 (bg, gg); \
    bgl2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rg, a); \
    ral2 = _mm_unpackhi_epi8 (rg, a); \
    \
    /* blue sites: measured blue + interpolated green, then red + alpha */ \
    bb = _mm_load_si128 ((__m128i *)b_row_); \
    bgr1 = _mm_unpacklo_epi8 (bb, gb); \
    bgr2 = _mm_unpackhi_epi8 (bb, gb); \
    \
    rar1 = _mm_unpacklo_epi8 (rb, a); \
    rar2 = _mm_unpackhi_epi8 (rb, a); \
    \
    /* merge the B,G and R,A byte pairs into whole BGRA pixels */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00329 
/* INTERPOLATE_RG_ROW: interpolate one image row that holds the red and
 * gr-green samples, for a strip of 16 plane pixels (32 image pixels).
 *
 *   kstride  row stride (bytes) handed to the filter kernels; its sign
 *            selects the vertical orientation of the one-sided kernels.
 *   off      +1 or -1; selects the horizontal orientation of the
 *            one-sided kernels.
 *
 * The macro works on variables of the expansion site:
 *   reads   gb_plane, b_plane, gr_plane, r_plane, j, sstride, z, c3
 *   scratch v1, v2, w1, w2, br, gr, bg, rg, rr, gg, a,
 *           bgl1, bgl2, ral1, ral2, bgr1, bgr2, rar1, rar2
 *   writes  bgral1..bgral4  16 BGRA pixels for the red sites
 *           bgrar1..bgrar4  16 BGRA pixels for the gr-green sites
 * Alpha is set to 0xff.
 */
#define INTERPOLATE_RG_ROW(kstride,off) do { \
    /* first pixel of row j in each of the four planes */ \
    uint8_t * const r_row_  = r_plane  + j*sstride; \
    uint8_t * const b_row_  = b_plane  + j*sstride; \
    uint8_t * const gb_row_ = gb_plane + j*sstride; \
    uint8_t * const gr_row_ = gr_plane + j*sstride; \
    \
    /* br: blue at the red sites */ \
    CROSS_FILT_SYM (v1, v2, r_row_, (kstride)); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, b_row_, (kstride), -(off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    br = _mm_packus_epi16 (v1, v2); \
    \
    /* gr: green at the red sites */ \
    VERT2_FILT (v1, v2, gb_row_, (kstride)); \
    HORIZ2_FILT (w1, w2, gr_row_, (kstride), -(off)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, r_row_, (kstride)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gr = _mm_packus_epi16 (v1, v2); \
    \
    /* bg: blue at the gr-green sites */ \
    CROSS_FILT_HORIZ (v1, v2, gr_row_, (kstride)); \
    VERT2_FILT (w1, w2, b_row_, (kstride)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_row_, (kstride), (off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    /* rg: red at the gr-green sites */ \
    CROSS_FILT_VERT (v1, v2, gr_row_, (kstride)); \
    HORIZ2_FILT (w1, w2, r_row_, (kstride), (off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_row_, (kstride), (off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* red sites: interpolated blue + green, then measured red + alpha */ \
    bgl1 = _mm_unpacklo_epi8 (br, gr); \
    bgl2 = _mm_unpackhi_epi8 (br, gr); \
    \
    rr = _mm_load_si128 ((__m128i *)r_row_); \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rr, a); \
    ral2 = _mm_unpackhi_epi8 (rr, a); \
    \
    /* gr-green sites: interpolated blue + measured green, then red + alpha */ \
    gg = _mm_load_si128 ((__m128i *)gr_row_); \
    bgr1 = _mm_unpacklo_epi8 (bg, gg); \
    bgr2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    rar1 = _mm_unpacklo_epi8 (rg, a); \
    rar2 = _mm_unpackhi_epi8 (rg, a); \
    \
    /* merge the B,G and R,A byte pairs into whole BGRA pixels */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00407 
00408 template <class T> Image<PixRGB<T> >
00409 debayerSSE3 (const Image<T>& src1,
00410              BayerFormat format)
00411 {
00412 # ifndef INVT_USE_SSE3
00413   LFATAL("you must have sse3 support");
00414   return Image<PixRGB<T> >();
00415 #else
00416 
00417   /* make sure that the source image stride can be divied by 32 */
00418   bool isAligned32 = true;
00419   int patchWidth = 0;
00420   Image<T> src;
00421   if ((src1.getWidth() % 32) != 0)
00422     {
00423       patchWidth = 32 - (src1.getWidth() % 32);
00424       src = concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS));
00425       isAligned32 = false;
00426     }
00427   else
00428     src = src1;
00429 
00430   int width = src.getWidth();
00431   int height = src.getHeight();
00432   ASSERT(width % 2 == 0);
00433   ASSERT(height % 2 == 0);
00434   int dstride = width * 4;
00435   int sstride = width;
00436 
00437   /* ensure stride is 16-byte aligned and add 32 extra bytes for the
00438    * border padding */
00439   uint8_t *bayer_planes[4];
00440   int plane_stride = ((width + 0xf)&(~0xf)) + 32;
00441   for (int i = 0; i < 4; i++) {
00442     bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2));
00443   }
00444 
00445   // alocate a 16-byte aligned buffer for the interpolated image
00446   int bgra_stride = width*4;
00447   uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride);
00448 
00449   // allocate a 16-byte aligned buffer for the source image
00450   int bayer_stride = width;
00451   uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride);
00452 
00453   // copy the source image into the 16-byte aligned buffer
00454   copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride,
00455                              bayer_img, bayer_stride,
00456                              0, 0, 0, 0, width, height, 8);
00457 
00458   // split the bayer image
00459   uint8_t * planes[4] = {
00460     bayer_planes[0] + plane_stride + 16,
00461     bayer_planes[1] + plane_stride + 16,
00462     bayer_planes[2] + plane_stride + 16,
00463     bayer_planes[3] + plane_stride + 16,
00464   };
00465   int p_width = width / 2;
00466   int p_height = height / 2;
00467 
00468   splitBayerPlanes_8u (planes, plane_stride,
00469                                    bayer_img, bayer_stride, p_width, p_height);
00470   for (int j = 0; j < 4; j++)
00471     replicateBorder_8u (planes[j], plane_stride, p_width, p_height);
00472 
00473 
00474   if( bayerInterpolateTo_8u_bgra_sse3 (planes,plane_stride,
00475                                        bgra_img, bgra_stride,
00476                                        width, height, format) < 0)
00477     LFATAL("error in debayer with sse3");
00478   // copy to destination
00479   uint8_t * dest = (uint8_t*)memalign(16, dstride*height);
00480   copy_8u_generic (bgra_img, bgra_stride,
00481                              dest, dstride, 0, 0, 0, 0, width, height, 8 * 4);
00482 
00483   Image<PixRGB<T> > res(width, height, NO_INIT);
00484   typename Image<PixRGB<T> >::iterator dptr = res.beginw();
00485   T* sptr = (T*)dest;
00486 
00487   for(int y =0; y < height; y++)
00488     {
00489       for(int x =0; x < width; x++)
00490         {
00491           dptr[0].p[2] = *sptr++;
00492           dptr[0].p[1] = *sptr++;
00493           dptr[0].p[0] = *sptr++;
00494           dptr++;
00495           sptr++; // for the A channel
00496         }
00497     }
00498 
00499   for (int i=0; i<4; i++) {
00500     free (bayer_planes[i]);
00501   }
00502   free(dest);
00503   free(bayer_img);
00504   free (bgra_img);
00505 
00506   if(!isAligned32)
00507     res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height));
00508   return res;
00509 #endif //INVT_USE_SSE3
00510 }
00511 
00512 int
00513 bayerInterpolateTo_8u_bgra_sse3 (uint8_t ** src, int sstride,
00514                                  uint8_t * dst, int dstride, int width, int height,
00515                                  BayerFormat format)
00516 {
00517 # ifndef INVT_USE_SSE3
00518   LFATAL("you must have sse3 support");
00519   return -1;
00520 #else
00521   int i, j;
00522   for (i = 0; i < 4; i++) {
00523     if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) {
00524       LERROR("%s: src[%d] is not 16-byte aligned\n",
00525                __FUNCTION__, i);
00526       return -1;
00527     }
00528   }
00529   if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) {
00530     LERROR("%s: dst is not 16-byte aligned or 128-byte stride "
00531              "aligned\n", __FUNCTION__);
00532     return -1;
00533   }
00534 
00535     __m128i z = _mm_set1_epi32 (0);
00536     __m128i c3 = _mm_set1_epi16 (3);
00537     __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr;
00538     __m128i bgl1, bgl2, ral1, ral2;
00539     __m128i bgr1, bgr2, rar1, rar2;
00540     __m128i bgral1, bgral2, bgral3, bgral4;
00541     __m128i bgrar1, bgrar2, bgrar3, bgrar4;
00542     __m128i v1, v2, w1, w2;
00543 
00544     if (format ==  BAYER_GBRG ||
00545             format ==  BAYER_RGGB) {
00546         int drow_offset1 = 0;
00547         int drow_offset2 = dstride;
00548         int kernel_stride = sstride;
00549         uint8_t * gb_plane = src[0];
00550         uint8_t * b_plane = src[1];
00551         uint8_t * r_plane = src[2];
00552         uint8_t * gr_plane = src[3];
00553         if (format ==  BAYER_RGGB) {
00554             drow_offset1 = dstride;
00555             drow_offset2 = 0;
00556             kernel_stride = -sstride;
00557             r_plane = src[0];
00558             gr_plane = src[1];
00559             gb_plane = src[2];
00560             b_plane = src[3];
00561         }
00562 
00563         for (i = 0; i < width/2; i += 16) {
00564             uint8_t * dcol = dst + i*8;
00565 
00566             for (j = 0; j < height/2; j++) {
00567                 INTERPOLATE_GB_ROW (kernel_stride, 1);
00568 
00569                 uint8_t * drow = dcol + j*2*dstride + drow_offset1;
00570                 _mm_store_si128 ((__m128i *)drow,
00571                         _mm_unpacklo_epi32 (bgral1, bgrar1));
00572                 _mm_store_si128 ((__m128i *)(drow+16),
00573                         _mm_unpackhi_epi32 (bgral1, bgrar1));
00574                 _mm_store_si128 ((__m128i *)(drow+32),
00575                         _mm_unpacklo_epi32 (bgral2, bgrar2));
00576                 _mm_store_si128 ((__m128i *)(drow+48),
00577                         _mm_unpackhi_epi32 (bgral2, bgrar2));
00578                 _mm_store_si128 ((__m128i *)(drow+64),
00579                         _mm_unpacklo_epi32 (bgral3, bgrar3));
00580                 _mm_store_si128 ((__m128i *)(drow+80),
00581                         _mm_unpackhi_epi32 (bgral3, bgrar3));
00582                 _mm_store_si128 ((__m128i *)(drow+96),
00583                         _mm_unpacklo_epi32 (bgral4, bgrar4));
00584                 _mm_store_si128 ((__m128i *)(drow+112),
00585                         _mm_unpackhi_epi32 (bgral4, bgrar4));
00586 
00587                 INTERPOLATE_RG_ROW (kernel_stride, 1);
00588 
00589                 drow = dcol + j*2*dstride + drow_offset2;
00590                 _mm_store_si128 ((__m128i *)drow,
00591                         _mm_unpacklo_epi32 (bgral1, bgrar1));
00592                 _mm_store_si128 ((__m128i *)(drow+16),
00593                         _mm_unpackhi_epi32 (bgral1, bgrar1));
00594                 _mm_store_si128 ((__m128i *)(drow+32),
00595                         _mm_unpacklo_epi32 (bgral2, bgrar2));
00596                 _mm_store_si128 ((__m128i *)(drow+48),
00597                         _mm_unpackhi_epi32 (bgral2, bgrar2));
00598                 _mm_store_si128 ((__m128i *)(drow+64),
00599                         _mm_unpacklo_epi32 (bgral3, bgrar3));
00600                 _mm_store_si128 ((__m128i *)(drow+80),
00601                         _mm_unpackhi_epi32 (bgral3, bgrar3));
00602                 _mm_store_si128 ((__m128i *)(drow+96),
00603                         _mm_unpacklo_epi32 (bgral4, bgrar4));
00604                 _mm_store_si128 ((__m128i *)(drow+112),
00605                         _mm_unpackhi_epi32 (bgral4, bgrar4));
00606 
00607             }
00608             gb_plane += 16;
00609             b_plane += 16;
00610             r_plane += 16;
00611             gr_plane += 16;
00612         }
00613     }
00614     else {
00615         int drow_offset1 = 0;
00616         int drow_offset2 = dstride;
00617         int kernel_stride = sstride;
00618         uint8_t * b_plane = src[0];
00619         uint8_t * gb_plane = src[1];
00620         uint8_t * gr_plane = src[2];
00621         uint8_t * r_plane = src[3];
00622         if (format ==  BAYER_GRBG) {
00623             drow_offset1 = dstride;
00624             drow_offset2 = 0;
00625             kernel_stride = -sstride;
00626             gr_plane = src[0];
00627             r_plane = src[1];
00628             b_plane = src[2];
00629             gb_plane = src[3];
00630         }
00631 
00632         for (i = 0; i < width/2; i += 16) {
00633             uint8_t * dcol = dst + i*8;
00634 
00635             for (j = 0; j < height/2; j++) {
00636                 INTERPOLATE_GB_ROW (kernel_stride, -1);
00637 
00638                 uint8_t * drow = dcol + j*2*dstride + drow_offset1;
00639                 _mm_store_si128 ((__m128i *)drow,
00640                         _mm_unpacklo_epi32 (bgrar1, bgral1));
00641                 _mm_store_si128 ((__m128i *)(drow+16),
00642                         _mm_unpackhi_epi32 (bgrar1, bgral1));
00643                 _mm_store_si128 ((__m128i *)(drow+32),
00644                         _mm_unpacklo_epi32 (bgrar2, bgral2));
00645                 _mm_store_si128 ((__m128i *)(drow+48),
00646                         _mm_unpackhi_epi32 (bgrar2, bgral2));
00647                 _mm_store_si128 ((__m128i *)(drow+64),
00648                         _mm_unpacklo_epi32 (bgrar3, bgral3));
00649                 _mm_store_si128 ((__m128i *)(drow+80),
00650                         _mm_unpackhi_epi32 (bgrar3, bgral3));
00651                 _mm_store_si128 ((__m128i *)(drow+96),
00652                         _mm_unpacklo_epi32 (bgrar4, bgral4));
00653                 _mm_store_si128 ((__m128i *)(drow+112),
00654                         _mm_unpackhi_epi32 (bgrar4, bgral4));
00655 
00656                 INTERPOLATE_RG_ROW (kernel_stride, -1);
00657 
00658                 drow = dcol + j*2*dstride + drow_offset2;
00659                 _mm_store_si128 ((__m128i *)drow,
00660                         _mm_unpacklo_epi32 (bgrar1, bgral1));
00661                 _mm_store_si128 ((__m128i *)(drow+16),
00662                         _mm_unpackhi_epi32 (bgrar1, bgral1));
00663                 _mm_store_si128 ((__m128i *)(drow+32),
00664                         _mm_unpacklo_epi32 (bgrar2, bgral2));
00665                 _mm_store_si128 ((__m128i *)(drow+48),
00666                         _mm_unpackhi_epi32 (bgrar2, bgral2));
00667                 _mm_store_si128 ((__m128i *)(drow+64),
00668                         _mm_unpacklo_epi32 (bgrar3, bgral3));
00669                 _mm_store_si128 ((__m128i *)(drow+80),
00670                         _mm_unpackhi_epi32 (bgrar3, bgral3));
00671                 _mm_store_si128 ((__m128i *)(drow+96),
00672                         _mm_unpacklo_epi32 (bgrar4, bgral4));
00673                 _mm_store_si128 ((__m128i *)(drow+112),
00674                         _mm_unpackhi_epi32 (bgrar4, bgral4));
00675 
00676             }
00677             gb_plane += 16;
00678             b_plane += 16;
00679             r_plane += 16;
00680             gr_plane += 16;
00681         }
00682     }
00683     return 0;
00684 #endif
00685 }
00686 
00687 template Image<PixRGB<byte> >  debayerSSE3(const Image<byte>& src, BayerFormat format);
00688 template Image<PixRGB<uint16> >  debayerSSE3(const Image<uint16>& src, BayerFormat format);
00689 // ######################################################################
00690 /* So things look consistent in everyone's emacs... */
00691 /* Local Variables: */
00692 /* indent-tabs-mode: nil */
00693 /* End: */