DeBayerSSE2.C

00001 /*!@file Raster/DebayerSSE2.C is the debayer class with sse2 */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the //
00005 // University of Southern California (USC) and the iLab at USC.         //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Zhicheng Li <zhicheng@usc.edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Raster/DeBayerSSE2.C $
00035 // $Id: DeBayerSSE2.C 10794 2009-02-08 06:21:09Z itti $
00036 //
00037 #include <stdio.h>
00038 #include <stdint.h>
00039 #include <emmintrin.h>
00040 #include <stdlib.h>
00041 
00042 // on some platforms, memalign is defined in <malloc.h>, but that file
00043 // does not exist on Darwin. On Darwin, including stdlib.h is sufficient.
00044 // Let's here also include malloc.h unless we are on Darwin:
00045 #ifndef MACHINE_OS_DARWIN
00046 #include <malloc.h>
00047 #endif
00048 
00049 #include "Image/Image.H"
00050 #include "Image/CutPaste.H"
00051 #include "Raster/DeBayerSSE2.H"
00052 
00053 using namespace std;
00054 
00055 // ########################  debayer with SSE2 accelerate   ##############//
00056 // #######################################################################//
/* BOX_FILT evaluates this kernel:
 *     1  1
 *     1  1
 * for a 1x16 strip of pixels of an 8u image.  v1 and v2 receive the result
 * (stored as 16s): v1 holds pixels 0..7 of the strip, v2 holds pixels 8..15.
 * ptr points to the first pixel of the strip, and must be 16-byte aligned.
 * str is the stride of image rows in bytes; (ptr) + (str) is read with an
 * aligned load, so it must be 16-byte aligned as well.  If str is positive,
 * the origin of the kernel is in the top row, if negative, the origin is in
 * the bottom row.  off is 1 to put the origin in the left column of the
 * kernel, or -1 to put the origin in the right column.
 *
 * The macro relies on an all-zero __m128i named z being in scope at the
 * point of expansion (it is the high half used to widen 8u to 16s).
 * Every argument is parenthesized where it is used, so expressions such as
 * "-kstride" or "cond ? -1 : 1" can safely be passed for str and off.
 */
#define BOX_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    /* origin pixel */ \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    /* vertical neighbour */ \
    t1 = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    /* horizontal neighbour: one byte off, hence the unaligned load */ \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
    /* diagonal neighbour */ \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (str) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00089 
/* CROSS_FILT_VERT evaluates this kernel:
 *         1/2
 *     -1   5  -1
 *         1/2
 * for a 1x16 strip of pixels of an 8u image.  The result is written to v1
 * (pixels 0..7) and v2 (pixels 8..15) as 16s values.  ptr addresses the
 * first pixel of the strip and must be 16-byte aligned; str is the row
 * stride in bytes, and the rows at (ptr) - (str) and (ptr) + (str) are read
 * with aligned loads.  The kernel origin is its center.
 *
 * An all-zero __m128i named z must be in scope where the macro is expanded.
 */
#define CROSS_FILT_VERT(v1,v2,ptr,str) do { \
    const __m128i k10_ = _mm_set1_epi16 (10); \
    __m128i row_; \
    /* center tap carries weight 10 here; the halving below makes it 5 */ \
    row_ = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_mullo_epi16 (_mm_unpacklo_epi8 (row_, z), k10_); \
    v2 = _mm_mullo_epi16 (_mm_unpackhi_epi8 (row_, z), k10_); \
    /* row on the -str side, weight 1 (1/2 after halving) */ \
    row_ = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* row on the +str side, weight 1 (1/2 after halving) */ \
    row_ = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* halve the accumulated sum */ \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    /* left neighbour, weight -1 (saturating subtract) */ \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* right neighbour, weight -1 (saturating subtract) */ \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
} while (0)
00130 
/* HORIZ2_FILT evaluates this kernel:
 *     1  1
 * for a 1x16 strip of pixels of an 8u image.  v1 and v2 receive the result
 * (stored as 16s): v1 holds pixels 0..7 of the strip, v2 holds pixels 8..15.
 * ptr points to the first pixel of the strip, and must be 16-byte aligned.
 * str is the stride of image rows in bytes; it is accepted only so that the
 * signature parallels BOX_FILT and is not used.  off is 1 to put the origin
 * in the left column of the kernel, or -1 to put the origin in the right
 * column.
 *
 * The macro relies on an all-zero __m128i named z being in scope at the
 * point of expansion.  off is parenthesized where it is used, so an
 * expression such as "-x" or "cond ? -1 : 1" can safely be passed.
 */
#define HORIZ2_FILT(v1,v2,ptr,str,off) do { \
    __m128i t1, t2, t3; \
    /* origin pixel */ \
    t1 = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (t1, z); \
    v2 = _mm_unpackhi_epi8 (t1, z); \
    /* horizontal neighbour: one byte off, hence the unaligned load */ \
    t1 = _mm_loadu_si128 ((__m128i *)((ptr) + (off))); \
    t2 = _mm_unpacklo_epi8 (t1, z); \
    t3 = _mm_unpackhi_epi8 (t1, z); \
    v1 = _mm_add_epi16 (v1, t2); \
    v2 = _mm_add_epi16 (v2, t3); \
} while (0)
00150 
/* VERT2_FILT evaluates this kernel:
 *     1
 *     1
 * for a 1x16 strip of pixels of an 8u image.  The result is written to v1
 * (pixels 0..7) and v2 (pixels 8..15) as 16s values.  ptr addresses the
 * first pixel of the strip and must be 16-byte aligned; str is the row
 * stride in bytes and (ptr) + (str) is read with an aligned load.  With a
 * positive stride the kernel origin is the top row, with a negative stride
 * it is the bottom row.
 *
 * An all-zero __m128i named z must be in scope where the macro is expanded.
 */
#define VERT2_FILT(v1,v2,ptr,str) do { \
    __m128i row_; \
    /* origin row */ \
    row_ = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_unpacklo_epi8 (row_, z); \
    v2 = _mm_unpackhi_epi8 (row_, z); \
    /* neighbouring row, one stride away */ \
    row_ = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
} while (0)
00171 
/* CROSS_FILT_SYM evaluates this kernel:
 *         -1
 *     -1   4  -1
 *         -1
 * for a 1x16 strip of pixels of an 8u image.  The result is written to v1
 * (pixels 0..7) and v2 (pixels 8..15) as 16s values.  ptr addresses the
 * first pixel of the strip and must be 16-byte aligned; str is the row
 * stride in bytes, and the rows at (ptr) - (str) and (ptr) + (str) are read
 * with aligned loads.  The kernel origin is its center.
 *
 * An all-zero __m128i named z must be in scope where the macro is expanded.
 */
#define CROSS_FILT_SYM(v1,v2,ptr,str) do { \
    __m128i row_; \
    /* center tap: widen to 16 bits and multiply by 4 via a left shift */ \
    row_ = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_slli_epi16 (_mm_unpacklo_epi8 (row_, z), 2); \
    v2 = _mm_slli_epi16 (_mm_unpackhi_epi8 (row_, z), 2); \
    /* the four neighbours are each removed with a saturating subtract */ \
    row_ = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    row_ = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* left / right neighbours are one byte off: unaligned loads */ \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
} while (0)
00209 
/* CROSS_FILT_HORIZ evaluates this kernel:
 *         -1
 *     1/2  5  1/2
 *         -1
 * for a 1x16 strip of pixels of an 8u image.  The result is written to v1
 * (pixels 0..7) and v2 (pixels 8..15) as 16s values.  ptr addresses the
 * first pixel of the strip and must be 16-byte aligned; str is the row
 * stride in bytes, and the rows at (ptr) - (str) and (ptr) + (str) are read
 * with aligned loads.  The kernel origin is its center.
 *
 * An all-zero __m128i named z must be in scope where the macro is expanded.
 */
#define CROSS_FILT_HORIZ(v1,v2,ptr,str) do { \
    const __m128i k10_ = _mm_set1_epi16 (10); \
    __m128i row_; \
    /* center tap carries weight 10 here; the halving below makes it 5 */ \
    row_ = _mm_load_si128 ((__m128i *)(ptr)); \
    v1 = _mm_mullo_epi16 (_mm_unpacklo_epi8 (row_, z), k10_); \
    v2 = _mm_mullo_epi16 (_mm_unpackhi_epi8 (row_, z), k10_); \
    /* left neighbour, weight 1 (1/2 after halving) */ \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) - 1)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* right neighbour, weight 1 (1/2 after halving) */ \
    row_ = _mm_loadu_si128 ((__m128i *)((ptr) + 1)); \
    v1 = _mm_add_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_add_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    /* halve the accumulated sum */ \
    v1 = _mm_srli_epi16 (v1, 1); \
    v2 = _mm_srli_epi16 (v2, 1); \
    /* rows one stride away on either side, weight -1 (saturating) */ \
    row_ = _mm_load_si128 ((__m128i *)((ptr) - (str))); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
    row_ = _mm_load_si128 ((__m128i *)((ptr) + (str))); \
    v1 = _mm_subs_epi16 (v1, _mm_unpacklo_epi8 (row_, z)); \
    v2 = _mm_subs_epi16 (v2, _mm_unpackhi_epi8 (row_, z)); \
} while (0)
00250 
/* INTERPOLATE_GB_ROW builds the BGRA pixels for one 16-sample strip of the
 * output row whose measured samples come from gb_plane and b_plane.
 *
 * kstride is the (signed) plane row stride handed to the filter macros and
 * off is +1 or -1, selecting on which side the horizontal neighbours lie.
 * Both are parenthesized before being negated, so compound expressions may
 * be passed.
 *
 * The macro is not self-contained.  It reads these names from the expansion
 * site: gb_plane, b_plane, r_plane, gr_plane (uint8_t *), j (row index),
 * sstride (plane stride in bytes), z (all-zero register, used by the filter
 * macros) and c3 (register holding 3 in every 16-bit lane).  It writes to
 * v1, v2, w1, w2 (scratch), bg, gb, rg, rb, gg, bb, a, bgl1/2, ral1/2,
 * bgr1/2, rar1/2 and leaves its results in bgral1..4 and bgrar1..4.
 *
 * Each of bgral1..4 / bgrar1..4 holds four byte-interleaved B,G,R,A pixels
 * (A is 0xff): the bgral set is built around the samples loaded from
 * gb_plane, the bgrar set around the samples loaded from b_plane.
 */
#define INTERPOLATE_GB_ROW(kstride, off) do { \
    /* bg: value paired with the gb_plane samples in the B byte */ \
    CROSS_FILT_VERT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, b_plane + j*sstride, kstride, -(off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -(kstride), -(off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    /* gb: value paired with the b_plane samples in the G byte */ \
    VERT2_FILT (v1, v2, gr_plane + j*sstride, -(kstride)); \
    HORIZ2_FILT (w1, w2, gb_plane + j*sstride, kstride, (off)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, b_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gb = _mm_packus_epi16 (v1, v2); \
    \
    /* rg: value paired with the gb_plane samples in the R byte */ \
    CROSS_FILT_HORIZ (v1, v2, gb_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, r_plane + j*sstride, -(kstride)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gr_plane + j*sstride, -(kstride), -(off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* rb: value paired with the b_plane samples in the R byte */ \
    CROSS_FILT_SYM (v1, v2, b_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, r_plane + j*sstride, -(kstride), (off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    rb = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave bytes into (B,G) and (R,A) pairs ... */ \
    gg = _mm_load_si128 ((__m128i *)(gb_plane + j*sstride)); \
    bgl1 = _mm_unpacklo_epi8 (bg, gg); \
    bgl2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rg, a); \
    ral2 = _mm_unpackhi_epi8 (rg, a); \
    \
    bb = _mm_load_si128 ((__m128i *)(b_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bb, gb); \
    bgr2 = _mm_unpackhi_epi8 (bb, gb); \
    \
    rar1 = _mm_unpacklo_epi8 (rb, a); \
    rar2 = _mm_unpackhi_epi8 (rb, a); \
    \
    /* ... then the pairs into complete B,G,R,A pixels */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00328 
/* INTERPOLATE_RG_ROW builds the BGRA pixels for one 16-sample strip of the
 * output row whose measured samples come from r_plane and gr_plane.
 *
 * kstride is the (signed) plane row stride handed to the filter macros and
 * off is +1 or -1, selecting on which side the horizontal neighbours lie.
 * off is parenthesized before being negated, so a compound expression may
 * be passed.
 *
 * The macro is not self-contained.  It reads these names from the expansion
 * site: gb_plane, b_plane, r_plane, gr_plane (uint8_t *), j (row index),
 * sstride (plane stride in bytes), z (all-zero register, used by the filter
 * macros) and c3 (register holding 3 in every 16-bit lane).  It writes to
 * v1, v2, w1, w2 (scratch), br, gr, bg, rg, rr, gg, a, bgl1/2, ral1/2,
 * bgr1/2, rar1/2 and leaves its results in bgral1..4 and bgrar1..4.
 *
 * Each of bgral1..4 / bgrar1..4 holds four byte-interleaved B,G,R,A pixels
 * (A is 0xff): the bgral set is built around the samples loaded from
 * r_plane, the bgrar set around the samples loaded from gr_plane.
 */
#define INTERPOLATE_RG_ROW(kstride,off) do { \
    /* br: value paired with the r_plane samples in the B byte */ \
    CROSS_FILT_SYM (v1, v2, r_plane + j*sstride, kstride); \
    v1 = _mm_mullo_epi16 (v1, c3); \
    v2 = _mm_mullo_epi16 (v2, c3); \
    BOX_FILT (w1, w2, b_plane + j*sstride, kstride, -(off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 4); \
    v2 = _mm_srai_epi16 (v2, 4); \
    br = _mm_packus_epi16 (v1, v2); \
    \
    /* gr: value paired with the r_plane samples in the G byte */ \
    VERT2_FILT (v1, v2, gb_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, gr_plane + j*sstride, kstride, -(off)); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_slli_epi16 (v1, 1); \
    v2 = _mm_slli_epi16 (v2, 1); \
    CROSS_FILT_SYM (w1, w2, r_plane + j*sstride, kstride); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    gr = _mm_packus_epi16 (v1, v2); \
    \
    /* bg: value paired with the gr_plane samples in the B byte */ \
    CROSS_FILT_HORIZ (v1, v2, gr_plane + j*sstride, kstride); \
    VERT2_FILT (w1, w2, b_plane + j*sstride, kstride); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, (off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    bg = _mm_packus_epi16 (v1, v2); \
    \
    /* rg: value paired with the gr_plane samples in the R byte */ \
    CROSS_FILT_VERT (v1, v2, gr_plane + j*sstride, kstride); \
    HORIZ2_FILT (w1, w2, r_plane + j*sstride, kstride, (off)); \
    w1 = _mm_slli_epi16 (w1, 2); \
    w2 = _mm_slli_epi16 (w2, 2); \
    v1 = _mm_add_epi16 (v1, w1); \
    v2 = _mm_add_epi16 (v2, w2); \
    BOX_FILT (w1, w2, gb_plane + j*sstride, kstride, (off)); \
    v1 = _mm_subs_epi16 (v1, w1); \
    v2 = _mm_subs_epi16 (v2, w2); \
    v1 = _mm_srai_epi16 (v1, 3); \
    v2 = _mm_srai_epi16 (v2, 3); \
    rg = _mm_packus_epi16 (v1, v2); \
    \
    /* interleave bytes into (B,G) and (R,A) pairs ... */ \
    bgl1 = _mm_unpacklo_epi8 (br, gr); \
    bgl2 = _mm_unpackhi_epi8 (br, gr); \
    \
    rr = _mm_load_si128 ((__m128i *)(r_plane + j*sstride)); \
    a = _mm_set1_epi8 (0xff); \
    ral1 = _mm_unpacklo_epi8 (rr, a); \
    ral2 = _mm_unpackhi_epi8 (rr, a); \
    \
    gg = _mm_load_si128 ((__m128i *)(gr_plane + j*sstride)); \
    bgr1 = _mm_unpacklo_epi8 (bg, gg); \
    bgr2 = _mm_unpackhi_epi8 (bg, gg); \
    \
    rar1 = _mm_unpacklo_epi8 (rg, a); \
    rar2 = _mm_unpackhi_epi8 (rg, a); \
    \
    /* ... then the pairs into complete B,G,R,A pixels */ \
    bgral1 = _mm_unpacklo_epi16 (bgl1, ral1); \
    bgral2 = _mm_unpackhi_epi16 (bgl1, ral1); \
    bgral3 = _mm_unpacklo_epi16 (bgl2, ral2); \
    bgral4 = _mm_unpackhi_epi16 (bgl2, ral2); \
    \
    bgrar1 = _mm_unpacklo_epi16 (bgr1, rar1); \
    bgrar2 = _mm_unpackhi_epi16 (bgr1, rar1); \
    bgrar3 = _mm_unpacklo_epi16 (bgr2, rar2); \
    bgrar4 = _mm_unpackhi_epi16 (bgr2, rar2); \
} while (0)
00406 
00407 template <class T> Image<PixRGB<T> >
00408 debayerSSE2 (const Image<T>& src1, BayerFormat format)
00409 {
00410 #ifndef INVT_USE_SSEDB
00411   LFATAL("you must have SSE2 support");
00412   return Image<PixRGB<T> >();
00413 #else
00414   //! althogh we use a template here but actuaaly we only support the 8 bit depth  mode
00415 
00416   /* make sure that the source image stride can be divied by 32 */
00417   bool isAligned32 = true;
00418   int patchWidth = 0;
00419   Image<T> src;
00420   if ((src1.getWidth() % 32) != 0)
00421     {
00422       patchWidth = 32 - (src1.getWidth() % 16);
00423       src = concatX(src1, Image<T>(patchWidth, src1.getHeight(), ZEROS));
00424       isAligned32 = false;
00425     }
00426   else
00427     src = src1;
00428 
00429   int width = src.getWidth();
00430   int height = src.getHeight();
00431   ASSERT(width % 2 == 0);
00432   ASSERT(height % 2 == 0);
00433   int dstride = width * 4;
00434   int sstride = width;
00435 
00436   /* ensure stride is 16-byte aligned and add 32 extra bytes for the
00437    * border padding */
00438   uint8_t *bayer_planes[4];
00439   int plane_stride = ((width + 0xf)&(~0xf)) + 32;
00440   for (int i = 0; i < 4; i++) {
00441     bayer_planes[i] = (uint8_t*)memalign(16,plane_stride * (height + 2));
00442   }
00443 
00444   // alocate a 16-byte aligned buffer for the interpolated image
00445   int bgra_stride = width*4;
00446   uint8_t *bgra_img = (uint8_t*)memalign(16,height * bgra_stride);
00447 
00448   // allocate a 16-byte aligned buffer for the source image
00449   int bayer_stride = width;
00450   uint8_t *bayer_img = (uint8_t*) memalign(16,height * bayer_stride);
00451 
00452   // copy the source image into the 16-byte aligned buffer
00453   copy_8u_generic ((uint8_t*)src.getArrayPtr(), sstride,
00454                              bayer_img, bayer_stride,
00455                              0, 0, 0, 0, width, height, 8);
00456 
00457   // split the bayer image
00458   uint8_t * planes[4] = {
00459     bayer_planes[0] + plane_stride + 16,
00460     bayer_planes[1] + plane_stride + 16,
00461     bayer_planes[2] + plane_stride + 16,
00462     bayer_planes[3] + plane_stride + 16,
00463   };
00464   int p_width = width / 2;
00465   int p_height = height / 2;
00466 
00467   splitBayerPlanes_8u (planes, plane_stride,
00468                                    bayer_img, bayer_stride, p_width, p_height);
00469   for (int j = 0; j < 4; j++)
00470     replicateBorder_8u (planes[j], plane_stride, p_width, p_height);
00471 
00472 
00473   if(bayerInterpolateTo_8u_bgra_sse2 (planes,plane_stride,
00474                                    bgra_img, bgra_stride,
00475                                       width, height, format) < 0)
00476     LFATAL("error in debayer with sse2");
00477 
00478   // copy to destination
00479   uint8_t * dest = (uint8_t*)memalign(16, dstride*height);
00480   copy_8u_generic (bgra_img, bgra_stride,
00481                              dest, dstride, 0, 0, 0, 0, width, height, 8 * 4);
00482 
00483   Image<PixRGB<T> > res(width, height, NO_INIT);
00484   typename Image<PixRGB<T> >::iterator dptr = res.beginw();
00485   T* sptr = (T*)dest;
00486 
00487   for(int y =0; y < height; y++)
00488     {
00489       for(int x =0; x < width; x++)
00490         {
00491           dptr[0].p[2] = *sptr++;
00492           dptr[0].p[1] = *sptr++;
00493           dptr[0].p[0] = *sptr++;
00494           dptr++;
00495           sptr++; // for the A channel
00496         }
00497     }
00498 
00499   for (int i=0; i<4; i++) {
00500     free (bayer_planes[i]);
00501   }
00502   free(dest);
00503   free(bayer_img);
00504   free (bgra_img);
00505 
00506   if(!isAligned32)
00507     res = crop(res, Point2D<int>(0,0), Dims(width-patchWidth, height));
00508 
00509   return res;
00510 #endif //INVT_USE_SSEDB
00511 }
00512 
00513 int
00514 bayerInterpolateTo_8u_bgra_sse2 (uint8_t ** src, int sstride,
00515                                  uint8_t * dst, int dstride, int width, int height,
00516                                  BayerFormat format)
00517 {
00518 # ifndef INVT_USE_SSE3
00519   LFATAL("you must have sse3 support");
00520   return -1;
00521 #else
00522     int i, j;
00523     for (i = 0; i < 4; i++) {
00524         if (!IS_ALIGNED16(src[i]) || !IS_ALIGNED16(sstride)) {
00525           LERROR("%s: src[%d] is not 16-byte aligned", __FUNCTION__, i);
00526             return -1;
00527         }
00528     }
00529     if (!IS_ALIGNED16(dst) || !IS_ALIGNED128(dstride)) {
00530       LERROR("%s: dst is not 16-byte aligned or 128-byte stride aligned", __FUNCTION__);
00531         return -1;
00532     }
00533 
00534     __m128i z = _mm_set1_epi32 (0);
00535     __m128i c3 = _mm_set1_epi16 (3);
00536     __m128i bg, gb, rg, rb, gg, a, bb, br, gr, rr;
00537     __m128i bgl1, bgl2, ral1, ral2;
00538     __m128i bgr1, bgr2, rar1, rar2;
00539     __m128i bgral1, bgral2, bgral3, bgral4;
00540     __m128i bgrar1, bgrar2, bgrar3, bgrar4;
00541     __m128i v1, v2, w1, w2;
00542 
00543     if (format == BAYER_GBRG ||
00544             format == BAYER_RGGB) {
00545         int drow_offset1 = 0;
00546         int drow_offset2 = dstride;
00547         int kernel_stride = sstride;
00548         uint8_t * gb_plane = src[0];
00549         uint8_t * b_plane = src[1];
00550         uint8_t * r_plane = src[2];
00551         uint8_t * gr_plane = src[3];
00552         if (format == BAYER_RGGB) {
00553             drow_offset1 = dstride;
00554             drow_offset2 = 0;
00555             kernel_stride = -sstride;
00556             r_plane = src[0];
00557             gr_plane = src[1];
00558             gb_plane = src[2];
00559             b_plane = src[3];
00560         }
00561 
00562         for (i = 0; i < width/2; i += 16) {
00563             uint8_t * dcol = dst + i*8;
00564 
00565             for (j = 0; j < height/2; j++) {
00566                 INTERPOLATE_GB_ROW (kernel_stride, 1);
00567 
00568                 uint8_t * drow = dcol + j*2*dstride + drow_offset1;
00569                 _mm_store_si128 ((__m128i *)drow,
00570                         _mm_unpacklo_epi32 (bgral1, bgrar1));
00571                 _mm_store_si128 ((__m128i *)(drow+16),
00572                         _mm_unpackhi_epi32 (bgral1, bgrar1));
00573                 _mm_store_si128 ((__m128i *)(drow+32),
00574                         _mm_unpacklo_epi32 (bgral2, bgrar2));
00575                 _mm_store_si128 ((__m128i *)(drow+48),
00576                         _mm_unpackhi_epi32 (bgral2, bgrar2));
00577                 _mm_store_si128 ((__m128i *)(drow+64),
00578                         _mm_unpacklo_epi32 (bgral3, bgrar3));
00579                 _mm_store_si128 ((__m128i *)(drow+80),
00580                         _mm_unpackhi_epi32 (bgral3, bgrar3));
00581                 _mm_store_si128 ((__m128i *)(drow+96),
00582                         _mm_unpacklo_epi32 (bgral4, bgrar4));
00583                 _mm_store_si128 ((__m128i *)(drow+112),
00584                         _mm_unpackhi_epi32 (bgral4, bgrar4));
00585 
00586                 INTERPOLATE_RG_ROW (kernel_stride, 1);
00587 
00588                 drow = dcol + j*2*dstride + drow_offset2;
00589                 _mm_store_si128 ((__m128i *)drow,
00590                         _mm_unpacklo_epi32 (bgral1, bgrar1));
00591                 _mm_store_si128 ((__m128i *)(drow+16),
00592                         _mm_unpackhi_epi32 (bgral1, bgrar1));
00593                 _mm_store_si128 ((__m128i *)(drow+32),
00594                         _mm_unpacklo_epi32 (bgral2, bgrar2));
00595                 _mm_store_si128 ((__m128i *)(drow+48),
00596                         _mm_unpackhi_epi32 (bgral2, bgrar2));
00597                 _mm_store_si128 ((__m128i *)(drow+64),
00598                         _mm_unpacklo_epi32 (bgral3, bgrar3));
00599                 _mm_store_si128 ((__m128i *)(drow+80),
00600                         _mm_unpackhi_epi32 (bgral3, bgrar3));
00601                 _mm_store_si128 ((__m128i *)(drow+96),
00602                         _mm_unpacklo_epi32 (bgral4, bgrar4));
00603                 _mm_store_si128 ((__m128i *)(drow+112),
00604                         _mm_unpackhi_epi32 (bgral4, bgrar4));
00605 
00606             }
00607             gb_plane += 16;
00608             b_plane += 16;
00609             r_plane += 16;
00610             gr_plane += 16;
00611         }
00612     }
00613     else {
00614         int drow_offset1 = 0;
00615         int drow_offset2 = dstride;
00616         int kernel_stride = sstride;
00617         uint8_t * b_plane = src[0];
00618         uint8_t * gb_plane = src[1];
00619         uint8_t * gr_plane = src[2];
00620         uint8_t * r_plane = src[3];
00621         if (format == BAYER_GRBG) {
00622             drow_offset1 = dstride;
00623             drow_offset2 = 0;
00624             kernel_stride = -sstride;
00625             gr_plane = src[0];
00626             r_plane = src[1];
00627             b_plane = src[2];
00628             gb_plane = src[3];
00629         }
00630 
00631         for (i = 0; i < width/2; i += 16) {
00632             uint8_t * dcol = dst + i*8;
00633 
00634             for (j = 0; j < height/2; j++) {
00635                 INTERPOLATE_GB_ROW (kernel_stride, -1);
00636 
00637                 uint8_t * drow = dcol + j*2*dstride + drow_offset1;
00638                 _mm_store_si128 ((__m128i *)drow,
00639                         _mm_unpacklo_epi32 (bgrar1, bgral1));
00640                 _mm_store_si128 ((__m128i *)(drow+16),
00641                         _mm_unpackhi_epi32 (bgrar1, bgral1));
00642                 _mm_store_si128 ((__m128i *)(drow+32),
00643                         _mm_unpacklo_epi32 (bgrar2, bgral2));
00644                 _mm_store_si128 ((__m128i *)(drow+48),
00645                         _mm_unpackhi_epi32 (bgrar2, bgral2));
00646                 _mm_store_si128 ((__m128i *)(drow+64),
00647                         _mm_unpacklo_epi32 (bgrar3, bgral3));
00648                 _mm_store_si128 ((__m128i *)(drow+80),
00649                         _mm_unpackhi_epi32 (bgrar3, bgral3));
00650                 _mm_store_si128 ((__m128i *)(drow+96),
00651                         _mm_unpacklo_epi32 (bgrar4, bgral4));
00652                 _mm_store_si128 ((__m128i *)(drow+112),
00653                         _mm_unpackhi_epi32 (bgrar4, bgral4));
00654 
00655                 INTERPOLATE_RG_ROW (kernel_stride, -1);
00656 
00657                 drow = dcol + j*2*dstride + drow_offset2;
00658                 _mm_store_si128 ((__m128i *)drow,
00659                         _mm_unpacklo_epi32 (bgrar1, bgral1));
00660                 _mm_store_si128 ((__m128i *)(drow+16),
00661                         _mm_unpackhi_epi32 (bgrar1, bgral1));
00662                 _mm_store_si128 ((__m128i *)(drow+32),
00663                         _mm_unpacklo_epi32 (bgrar2, bgral2));
00664                 _mm_store_si128 ((__m128i *)(drow+48),
00665                         _mm_unpackhi_epi32 (bgrar2, bgral2));
00666                 _mm_store_si128 ((__m128i *)(drow+64),
00667                         _mm_unpacklo_epi32 (bgrar3, bgral3));
00668                 _mm_store_si128 ((__m128i *)(drow+80),
00669                         _mm_unpackhi_epi32 (bgrar3, bgral3));
00670                 _mm_store_si128 ((__m128i *)(drow+96),
00671                         _mm_unpacklo_epi32 (bgrar4, bgral4));
00672                 _mm_store_si128 ((__m128i *)(drow+112),
00673                         _mm_unpackhi_epi32 (bgrar4, bgral4));
00674 
00675             }
00676             gb_plane += 16;
00677             b_plane += 16;
00678             r_plane += 16;
00679             gr_plane += 16;
00680         }
00681     }
00682     return 0;
00683 #endif
00684 }
00685 
00686 template Image<PixRGB<byte> >  debayerSSE2(const Image<byte>& src, BayerFormat format);
00687 template Image<PixRGB<uint16> >  debayerSSE2(const Image<uint16>& src, BayerFormat format);
00688 
00689 // ######################################################################
00690 /* So things look consistent in everyone's emacs... */
00691 /* Local Variables: */
00692 /* indent-tabs-mode: nil */
00693 /* End: */