mmx-sse.C

Go to the documentation of this file.
00001 /*!@file Util/mmx-sse.C -- Optimized implementations of low-level functions
00002   for MMX/SSE */
00003 
00004 // //////////////////////////////////////////////////////////////////// //
00005 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2003   //
00006 // by the University of Southern California (USC) and the iLab at USC.  //
00007 // See http://iLab.usc.edu for information about this project.          //
00008 // //////////////////////////////////////////////////////////////////// //
00009 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00010 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00011 // in Visual Environments, and Applications'' by Christof Koch and      //
00012 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00013 // pending; application number 09/912,225 filed July 23, 2001; see      //
00014 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00015 // //////////////////////////////////////////////////////////////////// //
00016 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00017 //                                                                      //
00018 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00019 // redistribute it and/or modify it under the terms of the GNU General  //
00020 // Public License as published by the Free Software Foundation; either  //
00021 // version 2 of the License, or (at your option) any later version.     //
00022 //                                                                      //
00023 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00024 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00025 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00026 // PURPOSE.  See the GNU General Public License for more details.       //
00027 //                                                                      //
00028 // You should have received a copy of the GNU General Public License    //
00029 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00030 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00031 // Boston, MA 02111-1307 USA.                                           //
00032 // //////////////////////////////////////////////////////////////////// //
00033 //
00034 // Primary maintainer for this file: Nitin Dhavale <dhavale@usc.edu>
00035 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/mmx-sse.C $
00036 // $Id: mmx-sse.C 10118 2008-08-18 23:51:38Z ilab24 $
00037 //
00038 
00039 #include "Util/mmx-sse.H"
00040 #include "Util/log.H"
00041 
// Shorthand type names used only by the routines in this file.
// (The numeric listing prefixes that preceded each declaration were not
// part of the program text and have been dropped.)
typedef int           int32;    // element/count type of the integer routines
typedef unsigned char byte;     // element type of the 8-bit routines
typedef float         float32;  // element type of the single-precision routines
00046 
00047 #ifdef INVT_CPU_OPTERON
00048 
00049 #ifdef INVT_USE_SSE
00050 
00051 //######################################################################
00052 void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz)
00053 {
00054   static int32 rcx= sz>>2;
00055   static int32 rdx= sz & 0x3;
00056 
00057   asm (
00058        "or %%rcx, %%rcx;\n\t"
00059        "jz .AG2;\n\t"
00060        ".AG1:;\n\t"
00061        "movupd  0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0
00062        "movupd  0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0
00063        "movupd  16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4
00064        "movupd  16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4
00065        "movupd  %%xmm0, %%xmm4;\n\t"   // xmm4 <- a3 a2 a1 a0
00066        "movupd  %%xmm1, %%xmm5;\n\t"   // xmm5 <- b3 b2 b1 b0
00067        "movupd  %%xmm2, %%xmm6;\n\t"   // xmm6 <- a7 a6 a5 a4
00068        "movupd  %%xmm3, %%xmm7;\n\t"   // xmm7 <- b7 b6 b5 b4
00069        "subpd   %%xmm1, %%xmm0;\n\t"   // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0)
00070        "subpd   %%xmm3, %%xmm2;\n\t"   // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4)
00071        "subpd   %%xmm4, %%xmm5;\n\t"   // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0)
00072        "subpd   %%xmm6, %%xmm7;\n\t"   // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4)
00073        "maxpd   %%xmm0, %%xmm5;\n\t"   // xmm5 <- max(xmm0,xmm5)
00074        "maxpd   %%xmm2, %%xmm7;\n\t"   // xmm7 <- max(xmm2,xmm7)
00075        "movupd  %%xmm5, 0(%%rbx);\n\t"
00076        "movupd  %%xmm7, 16(%%rbx);\n\t"
00077        "add $32, %%rsi;\n\t"
00078        "add $32, %%rdi;\n\t"
00079        "add $32, %%rbx;\n\t"
00080        "loop  .AG1;\n\t"
00081        ".AG2:;\n\t"
00082        "mov %%rdx, %%rcx;\n\t"
00083        "or %%rcx, %%rcx;\n\t"
00084        "jz .AG4;\n\t"
00085        ".AG3:;\n\t"
00086        "movsd 0(%%rsi), %%xmm0;\n\t"
00087        "movsd 0(%%rdi), %%xmm1;\n\t"
00088        "movsd %%xmm0, %%xmm2;\n\t"
00089        "movsd %%xmm1, %%xmm3;\n\t"
00090        "subsd %%xmm3, %%xmm2;\n\t"
00091        "subsd %%xmm0, %%xmm1;\n\t"
00092        "maxsd %%xmm2, %%xmm1;\n\t"
00093        "movsd %%xmm1, 0(%%rbx);\n\t"
00094        "add $8, %%rsi;\n\t"
00095        "add $8, %%rdi;\n\t"
00096        "add $8, %%rbx;\n\t"
00097        "loop .AG3;\n\t"
00098        ".AG4:;\n\t"
00099        :
00100        :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx)
00101        :"memory"
00102        );
00103 }
00104 #endif
00105 
00106 #ifdef INVT_USE_MMXSSE2
00107 //######################################################################
00108 // speedup ~= 2.1
00109 void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz)
00110 {
00111   static int32 rcx= sz>>3;
00112   static int32 rdx= sz & 0x7;
00113 
00114   asm (
00115        "or %%rcx, %%rcx;\n\t"
00116        "jz .AE2;\n\t"
00117        ".AE1:;\n\t"
00118        "movups  0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0
00119        "movups  0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0
00120        "movups  16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4
00121        "movups  16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4
00122        "movups  %%xmm0, %%xmm4;\n\t"   // xmm4 <- a3 a2 a1 a0
00123        "movups  %%xmm1, %%xmm5;\n\t"   // xmm5 <- b3 b2 b1 b0
00124        "movups  %%xmm2, %%xmm6;\n\t"   // xmm6 <- a7 a6 a5 a4
00125        "movups  %%xmm3, %%xmm7;\n\t"   // xmm7 <- b7 b6 b5 b4
00126        "subps   %%xmm1, %%xmm0;\n\t"   // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0)
00127        "subps   %%xmm3, %%xmm2;\n\t"   // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4)
00128        "subps   %%xmm4, %%xmm5;\n\t"   // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0)
00129        "subps   %%xmm6, %%xmm7;\n\t"   // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4)
00130        "maxps   %%xmm0, %%xmm5;\n\t"   // xmm5 <- max(xmm0,xmm5)
00131        "maxps   %%xmm2, %%xmm7;\n\t"   // xmm7 <- max(xmm2,xmm7)
00132        "movups  %%xmm5, 0(%%rbx);\n\t"
00133        "movups  %%xmm7, 16(%%rbx);\n\t"
00134        "add $32, %%rsi;\n\t"
00135        "add $32, %%rdi;\n\t"
00136        "add $32, %%rbx;\n\t"
00137        "loop  .AE1;\n\t"
00138        ".AE2:;\n\t"
00139        "mov %%rdx, %%rcx;\n\t"
00140        "or %%rcx, %%rcx;\n\t"
00141        "jz .AE4;\n\t"
00142        ".AE3:;\n\t"
00143        "movss 0(%%rsi), %%xmm0;\n\t"
00144        "movss 0(%%rdi), %%xmm1;\n\t"
00145        "movss %%xmm0, %%xmm2;\n\t"
00146        "movss %%xmm1, %%xmm3;\n\t"
00147        "subss %%xmm3, %%xmm2;\n\t"
00148        "subss %%xmm0, %%xmm1;\n\t"
00149        "maxss %%xmm2, %%xmm1;\n\t"
00150        "movss %%xmm1, 0(%%rbx);\n\t"
00151        "add $4, %%rsi;\n\t"
00152        "add $4, %%rdi;\n\t"
00153        "add $4, %%rbx;\n\t"
00154        "loop .AE3;\n\t"
00155        ".AE4:;\n\t"
00156        "emms;\n\t"
00157        :
00158        :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx)
00159        :"memory"
00160        );
00161 }
00162 
00163 
00164 
00165 //######################################################################
00166 // speedup ~= 3.4
00167 void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz)
00168 {
00169   static int32 rcx= sz>>3;
00170   static int32 rdx= sz&0x7;
00171 
00172   asm (
00173        "or %%rcx, %%rcx;\n\t"
00174        "jz .AF2;\n\t"
00175        ".AF1:;\n\t"
00176        "movdqu  0(%%rsi), %%xmm0;\n\t"
00177        "movdqu  0(%%rdi), %%xmm1;\n\t"
00178        "movdqu  16(%%rsi), %%xmm2;\n\t"
00179        "movdqu  16(%%rdi), %%xmm3;\n\t"
00180        "movdqu  %%xmm0, %%xmm4;\n\t"
00181        "movdqu  %%xmm1, %%xmm5;\n\t"
00182        "movdqu  %%xmm2, %%xmm6;\n\t"
00183        "movdqu  %%xmm3, %%xmm7;\n\t"
00184        "psubusw %%xmm1, %%xmm0;\n\t"
00185        "psubusw %%xmm3, %%xmm2;\n\t"
00186        "psubusw %%xmm4, %%xmm5;\n\t"
00187        "psubusw %%xmm6, %%xmm7;\n\t"
00188        "pmaxsw  %%xmm0, %%xmm5;\n\t"
00189        "pmaxsw  %%xmm2, %%xmm7;\n\t"
00190        "movdqu  %%xmm5, 0(%%rbx);\n\t"
00191        "movdqu  %%xmm7, 16(%%rbx);\n\t"
00192        "add $32, %%rsi;\n\t"
00193        "add $32, %%rdi;\n\t"
00194        "add $32, %%rbx;\n\t"
00195        "loop  .AF1;\n\t"
00196        ".AF2:;\n\t"
00197        "mov %%rdx, %%rcx;\n\t"
00198        "or %%rcx, %%rcx;\n\t"
00199        "jz .AF4;\n\t"
00200        ".AF3:;\n\t"
00201        "mov (%%rsi), %%rax;\n\t"
00202        "mov (%%rdi), %%rdx;\n\t"
00203        "cmp %%rdx, %%rax;\n\t"
00204        "ja .AF5;\n\t"
00205        "xchg %%rax, %%rdx;\n\t"
00206        ".AF5:;\n\t"
00207        "sub %%rdx, %%rax;\n\t"
00208        "mov %%rax, (%%rbx);\n\t"
00209        "add $4, %%rsi;\n\t"
00210        "add $4, %%rdi;\n\t"
00211        "add $4, %%rbx;\n\t"
00212        "loop .AF3;\n\t"
00213        ".AF4:;\n\t"
00214        "emms;\n\t"
00215        :
00216        :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx)
00217        :"memory"
00218        );
00219 }
00220 
00221 
00222 //######################################################################
00223 // speedup ~=10.0!
00224 void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz)
00225 {
00226   static int32 rcx= sz>>5;
00227   static int32 rdx= sz&0x1f;
00228 
00229   asm (
00230        "or %%rcx, %%rcx;\n\t"
00231        "jz .AD2;\n\t"
00232        ".AD1:;\n\t"
00233        "movdqu  0(%%rsi), %%xmm0;\n\t" // xmm0<- a15 ... a3 a2 a1 a0
00234        "movdqu  0(%%rdi), %%xmm1;\n\t" // xmm1<- b15 ... b3 b2 b1 b0
00235        "movdqu  16(%%rsi), %%xmm2;\n\t"// xmm2<- a31 ... a18 a17 a16
00236        "movdqu  16(%%rdi), %%xmm3;\n\t"// xmm3<- b31 ... b18 b17 b16
00237        "movdqu  %%xmm0, %%xmm4;\n\t"   // xmm4<- a15 ... a3 a2 a1 a0
00238        "movdqu  %%xmm1, %%xmm5;\n\t"   // xmm5<- b15 ... b3 b2 b1 b0
00239        "movdqu  %%xmm2, %%xmm6;\n\t"   // xmm6<- a31 ... a18 a17 a16
00240        "movdqu  %%xmm3, %%xmm7;\n\t"   // xmm7<- b31 ... b18 b17 b16
00241        "psubusb %%xmm1, %%xmm0;\n\t"   // xmm0<-(a15-b15)...( a1-b1 )(a0-b0)
00242        "psubusb %%xmm3, %%xmm2;\n\t"   // xmm2<-(a31-b31)...(a17-b17)(a16-b16)
00243        "psubusb %%xmm4, %%xmm5;\n\t"   // xmm5<-(b15-a15)...(b17-a17)(b16-a16)
00244        "psubusb %%xmm6, %%xmm7;\n\t"   // xmm7<-(b31-a31)...(b17-a17)(b16-a16)
00245        "pmaxub  %%xmm0, %%xmm5;\n\t"   // xmm5<- max(xmm0,xmm5)
00246        "pmaxub  %%xmm2, %%xmm7;\n\t"   // xmm7<- max(xmm2,xmm7)
00247        "movdqu  %%xmm5, 0(%%rbx);\n\t"
00248        "movdqu  %%xmm7, 16(%%rbx);\n\t"
00249        "add $32, %%rsi;\n\t"
00250        "add $32, %%rdi;\n\t"
00251        "add $32, %%rbx;\n\t"
00252        "loop  .AD1;\n\t"
00253        ".AD2:;\n\t"
00254        "mov %%rdx, %%rcx;\n\t"
00255        "or %%rcx, %%rcx;\n\t"
00256        "jz .AD4;\n\t"
00257        ".AD3:;\n\t"
00258        "movb (%%rsi), %%al;\n\t"
00259        "movb (%%rdi), %%dl;\n\t"
00260        "cmpb %%dl, %%al;\n\t"
00261        "ja .AD5;\n\t"
00262        "xchgb %%al, %%dl;\n\t"
00263        ".AD5:;\n\t"
00264        "subb %%dl, %%al;\n\t"
00265        "movb %%al, (%%rbx);\n\t"
00266        "inc %%rbx;\n\t"
00267        "inc %%rsi;\n\t"
00268        "inc %%rdi;\n\t"
00269        "loop .AD3;\n\t"
00270        ".AD4:;\n\t"
00271        "emms;\n\t"
00272        :
00273        :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx)
00274        :"memory"
00275        );
00276 }
00277 #endif
00278 
00279 #ifdef INVT_USE_SSE
00280 //######################################################################
00281 // speedup ~= 2.0
00282 void sse_sum(const double *a, double *sum, const int32 sz)
00283 {
00284   static int32 rcx = sz>>3;
00285   static int32 rdx = sz&0x7;
00286 
00287   asm (
00288        "pxor %%xmm4, %%xmm4;\n\t"
00289        "pxor %%xmm5, %%xmm5;\n\t"
00290        "pxor %%xmm6, %%xmm6;\n\t"
00291        "pxor %%xmm7, %%xmm7;\n\t"
00292        "or %%rcx, %%rcx;\n\t"
00293        "jz  BE1;\n\t"
00294        ".BE0:\n\t"
00295        "movupd     0(%%rsi), %%xmm0;\n\t"
00296        "movupd  16(%%rsi), %%xmm1;\n\t"
00297        "movupd  32(%%rsi), %%xmm2;\n\t"
00298        "movupd  48(%%rsi), %%xmm3;\n\t"
00299        "addpd %%xmm0, %%xmm4;\n\t"
00300        "addpd %%xmm1, %%xmm5;\n\t"
00301        "addpd %%xmm2, %%xmm6;\n\t"
00302        "addpd %%xmm3, %%xmm7;\n\t"
00303        "add $64, %%rsi;\n\t"
00304        "loop .BE0;\n\t"
00305        "BE1:;\n\t"
00306        "mov %%rdx, %%rcx;\n\t"
00307        "pxor %%xmm0, %%xmm0;\n\t"
00308        "or %%rcx, %%rcx;\n\t"
00309        "jz BE2;\n\t"
00310        "BE3:;\n\t"
00311        "movupd 0(%%rsi), %%xmm1;\n\t"
00312        "addpd %%xmm1, %%xmm0;\n\t"
00313        "add $16, %%rsi;\n\t"
00314        "loop BE3;\n\t"
00315        "BE2:;\n\t"
00316        "addpd %%xmm4, %%xmm7;\n\t"
00317        "addpd %%xmm5, %%xmm7;\n\t"
00318        "addpd %%xmm6, %%xmm7;\n\t"
00319        "addpd %%xmm7, %%xmm0;\n\t"
00320        "movhpd %%xmm0, (%%rbx);\n\t"
00321        "addsd  (%%rbx), %%xmm0;\n\t"
00322        "movsd %%xmm0, (%%rbx);\n\t"
00323        "emms;\n\t"
00324        :
00325        :"S"(a), "b"(sum), "c"(rcx), "d"(rdx)
00326        :"memory"
00327        );
00328 }
00329 #endif
00330 
00331 #ifdef INVT_USE_MMXSSE2
00332 //######################################################################
00333 //speedup ~= 4
00334 void sse2_sum(const float *a, double *sum, const int32 sz)
00335 {
00336   static int32 rcx = sz>>3;
00337   static int32 rdx = sz & 0x7;
00338 
00339   asm (
00340        "pxor %%xmm4, %%xmm4;\n\t"
00341        "pxor %%xmm5, %%xmm5;\n\t"
00342        "pxor %%xmm6, %%xmm6;\n\t"
00343        "pxor %%xmm7, %%xmm7;\n\t"
00344        "or %%rcx, %%rcx;\n\t"
00345        "jz  BA1;\n\t"
00346        ".BA0:\n\t"
00347        "cvtps2pd  0(%%rsi), %%xmm0;\n\t"
00348        "cvtps2pd  8(%%rsi), %%xmm1;\n\t"
00349        "cvtps2pd  16(%%rsi), %%xmm2;\n\t"
00350        "cvtps2pd 24(%%rsi), %%xmm3;\n\t"
00351        "addpd %%xmm0, %%xmm4;\n\t"
00352        "addpd %%xmm1, %%xmm5;\n\t"
00353        "addpd %%xmm2, %%xmm6;\n\t"
00354        "addpd %%xmm3, %%xmm7;\n\t"
00355        "add $32, %%rsi;\n\t"
00356        "loop .BA0;\n\t"
00357        "BA1:;\n\t"
00358        "pxor %%xmm0, %%xmm0;\n\t"
00359        "mov %%rdx, %%rcx;\n\t"
00360        "or %%rcx, %%rcx;\n\t"
00361        "jz BA2;\n\t"
00362        "BA3:;\n\t"
00363        "cvtps2pd 0(%%rsi), %%xmm1;\n\t"
00364        "addpd %%xmm1, %%xmm0;\n\t"
00365        "add $8, %%rsi;\n\t"
00366        "loop BA3;\n\t"
00367        "BA2:;\n\t"
00368        "addpd %%xmm4, %%xmm7;\n\t"
00369        "addpd %%xmm5, %%xmm7;\n\t"
00370        "addpd %%xmm6, %%xmm7;\n\t"
00371        "addpd %%xmm7, %%xmm0;\n\t"
00372        "movhpd %%xmm0, (%%rbx);\n\t"
00373        "addsd  (%%rbx), %%xmm0;\n\t"
00374        "movsd %%xmm0, (%%rbx);\n\t"
00375        "emms;\n\t"
00376        :
00377        :"S"(a), "b"(sum), "c"(rcx), "d"(rdx)
00378        :"memory"
00379        );
00380 }
00381 
00382 
00383 //######################################################################
00384 // speedup ~= 4.0
00385 void sse2_sum(const int32 *a, double *sum, const int32 sz)
00386 {
00387   static int32 rcx = sz>>3;
00388   static int32 rdx = sz & 0x7;
00389 
00390   asm (
00391        "pxor %%xmm4, %%xmm4;\n\t"
00392        "pxor %%xmm5, %%xmm5;\n\t"
00393        "pxor %%xmm6, %%xmm6;\n\t"
00394        "pxor %%xmm7, %%xmm7;\n\t"
00395        "or %%rcx, %%rcx;\n\t"
00396        ".BC0:\n\t"
00397        "cvtdq2pd  0(%%rsi), %%xmm0;\n\t"
00398        "cvtdq2pd  8(%%rsi), %%xmm1;\n\t"
00399        "cvtdq2pd  16(%%rsi), %%xmm2;\n\t"
00400        "cvtdq2pd 24(%%rsi), %%xmm3;\n\t"
00401        "addpd %%xmm0, %%xmm4;\n\t"
00402        "addpd %%xmm1, %%xmm5;\n\t"
00403        "addpd %%xmm2, %%xmm6;\n\t"
00404        "addpd %%xmm3, %%xmm7;\n\t"
00405        "add $32, %%rsi;\n\t"
00406        "loop .BC0;\n\t"
00407        "BC1:;\n\t"
00408        "pxor %%xmm0, %%xmm0;\n\t"
00409        "mov %%rdx, %%rcx;\n\t"
00410        "or %%rcx, %%rcx;\n\t"
00411        "jz BC2;\n\t"
00412        "BC3:;\n\t"
00413        "cvtdq2pd 0(%%rsi), %%xmm1;\n\t"
00414        "addpd %%xmm1, %%xmm0;\n\t"
00415        "add $8, %%rsi;\n\t"
00416        "loop BC3;\n\t"
00417        "BC2:;\n\t"
00418        "addpd %%xmm4, %%xmm7;\n\t"
00419        "addpd %%xmm5, %%xmm7;\n\t"
00420        "addpd %%xmm6, %%xmm7;\n\t"
00421        "addpd %%xmm7, %%xmm0;\n\t"
00422        "movhpd %%xmm0, (%%rbx);\n\t"
00423        "addsd  (%%rbx), %%xmm0;\n\t"
00424        "movsd %%xmm0, (%%rbx);\n\t"
00425        "emms;\n\t"
00426        :
00427        :"S"(a), "b"(sum), "c"(rcx), "d"(rdx)
00428        :"memory"
00429        );
00430 }
00431 
00432 
00433 
00434 //######################################################################
00435 void sse2_sum(const byte *a, double *sum, const int32 sz)
00436 {
00437   static int rcx = sz>>5;
00438   static int rdx = sz & 0x1f;
00439 
00440   asm (
00441        "or %%rcx, %%rcx;\n\t"
00442        "jz  BB1;\n\t"
00443        "pxor %%xmm7, %%xmm7;\n\t"
00444        "push %%rbx;\n\t"
00445        "push %%rdx;\n\t"
00446        "BB3:;\n\t"
00447        "pxor %%xmm5, %%xmm5;\n\t"
00448        "pxor %%xmm6, %%xmm6;\n\t"
00449        "movdqu (%%rsi), %%xmm0;\n\t"
00450        "movdqu 16(%%rsi), %%xmm1;\n\t"
00451        "psadbw %%xmm0, %%xmm5;\n\t"
00452        "psadbw %%xmm1, %%xmm6;\n\t"
00453        "pextrw $0, %%xmm5, %%rax;\n\t"
00454        "cvtsi2sd %%rax, %%xmm0;\n\t"
00455        "pextrw $4, %%xmm5, %%rbx;\n\t"
00456        "cvtsi2sd %%rbx, %%xmm1;\n\t"
00457        "pextrw $0, %%xmm6, %%rdx;\n\t"
00458        "cvtsi2sd %%rdx, %%xmm2;\n\t"
00459        "pextrw $4, %%xmm6, %%rdi;\n\t"
00460        "cvtsi2sd %%rdi, %%xmm3;\n\t"
00461        "addsd %%xmm0, %%xmm1;\n\t"
00462        "addsd %%xmm2, %%xmm3;\n\t"
00463        "addsd %%xmm1, %%xmm7;\n\t"
00464        "addsd %%xmm3, %%xmm7;\n\t"
00465        "add $32, %%rsi;\n\t"
00466        "loop BB3;\n\t"
00467        "pop %%rdx;\n\t"
00468        "pop %%rbx;\n\t"
00469        "BB1:;\n\t"
00470        "xor %%rdi, %%rdi;\n\t"
00471        "mov %%rdx, %%rcx;\n\t"
00472        "or %%rcx, %%rcx;\n\t"
00473        "jz BB2;\n\t"
00474        "BB5:;\n\t"
00475        "xor %%rax, %%rax;\n\t"
00476        "movb (%%rsi), %%al;\n\t"
00477        "add %%rax, %%rdi;\n\t"
00478        "inc %%rsi;\n\t"
00479        "loop BB5;\n\t"
00480        "BB2:\n\t"
00481        "cvtsi2sd %%rdi, %%xmm0;\n\t"
00482        "addsd %%xmm0, %%xmm7;\n\t"
00483        "movhpd %%xmm7, (%%rbx);\n\t"
00484        "addsd  (%%rbx), %%xmm7;\n\t"
00485        "movsd %%xmm7, (%%rbx);\n\t"
00486        "BB6:;\n\t"
00487        "emms;\n\t"
00488        :
00489        :"S"(a), "c"(rcx),"b"(sum),"d"(rdx)
00490        :"memory","rax","rdi"
00491        );
00492 }
00493 #endif
00494 
00495 #ifdef INVT_USE_SSE
00496 //######################################################################
00497 // speedup ~= 10 !
00498 void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
00499 {
00500   int rcx = sz >> 6;
00501   int rdx = sz & 0x7f;
00502 
00503   asm (
00504        "or %%rcx, %%rcx;\n\t"
00505        "jz .DA0;\n\t"
00506        ".DA1:;\n\t"
00507        "movdqu (%%rsi), %%xmm0;\n\t"
00508        "movdqu (%%rdi), %%xmm4;\n\t"
00509        "movdqu 16(%%rsi), %%xmm1;\n\t"
00510        "movdqu 16(%%rdi), %%xmm5;\n\t"
00511        "movdqu 32(%%rsi), %%xmm2;\n\t"
00512        "movdqu 32(%%rdi), %%xmm6;\n\t"
00513        "movdqu 48(%%rsi), %%xmm3;\n\t"
00514        "movdqu 48(%%rdi), %%xmm7;\n\t"
00515        "psubusb %%xmm4, %%xmm0;\n\t"
00516        "psubusb %%xmm5, %%xmm1;\n\t"
00517        "psubusb %%xmm6, %%xmm2;\n\t"
00518        "psubusb %%xmm7, %%xmm3;\n\t"
00519        "movdqu  %%xmm0, 0(%%rbx);\n\t"
00520        "movdqu  %%xmm1, 16(%%rbx);\n\t"
00521        "movdqu  %%xmm2, 32(%%rbx);\n\t"
00522        "movdqu  %%xmm3, 48(%%rbx);\n\t"
00523        "add $64, %%rsi;\n\t"
00524        "add $64, %%rdi;\n\t"
00525        "add $64, %%rbx;\n\t"
00526        "loop .DA1;\n\t"
00527        ".DA0:;\n\t"
00528        "mov %%rdx, %%rcx;\n\t"
00529        "or %%rcx, %%rcx;\n\t"
00530        "jz .DA2;\n\t"
00531        ".DA3:;\n\t"
00532        "movb (%%rsi), %%al;\n\t"
00533        "movb (%%rdi), %%dl;\n\t"
00534        "cmpb %%bl, %%al;\n\t"
00535        "ja .DA4;\n\t"
00536        "xchg %%al, %%bl;\n\t"
00537        ".DA4:;\n\t"
00538        "subb %%bl, %%al;\n\t"
00539        "movb %%al, (%%rbx);\n\t"
00540        "inc %%rsi;\n\t"
00541        "inc %%rdi;\n\t"
00542        "inc %%rbx;\n\t"
00543        "loop .DA3;\n\t"
00544        ".DA2:;\n\t"
00545        "emms;\n\t"
00546        :
00547        :"S"(a),"D"(b),"c"(rcx),"d"(rdx),"b"(result)
00548        );
00549 }
00550 
00551 
00552 //######################################################################
00553 // speedup ~= 20 !
00554 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result,
00555                         const int32 sz)
00556 {
00557   int32 rcx=sz>>5;
00558   int32 rdx=sz&0x1f;
00559 
00560   asm (
00561        "or %%rcx, %%rcx;\n\t"
00562        "jz .DB0;\n\t"
00563        ".DB1:;\n\t"
00564        "movups  0(%%rsi), %%xmm0;\n\t"
00565        "movups  0(%%rdi), %%xmm1;\n\t"
00566        "movups 16(%%rsi), %%xmm2;\n\t"
00567        "movups 16(%%rdi), %%xmm3;\n\t"
00568        "movups %%xmm1, %%xmm6;\n\t"
00569        "movups %%xmm3, %%xmm7;\n\t"
00570        "cmpps  $1, %%xmm0, %%xmm6;\n\t"
00571        "cmpps  $1, %%xmm2, %%xmm7;\n\t"
00572        "subps  %%xmm1, %%xmm0;\n\t"
00573        "subps  %%xmm3, %%xmm2;\n\t"
00574        "andps  %%xmm6, %%xmm0;\n\t"
00575        "andps  %%xmm7, %%xmm2;\n\t"
00576        "movups %%xmm0, (%%rbx);\n\t"
00577        "movups %%xmm2, 16(%%rbx);\n\t"
00578        "add  $32, %%rsi;\n\t"
00579        "add  $32, %%rdi;\n\t"
00580        "add  $32, %%rbx;\n\t"
00581        "loop .DB1;\n\t"
00582        ".DB0:;\n\t"
00583        "mov %%rdx, %%rcx;\n\t"
00584        "or %%rcx, %%rcx;\n\t"
00585        "jz .DB2;\n\t"
00586        ".DB3:;\n\t"
00587        "movss (%%rsi), %%xmm0;\n\t"
00588        "movss (%%rdi), %%xmm1;\n\t"
00589        "movss %%xmm1, %%xmm2;\n\t"
00590        "cmpss $1, %%xmm0,  %%xmm2;\n\t"
00591        "andps %%xmm2, %%xmm0;\n\t"
00592        "andps %%xmm2, %%xmm1;\n\t"
00593        "subss %%xmm1,  %%xmm0;\n\t"
00594        "movss %%xmm0,  (%%rbx);\n\t"
00595        "add $4, %%rsi;\n\t"
00596        "add $4, %%rdi;\n\t"
00597        "add $4, %%rbx;\n\t"
00598        "loop .DB3;\n\t"
00599        ".DB2:;\n\t"
00600        :
00601        :"S"(a), "D"(b), "b"(result), "c"(rcx), "d"(rdx)
00602        :"memory"
00603        );
00604 }
00605 
00606 
00607 //######################################################################
00608 // speedup ~= 3
00609 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz)
00610 {
00611   int32 rcx=sz>>3;
00612   int32 rdx=sz&0x7;
00613   asm (
00614        "or %%rcx, %%rcx;\n\t"
00615        "jz .DC0;\n\t"
00616        ".DC1:;\n\t"
00617        "movdqu 0(%%rsi), %%xmm0;\n\t" //xmm0=  a3     a2     a1     a0
00618        "movdqu 0(%%rdi), %%xmm1;\n\t" //xmm1=  b3     b2     b1     b0
00619        "movdqu 16(%%rsi), %%xmm3;\n\t"//xmm3=  a7     a6     a5     a4
00620        "movdqu 16(%%rdi), %%xmm4;\n\t"//xmm4=  b7     b6     b5     b4
00621        "movdqu  %%xmm0, %%xmm2;\n\t"  //xmm2=  a3     a2     a1     a0
00622        "movdqu  %%xmm3, %%xmm5;\n\t"  //xmm5=  a7     a6     a5     a4
00623        "pcmpgtd %%xmm1, %%xmm2;\n\t"  //xmm2=(a3>b3)(a2>b2)(a1>b1)(a0>b0)
00624        "pcmpgtd %%xmm4, %%xmm5;\n\t"  //xmm5=(a7>b7)(a6>b6)(b5>a5)(a4>b4)
00625        "psubd   %%xmm1, %%xmm0;\n\t"  //xmm0=(a3-b3)(a2-b2)(a1-b1)(a0-b0)
00626        "psubd   %%xmm4, %%xmm3;\n\t"  //xmm3=(a7-b7)(a6-b6)(a5-b5)(a4-b4)
00627        "pand    %%xmm2, %%xmm0;\n\t"
00628        "pand    %%xmm5, %%xmm3;\n\t"
00629        "movdqu  %%xmm0, (%%rbx);\n\t"
00630        "movdqu  %%xmm3, 16(%%rbx);\n\t"
00631        "add $32, %%rsi;\n\t"
00632        "add $32, %%rdi;\n\t"
00633        "add $32, %%rbx;\n\t"
00634        "loop .DC1;\n\t"
00635        ".DC0:;\n\t"
00636        "mov %%rdx, %%rcx;\n\t"
00637        "or  %%rcx, %%rcx;\n\t"
00638        "jz .DC2;\n\t"
00639        ".DC3:;\n\t"
00640        "movsd 0(%%rsi), %%xmm0;\n\t"
00641        "movsd 0(%%rdi), %%xmm1;\n\t"
00642        "movdqu %%xmm0, %%xmm2;\n\t"
00643        "pcmpgtd %%xmm1, %%xmm2;\n\t"
00644        "psubd   %%xmm1, %%xmm0;\n\t"
00645        "pand    %%xmm2, %%xmm0;\n\t"
00646        "movsd    %%xmm0, (%%rbx);\n\t"
00647        "add $4, %%rsi;\n\t"
00648        "add $4, %%rdi;\n\t"
00649        "add $4, %%rbx;\n\t"
00650        "loop .DC3;\n\t"
00651        ".DC2:;\n\t"
00652        :
00653        :"S"(a), "D"(b), "c"(rcx), "d"(rdx), "b"(c)
00654        :"memory"
00655        );
00656 }
00657 
00658 
00659 //######################################################################
00660 // speedup ~= 4-5
00661 void sse_binaryReverse(const byte *a, byte *result, const byte val, const
00662                                 int32 sz)
00663 {
00664   static unsigned int rcx=(sz>>7);
00665   static unsigned int rdx=sz&0x7f;
00666 
00667   byte pVal[16];
00668 
00669   memset(result, val, 16);
00670 
00671   asm (
00672        "or %%rcx, %%rcx;\n\t"
00673        "jz .FA0;\n\t"
00674        ".FA1:;\n\t"
00675        "movdqu  0(%%rbx), %%xmm0;\n\t"
00676        "movdqu  0(%%rbx), %%xmm1;\n\t"
00677        "movdqu  %%xmm0, %%xmm2;\n\t"
00678        "movdqu  %%xmm1, %%xmm3;\n\t"
00679        "movdqu  %%xmm0, %%xmm4;\n\t"
00680        "movdqu  %%xmm1, %%xmm5;\n\t"
00681        "movdqu  %%xmm0, %%xmm6;\n\t"
00682        "movdqu  %%xmm1, %%xmm7;\n\t"
00683        "psubb (%%rsi), %%xmm0;\n\t"
00684        "psubb 16(%%rsi), %%xmm1;\n\t"
00685        "psubb 32(%%rsi), %%xmm2;\n\t"
00686        "psubb 48(%%rsi), %%xmm3;\n\t"
00687        "psubb 64(%%rsi), %%xmm4;\n\t"
00688        "psubb 80(%%rsi), %%xmm5;\n\t"
00689        "psubb 96(%%rsi), %%xmm6;\n\t"
00690        "psubb 112(%%rsi), %%xmm7;\n\t"
00691        "movdqu %%xmm0, (%%rdi);\n\t"
00692        "movdqu %%xmm1, 16(%%rdi);\n\t"
00693        "movdqu %%xmm2, 32(%%rdi);\n\t"
00694        "movdqu %%xmm3, 48(%%rdi);\n\t"
00695        "movdqu %%xmm4, 64(%%rdi);\n\t"
00696        "movdqu %%xmm5, 80(%%rdi);\n\t"
00697        "movdqu %%xmm6, 96(%%rdi);\n\t"
00698        "movdqu %%xmm7, 112(%%rdi);\n\t"
00699        "add $128, %%rdi;\n\t"
00700        "add $128, %%rsi;\n\t"
00701        "loop .FA1;\n\t"
00702        ".FA0:;\n\t"
00703        "mov %%rdx, %%rcx;\n\t"
00704        "or %%rcx, %%rcx;\n\t"
00705        "jz .FA2;\n\t"
00706        "movb (%%rbx), %%dl;\n\t"
00707        ".FA3:;\n\t"
00708        "movb %%dl, %%dh;\n\t"
00709        "movb (%%rsi), %%al;\n\t"
00710        "subb %%al, %%dh;\n\t"
00711        "movb %%dh, (%%rdi);\n\t"
00712        "inc %%rsi;\n\t"
00713        "inc %%rdi;\n\t"
00714        "loop .FA3;\n\t"
00715        ".FA2:;\n\t"
00716        :
00717        :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx)
00718        :"memory","rax"
00719        );
00720 }
00721 
00722 
//######################################################################
// speedup ~= 2 (original author's measurement)
//
// "Reverse" a float array about a constant:
//   result[i] = val - a[i]   for i in [0, sz)
//
// The constant is replicated into a 4-float buffer that the main loop
// loads into the xmm registers; 32 floats (128 bytes) are processed per
// iteration with the floating-point subtract SUBPS.  The remaining
// (sz & 31) elements are handled one float at a time with SUBSS and a
// 4-byte store.  Nothing is written when sz <= 0.
void sse_binaryReverse(const float *a, float *result, const float val,
                                const int sz)
{
  if (sz <= 0) return;

  // Non-static so the counts follow 'sz' on every call; 64-bit because
  // LOOP counts on the whole of %rcx.
  long nblocks = sz >> 5;
  long ntail   = sz & 0x1f;

  // One xmm register's worth of copies of 'val'.
  float pVal[4];
  int i;

  for(i=0;i<4;++i)
    pVal[i] = val;

  __asm__ __volatile__ (
       "test %%rcx, %%rcx;\n\t"
       "jz 2f;\n\t"
       "1:;\n\t"
       "movups (%%rbx), %%xmm0;\n\t"   // 4 x val
       "movups (%%rbx), %%xmm1;\n\t"
       "movaps %%xmm0, %%xmm2;\n\t"
       "movaps %%xmm1, %%xmm3;\n\t"
       "movaps %%xmm0, %%xmm4;\n\t"
       "movaps %%xmm1, %%xmm5;\n\t"
       "movaps %%xmm0, %%xmm6;\n\t"
       "movaps %%xmm1, %%xmm7;\n\t"
       "movups    (%%rsi), %%xmm8;\n\t"  // unaligned-safe loads of a[]
       "subps  %%xmm8, %%xmm0;\n\t"      // val - a, 4 floats each
       "movups  16(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm1;\n\t"
       "movups  32(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm2;\n\t"
       "movups  48(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm3;\n\t"
       "movups  64(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm4;\n\t"
       "movups  80(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm5;\n\t"
       "movups  96(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm6;\n\t"
       "movups 112(%%rsi), %%xmm8;\n\t"
       "subps  %%xmm8, %%xmm7;\n\t"
       "movups %%xmm0,   0(%%rdi);\n\t"
       "movups %%xmm1,  16(%%rdi);\n\t"
       "movups %%xmm2,  32(%%rdi);\n\t"
       "movups %%xmm3,  48(%%rdi);\n\t"
       "movups %%xmm4,  64(%%rdi);\n\t"
       "movups %%xmm5,  80(%%rdi);\n\t"
       "movups %%xmm6,  96(%%rdi);\n\t"
       "movups %%xmm7, 112(%%rdi);\n\t"
       "add $128, %%rsi;\n\t"
       "add $128, %%rdi;\n\t"
       "dec %%rcx;\n\t"                // DEC/JNZ: body exceeds LOOP's 8-bit range
       "jnz 1b;\n\t"
       "2:;\n\t"
       "mov %%rdx, %%rcx;\n\t"         // switch to the leftover count
       "test %%rcx, %%rcx;\n\t"
       "jz 4f;\n\t"
       "3:;\n\t"
       "movss 0(%%rbx), %%xmm0;\n\t"   // val
       "subss (%%rsi), %%xmm0;\n\t"    // val - a
       "movss %%xmm0, (%%rdi);\n\t"    // store one float only
       "add $4, %%rsi;\n\t"
       "add $4, %%rdi;\n\t"
       "loop 3b;\n\t"
       "4:;\n\t"
       : "+S"(a), "+D"(result), "+c"(nblocks), "+d"(ntail)
       : "b"(pVal)
       : "memory", "cc",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
         "xmm8"
       );
}
00785 
00786 
00787 
00788 //######################################################################
00789 
00790 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val,
00791                         const int32 sz)
00792 {
00793   int32 rcx=sz>>5;
00794   int32 rdx=sz&31;
00795   int32 pVal[16];
00796   int i;
00797 
00798   for(i=0;i<16;++i) pVal[i] = val;
00799 
00800   asm (
00801        "or %%rcx, %%rcx;\n\t"
00802        "jz .FC4;\n\t"
00803        ".FC2:;\n\t"
00804        "movdqu (%%rbx), %%xmm0;\n\t"
00805        "movdqu (%%rbx), %%xmm1;\n\t"
00806        "movdqu %%xmm0, %%xmm2;\n\t"
00807        "movdqu %%xmm1, %%xmm3;\n\t"
00808        "movdqu %%xmm0, %%xmm4;\n\t"
00809        "movdqu %%xmm1, %%xmm5;\n\t"
00810        "movdqu %%xmm0, %%xmm6;\n\t"
00811        "movdqu %%xmm1, %%xmm7;\n\t"
00812        "psubd  (%%rsi), %%xmm0;\n\t"
00813        "psubd  16(%%rsi), %%xmm1;\n\t"
00814        "psubd  32(%%rsi), %%xmm2;\n\t"
00815        "psubd  48(%%rsi), %%xmm3;\n\t"
00816        "psubd  64(%%rsi), %%xmm4;\n\t"
00817        "psubd  80(%%rsi), %%xmm5;\n\t"
00818        "psubd  96(%%rsi), %%xmm6;\n\t"
00819        "psubd  112(%%rsi), %%xmm7;\n\t"
00820        "movdqu %%xmm0,  0(%%rdi);\n\t"
00821        "movdqu %%xmm1, 16(%%rdi);\n\t"
00822        "movdqu %%xmm2, 32(%%rdi);\n\t"
00823        "movdqu %%xmm3, 48(%%rdi);\n\t"
00824        "movdqu %%xmm4, 64(%%rdi);\n\t"
00825        "movdqu %%xmm5, 80(%%rdi);\n\t"
00826        "movdqu %%xmm6, 96(%%rdi);\n\t"
00827        "movdqu %%xmm7,112(%%rdi);\n\t"
00828        "add $128, %%rsi;\n\t"
00829        "add $128, %%rdi;\n\t"
00830        "loop .FC2;\n\t"
00831        ".FC4:;\n\t"
00832        "or  %%rdx, %%rdx;\n\t"
00833        "jz .FC1;\n\t"
00834        "mov %%rdx, %%rcx;\n\t"
00835        ".FC3:;\n\t"
00836        "movdqu 0(%%rbx), %%xmm0;\n\t"
00837        "psubd (%%rsi), %%xmm0;\n\t"
00838        "movups %%xmm0, (%%rdi);\n\t"
00839        "add $16, %%rsi;\n\t"
00840        "add $16, %%rdi;\n\t"
00841        "loop .FC3;\n\t"
00842        ".FC1:;\n\t"
00843        :
00844        :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx)
00845        :"memory","rax"
00846        );
00847 }
00848 
00849 
00850 
00851 //######################################################################
00852 
00853 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz)
00854 {
00855   int32 rcx=sz>>4;
00856   int32 rdx=sz&0xf;
00857 
00858   asm(
00859       "or %%rcx, %%rcx;\n\t"
00860       "jz .GA4;\n\t"
00861       "pxor %%xmm0, %%xmm0;\n\t"
00862       ".GA2:;\n\t"
00863       "movdqu 0(%%rsi), %%xmm1;\n\t"
00864       "movdqa %%xmm1, %%xmm2;\n\t"
00865       "movdqa %%xmm1, %%xmm3;\n\t"
00866       "movdqa %%xmm1, %%xmm4;\n\t"
00867       "psrldq $4, %%xmm2;\n\t"
00868       "psrldq $8, %%xmm3;\n\t"
00869       "psrldq $12, %%xmm4;\n\t"
00870       "punpcklbw %%xmm0, %%xmm1;\n\t"
00871       "punpcklbw %%xmm0, %%xmm2;\n\t"
00872       "punpcklbw %%xmm0, %%xmm3;\n\t"
00873       "punpcklbw %%xmm0, %%xmm4;\n\t"
00874       "punpcklbw %%xmm0, %%xmm1;\n\t"
00875       "punpcklbw %%xmm0, %%xmm2;\n\t"
00876       "punpcklbw %%xmm0, %%xmm3;\n\t"
00877       "punpcklbw %%xmm0, %%xmm4;\n\t"
00878       "movdqu %%xmm1, (%%rdi);\n\t"
00879       "movdqu %%xmm2, 16(%%rdi);\n\t"
00880       "movdqu %%xmm3, 32(%%rdi);\n\t"
00881       "movdqu %%xmm4, 48(%%rdi);\n\t"
00882       "add $16, %%rsi;\n\t"
00883       "add $64, %%rdi;\n\t"
00884       "loop .GA2;\n\t"
00885       ".GA4:;\n\t"
00886       "or %%rdx, %%rdx;\n\t"
00887       "jz .GA1;\n\t"
00888       "mov %%rdx, %%rcx;\n\t"
00889       ".GA3:;\n\t"
00890       "xor %%rax, %%rax;\n\t"
00891       "movb (%%rsi), %%al;\n\t"
00892       "mov %%rax, (%%rdi);\n\t"
00893       "inc %%rsi;\n\t"
00894       "add $4, %%rdi;\n\t"
00895       "loop .GA3;\n\t"
00896       ".GA1:;"
00897       :
00898       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
00899       :"memory"
00900       );
00901 
00902 
00903 }
00904 
00905 #endif
00906 
00907 #ifdef INVT_USE_MMXSSE2
00908 
00909 //######################################################################
00910 // speedup ~= 1.5
// Converts sz bytes from a into 32-bit floats in b.  Each byte is
// zero-extended to a 32-bit integer (two punpcklbw steps against a zeroed
// register) and then converted with cvtdq2ps.
//
//   a   input bytes
//   b   output floats
//   sz  number of elements
//
// NOTE(review): the main loop issues 16-byte loads at offsets 0, 4, 8 and 12
// of each 16-byte block although only the low 4 bytes of each load are used,
// so up to 12 bytes beyond the current block are read -- on the last block
// that is beyond a[sz-1] when there is no tail.  Confirm callers pad a.
// NOTE(review): rsi, rdi and rcx are modified although they are declared as
// input-only operands, and rax / xmm0-xmm4 are written without appearing in
// the clobber list.
00911 void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
00912 {
00913   int32 rcx=sz>>4;   // number of full 16-byte blocks
00914   int32 rdx=sz&0xf;  // leftover bytes, converted one at a time
00915 
00916   asm(
00917       "or %%rcx, %%rcx;\n\t"  // skip the block loop when there is no full block
00918       "jz .GB4;\n\t"
00919       ".GB2:;\n\t"  // ---- block loop: 16 bytes -> 16 floats
00920       "pxor %%xmm0, %%xmm0;\n\t"  // xmm0 <- 0, source of the zero bytes
00921       "movdqu 0(%%rsi), %%xmm1;\n\t"  // low 4 bytes used: a[0..3]
00922       "movdqu 4(%%rsi), %%xmm2;\n\t"  // low 4 bytes used: a[4..7]
00923       "movdqu 8(%%rsi), %%xmm3;\n\t"  // low 4 bytes used: a[8..11]
00924       "movdqu 12(%%rsi), %%xmm4;\n\t"  // low 4 bytes used: a[12..15]
00925       "punpcklbw %%xmm0, %%xmm1;\n\t"  // bytes -> 16-bit words
00926       "punpcklbw %%xmm0, %%xmm2;\n\t"
00927       "punpcklbw %%xmm0, %%xmm3;\n\t"
00928       "punpcklbw %%xmm0, %%xmm4;\n\t"
00929       "punpcklbw %%xmm0, %%xmm1;\n\t"  // low 4 words -> 32-bit integers
00930       "punpcklbw %%xmm0, %%xmm2;\n\t"
00931       "punpcklbw %%xmm0, %%xmm3;\n\t"
00932       "punpcklbw %%xmm0, %%xmm4;\n\t"
00933       "cvtdq2ps %%xmm1, %%xmm1;\n\t"  // 4 x int32 -> 4 x float
00934       "cvtdq2ps %%xmm2, %%xmm2;\n\t"
00935       "movups  %%xmm1, (%%rdi);\n\t"
00936       "movups  %%xmm2, 16(%%rdi);\n\t"
00937       "cvtdq2ps %%xmm3, %%xmm3;\n\t"
00938       "cvtdq2ps %%xmm4, %%xmm4;\n\t"
00939       "movups  %%xmm3, 32(%%rdi);\n\t"
00940       "movups  %%xmm4, 48(%%rdi);\n\t"
00941       "add $16, %%rsi;\n\t"  // 16 input bytes consumed
00942       "add $64, %%rdi;\n\t"  // 16 floats produced
00943       "loop .GB2;\n\t"  // decrements rcx and repeats while it is non-zero
00944       ".GB4:;\n\t"  // ---- tail loop: one byte -> one float
00945       "or %%rdx, %%rdx;\n\t"
00946       "jz .GB1;\n\t"
00947       "mov %%rdx, %%rcx;\n\t"  // rcx <- leftover count for the loop instruction
00948       ".GB3:;\n\t"
00949       "xor %%rax, %%rax;\n\t"
00950       "movb (%%rsi), %%al;\n\t"  // rax <- zero-extended byte
00951       "movd %%rax, %%xmm0;\n\t"
00952       "cvtdq2ps %%xmm0, %%xmm1;\n\t"
00953       "movss %%xmm1, (%%rdi);\n\t"  // store only the low float
00954       "inc %%rsi;\n\t"
00955       "add $4, %%rdi;\n\t"
00956       "loop .GB3;\n\t"
00957       ".GB1:;"
00958       :
00959       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
00960       :"memory"
00961       );
00962 }
00963 
00964 
00965 
00966 //######################################################################
00967 // speedup ~= 1.15
// Converts sz bytes from a into doubles in b.  Each byte is zero-extended to
// a 32-bit integer (two punpcklbw steps against a zeroed register) and then
// converted with cvtdq2pd, which turns the two low integers of a register
// into two doubles.
//
//   a   input bytes
//   b   output doubles
//   sz  number of elements
//
// NOTE(review): the block loop consumes 8 bytes per iteration but issues
// 16-byte loads at offsets 0, 2, 4 and 6, of which only the low 2 bytes are
// used; the load at offset 6 therefore reads 14 bytes beyond the current
// block.  Confirm callers pad a.
// NOTE(review): rsi, rdi and rcx are modified although they are declared as
// input-only operands, and rax / xmm0-xmm4 are written without appearing in
// the clobber list.
00968 void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
00969 {
00970   int32 rcx=sz>>3;   // number of full 8-byte blocks
00971   int32 rdx=sz&0x7;  // leftover bytes, converted one at a time
00972 
00973   asm(
00974       "or %%rcx, %%rcx;\n\t"  // skip the block loop when there is no full block
00975       "jz .GC4;\n\t"
00976       ".GC2:;\n\t"  // ---- block loop: 8 bytes -> 8 doubles
00977       "pxor %%xmm0, %%xmm0;\n\t"  // xmm0 <- 0, source of the zero bytes
00978       "movdqu 0(%%rsi), %%xmm1;\n\t"  // low 2 bytes used: a[0..1]
00979       "movdqu 2(%%rsi), %%xmm2;\n\t"  // low 2 bytes used: a[2..3]
00980       "movdqu 4(%%rsi), %%xmm3;\n\t"  // low 2 bytes used: a[4..5]
00981       "movdqu 6(%%rsi), %%xmm4;\n\t"  // low 2 bytes used: a[6..7]
00982       "punpcklbw %%xmm0, %%xmm1;\n\t"  // bytes -> 16-bit words
00983       "punpcklbw %%xmm0, %%xmm2;\n\t"
00984       "punpcklbw %%xmm0, %%xmm3;\n\t"
00985       "punpcklbw %%xmm0, %%xmm4;\n\t"
00986       "punpcklbw %%xmm0, %%xmm1;\n\t"  // low words -> 32-bit integers
00987       "punpcklbw %%xmm0, %%xmm2;\n\t"
00988       "punpcklbw %%xmm0, %%xmm3;\n\t"
00989       "punpcklbw %%xmm0, %%xmm4;\n\t"
00990       "cvtdq2pd %%xmm1, %%xmm1;\n\t"  // 2 low int32 -> 2 doubles
00991       "cvtdq2pd %%xmm2, %%xmm2;\n\t"
00992       "movupd  %%xmm1, (%%rdi);\n\t"
00993       "movupd  %%xmm2, 16(%%rdi);\n\t"
00994       "cvtdq2pd %%xmm3, %%xmm3;\n\t"
00995       "cvtdq2pd %%xmm4, %%xmm4;\n\t"
00996       "movupd  %%xmm3, 32(%%rdi);\n\t"
00997       "movupd  %%xmm4, 48(%%rdi);\n\t"
00998       "add $8, %%rsi;\n\t"  // 8 input bytes consumed
00999       "add $64, %%rdi;\n\t"  // 8 doubles produced
01000       "loop .GC2;\n\t"  // decrements rcx and repeats while it is non-zero
01001       ".GC4:;\n\t"  // ---- tail loop: one byte -> one double
01002       "or %%rdx, %%rdx;\n\t"
01003       "jz .GC1;\n\t"
01004       "mov %%rdx, %%rcx;\n\t"  // rcx <- leftover count for the loop instruction
01005       ".GC3:;\n\t"
01006       "xor %%rax, %%rax;\n\t"
01007       "movb (%%rsi), %%al;\n\t"  // rax <- zero-extended byte
01008       "movd %%rax, %%xmm0;\n\t"
01009       "cvtdq2pd %%xmm0, %%xmm1;\n\t"
01010       "movsd %%xmm1, (%%rdi);\n\t"  // store only the low double
01011       "inc %%rsi;\n\t"
01012       "add $8, %%rdi;\n\t"
01013       "loop .GC3;\n\t"
01014       ".GC1:;"
01015       :
01016       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
01017       :"memory"
01018       );
01019 
01020 }
01021 
01022 
01023 
01024 //######################################################################
01025 
// Converts sz 32-bit integers from a into 32-bit floats in b using
// cvtdq2ps (four conversions per instruction).
//
//   a   input integers
//   b   output floats
//   sz  number of elements
//
// NOTE(review): the tail loop loads each element with movsd, which reads
// 8 bytes, so converting the last element reads 4 bytes beyond a[sz-1].
// NOTE(review): rsi, rdi and rcx are modified although they are declared as
// input-only operands, and xmm0-xmm7 are written without appearing in the
// clobber list.
01026 void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
01027 {
01028   int32 rcx=sz>>5;    // number of full 32-element blocks
01029   int32 rdx=sz&0x1f;  // leftover elements, converted one at a time
01030 
01031   asm(
01032       "or %%rcx, %%rcx;\n\t"  // skip the block loop when there is no full block
01033       "jz .GD4;\n\t"
01034       ".GD2:;\n\t"  // ---- block loop: 32 integers (128 bytes) per iteration
01035       "movdqu 0(%%rsi), %%xmm0;\n\t"
01036       "movdqu 16(%%rsi), %%xmm1;\n\t"
01037       "movdqu 32(%%rsi), %%xmm2;\n\t"
01038       "movdqu 48(%%rsi), %%xmm3;\n\t"
01039       "movdqu 64(%%rsi), %%xmm4;\n\t"
01040       "movdqu 80(%%rsi), %%xmm5;\n\t"
01041       "movdqu 96(%%rsi), %%xmm6;\n\t"
01042       "movdqu 112(%%rsi), %%xmm7;\n\t"
01043       "cvtdq2ps %%xmm0, %%xmm0;\n\t"  // 4 x int32 -> 4 x float, in place
01044       "cvtdq2ps %%xmm1, %%xmm1;\n\t"
01045       "cvtdq2ps %%xmm2, %%xmm2;\n\t"
01046       "cvtdq2ps %%xmm3, %%xmm3;\n\t"
01047       "cvtdq2ps %%xmm4, %%xmm4;\n\t"
01048       "cvtdq2ps %%xmm5, %%xmm5;\n\t"
01049       "cvtdq2ps %%xmm6, %%xmm6;\n\t"
01050       "cvtdq2ps %%xmm7, %%xmm7;\n\t"
01051       "movups %%xmm0, 0(%%rdi);\n\t"
01052       "movups %%xmm1, 16(%%rdi);\n\t"
01053       "movups %%xmm2, 32(%%rdi);\n\t"
01054       "movups %%xmm3, 48(%%rdi);\n\t"
01055       "movups %%xmm4, 64(%%rdi);\n\t"
01056       "movups %%xmm5, 80(%%rdi);\n\t"
01057       "movups %%xmm6, 96(%%rdi);\n\t"
01058       "movups %%xmm7, 112(%%rdi);\n\t"
01059       "add $128, %%rsi;\n\t"
01060       "add $128, %%rdi;\n\t"
01061       "dec %%rcx;\n\t"
01062       "jnz .GD2;\n\t"
01063       ".GD4:;\n\t"  // ---- tail loop: one integer -> one float
01064       "or %%rdx, %%rdx;\n\t"
01065       "jz .GD1;\n\t"
01066       "mov %%rdx, %%rcx;\n\t"  // rcx <- leftover count for the loop instruction
01067       ".GD3:;\n\t"
01068       "movsd (%%rsi), %%xmm0;\n\t"  // 8-byte load; only the low integer is kept
01069       "cvtdq2ps %%xmm0, %%xmm0;\n\t"
01070       "movss %%xmm0, (%%rdi);\n\t"  // store only the low float
01071       "add $4, %%rsi;\n\t"
01072       "add $4, %%rdi;\n\t"
01073       "loop .GD3;\n\t"
01074       ".GD1:;"
01075       :
01076       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
01077       :"memory"
01078       );
01079 
01080 }
01081 
01082 //######################################################################
01083 // speedup ~= 1.2
// Converts sz 32-bit integers from a into doubles in b using cvtdq2pd,
// which converts the two low integers of a register into two doubles.
//
//   a   input integers
//   b   output doubles
//   sz  number of elements
//
// NOTE(review): the block loop consumes 64 bytes per iteration but uses
// 16-byte loads at an 8-byte stride; the load at offset 56 reads 8 bytes
// beyond the current block.  The tail loop's 8-byte movsd load likewise
// reads 4 bytes beyond the element being converted.
// NOTE(review): rsi, rdi and rcx are modified although they are declared as
// input-only operands, and xmm0-xmm7 are written without appearing in the
// clobber list.
01084 void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
01085 {
01086   int32 rcx=sz>>4;   // number of full 16-element blocks
01087   int32 rdx=sz&0xf;  // leftover elements, converted one at a time
01088 
01089   asm(
01090       "or %%rcx, %%rcx;\n\t"  // skip the block loop when there is no full block
01091       "jz .GE4;\n\t"
01092       ".GE2:;\n\t"  // ---- block loop: 16 integers -> 16 doubles
01093       "movdqu 0(%%rsi), %%xmm0;\n\t"  // low 2 integers used: a[0..1]
01094       "movdqu  8(%%rsi), %%xmm1;\n\t"  // a[2..3]
01095       "movdqu 16(%%rsi), %%xmm2;\n\t"  // a[4..5]
01096       "movdqu 24(%%rsi), %%xmm3;\n\t"  // a[6..7]
01097       "movdqu 32(%%rsi), %%xmm4;\n\t"  // a[8..9]
01098       "movdqu 40(%%rsi), %%xmm5;\n\t"  // a[10..11]
01099       "movdqu 48(%%rsi), %%xmm6;\n\t"  // a[12..13]
01100       "movdqu 56(%%rsi), %%xmm7;\n\t"  // a[14..15]
01101       "cvtdq2pd %%xmm0, %%xmm0;\n\t"  // 2 low int32 -> 2 doubles, in place
01102       "cvtdq2pd %%xmm1, %%xmm1;\n\t"
01103       "cvtdq2pd %%xmm2, %%xmm2;\n\t"
01104       "cvtdq2pd %%xmm3, %%xmm3;\n\t"
01105       "cvtdq2pd %%xmm4, %%xmm4;\n\t"
01106       "cvtdq2pd %%xmm5, %%xmm5;\n\t"
01107       "cvtdq2pd %%xmm6, %%xmm6;\n\t"
01108       "cvtdq2pd %%xmm7, %%xmm7;\n\t"
01109       "movups %%xmm0, 0(%%rdi);\n\t"
01110       "movups %%xmm1, 16(%%rdi);\n\t"
01111       "movups %%xmm2, 32(%%rdi);\n\t"
01112       "movups %%xmm3, 48(%%rdi);\n\t"
01113       "movups %%xmm4, 64(%%rdi);\n\t"
01114       "movups %%xmm5, 80(%%rdi);\n\t"
01115       "movups %%xmm6, 96(%%rdi);\n\t"
01116       "movups %%xmm7, 112(%%rdi);\n\t"
01117       "add $64, %%rsi;\n\t"  // 16 integers consumed
01118       "add $128, %%rdi;\n\t"  // 16 doubles produced
01119       "dec %%rcx;\n\t"
01120       "jnz .GE2;\n\t"
01121       ".GE4:;\n\t"  // ---- tail loop: one integer -> one double
01122       "or %%rdx, %%rdx;\n\t"
01123       "jz .GE1;\n\t"
01124       "mov %%rdx, %%rcx;\n\t"  // rcx <- leftover count for the loop instruction
01125       ".GE3:;\n\t"
01126       "movsd (%%rsi), %%xmm0;\n\t"  // 8-byte load; only the low integer is kept
01127       "cvtdq2pd %%xmm0, %%xmm0;\n\t"
01128       "movsd %%xmm0, (%%rdi);\n\t"  // store only the low double
01129       "add $4, %%rsi;\n\t"
01130       "add $8, %%rdi;\n\t"
01131       "loop .GE3;\n\t"
01132       ".GE1:;"
01133       :
01134       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
01135       :"memory"
01136       );
01137 
01138 }
01139 
01140 //######################################################################
01141 void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
01142 {
01143   int32 rcx=sz;
01144   int32 rdx=sz;
01145 
01146   asm (
01147        "or %%rcx, %%rcx;\n\t"
01148        "jz .GF1;\n\t"
01149        ".GF2:;\n\t"
01150        "movdqu 0(%%rsi), %%xmm0;\n\t"
01151        "movdqu  8(%%rsi), %%xmm1;\n\t"
01152        "movdqu 16(%%rsi), %%xmm2;\n\t"
01153        "movdqu 24(%%rsi), %%xmm3;\n\t"
01154        "movdqu 32(%%rsi), %%xmm4;\n\t"
01155        "movdqu 40(%%rsi), %%xmm5;\n\t"
01156        "movdqu 48(%%rsi), %%xmm6;\n\t"
01157        "movdqu 56(%%rsi), %%xmm7;\n\t"
01158        "cvtps2dq %%xmm0, %%xmm0;\n\t"
01159        "cvtps2dq %%xmm1, %%xmm1;\n\t"
01160        "cvtps2dq %%xmm2, %%xmm2;\n\t"
01161        "cvtps2dq %%xmm3, %%xmm3;\n\t"
01162        "cvtps2dq %%xmm4, %%xmm4;\n\t"
01163        "cvtps2dq %%xmm5, %%xmm5;\n\t"
01164        "cvtps2dq %%xmm6, %%xmm6;\n\t"
01165        "cvtps2dq %%xmm7, %%xmm7;\n\t"
01166        "movdqu %%xmm0, 0(%%rdi);\n\t"
01167        "movdqu %%xmm1, 16(%%rdi);\n\t"
01168        "movdqu %%xmm2, 32(%%rdi);\n\t"
01169        "movdqu %%xmm3, 48(%%rdi);\n\t"
01170        "movdqu %%xmm4, 64(%%rdi);\n\t"
01171        "movdqu %%xmm5, 80(%%rdi);\n\t"
01172        "movdqu %%xmm6, 96(%%rdi);\n\t"
01173        "movdqu %%xmm7, 112(%%rdi);\n\t"
01174        "add $64, %%rsi;\n\t"
01175        "add $128, %%rdi;\n\t"
01176        "dec %%rcx;\n\t"
01177        "jnz .GF2;\n\t"
01178        ".GF4:;\n\t"
01179        "or %%rdx, %%rdx;\n\t"
01180        "jz .GF1;\n\t"
01181        "mov %%rdx, %%rcx;\n\t"
01182        ".GF3:;\n\t"
01183        "movsd (%%rsi), %%xmm0;\n\t"
01184        "cvtps2dq %%xmm0, %%xmm0;\n\t"
01185        "movsd  %%xmm0, (%%rdi);\n\t"
01186        "add $4, %%rsi;\n\t"
01187        "add $8, %%rdi;\n\t"
01188        "loop .GF3;\n\t"
01189        ".GF1:;"
01190        :
01191        :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
01192        :"memory"
01193        );
01194 
01195 }
01196 
01197 
01198 
01199 //######################################################################
// Converts sz floats from a into doubles in b using cvtps2pd, which
// converts the two low floats of a register into two doubles.
//
//   a   input floats
//   b   output doubles
//   sz  number of elements
//
// NOTE(review): the block loop consumes 64 bytes per iteration but uses
// 16-byte loads at an 8-byte stride; the load at offset 56 reads 8 bytes
// beyond the current block.  The tail loop's 8-byte movsd load likewise
// reads 4 bytes beyond the element being converted.
// NOTE(review): rsi, rdi and rcx are modified although they are declared as
// input-only operands, and xmm0-xmm7 are written without appearing in the
// clobber list.
01200 void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
01201 {
01202   int32 rcx=sz>>4;   // number of full 16-element blocks
01203   int32 rdx=sz&0xf;  // leftover elements, converted one at a time
01204 
01205   asm(
01206       "or %%rcx, %%rcx;\n\t"  // skip the block loop when there is no full block
01207       "jz .GG4;\n\t"
01208       ".GG2:;\n\t"  // ---- block loop: 16 floats -> 16 doubles
01209       "movups 0(%%rsi), %%xmm0;\n\t"  // low 2 floats used: a[0..1]
01210       "movups  8(%%rsi), %%xmm1;\n\t"  // a[2..3]
01211       "movups 16(%%rsi), %%xmm2;\n\t"  // a[4..5]
01212       "movups 24(%%rsi), %%xmm3;\n\t"  // a[6..7]
01213       "movups 32(%%rsi), %%xmm4;\n\t"  // a[8..9]
01214       "movups 40(%%rsi), %%xmm5;\n\t"  // a[10..11]
01215       "movups 48(%%rsi), %%xmm6;\n\t"  // a[12..13]
01216       "movups 56(%%rsi), %%xmm7;\n\t"  // a[14..15]
01217       "cvtps2pd %%xmm0, %%xmm0;\n\t"  // 2 low floats -> 2 doubles, in place
01218       "cvtps2pd %%xmm1, %%xmm1;\n\t"
01219       "cvtps2pd %%xmm2, %%xmm2;\n\t"
01220       "cvtps2pd %%xmm3, %%xmm3;\n\t"
01221       "cvtps2pd %%xmm4, %%xmm4;\n\t"
01222       "cvtps2pd %%xmm5, %%xmm5;\n\t"
01223       "cvtps2pd %%xmm6, %%xmm6;\n\t"
01224       "cvtps2pd %%xmm7, %%xmm7;\n\t"
01225       "movupd %%xmm0, 0(%%rdi);\n\t"
01226       "movupd %%xmm1, 16(%%rdi);\n\t"
01227       "movupd %%xmm2, 32(%%rdi);\n\t"
01228       "movupd %%xmm3, 48(%%rdi);\n\t"
01229       "movupd %%xmm4, 64(%%rdi);\n\t"
01230       "movupd %%xmm5, 80(%%rdi);\n\t"
01231       "movupd %%xmm6, 96(%%rdi);\n\t"
01232       "movupd %%xmm7, 112(%%rdi);\n\t"
01233       "add $64, %%rsi;\n\t"  // 16 floats consumed
01234       "add $128, %%rdi;\n\t"  // 16 doubles produced
01235       "dec %%rcx;\n\t"
01236       "jnz .GG2;\n\t"
01237       ".GG4:;\n\t"  // ---- tail loop: one float -> one double
01238       "or %%rdx, %%rdx;\n\t"
01239       "jz .GG1;\n\t"
01240       "mov %%rdx, %%rcx;\n\t"  // rcx <- leftover count for the loop instruction
01241       ".GG3:;\n\t"
01242       "movsd (%%rsi), %%xmm0;\n\t"  // 8-byte load; only the low float is kept
01243       "cvtps2pd %%xmm0, %%xmm0;\n\t"
01244       "movsd %%xmm0, (%%rdi);\n\t"  // store only the low double
01245       "add $4, %%rsi;\n\t"
01246       "add $8, %%rdi;\n\t"
01247       "loop .GG3;\n\t"
01248       ".GG1:;"
01249       :
01250       :"S"(a), "D"(b), "c"(rcx),"d"(rdx)
01251       :"memory"
01252       );
01253 }
01254 
01255 #endif
01256 
01257 #ifdef INVT_USE_SSE
01258 
01259 //######################################################################
// Horizontal 3-tap low-pass filter applied independently to every row of an
// h x w float image stored row by row.
//
//   a  input image  (h*w floats)
//   b  output image (h*w floats)
//   h  number of rows; nothing is done when h < 1
//   w  number of columns; for w < 2 there is nothing to smooth and the
//      input is copied unchanged
//
// Per row:
//   b[0]   = (2*a[0] + a[1]) / 3                      left border
//   b[i]   = (a[i-1] + 2*a[i] + a[i+1]) / 4           0 < i < w-1
//   b[w-1] = (a[w-2] + 2*a[w-1]) / 3                  right border
//
// The w-2 interior outputs are produced 12 at a time with packed SSE
// arithmetic, the remaining (w-2) % 12 ones with scalar arithmetic.
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  // coeffs[0] is the border divisor, coeffs[4..7] the packed interior divisor
  const float coeffs[] = { 3.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0, 4.0};

  if (w < 2){
    memcpy(b, a, w*h*sizeof(b[0]));
    return; // nothing to smooth
  }
  if (h < 1) return;

  // 64-bit locals: the asm uses full 64-bit registers.  The pointers and the
  // row counter are modified and therefore passed as in/out operands; the
  // per-row counts stay read-only and are copied to r8 for counting.
  const float *src = a;
  float *dst = b;
  long rows = h;
  const long blocks = (w-2)/12;   // interior outputs done 12 at a time
  const long rest = (w-2)%12;     // interior outputs done one at a time

  __asm__ __volatile__ (
       // packed divisor kept in a register: divps with a memory operand
       // would require coeffs to be 16-byte aligned
       "movups 16(%%rbx), %%xmm8\n\t"    // xmm8 <- 4.0 4.0 4.0 4.0

       "1:\n\t"                          // ---- one row per iteration
       // left border: (s[0] + s[0] + s[1]) / 3
       "movss 0(%%rsi), %%xmm1\n\t"      // xmm1 <- s[0]
       "movss 4(%%rsi), %%xmm2\n\t"      // xmm2 <- s[1]
       "addss %%xmm1, %%xmm1\n\t"        // xmm1 <- s[0] + s[0]
       "addss %%xmm1, %%xmm2\n\t"        // xmm2 <- s[1] + 2*s[0]
       "divss (%%rbx), %%xmm2\n\t"       // xmm2 <- xmm2 / 3
       "movss %%xmm2, (%%rdi)\n\t"
       "add  $4, %%rdi\n\t"

       // interior, 12 outputs per iteration
       "mov  %%rdx, %%r8\n\t"
       "test %%r8, %%r8\n\t"
       "jz 3f\n\t"
       "2:\n\t"
       "movups  0(%%rsi), %%xmm0\n\t"    // s[0..3]
       "movups  4(%%rsi), %%xmm1\n\t"    // s[1..4]
       "movups  8(%%rsi), %%xmm2\n\t"    // s[2..5]
       "movups 16(%%rsi), %%xmm3\n\t"    // s[4..7]
       "movups 20(%%rsi), %%xmm4\n\t"    // s[5..8]
       "movups 24(%%rsi), %%xmm5\n\t"    // s[6..9]
       "movups 32(%%rsi), %%xmm6\n\t"    // s[8..11]
       "movups 36(%%rsi), %%xmm7\n\t"    // s[9..12]
       "addps  %%xmm1, %%xmm0\n\t"
       "addps  %%xmm4, %%xmm3\n\t"
       "addps  %%xmm1, %%xmm0\n\t"       // left + 2*centre
       "addps  %%xmm4, %%xmm3\n\t"
       "movups 40(%%rsi), %%xmm1\n\t"    // s[10..13]
       "addps  %%xmm7, %%xmm6\n\t"
       "addps  %%xmm2, %%xmm0\n\t"       // + right
       "addps  %%xmm1, %%xmm6\n\t"
       "addps  %%xmm5, %%xmm3\n\t"
       "addps  %%xmm7, %%xmm6\n\t"
       "divps  %%xmm8, %%xmm0\n\t"
       "divps  %%xmm8, %%xmm3\n\t"
       "divps  %%xmm8, %%xmm6\n\t"
       "movups %%xmm0,   (%%rdi)\n\t"
       "movups %%xmm3, 16(%%rdi)\n\t"
       "movups %%xmm6, 32(%%rdi)\n\t"
       "add   $48, %%rsi\n\t"
       "add   $48, %%rdi\n\t"
       "dec   %%r8\n\t"
       "jnz 2b\n\t"

       // interior, remaining outputs one at a time
       "3:\n\t"
       "mov  %%rax, %%r8\n\t"
       "test %%r8, %%r8\n\t"
       "jz 5f\n\t"
       "4:\n\t"
       "movss  0(%%rsi), %%xmm0\n\t"     // left
       "movss  4(%%rsi), %%xmm1\n\t"     // centre
       "movss  8(%%rsi), %%xmm2\n\t"     // right
       "addss  %%xmm1, %%xmm0\n\t"       // left + centre
       "addss  %%xmm1, %%xmm2\n\t"       // right + centre
       "addss  %%xmm2, %%xmm0\n\t"
       "divss  %%xmm8, %%xmm0\n\t"       // / 4
       "movss  %%xmm0, (%%rdi)\n\t"
       "add   $4, %%rsi\n\t"
       "add   $4, %%rdi\n\t"
       "dec   %%r8\n\t"
       "jnz 4b\n\t"

       // right border: src now points at s[w-2]
       "5:\n\t"
       "movss  (%%rsi), %%xmm1\n\t"      // xmm1 <- s[w-2]
       "movss 4(%%rsi), %%xmm2\n\t"      // xmm2 <- s[w-1]
       "addss %%xmm2, %%xmm2\n\t"        // xmm2 <- 2*s[w-1]
       "addss %%xmm1, %%xmm2\n\t"        // xmm2 <- 2*s[w-1] + s[w-2]
       "divss (%%rbx), %%xmm2\n\t"       // xmm2 <- xmm2 / 3
       "movss %%xmm2, (%%rdi)\n\t"
       "add  $4, %%rdi\n\t"
       "add  $8, %%rsi\n\t"              // skip the last two inputs: next row
       "dec  %%rcx\n\t"
       "jnz 1b\n\t"
       : "+S"(src), "+D"(dst), "+c"(rows)
       : "a"(rest), "d"(blocks), "b"(coeffs)
       : "memory", "cc", "r8",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
         "xmm5", "xmm6", "xmm7", "xmm8"
       );
}
01355 
01356 
01357 
01358 
01359 //######################################################################
01360 
// Vertical 3-tap low-pass filter applied independently to every column of
// an h x w float image stored row by row.
//
//   a  input image  (h*w floats)
//   b  output image (h*w floats)
//   h  number of rows; for h < 2 there is nothing to smooth and the input
//      is copied unchanged
//   w  number of columns; nothing is done when w < 1
//
// With s(r) denoting row r of the input:
//   b(0)   = (2*s(0) + s(1)) / 3                      top row
//   b(r)   = (s(r-1) + 2*s(r) + s(r+1)) / 4           0 < r < h-1
//   b(h-1) = (s(h-2) + 2*s(h-1)) / 3                  bottom row
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  // coeffs[0] is the border divisor, coeffs[4] the interior divisor
  const float coeffs[] = { 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0};

  if (h < 2){
    memcpy(b, a, w*h*sizeof(b[0]));
    return; // nothing to smooth
  }
  if (w < 1) return;

  // 64-bit locals: the asm uses full 64-bit registers.  Everything the asm
  // modifies is an early-clobber in/out or scratch operand so that it never
  // shares a register with the read-only inputs.
  const float *src = a;
  float *dst = b;
  long rows = h - 2;        // number of interior rows
  long cnt;                 // per-row column counter (scratch)
  const long cols = w;

  __asm__ __volatile__ (
       // ---- top row: (2*s[0] + s[w]) / 3
       "mov %[cols], %[cnt]\n\t"
       "1:\n\t"
       "movss (%[src]), %%xmm0\n\t"             // xmm0 <- s[0]
       "movss (%[src],%[cols],4), %%xmm1\n\t"   // xmm1 <- s[w]
       "addss %%xmm0, %%xmm0\n\t"
       "addss %%xmm1, %%xmm0\n\t"
       "divss (%[k]), %%xmm0\n\t"
       "movss %%xmm0, (%[dst])\n\t"
       "add $4, %[src]\n\t"
       "add $4, %[dst]\n\t"
       "dec %[cnt]\n\t"
       "jnz 1b\n\t"

       // rewind src to the start of row 0 (dst stays at row 1)
       "mov %[cols], %[cnt]\n\t"
       "shl $2, %[cnt]\n\t"
       "sub %[cnt], %[src]\n\t"

       // ---- interior rows: (s[0] + 2*s[w] + s[2*w]) / 4
       "test %[rows], %[rows]\n\t"
       "jle 4f\n\t"
       "2:\n\t"
       "mov %[cols], %[cnt]\n\t"
       "3:\n\t"
       "movss (%[src]), %%xmm0\n\t"             // xmm0 <- s[0]
       "movss (%[src],%[cols],4), %%xmm1\n\t"   // xmm1 <- s[w]
       "movss (%[src],%[cols],8), %%xmm2\n\t"   // xmm2 <- s[2*w]
       "addss %%xmm1, %%xmm0\n\t"
       "addss %%xmm1, %%xmm2\n\t"
       "addss %%xmm2, %%xmm0\n\t"
       "divss 16(%[k]), %%xmm0\n\t"
       "movss %%xmm0, (%[dst])\n\t"
       "add $4, %[src]\n\t"
       "add $4, %[dst]\n\t"
       "dec %[cnt]\n\t"
       "jnz 3b\n\t"
       "dec %[rows]\n\t"
       "jnz 2b\n\t"

       // ---- bottom row: src points at row h-2; (s[0] + 2*s[w]) / 3
       "4:\n\t"
       "mov %[cols], %[cnt]\n\t"
       "5:\n\t"
       "movss (%[src]), %%xmm0\n\t"             // xmm0 <- s[0]
       "movss (%[src],%[cols],4), %%xmm1\n\t"   // xmm1 <- s[w]
       "addss %%xmm1, %%xmm1\n\t"
       "addss %%xmm1, %%xmm0\n\t"
       "divss (%[k]), %%xmm0\n\t"
       "movss %%xmm0, (%[dst])\n\t"
       "add $4, %[src]\n\t"
       "add $4, %[dst]\n\t"
       "dec %[cnt]\n\t"
       "jnz 5b\n\t"
       : [src] "+&r"(src), [dst] "+&r"(dst), [rows] "+&r"(rows),
         [cnt] "=&r"(cnt)
       : [cols] "r"(cols), [k] "r"(coeffs)
       : "memory", "cc", "xmm0", "xmm1", "xmm2"
       );
}
01445 
01446 
01447 //######################################################################
01448 
// Horizontal 5-tap low-pass filter applied independently to every row of an
// h x w float image stored row by row.
//
//   src   input image  (h*w floats)
//   dest  output image (h*w floats)
//   h     number of rows
//   w     number of columns
//
// Interior points use the kernel [1 4 6 4 1] / 16; near the borders the
// kernel is cut off and renormalised as spelled out in the code below.
// Widths below 2 are copied unchanged, widths 2 and 3 are handled in plain
// C++, and widths of 4 and more go through the inline assembly.
//
// NOTE(review): the asm pushes/pops rdx and rax on the stack, modifies rsi,
// rdi and rcx although they are declared as input-only operands, and writes
// xmm0-xmm7 without listing them as clobbers.
01449 void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
01450 {
01451   const float *sptr= src;
01452   float *dptr= dest;
01453 
01454   if(w<2)
01455     {
01456       memcpy(dest,src,h*w*sizeof(dest[0]));  // nothing to smooth
01457       return;
01458     }
01459 
01460   if (w == 2) //////////////////////////////////////////////////
01461     for (int j = 0; j < h; j ++)
01462       {
01463         // leftmost point  [ (6^) 4 ] / 10
01464         *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);
01465 
01466         // rightmost point  [ 4^ (6) ] / 10
01467         *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);
01468 
01469         sptr += 2;  // sptr back to same position as dptr
01470       }
01471   else if (w == 3) //////////////////////////////////////////////////
01472     for (int j = 0; j < h; j ++)
01473       {
01474         // leftmost point  [ (6^) 4 1 ] / 11
01475         *dptr++ = sptr[0] * (6.0F / 11.0F) +
01476           sptr[1] * (4.0F / 11.0F) +
01477           sptr[2] * (1.0F / 11.0F);
01478 
01479         // middle point    [ 4^ (6) 4 ] / 14
01480         *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
01481           sptr[1] * (6.0F / 14.0F);
01482 
01483         // rightmost point  [ 1^ 4 (6) ] / 11
01484         *dptr++ = sptr[0] * (1.0F / 11.0F) +
01485           sptr[1] * (4.0F / 11.0F) +
01486           sptr[2] * (6.0F / 11.0F);
01487 
01488         sptr += 3;  // sptr back to same position as dptr
01489       }
01490   else
01491     if(w>3)  // always true here: w < 2, w == 2 and w == 3 were handled above
01492       {
        // Byte offsets into coeffs as used by the asm below:
        //    0.. 8  leftmost point          6/11 4/11 1/11
        //   16..24  second point            4/15 6/15 1/15
        //   32,48,64  interior kernel       1/16 x4, 4/16 x4, 6/16 x4
        //   80..88  second-to-last point    1/15 4/15 6/15
        //   96..104 rightmost point         1/11 4/11 6/11
01493         const float coeffs[] = {6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0,
01494                                 4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0,
01495                                 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
01496                                 4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0,
01497                                 6.0/16.0, 6.0/16.0, 6.0/16.0, 6.0/16.0,
01498                                 1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0,
01499                                 1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0
01500         };
01501 
01502         int rax= (w-4)&3;   // interior outputs computed one at a time
01503         int rdx= (w-4)>>2;  // interior outputs computed four at a time
01504 
01505         asm(
01506             "or %%rcx, %%rcx;\n\t"  // rcx <- h
01507             "jz .HG6;\n\t"
01508             ".HG0:;\n\t"  // ---- one row per iteration; first the two left border points
01509             "movss   (%%rsi), %%xmm0;\n\t" // xmm0 <- s[0]
01510             "movss  4(%%rsi), %%xmm2;\n\t" // xmm2 <- s[1]
01511             "movss  8(%%rsi), %%xmm4;\n\t" // xmm4 <- s[2]
01512             "movss 12(%%rsi), %%xmm6;\n\t" // xmm6 <- s[3]
01513             "movss  %%xmm0, %%xmm1;\n\t"   // xmm1 <- s[0]
01514             "movss  %%xmm2, %%xmm3;\n\t"   // xmm3 <- s[1]
01515             "movss  %%xmm4, %%xmm5;\n\t"   // xmm5 <- s[2]
01516             "mulss   (%%rbx), %%xmm0;\n\t" // xmm0 <- 6.0/11.0*s[0]
01517             "mulss  4(%%rbx), %%xmm2;\n\t" // xmm2 <- 4.0/11.0*s[1]
01518             "mulss  8(%%rbx), %%xmm4;\n\t" // xmm4 <- 1.0/11.0*s[2]
01519             "addss  %%xmm5, %%xmm1;\n\t"   // xmm1 <- s[2]+s[0]
01520             "mulss 16(%%rbx), %%xmm1;\n\t" // xmm1 <- (s2+s0)*4.0/15.0
01521             "mulss 20(%%rbx), %%xmm3;\n\t" // xmm3 <- 6.0/15.0*s[1]
01522             "mulss 24(%%rbx), %%xmm6;\n\t" // xmm6 <- 1.0/15.0*s[3]
01523             "addss %%xmm2, %%xmm0;\n\t"
01524             "addss %%xmm3, %%xmm1;\n\t"
01525             "addss %%xmm4, %%xmm0;\n\t"
01526             "addss %%xmm6, %%xmm1;\n\t"
01527             "movss %%xmm0,   (%%rdi);\n\t" // leftmost output
01528             "movss %%xmm1,  4(%%rdi);\n\t" // second output
01529             "add  $8, %%rdi;\n\t"
01530 
01531             "or   %%rdx, %%rdx;\n\t"
01532             "jz .HG5;\n\t"
01533 
01534             "push %%rdx;\n\t"   // rdx <- (w-4)/4
01535             "movups  32(%%rbx), %%xmm5;\n\t" // xmm5 <- 1.0/16.0 1.0/16.0 1.0/16 1.0/16
01536             "movups  48(%%rbx), %%xmm6;\n\t" // xmm6 <- 4.0/16.0 ......................
01537             "movups  64(%%rbx), %%xmm7;\n\t" // xmm7 <- 6.0/16.0 ......................
01538             ".HG1:;\n\t"  // ---- interior, four outputs per iteration
01539             "movups   0(%%rsi), %%xmm0;\n\t" // xmm0 <- s0  s1  s2  s3
01540             "movups 04(%%rsi), %%xmm1;\n\t" // xmm1 <- s1  s2  s3  s4
01541             "movups  8(%%rsi), %%xmm2;\n\t" // xmm2 <- s2  s3  s4  s5
01542             "movups 12(%%rsi), %%xmm3;\n\t" // xmm3 <- s3  s4  s5  s6
01543              "movups 16(%%rsi), %%xmm4;\n\t" // xmm4 <- s4  s5  s6  s7
01544             "addps  %%xmm4, %%xmm0;\n\t"  // outer taps
01545             "addps  %%xmm3, %%xmm1;\n\t"  // inner taps
01546             "mulps  %%xmm5, %%xmm0;\n\t"  // * 1/16
01547             "mulps  %%xmm6, %%xmm1;\n\t"  // * 4/16
01548             "mulps  %%xmm7, %%xmm2;\n\t"  // centre tap * 6/16
01549             "addps  %%xmm1, %%xmm0;\n\t"
01550             "addps  %%xmm2, %%xmm0;\n\t"
01551             "movups %%xmm0, (%%rdi);\n\t"
01552             "add   $16, %%rsi;\n\t"
01553             "add   $16, %%rdi;\n\t"
01554             "dec   %%rdx;\n\t"
01555             "jnz .HG1;\n\t"
01556             "pop %%rdx;\n\t"  // restore the block count for the next row
01557 
01558             ".HG5:;\n\t"
01559             "or  %%rax, %%rax;\n\t"
01560             "jz  .HG3;\n\t"
01561             "push %%rax;\n\t"       // rax <- (w-4)%4
01562             "movups 32(%%rbx), %%xmm5;\n\t"
01563             "movups 48(%%rbx), %%xmm6;\n\t"
01564             "movups 64(%%rbx), %%xmm7;\n\t"
01565             ".HG2:;\n\t"  // ---- interior, one output per iteration
01566             "movss    (%%rsi), %%xmm0;\n\t"
01567             "movss   4(%%rsi), %%xmm1;\n\t"
01568             "movss   8(%%rsi), %%xmm2;\n\t"
01569             "movss  12(%%rsi), %%xmm3;\n\t"
01570             "movss  16(%%rsi), %%xmm4;\n\t"
01571             "mulss  %%xmm5   , %%xmm0;\n\t"  // * 1/16
01572             "mulss  %%xmm6   , %%xmm1;\n\t"  // * 4/16
01573             "mulss  %%xmm7   , %%xmm2;\n\t"  // * 6/16
01574             "mulss  %%xmm6   , %%xmm3;\n\t"  // * 4/16
01575             "mulss  %%xmm5   , %%xmm4;\n\t"  // * 1/16
01576             "addss  %%xmm1, %%xmm0;\n\t"
01577             "addss  %%xmm3, %%xmm2;\n\t"
01578             "addss  %%xmm4, %%xmm0;\n\t"
01579             "addss  %%xmm2, %%xmm0;\n\t"
01580             "add   $4, %%rsi;\n\t"
01581             "movss  %%xmm0, (%%rdi);\n\t"
01582             "add   $4, %%rdi;\n\t"
01583             "dec   %%rax;\n\t"
01584             "jnz .HG2;\n\t"
01585             "pop  %%rax;\n\t"  // restore the leftover count for the next row
01586             ".HG3:;\n\t"  // ---- two right border points; rsi points at the last four inputs
01587             "movss  (%%rsi), %%xmm0;\n\t"  // xmm0 <- s0
01588             "movss 4(%%rsi), %%xmm1;\n\t"  // xmm1 <- s1
01589             "movss 8(%%rsi), %%xmm2;\n\t"  // xmm2 <- s2
01590             "movss 12(%%rsi), %%xmm3;\n\t" // xmm3 <- s3
01591             "movss %%xmm1, %%xmm4;\n\t"    // xmm4 <- s1
01592             "movss %%xmm2, %%xmm5;\n\t"    // xmm5 <- s2
01593             "movss %%xmm3, %%xmm6;\n\t"    // xmm6 <- s3
01594             "addps %%xmm1, %%xmm3;\n\t"    // xmm3 <- s1+s3
01595             "mulss 80(%%rbx), %%xmm0;\n\t" // xmm0 <- 1.0/15.0*s0
01596             "mulss 84(%%rbx), %%xmm3;\n\t" // xmm3 <- 4.0/15.0*(s1+s3)
01597             "mulss 88(%%rbx), %%xmm2;\n\t" // xmm2 <- 6.0/15.0*s2
01598             "addss %%xmm3, %%xmm0;\n\t"
01599             "addss %%xmm2, %%xmm0;\n\t"
01600             "movss %%xmm0, (%%rdi);\n\t"   // second-to-last output
01601             "mulss 96(%%rbx), %%xmm4;\n\t"   // xmm4 <- 1.0/11.0*s1
01602             "mulss 100(%%rbx), %%xmm5;\n\t"  // xmm5 <- 4.0/11.0*s2
01603             "mulss 104(%%rbx), %%xmm6;\n\t"  // xmm6 <- 6.0/11.0*s3
01604             "addss %%xmm5, %%xmm4;\n\t"
01605             "addss %%xmm6, %%xmm4;\n\t"
01606             "movss %%xmm4, 4(%%rdi);\n\t"  // rightmost output
01607             "add $16, %%rsi;\n\t"  // skip the last four inputs: start of next row
01608             "add $8, %%rdi;\n\t"
01609             "dec %%rcx;\n\t"
01610             "jnz .HG0;\n\t"
01611             ".HG6:;\n\t"
01612             :
01613             :"S"(sptr),"D"(dptr),"a"(rax),"b"(coeffs),"c"(h),"d"(rdx)
01614             :"memory"
01615             );
01616       }
01617 
01618 }
01619 
01620 
01621 
01622 //######################################################################
01623 
01624 void sse_lowPass5y(const float *src, float *dest, const int h,
01625                        const int w)
01626 {
01627 
01628   /*
01629   if (h < 2){
01630     memcpy(dest, src, h*w*sizeof(dest[0]));
01631     return; // nothing to smooth
01632   }
01633 
01634   const float *sptr= src;
01635   float *dptr= dest;
01636 
01637   // ########## vertical pass  (even though we scan horiz for speedup)
01638   const int w2 = w * 2; // speedup
01639 
01640 
01641   if (h == 2) //////////////////////////////////////////////////
01642     {
01643       // topmost points  ( [ (6^) 4 ] / 10 )^T
01644       for (int i = 0; i < w; i ++)
01645         {
01646           *dptr++ = sptr[0] * (6.0F / 10.0F) +
01647             sptr[w] * (4.0F / 10.0F);
01648           sptr++;
01649         }
01650       sptr -= w;  // go back to top-left
01651 
01652       // bottommost points  ( [ 4^ (6) ] / 10 )^T
01653       for (int i = 0; i < w; i ++)
01654         {
01655           *dptr++ = sptr[0] * (4.0F / 10.0F) +
01656             sptr[w] * (6.0F / 10.0F);
01657           sptr++;
01658         }
01659     }
01660   else if (h == 3) //////////////////////////////////////////////////
01661     {
01662       // topmost points  ( [ (6^) 4 1 ] / 11 )^T
01663       for (int i = 0; i < w; i ++)
01664         {
01665           *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
01666             sptr[ w] * (4.0F / 11.0F) +
01667             sptr[w2] * (1.0F / 11.0F);
01668           sptr++;
01669         }
01670       sptr -= w;  // go back to top-left
01671 
01672       // middle points  ( [ 4^ (6) 4 ] / 14 )^T
01673       for (int i = 0; i < w; i ++)
01674         {
01675           *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) +
01676             sptr[ w] * (6.0F / 14.0F);
01677           sptr++;
01678         }
01679       sptr -= w;  // go back to top-left
01680 
01681       // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
01682       for (int i = 0; i < w; i ++)
01683         {
01684           *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
01685             sptr[ w] * (4.0F / 11.0F) +
01686             sptr[w2] * (6.0F / 11.0F);
01687           sptr++;
01688         }
01689     }
01690   else  ///////////////////////////////// general case for height >= 4
01691     {
01692       // topmost points  ( [ (6^) 4 1 ] / 11 )^T
01693 
01694       static const float coeffs[] = {
01695         6.0/11.0, 6.0/11.0, 6.0/11.0, 6.0/11.0, //0
01696         4.0/11.0, 4.0/11.0, 4.0/11.0, 4.0/11.0, //16
01697         1.0/11.0, 1.0/11.0, 1.0/11.0, 1.0/11.0, //32
01698         4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, //48
01699         6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, //64
01700         1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, //80
01701         1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0, //96
01702         4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, //112
01703         6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F  //128
01704       };
01705 
01706       int rcx=h-4;
01707       int rdx=w>>2;
01708       int rax=w&3;
01709 
01710        asm (
01711             "push %%rbp;\n\t"
01712             "mov %0, %%rbp;\n\t"
01713             "add %%rbp, %%rbp;\n\t"
01714             "add %%rbp, %%rbp;\n\t"
01715 
01716             // 1st loop
01717             "movups (%%rbx), %%xmm4;\n\t"          //xmm4 <- 6.0/11.0 ...
01718             "movups 16(%%rbx), %%xmm5;\n\t"        //xmm5 <- 4.0/11.0
01719             "movups 32(%%rbx), %%xmm6;\n\t"        //xmm6 <- 1.0/11.0
01720             "push %%rsi;\n\t"
01721             "or  %%rdx, %%rdx;\n\t"
01722             "jz .IA1;\n\t"
01723             ".align 4;\n\t"
01724             "push %%rdx;\n\t"
01725             ".IA0:;\n\t"
01726             ".align 4;\n\t"
01727             "movups (%%rsi), %%xmm0;\n\t"          //xmm0 <- s0   s0   s0   s0
01728             "movups (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01729             "movups (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01730             "mulps  %%xmm4, %%xmm0;\n\t"
01731             "mulps  %%xmm5, %%xmm1;\n\t"
01732             "mulps  %%xmm6, %%xmm2;\n\t"
01733             "addps  %%xmm1, %%xmm0;\n\t"
01734             "addps  %%xmm2, %%xmm0;\n\t"
01735             "movups %%xmm0, (%%rdi);\n\t"
01736             "add $16, %%rsi;\n\t"
01737             "add $16, %%rdi;\n\t"
01738             "dec %%rdx;\n\t"
01739             "jnz .IA0;\n\t"
01740             "pop %%rdx;\n\t"
01741             ".IA1:;\n\t"
01742             ".align 4;\n\t"
01743             "or %%rax, %%rax;\n\t"
01744             "jz .IA3;\n\t"
01745             "push %%rax;\n\t"
01746             ".IA2:;\n\t"
01747             ".align 4;\n\t"
01748             "movss  (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01749             "movss  (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW+3 sW+2 sW+1 sW
01750             "movss  (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sP+3 sP+3 sP+1 sP
01751             "mulss  %%xmm4, %%xmm0;\n\t"
01752             "mulss  %%xmm5, %%xmm1;\n\t"
01753             "mulss  %%xmm6, %%xmm2;\n\t"
01754             "addss  %%xmm1, %%xmm0;\n\t"
01755             "addss  %%xmm2, %%xmm0;\n\t"
01756             "movss  %%xmm0, (%%rdi);\n\t"
01757             "add $4, %%rsi;\n\t"
01758             "add $4, %%rdi;\n\t"
01759             "dec %%rax;\n\t"
01760             "jnz .IA2;\n\t"
01761             "pop %%rax;\n\t"
01762             ".IA3:;\n\t"
01763             "pop %%rsi;\n\t"  // restore sptr
01764 
01765             // 2nd loop
01766             "movups 48(%%rbx), %%xmm4;\n\t" //xmm4 <- 4.0/15.0
01767             "movups 64(%%rbx), %%xmm5;\n\t" //xmm5 <- 6.0/15.0
01768             "movups 80(%%rbx), %%xmm6;\n\t" //xmm6 <- 1.0/15.0
01769             "push %%rsi;\n\t"
01770             "or   %%rdx, %%rdx;\n\t"
01771             "jz .IA5;\n\t"
01772             "push %%rdx;\n\t"
01773             "push %%rax;\n\t"
01774             "mov  %%rbp, %%rax;\n\t"
01775             "add  %%rbp, %%rax;\n\t"
01776             "add  %%rbp, %%rax;\n\t"
01777             ".IA4:;\n\t"
01778             "movups (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01779             "movups (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01780             "movups (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01781             "movups (%%rsi,%%rax,1), %%xmm3;\n\t"  //xmm3 <- sW3  sW3  sW3  sW3
01782             "addps  %%xmm2, %%xmm0;\n\t"
01783             "mulps  %%xmm4, %%xmm0;\n\t"
01784             "mulps  %%xmm5, %%xmm1;\n\t"
01785             "mulps  %%xmm6, %%xmm3;\n\t"
01786             "addps  %%xmm1, %%xmm0;\n\t"
01787             "addps  %%xmm3, %%xmm0;\n\t"
01788             "movups %%xmm0, (%%rdi);\n\t"
01789             "add $16, %%rsi;\n\t"
01790             "add $16, %%rdi;\n\t"
01791             "dec %%rdx;\n\t"
01792             "jnz .IA4;\n\t"
01793             "pop %%rax;\n\t"
01794             "pop %%rdx;\n\t"
01795             ".IA5:;\n\t"
01796             "or %%rax, %%rax;\n\t"
01797             "jz .IA7;\n\t"
01798             "push %%rax;\n\t"
01799             "push %%rdx;\n\t"
01800             "mov  %%rbp, %%rdx;\n\t"
01801             "add  %%rbp, %%rdx;\n\t"
01802             "add  %%rbp, %%rdx;\n\t"
01803             ".IA6:;\n\t"
01804             "movss  (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01805             "movss  (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01806             "movss  (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01807             "movss  (%%rsi,%%rdx,1), %%xmm3;\n\t" //xmm3 <- sW3  sW3  sW3  sW3
01808             "addss  %%xmm2, %%xmm0;\n\t"
01809             "mulss  %%xmm4, %%xmm0;\n\t"
01810             "mulss  %%xmm5, %%xmm1;\n\t"
01811             "mulss  %%xmm6, %%xmm3;\n\t"
01812             "addss  %%xmm1, %%xmm0;\n\t"
01813             "addss  %%xmm3, %%xmm0;\n\t"
01814             "movss  %%xmm0, (%%rdi);\n\t"
01815             "add $4, %%rsi;\n\t"
01816             "add $4, %%rdi;\n\t"
01817             "dec %%rax;\n\t"
01818             "jnz .IA6;\n\t"
01819             "pop %%rdx;\n\t"
01820             "pop %%rax;\n\t"
01821             ".IA7:;\n\t"
01822             "pop %%rsi;\n\t"  // restore sptr
01823 
01824 
01825             //            the double loops
01826             "or %%rcx, %%rcx;\n\t"
01827             "jz .IA29;\n\t"
01828             "push %%rcx;\n\t"
01829             "movups 96(%%rbx), %%xmm5;\n\t"    // xmm5 <- 1.0/16.0
01830             "movups 112(%%rbx), %%xmm6;\n\t"   // xmm6 <- 4.0/16.0
01831             "movups 128(%%rbx), %%xmm7;\n\t"   // xmm7 <- 6.0/16.0
01832             ".IA8:;\n\t"
01833             "or  %%rdx, %%rdx;\n\t"
01834             "jz .IA10;\n\t"
01835             "push %%rdx;\n\t"
01836             "push %%rax;\n\t"
01837             "mov  %%rbp, %%rax;\n\t"
01838             "add  %%rbp, %%rax;\n\t"
01839             "add  %%rbp, %%rax;\n\t"                 // rax <- 3*W
01840             ".IA9:;\n\t"
01841             "movups  (%%rsi),  %%xmm0;\n\t"          // xmm0 <- s    s    s    s
01842             "movups  (%%rsi,%%rbp,1),  %%xmm1;\n\t"  // xmm1 <- sW   sW   sW   sW
01843             "movups  (%%rsi,%%rbp,2),  %%xmm2;\n\t"  // xmm2 <- sW2  sW2  sW2  sW2
01844             "movups  (%%rsi,%%rax,1), %%xmm3;\n\t"   // xmm3 <- sW3  sW3  sW3  sW3
01845             "movups  (%%rsi,%%rbp,4), %%xmm4;\n\t"   // xmm4 <- sW4  sW4  sW4  sW4
01846             "addps   %%xmm3, %%xmm1;\n\t"            // xmm1 <- sW3 + sW1
01847             "addps   %%xmm4, %%xmm0;\n\t"            // xmm0 <- s0  + sW4
01848             "mulps   %%xmm6, %%xmm1;\n\t"            // xmm1 <- 4.0/16.0*(sW3+sW1)
01849             "mulps   %%xmm5, %%xmm0;\n\t"            // xmm0 <- 1.0/16.08(s0 +sW4)
01850             "mulps   %%xmm7, %%xmm2;\n\t"            // xmm2 <- 6.0/16.0*sW2
01851             "addps   %%xmm1, %%xmm0;\n\t"
01852             "addps   %%xmm2, %%xmm0;\n\t"
01853             "add    $16, %%rsi;\n\t"
01854             "movups  %%xmm0, (%%rdi);\n\t"
01855             "add    $16, %%rdi;\n\t"
01856             "dec   %%rdx;\n\t"
01857             "jnz .IA9;\n\t"
01858             "pop   %%rax;\n\t"
01859             "pop   %%rdx;\n\t"
01860             ".IA10:;\n\t"
01861             "or  %%rax, %%rax;\n\t"
01862             "jz .IA12;\n\t"
01863             "push %%rax;\n\t"
01864             "push %%rdx;\n\t"
01865             "mov  %%rbp, %%rdx;\n\t"
01866             "add  %%rbp, %%rdx;\n\t"
01867             "add  %%rbp, %%rdx;\n\t"
01868             ".IA11:;\n\t"
01869             "movss   (%%rsi),  %%xmm0;\n\t"          // xmm0 <- s    s    s    s
01870             "movss   (%%rsi,%%rbp,1),  %%xmm1;\n\t"  // xmm1 <- sW   sW   sW   sW
01871             "movss   (%%rsi,%%rbp,2),  %%xmm2;\n\t"  // xmm2 <- sW2  sW2  sW2  sW2
01872             "movss   (%%rsi,%%rdx,1), %%xmm3;\n\t"   // xmm3 <- sW3  sW3  sW3  sW3
01873             "movss   (%%rsi,%%rbp,4), %%xmm4;\n\t"   // xmm4 <- sW4  sW4  sW4  sW4
01874             "addss   %%xmm3, %%xmm1;\n\t"
01875             "addss   %%xmm4, %%xmm0;\n\t"
01876             "mulss   %%xmm6, %%xmm1;\n\t"
01877             "mulss   %%xmm5, %%xmm0;\n\t"
01878             "mulss   %%xmm7, %%xmm2;\n\t"
01879             "addss   %%xmm1, %%xmm0;\n\t"
01880             "addss   %%xmm2, %%xmm0;\n\t"
01881             "add    $4, %%rsi;\n\t"
01882             "movss   %%xmm0, (%%rdi);\n\t"
01883             "add    $4, %%rdi;\n\t"
01884             "dec  %%rax;\n\t"
01885             "jnz .IA11;\n\t"
01886             "pop %%rdx;\n\t"
01887             "pop %%rax;\n\t"
01888             ".IA12:;\n\t"
01889             "dec %%rcx;\n\t"
01890             "jnz .IA8;\n\t"
01891             "pop %%rcx;\n\t"
01892             ".IA29:;\n\t"
01893 
01894             // fourth loop
01895             "movups 48(%%rbx), %%xmm4;\n\t"  //xmm4 <- 4.0/15.0
01896             "movups 64(%%rbx), %%xmm5;\n\t"  //xmm5 <- 6.0/15.0
01897             "movups 80(%%rbx), %%xmm6;\n\t"  //xmm6 <- 1.0/15.0
01898             "or  %%rdx, %%rdx;\n\t"
01899             "jz .IA14;\n\t"
01900             "push %%rdx;\n\t"
01901             "push %%rax;\n\t"
01902             "mov  %%rbp, %%rax;\n\t"
01903             "add  %%rbp, %%rax;\n\t"
01904             "add  %%rbp, %%rax;\n\t"
01905             ".IA13:;\n\t"
01906             "movups (%%rsi), %%xmm0;\n\t"          //xmm0 <- s0   s0   s0   s0
01907             "movups (%%rsi,%S%rbp,1), %%xmm1;\n\t" //xmm1 <- sW1  sW1  sW1  sW1
01908             "movups (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01909             "movups (%%rsi,%%rax,1),%%xmm3;\n\t"  //xmm3 <- sW3  sW3  sW3  sW3
01910             "addps  %%xmm3, %%xmm1;\n\t"          //xmm1 <- sW3 + sW1
01911             "mulps  %%xmm6, %%xmm0;\n\t"          //xmm0 <- 1.0/15.0 * s0
01912             "mulps  %%xmm5, %%xmm2;\n\t"          //xmm2 <- 6.0/15.0 * sW2
01913             "mulps  %%xmm4, %%xmm1;\n\t"          //xmm4 <- 4.0/15.0 * (sW3+sW1)
01914             "addps  %%xmm2, %%xmm0;\n\t"
01915             "addps  %%xmm1, %%xmm0;\n\t"
01916             "movups %%xmm0, (%%rdi);\n\t"
01917             "add $16, %%rsi;\n\t"
01918             "add $16, %%rdi;\n\t"
01919             "dec %%rdx;\n\t"
01920             "jnz .IA13;\n\t"
01921             "pop %%rax;\n\t"
01922             "pop %%rdx;\n\t"
01923             ".IA14:;\n\t"
01924             "or %%rax, %%rax;\n\t"
01925             "jz .IA16;\n\t"
01926             "push %%rax;\n\t"
01927             "push %%rdx;\n\t"
01928             "mov %%rbp, %%rdx;\n\t"
01929             "add %%rbp, %%rdx;\n\t"
01930             "add %%rbp, %%rdx;\n\t"
01931             ".IA15:;\n\t"
01932             "movss  (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01933             "movss  (%%rsi, %%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01934             "movss  (%%rsi, %%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01935             "movss  (%%rsi, %%rdx,1), %%xmm3;\n\t" //xmm3 <- sW3  sW3  sW3  sW3
01936             "addss  %%xmm3, %%xmm1;\n\t"
01937             "mulss  %%xmm6, %%xmm0;\n\t"
01938             "mulss  %%xmm5, %%xmm2;\n\t"
01939             "mulss  %%xmm4, %%xmm1;\n\t"
01940             "addss  %%xmm2, %%xmm0;\n\t"
01941             "addss  %%xmm1, %%xmm0;\n\t"
01942             "movss  %%xmm0, (%%rdi);\n\t"
01943             "add $4, %%rsi;\n\t"
01944             "add $4, %%rdi;\n\t"
01945             "dec %%rax;\n\t"
01946             "jnz .IA15;\n\t"
01947             "pop %%rdx;\n\t"
01948             "pop %%rax;\n\t"
01949             ".IA16:;\n\t"
01950 
01951              // final loop
01952             "movups 32(%%rbx), %%xmm4;\n\t"
01953             "movups 16(%%rbx), %%xmm5;\n\t"
01954             "movups   (%%rbx), %%xmm6;\n\t"
01955             "or  %%rdx, %%rdx;\n\t"
01956             "jz .IA18;\n\t"
01957             "push %%rdx;\n\t"
01958             ".IA17:;\n\t"
01959             "movups (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01960             "movups (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01961             "movups (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01962             "mulps  %%xmm4, %%xmm0;\n\t"
01963             "mulps  %%xmm5, %%xmm1;\n\t"
01964             "mulps  %%xmm6, %%xmm2;\n\t"
01965             "addps  %%xmm1, %%xmm0;\n\t"
01966             "addps  %%xmm2, %%xmm0;\n\t"
01967             "movups %%xmm0, (%%rdi);\n\t"
01968             "add $16, %%rsi;\n\t"
01969             "add $16, %%rdi;\n\t"
01970             "dec %%rdx;\n\t"
01971             "jnz .IA17;\n\t"
01972             "pop %%rdx;\n\t"
01973             ".IA18:;\n\t"
01974             "or %%rax, %%rax;\n\t"
01975             "jz .IA20;\n\t"
01976             "push %%rax;\n\t"
01977             ".IA19:;\n\t"
01978             "movss  (%%rsi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
01979             "movss  (%%rsi,%%rbp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
01980             "movss  (%%rsi,%%rbp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
01981             "mulss  %%xmm4, %%xmm0;\n\t"
01982             "mulss  %%xmm5, %%xmm1;\n\t"
01983             "mulss  %%xmm6, %%xmm2;\n\t"
01984             "addss  %%xmm1, %%xmm0;\n\t"
01985             "addss  %%xmm2, %%xmm0;\n\t"
01986             "movss  %%xmm0, (%%rdi);\n\t"
01987             "add $4, %%rsi;\n\t"
01988             "add $4, %%rdi;\n\t"
01989             "dec %%rax;\n\t"
01990             "jnz .IA19;\n\t"
01991             "pop %%rax;\n\t"
01992             ".IA20:;\n\t"
01993 
01994             "pop %%rbp;\n\t"
01995             :
01996             :"m"(w),"S"(sptr),"D"(dptr),"a"(rax),"b"(coeffs),"c"(rcx),"d"(rdx)
01997             :
01998             );
01999 
02000     }
02001   */
02002 }
02003 
02004 
02005 // ######################################################################
02006 
02007 void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest,
02008                                     const int nbpix2)
02009 {
02010   int rcx=nbpix2/6;
02011 
02012   const float coeffs[] = {
02013     0.0F,       -0.198242F,   1.014648F,     0.0F,  // R  G   B  xx  -> u
02014     0.700195F,  -0.29052F,    0.0F,          0.0F,  // R  G   B  xx  -> v
02015     128.0F,        128.0F,    128.0F,      128.0F   // division factor
02016   };
02017 
02018   asm (
02019        ".JA0:;\n\t"
02020        "or %%rcx, %%rcx;\n\t"
02021        "jz .JA1;\n\t"
02022        "pxor  %%mm7, %%mm7;\n\t"    //mm7 <-  00 00 00 00
02023        "xor  %%rax, %%rax;\n\t"
02024        "xor  %%rbx, %%rbx;\n\t"
02025        "mov  (%%rsi),   %%rax;\n\t" // rax <-   v   y1  y0 u
02026        "movw 4(%%rsi),  %%bx;\n\t"   // rbx <-   xx  xx  y3 y2
02027        "movd %%rax, %%mm0;\n\t"        // mm0<- xx xx xx xx v  y1  y0  u
02028        "movd %%rax, %%mm1;\n\t"        // mm1<- xx xx xx xx v  y1  y0  u
02029        "movd %%rbx, %%mm2;\n\t"        // mm2<- xx xx xx xx xx xx  y3  y2
02030        "psrlq $16,  %%mm1;\n\t"        // mm1<- xx xx xx xx xx xx  v   y1
02031        "punpcklbw %%mm7, %%mm0;\n\t"   // mm0<- xx xx xx xx 0  y0  0   u
02032        "punpcklbw %%mm7, %%mm1;\n\t"   // mm1<- xx xx xx xx 00 v   00  y1
02033        "punpcklbw %%mm7, %%mm2;\n\t"   // mm2<- xx xx xx xx 00 y3  00  y2
02034        "punpcklwd %%mm7, %%mm0;\n\t"   // mm0<- 00 00 00 y0 00 00  00  u
02035        "punpcklwd %%mm7, %%mm1;\n\t"   // mm1<- 00 00 00 v  00 00  00  y1
02036        "punpcklwd %%mm7, %%mm2;\n\t"   // mm2<- 00 00 00 y3 00 00  00  y2
02037 
02038        "cvtpi2ps %%mm0, %%xmm0;\n\t"   // xmm0 <- 00 00 y0 u
02039        "cvtpi2ps %%mm1, %%xmm1;\n\t"   // xmm1 <- 00 00 v  y1
02040        "cvtpi2ps %%mm2, %%xmm2;\n\t"   // xmm2 <- 00 00 y3 y2
02041 
02042        // 01 01 01 01
02043        "movaps %%xmm0, %%xmm3;\n\t"
02044 
02045        // 00 00 00 00
02046        "movaps %%xmm1, %%xmm4;\n\t"
02047 
02048        // 00 00 00 00
02049        "movaps %%xmm2, %%xmm5;\n\t"
02050 
02051        // 01 01 01 01
02052        "movaps %%xmm2, %%xmm6;\n\t"
02053 
02054        "shufps $0x55, %%xmm3, %%xmm3;\n\t"// xmm3 <- y0 y0 y0 y0
02055        "shufps $00, %%xmm4, %%xmm4;\n\t"  // xmm4 <- y1 y1 y1 y1
02056        "shufps $0x00, %%xmm5, %%xmm5;\n\t"// xmm5 <- y2 y2 y2 y2
02057        "shufps $0x55, %%xmm6, %%xmm6;\n\t"// xmm6 <- y3 y3 y3 y3
02058 
02059        // 00 00 00 00
02060        "shufps $0, %%xmm0, %%xmm0;\n\t"  // xmm0 <- u  u  u  u
02061        // 01 01 01 01
02062        "shufps $0x55, %%xmm1, %%xmm1;\n\t" // xmm1 <- v  v  v  v
02063 
02064        "subps  32(%%rdx), %%xmm0;\n\t"
02065        "subps  32(%%rdx), %%xmm1;\n\t"
02066 
02067        "mulps (%%rdx), %%xmm0;\n\t"
02068        "mulps 16(%%rdx),%%xmm1;\n\t"
02069 
02070        "addps %%xmm0, %%xmm3;\n\t"
02071        "addps %%xmm0, %%xmm4;\n\t"
02072        "addps %%xmm0, %%xmm5;\n\t"
02073        "addps %%xmm0, %%xmm6;\n\t"
02074 
02075        "addps %%xmm1, %%xmm3;\n\t"    // xmm3 <- xx b0 g0 r0
02076        "addps %%xmm1, %%xmm4;\n\t"    // xmm4 <- xx b1 g1 r1
02077        "addps %%xmm1, %%xmm5;\n\t"    // xmm5 <- xx b2 g2 r2
02078        "addps %%xmm1, %%xmm6;\n\t"    // xmm6 <- xx b3 g3 r3
02079 
02080        "cvtps2pi %%xmm3, %%mm0;\n\t"  //mm0  <- g0 r0
02081        "movhlps  %%xmm3, %%xmm3;\n\t" //xmm3 <- g0 r0 xx b0
02082        "cvtps2pi %%xmm3, %%mm1;\n\t"  //mm1  <- xx b0
02083        "packssdw %%mm1, %%mm0;\n\t"   //mm0<- xx b0 g0 r0
02084 
02085        "cvtps2pi %%xmm4, %%mm2;\n\t"  //mm2  <- g1 r1
02086        "movhlps  %%xmm4, %%xmm4;\n\t" //xmm4 <- g1 r1 xx b1
02087        "cvtps2pi %%xmm4, %%mm3;\n\t"  //mm3  <- xx b1
02088        "packssdw %%mm3, %%mm2;\n\t"   //mm2<- xx b1 g1 r1
02089 
02090        "cvtps2pi %%xmm5, %%mm4;\n\t"  //mm4  <- g2 r2
02091        "movhlps  %%xmm5, %%xmm5;\n\t" //xmm5 <- g2 r2 xx b2
02092        "cvtps2pi %%xmm5, %%mm5;\n\t"  //mm5  <- xx b2
02093        "packssdw %%mm5, %%mm4;\n\t"   //mm4<- xx b2 g2 r2
02094 
02095        "cvtps2pi %%xmm6, %%mm6;\n\t"  //mm6  <- g3 r3
02096        "movhlps  %%xmm6, %%xmm6;\n\t" //xmm3 <- g3 r3 xx b3
02097        "cvtps2pi %%xmm6, %%mm7;\n\t"  //mm7  <- xx b3
02098        "packssdw %%mm7, %%mm6;\n\t"   //mm6<- xx b3 g3 r3
02099 
02100        "pxor %%mm1, %%mm1;\n\t"
02101        "pcmpgtw %%mm0, %%mm1;\n\t"
02102        "pandn %%mm0, %%mm1;\n\t"
02103 
02104        "pxor %%mm3, %%mm3;\n\t"
02105        "pcmpgtw %%mm2, %%mm3;\n\t"
02106        "pandn %%mm2, %%mm3;\n\t"
02107 
02108        "pxor %%mm5, %%mm5;\n\t"
02109        "pcmpgtw %%mm4, %%mm5;\n\t"
02110        "pandn %%mm4, %%mm5;\n\t"
02111 
02112        "pxor %%mm7, %%mm7;\n\t"
02113        "pcmpgtw %%mm6, %%mm7;\n\t"
02114        "pandn %%mm6, %%mm7;\n\t"
02115 
02116        "packuswb %%mm1, %%mm1;\n\t"   //mm0<- xx xx xx xx xx b0 g0 r0
02117        "packuswb %%mm3, %%mm3;\n\t"   //mm2<- xx xx xx xx xx b1 g1 r1
02118        "packuswb %%mm5, %%mm5;\n\t"   //mm4<- xx xx xx xx xx b2 g2 r2
02119        "packuswb %%mm7, %%mm7;\n\t"   //mm6<- xx xx xx xx xx b3 g3 r3
02120 
02121        "push %%rcx;\n\t"
02122        "push %%rdx;\n\t"
02123        "movd %%mm1, %%rax;\n\t"  // rax <- xx b0 g0 r0
02124        "movd %%mm3, %%rbx;\n\t"  // rbx <- xx b1 g1 r1
02125        "movd %%mm5, %%rcx;\n\t"  // rcx <- xx b2 g2 r2
02126        "movd %%mm7, %%rdx;\n\t"  // rdx <- xx b3 g3 r3
02127        "movw %%ax, (%%rdi);\n\t"
02128        "movw %%bx,3(%%rdi);\n\t"
02129        "movw %%cx,6(%%rdi);\n\t"
02130        "movw %%dx,9(%%rdi);\n\t"
02131        "shr $8, %%rax;\n\t"
02132        "shr $8, %%rbx;\n\t"
02133        "shr $8, %%rcx;\n\t"
02134        "shr $8, %%rdx;\n\t"
02135        "movb %%ah, 2(%%rdi);\n\t"
02136        "movb %%bh, 5(%%rdi);\n\t"
02137        "movb %%ch, 8(%%rdi);\n\t"
02138        "movb %%dh,11(%%rdi);\n\t"
02139        "pop %%rdx;\n\t"
02140        "pop %%rcx;\n\t"
02141 
02142        "add $12,%%rdi;\n\t"
02143        "dec %%rcx;\n\t"
02144        "add $6, %%rsi;\n\t"
02145        "jmp .JA0;\n\t"
02146        ".JA1:;\n\t"
02147        "emms;\n\t"
02148        :
02149        :"S"(src),"D"(dest),"c"(rcx),"d"(coeffs)
02150        :"rax","rbx","memory"
02151        );
02152 
02153 }
02154 
02155 
02156 
02157 
//! Horizontal 9-tap low-pass filter, plain C++ implementation.
/*! Filters each of the \a h rows of \a w samples with the kernel
    [1 8 28 56 70 56 28 8 1] / 256. Near the left and right borders the
    kernel is truncated and renormalised by the sum of the taps that remain
    (163, 219, 247 and 255 for the 1st..4th sample from either edge).

    The border code reads eight consecutive samples of the row
    unconditionally, so rows narrower than 8 samples are not handled here.

    @param sptr input samples, row after row
    @param dptr output samples, same layout as the input
    @param h    number of rows
    @param w    number of samples per row */
void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w)
{
  const float *in = sptr;    // read cursor
  float *out = dptr;         // write cursor
  const int ninner = w - 8;  // samples per row that see the full kernel

  for (int rowsLeft = h; rowsLeft > 0; --rowsLeft)
    {
      // ---- four leftmost samples: kernel cut off on the left ----
      out[0] = in[0] * (70.0F / 163.0F) +
        in[1] * (56.0F / 163.0F) +
        in[2] * (28.0F / 163.0F) +
        in[3] * ( 8.0F / 163.0F) +
        in[4] * ( 1.0F / 163.0F);
      out[1] = (in[0] + in[2]) * (56.0F / 219.0F) +
        in[1] * (70.0F / 219.0F) +
        in[3] * (28.0F / 219.0F) +
        in[4] * ( 8.0F / 219.0F) +
        in[5] * ( 1.0F / 219.0F);
      out[2] = (in[0] + in[4]) * (28.0F / 247.0F) +
        (in[1] + in[3]) * (56.0F / 247.0F) +
        in[2] * (70.0F / 247.0F) +
        in[5] * ( 8.0F / 247.0F) +
        in[6] * ( 1.0F / 247.0F);
      out[3] = (in[0] + in[6]) * ( 8.0F / 255.0F) +
        (in[1] + in[5]) * (28.0F / 255.0F) +
        (in[2] + in[4]) * (56.0F / 255.0F) +
        in[3] * (70.0F / 255.0F) +
        in[7] * ( 1.0F / 255.0F);
      out += 4;

      // ---- interior: full symmetric kernel centred on in[4] ----
      for (int k = ninner; k > 0; --k)
        {
          *out = (in[0] + in[8]) * ( 1.0F / 256.0F) +
            (in[1] + in[7]) * ( 8.0F / 256.0F) +
            (in[2] + in[6]) * (28.0F / 256.0F) +
            (in[3] + in[5]) * (56.0F / 256.0F) +
            in[4] * (70.0F / 256.0F);
          ++out;
          ++in;
        }

      // ---- four rightmost samples: kernel cut off on the right ----
      // 'in' now addresses the last eight samples of the row.
      out[0] = in[0] * ( 1.0F / 255.0F) +
        (in[1] + in[7]) * ( 8.0F / 255.0F) +
        (in[2] + in[6]) * (28.0F / 255.0F) +
        (in[3] + in[5]) * (56.0F / 255.0F) +
        in[4] * (70.0F / 255.0F);
      out[1] = in[1] * ( 1.0F / 247.0F) +
        in[2] * ( 8.0F / 247.0F) +
        (in[3] + in[7]) * (28.0F / 247.0F) +
        (in[4] + in[6]) * (56.0F / 247.0F) +
        in[5] * (70.0F / 247.0F);
      out[2] = in[2] * ( 1.0F / 219.0F) +
        in[3] * ( 8.0F / 219.0F) +
        in[4] * (28.0F / 219.0F) +
        (in[5] + in[7]) * (56.0F / 219.0F) +
        in[6] * (70.0F / 219.0F);
      out[3] = in[3] * ( 1.0F / 163.0F) +
        in[4] * ( 8.0F / 163.0F) +
        in[5] * (28.0F / 163.0F) +
        in[6] * (56.0F / 163.0F) +
        in[7] * (70.0F / 163.0F);

      // step both cursors to the start of the next row
      out += 4;
      in += 8;
    }
}
02223 #endif
02224 
02225 //############################################################################
02226 /* So things look consistent in everyone's emacs... */
02227 /* Local Variables: */
02228 /* indent-tabs-mode: nil */
02229 /* End: */
02230 
02231 #endif
02232 
02233 #ifndef INVT_CPU_OPTERON
02234 
02235 #ifdef INVT_USE_SSE
02236 
02237 //######################################################################
02238 void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz)
02239 {
02240   static int32 ecx= sz>>2;
02241   static int32 edx= sz & 0x3;
02242 
02243   asm (
02244        "orl %%ecx, %%ecx;\n\t"
02245        "jz .AG2;\n\t"
02246        ".AG1:;\n\t"
02247        "movupd  0(%%esi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0
02248        "movupd  0(%%edi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0
02249        "movupd  16(%%esi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4
02250        "movupd  16(%%edi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4
02251        "movupd  %%xmm0, %%xmm4;\n\t"   // xmm4 <- a3 a2 a1 a0
02252        "movupd  %%xmm1, %%xmm5;\n\t"   // xmm5 <- b3 b2 b1 b0
02253        "movupd  %%xmm2, %%xmm6;\n\t"   // xmm6 <- a7 a6 a5 a4
02254        "movupd  %%xmm3, %%xmm7;\n\t"   // xmm7 <- b7 b6 b5 b4
02255        "subpd   %%xmm1, %%xmm0;\n\t"   // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0)
02256        "subpd   %%xmm3, %%xmm2;\n\t"   // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4)
02257        "subpd   %%xmm4, %%xmm5;\n\t"   // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0)
02258        "subpd   %%xmm6, %%xmm7;\n\t"   // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4)
02259        "maxpd   %%xmm0, %%xmm5;\n\t"   // xmm5 <- max(xmm0,xmm5)
02260        "maxpd   %%xmm2, %%xmm7;\n\t"   // xmm7 <- max(xmm2,xmm7)
02261        "movupd  %%xmm5, 0(%%ebx);\n\t"
02262        "movupd  %%xmm7, 16(%%ebx);\n\t"
02263        "addl $32, %%esi;\n\t"
02264        "addl $32, %%edi;\n\t"
02265        "addl $32, %%ebx;\n\t"
02266        "loop  .AG1;\n\t"
02267        ".AG2:;\n\t"
02268        "movl %%edx, %%ecx;\n\t"
02269        "orl %%ecx, %%ecx;\n\t"
02270        "jz .AG4;\n\t"
02271        ".AG3:;\n\t"
02272        "movsd 0(%%esi), %%xmm0;\n\t"
02273        "movsd 0(%%edi), %%xmm1;\n\t"
02274        "movsd %%xmm0, %%xmm2;\n\t"
02275        "movsd %%xmm1, %%xmm3;\n\t"
02276        "subsd %%xmm3, %%xmm2;\n\t"
02277        "subsd %%xmm0, %%xmm1;\n\t"
02278        "maxsd %%xmm2, %%xmm1;\n\t"
02279        "movsd %%xmm1, 0(%%ebx);\n\t"
02280        "addl $8, %%esi;\n\t"
02281        "addl $8, %%edi;\n\t"
02282        "addl $8, %%ebx;\n\t"
02283        "loop .AG3;\n\t"
02284        ".AG4:;\n\t"
02285        :
02286        :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx)
02287        :"memory"
02288        );
02289 }
02290 #endif
02291 
02292 #ifdef INVT_USE_MMXSSE2
02293 //######################################################################
02294 // speedup ~= 2.1
02295 void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz)
02296 {
02297   static int32 ecx= sz>>3;
02298   static int32 edx= sz & 0x7;
02299 
02300   asm (
02301        "orl %%ecx, %%ecx;\n\t"
02302        "jz .AE2;\n\t"
02303        ".AE1:;\n\t"
02304        "movups  0(%%esi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0
02305        "movups  0(%%edi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0
02306        "movups  16(%%esi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4
02307        "movups  16(%%edi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4
02308        "movups  %%xmm0, %%xmm4;\n\t"   // xmm4 <- a3 a2 a1 a0
02309        "movups  %%xmm1, %%xmm5;\n\t"   // xmm5 <- b3 b2 b1 b0
02310        "movups  %%xmm2, %%xmm6;\n\t"   // xmm6 <- a7 a6 a5 a4
02311        "movups  %%xmm3, %%xmm7;\n\t"   // xmm7 <- b7 b6 b5 b4
02312        "subps   %%xmm1, %%xmm0;\n\t"   // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0)
02313        "subps   %%xmm3, %%xmm2;\n\t"   // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4)
02314        "subps   %%xmm4, %%xmm5;\n\t"   // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0)
02315        "subps   %%xmm6, %%xmm7;\n\t"   // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4)
02316        "maxps   %%xmm0, %%xmm5;\n\t"   // xmm5 <- max(xmm0,xmm5)
02317        "maxps   %%xmm2, %%xmm7;\n\t"   // xmm7 <- max(xmm2,xmm7)
02318        "movups  %%xmm5, 0(%%ebx);\n\t"
02319        "movups  %%xmm7, 16(%%ebx);\n\t"
02320        "addl $32, %%esi;\n\t"
02321        "addl $32, %%edi;\n\t"
02322        "addl $32, %%ebx;\n\t"
02323        "loop  .AE1;\n\t"
02324        ".AE2:;\n\t"
02325        "movl %%edx, %%ecx;\n\t"
02326        "orl %%ecx, %%ecx;\n\t"
02327        "jz .AE4;\n\t"
02328        ".AE3:;\n\t"
02329        "movss 0(%%esi), %%xmm0;\n\t"
02330        "movss 0(%%edi), %%xmm1;\n\t"
02331        "movss %%xmm0, %%xmm2;\n\t"
02332        "movss %%xmm1, %%xmm3;\n\t"
02333        "subss %%xmm3, %%xmm2;\n\t"
02334        "subss %%xmm0, %%xmm1;\n\t"
02335        "maxss %%xmm2, %%xmm1;\n\t"
02336        "movss %%xmm1, 0(%%ebx);\n\t"
02337        "addl $4, %%esi;\n\t"
02338        "addl $4, %%edi;\n\t"
02339        "addl $4, %%ebx;\n\t"
02340        "loop .AE3;\n\t"
02341        ".AE4:;\n\t"
02342        "emms;\n\t"
02343        :
02344        :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx)
02345        :"memory"
02346        );
02347 }
02348 
02349 
02350 
02351 //######################################################################
02352 // speedup ~= 3.4
02353 void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz)
02354 {
02355   static int32 ecx= sz>>3;
02356   static int32 edx= sz&0x7;
02357 
02358   asm (
02359        "orl %%ecx, %%ecx;\n\t"
02360        "jz .AF2;\n\t"
02361        ".AF1:;\n\t"
02362        "movdqu  0(%%esi), %%xmm0;\n\t"
02363        "movdqu  0(%%edi), %%xmm1;\n\t"
02364        "movdqu  16(%%esi), %%xmm2;\n\t"
02365        "movdqu  16(%%edi), %%xmm3;\n\t"
02366        "movdqu  %%xmm0, %%xmm4;\n\t"
02367        "movdqu  %%xmm1, %%xmm5;\n\t"
02368        "movdqu  %%xmm2, %%xmm6;\n\t"
02369        "movdqu  %%xmm3, %%xmm7;\n\t"
02370        "psubusw %%xmm1, %%xmm0;\n\t"
02371        "psubusw %%xmm3, %%xmm2;\n\t"
02372        "psubusw %%xmm4, %%xmm5;\n\t"
02373        "psubusw %%xmm6, %%xmm7;\n\t"
02374        "pmaxsw  %%xmm0, %%xmm5;\n\t"
02375        "pmaxsw  %%xmm2, %%xmm7;\n\t"
02376        "movdqu  %%xmm5, 0(%%ebx);\n\t"
02377        "movdqu  %%xmm7, 16(%%ebx);\n\t"
02378        "addl $32, %%esi;\n\t"
02379        "addl $32, %%edi;\n\t"
02380        "addl $32, %%ebx;\n\t"
02381        "loop  .AF1;\n\t"
02382        ".AF2:;\n\t"
02383        "movl %%edx, %%ecx;\n\t"
02384        "orl %%ecx, %%ecx;\n\t"
02385        "jz .AF4;\n\t"
02386        ".AF3:;\n\t"
02387        "movl (%%esi), %%eax;\n\t"
02388        "movl (%%edi), %%edx;\n\t"
02389        "cmpl %%edx, %%eax;\n\t"
02390        "ja .AF5;\n\t"
02391        "xchgl %%eax, %%edx;\n\t"
02392        ".AF5:;\n\t"
02393        "subl %%edx, %%eax;\n\t"
02394        "movl %%eax, (%%ebx);\n\t"
02395        "addl $4, %%esi;\n\t"
02396        "addl $4, %%edi;\n\t"
02397        "addl $4, %%ebx;\n\t"
02398        "loop .AF3;\n\t"
02399        ".AF4:;\n\t"
02400        "emms;\n\t"
02401        :
02402        :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx)
02403        :"memory"
02404        );
02405 }
02406 
02407 
02408 //######################################################################
02409 // speedup ~=10.0!
02410 void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz)
02411 {
02412   static int32 ecx= sz>>5;
02413   static int32 edx= sz&0x1f;
02414 
02415   asm (
02416        "orl %%ecx, %%ecx;\n\t"
02417        "jz .AD2;\n\t"
02418        ".AD1:;\n\t"
02419        "movdqu  0(%%esi), %%xmm0;\n\t" // xmm0<- a15 ... a3 a2 a1 a0
02420        "movdqu  0(%%edi), %%xmm1;\n\t" // xmm1<- b15 ... b3 b2 b1 b0
02421        "movdqu  16(%%esi), %%xmm2;\n\t"// xmm2<- a31 ... a18 a17 a16
02422        "movdqu  16(%%edi), %%xmm3;\n\t"// xmm3<- b31 ... b18 b17 b16
02423        "movdqu  %%xmm0, %%xmm4;\n\t"   // xmm4<- a15 ... a3 a2 a1 a0
02424        "movdqu  %%xmm1, %%xmm5;\n\t"   // xmm5<- b15 ... b3 b2 b1 b0
02425        "movdqu  %%xmm2, %%xmm6;\n\t"   // xmm6<- a31 ... a18 a17 a16
02426        "movdqu  %%xmm3, %%xmm7;\n\t"   // xmm7<- b31 ... b18 b17 b16
02427        "psubusb %%xmm1, %%xmm0;\n\t"   // xmm0<-(a15-b15)...( a1-b1 )(a0-b0)
02428        "psubusb %%xmm3, %%xmm2;\n\t"   // xmm2<-(a31-b31)...(a17-b17)(a16-b16)
02429        "psubusb %%xmm4, %%xmm5;\n\t"   // xmm5<-(b15-a15)...(b17-a17)(b16-a16)
02430        "psubusb %%xmm6, %%xmm7;\n\t"   // xmm7<-(b31-a31)...(b17-a17)(b16-a16)
02431        "pmaxub  %%xmm0, %%xmm5;\n\t"   // xmm5<- max(xmm0,xmm5)
02432        "pmaxub  %%xmm2, %%xmm7;\n\t"   // xmm7<- max(xmm2,xmm7)
02433        "movdqu  %%xmm5, 0(%%ebx);\n\t"
02434        "movdqu  %%xmm7, 16(%%ebx);\n\t"
02435        "addl $32, %%esi;\n\t"
02436        "addl $32, %%edi;\n\t"
02437        "addl $32, %%ebx;\n\t"
02438        "loop  .AD1;\n\t"
02439        ".AD2:;\n\t"
02440        "movl %%edx, %%ecx;\n\t"
02441        "orl %%ecx, %%ecx;\n\t"
02442        "jz .AD4;\n\t"
02443        ".AD3:;\n\t"
02444        "movb (%%esi), %%al;\n\t"
02445        "movb (%%edi), %%dl;\n\t"
02446        "cmpb %%dl, %%al;\n\t"
02447        "ja .AD5;\n\t"
02448        "xchgb %%al, %%dl;\n\t"
02449        ".AD5:;\n\t"
02450        "subb %%dl, %%al;\n\t"
02451        "movb %%al, (%%ebx);\n\t"
02452        "incl %%ebx;\n\t"
02453        "incl %%esi;\n\t"
02454        "incl %%edi;\n\t"
02455        "loop .AD3;\n\t"
02456        ".AD4:;\n\t"
02457        "emms;\n\t"
02458        :
02459        :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx)
02460        :"memory"
02461        );
02462 }
02463 #endif
02464 
02465 #ifdef INVT_USE_SSE
02466 //######################################################################
02467 // speedup ~= 2.0
02468 void sse_sum(const double *a, double *sum, const int32 sz)
02469 {
02470   static int32 ecx = sz>>3;
02471   static int32 edx = sz&0x7;
02472 
02473   asm (
02474        "pxor %%xmm4, %%xmm4;\n\t"
02475        "pxor %%xmm5, %%xmm5;\n\t"
02476        "pxor %%xmm6, %%xmm6;\n\t"
02477        "pxor %%xmm7, %%xmm7;\n\t"
02478        "orl %%ecx, %%ecx;\n\t"
02479        "jz  BE1;\n\t"
02480        ".BE0:\n\t"
02481        "movupd     0(%%esi), %%xmm0;\n\t"
02482        "movupd  16(%%esi), %%xmm1;\n\t"
02483        "movupd  32(%%esi), %%xmm2;\n\t"
02484        "movupd  48(%%esi), %%xmm3;\n\t"
02485        "addpd %%xmm0, %%xmm4;\n\t"
02486        "addpd %%xmm1, %%xmm5;\n\t"
02487        "addpd %%xmm2, %%xmm6;\n\t"
02488        "addpd %%xmm3, %%xmm7;\n\t"
02489        "addl $64, %%esi;\n\t"
02490        "loop .BE0;\n\t"
02491        "BE1:;\n\t"
02492        "mov %%edx, %%ecx;\n\t"
02493        "pxor %%xmm0, %%xmm0;\n\t"
02494        "orl %%ecx, %%ecx;\n\t"
02495        "jz BE2;\n\t"
02496        "BE3:;\n\t"
02497        "movupd 0(%%esi), %%xmm1;\n\t"
02498        "addpd %%xmm1, %%xmm0;\n\t"
02499        "addl $16, %%esi;\n\t"
02500        "loop BE3;\n\t"
02501        "BE2:;\n\t"
02502        "addpd %%xmm4, %%xmm7;\n\t"
02503        "addpd %%xmm5, %%xmm7;\n\t"
02504        "addpd %%xmm6, %%xmm7;\n\t"
02505        "addpd %%xmm7, %%xmm0;\n\t"
02506        "movhpd %%xmm0, (%%ebx);\n\t"
02507        "addsd  (%%ebx), %%xmm0;\n\t"
02508        "movlpd %%xmm0, (%%ebx);\n\t"
02509        "emms;\n\t"
02510        :
02511        :"S"(a), "b"(sum), "c"(ecx), "d"(edx)
02512        :"memory"
02513        );
02514 }
02515 #endif
02516 
02517 #ifdef INVT_USE_MMXSSE2
02518 //######################################################################
02519 //speedup ~= 4
02520 void sse2_sum(const float *a, double *sum, const int32 sz)
02521 {
02522   static int32 ecx = sz>>3;
02523   static int32 edx = sz & 0x7;
02524 
02525   asm (
02526        "pxor %%xmm4, %%xmm4;\n\t"
02527        "pxor %%xmm5, %%xmm5;\n\t"
02528        "pxor %%xmm6, %%xmm6;\n\t"
02529        "pxor %%xmm7, %%xmm7;\n\t"
02530        "orl %%ecx, %%ecx;\n\t"
02531        "jz  BA1;\n\t"
02532        ".BA0:\n\t"
02533        "cvtps2pd  0(%%esi), %%xmm0;\n\t"
02534        "cvtps2pd  8(%%esi), %%xmm1;\n\t"
02535        "cvtps2pd  16(%%esi), %%xmm2;\n\t"
02536        "cvtps2pd 24(%%esi), %%xmm3;\n\t"
02537        "addpd %%xmm0, %%xmm4;\n\t"
02538        "addpd %%xmm1, %%xmm5;\n\t"
02539        "addpd %%xmm2, %%xmm6;\n\t"
02540        "addpd %%xmm3, %%xmm7;\n\t"
02541        "addl $32, %%esi;\n\t"
02542        "loop .BA0;\n\t"
02543        "BA1:;\n\t"
02544        "pxor %%xmm0, %%xmm0;\n\t"
02545        "mov %%edx, %%ecx;\n\t"
02546        "orl %%ecx, %%ecx;\n\t"
02547        "jz BA2;\n\t"
02548        "BA3:;\n\t"
02549        "cvtps2pd 0(%%esi), %%xmm1;\n\t"
02550        "addpd %%xmm1, %%xmm0;\n\t"
02551        "addl $8, %%esi;\n\t"
02552        "loop BA3;\n\t"
02553        "BA2:;\n\t"
02554        "addpd %%xmm4, %%xmm7;\n\t"
02555        "addpd %%xmm5, %%xmm7;\n\t"
02556        "addpd %%xmm6, %%xmm7;\n\t"
02557        "addpd %%xmm7, %%xmm0;\n\t"
02558        "movhpd %%xmm0, (%%ebx);\n\t"
02559        "addsd  (%%ebx), %%xmm0;\n\t"
02560        "movlpd %%xmm0, (%%ebx);\n\t"
02561        "emms;\n\t"
02562        :
02563        :"S"(a), "b"(sum), "c"(ecx), "d"(edx)
02564        :"memory"
02565        );
02566 }
02567 
02568 
02569 //######################################################################
02570 // speedup ~= 4.0
02571 void sse2_sum(const int32 *a, double *sum, const int32 sz)
02572 {
02573   static int32 ecx = sz>>3;
02574   static int32 edx = sz & 0x7;
02575 
02576   asm (
02577        "pxor %%xmm4, %%xmm4;\n\t"
02578        "pxor %%xmm5, %%xmm5;\n\t"
02579        "pxor %%xmm6, %%xmm6;\n\t"
02580        "pxor %%xmm7, %%xmm7;\n\t"
02581        "orl %%ecx, %%ecx;\n\t"
02582        ".BC0:\n\t"
02583        "cvtdq2pd  0(%%esi), %%xmm0;\n\t"
02584        "cvtdq2pd  8(%%esi), %%xmm1;\n\t"
02585        "cvtdq2pd  16(%%esi), %%xmm2;\n\t"
02586        "cvtdq2pd 24(%%esi), %%xmm3;\n\t"
02587        "addpd %%xmm0, %%xmm4;\n\t"
02588        "addpd %%xmm1, %%xmm5;\n\t"
02589        "addpd %%xmm2, %%xmm6;\n\t"
02590        "addpd %%xmm3, %%xmm7;\n\t"
02591        "addl $32, %%esi;\n\t"
02592        "loop .BC0;\n\t"
02593        "BC1:;\n\t"
02594        "pxor %%xmm0, %%xmm0;\n\t"
02595        "mov %%edx, %%ecx;\n\t"
02596        "orl %%ecx, %%ecx;\n\t"
02597        "jz BC2;\n\t"
02598        "BC3:;\n\t"
02599        "cvtdq2pd 0(%%esi), %%xmm1;\n\t"
02600        "addpd %%xmm1, %%xmm0;\n\t"
02601        "addl $8, %%esi;\n\t"
02602        "loop BC3;\n\t"
02603        "BC2:;\n\t"
02604        "addpd %%xmm4, %%xmm7;\n\t"
02605        "addpd %%xmm5, %%xmm7;\n\t"
02606        "addpd %%xmm6, %%xmm7;\n\t"
02607        "addpd %%xmm7, %%xmm0;\n\t"
02608        "movhpd %%xmm0, (%%ebx);\n\t"
02609        "addsd  (%%ebx), %%xmm0;\n\t"
02610        "movlpd %%xmm0, (%%ebx);\n\t"
02611        "emms;\n\t"
02612        :
02613        :"S"(a), "b"(sum), "c"(ecx), "d"(edx)
02614        :"memory"
02615        );
02616 }
02617 
02618 
02619 
02620 //######################################################################
02621 void sse2_sum(const byte *a, double *sum, const int32 sz)
02622 {
02623   static int ecx = sz>>5;
02624   static int edx = sz & 0x1f;
02625 
02626   asm (
02627        "orl %%ecx, %%ecx;\n\t"
02628        "jz  BB1;\n\t"
02629        "pxor %%xmm7, %%xmm7;\n\t"
02630        "pushl %%ebx;\n\t"
02631        "pushl %%edx;\n\t"
02632        "BB3:;\n\t"
02633        "pxor %%xmm5, %%xmm5;\n\t"
02634        "pxor %%xmm6, %%xmm6;\n\t"
02635        "movdqu (%%esi), %%xmm0;\n\t"
02636        "movdqu 16(%%esi), %%xmm1;\n\t"
02637        "psadbw %%xmm0, %%xmm5;\n\t"
02638        "psadbw %%xmm1, %%xmm6;\n\t"
02639        "pextrw $0, %%xmm5, %%eax;\n\t"
02640        "cvtsi2sd %%eax, %%xmm0;\n\t"
02641        "pextrw $4, %%xmm5, %%ebx;\n\t"
02642        "cvtsi2sd %%ebx, %%xmm1;\n\t"
02643        "pextrw $0, %%xmm6, %%edx;\n\t"
02644        "cvtsi2sd %%edx, %%xmm2;\n\t"
02645        "pextrw $4, %%xmm6, %%edi;\n\t"
02646        "cvtsi2sd %%edi, %%xmm3;\n\t"
02647        "addsd %%xmm0, %%xmm1;\n\t"
02648        "addsd %%xmm2, %%xmm3;\n\t"
02649        "addsd %%xmm1, %%xmm7;\n\t"
02650        "addsd %%xmm3, %%xmm7;\n\t"
02651        "addl $32, %%esi;\n\t"
02652        "loop BB3;\n\t"
02653        "popl %%edx;\n\t"
02654        "popl %%ebx;\n\t"
02655        "BB1:;\n\t"
02656        "xorl %%edi, %%edi;\n\t"
02657        "movl %%edx, %%ecx;\n\t"
02658        "orl %%ecx, %%ecx;\n\t"
02659        "jz BB2;\n\t"
02660        "BB5:;\n\t"
02661        "xorl %%eax, %%eax;\n\t"
02662        "movb (%%esi), %%al;\n\t"
02663        "addl %%eax, %%edi;\n\t"
02664        "incl %%esi;\n\t"
02665        "loop BB5;\n\t"
02666        "BB2:\n\t"
02667        "cvtsi2sd %%edi, %%xmm0;\n\t"
02668        "addsd %%xmm0, %%xmm7;\n\t"
02669        "movhpd %%xmm7, (%%ebx);\n\t"
02670        "addsd  (%%ebx), %%xmm7;\n\t"
02671        "movlpd %%xmm7, (%%ebx);\n\t"
02672        "BB6:;\n\t"
02673        "emms;\n\t"
02674        :
02675        :"S"(a), "c"(ecx),"b"(sum),"d"(edx)
02676        :"memory","eax","edi"
02677        );
02678 }
02679 #endif
02680 
02681 #ifdef INVT_USE_SSE
02682 //######################################################################
02683 // speedup ~= 10 !
02684 void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
02685 {
02686   int ecx = sz >> 6;
02687   int edx = sz & 0x7f;
02688 
02689   asm (
02690        "orl %%ecx, %%ecx;\n\t"
02691        "jz .DA0;\n\t"
02692        ".DA1:;\n\t"
02693        "movdqu (%%esi), %%xmm0;\n\t"
02694        "movdqu (%%edi), %%xmm4;\n\t"
02695        "movdqu 16(%%esi), %%xmm1;\n\t"
02696        "movdqu 16(%%edi), %%xmm5;\n\t"
02697        "movdqu 32(%%esi), %%xmm2;\n\t"
02698        "movdqu 32(%%edi), %%xmm6;\n\t"
02699        "movdqu 48(%%esi), %%xmm3;\n\t"
02700        "movdqu 48(%%edi), %%xmm7;\n\t"
02701        "psubusb %%xmm4, %%xmm0;\n\t"
02702        "psubusb %%xmm5, %%xmm1;\n\t"
02703        "psubusb %%xmm6, %%xmm2;\n\t"
02704        "psubusb %%xmm7, %%xmm3;\n\t"
02705        "movdqu  %%xmm0, 0(%%ebx);\n\t"
02706        "movdqu  %%xmm1, 16(%%ebx);\n\t"
02707        "movdqu  %%xmm2, 32(%%ebx);\n\t"
02708        "movdqu  %%xmm3, 48(%%ebx);\n\t"
02709        "addl $64, %%esi;\n\t"
02710        "addl $64, %%edi;\n\t"
02711        "addl $64, %%ebx;\n\t"
02712        "loop .DA1;\n\t"
02713        ".DA0:;\n\t"
02714        "movl %%edx, %%ecx;\n\t"
02715        "orl %%ecx, %%ecx;\n\t"
02716        "jz .DA2;\n\t"
02717        ".DA3:;\n\t"
02718        "movb (%%esi), %%al;\n\t"
02719        "movb (%%edi), %%dl;\n\t"
02720        "cmpb %%bl, %%al;\n\t"
02721        "ja .DA4;\n\t"
02722        "xchg %%al, %%bl;\n\t"
02723        ".DA4:;\n\t"
02724        "subb %%bl, %%al;\n\t"
02725        "movb %%al, (%%ebx);\n\t"
02726        "incl %%esi;\n\t"
02727        "incl %%edi;\n\t"
02728        "incl %%ebx;\n\t"
02729        "loop .DA3;\n\t"
02730        ".DA2:;\n\t"
02731        "emms;\n\t"
02732        :
02733        :"S"(a),"D"(b),"c"(ecx),"d"(edx),"b"(result)
02734        );
02735 }
02736 
02737 
02738 //######################################################################
02739 // speedup ~= 20 !
02740 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result,
02741                         const int32 sz)
02742 {
02743   int32 ecx=sz>>5;
02744   int32 edx=sz&0x1f;
02745 
02746   asm (
02747        "orl %%ecx, %%ecx;\n\t"
02748        "jz .DB0;\n\t"
02749        ".DB1:;\n\t"
02750        "movups  0(%%esi), %%xmm0;\n\t"
02751        "movups  0(%%edi), %%xmm1;\n\t"
02752        "movups 16(%%esi), %%xmm2;\n\t"
02753        "movups 16(%%edi), %%xmm3;\n\t"
02754        "movups %%xmm1, %%xmm6;\n\t"
02755        "movups %%xmm3, %%xmm7;\n\t"
02756        "cmpps  $1, %%xmm0, %%xmm6;\n\t"
02757        "cmpps  $1, %%xmm2, %%xmm7;\n\t"
02758        "subps  %%xmm1, %%xmm0;\n\t"
02759        "subps  %%xmm3, %%xmm2;\n\t"
02760        "andps  %%xmm6, %%xmm0;\n\t"
02761        "andps  %%xmm7, %%xmm2;\n\t"
02762        "movups %%xmm0, (%%ebx);\n\t"
02763        "movups %%xmm2, 16(%%ebx);\n\t"
02764        "addl  $32, %%esi;\n\t"
02765        "addl  $32, %%edi;\n\t"
02766        "addl  $32, %%ebx;\n\t"
02767        "loop .DB1;\n\t"
02768        ".DB0:;\n\t"
02769        "movl %%edx, %%ecx;\n\t"
02770        "orl %%ecx, %%ecx;\n\t"
02771        "jz .DB2;\n\t"
02772        ".DB3:;\n\t"
02773        "movss (%%esi), %%xmm0;\n\t"
02774        "movss (%%edi), %%xmm1;\n\t"
02775        "movss %%xmm1, %%xmm2;\n\t"
02776        "cmpss $1, %%xmm0,  %%xmm2;\n\t"
02777        "andps %%xmm2, %%xmm0;\n\t"
02778        "andps %%xmm2, %%xmm1;\n\t"
02779        "subss %%xmm1,  %%xmm0;\n\t"
02780        "movss %%xmm0,  (%%ebx);\n\t"
02781        "addl $4, %%esi;\n\t"
02782        "addl $4, %%edi;\n\t"
02783        "addl $4, %%ebx;\n\t"
02784        "loop .DB3;\n\t"
02785        ".DB2:;\n\t"
02786        :
02787        :"S"(a), "D"(b), "b"(result), "c"(ecx), "d"(edx)
02788        :"memory"
02789        );
02790 }
02791 
02792 
02793 //######################################################################
02794 // speedup ~= 3
02795 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz)
02796 {
02797   int32 ecx=sz>>3;
02798   int32 edx=sz&0x7;
02799   asm (
02800        "orl %%ecx, %%ecx;\n\t"
02801        "jz .DC0;\n\t"
02802        ".DC1:;\n\t"
02803        "movdqu 0(%%esi), %%xmm0;\n\t" //xmm0=  a3     a2     a1     a0
02804        "movdqu 0(%%edi), %%xmm1;\n\t" //xmm1=  b3     b2     b1     b0
02805        "movdqu 16(%%esi), %%xmm3;\n\t"//xmm3=  a7     a6     a5     a4
02806        "movdqu 16(%%edi), %%xmm4;\n\t"//xmm4=  b7     b6     b5     b4
02807        "movdqu  %%xmm0, %%xmm2;\n\t"  //xmm2=  a3     a2     a1     a0
02808        "movdqu  %%xmm3, %%xmm5;\n\t"  //xmm5=  a7     a6     a5     a4
02809        "pcmpgtd %%xmm1, %%xmm2;\n\t"  //xmm2=(a3>b3)(a2>b2)(a1>b1)(a0>b0)
02810        "pcmpgtd %%xmm4, %%xmm5;\n\t"  //xmm5=(a7>b7)(a6>b6)(b5>a5)(a4>b4)
02811        "psubd   %%xmm1, %%xmm0;\n\t"  //xmm0=(a3-b3)(a2-b2)(a1-b1)(a0-b0)
02812        "psubd   %%xmm4, %%xmm3;\n\t"  //xmm3=(a7-b7)(a6-b6)(a5-b5)(a4-b4)
02813        "pand    %%xmm2, %%xmm0;\n\t"
02814        "pand    %%xmm5, %%xmm3;\n\t"
02815        "movdqu  %%xmm0, (%%ebx);\n\t"
02816        "movdqu  %%xmm3, 16(%%ebx);\n\t"
02817        "addl $32, %%esi;\n\t"
02818        "addl $32, %%edi;\n\t"
02819        "addl $32, %%ebx;\n\t"
02820        "loop .DC1;\n\t"
02821        ".DC0:;\n\t"
02822        "movl %%edx, %%ecx;\n\t"
02823        "orl  %%ecx, %%ecx;\n\t"
02824        "jz .DC2;\n\t"
02825        ".DC3:;\n\t"
02826        "movd 0(%%esi), %%xmm0;\n\t"
02827        "movd 0(%%edi), %%xmm1;\n\t"
02828        "movdqu %%xmm0, %%xmm2;\n\t"
02829        "pcmpgtd %%xmm1, %%xmm2;\n\t"
02830        "psubd   %%xmm1, %%xmm0;\n\t"
02831        "pand    %%xmm2, %%xmm0;\n\t"
02832        "movd    %%xmm0, (%%ebx);\n\t"
02833        "addl $4, %%esi;\n\t"
02834        "addl $4, %%edi;\n\t"
02835        "addl $4, %%ebx;\n\t"
02836        "loop .DC3;\n\t"
02837        ".DC2:;\n\t"
02838        :
02839        :"S"(a), "D"(b), "c"(ecx), "d"(edx), "b"(c)
02840        :"memory"
02841        );
02842 }
02843 
02844 
02845 //######################################################################
02846 // speedup ~= 4-5
02847 void sse_binaryReverse(const byte *a, byte *result, const byte val, const
02848                                 int32 sz)
02849 {
02850   static unsigned int ecx=(sz>>7);
02851   static unsigned int edx=sz&0x7f;
02852 
02853   byte pVal[16];
02854 
02855   memset(result, val, 16);
02856 
02857   asm (
02858        "orl %%ecx, %%ecx;\n\t"
02859        "jz .FA0;\n\t"
02860        ".FA1:;\n\t"
02861        "movdqu  0(%%ebx), %%xmm0;\n\t"
02862        "movdqu  0(%%ebx), %%xmm1;\n\t"
02863        "movdqu  %%xmm0, %%xmm2;\n\t"
02864        "movdqu  %%xmm1, %%xmm3;\n\t"
02865        "movdqu  %%xmm0, %%xmm4;\n\t"
02866        "movdqu  %%xmm1, %%xmm5;\n\t"
02867        "movdqu  %%xmm0, %%xmm6;\n\t"
02868        "movdqu  %%xmm1, %%xmm7;\n\t"
02869        "psubb (%%esi), %%xmm0;\n\t"
02870        "psubb 16(%%esi), %%xmm1;\n\t"
02871        "psubb 32(%%esi), %%xmm2;\n\t"
02872        "psubb 48(%%esi), %%xmm3;\n\t"
02873        "psubb 64(%%esi), %%xmm4;\n\t"
02874        "psubb 80(%%esi), %%xmm5;\n\t"
02875        "psubb 96(%%esi), %%xmm6;\n\t"
02876        "psubb 112(%%esi), %%xmm7;\n\t"
02877        "movdqu %%xmm0, (%%edi);\n\t"
02878        "movdqu %%xmm1, 16(%%edi);\n\t"
02879        "movdqu %%xmm2, 32(%%edi);\n\t"
02880        "movdqu %%xmm3, 48(%%edi);\n\t"
02881        "movdqu %%xmm4, 64(%%edi);\n\t"
02882        "movdqu %%xmm5, 80(%%edi);\n\t"
02883        "movdqu %%xmm6, 96(%%edi);\n\t"
02884        "movdqu %%xmm7, 112(%%edi);\n\t"
02885        "addl $128, %%edi;\n\t"
02886        "addl $128, %%esi;\n\t"
02887        "loop .FA1;\n\t"
02888        ".FA0:;\n\t"
02889        "movl %%edx, %%ecx;\n\t"
02890        "orl %%ecx, %%ecx;\n\t"
02891        "jz .FA2;\n\t"
02892        "movb (%%ebx), %%dl;\n\t"
02893        ".FA3:;\n\t"
02894        "movb %%dl, %%dh;\n\t"
02895        "movb (%%esi), %%al;\n\t"
02896        "subb %%al, %%dh;\n\t"
02897        "movb %%dh, (%%edi);\n\t"
02898        "incl %%esi;\n\t"
02899        "incl %%edi;\n\t"
02900        "loop .FA3;\n\t"
02901        ".FA2:;\n\t"
02902        :
02903        :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx)
02904        :"memory","eax"
02905        );
02906 }
02907 
02908 
02909 //######################################################################
02910 // speedup ~= 2
// Per-element reversal about a constant: result[i] = val - a[i] for i
// in [0, sz).
//
// a       : input array of sz floats (no alignment required)
// result  : output array of sz floats
// val     : value each element is subtracted from
// sz      : number of elements; nothing is done when sz <= 0
//
// Fixes relative to earlier revisions:
//  - the loop counts were function-static and so kept the first call's
//    size forever;
//  - the SIMD loop used PSUBQ, a 64-bit *integer* subtraction, on float
//    data; SUBPS is the packed single-precision subtraction;
//  - the tail loop stored 16 bytes and advanced by 16 bytes for every
//    leftover float, skipping elements and writing past the end.
void sse_binaryReverse(const float *a, float *result, const float val,
                                const int sz)
{
  if (sz <= 0) return;

  // Four copies of val, loaded into an XMM register by each SIMD step.
  float pVal[4] = { val, val, val, val };

  int blocks = sz >> 4;       // number of full 16-float SIMD steps
  const int tail = sz & 0xf;  // elements left for the scalar loop

  for (; blocks > 0; --blocks, a += 16, result += 16)
    {
      __asm__ __volatile__ (
           "movups   (%[pv]), %%xmm0\n\t"   // xmm0..xmm3 <- val x 4
           "movaps %%xmm0, %%xmm1\n\t"
           "movaps %%xmm0, %%xmm2\n\t"
           "movaps %%xmm0, %%xmm3\n\t"
           "movups   (%[src]), %%xmm4\n\t"
           "movups 16(%[src]), %%xmm5\n\t"
           "movups 32(%[src]), %%xmm6\n\t"
           "movups 48(%[src]), %%xmm7\n\t"
           "subps %%xmm4, %%xmm0\n\t"       // val - a
           "subps %%xmm5, %%xmm1\n\t"
           "subps %%xmm6, %%xmm2\n\t"
           "subps %%xmm7, %%xmm3\n\t"
           "movups %%xmm0,   (%[dst])\n\t"
           "movups %%xmm1, 16(%[dst])\n\t"
           "movups %%xmm2, 32(%[dst])\n\t"
           "movups %%xmm3, 48(%[dst])\n\t"
           :
           : [pv] "r"(pVal), [src] "r"(a), [dst] "r"(result)
           : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
             "xmm4", "xmm5", "xmm6", "xmm7"
           );
    }

  // Scalar tail for the last sz % 16 elements.
  for (int i = 0; i < tail; ++i)
    result[i] = val - a[i];
}
02971 
02972 
02973 
02974 //######################################################################
02975 
02976 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val,
02977                         const int32 sz)
02978 {
02979   int32 ecx=sz>>5;
02980   int32 edx=sz&31;
02981   int32 pVal[16];
02982   int i;
02983 
02984   for(i=0;i<16;++i) pVal[i] = val;
02985 
02986   asm (
02987        "orl %%ecx, %%ecx;\n\t"
02988        "jz .FC4;\n\t"
02989        ".FC2:;\n\t"
02990        "movdqu (%%ebx), %%xmm0;\n\t"
02991        "movdqu (%%ebx), %%xmm1;\n\t"
02992        "movdqu %%xmm0, %%xmm2;\n\t"
02993        "movdqu %%xmm1, %%xmm3;\n\t"
02994        "movdqu %%xmm0, %%xmm4;\n\t"
02995        "movdqu %%xmm1, %%xmm5;\n\t"
02996        "movdqu %%xmm0, %%xmm6;\n\t"
02997        "movdqu %%xmm1, %%xmm7;\n\t"
02998        "psubd  (%%esi), %%xmm0;\n\t"
02999        "psubd  16(%%esi), %%xmm1;\n\t"
03000        "psubd  32(%%esi), %%xmm2;\n\t"
03001        "psubd  48(%%esi), %%xmm3;\n\t"
03002        "psubd  64(%%esi), %%xmm4;\n\t"
03003        "psubd  80(%%esi), %%xmm5;\n\t"
03004        "psubd  96(%%esi), %%xmm6;\n\t"
03005        "psubd  112(%%esi), %%xmm7;\n\t"
03006        "movdqu %%xmm0,  0(%%edi);\n\t"
03007        "movdqu %%xmm1, 16(%%edi);\n\t"
03008        "movdqu %%xmm2, 32(%%edi);\n\t"
03009        "movdqu %%xmm3, 48(%%edi);\n\t"
03010        "movdqu %%xmm4, 64(%%edi);\n\t"
03011        "movdqu %%xmm5, 80(%%edi);\n\t"
03012        "movdqu %%xmm6, 96(%%edi);\n\t"
03013        "movdqu %%xmm7,112(%%edi);\n\t"
03014        "addl $128, %%esi;\n\t"
03015        "addl $128, %%edi;\n\t"
03016        "loop .FC2;\n\t"
03017        ".FC4:;\n\t"
03018        "orl  %%edx, %%edx;\n\t"
03019        "jz .FC1;\n\t"
03020        "movl %%edx, %%ecx;\n\t"
03021        ".FC3:;\n\t"
03022        "movdqu 0(%%ebx), %%xmm0;\n\t"
03023        "psubd (%%esi), %%xmm0;\n\t"
03024        "movups %%xmm0, (%%edi);\n\t"
03025        "addl $16, %%esi;\n\t"
03026        "addl $16, %%edi;\n\t"
03027        "loop .FC3;\n\t"
03028        ".FC1:;\n\t"
03029        :
03030        :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx)
03031        :"memory","eax"
03032        );
03033 }
03034 
03035 
03036 
03037 //######################################################################
03038 
03039 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz)
03040 {
03041   int32 ecx=sz>>4;
03042   int32 edx=sz&0xf;
03043 
03044   asm(
03045       "orl %%ecx, %%ecx;\n\t"
03046       "jz .GA4;\n\t"
03047       "pxor %%xmm0, %%xmm0;\n\t"
03048       ".GA2:;\n\t"
03049       "movdqu 0(%%esi), %%xmm1;\n\t"
03050       "movdqa %%xmm1, %%xmm2;\n\t"
03051       "movdqa %%xmm1, %%xmm3;\n\t"
03052       "movdqa %%xmm1, %%xmm4;\n\t"
03053       "psrldq $4, %%xmm2;\n\t"
03054       "psrldq $8, %%xmm3;\n\t"
03055       "psrldq $12, %%xmm4;\n\t"
03056       "punpcklbw %%xmm0, %%xmm1;\n\t"
03057       "punpcklbw %%xmm0, %%xmm2;\n\t"
03058       "punpcklbw %%xmm0, %%xmm3;\n\t"
03059       "punpcklbw %%xmm0, %%xmm4;\n\t"
03060       "punpcklbw %%xmm0, %%xmm1;\n\t"
03061       "punpcklbw %%xmm0, %%xmm2;\n\t"
03062       "punpcklbw %%xmm0, %%xmm3;\n\t"
03063       "punpcklbw %%xmm0, %%xmm4;\n\t"
03064       "movdqu %%xmm1, (%%edi);\n\t"
03065       "movdqu %%xmm2, 16(%%edi);\n\t"
03066       "movdqu %%xmm3, 32(%%edi);\n\t"
03067       "movdqu %%xmm4, 48(%%edi);\n\t"
03068       "addl $16, %%esi;\n\t"
03069       "addl $64, %%edi;\n\t"
03070       "loop .GA2;\n\t"
03071       ".GA4:;\n\t"
03072       "orl %%edx, %%edx;\n\t"
03073       "jz .GA1;\n\t"
03074       "mov %%edx, %%ecx;\n\t"
03075       ".GA3:;\n\t"
03076       "xorl %%eax, %%eax;\n\t"
03077       "movb (%%esi), %%al;\n\t"
03078       "movl %%eax, (%%edi);\n\t"
03079       "incl %%esi;\n\t"
03080       "addl $4, %%edi;\n\t"
03081       "loop .GA3;\n\t"
03082       ".GA1:;"
03083       :
03084       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03085       :"memory"
03086       );
03087 
03088 
03089 }
03090 
03091 #endif
03092 
03093 #ifdef INVT_USE_MMXSSE2
03094 
03095 //######################################################################
03096 // speedup ~= 1.5
03097 void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
03098 {
03099   int32 ecx=sz>>4;
03100   int32 edx=sz&0xf;
03101 
03102   asm(
03103       "orl %%ecx, %%ecx;\n\t"
03104       "jz .GB4;\n\t"
03105       ".GB2:;\n\t"
03106       "pxor %%xmm0, %%xmm0;\n\t"
03107       "movdqu 0(%%esi), %%xmm1;\n\t"
03108       "movdqu 4(%%esi), %%xmm2;\n\t"
03109       "movdqu 8(%%esi), %%xmm3;\n\t"
03110       "movdqu 12(%%esi), %%xmm4;\n\t"
03111       "punpcklbw %%xmm0, %%xmm1;\n\t"
03112       "punpcklbw %%xmm0, %%xmm2;\n\t"
03113       "punpcklbw %%xmm0, %%xmm3;\n\t"
03114       "punpcklbw %%xmm0, %%xmm4;\n\t"
03115       "punpcklbw %%xmm0, %%xmm1;\n\t"
03116       "punpcklbw %%xmm0, %%xmm2;\n\t"
03117       "punpcklbw %%xmm0, %%xmm3;\n\t"
03118       "punpcklbw %%xmm0, %%xmm4;\n\t"
03119       "cvtdq2ps %%xmm1, %%xmm1;\n\t"
03120       "cvtdq2ps %%xmm2, %%xmm2;\n\t"
03121       "movups  %%xmm1, (%%edi);\n\t"
03122       "movups  %%xmm2, 16(%%edi);\n\t"
03123       "cvtdq2ps %%xmm3, %%xmm3;\n\t"
03124       "cvtdq2ps %%xmm4, %%xmm4;\n\t"
03125       "movups  %%xmm3, 32(%%edi);\n\t"
03126       "movups  %%xmm4, 48(%%edi);\n\t"
03127       "addl $16, %%esi;\n\t"
03128       "addl $64, %%edi;\n\t"
03129       "loop .GB2;\n\t"
03130       ".GB4:;\n\t"
03131       "orl %%edx, %%edx;\n\t"
03132       "jz .GB1;\n\t"
03133       "movl %%edx, %%ecx;\n\t"
03134       ".GB3:;\n\t"
03135       "xorl %%eax, %%eax;\n\t"
03136       "movb (%%esi), %%al;\n\t"
03137       "movd %%eax, %%xmm0;\n\t"
03138       "cvtdq2ps %%xmm0, %%xmm1;\n\t"
03139       "movss %%xmm1, (%%edi);\n\t"
03140       "incl %%esi;\n\t"
03141       "addl $4, %%edi;\n\t"
03142       "loop .GB3;\n\t"
03143       ".GB1:;"
03144       :
03145       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03146       :"memory"
03147       );
03148 }
03149 
03150 
03151 
03152 //######################################################################
03153 // speedup ~= 1.15
03154 void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
03155 {
03156   int32 ecx=sz>>3;
03157   int32 edx=sz&0x7;
03158 
03159   asm(
03160       "orl %%ecx, %%ecx;\n\t"
03161       "jz .GC4;\n\t"
03162       ".GC2:;\n\t"
03163       "pxor %%xmm0, %%xmm0;\n\t"
03164       "movdqu 0(%%esi), %%xmm1;\n\t"
03165       "movdqu 2(%%esi), %%xmm2;\n\t"
03166       "movdqu 4(%%esi), %%xmm3;\n\t"
03167       "movdqu 6(%%esi), %%xmm4;\n\t"
03168       "punpcklbw %%xmm0, %%xmm1;\n\t"
03169       "punpcklbw %%xmm0, %%xmm2;\n\t"
03170       "punpcklbw %%xmm0, %%xmm3;\n\t"
03171       "punpcklbw %%xmm0, %%xmm4;\n\t"
03172       "punpcklbw %%xmm0, %%xmm1;\n\t"
03173       "punpcklbw %%xmm0, %%xmm2;\n\t"
03174       "punpcklbw %%xmm0, %%xmm3;\n\t"
03175       "punpcklbw %%xmm0, %%xmm4;\n\t"
03176       "cvtdq2pd %%xmm1, %%xmm1;\n\t"
03177       "cvtdq2pd %%xmm2, %%xmm2;\n\t"
03178       "movupd  %%xmm1, (%%edi);\n\t"
03179       "movupd  %%xmm2, 16(%%edi);\n\t"
03180       "cvtdq2pd %%xmm3, %%xmm3;\n\t"
03181       "cvtdq2pd %%xmm4, %%xmm4;\n\t"
03182       "movupd  %%xmm3, 32(%%edi);\n\t"
03183       "movupd  %%xmm4, 48(%%edi);\n\t"
03184       "addl $8, %%esi;\n\t"
03185       "addl $64, %%edi;\n\t"
03186       "loop .GC2;\n\t"
03187       ".GC4:;\n\t"
03188       "orl %%edx, %%edx;\n\t"
03189       "jz .GC1;\n\t"
03190       "movl %%edx, %%ecx;\n\t"
03191       ".GC3:;\n\t"
03192       "xorl %%eax, %%eax;\n\t"
03193       "movb (%%esi), %%al;\n\t"
03194       "movd %%eax, %%xmm0;\n\t"
03195       "cvtdq2pd %%xmm0, %%xmm1;\n\t"
03196       "movsd %%xmm1, (%%edi);\n\t"
03197       "incl %%esi;\n\t"
03198       "addl $8, %%edi;\n\t"
03199       "loop .GC3;\n\t"
03200       ".GC1:;"
03201       :
03202       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03203       :"memory"
03204       );
03205 
03206 }
03207 
03208 
03209 
03210 //######################################################################
03211 
03212 void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
03213 {
03214   int32 ecx=sz>>5;
03215   int32 edx=sz&0x1f;
03216 
03217   asm(
03218       "orl %%ecx, %%ecx;\n\t"
03219       "jz .GD4;\n\t"
03220       ".GD2:;\n\t"
03221       "movdqu 0(%%esi), %%xmm0;\n\t"
03222       "movdqu 16(%%esi), %%xmm1;\n\t"
03223       "movdqu 32(%%esi), %%xmm2;\n\t"
03224       "movdqu 48(%%esi), %%xmm3;\n\t"
03225       "movdqu 64(%%esi), %%xmm4;\n\t"
03226       "movdqu 80(%%esi), %%xmm5;\n\t"
03227       "movdqu 96(%%esi), %%xmm6;\n\t"
03228       "movdqu 112(%%esi), %%xmm7;\n\t"
03229       "cvtdq2ps %%xmm0, %%xmm0;\n\t"
03230       "cvtdq2ps %%xmm1, %%xmm1;\n\t"
03231       "cvtdq2ps %%xmm2, %%xmm2;\n\t"
03232       "cvtdq2ps %%xmm3, %%xmm3;\n\t"
03233       "cvtdq2ps %%xmm4, %%xmm4;\n\t"
03234       "cvtdq2ps %%xmm5, %%xmm5;\n\t"
03235       "cvtdq2ps %%xmm6, %%xmm6;\n\t"
03236       "cvtdq2ps %%xmm7, %%xmm7;\n\t"
03237       "movups %%xmm0, 0(%%edi);\n\t"
03238       "movups %%xmm1, 16(%%edi);\n\t"
03239       "movups %%xmm2, 32(%%edi);\n\t"
03240       "movups %%xmm3, 48(%%edi);\n\t"
03241       "movups %%xmm4, 64(%%edi);\n\t"
03242       "movups %%xmm5, 80(%%edi);\n\t"
03243       "movups %%xmm6, 96(%%edi);\n\t"
03244       "movups %%xmm7, 112(%%edi);\n\t"
03245       "addl $128, %%esi;\n\t"
03246       "addl $128, %%edi;\n\t"
03247       "decl %%ecx;\n\t"
03248       "jnz .GD2;\n\t"
03249       ".GD4:;\n\t"
03250       "orl %%edx, %%edx;\n\t"
03251       "jz .GD1;\n\t"
03252       "movl %%edx, %%ecx;\n\t"
03253       ".GD3:;\n\t"
03254       "movd (%%esi), %%xmm0;\n\t"
03255       "cvtdq2ps %%xmm0, %%xmm0;\n\t"
03256       "movss %%xmm0, (%%edi);\n\t"
03257       "addl $4, %%esi;\n\t"
03258       "addl $4, %%edi;\n\t"
03259       "loop .GD3;\n\t"
03260       ".GD1:;"
03261       :
03262       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03263       :"memory"
03264       );
03265 
03266 }
03267 
03268 //######################################################################
03269 // speedup ~= 1.2
03270 void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
03271 {
03272   int32 ecx=sz>>4;
03273   int32 edx=sz&0xf;
03274 
03275   asm(
03276       "orl %%ecx, %%ecx;\n\t"
03277       "jz .GE4;\n\t"
03278       ".GE2:;\n\t"
03279       "movdqu 0(%%esi), %%xmm0;\n\t"
03280       "movdqu  8(%%esi), %%xmm1;\n\t"
03281       "movdqu 16(%%esi), %%xmm2;\n\t"
03282       "movdqu 24(%%esi), %%xmm3;\n\t"
03283       "movdqu 32(%%esi), %%xmm4;\n\t"
03284       "movdqu 40(%%esi), %%xmm5;\n\t"
03285       "movdqu 48(%%esi), %%xmm6;\n\t"
03286       "movdqu 56(%%esi), %%xmm7;\n\t"
03287       "cvtdq2pd %%xmm0, %%xmm0;\n\t"
03288       "cvtdq2pd %%xmm1, %%xmm1;\n\t"
03289       "cvtdq2pd %%xmm2, %%xmm2;\n\t"
03290       "cvtdq2pd %%xmm3, %%xmm3;\n\t"
03291       "cvtdq2pd %%xmm4, %%xmm4;\n\t"
03292       "cvtdq2pd %%xmm5, %%xmm5;\n\t"
03293       "cvtdq2pd %%xmm6, %%xmm6;\n\t"
03294       "cvtdq2pd %%xmm7, %%xmm7;\n\t"
03295       "movups %%xmm0, 0(%%edi);\n\t"
03296       "movups %%xmm1, 16(%%edi);\n\t"
03297       "movups %%xmm2, 32(%%edi);\n\t"
03298       "movups %%xmm3, 48(%%edi);\n\t"
03299       "movups %%xmm4, 64(%%edi);\n\t"
03300       "movups %%xmm5, 80(%%edi);\n\t"
03301       "movups %%xmm6, 96(%%edi);\n\t"
03302       "movups %%xmm7, 112(%%edi);\n\t"
03303       "addl $64, %%esi;\n\t"
03304       "addl $128, %%edi;\n\t"
03305       "decl %%ecx;\n\t"
03306       "jnz .GE2;\n\t"
03307       ".GE4:;\n\t"
03308       "orl %%edx, %%edx;\n\t"
03309       "jz .GE1;\n\t"
03310       "movl %%edx, %%ecx;\n\t"
03311       ".GE3:;\n\t"
03312       "movd (%%esi), %%xmm0;\n\t"
03313       "cvtdq2pd %%xmm0, %%xmm0;\n\t"
03314       "movsd %%xmm0, (%%edi);\n\t"
03315       "addl $4, %%esi;\n\t"
03316       "addl $8, %%edi;\n\t"
03317       "loop .GE3;\n\t"
03318       ".GE1:;"
03319       :
03320       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03321       :"memory"
03322       );
03323 
03324 }
03325 
03326 //######################################################################
03327 void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
03328 {
03329   int32 ecx=sz;
03330   int32 edx=sz;
03331 
03332   asm (
03333        "orl %%ecx, %%ecx;\n\t"
03334        "jz .GF1;\n\t"
03335        ".GF2:;\n\t"
03336        "movdqu 0(%%esi), %%xmm0;\n\t"
03337        "movdqu  8(%%esi), %%xmm1;\n\t"
03338        "movdqu 16(%%esi), %%xmm2;\n\t"
03339        "movdqu 24(%%esi), %%xmm3;\n\t"
03340        "movdqu 32(%%esi), %%xmm4;\n\t"
03341        "movdqu 40(%%esi), %%xmm5;\n\t"
03342        "movdqu 48(%%esi), %%xmm6;\n\t"
03343        "movdqu 56(%%esi), %%xmm7;\n\t"
03344        "cvtps2dq %%xmm0, %%xmm0;\n\t"
03345        "cvtps2dq %%xmm1, %%xmm1;\n\t"
03346        "cvtps2dq %%xmm2, %%xmm2;\n\t"
03347        "cvtps2dq %%xmm3, %%xmm3;\n\t"
03348        "cvtps2dq %%xmm4, %%xmm4;\n\t"
03349        "cvtps2dq %%xmm5, %%xmm5;\n\t"
03350        "cvtps2dq %%xmm6, %%xmm6;\n\t"
03351        "cvtps2dq %%xmm7, %%xmm7;\n\t"
03352        "movdqu %%xmm0, 0(%%edi);\n\t"
03353        "movdqu %%xmm1, 16(%%edi);\n\t"
03354        "movdqu %%xmm2, 32(%%edi);\n\t"
03355        "movdqu %%xmm3, 48(%%edi);\n\t"
03356        "movdqu %%xmm4, 64(%%edi);\n\t"
03357        "movdqu %%xmm5, 80(%%edi);\n\t"
03358        "movdqu %%xmm6, 96(%%edi);\n\t"
03359        "movdqu %%xmm7, 112(%%edi);\n\t"
03360        "addl $64, %%esi;\n\t"
03361        "addl $128, %%edi;\n\t"
03362        "decl %%ecx;\n\t"
03363        "jnz .GF2;\n\t"
03364        ".GF4:;\n\t"
03365        "orl %%edx, %%edx;\n\t"
03366        "jz .GF1;\n\t"
03367        "movl %%edx, %%ecx;\n\t"
03368        ".GF3:;\n\t"
03369        "movd (%%esi), %%xmm0;\n\t"
03370        "cvtps2dq %%xmm0, %%xmm0;\n\t"
03371        "movd  %%xmm0, (%%edi);\n\t"
03372        "addl $4, %%esi;\n\t"
03373        "addl $8, %%edi;\n\t"
03374        "loop .GF3;\n\t"
03375        ".GF1:;"
03376        :
03377        :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03378        :"memory"
03379        );
03380 
03381 }
03382 
03383 
03384 
03385 //######################################################################
03386 void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
03387 {
03388   int32 ecx=sz>>4;
03389   int32 edx=sz&0xf;
03390 
03391   asm(
03392       "orl %%ecx, %%ecx;\n\t"
03393       "jz .GG4;\n\t"
03394       ".GG2:;\n\t"
03395       "movups 0(%%esi), %%xmm0;\n\t"
03396       "movups  8(%%esi), %%xmm1;\n\t"
03397       "movups 16(%%esi), %%xmm2;\n\t"
03398       "movups 24(%%esi), %%xmm3;\n\t"
03399       "movups 32(%%esi), %%xmm4;\n\t"
03400       "movups 40(%%esi), %%xmm5;\n\t"
03401       "movups 48(%%esi), %%xmm6;\n\t"
03402       "movups 56(%%esi), %%xmm7;\n\t"
03403       "cvtps2pd %%xmm0, %%xmm0;\n\t"
03404       "cvtps2pd %%xmm1, %%xmm1;\n\t"
03405       "cvtps2pd %%xmm2, %%xmm2;\n\t"
03406       "cvtps2pd %%xmm3, %%xmm3;\n\t"
03407       "cvtps2pd %%xmm4, %%xmm4;\n\t"
03408       "cvtps2pd %%xmm5, %%xmm5;\n\t"
03409       "cvtps2pd %%xmm6, %%xmm6;\n\t"
03410       "cvtps2pd %%xmm7, %%xmm7;\n\t"
03411       "movupd %%xmm0, 0(%%edi);\n\t"
03412       "movupd %%xmm1, 16(%%edi);\n\t"
03413       "movupd %%xmm2, 32(%%edi);\n\t"
03414       "movupd %%xmm3, 48(%%edi);\n\t"
03415       "movupd %%xmm4, 64(%%edi);\n\t"
03416       "movupd %%xmm5, 80(%%edi);\n\t"
03417       "movupd %%xmm6, 96(%%edi);\n\t"
03418       "movupd %%xmm7, 112(%%edi);\n\t"
03419       "addl $64, %%esi;\n\t"
03420       "addl $128, %%edi;\n\t"
03421       "decl %%ecx;\n\t"
03422       "jnz .GG2;\n\t"
03423       ".GG4:;\n\t"
03424       "orl %%edx, %%edx;\n\t"
03425       "jz .GG1;\n\t"
03426       "movl %%edx, %%ecx;\n\t"
03427       ".GG3:;\n\t"
03428       "movd (%%esi), %%xmm0;\n\t"
03429       "cvtps2pd %%xmm0, %%xmm0;\n\t"
03430       "movsd %%xmm0, (%%edi);\n\t"
03431       "addl $4, %%esi;\n\t"
03432       "addl $8, %%edi;\n\t"
03433       "loop .GG3;\n\t"
03434       ".GG1:;"
03435       :
03436       :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03437       :"memory"
03438       );
03439 }
03440 
03441 #endif
03442 
03443 #ifdef INVT_USE_SSE
03444 
03445 //######################################################################
03446 void sse_lowPass3x(const float *a, float *b, const int h, const int w)
03447 {
03448   const float coeffs[] = { 3.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0, 4.0};
03449   int edx = (w-2)/12;
03450   int eax = (w-2)%12;
03451 
03452   asm (
03453        //       "movups 16(%%ebx), %%xmm7;\n\t"
03454        "orl %%ecx, %%ecx;\n\t"
03455        "jz  .HA1;\n\t"
03456        ".HA2:;\n\t"
03457 
03458        // *dptr++ = (sptr[0]+sptr[0]+sptr[1])/3.0
03459        "movss 0(%%esi), %%xmm1;\n\t"  // xmm1 <- sptr[0]
03460        "movss 4(%%esi), %%xmm2;\n\t" // xmm2 <- sptr[1]
03461        "addss %%xmm1, %%xmm1;\n\t"   // xmm2 <- sptr[0] + sptr[0]
03462        "addss %%xmm1, %%xmm2;\n\t"   // xmm2 <- xmm2 + sptr[1]
03463        "divss (%%ebx), %%xmm2;\n\t" // xmm2 <- xmm2/3.0
03464        "movss %%xmm2, (%%edi);\n\t"  // *dptr <- xmm2
03465        "addl  $4, %%edi;\n\t"        // ++dptr
03466 
03467        //  for (int i = 0; i < w - 2; i ++)
03468        "orl %%edx, %%edx;\n\t"
03469        "jz .HA4;\n\t"
03470 
03471        "pushl %%edx;\n\t"
03472        ".HA3:;\n\t"
03473        "movups 00(%%esi),  %%xmm0;\n\t"
03474        "movups 04(%%esi),  %%xmm1;\n\t"
03475        "movups 8(%%esi),  %%xmm2;\n\t"
03476        "movups 16(%%esi),  %%xmm3;\n\t"
03477        "movups 20(%%esi),  %%xmm4;\n\t"
03478        "movups 24(%%esi),  %%xmm5;\n\t"
03479        "movups 32(%%esi),  %%xmm6;\n\t"
03480        "movups 36(%%esi),  %%xmm7;\n\t"
03481        "addps  %%xmm1, %%xmm0;\n\t"
03482        "addps  %%xmm4, %%xmm3;\n\t"
03483        "addps  %%xmm1, %%xmm0;\n\t"
03484        "addps  %%xmm4, %%xmm3;\n\t"
03485        "movups 40(%%esi), %%xmm1;\n\t"
03486        "addps  %%xmm7, %%xmm6;\n\t"
03487        "addps  %%xmm2, %%xmm0;\n\t"
03488        "addps  %%xmm1, %%xmm6;\n\t"
03489        "addps  %%xmm5, %%xmm3;\n\t"
03490        "addps  %%xmm7, %%xmm6;\n\t"
03491        "divps  16(%%ebx ), %%xmm0;\n\t"
03492        "divps  16(%%ebx ), %%xmm3;\n\t"
03493        "divps  16(%%ebx ), %%xmm6;\n\t"
03494        "movups %%xmm0, (%%edi);\n\t"
03495        "movups %%xmm3, 16(%%edi);\n\t"
03496        "movups %%xmm6, 32(%%edi);\n\t"
03497        "addl   $48, %%esi;\n\t"
03498        "addl   $48, %%edi;\n\t"
03499        "decl   %%edx;\n\t"
03500        "jnz  .HA3;\n\t"
03501        "popl %%edx;\n\t"
03502        ".HA4:;\n\t"
03503 
03504        "orl  %%eax, %%eax;\n\t"
03505        "jz .HA6;\n\t"
03506        "pushl %%eax;\n\t"
03507        ".HA5:;\n\t"
03508        "movss  00(%%esi),  %%xmm0;\n\t"
03509        "movss  04(%%esi),  %%xmm1;\n\t"
03510        "movss  8(%%esi),  %%xmm2;\n\t"
03511        "addps  %%xmm1, %%xmm0;\n\t"
03512        "addps  %%xmm1, %%xmm2;\n\t"
03513        "addps  %%xmm2, %%xmm0;\n\t"
03514        "divss  16(%%ebx ), %%xmm0;\n\t"
03515        "movss  %%xmm0, (%%edi);\n\t"
03516        "addl   $4, %%esi;\n\t"
03517        "addl   $4, %%edi;\n\t"
03518        "decl %%eax;\n\t"
03519        "jnz .HA5;\n\t"
03520        "popl %%eax;\n\t"
03521 
03522        ".HA6:;\n\t"
03523        "movss (%%esi), %%xmm1;\n\t"  // xmm1 <- sptr[0]
03524        "movss 4(%%esi), %%xmm2;\n\t" // xmm2 <- sptr[1]
03525        "addss %%xmm2, %%xmm2;\n\t"   // xmm2 <- sptr[0] + sptr[1]
03526        "addss %%xmm1, %%xmm2;\n\t"   // xmm2 <- xmm2 + sptr[0]
03527        "divss 0(%%ebx), %%xmm2;\n\t" // xmm2 <- xmm2/3.0
03528 
03529        "movss %%xmm2, (%%edi);\n\t"     // *dptr <- xmm2
03530        "addl  $4, %%edi;\n\t"        // ++dptr
03531        "addl  $8, %%esi;\n\t"        // sptr += 2
03532        "decl %%ecx;\n\t"
03533        "jnz .HA2;\n\t"
03534        ".HA1:;\n\t"
03535        :
03536        :"S"(a), "D"(b),"c"(h),"a"(eax),"d"(edx),"b"(coeffs)
03537        :"memory"
03538        );
03539 
03540 }
03541 
03542 
03543 
03544 
03545 //######################################################################
03546 
03547 void sse_lowPass3y(const float *a, float *b, const int h, const int w)
03548 {
03549   const float coeffs[] = { 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0};
03550 
03551   if (h < 2){
03552     memcpy(b, a, w*h*sizeof(b[0]));
03553     return; // nothing to smooth
03554   }
03555 
03556   if (h < 2){
03557     memcpy(b, a, w*h*sizeof(b[0]));
03558     return; // nothing to smooth
03559   }
03560 
03561   asm (
03562        // top row
03563        "movl %%edx, %%ecx;\n\t"
03564        "orl %%ecx, %%ecx;\n\t"
03565        "jz .HU1;\n\t"
03566        "push %%esi;\n\t"
03567        ".HU0:;\n\t"
03568        "movss (%%esi), %%xmm0;\n\t" // xmm0 <- sptr[0]
03569        "movss (%%esi, %%edx, 4), %%xmm1;\n\t" //xmm1 <- sptr[w]
03570        "addss %%xmm0, %%xmm0;\n\t"
03571        "addss %%xmm1, %%xmm0;\n\t"
03572        "divss (%%ebx), %%xmm0;\n\t"
03573        "addl $4, %%esi;\n\t"
03574        "movss %%xmm0, (%%edi);\n\t"
03575        "addl  $4, %%edi;\n\t"
03576        "decl %%ecx;\n\t"
03577        "jnz .HU0;\n\t"
03578        "popl %%esi;\n\t"
03579        ".HU1:;\n\t"
03580        "cmpl $2, %%eax;\n\t"
03581        "jle .HU5;\n\t"
03582 
03583        "pushl %%eax;\n\t"
03584        "subl $2, %%eax;\n\t"
03585        "jle .HU4;\n\t"
03586        ".HU2:;\n\t"
03587        "movl %%edx, %%ecx;\n\t"
03588        "pushl %%edx;\n\t"
03589        ".HU3:;\n\t"
03590        "movss (%%esi), %%xmm0;\n\t" //xmm0 <- sptr[0]
03591        "movss (%%esi,%%edx,4), %%xmm1;\n\t" //xmm1 <- sptr[w]
03592        "movss (%%esi,%%edx,8), %%xmm2;\n\t" //xmm2 <- sptr[2*w]
03593        "addss %%xmm1, %%xmm0;\n\t"
03594        "addss %%xmm1, %%xmm2;\n\t"
03595        "addss %%xmm2, %%xmm0;\n\t"
03596        "divss 16(%%ebx), %%xmm0;\n\t"
03597        "movss %%xmm0, (%%edi);\n\t"
03598        "addl  $4, %%esi;\n\t"
03599        "addl  $4, %%edi;\n\t"
03600        "decl  %%ecx;\n\t"
03601        "jnz .HU3;\n\t"
03602        "popl %%edx;\n\t"
03603        "decl %%eax;\n\t"
03604        "jnz .HU2;\n\t"
03605 
03606        ".HU4:;\n\t"
03607        "popl %%eax;\n\t"
03608        ".HU5:;\n\t"
03609        "orl %%edx, %%edx;\n\t"
03610        "jz .HU7;\n\t"
03611        "pushl %%edx;\n\t"
03612        "movl  %%edx, %%ecx;\n\t"
03613        ".HU6:;\n\t"
03614        "movss (%%esi), %%xmm0;\n\t" //xmm0 <- sptr[0]
03615        "movss (%%esi,%%ecx,4), %%xmm1;\n\t" //xmm1 <- sptr[w]
03616        "addss %%xmm1, %%xmm1;\n\t"
03617        "addss %%xmm1, %%xmm0;\n\t"
03618        "divss (%%ebx), %%xmm0;\n\t"
03619        "movss %%xmm0, (%%edi);\n\t"
03620        "addl $4, %%esi;\n\t"
03621        "addl $4, %%edi;\n\t"
03622        "decl %%edx;\n\t"
03623        "jnz .HU6;\n\t"
03624        "popl %%edx;\n\t"
03625        ".HU7:;\n\t"
03626        :
03627        :"S"(a),"D"(b),"a"(h),"d"(w),"b"(coeffs)
03628        );
03629 
03630 }
03631 
03632 
03633 //######################################################################
03634 
// Horizontal 5-tap low-pass filter of an h-row by w-column float image.
//
// src  : source image, h*w floats, rows stored contiguously
// dest : destination image, h*w floats
// h    : number of rows
// w    : number of columns
//
// Behaviour by width:
//   w < 2  : the image is copied unchanged;
//   w == 2 : plain C, kernels [6 4]/10 and [4 6]/10;
//   w == 3 : plain C, kernels [6 4 1]/11, [4 6 4]/14 and [1 4 6]/11;
//   w > 3  : inline SSE assembly. Per row the first two outputs use
//            [6 4 1]/11 and [4 6 4 1]/15, the w-4 interior outputs use
//            [1 4 6 4 1]/16 (four at a time, then one at a time), and the
//            last two outputs use [1 4 6 4]/15 and [1 4 6]/11.
//
// NOTE(review): the asm advances esi/edi and decrements ecx although they
// are declared as input-only operands, and no xmm registers or flags are
// listed as clobbered -- confirm this is safe with the compilers in use.
03635 void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
03636 {
03637   const float *sptr= src;
03638   float *dptr= dest;
03639 
03640   if(w<2)
03641     {
03642       memcpy(dest,src,h*w*sizeof(dest[0]));
03643       return;
03644     }
03645 
03646   if (w == 2) //////////////////////////////////////////////////
03647     for (int j = 0; j < h; j ++)
03648       {
03649         // leftmost point  [ (6^) 4 ] / 10
03650         *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);
03651 
03652         // rightmost point  [ 4^ (6) ] / 10
03653         *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);
03654 
03655         sptr += 2;  // sptr back to same position as dptr
03656       }
03657   else if (w == 3) //////////////////////////////////////////////////
03658     for (int j = 0; j < h; j ++)
03659       {
03660         // leftmost point  [ (6^) 4 1 ] / 11
03661         *dptr++ = sptr[0] * (6.0F / 11.0F) +
03662           sptr[1] * (4.0F / 11.0F) +
03663           sptr[2] * (1.0F / 11.0F);
03664 
03665         // middle point    [ 4^ (6) 4 ] / 14
03666         *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
03667           sptr[1] * (6.0F / 14.0F);
03668 
03669         // rightmost point  [ 1^ 4 (6) ] / 11
03670         *dptr++ = sptr[0] * (1.0F / 11.0F) +
03671           sptr[1] * (4.0F / 11.0F) +
03672           sptr[2] * (6.0F / 11.0F);
03673 
03674         sptr += 3;  // sptr back to same position as dptr
03675       }
03676   else
03677     if(w>3) // always true here: w < 2, w == 2 and w == 3 were handled above
03678       {
        // Coefficient table read by the asm through ebx. Byte offsets used:
        //    0.. 8 : 6/11, 4/11, 1/11     first output of a row
        //   16..24 : 4/15, 6/15, 1/15     second output of a row
        //   32, 48, 64 : 1/16 x4, 4/16 x4, 6/16 x4   interior outputs
        //   80..88 : 1/15, 4/15, 6/15     second-to-last output
        //   96..104: 1/11, 4/11, 6/11     last output
        // The entries at offsets 12, 28, 92 and 108 are not read.
03679         const float coeffs[] = {6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0,
03680                                 4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0,
03681                                 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
03682                                 4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0,
03683                                 6.0/16.0, 6.0/16.0, 6.0/16.0, 6.0/16.0,
03684                                 1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0,
03685                                 1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0
03686         };
03687 
03688         int eax= (w-4)&3;   // interior outputs left for the scalar loop
03689         int edx= (w-4)>>2;  // packed passes of 4 interior outputs
03690 
03691         asm(
03692             "orl %%ecx, %%ecx;\n\t"  // ecx <- h
03693             "jz .HG6;\n\t"
03694             ".HG0:;\n\t"
            // first two outputs of the row
03695             "movss   (%%esi), %%xmm0;\n\t" // xmm0 <- s[0]
03696             "movss  4(%%esi), %%xmm2;\n\t" // xmm2 <- s[1]
03697             "movss  8(%%esi), %%xmm4;\n\t" // xmm4 <- s[2]
03698             "movss 12(%%esi), %%xmm6;\n\t" // xmm6 <- s[3]
03699             "movss  %%xmm0, %%xmm1;\n\t"   // xmm1 <- s[0]
03700             "movss  %%xmm2, %%xmm3;\n\t"   // xmm3 <- s[1]
03701             "movss  %%xmm4, %%xmm5;\n\t"   // xmm5 <- s[2]
03702             "mulss   (%%ebx), %%xmm0;\n\t" // xmm0 <- 6.0/11.0*s[0]
03703             "mulss  4(%%ebx), %%xmm2;\n\t" // xmm2 <- 4.0/11.0*s[1]
03704             "mulss  8(%%ebx), %%xmm4;\n\t" // xmm4 <- 1.0/11.0*s[2]
03705             "addss  %%xmm5, %%xmm1;\n\t"   // xmm1 <- s[2]+s[0]
03706             "mulss 16(%%ebx), %%xmm1;\n\t" // xmm1 <- (s2+s0)*4.0/15.0
03707             "mulss 20(%%ebx), %%xmm3;\n\t" // xmm3 <- 6.0/15.0*s[1]
03708             "mulss 24(%%ebx), %%xmm6;\n\t" // xmm6 <- 1.0/15.0*s[3]
03709             "addss %%xmm2, %%xmm0;\n\t"
03710             "addss %%xmm3, %%xmm1;\n\t"
03711             "addss %%xmm4, %%xmm0;\n\t"
03712             "addss %%xmm6, %%xmm1;\n\t"
03713             "movss %%xmm0,   (%%edi);\n\t" // first output
03714             "movss %%xmm1,  4(%%edi);\n\t" // second output
03715             "addl  $8, %%edi;\n\t"         // dptr += 2 (sptr stays at row start)
03716 
03717             "orl   %%edx, %%edx;\n\t"
03718             "jz .HG5;\n\t"
03719 
            // packed interior loop: 4 outputs per pass
03720             "pushl %%edx;\n\t"   // edx <- (w-4)/4
03721             "movups  32(%%ebx), %%xmm5;\n\t" // xmm5 <- 1.0/16.0 1.0/16.0 1.0/16 1.0/16
03722             "movups  48(%%ebx), %%xmm6;\n\t" // xmm6 <- 4.0/16.0 ......................
03723             "movups  64(%%ebx), %%xmm7;\n\t" // xmm7 <- 6.0/16.0 ......................
03724             ".HG1:;\n\t"
03725             "movups   0(%%esi), %%xmm0;\n\t" // xmm0 <- s0  s1  s2  s3
03726             "movups 04(%%esi), %%xmm1;\n\t" // xmm1 <- s1  s2  s3  s4
03727             "movups  8(%%esi), %%xmm2;\n\t" // xmm2 <- s2  s3  s4  s5
03728             "movups 12(%%esi), %%xmm3;\n\t" // xmm3 <- s3  s4  s5  s6
03729              "movups 16(%%esi), %%xmm4;\n\t" // xmm4 <- s4  s5  s6  s7
03730             "addps  %%xmm4, %%xmm0;\n\t"    // outer taps: s[i] + s[i+4]
03731             "addps  %%xmm3, %%xmm1;\n\t"    // inner taps: s[i+1] + s[i+3]
03732             "mulps  %%xmm5, %%xmm0;\n\t"    // * 1/16
03733             "mulps  %%xmm6, %%xmm1;\n\t"    // * 4/16
03734             "mulps  %%xmm7, %%xmm2;\n\t"    // centre tap * 6/16
03735             "addps  %%xmm1, %%xmm0;\n\t"
03736             "addps  %%xmm2, %%xmm0;\n\t"
03737             "movups %%xmm0, (%%edi);\n\t"
03738             "addl   $16, %%esi;\n\t"
03739             "addl   $16, %%edi;\n\t"
03740             "decl   %%edx;\n\t"
03741             "jnz .HG1;\n\t"
03742             "popl %%edx;\n\t"
03743 
            // scalar interior loop: remaining (w-4)%4 outputs
03744             ".HG5:;\n\t"
03745             "orl  %%eax, %%eax;\n\t"
03746             "jz  .HG3;\n\t"
03747             "pushl %%eax;\n\t"       // eax <- (w-4)%4
03748             "movups 32(%%ebx), %%xmm5;\n\t" // reload: the packed loop may have been skipped
03749             "movups 48(%%ebx), %%xmm6;\n\t"
03750             "movups 64(%%ebx), %%xmm7;\n\t"
03751             ".HG2:;\n\t"
03752             "movss    (%%esi), %%xmm0;\n\t"
03753             "movss   4(%%esi), %%xmm1;\n\t"
03754             "movss   8(%%esi), %%xmm2;\n\t"
03755             "movss  12(%%esi), %%xmm3;\n\t"
03756             "movss  16(%%esi), %%xmm4;\n\t"
03757             "mulss  %%xmm5   , %%xmm0;\n\t" // 1/16 * s0
03758             "mulss  %%xmm6   , %%xmm1;\n\t" // 4/16 * s1
03759             "mulss  %%xmm7   , %%xmm2;\n\t" // 6/16 * s2
03760             "mulss  %%xmm6   , %%xmm3;\n\t" // 4/16 * s3
03761             "mulss  %%xmm5   , %%xmm4;\n\t" // 1/16 * s4
03762             "addss  %%xmm1, %%xmm0;\n\t"
03763             "addss  %%xmm3, %%xmm2;\n\t"
03764             "addss  %%xmm4, %%xmm0;\n\t"
03765             "addss  %%xmm2, %%xmm0;\n\t"
03766             "addl   $4, %%esi;\n\t"
03767             "movss  %%xmm0, (%%edi);\n\t"
03768             "addl   $4, %%edi;\n\t"
03769             "decl   %%eax;\n\t"
03770             "jnz .HG2;\n\t"
03771             "popl  %%eax;\n\t"
            // last two outputs of the row; sptr has advanced by w-4 floats,
            // so s0..s3 below are the last four pixels of the row
03772             ".HG3:;\n\t"
03773             "movss  (%%esi), %%xmm0;\n\t"  // xmm0 <- s0
03774             "movss 4(%%esi), %%xmm1;\n\t"  // xmm1 <- s1
03775             "movss 8(%%esi), %%xmm2;\n\t"  // xmm2 <- s2
03776             "movss 12(%%esi), %%xmm3;\n\t" // xmm3 <- s3
03777             "movss %%xmm1, %%xmm4;\n\t"    // xmm4 <- s1
03778             "movss %%xmm2, %%xmm5;\n\t"    // xmm5 <- s2
03779             "movss %%xmm3, %%xmm6;\n\t"    // xmm6 <- s3
03780             "addps %%xmm1, %%xmm3;\n\t"    // xmm3 <- s1+s3
03781             "mulss 80(%%ebx), %%xmm0;\n\t" // xmm0 <- 1.0/15.0*s0
03782             "mulss 84(%%ebx), %%xmm3;\n\t" // xmm3 <- 4.0/15.0*(s1+s3)
03783             "mulss 88(%%ebx), %%xmm2;\n\t" // xmm2 <- 6.0/15.0*s2
03784             "addss %%xmm3, %%xmm0;\n\t"
03785             "addss %%xmm2, %%xmm0;\n\t"
03786             "movss %%xmm0, (%%edi);\n\t"   // second-to-last output
03787             "mulss 96(%%ebx), %%xmm4;\n\t"  // xmm4 <- 1.0/11.0*s1
03788             "mulss 100(%%ebx), %%xmm5;\n\t" // xmm5 <- 4.0/11.0*s2
03789             "mulss 104(%%ebx), %%xmm6;\n\t" // xmm6 <- 6.0/11.0*s3
03790             "addss %%xmm5, %%xmm4;\n\t"
03791             "addss %%xmm6, %%xmm4;\n\t"
03792             "movss %%xmm4, 4(%%edi);\n\t"  // last output
03793             "addl $16, %%esi;\n\t"         // sptr <- start of the next row
03794             "addl $8, %%edi;\n\t"
03795             "decl %%ecx;\n\t"
03796             "jnz .HG0;\n\t"
03797             ".HG6:;\n\t"
03798             :
03799             :"S"(sptr),"D"(dptr),"a"(eax),"b"(coeffs),"c"(h),"d"(edx)
03800             :"memory"
03801             );
03802       }
03803 
03804 }
03805 
03806 
03807 
03808 //######################################################################
03809 
03810 void sse_lowPass5y(const float *src, float *dest, const int h,
03811                        const int w)
03812 {
03813   if (h < 2){
03814     memcpy(dest, src, h*w*sizeof(dest[0]));
03815     return; // nothing to smooth
03816   }
03817 
03818   const float *sptr= src;
03819   float *dptr= dest;
03820 
03821   // ########## vertical pass  (even though we scan horiz for speedup)
03822   const int w2 = w * 2; // speedup
03823 
03824 
03825   if (h == 2) //////////////////////////////////////////////////
03826     {
03827       // topmost points  ( [ (6^) 4 ] / 10 )^T
03828       for (int i = 0; i < w; i ++)
03829         {
03830           *dptr++ = sptr[0] * (6.0F / 10.0F) +
03831             sptr[w] * (4.0F / 10.0F);
03832           sptr++;
03833         }
03834       sptr -= w;  // go back to top-left
03835 
03836       // bottommost points  ( [ 4^ (6) ] / 10 )^T
03837       for (int i = 0; i < w; i ++)
03838         {
03839           *dptr++ = sptr[0] * (4.0F / 10.0F) +
03840             sptr[w] * (6.0F / 10.0F);
03841           sptr++;
03842         }
03843     }
03844   else if (h == 3) //////////////////////////////////////////////////
03845     {
03846       // topmost points  ( [ (6^) 4 1 ] / 11 )^T
03847       for (int i = 0; i < w; i ++)
03848         {
03849           *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
03850             sptr[ w] * (4.0F / 11.0F) +
03851             sptr[w2] * (1.0F / 11.0F);
03852           sptr++;
03853         }
03854       sptr -= w;  // go back to top-left
03855 
03856       // middle points  ( [ 4^ (6) 4 ] / 14 )^T
03857       for (int i = 0; i < w; i ++)
03858         {
03859           *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) +
03860             sptr[ w] * (6.0F / 14.0F);
03861           sptr++;
03862         }
03863       sptr -= w;  // go back to top-left
03864 
03865       // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
03866       for (int i = 0; i < w; i ++)
03867         {
03868           *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
03869             sptr[ w] * (4.0F / 11.0F) +
03870             sptr[w2] * (6.0F / 11.0F);
03871           sptr++;
03872         }
03873     }
03874   else  ///////////////////////////////// general case for height >= 4
03875     {
03876       // topmost points  ( [ (6^) 4 1 ] / 11 )^T
03877 
03878       static const float coeffs[] = {
03879         6.0/11.0, 6.0/11.0, 6.0/11.0, 6.0/11.0, //0
03880         4.0/11.0, 4.0/11.0, 4.0/11.0, 4.0/11.0, //16
03881         1.0/11.0, 1.0/11.0, 1.0/11.0, 1.0/11.0, //32
03882         4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, //48
03883         6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, //64
03884         1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, //80
03885         1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0, //96
03886         4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, //112
03887         6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F  //128
03888       };
03889 
03890       int ecx=h-4;
03891       int edx=w>>2;
03892       int eax=w&3;
03893 
03894       asm (
03895            "pushl %%ebp;\n\t"
03896            "movl %0, %%ebp;\n\t"
03897            "addl %%ebp, %%ebp;\n\t"
03898            "addl %%ebp, %%ebp;\n\t"
03899 
03900            // 1st loop
03901            "movups (%%ebx), %%xmm4;\n\t"          //xmm4 <- 6.0/11.0 ...
03902            "movups 16(%%ebx), %%xmm5;\n\t"        //xmm5 <- 4.0/11.0
03903            "movups 32(%%ebx), %%xmm6;\n\t"        //xmm6 <- 1.0/11.0
03904            "pushl %%esi;\n\t"
03905            "orl  %%edx, %%edx;\n\t"
03906            "jz .IA1;\n\t"
03907            ".align 4;\n\t"
03908            "pushl %%edx;\n\t"
03909            ".IA0:;\n\t"
03910            ".align 4;\n\t"
03911            "movups (%%esi), %%xmm0;\n\t"          //xmm0 <- s0   s0   s0   s0
03912            "movups (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
03913            "movups (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
03914            "mulps  %%xmm4, %%xmm0;\n\t"
03915            "mulps  %%xmm5, %%xmm1;\n\t"
03916            "mulps  %%xmm6, %%xmm2;\n\t"
03917            "addps  %%xmm1, %%xmm0;\n\t"
03918            "addps  %%xmm2, %%xmm0;\n\t"
03919            "movups %%xmm0, (%%edi);\n\t"
03920            "addl $16, %%esi;\n\t"
03921            "addl $16, %%edi;\n\t"
03922            "decl %%edx;\n\t"
03923            "jnz .IA0;\n\t"
03924            "popl %%edx;\n\t"
03925            ".IA1:;\n\t"
03926            ".align 4;\n\t"
03927            "orl %%eax, %%eax;\n\t"
03928            "jz .IA3;\n\t"
03929            "pushl %%eax;\n\t"
03930            ".IA2:;\n\t"
03931            ".align 4;\n\t"
03932            "movss  (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
03933            "movss  (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW+3 sW+2 sW+1 sW
03934            "movss  (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sP+3 sP+3 sP+1 sP
03935            "mulss  %%xmm4, %%xmm0;\n\t"
03936            "mulss  %%xmm5, %%xmm1;\n\t"
03937            "mulss  %%xmm6, %%xmm2;\n\t"
03938            "addss  %%xmm1, %%xmm0;\n\t"
03939            "addss  %%xmm2, %%xmm0;\n\t"
03940            "movss  %%xmm0, (%%edi);\n\t"
03941            "addl $4, %%esi;\n\t"
03942            "addl $4, %%edi;\n\t"
03943            "decl %%eax;\n\t"
03944            "jnz .IA2;\n\t"
03945            "popl %%eax;\n\t"
03946            ".IA3:;\n\t"
03947            "popl %%esi;\n\t"  // restore sptr
03948 
03949            // 2nd loop
03950            "movups 48(%%ebx), %%xmm4;\n\t" //xmm4 <- 4.0/15.0
03951            "movups 64(%%ebx), %%xmm5;\n\t" //xmm5 <- 6.0/15.0
03952            "movups 80(%%ebx), %%xmm6;\n\t" //xmm6 <- 1.0/15.0
03953            "pushl %%esi;\n\t"
03954            "orl   %%edx, %%edx;\n\t"
03955            "jz .IA5;\n\t"
03956            "pushl %%edx;\n\t"
03957            "pushl %%eax;\n\t"
03958            "movl  %%ebp, %%eax;\n\t"
03959            "addl  %%ebp, %%eax;\n\t"
03960            "addl  %%ebp, %%eax;\n\t"
03961            ".IA4:;\n\t"
03962            "movups (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
03963            "movups (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
03964            "movups (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
03965            "movups (%%esi,%%eax,1), %%xmm3;\n\t"  //xmm3 <- sW3  sW3  sW3  sW3
03966            "addps  %%xmm2, %%xmm0;\n\t"
03967            "mulps  %%xmm4, %%xmm0;\n\t"
03968            "mulps  %%xmm5, %%xmm1;\n\t"
03969            "mulps  %%xmm6, %%xmm3;\n\t"
03970            "addps  %%xmm1, %%xmm0;\n\t"
03971            "addps  %%xmm3, %%xmm0;\n\t"
03972            "movups %%xmm0, (%%edi);\n\t"
03973            "addl $16, %%esi;\n\t"
03974            "addl $16, %%edi;\n\t"
03975            "decl %%edx;\n\t"
03976            "jnz .IA4;\n\t"
03977            "popl %%eax;\n\t"
03978            "popl %%edx;\n\t"
03979            ".IA5:;\n\t"
03980            "orl %%eax, %%eax;\n\t"
03981            "jz .IA7;\n\t"
03982            "pushl %%eax;\n\t"
03983            "pushl %%edx;\n\t"
03984            "movl  %%ebp, %%edx;\n\t"
03985            "addl  %%ebp, %%edx;\n\t"
03986            "addl  %%ebp, %%edx;\n\t"
03987            ".IA6:;\n\t"
03988            "movss  (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
03989            "movss  (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
03990            "movss  (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
03991            "movss  (%%esi,%%edx,1), %%xmm3;\n\t" //xmm3 <- sW3  sW3  sW3  sW3
03992            "addss  %%xmm2, %%xmm0;\n\t"
03993            "mulss  %%xmm4, %%xmm0;\n\t"
03994            "mulss  %%xmm5, %%xmm1;\n\t"
03995            "mulss  %%xmm6, %%xmm3;\n\t"
03996            "addss  %%xmm1, %%xmm0;\n\t"
03997            "addss  %%xmm3, %%xmm0;\n\t"
03998            "movss  %%xmm0, (%%edi);\n\t"
03999            "addl $4, %%esi;\n\t"
04000            "addl $4, %%edi;\n\t"
04001            "decl %%eax;\n\t"
04002            "jnz .IA6;\n\t"
04003            "popl %%edx;\n\t"
04004            "popl %%eax;\n\t"
04005            ".IA7:;\n\t"
04006            "popl %%esi;\n\t"  // restore sptr
04007 
04008 
04009            //            the double loops
04010            "orl %%ecx, %%ecx;\n\t"
04011            "jz .IA29;\n\t"
04012            "pushl %%ecx;\n\t"
04013            "movups 96(%%ebx), %%xmm5;\n\t"    // xmm5 <- 1.0/16.0
04014            "movups 112(%%ebx), %%xmm6;\n\t"   // xmm6 <- 4.0/16.0
04015            "movups 128(%%ebx), %%xmm7;\n\t"   // xmm7 <- 6.0/16.0
04016            ".IA8:;\n\t"
04017            "orl  %%edx, %%edx;\n\t"
04018            "jz .IA10;\n\t"
04019            "pushl %%edx;\n\t"
04020            "pushl %%eax;\n\t"
04021            "movl  %%ebp, %%eax;\n\t"
04022            "addl  %%ebp, %%eax;\n\t"
04023            "addl  %%ebp, %%eax;\n\t"                // eax <- 3*W
04024            ".IA9:;\n\t"
04025            "movups  (%%esi),  %%xmm0;\n\t"          // xmm0 <- s    s    s    s
04026            "movups  (%%esi,%%ebp,1),  %%xmm1;\n\t"  // xmm1 <- sW   sW   sW   sW
04027            "movups  (%%esi,%%ebp,2),  %%xmm2;\n\t"  // xmm2 <- sW2  sW2  sW2  sW2
04028            "movups  (%%esi,%%eax,1), %%xmm3;\n\t"   // xmm3 <- sW3  sW3  sW3  sW3
04029            "movups  (%%esi,%%ebp,4), %%xmm4;\n\t"   // xmm4 <- sW4  sW4  sW4  sW4
04030            "addps   %%xmm3, %%xmm1;\n\t"            // xmm1 <- sW3 + sW1
04031            "addps   %%xmm4, %%xmm0;\n\t"            // xmm0 <- s0  + sW4
04032            "mulps   %%xmm6, %%xmm1;\n\t"            // xmm1 <- 4.0/16.0*(sW3+sW1)
04033            "mulps   %%xmm5, %%xmm0;\n\t"            // xmm0 <- 1.0/16.08(s0 +sW4)
04034            "mulps   %%xmm7, %%xmm2;\n\t"            // xmm2 <- 6.0/16.0*sW2
04035            "addps   %%xmm1, %%xmm0;\n\t"
04036            "addps   %%xmm2, %%xmm0;\n\t"
04037            "addl    $16, %%esi;\n\t"
04038            "movups  %%xmm0, (%%edi);\n\t"
04039            "addl    $16, %%edi;\n\t"
04040            "decl   %%edx;\n\t"
04041            "jnz .IA9;\n\t"
04042            "popl   %%eax;\n\t"
04043            "popl   %%edx;\n\t"
04044            ".IA10:;\n\t"
04045            "orl  %%eax, %%eax;\n\t"
04046            "jz .IA12;\n\t"
04047            "pushl %%eax;\n\t"
04048            "pushl %%edx;\n\t"
04049            "movl  %%ebp, %%edx;\n\t"
04050            "addl  %%ebp, %%edx;\n\t"
04051            "addl  %%ebp, %%edx;\n\t"
04052            ".IA11:;\n\t"
04053            "movss   (%%esi),  %%xmm0;\n\t"          // xmm0 <- s    s    s    s
04054            "movss   (%%esi,%%ebp,1),  %%xmm1;\n\t"  // xmm1 <- sW   sW   sW   sW
04055            "movss   (%%esi,%%ebp,2),  %%xmm2;\n\t"  // xmm2 <- sW2  sW2  sW2  sW2
04056            "movss   (%%esi,%%edx,1), %%xmm3;\n\t"   // xmm3 <- sW3  sW3  sW3  sW3
04057            "movss   (%%esi,%%ebp,4), %%xmm4;\n\t"   // xmm4 <- sW4  sW4  sW4  sW4
04058            "addss   %%xmm3, %%xmm1;\n\t"
04059            "addss   %%xmm4, %%xmm0;\n\t"
04060            "mulss   %%xmm6, %%xmm1;\n\t"
04061            "mulss   %%xmm5, %%xmm0;\n\t"
04062            "mulss   %%xmm7, %%xmm2;\n\t"
04063            "addss   %%xmm1, %%xmm0;\n\t"
04064            "addss   %%xmm2, %%xmm0;\n\t"
04065            "addl    $4, %%esi;\n\t"
04066            "movss   %%xmm0, (%%edi);\n\t"
04067            "addl    $4, %%edi;\n\t"
04068            "decl  %%eax;\n\t"
04069            "jnz .IA11;\n\t"
04070            "popl %%edx;\n\t"
04071            "popl %%eax;\n\t"
04072            ".IA12:;\n\t"
04073            "decl %%ecx;\n\t"
04074            "jnz .IA8;\n\t"
04075            "popl %%ecx;\n\t"
04076            ".IA29:;\n\t"
04077 
04078            // fourth loop
04079            "movups 48(%%ebx), %%xmm4;\n\t"  //xmm4 <- 4.0/15.0
04080            "movups 64(%%ebx), %%xmm5;\n\t"  //xmm5 <- 6.0/15.0
04081            "movups 80(%%ebx), %%xmm6;\n\t"  //xmm6 <- 1.0/15.0
04082            "orl  %%edx, %%edx;\n\t"
04083            "jz .IA14;\n\t"
04084            "pushl %%edx;\n\t"
04085            "pushl %%eax;\n\t"
04086            "movl  %%ebp, %%eax;\n\t"
04087            "addl  %%ebp, %%eax;\n\t"
04088            "addl  %%ebp, %%eax;\n\t"
04089            ".IA13:;\n\t"
04090            "movups (%%esi), %%xmm0;\n\t"          //xmm0 <- s0   s0   s0   s0
04091            "movups (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW1  sW1  sW1  sW1
04092            "movups (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
04093            "movups (%%esi,%%eax,1),%%xmm3;\n\t"  //xmm3 <- sW3  sW3  sW3  sW3
04094            "addps  %%xmm3, %%xmm1;\n\t"          //xmm1 <- sW3 + sW1
04095            "mulps  %%xmm6, %%xmm0;\n\t"          //xmm0 <- 1.0/15.0 * s0
04096            "mulps  %%xmm5, %%xmm2;\n\t"          //xmm2 <- 6.0/15.0 * sW2
04097            "mulps  %%xmm4, %%xmm1;\n\t"          //xmm4 <- 4.0/15.0 * (sW3+sW1)
04098            "addps  %%xmm2, %%xmm0;\n\t"
04099            "addps  %%xmm1, %%xmm0;\n\t"
04100            "movups %%xmm0, (%%edi);\n\t"
04101            "addl $16, %%esi;\n\t"
04102            "addl $16, %%edi;\n\t"
04103            "decl %%edx;\n\t"
04104            "jnz .IA13;\n\t"
04105            "popl %%eax;\n\t"
04106            "popl %%edx;\n\t"
04107            ".IA14:;\n\t"
04108            "orl %%eax, %%eax;\n\t"
04109            "jz .IA16;\n\t"
04110            "pushl %%eax;\n\t"
04111            "pushl %%edx;\n\t"
04112            "movl %%ebp, %%edx;\n\t"
04113            "addl %%ebp, %%edx;\n\t"
04114            "addl %%ebp, %%edx;\n\t"
04115            ".IA15:;\n\t"
04116            "movss  (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
04117            "movss  (%%esi, %%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
04118            "movss  (%%esi, %%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
04119            "movss  (%%esi, %%edx,1), %%xmm3;\n\t" //xmm3 <- sW3  sW3  sW3  sW3
04120            "addss  %%xmm3, %%xmm1;\n\t"
04121            "mulss  %%xmm6, %%xmm0;\n\t"
04122            "mulss  %%xmm5, %%xmm2;\n\t"
04123            "mulss  %%xmm4, %%xmm1;\n\t"
04124            "addss  %%xmm2, %%xmm0;\n\t"
04125            "addss  %%xmm1, %%xmm0;\n\t"
04126            "movss  %%xmm0, (%%edi);\n\t"
04127            "addl $4, %%esi;\n\t"
04128            "addl $4, %%edi;\n\t"
04129            "decl %%eax;\n\t"
04130            "jnz .IA15;\n\t"
04131            "popl %%edx;\n\t"
04132            "popl %%eax;\n\t"
04133            ".IA16:;\n\t"
04134 
04135             // final loop
04136            "movups 32(%%ebx), %%xmm4;\n\t"
04137            "movups 16(%%ebx), %%xmm5;\n\t"
04138            "movups   (%%ebx), %%xmm6;\n\t"
04139            "orl  %%edx, %%edx;\n\t"
04140            "jz .IA18;\n\t"
04141            "pushl %%edx;\n\t"
04142            ".IA17:;\n\t"
04143            "movups (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
04144            "movups (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
04145            "movups (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
04146            "mulps  %%xmm4, %%xmm0;\n\t"
04147            "mulps  %%xmm5, %%xmm1;\n\t"
04148            "mulps  %%xmm6, %%xmm2;\n\t"
04149            "addps  %%xmm1, %%xmm0;\n\t"
04150            "addps  %%xmm2, %%xmm0;\n\t"
04151            "movups %%xmm0, (%%edi);\n\t"
04152            "addl $16, %%esi;\n\t"
04153            "addl $16, %%edi;\n\t"
04154            "decl %%edx;\n\t"
04155            "jnz .IA17;\n\t"
04156            "popl %%edx;\n\t"
04157            ".IA18:;\n\t"
04158            "orl %%eax, %%eax;\n\t"
04159            "jz .IA20;\n\t"
04160            "pushl %%eax;\n\t"
04161            ".IA19:;\n\t"
04162            "movss  (%%esi), %%xmm0;\n\t"          //xmm0 <- s3   s2   s1   s0
04163            "movss  (%%esi,%%ebp,1), %%xmm1;\n\t"  //xmm1 <- sW   sW   sW   sW
04164            "movss  (%%esi,%%ebp,2), %%xmm2;\n\t"  //xmm2 <- sW2  sW2  sW2  sW2
04165            "mulss  %%xmm4, %%xmm0;\n\t"
04166            "mulss  %%xmm5, %%xmm1;\n\t"
04167            "mulss  %%xmm6, %%xmm2;\n\t"
04168            "addss  %%xmm1, %%xmm0;\n\t"
04169            "addss  %%xmm2, %%xmm0;\n\t"
04170            "movss  %%xmm0, (%%edi);\n\t"
04171            "addl $4, %%esi;\n\t"
04172            "addl $4, %%edi;\n\t"
04173            "decl %%eax;\n\t"
04174            "jnz .IA19;\n\t"
04175            "popl %%eax;\n\t"
04176            ".IA20:;\n\t"
04177 
04178            "popl %%ebp;\n\t"
04179            :
04180            :"m"(w),"S"(sptr),"D"(dptr),"a"(eax),"b"(coeffs),"c"(ecx),"d"(edx)
04181            );
04182 
04183     }
04184 }
04185 
04186 
04187 // ######################################################################
04188 
04189 void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest,
04190                                     const int nbpix2)
04191 {
04192   int ecx=nbpix2/6;
04193 
04194   const float coeffs[] = {
04195     0.0F,       -0.198242F,   1.014648F,     0.0F,  // R  G   B  xx  -> u
04196     0.700195F,  -0.29052F,    0.0F,          0.0F,  // R  G   B  xx  -> v
04197     128.0F,        128.0F,    128.0F,      128.0F   // division factor
04198   };
04199 
04200   asm (
04201        ".JA0:;\n\t"
04202        "orl %%ecx, %%ecx;\n\t"
04203        "jz .JA1;\n\t"
04204        "pxor  %%mm7, %%mm7;\n\t"    //mm7 <-  00 00 00 00
04205        "xorl  %%eax, %%eax;\n\t"
04206        "xorl  %%ebx, %%ebx;\n\t"
04207        "movl  (%%esi),   %%eax;\n\t" // eax <-   v   y1  y0 u
04208        "movw 4(%%esi),  %%bx;\n\t"   // ebx <-   xx  xx  y3 y2
04209        "movd %%eax, %%mm0;\n\t"        // mm0<- xx xx xx xx v  y1  y0  u
04210        "movd %%eax, %%mm1;\n\t"        // mm1<- xx xx xx xx v  y1  y0  u
04211        "movd %%ebx, %%mm2;\n\t"        // mm2<- xx xx xx xx xx xx  y3  y2
04212        "psrlq $16,  %%mm1;\n\t"        // mm1<- xx xx xx xx xx xx  v   y1
04213        "punpcklbw %%mm7, %%mm0;\n\t"   // mm0<- xx xx xx xx 0  y0  0   u
04214        "punpcklbw %%mm7, %%mm1;\n\t"   // mm1<- xx xx xx xx 00 v   00  y1
04215        "punpcklbw %%mm7, %%mm2;\n\t"   // mm2<- xx xx xx xx 00 y3  00  y2
04216        "punpcklwd %%mm7, %%mm0;\n\t"   // mm0<- 00 00 00 y0 00 00  00  u
04217        "punpcklwd %%mm7, %%mm1;\n\t"   // mm1<- 00 00 00 v  00 00  00  y1
04218        "punpcklwd %%mm7, %%mm2;\n\t"   // mm2<- 00 00 00 y3 00 00  00  y2
04219 
04220        "cvtpi2ps %%mm0, %%xmm0;\n\t"   // xmm0 <- 00 00 y0 u
04221        "cvtpi2ps %%mm1, %%xmm1;\n\t"   // xmm1 <- 00 00 v  y1
04222        "cvtpi2ps %%mm2, %%xmm2;\n\t"   // xmm2 <- 00 00 y3 y2
04223 
04224        // 01 01 01 01
04225        "movaps %%xmm0, %%xmm3;\n\t"
04226 
04227        // 00 00 00 00
04228        "movaps %%xmm1, %%xmm4;\n\t"
04229 
04230        // 00 00 00 00
04231        "movaps %%xmm2, %%xmm5;\n\t"
04232 
04233        // 01 01 01 01
04234        "movaps %%xmm2, %%xmm6;\n\t"
04235 
04236        "shufps $0x55, %%xmm3, %%xmm3;\n\t"// xmm3 <- y0 y0 y0 y0
04237        "shufps $00, %%xmm4, %%xmm4;\n\t"  // xmm4 <- y1 y1 y1 y1
04238        "shufps $0x00, %%xmm5, %%xmm5;\n\t"// xmm5 <- y2 y2 y2 y2
04239        "shufps $0x55, %%xmm6, %%xmm6;\n\t"// xmm6 <- y3 y3 y3 y3
04240 
04241        // 00 00 00 00
04242        "shufps $0, %%xmm0, %%xmm0;\n\t"  // xmm0 <- u  u  u  u
04243        // 01 01 01 01
04244        "shufps $0x55, %%xmm1, %%xmm1;\n\t" // xmm1 <- v  v  v  v
04245 
04246        "subps  32(%%edx), %%xmm0;\n\t"
04247        "subps  32(%%edx), %%xmm1;\n\t"
04248 
04249        "mulps (%%edx), %%xmm0;\n\t"
04250        "mulps 16(%%edx),%%xmm1;\n\t"
04251 
04252        "addps %%xmm0, %%xmm3;\n\t"
04253        "addps %%xmm0, %%xmm4;\n\t"
04254        "addps %%xmm0, %%xmm5;\n\t"
04255        "addps %%xmm0, %%xmm6;\n\t"
04256 
04257        "addps %%xmm1, %%xmm3;\n\t"    // xmm3 <- xx b0 g0 r0
04258        "addps %%xmm1, %%xmm4;\n\t"    // xmm4 <- xx b1 g1 r1
04259        "addps %%xmm1, %%xmm5;\n\t"    // xmm5 <- xx b2 g2 r2
04260        "addps %%xmm1, %%xmm6;\n\t"    // xmm6 <- xx b3 g3 r3
04261 
04262        "cvtps2pi %%xmm3, %%mm0;\n\t"  //mm0  <- g0 r0
04263        "movhlps  %%xmm3, %%xmm3;\n\t" //xmm3 <- g0 r0 xx b0
04264        "cvtps2pi %%xmm3, %%mm1;\n\t"  //mm1  <- xx b0
04265        "packssdw %%mm1, %%mm0;\n\t"   //mm0<- xx b0 g0 r0
04266 
04267        "cvtps2pi %%xmm4, %%mm2;\n\t"  //mm2  <- g1 r1
04268        "movhlps  %%xmm4, %%xmm4;\n\t" //xmm4 <- g1 r1 xx b1
04269        "cvtps2pi %%xmm4, %%mm3;\n\t"  //mm3  <- xx b1
04270        "packssdw %%mm3, %%mm2;\n\t"   //mm2<- xx b1 g1 r1
04271 
04272        "cvtps2pi %%xmm5, %%mm4;\n\t"  //mm4  <- g2 r2
04273        "movhlps  %%xmm5, %%xmm5;\n\t" //xmm5 <- g2 r2 xx b2
04274        "cvtps2pi %%xmm5, %%mm5;\n\t"  //mm5  <- xx b2
04275        "packssdw %%mm5, %%mm4;\n\t"   //mm4<- xx b2 g2 r2
04276 
04277        "cvtps2pi %%xmm6, %%mm6;\n\t"  //mm6  <- g3 r3
04278        "movhlps  %%xmm6, %%xmm6;\n\t" //xmm3 <- g3 r3 xx b3
04279        "cvtps2pi %%xmm6, %%mm7;\n\t"  //mm7  <- xx b3
04280        "packssdw %%mm7, %%mm6;\n\t"   //mm6<- xx b3 g3 r3
04281 
04282        "pxor %%mm1, %%mm1;\n\t"
04283        "pcmpgtw %%mm0, %%mm1;\n\t"
04284        "pandn %%mm0, %%mm1;\n\t"
04285 
04286        "pxor %%mm3, %%mm3;\n\t"
04287        "pcmpgtw %%mm2, %%mm3;\n\t"
04288        "pandn %%mm2, %%mm3;\n\t"
04289 
04290        "pxor %%mm5, %%mm5;\n\t"
04291        "pcmpgtw %%mm4, %%mm5;\n\t"
04292        "pandn %%mm4, %%mm5;\n\t"
04293 
04294        "pxor %%mm7, %%mm7;\n\t"
04295        "pcmpgtw %%mm6, %%mm7;\n\t"
04296        "pandn %%mm6, %%mm7;\n\t"
04297 
04298        "packuswb %%mm1, %%mm1;\n\t"   //mm0<- xx xx xx xx xx b0 g0 r0
04299        "packuswb %%mm3, %%mm3;\n\t"   //mm2<- xx xx xx xx xx b1 g1 r1
04300        "packuswb %%mm5, %%mm5;\n\t"   //mm4<- xx xx xx xx xx b2 g2 r2
04301        "packuswb %%mm7, %%mm7;\n\t"   //mm6<- xx xx xx xx xx b3 g3 r3
04302 
04303        "pushl %%ecx;\n\t"
04304        "pushl %%edx;\n\t"
04305        "movd %%mm1, %%eax;\n\t"  // eax <- xx b0 g0 r0
04306        "movd %%mm3, %%ebx;\n\t"  // ebx <- xx b1 g1 r1
04307        "movd %%mm5, %%ecx;\n\t"  // ecx <- xx b2 g2 r2
04308        "movd %%mm7, %%edx;\n\t"  // edx <- xx b3 g3 r3
04309        "movw %%ax, (%%edi);\n\t"
04310        "movw %%bx,3(%%edi);\n\t"
04311        "movw %%cx,6(%%edi);\n\t"
04312        "movw %%dx,9(%%edi);\n\t"
04313        "shrl $8, %%eax;\n\t"
04314        "shrl $8, %%ebx;\n\t"
04315        "shrl $8, %%ecx;\n\t"
04316        "shrl $8, %%edx;\n\t"
04317        "movb %%ah, 2(%%edi);\n\t"
04318        "movb %%bh, 5(%%edi);\n\t"
04319        "movb %%ch, 8(%%edi);\n\t"
04320        "movb %%dh,11(%%edi);\n\t"
04321        "popl %%edx;\n\t"
04322        "popl %%ecx;\n\t"
04323 
04324        "addl $12,%%edi;\n\t"
04325        "decl %%ecx;\n\t"
04326        "addl $6, %%esi;\n\t"
04327        "jmp .JA0;\n\t"
04328        ".JA1:;\n\t"
04329        "emms;\n\t"
04330        :
04331        :"S"(src),"D"(dest),"c"(ecx),"d"(coeffs)
04332        :"eax","ebx","memory"
04333        );
04334 
04335 }
04336 
04337 
04338 
04339 
// Horizontal 9-tap low-pass filter over an image of h rows and w columns
// stored row after row in sptr; the result is written to dptr with the
// same layout. The kernel is [1 8 28 56 70 56 28 8 1] / 256. Near the left
// and right borders the taps that fall outside the row are dropped and the
// remaining weights are renormalized by their sum (163, 219, 247, 255).
//
// Despite the sse_ prefix this implementation is plain C.
//
// sptr : source pixels, h * w floats
// dptr : destination pixels, h * w floats (must not overlap sptr)
// h    : number of rows
// w    : number of columns
//
// Rows narrower than 8 pixels cannot use the unrolled border code below
// (it always emits 4 + 4 border pixels and reads up to sptr[7]), so they
// are handled by a generic loop applying the same truncated kernel.
void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w)
{
  if (h <= 0 || w <= 0) return;  // nothing to filter

  if (w < 8)
    {
      static const float taps[9] = {
        1.0F, 8.0F, 28.0F, 56.0F, 70.0F, 56.0F, 28.0F, 8.0F, 1.0F
      };

      for (int j = 0; j < h; j ++)
        {
          for (int i = 0; i < w; i ++)
            {
              float acc = 0.0F;   // weighted sum of the in-row taps
              float norm = 0.0F;  // sum of the weights actually used
              for (int k = -4; k <= 4; k ++)
                {
                  const int x = i + k;
                  if (x < 0 || x >= w) continue;
                  acc += sptr[x] * taps[k + 4];
                  norm += taps[k + 4];
                }
              // norm >= 70 because the center tap is always inside the row
              *dptr++ = acc / norm;
            }
          sptr += w;  // start of next line
        }
      return;
    }

  for (int j = 0; j < h; j ++)
    {
      // leftmost points
      *dptr++ = sptr[0] * (70.0F / 163.0F) +
        sptr[1] * (56.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * ( 8.0F / 163.0F) +
        sptr[4] * ( 1.0F / 163.0F);
      *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) +
        sptr[1] * (70.0F / 219.0F) +
        sptr[3] * (28.0F / 219.0F) +
        sptr[4] * ( 8.0F / 219.0F) +
        sptr[5] * ( 1.0F / 219.0F);
      *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) +
        (sptr[1] + sptr[3]) * (56.0F / 247.0F) +
        sptr[2] * (70.0F / 247.0F) +
        sptr[5] * ( 8.0F / 247.0F) +
        sptr[6] * ( 1.0F / 247.0F);
      *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) +
        (sptr[1] + sptr[5]) * (28.0F / 255.0F) +
        (sptr[2] + sptr[4]) * (56.0F / 255.0F) +
        sptr[3] * (70.0F / 255.0F) +
        sptr[7] * ( 1.0F / 255.0F);

      // far from the borders: sptr[4] is the pixel being computed
      for (int i = 0; i < w - 8; i ++)
        {
          *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) +
            (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) +
            (sptr[2] + sptr[6]) * (28.0F / 256.0F) +
            (sptr[3] + sptr[5]) * (56.0F / 256.0F) +
            sptr[4] * (70.0F / 256.0F);
          sptr ++;
        }

      // rightmost points: sptr now points 8 pixels before the end of the
      // line and sptr[4] stays the pixel being computed as sptr advances
      *dptr++ = sptr[0] * ( 1.0F / 255.0F) +
        (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 255.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 255.0F) +
        sptr[4] * (70.0F / 255.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 247.0F) +
        sptr[1] * ( 8.0F / 247.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 247.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 247.0F) +
        sptr[4] * (70.0F / 247.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 219.0F) +
        sptr[1] * ( 8.0F / 219.0F) +
        sptr[2] * (28.0F / 219.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 219.0F) +
        sptr[4] * (70.0F / 219.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 163.0F) +
        sptr[1] * ( 8.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * (56.0F / 163.0F) +
        sptr[4] * (70.0F / 163.0F);
      sptr += 5;  // sptr back to same as dptr (start of next line)
    }
}
04405 #endif
04406 
04407 //############################################################################
04408 /* So things look consistent in everyone's emacs... */
04409 /* Local Variables: */
04410 /* indent-tabs-mode: nil */
04411 /* End: */
04412 
04413 #endif
04414 
Generated on Sun May 8 08:06:58 2011 for iLab Neuromorphic Vision Toolkit by  doxygen 1.6.3