00001 /*!@file Util/mmx-sse.C -- Optimized implementations of low-level functions 00002 for MMX/SSE */ 00003 00004 // //////////////////////////////////////////////////////////////////// // 00005 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2003 // 00006 // by the University of Southern California (USC) and the iLab at USC. // 00007 // See http://iLab.usc.edu for information about this project. // 00008 // //////////////////////////////////////////////////////////////////// // 00009 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00010 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00011 // in Visual Environments, and Applications'' by Christof Koch and // 00012 // Laurent Itti, California Institute of Technology, 2001 (patent // 00013 // pending; application number 09/912,225 filed July 23, 2001; see // 00014 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00015 // //////////////////////////////////////////////////////////////////// // 00016 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00017 // // 00018 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00019 // redistribute it and/or modify it under the terms of the GNU General // 00020 // Public License as published by the Free Software Foundation; either // 00021 // version 2 of the License, or (at your option) any later version. // 00022 // // 00023 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00024 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00025 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00026 // PURPOSE. See the GNU General Public License for more details. 
// 00027 // // 00028 // You should have received a copy of the GNU General Public License // 00029 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00030 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00031 // Boston, MA 02111-1307 USA. // 00032 // //////////////////////////////////////////////////////////////////// // 00033 // 00034 // Primary maintainer for this file: Nitin Dhavale <dhavale@usc.edu> 00035 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/mmx-sse.C $ 00036 // $Id: mmx-sse.C 10118 2008-08-18 23:51:38Z ilab24 $ 00037 // 00038 00039 #include "Util/mmx-sse.H" 00040 #include "Util/log.H" 00041 00042 // specific types only to the code that is in this file 00043 typedef int int32; 00044 typedef unsigned char byte; 00045 typedef float float32; 00046 00047 #ifdef INVT_CPU_OPTERON 00048 00049 #ifdef INVT_USE_SSE 00050 00051 //###################################################################### 00052 void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz) 00053 { 00054 static int32 rcx= sz>>2; 00055 static int32 rdx= sz & 0x3; 00056 00057 asm ( 00058 "or %%rcx, %%rcx;\n\t" 00059 "jz .AG2;\n\t" 00060 ".AG1:;\n\t" 00061 "movupd 0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 00062 "movupd 0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 00063 "movupd 16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 00064 "movupd 16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 00065 "movupd %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 00066 "movupd %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 00067 "movupd %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 00068 "movupd %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 00069 "subpd %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 00070 "subpd %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4) 00071 "subpd %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0) 00072 "subpd %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. 
(b5-a5) (b4-a4) 00073 "maxpd %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 00074 "maxpd %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 00075 "movupd %%xmm5, 0(%%rbx);\n\t" 00076 "movupd %%xmm7, 16(%%rbx);\n\t" 00077 "add $32, %%rsi;\n\t" 00078 "add $32, %%rdi;\n\t" 00079 "add $32, %%rbx;\n\t" 00080 "loop .AG1;\n\t" 00081 ".AG2:;\n\t" 00082 "mov %%rdx, %%rcx;\n\t" 00083 "or %%rcx, %%rcx;\n\t" 00084 "jz .AG4;\n\t" 00085 ".AG3:;\n\t" 00086 "movsd 0(%%rsi), %%xmm0;\n\t" 00087 "movsd 0(%%rdi), %%xmm1;\n\t" 00088 "movsd %%xmm0, %%xmm2;\n\t" 00089 "movsd %%xmm1, %%xmm3;\n\t" 00090 "subsd %%xmm3, %%xmm2;\n\t" 00091 "subsd %%xmm0, %%xmm1;\n\t" 00092 "maxsd %%xmm2, %%xmm1;\n\t" 00093 "movsd %%xmm1, 0(%%rbx);\n\t" 00094 "add $8, %%rsi;\n\t" 00095 "add $8, %%rdi;\n\t" 00096 "add $8, %%rbx;\n\t" 00097 "loop .AG3;\n\t" 00098 ".AG4:;\n\t" 00099 : 00100 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00101 :"memory" 00102 ); 00103 } 00104 #endif 00105 00106 #ifdef INVT_USE_MMXSSE2 00107 //###################################################################### 00108 // speedup ~= 2.1 00109 void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz) 00110 { 00111 static int32 rcx= sz>>3; 00112 static int32 rdx= sz & 0x7; 00113 00114 asm ( 00115 "or %%rcx, %%rcx;\n\t" 00116 "jz .AE2;\n\t" 00117 ".AE1:;\n\t" 00118 "movups 0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 00119 "movups 0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 00120 "movups 16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 00121 "movups 16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 00122 "movups %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 00123 "movups %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 00124 "movups %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 00125 "movups %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 00126 "subps %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 00127 "subps %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4) 00128 "subps %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. 
(b1-a1) (b0-a0) 00129 "subps %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4) 00130 "maxps %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 00131 "maxps %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 00132 "movups %%xmm5, 0(%%rbx);\n\t" 00133 "movups %%xmm7, 16(%%rbx);\n\t" 00134 "add $32, %%rsi;\n\t" 00135 "add $32, %%rdi;\n\t" 00136 "add $32, %%rbx;\n\t" 00137 "loop .AE1;\n\t" 00138 ".AE2:;\n\t" 00139 "mov %%rdx, %%rcx;\n\t" 00140 "or %%rcx, %%rcx;\n\t" 00141 "jz .AE4;\n\t" 00142 ".AE3:;\n\t" 00143 "movss 0(%%rsi), %%xmm0;\n\t" 00144 "movss 0(%%rdi), %%xmm1;\n\t" 00145 "movss %%xmm0, %%xmm2;\n\t" 00146 "movss %%xmm1, %%xmm3;\n\t" 00147 "subss %%xmm3, %%xmm2;\n\t" 00148 "subss %%xmm0, %%xmm1;\n\t" 00149 "maxss %%xmm2, %%xmm1;\n\t" 00150 "movss %%xmm1, 0(%%rbx);\n\t" 00151 "add $4, %%rsi;\n\t" 00152 "add $4, %%rdi;\n\t" 00153 "add $4, %%rbx;\n\t" 00154 "loop .AE3;\n\t" 00155 ".AE4:;\n\t" 00156 "emms;\n\t" 00157 : 00158 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00159 :"memory" 00160 ); 00161 } 00162 00163 00164 00165 //###################################################################### 00166 // speedup ~= 3.4 00167 void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz) 00168 { 00169 static int32 rcx= sz>>3; 00170 static int32 rdx= sz&0x7; 00171 00172 asm ( 00173 "or %%rcx, %%rcx;\n\t" 00174 "jz .AF2;\n\t" 00175 ".AF1:;\n\t" 00176 "movdqu 0(%%rsi), %%xmm0;\n\t" 00177 "movdqu 0(%%rdi), %%xmm1;\n\t" 00178 "movdqu 16(%%rsi), %%xmm2;\n\t" 00179 "movdqu 16(%%rdi), %%xmm3;\n\t" 00180 "movdqu %%xmm0, %%xmm4;\n\t" 00181 "movdqu %%xmm1, %%xmm5;\n\t" 00182 "movdqu %%xmm2, %%xmm6;\n\t" 00183 "movdqu %%xmm3, %%xmm7;\n\t" 00184 "psubusw %%xmm1, %%xmm0;\n\t" 00185 "psubusw %%xmm3, %%xmm2;\n\t" 00186 "psubusw %%xmm4, %%xmm5;\n\t" 00187 "psubusw %%xmm6, %%xmm7;\n\t" 00188 "pmaxsw %%xmm0, %%xmm5;\n\t" 00189 "pmaxsw %%xmm2, %%xmm7;\n\t" 00190 "movdqu %%xmm5, 0(%%rbx);\n\t" 00191 "movdqu %%xmm7, 16(%%rbx);\n\t" 00192 "add $32, %%rsi;\n\t" 00193 
"add $32, %%rdi;\n\t" 00194 "add $32, %%rbx;\n\t" 00195 "loop .AF1;\n\t" 00196 ".AF2:;\n\t" 00197 "mov %%rdx, %%rcx;\n\t" 00198 "or %%rcx, %%rcx;\n\t" 00199 "jz .AF4;\n\t" 00200 ".AF3:;\n\t" 00201 "mov (%%rsi), %%rax;\n\t" 00202 "mov (%%rdi), %%rdx;\n\t" 00203 "cmp %%rdx, %%rax;\n\t" 00204 "ja .AF5;\n\t" 00205 "xchg %%rax, %%rdx;\n\t" 00206 ".AF5:;\n\t" 00207 "sub %%rdx, %%rax;\n\t" 00208 "mov %%rax, (%%rbx);\n\t" 00209 "add $4, %%rsi;\n\t" 00210 "add $4, %%rdi;\n\t" 00211 "add $4, %%rbx;\n\t" 00212 "loop .AF3;\n\t" 00213 ".AF4:;\n\t" 00214 "emms;\n\t" 00215 : 00216 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00217 :"memory" 00218 ); 00219 } 00220 00221 00222 //###################################################################### 00223 // speedup ~=10.0! 00224 void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz) 00225 { 00226 static int32 rcx= sz>>5; 00227 static int32 rdx= sz&0x1f; 00228 00229 asm ( 00230 "or %%rcx, %%rcx;\n\t" 00231 "jz .AD2;\n\t" 00232 ".AD1:;\n\t" 00233 "movdqu 0(%%rsi), %%xmm0;\n\t" // xmm0<- a15 ... a3 a2 a1 a0 00234 "movdqu 0(%%rdi), %%xmm1;\n\t" // xmm1<- b15 ... b3 b2 b1 b0 00235 "movdqu 16(%%rsi), %%xmm2;\n\t"// xmm2<- a31 ... a18 a17 a16 00236 "movdqu 16(%%rdi), %%xmm3;\n\t"// xmm3<- b31 ... b18 b17 b16 00237 "movdqu %%xmm0, %%xmm4;\n\t" // xmm4<- a15 ... a3 a2 a1 a0 00238 "movdqu %%xmm1, %%xmm5;\n\t" // xmm5<- b15 ... b3 b2 b1 b0 00239 "movdqu %%xmm2, %%xmm6;\n\t" // xmm6<- a31 ... a18 a17 a16 00240 "movdqu %%xmm3, %%xmm7;\n\t" // xmm7<- b31 ... 
b18 b17 b16 00241 "psubusb %%xmm1, %%xmm0;\n\t" // xmm0<-(a15-b15)...( a1-b1 )(a0-b0) 00242 "psubusb %%xmm3, %%xmm2;\n\t" // xmm2<-(a31-b31)...(a17-b17)(a16-b16) 00243 "psubusb %%xmm4, %%xmm5;\n\t" // xmm5<-(b15-a15)...(b17-a17)(b16-a16) 00244 "psubusb %%xmm6, %%xmm7;\n\t" // xmm7<-(b31-a31)...(b17-a17)(b16-a16) 00245 "pmaxub %%xmm0, %%xmm5;\n\t" // xmm5<- max(xmm0,xmm5) 00246 "pmaxub %%xmm2, %%xmm7;\n\t" // xmm7<- max(xmm2,xmm7) 00247 "movdqu %%xmm5, 0(%%rbx);\n\t" 00248 "movdqu %%xmm7, 16(%%rbx);\n\t" 00249 "add $32, %%rsi;\n\t" 00250 "add $32, %%rdi;\n\t" 00251 "add $32, %%rbx;\n\t" 00252 "loop .AD1;\n\t" 00253 ".AD2:;\n\t" 00254 "mov %%rdx, %%rcx;\n\t" 00255 "or %%rcx, %%rcx;\n\t" 00256 "jz .AD4;\n\t" 00257 ".AD3:;\n\t" 00258 "movb (%%rsi), %%al;\n\t" 00259 "movb (%%rdi), %%dl;\n\t" 00260 "cmpb %%dl, %%al;\n\t" 00261 "ja .AD5;\n\t" 00262 "xchgb %%al, %%dl;\n\t" 00263 ".AD5:;\n\t" 00264 "subb %%dl, %%al;\n\t" 00265 "movb %%al, (%%rbx);\n\t" 00266 "inc %%rbx;\n\t" 00267 "inc %%rsi;\n\t" 00268 "inc %%rdi;\n\t" 00269 "loop .AD3;\n\t" 00270 ".AD4:;\n\t" 00271 "emms;\n\t" 00272 : 00273 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00274 :"memory" 00275 ); 00276 } 00277 #endif 00278 00279 #ifdef INVT_USE_SSE 00280 //###################################################################### 00281 // speedup ~= 2.0 00282 void sse_sum(const double *a, double *sum, const int32 sz) 00283 { 00284 static int32 rcx = sz>>3; 00285 static int32 rdx = sz&0x7; 00286 00287 asm ( 00288 "pxor %%xmm4, %%xmm4;\n\t" 00289 "pxor %%xmm5, %%xmm5;\n\t" 00290 "pxor %%xmm6, %%xmm6;\n\t" 00291 "pxor %%xmm7, %%xmm7;\n\t" 00292 "or %%rcx, %%rcx;\n\t" 00293 "jz BE1;\n\t" 00294 ".BE0:\n\t" 00295 "movupd 0(%%rsi), %%xmm0;\n\t" 00296 "movupd 16(%%rsi), %%xmm1;\n\t" 00297 "movupd 32(%%rsi), %%xmm2;\n\t" 00298 "movupd 48(%%rsi), %%xmm3;\n\t" 00299 "addpd %%xmm0, %%xmm4;\n\t" 00300 "addpd %%xmm1, %%xmm5;\n\t" 00301 "addpd %%xmm2, %%xmm6;\n\t" 00302 "addpd %%xmm3, %%xmm7;\n\t" 00303 "add $64, %%rsi;\n\t" 
00304 "loop .BE0;\n\t" 00305 "BE1:;\n\t" 00306 "mov %%rdx, %%rcx;\n\t" 00307 "pxor %%xmm0, %%xmm0;\n\t" 00308 "or %%rcx, %%rcx;\n\t" 00309 "jz BE2;\n\t" 00310 "BE3:;\n\t" 00311 "movupd 0(%%rsi), %%xmm1;\n\t" 00312 "addpd %%xmm1, %%xmm0;\n\t" 00313 "add $16, %%rsi;\n\t" 00314 "loop BE3;\n\t" 00315 "BE2:;\n\t" 00316 "addpd %%xmm4, %%xmm7;\n\t" 00317 "addpd %%xmm5, %%xmm7;\n\t" 00318 "addpd %%xmm6, %%xmm7;\n\t" 00319 "addpd %%xmm7, %%xmm0;\n\t" 00320 "movhpd %%xmm0, (%%rbx);\n\t" 00321 "addsd (%%rbx), %%xmm0;\n\t" 00322 "movsd %%xmm0, (%%rbx);\n\t" 00323 "emms;\n\t" 00324 : 00325 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00326 :"memory" 00327 ); 00328 } 00329 #endif 00330 00331 #ifdef INVT_USE_MMXSSE2 00332 //###################################################################### 00333 //speedup ~= 4 00334 void sse2_sum(const float *a, double *sum, const int32 sz) 00335 { 00336 static int32 rcx = sz>>3; 00337 static int32 rdx = sz & 0x7; 00338 00339 asm ( 00340 "pxor %%xmm4, %%xmm4;\n\t" 00341 "pxor %%xmm5, %%xmm5;\n\t" 00342 "pxor %%xmm6, %%xmm6;\n\t" 00343 "pxor %%xmm7, %%xmm7;\n\t" 00344 "or %%rcx, %%rcx;\n\t" 00345 "jz BA1;\n\t" 00346 ".BA0:\n\t" 00347 "cvtps2pd 0(%%rsi), %%xmm0;\n\t" 00348 "cvtps2pd 8(%%rsi), %%xmm1;\n\t" 00349 "cvtps2pd 16(%%rsi), %%xmm2;\n\t" 00350 "cvtps2pd 24(%%rsi), %%xmm3;\n\t" 00351 "addpd %%xmm0, %%xmm4;\n\t" 00352 "addpd %%xmm1, %%xmm5;\n\t" 00353 "addpd %%xmm2, %%xmm6;\n\t" 00354 "addpd %%xmm3, %%xmm7;\n\t" 00355 "add $32, %%rsi;\n\t" 00356 "loop .BA0;\n\t" 00357 "BA1:;\n\t" 00358 "pxor %%xmm0, %%xmm0;\n\t" 00359 "mov %%rdx, %%rcx;\n\t" 00360 "or %%rcx, %%rcx;\n\t" 00361 "jz BA2;\n\t" 00362 "BA3:;\n\t" 00363 "cvtps2pd 0(%%rsi), %%xmm1;\n\t" 00364 "addpd %%xmm1, %%xmm0;\n\t" 00365 "add $8, %%rsi;\n\t" 00366 "loop BA3;\n\t" 00367 "BA2:;\n\t" 00368 "addpd %%xmm4, %%xmm7;\n\t" 00369 "addpd %%xmm5, %%xmm7;\n\t" 00370 "addpd %%xmm6, %%xmm7;\n\t" 00371 "addpd %%xmm7, %%xmm0;\n\t" 00372 "movhpd %%xmm0, (%%rbx);\n\t" 00373 "addsd (%%rbx), 
%%xmm0;\n\t" 00374 "movsd %%xmm0, (%%rbx);\n\t" 00375 "emms;\n\t" 00376 : 00377 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00378 :"memory" 00379 ); 00380 } 00381 00382 00383 //###################################################################### 00384 // speedup ~= 4.0 00385 void sse2_sum(const int32 *a, double *sum, const int32 sz) 00386 { 00387 static int32 rcx = sz>>3; 00388 static int32 rdx = sz & 0x7; 00389 00390 asm ( 00391 "pxor %%xmm4, %%xmm4;\n\t" 00392 "pxor %%xmm5, %%xmm5;\n\t" 00393 "pxor %%xmm6, %%xmm6;\n\t" 00394 "pxor %%xmm7, %%xmm7;\n\t" 00395 "or %%rcx, %%rcx;\n\t" 00396 ".BC0:\n\t" 00397 "cvtdq2pd 0(%%rsi), %%xmm0;\n\t" 00398 "cvtdq2pd 8(%%rsi), %%xmm1;\n\t" 00399 "cvtdq2pd 16(%%rsi), %%xmm2;\n\t" 00400 "cvtdq2pd 24(%%rsi), %%xmm3;\n\t" 00401 "addpd %%xmm0, %%xmm4;\n\t" 00402 "addpd %%xmm1, %%xmm5;\n\t" 00403 "addpd %%xmm2, %%xmm6;\n\t" 00404 "addpd %%xmm3, %%xmm7;\n\t" 00405 "add $32, %%rsi;\n\t" 00406 "loop .BC0;\n\t" 00407 "BC1:;\n\t" 00408 "pxor %%xmm0, %%xmm0;\n\t" 00409 "mov %%rdx, %%rcx;\n\t" 00410 "or %%rcx, %%rcx;\n\t" 00411 "jz BC2;\n\t" 00412 "BC3:;\n\t" 00413 "cvtdq2pd 0(%%rsi), %%xmm1;\n\t" 00414 "addpd %%xmm1, %%xmm0;\n\t" 00415 "add $8, %%rsi;\n\t" 00416 "loop BC3;\n\t" 00417 "BC2:;\n\t" 00418 "addpd %%xmm4, %%xmm7;\n\t" 00419 "addpd %%xmm5, %%xmm7;\n\t" 00420 "addpd %%xmm6, %%xmm7;\n\t" 00421 "addpd %%xmm7, %%xmm0;\n\t" 00422 "movhpd %%xmm0, (%%rbx);\n\t" 00423 "addsd (%%rbx), %%xmm0;\n\t" 00424 "movsd %%xmm0, (%%rbx);\n\t" 00425 "emms;\n\t" 00426 : 00427 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00428 :"memory" 00429 ); 00430 } 00431 00432 00433 00434 //###################################################################### 00435 void sse2_sum(const byte *a, double *sum, const int32 sz) 00436 { 00437 static int rcx = sz>>5; 00438 static int rdx = sz & 0x1f; 00439 00440 asm ( 00441 "or %%rcx, %%rcx;\n\t" 00442 "jz BB1;\n\t" 00443 "pxor %%xmm7, %%xmm7;\n\t" 00444 "push %%rbx;\n\t" 00445 "push %%rdx;\n\t" 00446 "BB3:;\n\t" 00447 "pxor %%xmm5, 
%%xmm5;\n\t" 00448 "pxor %%xmm6, %%xmm6;\n\t" 00449 "movdqu (%%rsi), %%xmm0;\n\t" 00450 "movdqu 16(%%rsi), %%xmm1;\n\t" 00451 "psadbw %%xmm0, %%xmm5;\n\t" 00452 "psadbw %%xmm1, %%xmm6;\n\t" 00453 "pextrw $0, %%xmm5, %%rax;\n\t" 00454 "cvtsi2sd %%rax, %%xmm0;\n\t" 00455 "pextrw $4, %%xmm5, %%rbx;\n\t" 00456 "cvtsi2sd %%rbx, %%xmm1;\n\t" 00457 "pextrw $0, %%xmm6, %%rdx;\n\t" 00458 "cvtsi2sd %%rdx, %%xmm2;\n\t" 00459 "pextrw $4, %%xmm6, %%rdi;\n\t" 00460 "cvtsi2sd %%rdi, %%xmm3;\n\t" 00461 "addsd %%xmm0, %%xmm1;\n\t" 00462 "addsd %%xmm2, %%xmm3;\n\t" 00463 "addsd %%xmm1, %%xmm7;\n\t" 00464 "addsd %%xmm3, %%xmm7;\n\t" 00465 "add $32, %%rsi;\n\t" 00466 "loop BB3;\n\t" 00467 "pop %%rdx;\n\t" 00468 "pop %%rbx;\n\t" 00469 "BB1:;\n\t" 00470 "xor %%rdi, %%rdi;\n\t" 00471 "mov %%rdx, %%rcx;\n\t" 00472 "or %%rcx, %%rcx;\n\t" 00473 "jz BB2;\n\t" 00474 "BB5:;\n\t" 00475 "xor %%rax, %%rax;\n\t" 00476 "movb (%%rsi), %%al;\n\t" 00477 "add %%rax, %%rdi;\n\t" 00478 "inc %%rsi;\n\t" 00479 "loop BB5;\n\t" 00480 "BB2:\n\t" 00481 "cvtsi2sd %%rdi, %%xmm0;\n\t" 00482 "addsd %%xmm0, %%xmm7;\n\t" 00483 "movhpd %%xmm7, (%%rbx);\n\t" 00484 "addsd (%%rbx), %%xmm7;\n\t" 00485 "movsd %%xmm7, (%%rbx);\n\t" 00486 "BB6:;\n\t" 00487 "emms;\n\t" 00488 : 00489 :"S"(a), "c"(rcx),"b"(sum),"d"(rdx) 00490 :"memory","rax","rdi" 00491 ); 00492 } 00493 #endif 00494 00495 #ifdef INVT_USE_SSE 00496 //###################################################################### 00497 // speedup ~= 10 ! 
00498 void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz) 00499 { 00500 int rcx = sz >> 6; 00501 int rdx = sz & 0x7f; 00502 00503 asm ( 00504 "or %%rcx, %%rcx;\n\t" 00505 "jz .DA0;\n\t" 00506 ".DA1:;\n\t" 00507 "movdqu (%%rsi), %%xmm0;\n\t" 00508 "movdqu (%%rdi), %%xmm4;\n\t" 00509 "movdqu 16(%%rsi), %%xmm1;\n\t" 00510 "movdqu 16(%%rdi), %%xmm5;\n\t" 00511 "movdqu 32(%%rsi), %%xmm2;\n\t" 00512 "movdqu 32(%%rdi), %%xmm6;\n\t" 00513 "movdqu 48(%%rsi), %%xmm3;\n\t" 00514 "movdqu 48(%%rdi), %%xmm7;\n\t" 00515 "psubusb %%xmm4, %%xmm0;\n\t" 00516 "psubusb %%xmm5, %%xmm1;\n\t" 00517 "psubusb %%xmm6, %%xmm2;\n\t" 00518 "psubusb %%xmm7, %%xmm3;\n\t" 00519 "movdqu %%xmm0, 0(%%rbx);\n\t" 00520 "movdqu %%xmm1, 16(%%rbx);\n\t" 00521 "movdqu %%xmm2, 32(%%rbx);\n\t" 00522 "movdqu %%xmm3, 48(%%rbx);\n\t" 00523 "add $64, %%rsi;\n\t" 00524 "add $64, %%rdi;\n\t" 00525 "add $64, %%rbx;\n\t" 00526 "loop .DA1;\n\t" 00527 ".DA0:;\n\t" 00528 "mov %%rdx, %%rcx;\n\t" 00529 "or %%rcx, %%rcx;\n\t" 00530 "jz .DA2;\n\t" 00531 ".DA3:;\n\t" 00532 "movb (%%rsi), %%al;\n\t" 00533 "movb (%%rdi), %%dl;\n\t" 00534 "cmpb %%bl, %%al;\n\t" 00535 "ja .DA4;\n\t" 00536 "xchg %%al, %%bl;\n\t" 00537 ".DA4:;\n\t" 00538 "subb %%bl, %%al;\n\t" 00539 "movb %%al, (%%rbx);\n\t" 00540 "inc %%rsi;\n\t" 00541 "inc %%rdi;\n\t" 00542 "inc %%rbx;\n\t" 00543 "loop .DA3;\n\t" 00544 ".DA2:;\n\t" 00545 "emms;\n\t" 00546 : 00547 :"S"(a),"D"(b),"c"(rcx),"d"(rdx),"b"(result) 00548 ); 00549 } 00550 00551 00552 //###################################################################### 00553 // speedup ~= 20 ! 
00554 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result, 00555 const int32 sz) 00556 { 00557 int32 rcx=sz>>5; 00558 int32 rdx=sz&0x1f; 00559 00560 asm ( 00561 "or %%rcx, %%rcx;\n\t" 00562 "jz .DB0;\n\t" 00563 ".DB1:;\n\t" 00564 "movups 0(%%rsi), %%xmm0;\n\t" 00565 "movups 0(%%rdi), %%xmm1;\n\t" 00566 "movups 16(%%rsi), %%xmm2;\n\t" 00567 "movups 16(%%rdi), %%xmm3;\n\t" 00568 "movups %%xmm1, %%xmm6;\n\t" 00569 "movups %%xmm3, %%xmm7;\n\t" 00570 "cmpps $1, %%xmm0, %%xmm6;\n\t" 00571 "cmpps $1, %%xmm2, %%xmm7;\n\t" 00572 "subps %%xmm1, %%xmm0;\n\t" 00573 "subps %%xmm3, %%xmm2;\n\t" 00574 "andps %%xmm6, %%xmm0;\n\t" 00575 "andps %%xmm7, %%xmm2;\n\t" 00576 "movups %%xmm0, (%%rbx);\n\t" 00577 "movups %%xmm2, 16(%%rbx);\n\t" 00578 "add $32, %%rsi;\n\t" 00579 "add $32, %%rdi;\n\t" 00580 "add $32, %%rbx;\n\t" 00581 "loop .DB1;\n\t" 00582 ".DB0:;\n\t" 00583 "mov %%rdx, %%rcx;\n\t" 00584 "or %%rcx, %%rcx;\n\t" 00585 "jz .DB2;\n\t" 00586 ".DB3:;\n\t" 00587 "movss (%%rsi), %%xmm0;\n\t" 00588 "movss (%%rdi), %%xmm1;\n\t" 00589 "movss %%xmm1, %%xmm2;\n\t" 00590 "cmpss $1, %%xmm0, %%xmm2;\n\t" 00591 "andps %%xmm2, %%xmm0;\n\t" 00592 "andps %%xmm2, %%xmm1;\n\t" 00593 "subss %%xmm1, %%xmm0;\n\t" 00594 "movss %%xmm0, (%%rbx);\n\t" 00595 "add $4, %%rsi;\n\t" 00596 "add $4, %%rdi;\n\t" 00597 "add $4, %%rbx;\n\t" 00598 "loop .DB3;\n\t" 00599 ".DB2:;\n\t" 00600 : 00601 :"S"(a), "D"(b), "b"(result), "c"(rcx), "d"(rdx) 00602 :"memory" 00603 ); 00604 } 00605 00606 00607 //###################################################################### 00608 // speedup ~= 3 00609 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz) 00610 { 00611 int32 rcx=sz>>3; 00612 int32 rdx=sz&0x7; 00613 asm ( 00614 "or %%rcx, %%rcx;\n\t" 00615 "jz .DC0;\n\t" 00616 ".DC1:;\n\t" 00617 "movdqu 0(%%rsi), %%xmm0;\n\t" //xmm0= a3 a2 a1 a0 00618 "movdqu 0(%%rdi), %%xmm1;\n\t" //xmm1= b3 b2 b1 b0 00619 "movdqu 16(%%rsi), %%xmm3;\n\t"//xmm3= a7 a6 a5 a4 00620 "movdqu 
16(%%rdi), %%xmm4;\n\t"//xmm4= b7 b6 b5 b4 00621 "movdqu %%xmm0, %%xmm2;\n\t" //xmm2= a3 a2 a1 a0 00622 "movdqu %%xmm3, %%xmm5;\n\t" //xmm5= a7 a6 a5 a4 00623 "pcmpgtd %%xmm1, %%xmm2;\n\t" //xmm2=(a3>b3)(a2>b2)(a1>b1)(a0>b0) 00624 "pcmpgtd %%xmm4, %%xmm5;\n\t" //xmm5=(a7>b7)(a6>b6)(b5>a5)(a4>b4) 00625 "psubd %%xmm1, %%xmm0;\n\t" //xmm0=(a3-b3)(a2-b2)(a1-b1)(a0-b0) 00626 "psubd %%xmm4, %%xmm3;\n\t" //xmm3=(a7-b7)(a6-b6)(a5-b5)(a4-b4) 00627 "pand %%xmm2, %%xmm0;\n\t" 00628 "pand %%xmm5, %%xmm3;\n\t" 00629 "movdqu %%xmm0, (%%rbx);\n\t" 00630 "movdqu %%xmm3, 16(%%rbx);\n\t" 00631 "add $32, %%rsi;\n\t" 00632 "add $32, %%rdi;\n\t" 00633 "add $32, %%rbx;\n\t" 00634 "loop .DC1;\n\t" 00635 ".DC0:;\n\t" 00636 "mov %%rdx, %%rcx;\n\t" 00637 "or %%rcx, %%rcx;\n\t" 00638 "jz .DC2;\n\t" 00639 ".DC3:;\n\t" 00640 "movsd 0(%%rsi), %%xmm0;\n\t" 00641 "movsd 0(%%rdi), %%xmm1;\n\t" 00642 "movdqu %%xmm0, %%xmm2;\n\t" 00643 "pcmpgtd %%xmm1, %%xmm2;\n\t" 00644 "psubd %%xmm1, %%xmm0;\n\t" 00645 "pand %%xmm2, %%xmm0;\n\t" 00646 "movsd %%xmm0, (%%rbx);\n\t" 00647 "add $4, %%rsi;\n\t" 00648 "add $4, %%rdi;\n\t" 00649 "add $4, %%rbx;\n\t" 00650 "loop .DC3;\n\t" 00651 ".DC2:;\n\t" 00652 : 00653 :"S"(a), "D"(b), "c"(rcx), "d"(rdx), "b"(c) 00654 :"memory" 00655 ); 00656 } 00657 00658 00659 //###################################################################### 00660 // speedup ~= 4-5 00661 void sse_binaryReverse(const byte *a, byte *result, const byte val, const 00662 int32 sz) 00663 { 00664 static unsigned int rcx=(sz>>7); 00665 static unsigned int rdx=sz&0x7f; 00666 00667 byte pVal[16]; 00668 00669 memset(result, val, 16); 00670 00671 asm ( 00672 "or %%rcx, %%rcx;\n\t" 00673 "jz .FA0;\n\t" 00674 ".FA1:;\n\t" 00675 "movdqu 0(%%rbx), %%xmm0;\n\t" 00676 "movdqu 0(%%rbx), %%xmm1;\n\t" 00677 "movdqu %%xmm0, %%xmm2;\n\t" 00678 "movdqu %%xmm1, %%xmm3;\n\t" 00679 "movdqu %%xmm0, %%xmm4;\n\t" 00680 "movdqu %%xmm1, %%xmm5;\n\t" 00681 "movdqu %%xmm0, %%xmm6;\n\t" 00682 "movdqu %%xmm1, %%xmm7;\n\t" 00683 
"psubb (%%rsi), %%xmm0;\n\t" 00684 "psubb 16(%%rsi), %%xmm1;\n\t" 00685 "psubb 32(%%rsi), %%xmm2;\n\t" 00686 "psubb 48(%%rsi), %%xmm3;\n\t" 00687 "psubb 64(%%rsi), %%xmm4;\n\t" 00688 "psubb 80(%%rsi), %%xmm5;\n\t" 00689 "psubb 96(%%rsi), %%xmm6;\n\t" 00690 "psubb 112(%%rsi), %%xmm7;\n\t" 00691 "movdqu %%xmm0, (%%rdi);\n\t" 00692 "movdqu %%xmm1, 16(%%rdi);\n\t" 00693 "movdqu %%xmm2, 32(%%rdi);\n\t" 00694 "movdqu %%xmm3, 48(%%rdi);\n\t" 00695 "movdqu %%xmm4, 64(%%rdi);\n\t" 00696 "movdqu %%xmm5, 80(%%rdi);\n\t" 00697 "movdqu %%xmm6, 96(%%rdi);\n\t" 00698 "movdqu %%xmm7, 112(%%rdi);\n\t" 00699 "add $128, %%rdi;\n\t" 00700 "add $128, %%rsi;\n\t" 00701 "loop .FA1;\n\t" 00702 ".FA0:;\n\t" 00703 "mov %%rdx, %%rcx;\n\t" 00704 "or %%rcx, %%rcx;\n\t" 00705 "jz .FA2;\n\t" 00706 "movb (%%rbx), %%dl;\n\t" 00707 ".FA3:;\n\t" 00708 "movb %%dl, %%dh;\n\t" 00709 "movb (%%rsi), %%al;\n\t" 00710 "subb %%al, %%dh;\n\t" 00711 "movb %%dh, (%%rdi);\n\t" 00712 "inc %%rsi;\n\t" 00713 "inc %%rdi;\n\t" 00714 "loop .FA3;\n\t" 00715 ".FA2:;\n\t" 00716 : 00717 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00718 :"memory","rax" 00719 ); 00720 } 00721 00722 00723 //###################################################################### 00724 // speedup ~= 2 00725 void sse_binaryReverse(const float *a, float *result, const float val, 00726 const int sz) 00727 { 00728 static unsigned int rcx = sz>>5; 00729 static unsigned int rdx = sz&0x1f; 00730 int i; 00731 float pVal[16]; 00732 00733 for(i=0;i<16;++i) 00734 pVal[i] = val; 00735 00736 00737 asm ( 00738 "or %%rcx, %%rcx;\n\t" 00739 "jz .FB4;\n\t" 00740 ".FB2:;\n\t" 00741 "movups (%%rbx), %%xmm0;\n\t" 00742 "movups (%%rbx), %%xmm1;\n\t" 00743 "movups %%xmm0, %%xmm2;\n\t" 00744 "movups %%xmm1, %%xmm3;\n\t" 00745 "movups %%xmm0, %%xmm4;\n\t" 00746 "movups %%xmm1, %%xmm5;\n\t" 00747 "movups %%xmm0, %%xmm6;\n\t" 00748 "movups %%xmm1, %%xmm7;\n\t" 00749 "psubq (%%rsi), %%xmm0;\n\t" 00750 "psubq 16(%%rsi), %%xmm1;\n\t" 00751 "psubq 32(%%rsi), 
%%xmm2;\n\t" 00752 "psubq 48(%%rsi), %%xmm3;\n\t" 00753 "psubq 64(%%rsi), %%xmm4;\n\t" 00754 "psubq 80(%%rsi), %%xmm5;\n\t" 00755 "psubq 96(%%rsi), %%xmm6;\n\t" 00756 "psubq 112(%%rsi), %%xmm7;\n\t" 00757 "movups %%xmm0, 0(%%rdi);\n\t" 00758 "movups %%xmm1, 16(%%rdi);\n\t" 00759 "movups %%xmm2, 32(%%rdi);\n\t" 00760 "movups %%xmm3, 48(%%rdi);\n\t" 00761 "movups %%xmm4, 64(%%rdi);\n\t" 00762 "movups %%xmm5, 80(%%rdi);\n\t" 00763 "movups %%xmm6, 96(%%rdi);\n\t" 00764 "movups %%xmm7,112(%%rdi);\n\t" 00765 "add $128, %%rsi;\n\t" 00766 "add $128, %%rdi;\n\t" 00767 "loop .FB2;\n\t" 00768 ".FB4:\n\t" 00769 "or %%rdx, %%rdx;\n\t" 00770 "jz .FB1;\n\t" 00771 "mov %%rdx, %%rcx;\n\t" 00772 ".FB3:;\n\t" 00773 "movss 0(%%rbx), %%xmm0;\n\t" 00774 "subss (%%rsi), %%xmm0;\n\t" 00775 "movups %%xmm0, (%%rdi);\n\t" 00776 "add $16, %%rsi;\n\t" 00777 "add $16, %%rdi;\n\t" 00778 "loop .FB3;\n\t" 00779 ".FB1:;\n\t" 00780 : 00781 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00782 :"memory","rax" 00783 ); 00784 } 00785 00786 00787 00788 //###################################################################### 00789 00790 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val, 00791 const int32 sz) 00792 { 00793 int32 rcx=sz>>5; 00794 int32 rdx=sz&31; 00795 int32 pVal[16]; 00796 int i; 00797 00798 for(i=0;i<16;++i) pVal[i] = val; 00799 00800 asm ( 00801 "or %%rcx, %%rcx;\n\t" 00802 "jz .FC4;\n\t" 00803 ".FC2:;\n\t" 00804 "movdqu (%%rbx), %%xmm0;\n\t" 00805 "movdqu (%%rbx), %%xmm1;\n\t" 00806 "movdqu %%xmm0, %%xmm2;\n\t" 00807 "movdqu %%xmm1, %%xmm3;\n\t" 00808 "movdqu %%xmm0, %%xmm4;\n\t" 00809 "movdqu %%xmm1, %%xmm5;\n\t" 00810 "movdqu %%xmm0, %%xmm6;\n\t" 00811 "movdqu %%xmm1, %%xmm7;\n\t" 00812 "psubd (%%rsi), %%xmm0;\n\t" 00813 "psubd 16(%%rsi), %%xmm1;\n\t" 00814 "psubd 32(%%rsi), %%xmm2;\n\t" 00815 "psubd 48(%%rsi), %%xmm3;\n\t" 00816 "psubd 64(%%rsi), %%xmm4;\n\t" 00817 "psubd 80(%%rsi), %%xmm5;\n\t" 00818 "psubd 96(%%rsi), %%xmm6;\n\t" 00819 "psubd 112(%%rsi), 
%%xmm7;\n\t" 00820 "movdqu %%xmm0, 0(%%rdi);\n\t" 00821 "movdqu %%xmm1, 16(%%rdi);\n\t" 00822 "movdqu %%xmm2, 32(%%rdi);\n\t" 00823 "movdqu %%xmm3, 48(%%rdi);\n\t" 00824 "movdqu %%xmm4, 64(%%rdi);\n\t" 00825 "movdqu %%xmm5, 80(%%rdi);\n\t" 00826 "movdqu %%xmm6, 96(%%rdi);\n\t" 00827 "movdqu %%xmm7,112(%%rdi);\n\t" 00828 "add $128, %%rsi;\n\t" 00829 "add $128, %%rdi;\n\t" 00830 "loop .FC2;\n\t" 00831 ".FC4:;\n\t" 00832 "or %%rdx, %%rdx;\n\t" 00833 "jz .FC1;\n\t" 00834 "mov %%rdx, %%rcx;\n\t" 00835 ".FC3:;\n\t" 00836 "movdqu 0(%%rbx), %%xmm0;\n\t" 00837 "psubd (%%rsi), %%xmm0;\n\t" 00838 "movups %%xmm0, (%%rdi);\n\t" 00839 "add $16, %%rsi;\n\t" 00840 "add $16, %%rdi;\n\t" 00841 "loop .FC3;\n\t" 00842 ".FC1:;\n\t" 00843 : 00844 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00845 :"memory","rax" 00846 ); 00847 } 00848 00849 00850 00851 //###################################################################### 00852 00853 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz) 00854 { 00855 int32 rcx=sz>>4; 00856 int32 rdx=sz&0xf; 00857 00858 asm( 00859 "or %%rcx, %%rcx;\n\t" 00860 "jz .GA4;\n\t" 00861 "pxor %%xmm0, %%xmm0;\n\t" 00862 ".GA2:;\n\t" 00863 "movdqu 0(%%rsi), %%xmm1;\n\t" 00864 "movdqa %%xmm1, %%xmm2;\n\t" 00865 "movdqa %%xmm1, %%xmm3;\n\t" 00866 "movdqa %%xmm1, %%xmm4;\n\t" 00867 "psrldq $4, %%xmm2;\n\t" 00868 "psrldq $8, %%xmm3;\n\t" 00869 "psrldq $12, %%xmm4;\n\t" 00870 "punpcklbw %%xmm0, %%xmm1;\n\t" 00871 "punpcklbw %%xmm0, %%xmm2;\n\t" 00872 "punpcklbw %%xmm0, %%xmm3;\n\t" 00873 "punpcklbw %%xmm0, %%xmm4;\n\t" 00874 "punpcklbw %%xmm0, %%xmm1;\n\t" 00875 "punpcklbw %%xmm0, %%xmm2;\n\t" 00876 "punpcklbw %%xmm0, %%xmm3;\n\t" 00877 "punpcklbw %%xmm0, %%xmm4;\n\t" 00878 "movdqu %%xmm1, (%%rdi);\n\t" 00879 "movdqu %%xmm2, 16(%%rdi);\n\t" 00880 "movdqu %%xmm3, 32(%%rdi);\n\t" 00881 "movdqu %%xmm4, 48(%%rdi);\n\t" 00882 "add $16, %%rsi;\n\t" 00883 "add $64, %%rdi;\n\t" 00884 "loop .GA2;\n\t" 00885 ".GA4:;\n\t" 00886 "or %%rdx, %%rdx;\n\t" 00887 "jz 
.GA1;\n\t" 00888 "mov %%rdx, %%rcx;\n\t" 00889 ".GA3:;\n\t" 00890 "xor %%rax, %%rax;\n\t" 00891 "movb (%%rsi), %%al;\n\t" 00892 "mov %%rax, (%%rdi);\n\t" 00893 "inc %%rsi;\n\t" 00894 "add $4, %%rdi;\n\t" 00895 "loop .GA3;\n\t" 00896 ".GA1:;" 00897 : 00898 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 00899 :"memory" 00900 ); 00901 00902 00903 }

#endif

#ifdef INVT_USE_MMXSSE2

#include <limits.h>
#include <math.h>

//######################################################################
//! Convert sz unsigned bytes to single-precision floats: b[i] = a[i]
/*! @param a  input array, sz bytes
    @param b  output array, sz floats
    @param sz number of elements; nothing is done when sz <= 0

    Reads exactly sz bytes from a and writes exactly sz floats to b. */
void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float32)a[i];
}



//######################################################################
//! Convert sz unsigned bytes to doubles: b[i] = a[i]
/*! @param a  input array, sz bytes
    @param b  output array, sz doubles
    @param sz number of elements; nothing is done when sz <= 0 */
void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}



//######################################################################
//! Convert sz 32-bit signed integers to floats: b[i] = (float)a[i]
/*! @param a  input array, sz integers
    @param b  output array, sz floats
    @param sz number of elements; nothing is done when sz <= 0

    Values that are not exactly representable as float are rounded by
    the integer-to-float conversion. */
void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float)a[i];
}

//######################################################################
//! Convert sz 32-bit signed integers to doubles: b[i] = (double)a[i]
/*! @param a  input array, sz integers
    @param b  output array, sz doubles
    @param sz number of elements; nothing is done when sz <= 0 */
void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}

//######################################################################
//! Round one float to int, mirroring the packed cvtps2dq conversion
/*! The value is rounded with lrintf(), i.e. according to the current
    floating-point rounding mode. NaN and values outside
    [-2^31, 2^31) cannot be represented and yield INT_MIN. */
static int cvt_one_float_to_int(const float v)
{
  // written as a negated range test so that NaN also takes this branch
  if (!(v >= -2147483648.0F && v < 2147483648.0F))
    return INT_MIN;

  return (int)lrintf(v);
}

//######################################################################
//! Convert sz floats to rounded 32-bit integers
/*! @param a  input array, sz floats
    @param b  output array, sz ints
    @param sz number of elements; nothing is done when sz <= 0

    Each element is converted independently with cvt_one_float_to_int(),
    so exactly sz elements are read and exactly sz elements are
    written. */
void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = cvt_one_float_to_int(a[i]);
}



//######################################################################
//! Widen sz floats to doubles: b[i] = (double)a[i]
/*! @param a  input array, sz floats
    @param b  output array, sz doubles
    @param sz number of elements; nothing is done when sz <= 0 */
void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}

#endif

#ifdef INVT_USE_SSE

#include <string.h>

//######################################################################
//! Horizontal 3-tap low-pass filter of an h x w row-major float image
/*! @param a input image, h*w floats
    @param b output image, h*w floats
    @param h number of rows
    @param w number of columns

    Kernel [1 2 1]/4 in the interior and [2 1]/3 (resp. [1 2]/3) at the
    left (resp. right) border. With fewer than two columns there is
    nothing to smooth and the input is copied. Nothing is done when
    h <= 0 or w <= 0. */
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      memcpy(b, a, (size_t)h * (size_t)w * sizeof(b[0]));
      return; // nothing to smooth
    }

  const float *sptr = a;
  float *dptr = b;

  for (int j = 0; j < h; ++j)
    {
      // leftmost point  [ (2^) 1 ] / 3
      dptr[0] = ((sptr[0] + sptr[0]) + sptr[1]) / 3.0F;

      // interior points  [ 1 (2^) 1 ] / 4
      for (int i = 1; i < w - 1; ++i)
        dptr[i] =
          (((sptr[i - 1] + sptr[i]) + sptr[i]) + sptr[i + 1]) / 4.0F;

      // rightmost point  [ 1 (2^) ] / 3
      dptr[w - 1] = (sptr[w - 2] + (sptr[w - 1] + sptr[w - 1])) / 3.0F;

      sptr += w;
      dptr += w;
    }
}




//######################################################################
//! Vertical 3-tap low-pass filter of an h x w row-major float image
/*! @param a input image, h*w floats
    @param b output image, h*w floats
    @param h number of rows
    @param w number of columns

    Kernel [1 2 1]/4 in the interior rows and [2 1]/3 (resp. [1 2]/3)
    on the top (resp. bottom) row. With fewer than two rows there is
    nothing to smooth and the input is copied. Nothing is done when
    h <= 0 or w <= 0. */
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (h < 2)
    {
      memcpy(b, a, (size_t)w * (size_t)h * sizeof(b[0]));
      return; // nothing to smooth
    }

  const float *sptr = a;
  float *dptr = b;

  // top row  ( [ (2^) 1 ] / 3 )^T
  for (int i = 0; i < w; ++i)
    dptr[i] = ((sptr[i] + sptr[i]) + sptr[i + w]) / 3.0F;
  dptr += w;

  // interior rows  ( [ 1 (2^) 1 ] / 4 )^T ; sptr addresses the row
  // above the one being produced
  for (int j = 1; j < h - 1; ++j)
    {
      for (int i = 0; i < w; ++i)
        dptr[i] = ((sptr[i] + sptr[i + w]) +
                   (sptr[i + 2 * w] + sptr[i + w])) / 4.0F;
      sptr += w;
      dptr += w;
    }

  // bottom row  ( [ 1 (2^) ] / 3 )^T ; sptr is now at row h-2
  for (int i = 0; i < w; ++i)
    dptr[i] = (sptr[i] + (sptr[i + w] + sptr[i + w])) / 3.0F;
}


//######################################################################
//! Horizontal 5-tap low-pass filter of an h x w row-major float image
/*! @param src  input image, h*w floats
    @param dest output image, h*w floats
    @param h    number of rows
    @param w    number of columns

    Kernel [1 4 6 4 1]/16 in the interior. Near the borders the kernel
    is truncated and renormalised ([6 4 1]/11 and [4 6 4 1]/15); rows of
    width 2 and 3 use [6 4]/10 and [6 4 1]/11, [4 6 4]/14 respectively.
    With fewer than two columns the input is copied. Nothing is done
    when h <= 0 or w <= 0. */
void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  const float *sptr= src;
  float *dptr= dest;

  if(w<2)
    {
      memcpy(dest,src,(size_t)h*(size_t)w*sizeof(dest[0]));
      return;
    }

  if (w == 2) //////////////////////////////////////////////////
    for (int j = 0; j < h; j ++)
      {
        // leftmost point  [ (6^) 4 ] / 10
        *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);

        // rightmost point  [ 4^ (6) ] / 10
        *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);

        sptr += 2;  // sptr back to same position as dptr
      }
  else if (w == 3) //////////////////////////////////////////////////
    for (int j = 0; j < h; j ++)
      {
        // leftmost point  [ (6^) 4 1 ] / 11
        *dptr++ = sptr[0] * (6.0F / 11.0F) +
          sptr[1] * (4.0F / 11.0F) +
          sptr[2] * (1.0F / 11.0F);

        // middle point    [ 4^ (6) 4 ] / 14
        *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
          sptr[1] * (6.0F / 14.0F);

        // rightmost point  [ 1^ 4 (6) ] / 11
        *dptr++ = sptr[0] * (1.0F / 11.0F) +
          sptr[1] * (4.0F / 11.0F) +
          sptr[2] * (6.0F / 11.0F);

        sptr += 3;  // sptr back to same position as dptr
      }
  else ///////////////////////////////// general case for width >= 4
    {
      // kernel weights, evaluated in double and then narrowed to float
      const float k6_11 = 6.0/11.0, k4_11 = 4.0/11.0, k1_11 = 1.0/11.0;
      const float k6_15 = 6.0/15.0, k4_15 = 4.0/15.0, k1_15 = 1.0/15.0;
      const float k6_16 = 6.0/16.0, k4_16 = 4.0/16.0, k1_16 = 1.0/16.0;

      for (int j = 0; j < h; j ++)
        {
          // leftmost point  [ (6^) 4 1 ] / 11
          dptr[0] = (sptr[0] * k6_11 + sptr[1] * k4_11) + sptr[2] * k1_11;

          // second point    [ 4^ (6) 4 1 ] / 15
          dptr[1] = ((sptr[0] + sptr[2]) * k4_15 + sptr[1] * k6_15) +
            sptr[3] * k1_15;

          // interior points [ 1 4 (6^) 4 1 ] / 16
          for (int i = 2; i < w - 2; i ++)
            dptr[i] = ((sptr[i - 2] + sptr[i + 2]) * k1_16 +
                       (sptr[i - 1] + sptr[i + 1]) * k4_16) +
              sptr[i] * k6_16;

          // second to last point  [ 1 4 (6^) 4 ] / 15
          dptr[w - 2] = (sptr[w - 4] * k1_15 +
                         (sptr[w - 3] + sptr[w - 1]) * k4_15) +
            sptr[w - 2] * k6_15;

          // rightmost point  [ 1 4 (6^) ] / 11
          dptr[w - 1] = (sptr[w - 3] * k1_11 + sptr[w - 2] * k4_11) +
            sptr[w - 1] * k6_11;

          sptr += w;
          dptr += w;
        }
    }

}



//######################################################################
//! Vertical 5-tap low-pass filter of an h x w row-major float image
/*! @param src  input image, h*w floats
    @param dest output image, h*w floats
    @param h    number of rows
    @param w    number of columns

    Kernel [1 4 6 4 1]/16 in the interior rows. Near the top and bottom
    the kernel is truncated and renormalised ([6 4 1]/11 and
    [4 6 4 1]/15); images of height 2 and 3 use [6 4]/10 and
    [6 4 1]/11, [4 6 4]/14 respectively. With fewer than two rows the
    input is copied. Nothing is done when h <= 0 or w <= 0. */
void sse_lowPass5y(const float *src, float *dest, const int h,
                   const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (h < 2)
    {
      memcpy(dest, src, (size_t)h * (size_t)w * sizeof(dest[0]));
      return; // nothing to smooth
    }

  const float *sptr= src;
  float *dptr= dest;

  // ########## vertical pass  (even though we scan horiz for speedup)
  const int w2 = w * 2; // speedup

  if (h == 2) //////////////////////////////////////////////////
    {
      // topmost points  ( [ (6^) 4 ] / 10 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[0] * (6.0F / 10.0F) +
            sptr[w] * (4.0F / 10.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // bottommost points  ( [ 4^ (6) ] / 10 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[0] * (4.0F / 10.0F) +
            sptr[w] * (6.0F / 10.0F);
          sptr++;
        }
    }
  else if (h == 3) //////////////////////////////////////////////////
    {
      // topmost points  ( [ (6^) 4 1 ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (1.0F / 11.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // middle points  ( [ 4^ (6) 4 ] / 14 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) +
            sptr[ w] * (6.0F / 14.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (6.0F / 11.0F);
          sptr++;
        }
    }
  else  ///////////////////////////////// general case for height >= 4
    {
      const int w3 = w * 3;
      const int w4 = w * 4;

      // topmost points  ( [ (6^) 4 1 ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (1.0F / 11.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // second topmost points  ( [ 4^ (6) 4 1 ] / 15 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 15.0F) +
            sptr[ w] * (6.0F / 15.0F) +
            sptr[w3] * (1.0F / 15.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // interior rows  ( [ 1^ 4 (6) 4 1 ] / 16 )^T ; sptr addresses the
      // row two above the one being produced and is not rewound
      for (int j = 0; j < h - 4; j ++)
        for (int i = 0; i < w; i ++)
          {
            *dptr++ = (sptr[ 0] + sptr[w4]) * (1.0F / 16.0F) +
              (sptr[ w] + sptr[w3]) * (4.0F / 16.0F) +
              sptr[w2] * (6.0F / 16.0F);
            sptr++;
          }

      // second bottommost points  ( [ 1^ 4 (6) 4 ] / 15 )^T ;
      // sptr is at row h-4 here and moves on to row h-3
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 15.0F) +
            (sptr[ w] + sptr[w3]) * (4.0F / 15.0F) +
            sptr[w2] * (6.0F / 15.0F);
          sptr++;
        }

      // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (6.0F / 11.0F);
          sptr++;
        }
    }
}


// ######################################################################

void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest, 02008 const int nbpix2) 02009 { 02010 int rcx=nbpix2/6; 02011 02012 const float coeffs[] = { 02013 0.0F, -0.198242F, 1.014648F, 0.0F, //
R G B xx -> u 02014 0.700195F, -0.29052F, 0.0F, 0.0F, // R G B xx -> v 02015 128.0F, 128.0F, 128.0F, 128.0F // division factor 02016 }; 02017 02018 asm ( 02019 ".JA0:;\n\t" 02020 "or %%rcx, %%rcx;\n\t" 02021 "jz .JA1;\n\t" 02022 "pxor %%mm7, %%mm7;\n\t" //mm7 <- 00 00 00 00 02023 "xor %%rax, %%rax;\n\t" 02024 "xor %%rbx, %%rbx;\n\t" 02025 "mov (%%rsi), %%rax;\n\t" // rax <- v y1 y0 u 02026 "movw 4(%%rsi), %%bx;\n\t" // rbx <- xx xx y3 y2 02027 "movd %%rax, %%mm0;\n\t" // mm0<- xx xx xx xx v y1 y0 u 02028 "movd %%rax, %%mm1;\n\t" // mm1<- xx xx xx xx v y1 y0 u 02029 "movd %%rbx, %%mm2;\n\t" // mm2<- xx xx xx xx xx xx y3 y2 02030 "psrlq $16, %%mm1;\n\t" // mm1<- xx xx xx xx xx xx v y1 02031 "punpcklbw %%mm7, %%mm0;\n\t" // mm0<- xx xx xx xx 0 y0 0 u 02032 "punpcklbw %%mm7, %%mm1;\n\t" // mm1<- xx xx xx xx 00 v 00 y1 02033 "punpcklbw %%mm7, %%mm2;\n\t" // mm2<- xx xx xx xx 00 y3 00 y2 02034 "punpcklwd %%mm7, %%mm0;\n\t" // mm0<- 00 00 00 y0 00 00 00 u 02035 "punpcklwd %%mm7, %%mm1;\n\t" // mm1<- 00 00 00 v 00 00 00 y1 02036 "punpcklwd %%mm7, %%mm2;\n\t" // mm2<- 00 00 00 y3 00 00 00 y2 02037 02038 "cvtpi2ps %%mm0, %%xmm0;\n\t" // xmm0 <- 00 00 y0 u 02039 "cvtpi2ps %%mm1, %%xmm1;\n\t" // xmm1 <- 00 00 v y1 02040 "cvtpi2ps %%mm2, %%xmm2;\n\t" // xmm2 <- 00 00 y3 y2 02041 02042 // 01 01 01 01 02043 "movaps %%xmm0, %%xmm3;\n\t" 02044 02045 // 00 00 00 00 02046 "movaps %%xmm1, %%xmm4;\n\t" 02047 02048 // 00 00 00 00 02049 "movaps %%xmm2, %%xmm5;\n\t" 02050 02051 // 01 01 01 01 02052 "movaps %%xmm2, %%xmm6;\n\t" 02053 02054 "shufps $0x55, %%xmm3, %%xmm3;\n\t"// xmm3 <- y0 y0 y0 y0 02055 "shufps $00, %%xmm4, %%xmm4;\n\t" // xmm4 <- y1 y1 y1 y1 02056 "shufps $0x00, %%xmm5, %%xmm5;\n\t"// xmm5 <- y2 y2 y2 y2 02057 "shufps $0x55, %%xmm6, %%xmm6;\n\t"// xmm6 <- y3 y3 y3 y3 02058 02059 // 00 00 00 00 02060 "shufps $0, %%xmm0, %%xmm0;\n\t" // xmm0 <- u u u u 02061 // 01 01 01 01 02062 "shufps $0x55, %%xmm1, %%xmm1;\n\t" // xmm1 <- v v v v 02063 02064 "subps 32(%%rdx), %%xmm0;\n\t" 
02065 "subps 32(%%rdx), %%xmm1;\n\t" 02066 02067 "mulps (%%rdx), %%xmm0;\n\t" 02068 "mulps 16(%%rdx),%%xmm1;\n\t" 02069 02070 "addps %%xmm0, %%xmm3;\n\t" 02071 "addps %%xmm0, %%xmm4;\n\t" 02072 "addps %%xmm0, %%xmm5;\n\t" 02073 "addps %%xmm0, %%xmm6;\n\t" 02074 02075 "addps %%xmm1, %%xmm3;\n\t" // xmm3 <- xx b0 g0 r0 02076 "addps %%xmm1, %%xmm4;\n\t" // xmm4 <- xx b1 g1 r1 02077 "addps %%xmm1, %%xmm5;\n\t" // xmm5 <- xx b2 g2 r2 02078 "addps %%xmm1, %%xmm6;\n\t" // xmm6 <- xx b3 g3 r3 02079 02080 "cvtps2pi %%xmm3, %%mm0;\n\t" //mm0 <- g0 r0 02081 "movhlps %%xmm3, %%xmm3;\n\t" //xmm3 <- g0 r0 xx b0 02082 "cvtps2pi %%xmm3, %%mm1;\n\t" //mm1 <- xx b0 02083 "packssdw %%mm1, %%mm0;\n\t" //mm0<- xx b0 g0 r0 02084 02085 "cvtps2pi %%xmm4, %%mm2;\n\t" //mm2 <- g1 r1 02086 "movhlps %%xmm4, %%xmm4;\n\t" //xmm4 <- g1 r1 xx b1 02087 "cvtps2pi %%xmm4, %%mm3;\n\t" //mm3 <- xx b1 02088 "packssdw %%mm3, %%mm2;\n\t" //mm2<- xx b1 g1 r1 02089 02090 "cvtps2pi %%xmm5, %%mm4;\n\t" //mm4 <- g2 r2 02091 "movhlps %%xmm5, %%xmm5;\n\t" //xmm5 <- g2 r2 xx b2 02092 "cvtps2pi %%xmm5, %%mm5;\n\t" //mm5 <- xx b2 02093 "packssdw %%mm5, %%mm4;\n\t" //mm4<- xx b2 g2 r2 02094 02095 "cvtps2pi %%xmm6, %%mm6;\n\t" //mm6 <- g3 r3 02096 "movhlps %%xmm6, %%xmm6;\n\t" //xmm3 <- g3 r3 xx b3 02097 "cvtps2pi %%xmm6, %%mm7;\n\t" //mm7 <- xx b3 02098 "packssdw %%mm7, %%mm6;\n\t" //mm6<- xx b3 g3 r3 02099 02100 "pxor %%mm1, %%mm1;\n\t" 02101 "pcmpgtw %%mm0, %%mm1;\n\t" 02102 "pandn %%mm0, %%mm1;\n\t" 02103 02104 "pxor %%mm3, %%mm3;\n\t" 02105 "pcmpgtw %%mm2, %%mm3;\n\t" 02106 "pandn %%mm2, %%mm3;\n\t" 02107 02108 "pxor %%mm5, %%mm5;\n\t" 02109 "pcmpgtw %%mm4, %%mm5;\n\t" 02110 "pandn %%mm4, %%mm5;\n\t" 02111 02112 "pxor %%mm7, %%mm7;\n\t" 02113 "pcmpgtw %%mm6, %%mm7;\n\t" 02114 "pandn %%mm6, %%mm7;\n\t" 02115 02116 "packuswb %%mm1, %%mm1;\n\t" //mm0<- xx xx xx xx xx b0 g0 r0 02117 "packuswb %%mm3, %%mm3;\n\t" //mm2<- xx xx xx xx xx b1 g1 r1 02118 "packuswb %%mm5, %%mm5;\n\t" //mm4<- xx xx xx xx xx b2 g2 r2 02119 
"packuswb %%mm7, %%mm7;\n\t" //mm6<- xx xx xx xx xx b3 g3 r3 02120 02121 "push %%rcx;\n\t" 02122 "push %%rdx;\n\t" 02123 "movd %%mm1, %%rax;\n\t" // rax <- xx b0 g0 r0 02124 "movd %%mm3, %%rbx;\n\t" // rbx <- xx b1 g1 r1 02125 "movd %%mm5, %%rcx;\n\t" // rcx <- xx b2 g2 r2 02126 "movd %%mm7, %%rdx;\n\t" // rdx <- xx b3 g3 r3 02127 "movw %%ax, (%%rdi);\n\t" 02128 "movw %%bx,3(%%rdi);\n\t" 02129 "movw %%cx,6(%%rdi);\n\t" 02130 "movw %%dx,9(%%rdi);\n\t" 02131 "shr $8, %%rax;\n\t" 02132 "shr $8, %%rbx;\n\t" 02133 "shr $8, %%rcx;\n\t" 02134 "shr $8, %%rdx;\n\t" 02135 "movb %%ah, 2(%%rdi);\n\t" 02136 "movb %%bh, 5(%%rdi);\n\t" 02137 "movb %%ch, 8(%%rdi);\n\t" 02138 "movb %%dh,11(%%rdi);\n\t" 02139 "pop %%rdx;\n\t" 02140 "pop %%rcx;\n\t" 02141 02142 "add $12,%%rdi;\n\t" 02143 "dec %%rcx;\n\t" 02144 "add $6, %%rsi;\n\t" 02145 "jmp .JA0;\n\t" 02146 ".JA1:;\n\t" 02147 "emms;\n\t" 02148 : 02149 :"S"(src),"D"(dest),"c"(rcx),"d"(coeffs) 02150 :"rax","rbx","memory" 02151 ); 02152 02153 } 02154 02155 02156 02157 02158 void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w) 02159 { 02160 02161 for (int j = 0; j < h; j ++) 02162 { 02163 // leftmost points 02164 *dptr++ = sptr[0] * (70.0F / 163.0F) + 02165 sptr[1] * (56.0F / 163.0F) + 02166 sptr[2] * (28.0F / 163.0F) + 02167 sptr[3] * ( 8.0F / 163.0F) + 02168 sptr[4] * ( 1.0F / 163.0F); 02169 *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) + 02170 sptr[1] * (70.0F / 219.0F) + 02171 sptr[3] * (28.0F / 219.0F) + 02172 sptr[4] * ( 8.0F / 219.0F) + 02173 sptr[5] * ( 1.0F / 219.0F); 02174 *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) + 02175 (sptr[1] + sptr[3]) * (56.0F / 247.0F) + 02176 sptr[2] * (70.0F / 247.0F) + 02177 sptr[5] * ( 8.0F / 247.0F) + 02178 sptr[6] * ( 1.0F / 247.0F); 02179 *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) + 02180 (sptr[1] + sptr[5]) * (28.0F / 255.0F) + 02181 (sptr[2] + sptr[4]) * (56.0F / 255.0F) + 02182 sptr[3] * (70.0F / 255.0F) + 02183 sptr[7] * ( 1.0F / 255.0F); 02184 02185 
// far from the borders 02186 for (int i = 0; i < w - 8; i ++) 02187 { 02188 *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) + 02189 (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) + 02190 (sptr[2] + sptr[6]) * (28.0F / 256.0F) + 02191 (sptr[3] + sptr[5]) * (56.0F / 256.0F) + 02192 sptr[4] * (70.0F / 256.0F); 02193 sptr ++; 02194 } 02195 02196 // rightmost points 02197 *dptr++ = sptr[0] * ( 1.0F / 255.0F) + 02198 (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) + 02199 (sptr[2] + sptr[6]) * (28.0F / 255.0F) + 02200 (sptr[3] + sptr[5]) * (56.0F / 255.0F) + 02201 sptr[4] * (70.0F / 255.0F); 02202 sptr ++; 02203 *dptr++ = sptr[0] * ( 1.0F / 247.0F) + 02204 sptr[1] * ( 8.0F / 247.0F) + 02205 (sptr[2] + sptr[6]) * (28.0F / 247.0F) + 02206 (sptr[3] + sptr[5]) * (56.0F / 247.0F) + 02207 sptr[4] * (70.0F / 247.0F); 02208 sptr ++; 02209 *dptr++ = sptr[0] * ( 1.0F / 219.0F) + 02210 sptr[1] * ( 8.0F / 219.0F) + 02211 sptr[2] * (28.0F / 219.0F) + 02212 (sptr[3] + sptr[5]) * (56.0F / 219.0F) + 02213 sptr[4] * (70.0F / 219.0F); 02214 sptr ++; 02215 *dptr++ = sptr[0] * ( 1.0F / 163.0F) + 02216 sptr[1] * ( 8.0F / 163.0F) + 02217 sptr[2] * (28.0F / 163.0F) + 02218 sptr[3] * (56.0F / 163.0F) + 02219 sptr[4] * (70.0F / 163.0F); 02220 sptr += 5; // sptr back to same as dptr (start of next line) 02221 } 02222 } 02223 #endif 02224 02225 //############################################################################ 02226 /* So things look consistent in everyone's emacs... 
*/ 02227 /* Local Variables: */ 02228 /* indent-tabs-mode: nil */ 02229 /* End: */ 02230 02231 #endif 02232 02233 #ifndef INVT_CPU_OPTERON 02234 02235 #ifdef INVT_USE_SSE 02236 02237 //###################################################################### 02238 void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz) 02239 { 02240 static int32 ecx= sz>>2; 02241 static int32 edx= sz & 0x3; 02242 02243 asm ( 02244 "orl %%ecx, %%ecx;\n\t" 02245 "jz .AG2;\n\t" 02246 ".AG1:;\n\t" 02247 "movupd 0(%%esi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 02248 "movupd 0(%%edi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 02249 "movupd 16(%%esi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 02250 "movupd 16(%%edi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 02251 "movupd %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 02252 "movupd %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 02253 "movupd %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 02254 "movupd %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 02255 "subpd %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 02256 "subpd %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4) 02257 "subpd %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0) 02258 "subpd %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. 
(b5-a5) (b4-a4) 02259 "maxpd %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 02260 "maxpd %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 02261 "movupd %%xmm5, 0(%%ebx);\n\t" 02262 "movupd %%xmm7, 16(%%ebx);\n\t" 02263 "addl $32, %%esi;\n\t" 02264 "addl $32, %%edi;\n\t" 02265 "addl $32, %%ebx;\n\t" 02266 "loop .AG1;\n\t" 02267 ".AG2:;\n\t" 02268 "movl %%edx, %%ecx;\n\t" 02269 "orl %%ecx, %%ecx;\n\t" 02270 "jz .AG4;\n\t" 02271 ".AG3:;\n\t" 02272 "movsd 0(%%esi), %%xmm0;\n\t" 02273 "movsd 0(%%edi), %%xmm1;\n\t" 02274 "movsd %%xmm0, %%xmm2;\n\t" 02275 "movsd %%xmm1, %%xmm3;\n\t" 02276 "subsd %%xmm3, %%xmm2;\n\t" 02277 "subsd %%xmm0, %%xmm1;\n\t" 02278 "maxsd %%xmm2, %%xmm1;\n\t" 02279 "movsd %%xmm1, 0(%%ebx);\n\t" 02280 "addl $8, %%esi;\n\t" 02281 "addl $8, %%edi;\n\t" 02282 "addl $8, %%ebx;\n\t" 02283 "loop .AG3;\n\t" 02284 ".AG4:;\n\t" 02285 : 02286 :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx) 02287 :"memory" 02288 ); 02289 } 02290 #endif 02291 02292 #ifdef INVT_USE_MMXSSE2 02293 //###################################################################### 02294 // speedup ~= 2.1 02295 void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz) 02296 { 02297 static int32 ecx= sz>>3; 02298 static int32 edx= sz & 0x7; 02299 02300 asm ( 02301 "orl %%ecx, %%ecx;\n\t" 02302 "jz .AE2;\n\t" 02303 ".AE1:;\n\t" 02304 "movups 0(%%esi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 02305 "movups 0(%%edi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 02306 "movups 16(%%esi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 02307 "movups 16(%%edi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 02308 "movups %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 02309 "movups %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 02310 "movups %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 02311 "movups %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 02312 "subps %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 02313 "subps %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. 
(a5-b5) (a4-b4) 02314 "subps %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0) 02315 "subps %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4) 02316 "maxps %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 02317 "maxps %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 02318 "movups %%xmm5, 0(%%ebx);\n\t" 02319 "movups %%xmm7, 16(%%ebx);\n\t" 02320 "addl $32, %%esi;\n\t" 02321 "addl $32, %%edi;\n\t" 02322 "addl $32, %%ebx;\n\t" 02323 "loop .AE1;\n\t" 02324 ".AE2:;\n\t" 02325 "movl %%edx, %%ecx;\n\t" 02326 "orl %%ecx, %%ecx;\n\t" 02327 "jz .AE4;\n\t" 02328 ".AE3:;\n\t" 02329 "movss 0(%%esi), %%xmm0;\n\t" 02330 "movss 0(%%edi), %%xmm1;\n\t" 02331 "movss %%xmm0, %%xmm2;\n\t" 02332 "movss %%xmm1, %%xmm3;\n\t" 02333 "subss %%xmm3, %%xmm2;\n\t" 02334 "subss %%xmm0, %%xmm1;\n\t" 02335 "maxss %%xmm2, %%xmm1;\n\t" 02336 "movss %%xmm1, 0(%%ebx);\n\t" 02337 "addl $4, %%esi;\n\t" 02338 "addl $4, %%edi;\n\t" 02339 "addl $4, %%ebx;\n\t" 02340 "loop .AE3;\n\t" 02341 ".AE4:;\n\t" 02342 "emms;\n\t" 02343 : 02344 :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx) 02345 :"memory" 02346 ); 02347 } 02348 02349 02350 02351 //###################################################################### 02352 // speedup ~= 3.4 02353 void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz) 02354 { 02355 static int32 ecx= sz>>3; 02356 static int32 edx= sz&0x7; 02357 02358 asm ( 02359 "orl %%ecx, %%ecx;\n\t" 02360 "jz .AF2;\n\t" 02361 ".AF1:;\n\t" 02362 "movdqu 0(%%esi), %%xmm0;\n\t" 02363 "movdqu 0(%%edi), %%xmm1;\n\t" 02364 "movdqu 16(%%esi), %%xmm2;\n\t" 02365 "movdqu 16(%%edi), %%xmm3;\n\t" 02366 "movdqu %%xmm0, %%xmm4;\n\t" 02367 "movdqu %%xmm1, %%xmm5;\n\t" 02368 "movdqu %%xmm2, %%xmm6;\n\t" 02369 "movdqu %%xmm3, %%xmm7;\n\t" 02370 "psubusw %%xmm1, %%xmm0;\n\t" 02371 "psubusw %%xmm3, %%xmm2;\n\t" 02372 "psubusw %%xmm4, %%xmm5;\n\t" 02373 "psubusw %%xmm6, %%xmm7;\n\t" 02374 "pmaxsw %%xmm0, %%xmm5;\n\t" 02375 "pmaxsw %%xmm2, %%xmm7;\n\t" 02376 "movdqu %%xmm5, 
0(%%ebx);\n\t" 02377 "movdqu %%xmm7, 16(%%ebx);\n\t" 02378 "addl $32, %%esi;\n\t" 02379 "addl $32, %%edi;\n\t" 02380 "addl $32, %%ebx;\n\t" 02381 "loop .AF1;\n\t" 02382 ".AF2:;\n\t" 02383 "movl %%edx, %%ecx;\n\t" 02384 "orl %%ecx, %%ecx;\n\t" 02385 "jz .AF4;\n\t" 02386 ".AF3:;\n\t" 02387 "movl (%%esi), %%eax;\n\t" 02388 "movl (%%edi), %%edx;\n\t" 02389 "cmpl %%edx, %%eax;\n\t" 02390 "ja .AF5;\n\t" 02391 "xchgl %%eax, %%edx;\n\t" 02392 ".AF5:;\n\t" 02393 "subl %%edx, %%eax;\n\t" 02394 "movl %%eax, (%%ebx);\n\t" 02395 "addl $4, %%esi;\n\t" 02396 "addl $4, %%edi;\n\t" 02397 "addl $4, %%ebx;\n\t" 02398 "loop .AF3;\n\t" 02399 ".AF4:;\n\t" 02400 "emms;\n\t" 02401 : 02402 :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx) 02403 :"memory" 02404 ); 02405 } 02406 02407 02408 //###################################################################### 02409 // speedup ~=10.0! 02410 void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz) 02411 { 02412 static int32 ecx= sz>>5; 02413 static int32 edx= sz&0x1f; 02414 02415 asm ( 02416 "orl %%ecx, %%ecx;\n\t" 02417 "jz .AD2;\n\t" 02418 ".AD1:;\n\t" 02419 "movdqu 0(%%esi), %%xmm0;\n\t" // xmm0<- a15 ... a3 a2 a1 a0 02420 "movdqu 0(%%edi), %%xmm1;\n\t" // xmm1<- b15 ... b3 b2 b1 b0 02421 "movdqu 16(%%esi), %%xmm2;\n\t"// xmm2<- a31 ... a18 a17 a16 02422 "movdqu 16(%%edi), %%xmm3;\n\t"// xmm3<- b31 ... b18 b17 b16 02423 "movdqu %%xmm0, %%xmm4;\n\t" // xmm4<- a15 ... a3 a2 a1 a0 02424 "movdqu %%xmm1, %%xmm5;\n\t" // xmm5<- b15 ... b3 b2 b1 b0 02425 "movdqu %%xmm2, %%xmm6;\n\t" // xmm6<- a31 ... a18 a17 a16 02426 "movdqu %%xmm3, %%xmm7;\n\t" // xmm7<- b31 ... 
b18 b17 b16 02427 "psubusb %%xmm1, %%xmm0;\n\t" // xmm0<-(a15-b15)...( a1-b1 )(a0-b0) 02428 "psubusb %%xmm3, %%xmm2;\n\t" // xmm2<-(a31-b31)...(a17-b17)(a16-b16) 02429 "psubusb %%xmm4, %%xmm5;\n\t" // xmm5<-(b15-a15)...(b17-a17)(b16-a16) 02430 "psubusb %%xmm6, %%xmm7;\n\t" // xmm7<-(b31-a31)...(b17-a17)(b16-a16) 02431 "pmaxub %%xmm0, %%xmm5;\n\t" // xmm5<- max(xmm0,xmm5) 02432 "pmaxub %%xmm2, %%xmm7;\n\t" // xmm7<- max(xmm2,xmm7) 02433 "movdqu %%xmm5, 0(%%ebx);\n\t" 02434 "movdqu %%xmm7, 16(%%ebx);\n\t" 02435 "addl $32, %%esi;\n\t" 02436 "addl $32, %%edi;\n\t" 02437 "addl $32, %%ebx;\n\t" 02438 "loop .AD1;\n\t" 02439 ".AD2:;\n\t" 02440 "movl %%edx, %%ecx;\n\t" 02441 "orl %%ecx, %%ecx;\n\t" 02442 "jz .AD4;\n\t" 02443 ".AD3:;\n\t" 02444 "movb (%%esi), %%al;\n\t" 02445 "movb (%%edi), %%dl;\n\t" 02446 "cmpb %%dl, %%al;\n\t" 02447 "ja .AD5;\n\t" 02448 "xchgb %%al, %%dl;\n\t" 02449 ".AD5:;\n\t" 02450 "subb %%dl, %%al;\n\t" 02451 "movb %%al, (%%ebx);\n\t" 02452 "incl %%ebx;\n\t" 02453 "incl %%esi;\n\t" 02454 "incl %%edi;\n\t" 02455 "loop .AD3;\n\t" 02456 ".AD4:;\n\t" 02457 "emms;\n\t" 02458 : 02459 :"S"(a),"D"(b),"b"(diff), "c"(ecx), "d"(edx) 02460 :"memory" 02461 ); 02462 } 02463 #endif 02464 02465 #ifdef INVT_USE_SSE 02466 //###################################################################### 02467 // speedup ~= 2.0 02468 void sse_sum(const double *a, double *sum, const int32 sz) 02469 { 02470 static int32 ecx = sz>>3; 02471 static int32 edx = sz&0x7; 02472 02473 asm ( 02474 "pxor %%xmm4, %%xmm4;\n\t" 02475 "pxor %%xmm5, %%xmm5;\n\t" 02476 "pxor %%xmm6, %%xmm6;\n\t" 02477 "pxor %%xmm7, %%xmm7;\n\t" 02478 "orl %%ecx, %%ecx;\n\t" 02479 "jz BE1;\n\t" 02480 ".BE0:\n\t" 02481 "movupd 0(%%esi), %%xmm0;\n\t" 02482 "movupd 16(%%esi), %%xmm1;\n\t" 02483 "movupd 32(%%esi), %%xmm2;\n\t" 02484 "movupd 48(%%esi), %%xmm3;\n\t" 02485 "addpd %%xmm0, %%xmm4;\n\t" 02486 "addpd %%xmm1, %%xmm5;\n\t" 02487 "addpd %%xmm2, %%xmm6;\n\t" 02488 "addpd %%xmm3, %%xmm7;\n\t" 02489 "addl $64, 
%%esi;\n\t" 02490 "loop .BE0;\n\t" 02491 "BE1:;\n\t" 02492 "mov %%edx, %%ecx;\n\t" 02493 "pxor %%xmm0, %%xmm0;\n\t" 02494 "orl %%ecx, %%ecx;\n\t" 02495 "jz BE2;\n\t" 02496 "BE3:;\n\t" 02497 "movupd 0(%%esi), %%xmm1;\n\t" 02498 "addpd %%xmm1, %%xmm0;\n\t" 02499 "addl $16, %%esi;\n\t" 02500 "loop BE3;\n\t" 02501 "BE2:;\n\t" 02502 "addpd %%xmm4, %%xmm7;\n\t" 02503 "addpd %%xmm5, %%xmm7;\n\t" 02504 "addpd %%xmm6, %%xmm7;\n\t" 02505 "addpd %%xmm7, %%xmm0;\n\t" 02506 "movhpd %%xmm0, (%%ebx);\n\t" 02507 "addsd (%%ebx), %%xmm0;\n\t" 02508 "movlpd %%xmm0, (%%ebx);\n\t" 02509 "emms;\n\t" 02510 : 02511 :"S"(a), "b"(sum), "c"(ecx), "d"(edx) 02512 :"memory" 02513 ); 02514 } 02515 #endif 02516 02517 #ifdef INVT_USE_MMXSSE2 02518 //###################################################################### 02519 //speedup ~= 4 02520 void sse2_sum(const float *a, double *sum, const int32 sz) 02521 { 02522 static int32 ecx = sz>>3; 02523 static int32 edx = sz & 0x7; 02524 02525 asm ( 02526 "pxor %%xmm4, %%xmm4;\n\t" 02527 "pxor %%xmm5, %%xmm5;\n\t" 02528 "pxor %%xmm6, %%xmm6;\n\t" 02529 "pxor %%xmm7, %%xmm7;\n\t" 02530 "orl %%ecx, %%ecx;\n\t" 02531 "jz BA1;\n\t" 02532 ".BA0:\n\t" 02533 "cvtps2pd 0(%%esi), %%xmm0;\n\t" 02534 "cvtps2pd 8(%%esi), %%xmm1;\n\t" 02535 "cvtps2pd 16(%%esi), %%xmm2;\n\t" 02536 "cvtps2pd 24(%%esi), %%xmm3;\n\t" 02537 "addpd %%xmm0, %%xmm4;\n\t" 02538 "addpd %%xmm1, %%xmm5;\n\t" 02539 "addpd %%xmm2, %%xmm6;\n\t" 02540 "addpd %%xmm3, %%xmm7;\n\t" 02541 "addl $32, %%esi;\n\t" 02542 "loop .BA0;\n\t" 02543 "BA1:;\n\t" 02544 "pxor %%xmm0, %%xmm0;\n\t" 02545 "mov %%edx, %%ecx;\n\t" 02546 "orl %%ecx, %%ecx;\n\t" 02547 "jz BA2;\n\t" 02548 "BA3:;\n\t" 02549 "cvtps2pd 0(%%esi), %%xmm1;\n\t" 02550 "addpd %%xmm1, %%xmm0;\n\t" 02551 "addl $8, %%esi;\n\t" 02552 "loop BA3;\n\t" 02553 "BA2:;\n\t" 02554 "addpd %%xmm4, %%xmm7;\n\t" 02555 "addpd %%xmm5, %%xmm7;\n\t" 02556 "addpd %%xmm6, %%xmm7;\n\t" 02557 "addpd %%xmm7, %%xmm0;\n\t" 02558 "movhpd %%xmm0, (%%ebx);\n\t" 02559 "addsd 
(%%ebx), %%xmm0;\n\t" 02560 "movlpd %%xmm0, (%%ebx);\n\t" 02561 "emms;\n\t" 02562 : 02563 :"S"(a), "b"(sum), "c"(ecx), "d"(edx) 02564 :"memory" 02565 ); 02566 } 02567 02568 02569 //###################################################################### 02570 // speedup ~= 4.0 02571 void sse2_sum(const int32 *a, double *sum, const int32 sz) 02572 { 02573 static int32 ecx = sz>>3; 02574 static int32 edx = sz & 0x7; 02575 02576 asm ( 02577 "pxor %%xmm4, %%xmm4;\n\t" 02578 "pxor %%xmm5, %%xmm5;\n\t" 02579 "pxor %%xmm6, %%xmm6;\n\t" 02580 "pxor %%xmm7, %%xmm7;\n\t" 02581 "orl %%ecx, %%ecx;\n\t" 02582 ".BC0:\n\t" 02583 "cvtdq2pd 0(%%esi), %%xmm0;\n\t" 02584 "cvtdq2pd 8(%%esi), %%xmm1;\n\t" 02585 "cvtdq2pd 16(%%esi), %%xmm2;\n\t" 02586 "cvtdq2pd 24(%%esi), %%xmm3;\n\t" 02587 "addpd %%xmm0, %%xmm4;\n\t" 02588 "addpd %%xmm1, %%xmm5;\n\t" 02589 "addpd %%xmm2, %%xmm6;\n\t" 02590 "addpd %%xmm3, %%xmm7;\n\t" 02591 "addl $32, %%esi;\n\t" 02592 "loop .BC0;\n\t" 02593 "BC1:;\n\t" 02594 "pxor %%xmm0, %%xmm0;\n\t" 02595 "mov %%edx, %%ecx;\n\t" 02596 "orl %%ecx, %%ecx;\n\t" 02597 "jz BC2;\n\t" 02598 "BC3:;\n\t" 02599 "cvtdq2pd 0(%%esi), %%xmm1;\n\t" 02600 "addpd %%xmm1, %%xmm0;\n\t" 02601 "addl $8, %%esi;\n\t" 02602 "loop BC3;\n\t" 02603 "BC2:;\n\t" 02604 "addpd %%xmm4, %%xmm7;\n\t" 02605 "addpd %%xmm5, %%xmm7;\n\t" 02606 "addpd %%xmm6, %%xmm7;\n\t" 02607 "addpd %%xmm7, %%xmm0;\n\t" 02608 "movhpd %%xmm0, (%%ebx);\n\t" 02609 "addsd (%%ebx), %%xmm0;\n\t" 02610 "movlpd %%xmm0, (%%ebx);\n\t" 02611 "emms;\n\t" 02612 : 02613 :"S"(a), "b"(sum), "c"(ecx), "d"(edx) 02614 :"memory" 02615 ); 02616 } 02617 02618 02619 02620 //###################################################################### 02621 void sse2_sum(const byte *a, double *sum, const int32 sz) 02622 { 02623 static int ecx = sz>>5; 02624 static int edx = sz & 0x1f; 02625 02626 asm ( 02627 "orl %%ecx, %%ecx;\n\t" 02628 "jz BB1;\n\t" 02629 "pxor %%xmm7, %%xmm7;\n\t" 02630 "pushl %%ebx;\n\t" 02631 "pushl %%edx;\n\t" 02632 "BB3:;\n\t" 
02633 "pxor %%xmm5, %%xmm5;\n\t" 02634 "pxor %%xmm6, %%xmm6;\n\t" 02635 "movdqu (%%esi), %%xmm0;\n\t" 02636 "movdqu 16(%%esi), %%xmm1;\n\t" 02637 "psadbw %%xmm0, %%xmm5;\n\t" 02638 "psadbw %%xmm1, %%xmm6;\n\t" 02639 "pextrw $0, %%xmm5, %%eax;\n\t" 02640 "cvtsi2sd %%eax, %%xmm0;\n\t" 02641 "pextrw $4, %%xmm5, %%ebx;\n\t" 02642 "cvtsi2sd %%ebx, %%xmm1;\n\t" 02643 "pextrw $0, %%xmm6, %%edx;\n\t" 02644 "cvtsi2sd %%edx, %%xmm2;\n\t" 02645 "pextrw $4, %%xmm6, %%edi;\n\t" 02646 "cvtsi2sd %%edi, %%xmm3;\n\t" 02647 "addsd %%xmm0, %%xmm1;\n\t" 02648 "addsd %%xmm2, %%xmm3;\n\t" 02649 "addsd %%xmm1, %%xmm7;\n\t" 02650 "addsd %%xmm3, %%xmm7;\n\t" 02651 "addl $32, %%esi;\n\t" 02652 "loop BB3;\n\t" 02653 "popl %%edx;\n\t" 02654 "popl %%ebx;\n\t" 02655 "BB1:;\n\t" 02656 "xorl %%edi, %%edi;\n\t" 02657 "movl %%edx, %%ecx;\n\t" 02658 "orl %%ecx, %%ecx;\n\t" 02659 "jz BB2;\n\t" 02660 "BB5:;\n\t" 02661 "xorl %%eax, %%eax;\n\t" 02662 "movb (%%esi), %%al;\n\t" 02663 "addl %%eax, %%edi;\n\t" 02664 "incl %%esi;\n\t" 02665 "loop BB5;\n\t" 02666 "BB2:\n\t" 02667 "cvtsi2sd %%edi, %%xmm0;\n\t" 02668 "addsd %%xmm0, %%xmm7;\n\t" 02669 "movhpd %%xmm7, (%%ebx);\n\t" 02670 "addsd (%%ebx), %%xmm7;\n\t" 02671 "movlpd %%xmm7, (%%ebx);\n\t" 02672 "BB6:;\n\t" 02673 "emms;\n\t" 02674 : 02675 :"S"(a), "c"(ecx),"b"(sum),"d"(edx) 02676 :"memory","eax","edi" 02677 ); 02678 } 02679 #endif 02680 02681 #ifdef INVT_USE_SSE 02682 //###################################################################### 02683 // speedup ~= 10 ! 
// result[i] = a[i] - b[i] where a[i] > b[i], otherwise 0, for i in [0, sz)
// (the unsigned saturating subtraction the old main loop did with psubusb).
//
// Plain loop replacing the former inline assembly, which
//  - handled 64 bytes per main-loop pass but took the remainder as
//    sz & 0x7f, so the tail could run up to 64 bytes past the buffers;
//  - compared and exchanged against %bl (the low byte of the result
//    pointer) instead of the loaded b value in %dl, corrupting the pointer;
//  - computed an absolute difference in the tail but a clamped difference
//    in the main loop;
//  - wrote memory and %eax without declaring either as clobbered.
void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    result[i] = (a[i] > b[i]) ? (byte)(a[i] - b[i]) : (byte)0;
}


//######################################################################
// speedup ~= 20 !
02740 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result, 02741 const int32 sz) 02742 { 02743 int32 ecx=sz>>5; 02744 int32 edx=sz&0x1f; 02745 02746 asm ( 02747 "orl %%ecx, %%ecx;\n\t" 02748 "jz .DB0;\n\t" 02749 ".DB1:;\n\t" 02750 "movups 0(%%esi), %%xmm0;\n\t" 02751 "movups 0(%%edi), %%xmm1;\n\t" 02752 "movups 16(%%esi), %%xmm2;\n\t" 02753 "movups 16(%%edi), %%xmm3;\n\t" 02754 "movups %%xmm1, %%xmm6;\n\t" 02755 "movups %%xmm3, %%xmm7;\n\t" 02756 "cmpps $1, %%xmm0, %%xmm6;\n\t" 02757 "cmpps $1, %%xmm2, %%xmm7;\n\t" 02758 "subps %%xmm1, %%xmm0;\n\t" 02759 "subps %%xmm3, %%xmm2;\n\t" 02760 "andps %%xmm6, %%xmm0;\n\t" 02761 "andps %%xmm7, %%xmm2;\n\t" 02762 "movups %%xmm0, (%%ebx);\n\t" 02763 "movups %%xmm2, 16(%%ebx);\n\t" 02764 "addl $32, %%esi;\n\t" 02765 "addl $32, %%edi;\n\t" 02766 "addl $32, %%ebx;\n\t" 02767 "loop .DB1;\n\t" 02768 ".DB0:;\n\t" 02769 "movl %%edx, %%ecx;\n\t" 02770 "orl %%ecx, %%ecx;\n\t" 02771 "jz .DB2;\n\t" 02772 ".DB3:;\n\t" 02773 "movss (%%esi), %%xmm0;\n\t" 02774 "movss (%%edi), %%xmm1;\n\t" 02775 "movss %%xmm1, %%xmm2;\n\t" 02776 "cmpss $1, %%xmm0, %%xmm2;\n\t" 02777 "andps %%xmm2, %%xmm0;\n\t" 02778 "andps %%xmm2, %%xmm1;\n\t" 02779 "subss %%xmm1, %%xmm0;\n\t" 02780 "movss %%xmm0, (%%ebx);\n\t" 02781 "addl $4, %%esi;\n\t" 02782 "addl $4, %%edi;\n\t" 02783 "addl $4, %%ebx;\n\t" 02784 "loop .DB3;\n\t" 02785 ".DB2:;\n\t" 02786 : 02787 :"S"(a), "D"(b), "b"(result), "c"(ecx), "d"(edx) 02788 :"memory" 02789 ); 02790 } 02791 02792 02793 //###################################################################### 02794 // speedup ~= 3 02795 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz) 02796 { 02797 int32 ecx=sz>>3; 02798 int32 edx=sz&0x7; 02799 asm ( 02800 "orl %%ecx, %%ecx;\n\t" 02801 "jz .DC0;\n\t" 02802 ".DC1:;\n\t" 02803 "movdqu 0(%%esi), %%xmm0;\n\t" //xmm0= a3 a2 a1 a0 02804 "movdqu 0(%%edi), %%xmm1;\n\t" //xmm1= b3 b2 b1 b0 02805 "movdqu 16(%%esi), %%xmm3;\n\t"//xmm3= a7 a6 a5 a4 02806 
"movdqu 16(%%edi), %%xmm4;\n\t"//xmm4= b7 b6 b5 b4 02807 "movdqu %%xmm0, %%xmm2;\n\t" //xmm2= a3 a2 a1 a0 02808 "movdqu %%xmm3, %%xmm5;\n\t" //xmm5= a7 a6 a5 a4 02809 "pcmpgtd %%xmm1, %%xmm2;\n\t" //xmm2=(a3>b3)(a2>b2)(a1>b1)(a0>b0) 02810 "pcmpgtd %%xmm4, %%xmm5;\n\t" //xmm5=(a7>b7)(a6>b6)(b5>a5)(a4>b4) 02811 "psubd %%xmm1, %%xmm0;\n\t" //xmm0=(a3-b3)(a2-b2)(a1-b1)(a0-b0) 02812 "psubd %%xmm4, %%xmm3;\n\t" //xmm3=(a7-b7)(a6-b6)(a5-b5)(a4-b4) 02813 "pand %%xmm2, %%xmm0;\n\t" 02814 "pand %%xmm5, %%xmm3;\n\t" 02815 "movdqu %%xmm0, (%%ebx);\n\t" 02816 "movdqu %%xmm3, 16(%%ebx);\n\t" 02817 "addl $32, %%esi;\n\t" 02818 "addl $32, %%edi;\n\t" 02819 "addl $32, %%ebx;\n\t" 02820 "loop .DC1;\n\t" 02821 ".DC0:;\n\t" 02822 "movl %%edx, %%ecx;\n\t" 02823 "orl %%ecx, %%ecx;\n\t" 02824 "jz .DC2;\n\t" 02825 ".DC3:;\n\t" 02826 "movd 0(%%esi), %%xmm0;\n\t" 02827 "movd 0(%%edi), %%xmm1;\n\t" 02828 "movdqu %%xmm0, %%xmm2;\n\t" 02829 "pcmpgtd %%xmm1, %%xmm2;\n\t" 02830 "psubd %%xmm1, %%xmm0;\n\t" 02831 "pand %%xmm2, %%xmm0;\n\t" 02832 "movd %%xmm0, (%%ebx);\n\t" 02833 "addl $4, %%esi;\n\t" 02834 "addl $4, %%edi;\n\t" 02835 "addl $4, %%ebx;\n\t" 02836 "loop .DC3;\n\t" 02837 ".DC2:;\n\t" 02838 : 02839 :"S"(a), "D"(b), "c"(ecx), "d"(edx), "b"(c) 02840 :"memory" 02841 ); 02842 } 02843 02844 02845 //###################################################################### 02846 // speedup ~= 4-5 02847 void sse_binaryReverse(const byte *a, byte *result, const byte val, const 02848 int32 sz) 02849 { 02850 static unsigned int ecx=(sz>>7); 02851 static unsigned int edx=sz&0x7f; 02852 02853 byte pVal[16]; 02854 02855 memset(result, val, 16); 02856 02857 asm ( 02858 "orl %%ecx, %%ecx;\n\t" 02859 "jz .FA0;\n\t" 02860 ".FA1:;\n\t" 02861 "movdqu 0(%%ebx), %%xmm0;\n\t" 02862 "movdqu 0(%%ebx), %%xmm1;\n\t" 02863 "movdqu %%xmm0, %%xmm2;\n\t" 02864 "movdqu %%xmm1, %%xmm3;\n\t" 02865 "movdqu %%xmm0, %%xmm4;\n\t" 02866 "movdqu %%xmm1, %%xmm5;\n\t" 02867 "movdqu %%xmm0, %%xmm6;\n\t" 02868 "movdqu %%xmm1, 
%%xmm7;\n\t" 02869 "psubb (%%esi), %%xmm0;\n\t" 02870 "psubb 16(%%esi), %%xmm1;\n\t" 02871 "psubb 32(%%esi), %%xmm2;\n\t" 02872 "psubb 48(%%esi), %%xmm3;\n\t" 02873 "psubb 64(%%esi), %%xmm4;\n\t" 02874 "psubb 80(%%esi), %%xmm5;\n\t" 02875 "psubb 96(%%esi), %%xmm6;\n\t" 02876 "psubb 112(%%esi), %%xmm7;\n\t" 02877 "movdqu %%xmm0, (%%edi);\n\t" 02878 "movdqu %%xmm1, 16(%%edi);\n\t" 02879 "movdqu %%xmm2, 32(%%edi);\n\t" 02880 "movdqu %%xmm3, 48(%%edi);\n\t" 02881 "movdqu %%xmm4, 64(%%edi);\n\t" 02882 "movdqu %%xmm5, 80(%%edi);\n\t" 02883 "movdqu %%xmm6, 96(%%edi);\n\t" 02884 "movdqu %%xmm7, 112(%%edi);\n\t" 02885 "addl $128, %%edi;\n\t" 02886 "addl $128, %%esi;\n\t" 02887 "loop .FA1;\n\t" 02888 ".FA0:;\n\t" 02889 "movl %%edx, %%ecx;\n\t" 02890 "orl %%ecx, %%ecx;\n\t" 02891 "jz .FA2;\n\t" 02892 "movb (%%ebx), %%dl;\n\t" 02893 ".FA3:;\n\t" 02894 "movb %%dl, %%dh;\n\t" 02895 "movb (%%esi), %%al;\n\t" 02896 "subb %%al, %%dh;\n\t" 02897 "movb %%dh, (%%edi);\n\t" 02898 "incl %%esi;\n\t" 02899 "incl %%edi;\n\t" 02900 "loop .FA3;\n\t" 02901 ".FA2:;\n\t" 02902 : 02903 :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx) 02904 :"memory","eax" 02905 ); 02906 } 02907 02908 02909 //###################################################################### 02910 // speedup ~= 2 02911 void sse_binaryReverse(const float *a, float *result, const float val, 02912 const int sz) 02913 { 02914 static unsigned int ecx = sz>>5; 02915 static unsigned int edx = sz&0x1f; 02916 int i; 02917 float pVal[16]; 02918 02919 for(i=0;i<16;++i) 02920 pVal[i] = val; 02921 02922 02923 asm ( 02924 "orl %%ecx, %%ecx;\n\t" 02925 "jz .FB4;\n\t" 02926 ".FB2:;\n\t" 02927 "movups (%%ebx), %%xmm0;\n\t" 02928 "movups (%%ebx), %%xmm1;\n\t" 02929 "movups %%xmm0, %%xmm2;\n\t" 02930 "movups %%xmm1, %%xmm3;\n\t" 02931 "movups %%xmm0, %%xmm4;\n\t" 02932 "movups %%xmm1, %%xmm5;\n\t" 02933 "movups %%xmm0, %%xmm6;\n\t" 02934 "movups %%xmm1, %%xmm7;\n\t" 02935 "psubq (%%esi), %%xmm0;\n\t" 02936 "psubq 16(%%esi), %%xmm1;\n\t" 02937 
"psubq 32(%%esi), %%xmm2;\n\t" 02938 "psubq 48(%%esi), %%xmm3;\n\t" 02939 "psubq 64(%%esi), %%xmm4;\n\t" 02940 "psubq 80(%%esi), %%xmm5;\n\t" 02941 "psubq 96(%%esi), %%xmm6;\n\t" 02942 "psubq 112(%%esi), %%xmm7;\n\t" 02943 "movups %%xmm0, 0(%%edi);\n\t" 02944 "movups %%xmm1, 16(%%edi);\n\t" 02945 "movups %%xmm2, 32(%%edi);\n\t" 02946 "movups %%xmm3, 48(%%edi);\n\t" 02947 "movups %%xmm4, 64(%%edi);\n\t" 02948 "movups %%xmm5, 80(%%edi);\n\t" 02949 "movups %%xmm6, 96(%%edi);\n\t" 02950 "movups %%xmm7,112(%%edi);\n\t" 02951 "addl $128, %%esi;\n\t" 02952 "addl $128, %%edi;\n\t" 02953 "loop .FB2;\n\t" 02954 ".FB4:\n\t" 02955 "orl %%edx, %%edx;\n\t" 02956 "jz .FB1;\n\t" 02957 "movl %%edx, %%ecx;\n\t" 02958 ".FB3:;\n\t" 02959 "movss 0(%%ebx), %%xmm0;\n\t" 02960 "subss (%%esi), %%xmm0;\n\t" 02961 "movups %%xmm0, (%%edi);\n\t" 02962 "addl $16, %%esi;\n\t" 02963 "addl $16, %%edi;\n\t" 02964 "loop .FB3;\n\t" 02965 ".FB1:;\n\t" 02966 : 02967 :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx) 02968 :"memory","eax" 02969 ); 02970 } 02971 02972 02973 02974 //###################################################################### 02975 02976 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val, 02977 const int32 sz) 02978 { 02979 int32 ecx=sz>>5; 02980 int32 edx=sz&31; 02981 int32 pVal[16]; 02982 int i; 02983 02984 for(i=0;i<16;++i) pVal[i] = val; 02985 02986 asm ( 02987 "orl %%ecx, %%ecx;\n\t" 02988 "jz .FC4;\n\t" 02989 ".FC2:;\n\t" 02990 "movdqu (%%ebx), %%xmm0;\n\t" 02991 "movdqu (%%ebx), %%xmm1;\n\t" 02992 "movdqu %%xmm0, %%xmm2;\n\t" 02993 "movdqu %%xmm1, %%xmm3;\n\t" 02994 "movdqu %%xmm0, %%xmm4;\n\t" 02995 "movdqu %%xmm1, %%xmm5;\n\t" 02996 "movdqu %%xmm0, %%xmm6;\n\t" 02997 "movdqu %%xmm1, %%xmm7;\n\t" 02998 "psubd (%%esi), %%xmm0;\n\t" 02999 "psubd 16(%%esi), %%xmm1;\n\t" 03000 "psubd 32(%%esi), %%xmm2;\n\t" 03001 "psubd 48(%%esi), %%xmm3;\n\t" 03002 "psubd 64(%%esi), %%xmm4;\n\t" 03003 "psubd 80(%%esi), %%xmm5;\n\t" 03004 "psubd 96(%%esi), %%xmm6;\n\t" 
03005 "psubd 112(%%esi), %%xmm7;\n\t" 03006 "movdqu %%xmm0, 0(%%edi);\n\t" 03007 "movdqu %%xmm1, 16(%%edi);\n\t" 03008 "movdqu %%xmm2, 32(%%edi);\n\t" 03009 "movdqu %%xmm3, 48(%%edi);\n\t" 03010 "movdqu %%xmm4, 64(%%edi);\n\t" 03011 "movdqu %%xmm5, 80(%%edi);\n\t" 03012 "movdqu %%xmm6, 96(%%edi);\n\t" 03013 "movdqu %%xmm7,112(%%edi);\n\t" 03014 "addl $128, %%esi;\n\t" 03015 "addl $128, %%edi;\n\t" 03016 "loop .FC2;\n\t" 03017 ".FC4:;\n\t" 03018 "orl %%edx, %%edx;\n\t" 03019 "jz .FC1;\n\t" 03020 "movl %%edx, %%ecx;\n\t" 03021 ".FC3:;\n\t" 03022 "movdqu 0(%%ebx), %%xmm0;\n\t" 03023 "psubd (%%esi), %%xmm0;\n\t" 03024 "movups %%xmm0, (%%edi);\n\t" 03025 "addl $16, %%esi;\n\t" 03026 "addl $16, %%edi;\n\t" 03027 "loop .FC3;\n\t" 03028 ".FC1:;\n\t" 03029 : 03030 :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx) 03031 :"memory","eax" 03032 ); 03033 } 03034 03035 03036 03037 //###################################################################### 03038 03039 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz) 03040 { 03041 int32 ecx=sz>>4; 03042 int32 edx=sz&0xf; 03043 03044 asm( 03045 "orl %%ecx, %%ecx;\n\t" 03046 "jz .GA4;\n\t" 03047 "pxor %%xmm0, %%xmm0;\n\t" 03048 ".GA2:;\n\t" 03049 "movdqu 0(%%esi), %%xmm1;\n\t" 03050 "movdqa %%xmm1, %%xmm2;\n\t" 03051 "movdqa %%xmm1, %%xmm3;\n\t" 03052 "movdqa %%xmm1, %%xmm4;\n\t" 03053 "psrldq $4, %%xmm2;\n\t" 03054 "psrldq $8, %%xmm3;\n\t" 03055 "psrldq $12, %%xmm4;\n\t" 03056 "punpcklbw %%xmm0, %%xmm1;\n\t" 03057 "punpcklbw %%xmm0, %%xmm2;\n\t" 03058 "punpcklbw %%xmm0, %%xmm3;\n\t" 03059 "punpcklbw %%xmm0, %%xmm4;\n\t" 03060 "punpcklbw %%xmm0, %%xmm1;\n\t" 03061 "punpcklbw %%xmm0, %%xmm2;\n\t" 03062 "punpcklbw %%xmm0, %%xmm3;\n\t" 03063 "punpcklbw %%xmm0, %%xmm4;\n\t" 03064 "movdqu %%xmm1, (%%edi);\n\t" 03065 "movdqu %%xmm2, 16(%%edi);\n\t" 03066 "movdqu %%xmm3, 32(%%edi);\n\t" 03067 "movdqu %%xmm4, 48(%%edi);\n\t" 03068 "addl $16, %%esi;\n\t" 03069 "addl $64, %%edi;\n\t" 03070 "loop .GA2;\n\t" 03071 ".GA4:;\n\t" 
03072 "orl %%edx, %%edx;\n\t" 03073 "jz .GA1;\n\t" 03074 "mov %%edx, %%ecx;\n\t" 03075 ".GA3:;\n\t" 03076 "xorl %%eax, %%eax;\n\t" 03077 "movb (%%esi), %%al;\n\t" 03078 "movl %%eax, (%%edi);\n\t" 03079 "incl %%esi;\n\t" 03080 "addl $4, %%edi;\n\t" 03081 "loop .GA3;\n\t" 03082 ".GA1:;" 03083 : 03084 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03085 :"memory" 03086 ); 03087 03088 03089 } 03090 03091 #endif 03092 03093 #ifdef INVT_USE_MMXSSE2 03094 03095 //###################################################################### 03096 // speedup ~= 1.5 03097 void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz) 03098 { 03099 int32 ecx=sz>>4; 03100 int32 edx=sz&0xf; 03101 03102 asm( 03103 "orl %%ecx, %%ecx;\n\t" 03104 "jz .GB4;\n\t" 03105 ".GB2:;\n\t" 03106 "pxor %%xmm0, %%xmm0;\n\t" 03107 "movdqu 0(%%esi), %%xmm1;\n\t" 03108 "movdqu 4(%%esi), %%xmm2;\n\t" 03109 "movdqu 8(%%esi), %%xmm3;\n\t" 03110 "movdqu 12(%%esi), %%xmm4;\n\t" 03111 "punpcklbw %%xmm0, %%xmm1;\n\t" 03112 "punpcklbw %%xmm0, %%xmm2;\n\t" 03113 "punpcklbw %%xmm0, %%xmm3;\n\t" 03114 "punpcklbw %%xmm0, %%xmm4;\n\t" 03115 "punpcklbw %%xmm0, %%xmm1;\n\t" 03116 "punpcklbw %%xmm0, %%xmm2;\n\t" 03117 "punpcklbw %%xmm0, %%xmm3;\n\t" 03118 "punpcklbw %%xmm0, %%xmm4;\n\t" 03119 "cvtdq2ps %%xmm1, %%xmm1;\n\t" 03120 "cvtdq2ps %%xmm2, %%xmm2;\n\t" 03121 "movups %%xmm1, (%%edi);\n\t" 03122 "movups %%xmm2, 16(%%edi);\n\t" 03123 "cvtdq2ps %%xmm3, %%xmm3;\n\t" 03124 "cvtdq2ps %%xmm4, %%xmm4;\n\t" 03125 "movups %%xmm3, 32(%%edi);\n\t" 03126 "movups %%xmm4, 48(%%edi);\n\t" 03127 "addl $16, %%esi;\n\t" 03128 "addl $64, %%edi;\n\t" 03129 "loop .GB2;\n\t" 03130 ".GB4:;\n\t" 03131 "orl %%edx, %%edx;\n\t" 03132 "jz .GB1;\n\t" 03133 "movl %%edx, %%ecx;\n\t" 03134 ".GB3:;\n\t" 03135 "xorl %%eax, %%eax;\n\t" 03136 "movb (%%esi), %%al;\n\t" 03137 "movd %%eax, %%xmm0;\n\t" 03138 "cvtdq2ps %%xmm0, %%xmm1;\n\t" 03139 "movss %%xmm1, (%%edi);\n\t" 03140 "incl %%esi;\n\t" 03141 "addl $4, %%edi;\n\t" 03142 "loop .GB3;\n\t" 03143 
".GB1:;" 03144 : 03145 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03146 :"memory" 03147 ); 03148 } 03149 03150 03151 03152 //###################################################################### 03153 // speedup ~= 1.15 03154 void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz) 03155 { 03156 int32 ecx=sz>>3; 03157 int32 edx=sz&0x7; 03158 03159 asm( 03160 "orl %%ecx, %%ecx;\n\t" 03161 "jz .GC4;\n\t" 03162 ".GC2:;\n\t" 03163 "pxor %%xmm0, %%xmm0;\n\t" 03164 "movdqu 0(%%esi), %%xmm1;\n\t" 03165 "movdqu 2(%%esi), %%xmm2;\n\t" 03166 "movdqu 4(%%esi), %%xmm3;\n\t" 03167 "movdqu 6(%%esi), %%xmm4;\n\t" 03168 "punpcklbw %%xmm0, %%xmm1;\n\t" 03169 "punpcklbw %%xmm0, %%xmm2;\n\t" 03170 "punpcklbw %%xmm0, %%xmm3;\n\t" 03171 "punpcklbw %%xmm0, %%xmm4;\n\t" 03172 "punpcklbw %%xmm0, %%xmm1;\n\t" 03173 "punpcklbw %%xmm0, %%xmm2;\n\t" 03174 "punpcklbw %%xmm0, %%xmm3;\n\t" 03175 "punpcklbw %%xmm0, %%xmm4;\n\t" 03176 "cvtdq2pd %%xmm1, %%xmm1;\n\t" 03177 "cvtdq2pd %%xmm2, %%xmm2;\n\t" 03178 "movupd %%xmm1, (%%edi);\n\t" 03179 "movupd %%xmm2, 16(%%edi);\n\t" 03180 "cvtdq2pd %%xmm3, %%xmm3;\n\t" 03181 "cvtdq2pd %%xmm4, %%xmm4;\n\t" 03182 "movupd %%xmm3, 32(%%edi);\n\t" 03183 "movupd %%xmm4, 48(%%edi);\n\t" 03184 "addl $8, %%esi;\n\t" 03185 "addl $64, %%edi;\n\t" 03186 "loop .GC2;\n\t" 03187 ".GC4:;\n\t" 03188 "orl %%edx, %%edx;\n\t" 03189 "jz .GC1;\n\t" 03190 "movl %%edx, %%ecx;\n\t" 03191 ".GC3:;\n\t" 03192 "xorl %%eax, %%eax;\n\t" 03193 "movb (%%esi), %%al;\n\t" 03194 "movd %%eax, %%xmm0;\n\t" 03195 "cvtdq2pd %%xmm0, %%xmm1;\n\t" 03196 "movsd %%xmm1, (%%edi);\n\t" 03197 "incl %%esi;\n\t" 03198 "addl $8, %%edi;\n\t" 03199 "loop .GC3;\n\t" 03200 ".GC1:;" 03201 : 03202 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03203 :"memory" 03204 ); 03205 03206 } 03207 03208 03209 03210 //###################################################################### 03211 03212 void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz) 03213 { 03214 int32 ecx=sz>>5; 03215 int32 edx=sz&0x1f; 03216 03217 
asm( 03218 "orl %%ecx, %%ecx;\n\t" 03219 "jz .GD4;\n\t" 03220 ".GD2:;\n\t" 03221 "movdqu 0(%%esi), %%xmm0;\n\t" 03222 "movdqu 16(%%esi), %%xmm1;\n\t" 03223 "movdqu 32(%%esi), %%xmm2;\n\t" 03224 "movdqu 48(%%esi), %%xmm3;\n\t" 03225 "movdqu 64(%%esi), %%xmm4;\n\t" 03226 "movdqu 80(%%esi), %%xmm5;\n\t" 03227 "movdqu 96(%%esi), %%xmm6;\n\t" 03228 "movdqu 112(%%esi), %%xmm7;\n\t" 03229 "cvtdq2ps %%xmm0, %%xmm0;\n\t" 03230 "cvtdq2ps %%xmm1, %%xmm1;\n\t" 03231 "cvtdq2ps %%xmm2, %%xmm2;\n\t" 03232 "cvtdq2ps %%xmm3, %%xmm3;\n\t" 03233 "cvtdq2ps %%xmm4, %%xmm4;\n\t" 03234 "cvtdq2ps %%xmm5, %%xmm5;\n\t" 03235 "cvtdq2ps %%xmm6, %%xmm6;\n\t" 03236 "cvtdq2ps %%xmm7, %%xmm7;\n\t" 03237 "movups %%xmm0, 0(%%edi);\n\t" 03238 "movups %%xmm1, 16(%%edi);\n\t" 03239 "movups %%xmm2, 32(%%edi);\n\t" 03240 "movups %%xmm3, 48(%%edi);\n\t" 03241 "movups %%xmm4, 64(%%edi);\n\t" 03242 "movups %%xmm5, 80(%%edi);\n\t" 03243 "movups %%xmm6, 96(%%edi);\n\t" 03244 "movups %%xmm7, 112(%%edi);\n\t" 03245 "addl $128, %%esi;\n\t" 03246 "addl $128, %%edi;\n\t" 03247 "decl %%ecx;\n\t" 03248 "jnz .GD2;\n\t" 03249 ".GD4:;\n\t" 03250 "orl %%edx, %%edx;\n\t" 03251 "jz .GD1;\n\t" 03252 "movl %%edx, %%ecx;\n\t" 03253 ".GD3:;\n\t" 03254 "movd (%%esi), %%xmm0;\n\t" 03255 "cvtdq2ps %%xmm0, %%xmm0;\n\t" 03256 "movss %%xmm0, (%%edi);\n\t" 03257 "addl $4, %%esi;\n\t" 03258 "addl $4, %%edi;\n\t" 03259 "loop .GD3;\n\t" 03260 ".GD1:;" 03261 : 03262 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03263 :"memory" 03264 ); 03265 03266 } 03267 03268 //###################################################################### 03269 // speedup ~= 1.2 03270 void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz) 03271 { 03272 int32 ecx=sz>>4; 03273 int32 edx=sz&0xf; 03274 03275 asm( 03276 "orl %%ecx, %%ecx;\n\t" 03277 "jz .GE4;\n\t" 03278 ".GE2:;\n\t" 03279 "movdqu 0(%%esi), %%xmm0;\n\t" 03280 "movdqu 8(%%esi), %%xmm1;\n\t" 03281 "movdqu 16(%%esi), %%xmm2;\n\t" 03282 "movdqu 24(%%esi), %%xmm3;\n\t" 03283 "movdqu 32(%%esi), 
%%xmm4;\n\t" 03284 "movdqu 40(%%esi), %%xmm5;\n\t" 03285 "movdqu 48(%%esi), %%xmm6;\n\t" 03286 "movdqu 56(%%esi), %%xmm7;\n\t" 03287 "cvtdq2pd %%xmm0, %%xmm0;\n\t" 03288 "cvtdq2pd %%xmm1, %%xmm1;\n\t" 03289 "cvtdq2pd %%xmm2, %%xmm2;\n\t" 03290 "cvtdq2pd %%xmm3, %%xmm3;\n\t" 03291 "cvtdq2pd %%xmm4, %%xmm4;\n\t" 03292 "cvtdq2pd %%xmm5, %%xmm5;\n\t" 03293 "cvtdq2pd %%xmm6, %%xmm6;\n\t" 03294 "cvtdq2pd %%xmm7, %%xmm7;\n\t" 03295 "movups %%xmm0, 0(%%edi);\n\t" 03296 "movups %%xmm1, 16(%%edi);\n\t" 03297 "movups %%xmm2, 32(%%edi);\n\t" 03298 "movups %%xmm3, 48(%%edi);\n\t" 03299 "movups %%xmm4, 64(%%edi);\n\t" 03300 "movups %%xmm5, 80(%%edi);\n\t" 03301 "movups %%xmm6, 96(%%edi);\n\t" 03302 "movups %%xmm7, 112(%%edi);\n\t" 03303 "addl $64, %%esi;\n\t" 03304 "addl $128, %%edi;\n\t" 03305 "decl %%ecx;\n\t" 03306 "jnz .GE2;\n\t" 03307 ".GE4:;\n\t" 03308 "orl %%edx, %%edx;\n\t" 03309 "jz .GE1;\n\t" 03310 "movl %%edx, %%ecx;\n\t" 03311 ".GE3:;\n\t" 03312 "movd (%%esi), %%xmm0;\n\t" 03313 "cvtdq2pd %%xmm0, %%xmm0;\n\t" 03314 "movsd %%xmm0, (%%edi);\n\t" 03315 "addl $4, %%esi;\n\t" 03316 "addl $8, %%edi;\n\t" 03317 "loop .GE3;\n\t" 03318 ".GE1:;" 03319 : 03320 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03321 :"memory" 03322 ); 03323 03324 } 03325 03326 //###################################################################### 03327 void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz) 03328 { 03329 int32 ecx=sz; 03330 int32 edx=sz; 03331 03332 asm ( 03333 "orl %%ecx, %%ecx;\n\t" 03334 "jz .GF1;\n\t" 03335 ".GF2:;\n\t" 03336 "movdqu 0(%%esi), %%xmm0;\n\t" 03337 "movdqu 8(%%esi), %%xmm1;\n\t" 03338 "movdqu 16(%%esi), %%xmm2;\n\t" 03339 "movdqu 24(%%esi), %%xmm3;\n\t" 03340 "movdqu 32(%%esi), %%xmm4;\n\t" 03341 "movdqu 40(%%esi), %%xmm5;\n\t" 03342 "movdqu 48(%%esi), %%xmm6;\n\t" 03343 "movdqu 56(%%esi), %%xmm7;\n\t" 03344 "cvtps2dq %%xmm0, %%xmm0;\n\t" 03345 "cvtps2dq %%xmm1, %%xmm1;\n\t" 03346 "cvtps2dq %%xmm2, %%xmm2;\n\t" 03347 "cvtps2dq %%xmm3, %%xmm3;\n\t" 03348 "cvtps2dq 
%%xmm4, %%xmm4;\n\t" 03349 "cvtps2dq %%xmm5, %%xmm5;\n\t" 03350 "cvtps2dq %%xmm6, %%xmm6;\n\t" 03351 "cvtps2dq %%xmm7, %%xmm7;\n\t" 03352 "movdqu %%xmm0, 0(%%edi);\n\t" 03353 "movdqu %%xmm1, 16(%%edi);\n\t" 03354 "movdqu %%xmm2, 32(%%edi);\n\t" 03355 "movdqu %%xmm3, 48(%%edi);\n\t" 03356 "movdqu %%xmm4, 64(%%edi);\n\t" 03357 "movdqu %%xmm5, 80(%%edi);\n\t" 03358 "movdqu %%xmm6, 96(%%edi);\n\t" 03359 "movdqu %%xmm7, 112(%%edi);\n\t" 03360 "addl $64, %%esi;\n\t" 03361 "addl $128, %%edi;\n\t" 03362 "decl %%ecx;\n\t" 03363 "jnz .GF2;\n\t" 03364 ".GF4:;\n\t" 03365 "orl %%edx, %%edx;\n\t" 03366 "jz .GF1;\n\t" 03367 "movl %%edx, %%ecx;\n\t" 03368 ".GF3:;\n\t" 03369 "movd (%%esi), %%xmm0;\n\t" 03370 "cvtps2dq %%xmm0, %%xmm0;\n\t" 03371 "movd %%xmm0, (%%edi);\n\t" 03372 "addl $4, %%esi;\n\t" 03373 "addl $8, %%edi;\n\t" 03374 "loop .GF3;\n\t" 03375 ".GF1:;" 03376 : 03377 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03378 :"memory" 03379 ); 03380 03381 } 03382 03383 03384 03385 //###################################################################### 03386 void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz) 03387 { 03388 int32 ecx=sz>>4; 03389 int32 edx=sz&0xf; 03390 03391 asm( 03392 "orl %%ecx, %%ecx;\n\t" 03393 "jz .GG4;\n\t" 03394 ".GG2:;\n\t" 03395 "movups 0(%%esi), %%xmm0;\n\t" 03396 "movups 8(%%esi), %%xmm1;\n\t" 03397 "movups 16(%%esi), %%xmm2;\n\t" 03398 "movups 24(%%esi), %%xmm3;\n\t" 03399 "movups 32(%%esi), %%xmm4;\n\t" 03400 "movups 40(%%esi), %%xmm5;\n\t" 03401 "movups 48(%%esi), %%xmm6;\n\t" 03402 "movups 56(%%esi), %%xmm7;\n\t" 03403 "cvtps2pd %%xmm0, %%xmm0;\n\t" 03404 "cvtps2pd %%xmm1, %%xmm1;\n\t" 03405 "cvtps2pd %%xmm2, %%xmm2;\n\t" 03406 "cvtps2pd %%xmm3, %%xmm3;\n\t" 03407 "cvtps2pd %%xmm4, %%xmm4;\n\t" 03408 "cvtps2pd %%xmm5, %%xmm5;\n\t" 03409 "cvtps2pd %%xmm6, %%xmm6;\n\t" 03410 "cvtps2pd %%xmm7, %%xmm7;\n\t" 03411 "movupd %%xmm0, 0(%%edi);\n\t" 03412 "movupd %%xmm1, 16(%%edi);\n\t" 03413 "movupd %%xmm2, 32(%%edi);\n\t" 03414 "movupd %%xmm3, 
48(%%edi);\n\t" 03415 "movupd %%xmm4, 64(%%edi);\n\t" 03416 "movupd %%xmm5, 80(%%edi);\n\t" 03417 "movupd %%xmm6, 96(%%edi);\n\t" 03418 "movupd %%xmm7, 112(%%edi);\n\t" 03419 "addl $64, %%esi;\n\t" 03420 "addl $128, %%edi;\n\t" 03421 "decl %%ecx;\n\t" 03422 "jnz .GG2;\n\t" 03423 ".GG4:;\n\t" 03424 "orl %%edx, %%edx;\n\t" 03425 "jz .GG1;\n\t" 03426 "movl %%edx, %%ecx;\n\t" 03427 ".GG3:;\n\t" 03428 "movd (%%esi), %%xmm0;\n\t" 03429 "cvtps2pd %%xmm0, %%xmm0;\n\t" 03430 "movsd %%xmm0, (%%edi);\n\t" 03431 "addl $4, %%esi;\n\t" 03432 "addl $8, %%edi;\n\t" 03433 "loop .GG3;\n\t" 03434 ".GG1:;" 03435 : 03436 :"S"(a), "D"(b), "c"(ecx),"d"(edx) 03437 :"memory" 03438 ); 03439 } 03440 03441 #endif 03442 03443 #ifdef INVT_USE_SSE 03444 03445 //###################################################################### 03446 void sse_lowPass3x(const float *a, float *b, const int h, const int w) 03447 { 03448 const float coeffs[] = { 3.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0, 4.0}; 03449 int edx = (w-2)/12; 03450 int eax = (w-2)%12; 03451 03452 asm ( 03453 // "movups 16(%%ebx), %%xmm7;\n\t" 03454 "orl %%ecx, %%ecx;\n\t" 03455 "jz .HA1;\n\t" 03456 ".HA2:;\n\t" 03457 03458 // *dptr++ = (sptr[0]+sptr[0]+sptr[1])/3.0 03459 "movss 0(%%esi), %%xmm1;\n\t" // xmm1 <- sptr[0] 03460 "movss 4(%%esi), %%xmm2;\n\t" // xmm2 <- sptr[1] 03461 "addss %%xmm1, %%xmm1;\n\t" // xmm2 <- sptr[0] + sptr[0] 03462 "addss %%xmm1, %%xmm2;\n\t" // xmm2 <- xmm2 + sptr[1] 03463 "divss (%%ebx), %%xmm2;\n\t" // xmm2 <- xmm2/3.0 03464 "movss %%xmm2, (%%edi);\n\t" // *dptr <- xmm2 03465 "addl $4, %%edi;\n\t" // ++dptr 03466 03467 // for (int i = 0; i < w - 2; i ++) 03468 "orl %%edx, %%edx;\n\t" 03469 "jz .HA4;\n\t" 03470 03471 "pushl %%edx;\n\t" 03472 ".HA3:;\n\t" 03473 "movups 00(%%esi), %%xmm0;\n\t" 03474 "movups 04(%%esi), %%xmm1;\n\t" 03475 "movups 8(%%esi), %%xmm2;\n\t" 03476 "movups 16(%%esi), %%xmm3;\n\t" 03477 "movups 20(%%esi), %%xmm4;\n\t" 03478 "movups 24(%%esi), %%xmm5;\n\t" 03479 "movups 32(%%esi), %%xmm6;\n\t" 
03480 "movups 36(%%esi), %%xmm7;\n\t" 03481 "addps %%xmm1, %%xmm0;\n\t" 03482 "addps %%xmm4, %%xmm3;\n\t" 03483 "addps %%xmm1, %%xmm0;\n\t" 03484 "addps %%xmm4, %%xmm3;\n\t" 03485 "movups 40(%%esi), %%xmm1;\n\t" 03486 "addps %%xmm7, %%xmm6;\n\t" 03487 "addps %%xmm2, %%xmm0;\n\t" 03488 "addps %%xmm1, %%xmm6;\n\t" 03489 "addps %%xmm5, %%xmm3;\n\t" 03490 "addps %%xmm7, %%xmm6;\n\t" 03491 "divps 16(%%ebx ), %%xmm0;\n\t" 03492 "divps 16(%%ebx ), %%xmm3;\n\t" 03493 "divps 16(%%ebx ), %%xmm6;\n\t" 03494 "movups %%xmm0, (%%edi);\n\t" 03495 "movups %%xmm3, 16(%%edi);\n\t" 03496 "movups %%xmm6, 32(%%edi);\n\t" 03497 "addl $48, %%esi;\n\t" 03498 "addl $48, %%edi;\n\t" 03499 "decl %%edx;\n\t" 03500 "jnz .HA3;\n\t" 03501 "popl %%edx;\n\t" 03502 ".HA4:;\n\t" 03503 03504 "orl %%eax, %%eax;\n\t" 03505 "jz .HA6;\n\t" 03506 "pushl %%eax;\n\t" 03507 ".HA5:;\n\t" 03508 "movss 00(%%esi), %%xmm0;\n\t" 03509 "movss 04(%%esi), %%xmm1;\n\t" 03510 "movss 8(%%esi), %%xmm2;\n\t" 03511 "addps %%xmm1, %%xmm0;\n\t" 03512 "addps %%xmm1, %%xmm2;\n\t" 03513 "addps %%xmm2, %%xmm0;\n\t" 03514 "divss 16(%%ebx ), %%xmm0;\n\t" 03515 "movss %%xmm0, (%%edi);\n\t" 03516 "addl $4, %%esi;\n\t" 03517 "addl $4, %%edi;\n\t" 03518 "decl %%eax;\n\t" 03519 "jnz .HA5;\n\t" 03520 "popl %%eax;\n\t" 03521 03522 ".HA6:;\n\t" 03523 "movss (%%esi), %%xmm1;\n\t" // xmm1 <- sptr[0] 03524 "movss 4(%%esi), %%xmm2;\n\t" // xmm2 <- sptr[1] 03525 "addss %%xmm2, %%xmm2;\n\t" // xmm2 <- sptr[0] + sptr[1] 03526 "addss %%xmm1, %%xmm2;\n\t" // xmm2 <- xmm2 + sptr[0] 03527 "divss 0(%%ebx), %%xmm2;\n\t" // xmm2 <- xmm2/3.0 03528 03529 "movss %%xmm2, (%%edi);\n\t" // *dptr <- xmm2 03530 "addl $4, %%edi;\n\t" // ++dptr 03531 "addl $8, %%esi;\n\t" // sptr += 2 03532 "decl %%ecx;\n\t" 03533 "jnz .HA2;\n\t" 03534 ".HA1:;\n\t" 03535 : 03536 :"S"(a), "D"(b),"c"(h),"a"(eax),"d"(edx),"b"(coeffs) 03537 :"memory" 03538 ); 03539 03540 } 03541 03542 03543 03544 03545 //###################################################################### 03546 
03547 void sse_lowPass3y(const float *a, float *b, const int h, const int w) 03548 { 03549 const float coeffs[] = { 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0}; 03550 03551 if (h < 2){ 03552 memcpy(b, a, w*h*sizeof(b[0])); 03553 return; // nothing to smooth 03554 } 03555 03556 if (h < 2){ 03557 memcpy(b, a, w*h*sizeof(b[0])); 03558 return; // nothing to smooth 03559 } 03560 03561 asm ( 03562 // top row 03563 "movl %%edx, %%ecx;\n\t" 03564 "orl %%ecx, %%ecx;\n\t" 03565 "jz .HU1;\n\t" 03566 "push %%esi;\n\t" 03567 ".HU0:;\n\t" 03568 "movss (%%esi), %%xmm0;\n\t" // xmm0 <- sptr[0] 03569 "movss (%%esi, %%edx, 4), %%xmm1;\n\t" //xmm1 <- sptr[w] 03570 "addss %%xmm0, %%xmm0;\n\t" 03571 "addss %%xmm1, %%xmm0;\n\t" 03572 "divss (%%ebx), %%xmm0;\n\t" 03573 "addl $4, %%esi;\n\t" 03574 "movss %%xmm0, (%%edi);\n\t" 03575 "addl $4, %%edi;\n\t" 03576 "decl %%ecx;\n\t" 03577 "jnz .HU0;\n\t" 03578 "popl %%esi;\n\t" 03579 ".HU1:;\n\t" 03580 "cmpl $2, %%eax;\n\t" 03581 "jle .HU5;\n\t" 03582 03583 "pushl %%eax;\n\t" 03584 "subl $2, %%eax;\n\t" 03585 "jle .HU4;\n\t" 03586 ".HU2:;\n\t" 03587 "movl %%edx, %%ecx;\n\t" 03588 "pushl %%edx;\n\t" 03589 ".HU3:;\n\t" 03590 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- sptr[0] 03591 "movss (%%esi,%%edx,4), %%xmm1;\n\t" //xmm1 <- sptr[w] 03592 "movss (%%esi,%%edx,8), %%xmm2;\n\t" //xmm2 <- sptr[2*w] 03593 "addss %%xmm1, %%xmm0;\n\t" 03594 "addss %%xmm1, %%xmm2;\n\t" 03595 "addss %%xmm2, %%xmm0;\n\t" 03596 "divss 16(%%ebx), %%xmm0;\n\t" 03597 "movss %%xmm0, (%%edi);\n\t" 03598 "addl $4, %%esi;\n\t" 03599 "addl $4, %%edi;\n\t" 03600 "decl %%ecx;\n\t" 03601 "jnz .HU3;\n\t" 03602 "popl %%edx;\n\t" 03603 "decl %%eax;\n\t" 03604 "jnz .HU2;\n\t" 03605 03606 ".HU4:;\n\t" 03607 "popl %%eax;\n\t" 03608 ".HU5:;\n\t" 03609 "orl %%edx, %%edx;\n\t" 03610 "jz .HU7;\n\t" 03611 "pushl %%edx;\n\t" 03612 "movl %%edx, %%ecx;\n\t" 03613 ".HU6:;\n\t" 03614 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- sptr[0] 03615 "movss (%%esi,%%ecx,4), %%xmm1;\n\t" //xmm1 <- sptr[w] 03616 "addss 
%%xmm1, %%xmm1;\n\t" 03617 "addss %%xmm1, %%xmm0;\n\t" 03618 "divss (%%ebx), %%xmm0;\n\t" 03619 "movss %%xmm0, (%%edi);\n\t" 03620 "addl $4, %%esi;\n\t" 03621 "addl $4, %%edi;\n\t" 03622 "decl %%edx;\n\t" 03623 "jnz .HU6;\n\t" 03624 "popl %%edx;\n\t" 03625 ".HU7:;\n\t" 03626 : 03627 :"S"(a),"D"(b),"a"(h),"d"(w),"b"(coeffs) 03628 ); 03629 03630 } 03631 03632 03633 //###################################################################### 03634 03635 void sse_lowPass5x(const float *src, float *dest, const int h, const int w) 03636 { 03637 const float *sptr= src; 03638 float *dptr= dest; 03639 03640 if(w<2) 03641 { 03642 memcpy(dest,src,h*w*sizeof(dest[0])); 03643 return; 03644 } 03645 03646 if (w == 2) ////////////////////////////////////////////////// 03647 for (int j = 0; j < h; j ++) 03648 { 03649 // leftmost point [ (6^) 4 ] / 10 03650 *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F); 03651 03652 // rightmost point [ 4^ (6) ] / 10 03653 *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F); 03654 03655 sptr += 2; // sptr back to same position as dptr 03656 } 03657 else if (w == 3) ////////////////////////////////////////////////// 03658 for (int j = 0; j < h; j ++) 03659 { 03660 // leftmost point [ (6^) 4 1 ] / 11 03661 *dptr++ = sptr[0] * (6.0F / 11.0F) + 03662 sptr[1] * (4.0F / 11.0F) + 03663 sptr[2] * (1.0F / 11.0F); 03664 03665 // middle point [ 4^ (6) 4 ] / 14 03666 *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) + 03667 sptr[1] * (6.0F / 14.0F); 03668 03669 // rightmost point [ 1^ 4 (6) ] / 11 03670 *dptr++ = sptr[0] * (1.0F / 11.0F) + 03671 sptr[1] * (4.0F / 11.0F) + 03672 sptr[2] * (6.0F / 11.0F); 03673 03674 sptr += 3; // sptr back to same position as dptr 03675 } 03676 else 03677 if(w>3) 03678 { 03679 const float coeffs[] = {6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0, 03680 4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0, 03681 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0, 03682 4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0, 03683 6.0/16.0, 6.0/16.0, 6.0/16.0, 
6.0/16.0, 03684 1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0, 03685 1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0 03686 }; 03687 03688 int eax= (w-4)&3; 03689 int edx= (w-4)>>2; 03690 03691 asm( 03692 "orl %%ecx, %%ecx;\n\t" // ecx <- h 03693 "jz .HG6;\n\t" 03694 ".HG0:;\n\t" 03695 "movss (%%esi), %%xmm0;\n\t" // xmm0 <- s[0] 03696 "movss 4(%%esi), %%xmm2;\n\t" // xmm2 <- s[1] 03697 "movss 8(%%esi), %%xmm4;\n\t" // xmm4 <- s[2] 03698 "movss 12(%%esi), %%xmm6;\n\t" // xmm6 <- s[3] 03699 "movss %%xmm0, %%xmm1;\n\t" // xmm1 <- s[0] 03700 "movss %%xmm2, %%xmm3;\n\t" // xmm3 <- s[1] 03701 "movss %%xmm4, %%xmm5;\n\t" // xmm5 <- s[2] 03702 "mulss (%%ebx), %%xmm0;\n\t" // xmm0 <- 6.0/11.0*s[0] 03703 "mulss 4(%%ebx), %%xmm2;\n\t" // xmm2 <- 4.0/11.0*s[1] 03704 "mulss 8(%%ebx), %%xmm4;\n\t" // xmm4 <- 1.0/11.0*s[2] 03705 "addss %%xmm5, %%xmm1;\n\t" // xmm1 <- s[2]+s[0] 03706 "mulss 16(%%ebx), %%xmm1;\n\t" // xmm1 <- (s2+s0)*4.0/15.0 03707 "mulss 20(%%ebx), %%xmm3;\n\t" 03708 "mulss 24(%%ebx), %%xmm6;\n\t" 03709 "addss %%xmm2, %%xmm0;\n\t" 03710 "addss %%xmm3, %%xmm1;\n\t" 03711 "addss %%xmm4, %%xmm0;\n\t" 03712 "addss %%xmm6, %%xmm1;\n\t" 03713 "movss %%xmm0, (%%edi);\n\t" 03714 "movss %%xmm1, 4(%%edi);\n\t" 03715 "addl $8, %%edi;\n\t" 03716 03717 "orl %%edx, %%edx;\n\t" 03718 "jz .HG5;\n\t" 03719 03720 "pushl %%edx;\n\t" // edx <- (w-4)/4 03721 "movups 32(%%ebx), %%xmm5;\n\t" // xmm5 <- 1.0/16.0 1.0/16.0 1.0/16 1.0/16 03722 "movups 48(%%ebx), %%xmm6;\n\t" // xmm6 <- 4.0/16.0 ...................... 03723 "movups 64(%%ebx), %%xmm7;\n\t" // xmm7 <- 6.0/16.0 ...................... 
03724 ".HG1:;\n\t" 03725 "movups 0(%%esi), %%xmm0;\n\t" // xmm0 <- s0 s1 s2 s3 03726 "movups 04(%%esi), %%xmm1;\n\t" // xmm1 <- s1 s2 s3 s4 03727 "movups 8(%%esi), %%xmm2;\n\t" // xmm2 <- s2 s3 s4 s5 03728 "movups 12(%%esi), %%xmm3;\n\t" // xmm3 <- s3 s4 s5 s6 03729 "movups 16(%%esi), %%xmm4;\n\t" // xmm4 <- s4 s5 s6 s7 03730 "addps %%xmm4, %%xmm0;\n\t" 03731 "addps %%xmm3, %%xmm1;\n\t" 03732 "mulps %%xmm5, %%xmm0;\n\t" 03733 "mulps %%xmm6, %%xmm1;\n\t" 03734 "mulps %%xmm7, %%xmm2;\n\t" 03735 "addps %%xmm1, %%xmm0;\n\t" 03736 "addps %%xmm2, %%xmm0;\n\t" 03737 "movups %%xmm0, (%%edi);\n\t" 03738 "addl $16, %%esi;\n\t" 03739 "addl $16, %%edi;\n\t" 03740 "decl %%edx;\n\t" 03741 "jnz .HG1;\n\t" 03742 "popl %%edx;\n\t" 03743 03744 ".HG5:;\n\t" 03745 "orl %%eax, %%eax;\n\t" 03746 "jz .HG3;\n\t" 03747 "pushl %%eax;\n\t" // eax <- (w-4)%4 03748 "movups 32(%%ebx), %%xmm5;\n\t" 03749 "movups 48(%%ebx), %%xmm6;\n\t" 03750 "movups 64(%%ebx), %%xmm7;\n\t" 03751 ".HG2:;\n\t" 03752 "movss (%%esi), %%xmm0;\n\t" 03753 "movss 4(%%esi), %%xmm1;\n\t" 03754 "movss 8(%%esi), %%xmm2;\n\t" 03755 "movss 12(%%esi), %%xmm3;\n\t" 03756 "movss 16(%%esi), %%xmm4;\n\t" 03757 "mulss %%xmm5 , %%xmm0;\n\t" 03758 "mulss %%xmm6 , %%xmm1;\n\t" 03759 "mulss %%xmm7 , %%xmm2;\n\t" 03760 "mulss %%xmm6 , %%xmm3;\n\t" 03761 "mulss %%xmm5 , %%xmm4;\n\t" 03762 "addss %%xmm1, %%xmm0;\n\t" 03763 "addss %%xmm3, %%xmm2;\n\t" 03764 "addss %%xmm4, %%xmm0;\n\t" 03765 "addss %%xmm2, %%xmm0;\n\t" 03766 "addl $4, %%esi;\n\t" 03767 "movss %%xmm0, (%%edi);\n\t" 03768 "addl $4, %%edi;\n\t" 03769 "decl %%eax;\n\t" 03770 "jnz .HG2;\n\t" 03771 "popl %%eax;\n\t" 03772 ".HG3:;\n\t" 03773 "movss (%%esi), %%xmm0;\n\t" // xmm0 <- s0 03774 "movss 4(%%esi), %%xmm1;\n\t" // xmm1 <- s1 03775 "movss 8(%%esi), %%xmm2;\n\t" // xmm2 <- s2 03776 "movss 12(%%esi), %%xmm3;\n\t" // xmm3 <- s3 03777 "movss %%xmm1, %%xmm4;\n\t" // xmm4 <- s1 03778 "movss %%xmm2, %%xmm5;\n\t" // xmm5 <- s2 03779 "movss %%xmm3, %%xmm6;\n\t" // xmm6 <- s3 03780 
"addps %%xmm1, %%xmm3;\n\t" // xmm3 <- s1+s3 03781 "mulss 80(%%ebx), %%xmm0;\n\t" // xmm0 <- 1.0/15.0*s0 03782 "mulss 84(%%ebx), %%xmm3;\n\t" // xmm3 <- 4.0/15.0*(s1+s3) 03783 "mulss 88(%%ebx), %%xmm2;\n\t" // xmm2 <- 6.0/15.0*s2 03784 "addss %%xmm3, %%xmm0;\n\t" 03785 "addss %%xmm2, %%xmm0;\n\t" 03786 "movss %%xmm0, (%%edi);\n\t" 03787 "mulss 96(%%ebx), %%xmm4;\n\t" 03788 "mulss 100(%%ebx), %%xmm5;\n\t" 03789 "mulss 104(%%ebx), %%xmm6;\n\t" 03790 "addss %%xmm5, %%xmm4;\n\t" 03791 "addss %%xmm6, %%xmm4;\n\t" 03792 "movss %%xmm4, 4(%%edi);\n\t" 03793 "addl $16, %%esi;\n\t" 03794 "addl $8, %%edi;\n\t" 03795 "decl %%ecx;\n\t" 03796 "jnz .HG0;\n\t" 03797 ".HG6:;\n\t" 03798 : 03799 :"S"(sptr),"D"(dptr),"a"(eax),"b"(coeffs),"c"(h),"d"(edx) 03800 :"memory" 03801 ); 03802 } 03803 03804 } 03805 03806 03807 03808 //###################################################################### 03809 03810 void sse_lowPass5y(const float *src, float *dest, const int h, 03811 const int w) 03812 { 03813 if (h < 2){ 03814 memcpy(dest, src, h*w*sizeof(dest[0])); 03815 return; // nothing to smooth 03816 } 03817 03818 const float *sptr= src; 03819 float *dptr= dest; 03820 03821 // ########## vertical pass (even though we scan horiz for speedup) 03822 const int w2 = w * 2; // speedup 03823 03824 03825 if (h == 2) ////////////////////////////////////////////////// 03826 { 03827 // topmost points ( [ (6^) 4 ] / 10 )^T 03828 for (int i = 0; i < w; i ++) 03829 { 03830 *dptr++ = sptr[0] * (6.0F / 10.0F) + 03831 sptr[w] * (4.0F / 10.0F); 03832 sptr++; 03833 } 03834 sptr -= w; // go back to top-left 03835 03836 // bottommost points ( [ 4^ (6) ] / 10 )^T 03837 for (int i = 0; i < w; i ++) 03838 { 03839 *dptr++ = sptr[0] * (4.0F / 10.0F) + 03840 sptr[w] * (6.0F / 10.0F); 03841 sptr++; 03842 } 03843 } 03844 else if (h == 3) ////////////////////////////////////////////////// 03845 { 03846 // topmost points ( [ (6^) 4 1 ] / 11 )^T 03847 for (int i = 0; i < w; i ++) 03848 { 03849 *dptr++ = sptr[ 0] * 
(6.0F / 11.0F) + 03850 sptr[ w] * (4.0F / 11.0F) + 03851 sptr[w2] * (1.0F / 11.0F); 03852 sptr++; 03853 } 03854 sptr -= w; // go back to top-left 03855 03856 // middle points ( [ 4^ (6) 4 ] / 14 )^T 03857 for (int i = 0; i < w; i ++) 03858 { 03859 *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) + 03860 sptr[ w] * (6.0F / 14.0F); 03861 sptr++; 03862 } 03863 sptr -= w; // go back to top-left 03864 03865 // bottommost points ( [ 1^ 4 (6) ] / 11 )^T 03866 for (int i = 0; i < w; i ++) 03867 { 03868 *dptr++ = sptr[ 0] * (1.0F / 11.0F) + 03869 sptr[ w] * (4.0F / 11.0F) + 03870 sptr[w2] * (6.0F / 11.0F); 03871 sptr++; 03872 } 03873 } 03874 else ///////////////////////////////// general case for height >= 4 03875 { 03876 // topmost points ( [ (6^) 4 1 ] / 11 )^T 03877 03878 static const float coeffs[] = { 03879 6.0/11.0, 6.0/11.0, 6.0/11.0, 6.0/11.0, //0 03880 4.0/11.0, 4.0/11.0, 4.0/11.0, 4.0/11.0, //16 03881 1.0/11.0, 1.0/11.0, 1.0/11.0, 1.0/11.0, //32 03882 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, //48 03883 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, //64 03884 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, //80 03885 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0, //96 03886 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, //112 03887 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F //128 03888 }; 03889 03890 int ecx=h-4; 03891 int edx=w>>2; 03892 int eax=w&3; 03893 03894 asm ( 03895 "pushl %%ebp;\n\t" 03896 "movl %0, %%ebp;\n\t" 03897 "addl %%ebp, %%ebp;\n\t" 03898 "addl %%ebp, %%ebp;\n\t" 03899 03900 // 1st loop 03901 "movups (%%ebx), %%xmm4;\n\t" //xmm4 <- 6.0/11.0 ... 
03902 "movups 16(%%ebx), %%xmm5;\n\t" //xmm5 <- 4.0/11.0 03903 "movups 32(%%ebx), %%xmm6;\n\t" //xmm6 <- 1.0/11.0 03904 "pushl %%esi;\n\t" 03905 "orl %%edx, %%edx;\n\t" 03906 "jz .IA1;\n\t" 03907 ".align 4;\n\t" 03908 "pushl %%edx;\n\t" 03909 ".IA0:;\n\t" 03910 ".align 4;\n\t" 03911 "movups (%%esi), %%xmm0;\n\t" //xmm0 <- s0 s0 s0 s0 03912 "movups (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 03913 "movups (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 03914 "mulps %%xmm4, %%xmm0;\n\t" 03915 "mulps %%xmm5, %%xmm1;\n\t" 03916 "mulps %%xmm6, %%xmm2;\n\t" 03917 "addps %%xmm1, %%xmm0;\n\t" 03918 "addps %%xmm2, %%xmm0;\n\t" 03919 "movups %%xmm0, (%%edi);\n\t" 03920 "addl $16, %%esi;\n\t" 03921 "addl $16, %%edi;\n\t" 03922 "decl %%edx;\n\t" 03923 "jnz .IA0;\n\t" 03924 "popl %%edx;\n\t" 03925 ".IA1:;\n\t" 03926 ".align 4;\n\t" 03927 "orl %%eax, %%eax;\n\t" 03928 "jz .IA3;\n\t" 03929 "pushl %%eax;\n\t" 03930 ".IA2:;\n\t" 03931 ".align 4;\n\t" 03932 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 03933 "movss (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW+3 sW+2 sW+1 sW 03934 "movss (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sP+3 sP+3 sP+1 sP 03935 "mulss %%xmm4, %%xmm0;\n\t" 03936 "mulss %%xmm5, %%xmm1;\n\t" 03937 "mulss %%xmm6, %%xmm2;\n\t" 03938 "addss %%xmm1, %%xmm0;\n\t" 03939 "addss %%xmm2, %%xmm0;\n\t" 03940 "movss %%xmm0, (%%edi);\n\t" 03941 "addl $4, %%esi;\n\t" 03942 "addl $4, %%edi;\n\t" 03943 "decl %%eax;\n\t" 03944 "jnz .IA2;\n\t" 03945 "popl %%eax;\n\t" 03946 ".IA3:;\n\t" 03947 "popl %%esi;\n\t" // restore sptr 03948 03949 // 2nd loop 03950 "movups 48(%%ebx), %%xmm4;\n\t" //xmm4 <- 4.0/15.0 03951 "movups 64(%%ebx), %%xmm5;\n\t" //xmm5 <- 6.0/15.0 03952 "movups 80(%%ebx), %%xmm6;\n\t" //xmm6 <- 1.0/15.0 03953 "pushl %%esi;\n\t" 03954 "orl %%edx, %%edx;\n\t" 03955 "jz .IA5;\n\t" 03956 "pushl %%edx;\n\t" 03957 "pushl %%eax;\n\t" 03958 "movl %%ebp, %%eax;\n\t" 03959 "addl %%ebp, %%eax;\n\t" 03960 "addl %%ebp, %%eax;\n\t" 03961 ".IA4:;\n\t" 03962 "movups 
(%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 03963 "movups (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 03964 "movups (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 03965 "movups (%%esi,%%eax,1), %%xmm3;\n\t" //xmm3 <- sW3 sW3 sW3 sW3 03966 "addps %%xmm2, %%xmm0;\n\t" 03967 "mulps %%xmm4, %%xmm0;\n\t" 03968 "mulps %%xmm5, %%xmm1;\n\t" 03969 "mulps %%xmm6, %%xmm3;\n\t" 03970 "addps %%xmm1, %%xmm0;\n\t" 03971 "addps %%xmm3, %%xmm0;\n\t" 03972 "movups %%xmm0, (%%edi);\n\t" 03973 "addl $16, %%esi;\n\t" 03974 "addl $16, %%edi;\n\t" 03975 "decl %%edx;\n\t" 03976 "jnz .IA4;\n\t" 03977 "popl %%eax;\n\t" 03978 "popl %%edx;\n\t" 03979 ".IA5:;\n\t" 03980 "orl %%eax, %%eax;\n\t" 03981 "jz .IA7;\n\t" 03982 "pushl %%eax;\n\t" 03983 "pushl %%edx;\n\t" 03984 "movl %%ebp, %%edx;\n\t" 03985 "addl %%ebp, %%edx;\n\t" 03986 "addl %%ebp, %%edx;\n\t" 03987 ".IA6:;\n\t" 03988 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 03989 "movss (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 03990 "movss (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 03991 "movss (%%esi,%%edx,1), %%xmm3;\n\t" //xmm3 <- sW3 sW3 sW3 sW3 03992 "addss %%xmm2, %%xmm0;\n\t" 03993 "mulss %%xmm4, %%xmm0;\n\t" 03994 "mulss %%xmm5, %%xmm1;\n\t" 03995 "mulss %%xmm6, %%xmm3;\n\t" 03996 "addss %%xmm1, %%xmm0;\n\t" 03997 "addss %%xmm3, %%xmm0;\n\t" 03998 "movss %%xmm0, (%%edi);\n\t" 03999 "addl $4, %%esi;\n\t" 04000 "addl $4, %%edi;\n\t" 04001 "decl %%eax;\n\t" 04002 "jnz .IA6;\n\t" 04003 "popl %%edx;\n\t" 04004 "popl %%eax;\n\t" 04005 ".IA7:;\n\t" 04006 "popl %%esi;\n\t" // restore sptr 04007 04008 04009 // the double loops 04010 "orl %%ecx, %%ecx;\n\t" 04011 "jz .IA29;\n\t" 04012 "pushl %%ecx;\n\t" 04013 "movups 96(%%ebx), %%xmm5;\n\t" // xmm5 <- 1.0/16.0 04014 "movups 112(%%ebx), %%xmm6;\n\t" // xmm6 <- 4.0/16.0 04015 "movups 128(%%ebx), %%xmm7;\n\t" // xmm7 <- 6.0/16.0 04016 ".IA8:;\n\t" 04017 "orl %%edx, %%edx;\n\t" 04018 "jz .IA10;\n\t" 04019 "pushl %%edx;\n\t" 04020 "pushl %%eax;\n\t" 04021 
"movl %%ebp, %%eax;\n\t" 04022 "addl %%ebp, %%eax;\n\t" 04023 "addl %%ebp, %%eax;\n\t" // eax <- 3*W 04024 ".IA9:;\n\t" 04025 "movups (%%esi), %%xmm0;\n\t" // xmm0 <- s s s s 04026 "movups (%%esi,%%ebp,1), %%xmm1;\n\t" // xmm1 <- sW sW sW sW 04027 "movups (%%esi,%%ebp,2), %%xmm2;\n\t" // xmm2 <- sW2 sW2 sW2 sW2 04028 "movups (%%esi,%%eax,1), %%xmm3;\n\t" // xmm3 <- sW3 sW3 sW3 sW3 04029 "movups (%%esi,%%ebp,4), %%xmm4;\n\t" // xmm4 <- sW4 sW4 sW4 sW4 04030 "addps %%xmm3, %%xmm1;\n\t" // xmm1 <- sW3 + sW1 04031 "addps %%xmm4, %%xmm0;\n\t" // xmm0 <- s0 + sW4 04032 "mulps %%xmm6, %%xmm1;\n\t" // xmm1 <- 4.0/16.0*(sW3+sW1) 04033 "mulps %%xmm5, %%xmm0;\n\t" // xmm0 <- 1.0/16.08(s0 +sW4) 04034 "mulps %%xmm7, %%xmm2;\n\t" // xmm2 <- 6.0/16.0*sW2 04035 "addps %%xmm1, %%xmm0;\n\t" 04036 "addps %%xmm2, %%xmm0;\n\t" 04037 "addl $16, %%esi;\n\t" 04038 "movups %%xmm0, (%%edi);\n\t" 04039 "addl $16, %%edi;\n\t" 04040 "decl %%edx;\n\t" 04041 "jnz .IA9;\n\t" 04042 "popl %%eax;\n\t" 04043 "popl %%edx;\n\t" 04044 ".IA10:;\n\t" 04045 "orl %%eax, %%eax;\n\t" 04046 "jz .IA12;\n\t" 04047 "pushl %%eax;\n\t" 04048 "pushl %%edx;\n\t" 04049 "movl %%ebp, %%edx;\n\t" 04050 "addl %%ebp, %%edx;\n\t" 04051 "addl %%ebp, %%edx;\n\t" 04052 ".IA11:;\n\t" 04053 "movss (%%esi), %%xmm0;\n\t" // xmm0 <- s s s s 04054 "movss (%%esi,%%ebp,1), %%xmm1;\n\t" // xmm1 <- sW sW sW sW 04055 "movss (%%esi,%%ebp,2), %%xmm2;\n\t" // xmm2 <- sW2 sW2 sW2 sW2 04056 "movss (%%esi,%%edx,1), %%xmm3;\n\t" // xmm3 <- sW3 sW3 sW3 sW3 04057 "movss (%%esi,%%ebp,4), %%xmm4;\n\t" // xmm4 <- sW4 sW4 sW4 sW4 04058 "addss %%xmm3, %%xmm1;\n\t" 04059 "addss %%xmm4, %%xmm0;\n\t" 04060 "mulss %%xmm6, %%xmm1;\n\t" 04061 "mulss %%xmm5, %%xmm0;\n\t" 04062 "mulss %%xmm7, %%xmm2;\n\t" 04063 "addss %%xmm1, %%xmm0;\n\t" 04064 "addss %%xmm2, %%xmm0;\n\t" 04065 "addl $4, %%esi;\n\t" 04066 "movss %%xmm0, (%%edi);\n\t" 04067 "addl $4, %%edi;\n\t" 04068 "decl %%eax;\n\t" 04069 "jnz .IA11;\n\t" 04070 "popl %%edx;\n\t" 04071 "popl %%eax;\n\t" 
04072 ".IA12:;\n\t" 04073 "decl %%ecx;\n\t" 04074 "jnz .IA8;\n\t" 04075 "popl %%ecx;\n\t" 04076 ".IA29:;\n\t" 04077 04078 // fourth loop 04079 "movups 48(%%ebx), %%xmm4;\n\t" //xmm4 <- 4.0/15.0 04080 "movups 64(%%ebx), %%xmm5;\n\t" //xmm5 <- 6.0/15.0 04081 "movups 80(%%ebx), %%xmm6;\n\t" //xmm6 <- 1.0/15.0 04082 "orl %%edx, %%edx;\n\t" 04083 "jz .IA14;\n\t" 04084 "pushl %%edx;\n\t" 04085 "pushl %%eax;\n\t" 04086 "movl %%ebp, %%eax;\n\t" 04087 "addl %%ebp, %%eax;\n\t" 04088 "addl %%ebp, %%eax;\n\t" 04089 ".IA13:;\n\t" 04090 "movups (%%esi), %%xmm0;\n\t" //xmm0 <- s0 s0 s0 s0 04091 "movups (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW1 sW1 sW1 sW1 04092 "movups (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 04093 "movups (%%esi,%%eax,1),%%xmm3;\n\t" //xmm3 <- sW3 sW3 sW3 sW3 04094 "addps %%xmm3, %%xmm1;\n\t" //xmm1 <- sW3 + sW1 04095 "mulps %%xmm6, %%xmm0;\n\t" //xmm0 <- 1.0/15.0 * s0 04096 "mulps %%xmm5, %%xmm2;\n\t" //xmm2 <- 6.0/15.0 * sW2 04097 "mulps %%xmm4, %%xmm1;\n\t" //xmm4 <- 4.0/15.0 * (sW3+sW1) 04098 "addps %%xmm2, %%xmm0;\n\t" 04099 "addps %%xmm1, %%xmm0;\n\t" 04100 "movups %%xmm0, (%%edi);\n\t" 04101 "addl $16, %%esi;\n\t" 04102 "addl $16, %%edi;\n\t" 04103 "decl %%edx;\n\t" 04104 "jnz .IA13;\n\t" 04105 "popl %%eax;\n\t" 04106 "popl %%edx;\n\t" 04107 ".IA14:;\n\t" 04108 "orl %%eax, %%eax;\n\t" 04109 "jz .IA16;\n\t" 04110 "pushl %%eax;\n\t" 04111 "pushl %%edx;\n\t" 04112 "movl %%ebp, %%edx;\n\t" 04113 "addl %%ebp, %%edx;\n\t" 04114 "addl %%ebp, %%edx;\n\t" 04115 ".IA15:;\n\t" 04116 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 04117 "movss (%%esi, %%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 04118 "movss (%%esi, %%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 04119 "movss (%%esi, %%edx,1), %%xmm3;\n\t" //xmm3 <- sW3 sW3 sW3 sW3 04120 "addss %%xmm3, %%xmm1;\n\t" 04121 "mulss %%xmm6, %%xmm0;\n\t" 04122 "mulss %%xmm5, %%xmm2;\n\t" 04123 "mulss %%xmm4, %%xmm1;\n\t" 04124 "addss %%xmm2, %%xmm0;\n\t" 04125 "addss %%xmm1, %%xmm0;\n\t" 04126 "movss 
%%xmm0, (%%edi);\n\t" 04127 "addl $4, %%esi;\n\t" 04128 "addl $4, %%edi;\n\t" 04129 "decl %%eax;\n\t" 04130 "jnz .IA15;\n\t" 04131 "popl %%edx;\n\t" 04132 "popl %%eax;\n\t" 04133 ".IA16:;\n\t" 04134 04135 // final loop 04136 "movups 32(%%ebx), %%xmm4;\n\t" 04137 "movups 16(%%ebx), %%xmm5;\n\t" 04138 "movups (%%ebx), %%xmm6;\n\t" 04139 "orl %%edx, %%edx;\n\t" 04140 "jz .IA18;\n\t" 04141 "pushl %%edx;\n\t" 04142 ".IA17:;\n\t" 04143 "movups (%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 04144 "movups (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 04145 "movups (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 04146 "mulps %%xmm4, %%xmm0;\n\t" 04147 "mulps %%xmm5, %%xmm1;\n\t" 04148 "mulps %%xmm6, %%xmm2;\n\t" 04149 "addps %%xmm1, %%xmm0;\n\t" 04150 "addps %%xmm2, %%xmm0;\n\t" 04151 "movups %%xmm0, (%%edi);\n\t" 04152 "addl $16, %%esi;\n\t" 04153 "addl $16, %%edi;\n\t" 04154 "decl %%edx;\n\t" 04155 "jnz .IA17;\n\t" 04156 "popl %%edx;\n\t" 04157 ".IA18:;\n\t" 04158 "orl %%eax, %%eax;\n\t" 04159 "jz .IA20;\n\t" 04160 "pushl %%eax;\n\t" 04161 ".IA19:;\n\t" 04162 "movss (%%esi), %%xmm0;\n\t" //xmm0 <- s3 s2 s1 s0 04163 "movss (%%esi,%%ebp,1), %%xmm1;\n\t" //xmm1 <- sW sW sW sW 04164 "movss (%%esi,%%ebp,2), %%xmm2;\n\t" //xmm2 <- sW2 sW2 sW2 sW2 04165 "mulss %%xmm4, %%xmm0;\n\t" 04166 "mulss %%xmm5, %%xmm1;\n\t" 04167 "mulss %%xmm6, %%xmm2;\n\t" 04168 "addss %%xmm1, %%xmm0;\n\t" 04169 "addss %%xmm2, %%xmm0;\n\t" 04170 "movss %%xmm0, (%%edi);\n\t" 04171 "addl $4, %%esi;\n\t" 04172 "addl $4, %%edi;\n\t" 04173 "decl %%eax;\n\t" 04174 "jnz .IA19;\n\t" 04175 "popl %%eax;\n\t" 04176 ".IA20:;\n\t" 04177 04178 "popl %%ebp;\n\t" 04179 : 04180 :"m"(w),"S"(sptr),"D"(dptr),"a"(eax),"b"(coeffs),"c"(ecx),"d"(edx) 04181 ); 04182 04183 } 04184 } 04185 04186 04187 // ###################################################################### 04188 04189 void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest, 04190 const int nbpix2) 04191 { 04192 int ecx=nbpix2/6; 04193 04194 
const float coeffs[] = { 04195 0.0F, -0.198242F, 1.014648F, 0.0F, // R G B xx -> u 04196 0.700195F, -0.29052F, 0.0F, 0.0F, // R G B xx -> v 04197 128.0F, 128.0F, 128.0F, 128.0F // division factor 04198 }; 04199 04200 asm ( 04201 ".JA0:;\n\t" 04202 "orl %%ecx, %%ecx;\n\t" 04203 "jz .JA1;\n\t" 04204 "pxor %%mm7, %%mm7;\n\t" //mm7 <- 00 00 00 00 04205 "xorl %%eax, %%eax;\n\t" 04206 "xorl %%ebx, %%ebx;\n\t" 04207 "movl (%%esi), %%eax;\n\t" // eax <- v y1 y0 u 04208 "movw 4(%%esi), %%bx;\n\t" // ebx <- xx xx y3 y2 04209 "movd %%eax, %%mm0;\n\t" // mm0<- xx xx xx xx v y1 y0 u 04210 "movd %%eax, %%mm1;\n\t" // mm1<- xx xx xx xx v y1 y0 u 04211 "movd %%ebx, %%mm2;\n\t" // mm2<- xx xx xx xx xx xx y3 y2 04212 "psrlq $16, %%mm1;\n\t" // mm1<- xx xx xx xx xx xx v y1 04213 "punpcklbw %%mm7, %%mm0;\n\t" // mm0<- xx xx xx xx 0 y0 0 u 04214 "punpcklbw %%mm7, %%mm1;\n\t" // mm1<- xx xx xx xx 00 v 00 y1 04215 "punpcklbw %%mm7, %%mm2;\n\t" // mm2<- xx xx xx xx 00 y3 00 y2 04216 "punpcklwd %%mm7, %%mm0;\n\t" // mm0<- 00 00 00 y0 00 00 00 u 04217 "punpcklwd %%mm7, %%mm1;\n\t" // mm1<- 00 00 00 v 00 00 00 y1 04218 "punpcklwd %%mm7, %%mm2;\n\t" // mm2<- 00 00 00 y3 00 00 00 y2 04219 04220 "cvtpi2ps %%mm0, %%xmm0;\n\t" // xmm0 <- 00 00 y0 u 04221 "cvtpi2ps %%mm1, %%xmm1;\n\t" // xmm1 <- 00 00 v y1 04222 "cvtpi2ps %%mm2, %%xmm2;\n\t" // xmm2 <- 00 00 y3 y2 04223 04224 // 01 01 01 01 04225 "movaps %%xmm0, %%xmm3;\n\t" 04226 04227 // 00 00 00 00 04228 "movaps %%xmm1, %%xmm4;\n\t" 04229 04230 // 00 00 00 00 04231 "movaps %%xmm2, %%xmm5;\n\t" 04232 04233 // 01 01 01 01 04234 "movaps %%xmm2, %%xmm6;\n\t" 04235 04236 "shufps $0x55, %%xmm3, %%xmm3;\n\t"// xmm3 <- y0 y0 y0 y0 04237 "shufps $00, %%xmm4, %%xmm4;\n\t" // xmm4 <- y1 y1 y1 y1 04238 "shufps $0x00, %%xmm5, %%xmm5;\n\t"// xmm5 <- y2 y2 y2 y2 04239 "shufps $0x55, %%xmm6, %%xmm6;\n\t"// xmm6 <- y3 y3 y3 y3 04240 04241 // 00 00 00 00 04242 "shufps $0, %%xmm0, %%xmm0;\n\t" // xmm0 <- u u u u 04243 // 01 01 01 01 04244 "shufps $0x55, %%xmm1, 
%%xmm1;\n\t" // xmm1 <- v v v v 04245 04246 "subps 32(%%edx), %%xmm0;\n\t" 04247 "subps 32(%%edx), %%xmm1;\n\t" 04248 04249 "mulps (%%edx), %%xmm0;\n\t" 04250 "mulps 16(%%edx),%%xmm1;\n\t" 04251 04252 "addps %%xmm0, %%xmm3;\n\t" 04253 "addps %%xmm0, %%xmm4;\n\t" 04254 "addps %%xmm0, %%xmm5;\n\t" 04255 "addps %%xmm0, %%xmm6;\n\t" 04256 04257 "addps %%xmm1, %%xmm3;\n\t" // xmm3 <- xx b0 g0 r0 04258 "addps %%xmm1, %%xmm4;\n\t" // xmm4 <- xx b1 g1 r1 04259 "addps %%xmm1, %%xmm5;\n\t" // xmm5 <- xx b2 g2 r2 04260 "addps %%xmm1, %%xmm6;\n\t" // xmm6 <- xx b3 g3 r3 04261 04262 "cvtps2pi %%xmm3, %%mm0;\n\t" //mm0 <- g0 r0 04263 "movhlps %%xmm3, %%xmm3;\n\t" //xmm3 <- g0 r0 xx b0 04264 "cvtps2pi %%xmm3, %%mm1;\n\t" //mm1 <- xx b0 04265 "packssdw %%mm1, %%mm0;\n\t" //mm0<- xx b0 g0 r0 04266 04267 "cvtps2pi %%xmm4, %%mm2;\n\t" //mm2 <- g1 r1 04268 "movhlps %%xmm4, %%xmm4;\n\t" //xmm4 <- g1 r1 xx b1 04269 "cvtps2pi %%xmm4, %%mm3;\n\t" //mm3 <- xx b1 04270 "packssdw %%mm3, %%mm2;\n\t" //mm2<- xx b1 g1 r1 04271 04272 "cvtps2pi %%xmm5, %%mm4;\n\t" //mm4 <- g2 r2 04273 "movhlps %%xmm5, %%xmm5;\n\t" //xmm5 <- g2 r2 xx b2 04274 "cvtps2pi %%xmm5, %%mm5;\n\t" //mm5 <- xx b2 04275 "packssdw %%mm5, %%mm4;\n\t" //mm4<- xx b2 g2 r2 04276 04277 "cvtps2pi %%xmm6, %%mm6;\n\t" //mm6 <- g3 r3 04278 "movhlps %%xmm6, %%xmm6;\n\t" //xmm3 <- g3 r3 xx b3 04279 "cvtps2pi %%xmm6, %%mm7;\n\t" //mm7 <- xx b3 04280 "packssdw %%mm7, %%mm6;\n\t" //mm6<- xx b3 g3 r3 04281 04282 "pxor %%mm1, %%mm1;\n\t" 04283 "pcmpgtw %%mm0, %%mm1;\n\t" 04284 "pandn %%mm0, %%mm1;\n\t" 04285 04286 "pxor %%mm3, %%mm3;\n\t" 04287 "pcmpgtw %%mm2, %%mm3;\n\t" 04288 "pandn %%mm2, %%mm3;\n\t" 04289 04290 "pxor %%mm5, %%mm5;\n\t" 04291 "pcmpgtw %%mm4, %%mm5;\n\t" 04292 "pandn %%mm4, %%mm5;\n\t" 04293 04294 "pxor %%mm7, %%mm7;\n\t" 04295 "pcmpgtw %%mm6, %%mm7;\n\t" 04296 "pandn %%mm6, %%mm7;\n\t" 04297 04298 "packuswb %%mm1, %%mm1;\n\t" //mm0<- xx xx xx xx xx b0 g0 r0 04299 "packuswb %%mm3, %%mm3;\n\t" //mm2<- xx xx xx xx xx b1 g1 
r1 04300 "packuswb %%mm5, %%mm5;\n\t" //mm4<- xx xx xx xx xx b2 g2 r2 04301 "packuswb %%mm7, %%mm7;\n\t" //mm6<- xx xx xx xx xx b3 g3 r3 04302 04303 "pushl %%ecx;\n\t" 04304 "pushl %%edx;\n\t" 04305 "movd %%mm1, %%eax;\n\t" // eax <- xx b0 g0 r0 04306 "movd %%mm3, %%ebx;\n\t" // ebx <- xx b1 g1 r1 04307 "movd %%mm5, %%ecx;\n\t" // ecx <- xx b2 g2 r2 04308 "movd %%mm7, %%edx;\n\t" // edx <- xx b3 g3 r3 04309 "movw %%ax, (%%edi);\n\t" 04310 "movw %%bx,3(%%edi);\n\t" 04311 "movw %%cx,6(%%edi);\n\t" 04312 "movw %%dx,9(%%edi);\n\t" 04313 "shrl $8, %%eax;\n\t" 04314 "shrl $8, %%ebx;\n\t" 04315 "shrl $8, %%ecx;\n\t" 04316 "shrl $8, %%edx;\n\t" 04317 "movb %%ah, 2(%%edi);\n\t" 04318 "movb %%bh, 5(%%edi);\n\t" 04319 "movb %%ch, 8(%%edi);\n\t" 04320 "movb %%dh,11(%%edi);\n\t" 04321 "popl %%edx;\n\t" 04322 "popl %%ecx;\n\t" 04323 04324 "addl $12,%%edi;\n\t" 04325 "decl %%ecx;\n\t" 04326 "addl $6, %%esi;\n\t" 04327 "jmp .JA0;\n\t" 04328 ".JA1:;\n\t" 04329 "emms;\n\t" 04330 : 04331 :"S"(src),"D"(dest),"c"(ecx),"d"(coeffs) 04332 :"eax","ebx","memory" 04333 ); 04334 04335 } 04336 04337 04338 04339 04340 void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w) 04341 { 04342 04343 for (int j = 0; j < h; j ++) 04344 { 04345 // leftmost points 04346 *dptr++ = sptr[0] * (70.0F / 163.0F) + 04347 sptr[1] * (56.0F / 163.0F) + 04348 sptr[2] * (28.0F / 163.0F) + 04349 sptr[3] * ( 8.0F / 163.0F) + 04350 sptr[4] * ( 1.0F / 163.0F); 04351 *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) + 04352 sptr[1] * (70.0F / 219.0F) + 04353 sptr[3] * (28.0F / 219.0F) + 04354 sptr[4] * ( 8.0F / 219.0F) + 04355 sptr[5] * ( 1.0F / 219.0F); 04356 *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) + 04357 (sptr[1] + sptr[3]) * (56.0F / 247.0F) + 04358 sptr[2] * (70.0F / 247.0F) + 04359 sptr[5] * ( 8.0F / 247.0F) + 04360 sptr[6] * ( 1.0F / 247.0F); 04361 *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) + 04362 (sptr[1] + sptr[5]) * (28.0F / 255.0F) + 04363 (sptr[2] + sptr[4]) * (56.0F / 
255.0F) + 04364 sptr[3] * (70.0F / 255.0F) + 04365 sptr[7] * ( 1.0F / 255.0F); 04366 04367 // far from the borders 04368 for (int i = 0; i < w - 8; i ++) 04369 { 04370 *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) + 04371 (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) + 04372 (sptr[2] + sptr[6]) * (28.0F / 256.0F) + 04373 (sptr[3] + sptr[5]) * (56.0F / 256.0F) + 04374 sptr[4] * (70.0F / 256.0F); 04375 sptr ++; 04376 } 04377 04378 // rightmost points 04379 *dptr++ = sptr[0] * ( 1.0F / 255.0F) + 04380 (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) + 04381 (sptr[2] + sptr[6]) * (28.0F / 255.0F) + 04382 (sptr[3] + sptr[5]) * (56.0F / 255.0F) + 04383 sptr[4] * (70.0F / 255.0F); 04384 sptr ++; 04385 *dptr++ = sptr[0] * ( 1.0F / 247.0F) + 04386 sptr[1] * ( 8.0F / 247.0F) + 04387 (sptr[2] + sptr[6]) * (28.0F / 247.0F) + 04388 (sptr[3] + sptr[5]) * (56.0F / 247.0F) + 04389 sptr[4] * (70.0F / 247.0F); 04390 sptr ++; 04391 *dptr++ = sptr[0] * ( 1.0F / 219.0F) + 04392 sptr[1] * ( 8.0F / 219.0F) + 04393 sptr[2] * (28.0F / 219.0F) + 04394 (sptr[3] + sptr[5]) * (56.0F / 219.0F) + 04395 sptr[4] * (70.0F / 219.0F); 04396 sptr ++; 04397 *dptr++ = sptr[0] * ( 1.0F / 163.0F) + 04398 sptr[1] * ( 8.0F / 163.0F) + 04399 sptr[2] * (28.0F / 163.0F) + 04400 sptr[3] * (56.0F / 163.0F) + 04401 sptr[4] * (70.0F / 163.0F); 04402 sptr += 5; // sptr back to same as dptr (start of next line) 04403 } 04404 } 04405 #endif 04406 04407 //############################################################################ 04408 /* So things look consistent in everyone's emacs... */ 04409 /* Local Variables: */ 04410 /* indent-tabs-mode: nil */ 04411 /* End: */ 04412 04413 #endif 04414