/*!@file Util/mmx-sse-opteron.C -- Optimized implementations of low-level functions
  for MMX/SSE */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2003   //
// by the University of Southern California (USC) and the iLab at USC.  //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
// 00027 // // 00028 // You should have received a copy of the GNU General Public License // 00029 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00030 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00031 // Boston, MA 02111-1307 USA. // 00032 // //////////////////////////////////////////////////////////////////// // 00033 // 00034 // Primary maintainer for this file: Nitin Dhavale <dhavale@usc.edu> 00035 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/mmx-sse-opteron.C $ 00036 // $Id: mmx-sse-opteron.C 5736 2005-10-18 17:30:29Z rjpeters $ 00037 // 00038 00039 #include "Util/mmx-sse.H" 00040 00041 // specific types only to the code that is in this file 00042 typedef int int32; 00043 typedef unsigned char byte; 00044 typedef float float32; 00045 00046 00047 #ifdef INVT_USE_SSE 00048 00049 //###################################################################### 00050 void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz) 00051 { 00052 static int32 rcx= sz>>2; 00053 static int32 rdx= sz & 0x3; 00054 00055 asm ( 00056 "or %%rcx, %%rcx;\n\t" 00057 "jz .AG2;\n\t" 00058 ".AG1:;\n\t" 00059 "movupd 0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 00060 "movupd 0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 00061 "movupd 16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 00062 "movupd 16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 00063 "movupd %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 00064 "movupd %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 00065 "movupd %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 00066 "movupd %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 00067 "subpd %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 00068 "subpd %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4) 00069 "subpd %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. (b1-a1) (b0-a0) 00070 "subpd %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. 
(b5-a5) (b4-a4) 00071 "maxpd %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 00072 "maxpd %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 00073 "movupd %%xmm5, 0(%%rbx);\n\t" 00074 "movupd %%xmm7, 16(%%rbx);\n\t" 00075 "add $32, %%rsi;\n\t" 00076 "add $32, %%rdi;\n\t" 00077 "add $32, %%rbx;\n\t" 00078 "loop .AG1;\n\t" 00079 ".AG2:;\n\t" 00080 "mov %%rdx, %%rcx;\n\t" 00081 "or %%rcx, %%rcx;\n\t" 00082 "jz .AG4;\n\t" 00083 ".AG3:;\n\t" 00084 "movsd 0(%%rsi), %%xmm0;\n\t" 00085 "movsd 0(%%rdi), %%xmm1;\n\t" 00086 "movsd %%xmm0, %%xmm2;\n\t" 00087 "movsd %%xmm1, %%xmm3;\n\t" 00088 "subsd %%xmm3, %%xmm2;\n\t" 00089 "subsd %%xmm0, %%xmm1;\n\t" 00090 "maxsd %%xmm2, %%xmm1;\n\t" 00091 "movsd %%xmm1, 0(%%rbx);\n\t" 00092 "add $8, %%rsi;\n\t" 00093 "add $8, %%rdi;\n\t" 00094 "add $8, %%rbx;\n\t" 00095 "loop .AG3;\n\t" 00096 ".AG4:;\n\t" 00097 : 00098 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00099 :"memory" 00100 ); 00101 } 00102 #endif 00103 00104 #ifdef INVT_USE_SSE2 00105 //###################################################################### 00106 // speedup ~= 2.1 00107 void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz) 00108 { 00109 static int32 rcx= sz>>3; 00110 static int32 rdx= sz & 0x7; 00111 00112 asm ( 00113 "or %%rcx, %%rcx;\n\t" 00114 "jz .AE2;\n\t" 00115 ".AE1:;\n\t" 00116 "movups 0(%%rsi), %%xmm0;\n\t" // xmm0 <- a3 a2 a1 a0 00117 "movups 0(%%rdi), %%xmm1;\n\t" // xmm1 <- b3 b2 b1 b0 00118 "movups 16(%%rsi), %%xmm2;\n\t"// xmm2 <- a7 a6 a5 a4 00119 "movups 16(%%rdi), %%xmm3;\n\t"// xmm3 <- b7 b6 b5 b4 00120 "movups %%xmm0, %%xmm4;\n\t" // xmm4 <- a3 a2 a1 a0 00121 "movups %%xmm1, %%xmm5;\n\t" // xmm5 <- b3 b2 b1 b0 00122 "movups %%xmm2, %%xmm6;\n\t" // xmm6 <- a7 a6 a5 a4 00123 "movups %%xmm3, %%xmm7;\n\t" // xmm7 <- b7 b6 b5 b4 00124 "subps %%xmm1, %%xmm0;\n\t" // xmm0 <- (a3-b3) .. (a1-b1) (a0-b0) 00125 "subps %%xmm3, %%xmm2;\n\t" // xmm2 <- (a7-b7) .. (a5-b5) (a4-b4) 00126 "subps %%xmm4, %%xmm5;\n\t" // xmm5 <- (b3-a3) .. 
(b1-a1) (b0-a0) 00127 "subps %%xmm6, %%xmm7;\n\t" // xmm7 <- (b7-a7) .. (b5-a5) (b4-a4) 00128 "maxps %%xmm0, %%xmm5;\n\t" // xmm5 <- max(xmm0,xmm5) 00129 "maxps %%xmm2, %%xmm7;\n\t" // xmm7 <- max(xmm2,xmm7) 00130 "movups %%xmm5, 0(%%rbx);\n\t" 00131 "movups %%xmm7, 16(%%rbx);\n\t" 00132 "add $32, %%rsi;\n\t" 00133 "add $32, %%rdi;\n\t" 00134 "add $32, %%rbx;\n\t" 00135 "loop .AE1;\n\t" 00136 ".AE2:;\n\t" 00137 "mov %%rdx, %%rcx;\n\t" 00138 "or %%rcx, %%rcx;\n\t" 00139 "jz .AE4;\n\t" 00140 ".AE3:;\n\t" 00141 "movss 0(%%rsi), %%xmm0;\n\t" 00142 "movss 0(%%rdi), %%xmm1;\n\t" 00143 "movss %%xmm0, %%xmm2;\n\t" 00144 "movss %%xmm1, %%xmm3;\n\t" 00145 "subss %%xmm3, %%xmm2;\n\t" 00146 "subss %%xmm0, %%xmm1;\n\t" 00147 "maxss %%xmm2, %%xmm1;\n\t" 00148 "movss %%xmm1, 0(%%rbx);\n\t" 00149 "add $4, %%rsi;\n\t" 00150 "add $4, %%rdi;\n\t" 00151 "add $4, %%rbx;\n\t" 00152 "loop .AE3;\n\t" 00153 ".AE4:;\n\t" 00154 "emms;\n\t" 00155 : 00156 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00157 :"memory" 00158 ); 00159 } 00160 00161 00162 00163 //###################################################################### 00164 // speedup ~= 3.4 00165 void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz) 00166 { 00167 static int32 rcx= sz>>3; 00168 static int32 rdx= sz&0x7; 00169 00170 asm ( 00171 "or %%rcx, %%rcx;\n\t" 00172 "jz .AF2;\n\t" 00173 ".AF1:;\n\t" 00174 "movdqu 0(%%rsi), %%xmm0;\n\t" 00175 "movdqu 0(%%rdi), %%xmm1;\n\t" 00176 "movdqu 16(%%rsi), %%xmm2;\n\t" 00177 "movdqu 16(%%rdi), %%xmm3;\n\t" 00178 "movdqu %%xmm0, %%xmm4;\n\t" 00179 "movdqu %%xmm1, %%xmm5;\n\t" 00180 "movdqu %%xmm2, %%xmm6;\n\t" 00181 "movdqu %%xmm3, %%xmm7;\n\t" 00182 "psubusw %%xmm1, %%xmm0;\n\t" 00183 "psubusw %%xmm3, %%xmm2;\n\t" 00184 "psubusw %%xmm4, %%xmm5;\n\t" 00185 "psubusw %%xmm6, %%xmm7;\n\t" 00186 "pmaxsw %%xmm0, %%xmm5;\n\t" 00187 "pmaxsw %%xmm2, %%xmm7;\n\t" 00188 "movdqu %%xmm5, 0(%%rbx);\n\t" 00189 "movdqu %%xmm7, 16(%%rbx);\n\t" 00190 "add $32, %%rsi;\n\t" 00191 
"add $32, %%rdi;\n\t" 00192 "add $32, %%rbx;\n\t" 00193 "loop .AF1;\n\t" 00194 ".AF2:;\n\t" 00195 "mov %%rdx, %%rcx;\n\t" 00196 "or %%rcx, %%rcx;\n\t" 00197 "jz .AF4;\n\t" 00198 ".AF3:;\n\t" 00199 "mov (%%rsi), %%rax;\n\t" 00200 "mov (%%rdi), %%rdx;\n\t" 00201 "cmp %%rdx, %%rax;\n\t" 00202 "ja .AF5;\n\t" 00203 "xchg %%rax, %%rdx;\n\t" 00204 ".AF5:;\n\t" 00205 "sub %%rdx, %%rax;\n\t" 00206 "mov %%rax, (%%rbx);\n\t" 00207 "add $4, %%rsi;\n\t" 00208 "add $4, %%rdi;\n\t" 00209 "add $4, %%rbx;\n\t" 00210 "loop .AF3;\n\t" 00211 ".AF4:;\n\t" 00212 "emms;\n\t" 00213 : 00214 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00215 :"memory" 00216 ); 00217 } 00218 00219 00220 //###################################################################### 00221 // speedup ~=10.0! 00222 void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz) 00223 { 00224 static int32 rcx= sz>>5; 00225 static int32 rdx= sz&0x1f; 00226 00227 asm ( 00228 "or %%rcx, %%rcx;\n\t" 00229 "jz .AD2;\n\t" 00230 ".AD1:;\n\t" 00231 "movdqu 0(%%rsi), %%xmm0;\n\t" // xmm0<- a15 ... a3 a2 a1 a0 00232 "movdqu 0(%%rdi), %%xmm1;\n\t" // xmm1<- b15 ... b3 b2 b1 b0 00233 "movdqu 16(%%rsi), %%xmm2;\n\t"// xmm2<- a31 ... a18 a17 a16 00234 "movdqu 16(%%rdi), %%xmm3;\n\t"// xmm3<- b31 ... b18 b17 b16 00235 "movdqu %%xmm0, %%xmm4;\n\t" // xmm4<- a15 ... a3 a2 a1 a0 00236 "movdqu %%xmm1, %%xmm5;\n\t" // xmm5<- b15 ... b3 b2 b1 b0 00237 "movdqu %%xmm2, %%xmm6;\n\t" // xmm6<- a31 ... a18 a17 a16 00238 "movdqu %%xmm3, %%xmm7;\n\t" // xmm7<- b31 ... 
b18 b17 b16 00239 "psubusb %%xmm1, %%xmm0;\n\t" // xmm0<-(a15-b15)...( a1-b1 )(a0-b0) 00240 "psubusb %%xmm3, %%xmm2;\n\t" // xmm2<-(a31-b31)...(a17-b17)(a16-b16) 00241 "psubusb %%xmm4, %%xmm5;\n\t" // xmm5<-(b15-a15)...(b17-a17)(b16-a16) 00242 "psubusb %%xmm6, %%xmm7;\n\t" // xmm7<-(b31-a31)...(b17-a17)(b16-a16) 00243 "pmaxub %%xmm0, %%xmm5;\n\t" // xmm5<- max(xmm0,xmm5) 00244 "pmaxub %%xmm2, %%xmm7;\n\t" // xmm7<- max(xmm2,xmm7) 00245 "movdqu %%xmm5, 0(%%rbx);\n\t" 00246 "movdqu %%xmm7, 16(%%rbx);\n\t" 00247 "add $32, %%rsi;\n\t" 00248 "add $32, %%rdi;\n\t" 00249 "add $32, %%rbx;\n\t" 00250 "loop .AD1;\n\t" 00251 ".AD2:;\n\t" 00252 "mov %%rdx, %%rcx;\n\t" 00253 "or %%rcx, %%rcx;\n\t" 00254 "jz .AD4;\n\t" 00255 ".AD3:;\n\t" 00256 "movb (%%rsi), %%al;\n\t" 00257 "movb (%%rdi), %%dl;\n\t" 00258 "cmpb %%dl, %%al;\n\t" 00259 "ja .AD5;\n\t" 00260 "xchgb %%al, %%dl;\n\t" 00261 ".AD5:;\n\t" 00262 "subb %%dl, %%al;\n\t" 00263 "movb %%al, (%%rbx);\n\t" 00264 "inc %%rbx;\n\t" 00265 "inc %%rsi;\n\t" 00266 "inc %%rdi;\n\t" 00267 "loop .AD3;\n\t" 00268 ".AD4:;\n\t" 00269 "emms;\n\t" 00270 : 00271 :"S"(a),"D"(b),"b"(diff), "c"(rcx), "d"(rdx) 00272 :"memory" 00273 ); 00274 } 00275 #endif 00276 00277 #ifdef INVT_USE_SSE 00278 //###################################################################### 00279 // speedup ~= 2.0 00280 void sse_sum(const double *a, double *sum, const int32 sz) 00281 { 00282 static int32 rcx = sz>>3; 00283 static int32 rdx = sz&0x7; 00284 00285 asm ( 00286 "pxor %%xmm4, %%xmm4;\n\t" 00287 "pxor %%xmm5, %%xmm5;\n\t" 00288 "pxor %%xmm6, %%xmm6;\n\t" 00289 "pxor %%xmm7, %%xmm7;\n\t" 00290 "or %%rcx, %%rcx;\n\t" 00291 "jz BE1;\n\t" 00292 ".BE0:\n\t" 00293 "movupd 0(%%rsi), %%xmm0;\n\t" 00294 "movupd 16(%%rsi), %%xmm1;\n\t" 00295 "movupd 32(%%rsi), %%xmm2;\n\t" 00296 "movupd 48(%%rsi), %%xmm3;\n\t" 00297 "addpd %%xmm0, %%xmm4;\n\t" 00298 "addpd %%xmm1, %%xmm5;\n\t" 00299 "addpd %%xmm2, %%xmm6;\n\t" 00300 "addpd %%xmm3, %%xmm7;\n\t" 00301 "add $64, %%rsi;\n\t" 
00302 "loop .BE0;\n\t" 00303 "BE1:;\n\t" 00304 "mov %%rdx, %%rcx;\n\t" 00305 "pxor %%xmm0, %%xmm0;\n\t" 00306 "or %%rcx, %%rcx;\n\t" 00307 "jz BE2;\n\t" 00308 "BE3:;\n\t" 00309 "movupd 0(%%rsi), %%xmm1;\n\t" 00310 "addpd %%xmm1, %%xmm0;\n\t" 00311 "add $16, %%rsi;\n\t" 00312 "loop BE3;\n\t" 00313 "BE2:;\n\t" 00314 "addpd %%xmm4, %%xmm7;\n\t" 00315 "addpd %%xmm5, %%xmm7;\n\t" 00316 "addpd %%xmm6, %%xmm7;\n\t" 00317 "addpd %%xmm7, %%xmm0;\n\t" 00318 "movhpd %%xmm0, (%%rbx);\n\t" 00319 "addsd (%%rbx), %%xmm0;\n\t" 00320 "movsd %%xmm0, (%%rbx);\n\t" 00321 "emms;\n\t" 00322 : 00323 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00324 :"memory" 00325 ); 00326 } 00327 #endif 00328 00329 #ifdef INVT_USE_SSE2 00330 //###################################################################### 00331 //speedup ~= 4 00332 void sse2_sum(const float *a, double *sum, const int32 sz) 00333 { 00334 static int32 rcx = sz>>3; 00335 static int32 rdx = sz & 0x7; 00336 00337 asm ( 00338 "pxor %%xmm4, %%xmm4;\n\t" 00339 "pxor %%xmm5, %%xmm5;\n\t" 00340 "pxor %%xmm6, %%xmm6;\n\t" 00341 "pxor %%xmm7, %%xmm7;\n\t" 00342 "or %%rcx, %%rcx;\n\t" 00343 "jz BA1;\n\t" 00344 ".BA0:\n\t" 00345 "cvtps2pd 0(%%rsi), %%xmm0;\n\t" 00346 "cvtps2pd 8(%%rsi), %%xmm1;\n\t" 00347 "cvtps2pd 16(%%rsi), %%xmm2;\n\t" 00348 "cvtps2pd 24(%%rsi), %%xmm3;\n\t" 00349 "addpd %%xmm0, %%xmm4;\n\t" 00350 "addpd %%xmm1, %%xmm5;\n\t" 00351 "addpd %%xmm2, %%xmm6;\n\t" 00352 "addpd %%xmm3, %%xmm7;\n\t" 00353 "add $32, %%rsi;\n\t" 00354 "loop .BA0;\n\t" 00355 "BA1:;\n\t" 00356 "pxor %%xmm0, %%xmm0;\n\t" 00357 "mov %%rdx, %%rcx;\n\t" 00358 "or %%rcx, %%rcx;\n\t" 00359 "jz BA2;\n\t" 00360 "BA3:;\n\t" 00361 "cvtps2pd 0(%%rsi), %%xmm1;\n\t" 00362 "addpd %%xmm1, %%xmm0;\n\t" 00363 "add $8, %%rsi;\n\t" 00364 "loop BA3;\n\t" 00365 "BA2:;\n\t" 00366 "addpd %%xmm4, %%xmm7;\n\t" 00367 "addpd %%xmm5, %%xmm7;\n\t" 00368 "addpd %%xmm6, %%xmm7;\n\t" 00369 "addpd %%xmm7, %%xmm0;\n\t" 00370 "movhpd %%xmm0, (%%rbx);\n\t" 00371 "addsd (%%rbx), %%xmm0;\n\t" 
00372 "movsd %%xmm0, (%%rbx);\n\t" 00373 "emms;\n\t" 00374 : 00375 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00376 :"memory" 00377 ); 00378 } 00379 00380 00381 //###################################################################### 00382 // speedup ~= 4.0 00383 void sse2_sum(const int32 *a, double *sum, const int32 sz) 00384 { 00385 static int32 rcx = sz>>3; 00386 static int32 rdx = sz & 0x7; 00387 00388 asm ( 00389 "pxor %%xmm4, %%xmm4;\n\t" 00390 "pxor %%xmm5, %%xmm5;\n\t" 00391 "pxor %%xmm6, %%xmm6;\n\t" 00392 "pxor %%xmm7, %%xmm7;\n\t" 00393 "or %%rcx, %%rcx;\n\t" 00394 ".BC0:\n\t" 00395 "cvtdq2pd 0(%%rsi), %%xmm0;\n\t" 00396 "cvtdq2pd 8(%%rsi), %%xmm1;\n\t" 00397 "cvtdq2pd 16(%%rsi), %%xmm2;\n\t" 00398 "cvtdq2pd 24(%%rsi), %%xmm3;\n\t" 00399 "addpd %%xmm0, %%xmm4;\n\t" 00400 "addpd %%xmm1, %%xmm5;\n\t" 00401 "addpd %%xmm2, %%xmm6;\n\t" 00402 "addpd %%xmm3, %%xmm7;\n\t" 00403 "add $32, %%rsi;\n\t" 00404 "loop .BC0;\n\t" 00405 "BC1:;\n\t" 00406 "pxor %%xmm0, %%xmm0;\n\t" 00407 "mov %%rdx, %%rcx;\n\t" 00408 "or %%rcx, %%rcx;\n\t" 00409 "jz BC2;\n\t" 00410 "BC3:;\n\t" 00411 "cvtdq2pd 0(%%rsi), %%xmm1;\n\t" 00412 "addpd %%xmm1, %%xmm0;\n\t" 00413 "add $8, %%rsi;\n\t" 00414 "loop BC3;\n\t" 00415 "BC2:;\n\t" 00416 "addpd %%xmm4, %%xmm7;\n\t" 00417 "addpd %%xmm5, %%xmm7;\n\t" 00418 "addpd %%xmm6, %%xmm7;\n\t" 00419 "addpd %%xmm7, %%xmm0;\n\t" 00420 "movhpd %%xmm0, (%%rbx);\n\t" 00421 "addsd (%%rbx), %%xmm0;\n\t" 00422 "movsd %%xmm0, (%%rbx);\n\t" 00423 "emms;\n\t" 00424 : 00425 :"S"(a), "b"(sum), "c"(rcx), "d"(rdx) 00426 :"memory" 00427 ); 00428 } 00429 00430 00431 00432 //###################################################################### 00433 void sse2_sum(const byte *a, double *sum, const int32 sz) 00434 { 00435 static int rcx = sz>>5; 00436 static int rdx = sz & 0x1f; 00437 00438 asm ( 00439 "or %%rcx, %%rcx;\n\t" 00440 "jz BB1;\n\t" 00441 "pxor %%xmm7, %%xmm7;\n\t" 00442 "push %%rbx;\n\t" 00443 "push %%rdx;\n\t" 00444 "BB3:;\n\t" 00445 "pxor %%xmm5, %%xmm5;\n\t" 
00446 "pxor %%xmm6, %%xmm6;\n\t" 00447 "movdqu (%%rsi), %%xmm0;\n\t" 00448 "movdqu 16(%%rsi), %%xmm1;\n\t" 00449 "psadbw %%xmm0, %%xmm5;\n\t" 00450 "psadbw %%xmm1, %%xmm6;\n\t" 00451 "pextrw $0, %%xmm5, %%rax;\n\t" 00452 "cvtsi2sd %%rax, %%xmm0;\n\t" 00453 "pextrw $4, %%xmm5, %%rbx;\n\t" 00454 "cvtsi2sd %%rbx, %%xmm1;\n\t" 00455 "pextrw $0, %%xmm6, %%rdx;\n\t" 00456 "cvtsi2sd %%rdx, %%xmm2;\n\t" 00457 "pextrw $4, %%xmm6, %%rdi;\n\t" 00458 "cvtsi2sd %%rdi, %%xmm3;\n\t" 00459 "addsd %%xmm0, %%xmm1;\n\t" 00460 "addsd %%xmm2, %%xmm3;\n\t" 00461 "addsd %%xmm1, %%xmm7;\n\t" 00462 "addsd %%xmm3, %%xmm7;\n\t" 00463 "add $32, %%rsi;\n\t" 00464 "loop BB3;\n\t" 00465 "pop %%rdx;\n\t" 00466 "pop %%rbx;\n\t" 00467 "BB1:;\n\t" 00468 "xor %%rdi, %%rdi;\n\t" 00469 "mov %%rdx, %%rcx;\n\t" 00470 "or %%rcx, %%rcx;\n\t" 00471 "jz BB2;\n\t" 00472 "BB5:;\n\t" 00473 "xor %%rax, %%rax;\n\t" 00474 "movb (%%rsi), %%al;\n\t" 00475 "add %%rax, %%rdi;\n\t" 00476 "inc %%rsi;\n\t" 00477 "loop BB5;\n\t" 00478 "BB2:\n\t" 00479 "cvtsi2sd %%rdi, %%xmm0;\n\t" 00480 "addsd %%xmm0, %%xmm7;\n\t" 00481 "movhpd %%xmm7, (%%rbx);\n\t" 00482 "addsd (%%rbx), %%xmm7;\n\t" 00483 "movsd %%xmm7, (%%rbx);\n\t" 00484 "BB6:;\n\t" 00485 "emms;\n\t" 00486 : 00487 :"S"(a), "c"(rcx),"b"(sum),"d"(rdx) 00488 :"memory","rax","rdi" 00489 ); 00490 } 00491 #endif 00492 00493 #ifdef INVT_USE_SSE 00494 //###################################################################### 00495 // speedup ~= 10 ! 
00496 void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz) 00497 { 00498 int rcx = sz >> 6; 00499 int rdx = sz & 0x7f; 00500 00501 asm ( 00502 "or %%rcx, %%rcx;\n\t" 00503 "jz .DA0;\n\t" 00504 ".DA1:;\n\t" 00505 "movdqu (%%rsi), %%xmm0;\n\t" 00506 "movdqu (%%rdi), %%xmm4;\n\t" 00507 "movdqu 16(%%rsi), %%xmm1;\n\t" 00508 "movdqu 16(%%rdi), %%xmm5;\n\t" 00509 "movdqu 32(%%rsi), %%xmm2;\n\t" 00510 "movdqu 32(%%rdi), %%xmm6;\n\t" 00511 "movdqu 48(%%rsi), %%xmm3;\n\t" 00512 "movdqu 48(%%rdi), %%xmm7;\n\t" 00513 "psubusb %%xmm4, %%xmm0;\n\t" 00514 "psubusb %%xmm5, %%xmm1;\n\t" 00515 "psubusb %%xmm6, %%xmm2;\n\t" 00516 "psubusb %%xmm7, %%xmm3;\n\t" 00517 "movdqu %%xmm0, 0(%%rbx);\n\t" 00518 "movdqu %%xmm1, 16(%%rbx);\n\t" 00519 "movdqu %%xmm2, 32(%%rbx);\n\t" 00520 "movdqu %%xmm3, 48(%%rbx);\n\t" 00521 "add $64, %%rsi;\n\t" 00522 "add $64, %%rdi;\n\t" 00523 "add $64, %%rbx;\n\t" 00524 "loop .DA1;\n\t" 00525 ".DA0:;\n\t" 00526 "mov %%rdx, %%rcx;\n\t" 00527 "or %%rcx, %%rcx;\n\t" 00528 "jz .DA2;\n\t" 00529 ".DA3:;\n\t" 00530 "movb (%%rsi), %%al;\n\t" 00531 "movb (%%rdi), %%dl;\n\t" 00532 "cmpb %%bl, %%al;\n\t" 00533 "ja .DA4;\n\t" 00534 "xchg %%al, %%bl;\n\t" 00535 ".DA4:;\n\t" 00536 "subb %%bl, %%al;\n\t" 00537 "movb %%al, (%%rbx);\n\t" 00538 "inc %%rsi;\n\t" 00539 "inc %%rdi;\n\t" 00540 "inc %%rbx;\n\t" 00541 "loop .DA3;\n\t" 00542 ".DA2:;\n\t" 00543 "emms;\n\t" 00544 : 00545 :"S"(a),"D"(b),"c"(rcx),"d"(rdx),"b"(result) 00546 ); 00547 } 00548 00549 00550 //###################################################################### 00551 // speedup ~= 20 ! 
00552 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result, 00553 const int32 sz) 00554 { 00555 int32 rcx=sz>>5; 00556 int32 rdx=sz&0x1f; 00557 00558 asm ( 00559 "or %%rcx, %%rcx;\n\t" 00560 "jz .DB0;\n\t" 00561 ".DB1:;\n\t" 00562 "movups 0(%%rsi), %%xmm0;\n\t" 00563 "movups 0(%%rdi), %%xmm1;\n\t" 00564 "movups 16(%%rsi), %%xmm2;\n\t" 00565 "movups 16(%%rdi), %%xmm3;\n\t" 00566 "movups %%xmm1, %%xmm6;\n\t" 00567 "movups %%xmm3, %%xmm7;\n\t" 00568 "cmpps $1, %%xmm0, %%xmm6;\n\t" 00569 "cmpps $1, %%xmm2, %%xmm7;\n\t" 00570 "subps %%xmm1, %%xmm0;\n\t" 00571 "subps %%xmm3, %%xmm2;\n\t" 00572 "andps %%xmm6, %%xmm0;\n\t" 00573 "andps %%xmm7, %%xmm2;\n\t" 00574 "movups %%xmm0, (%%rbx);\n\t" 00575 "movups %%xmm2, 16(%%rbx);\n\t" 00576 "add $32, %%rsi;\n\t" 00577 "add $32, %%rdi;\n\t" 00578 "add $32, %%rbx;\n\t" 00579 "loop .DB1;\n\t" 00580 ".DB0:;\n\t" 00581 "mov %%rdx, %%rcx;\n\t" 00582 "or %%rcx, %%rcx;\n\t" 00583 "jz .DB2;\n\t" 00584 ".DB3:;\n\t" 00585 "movss (%%rsi), %%xmm0;\n\t" 00586 "movss (%%rdi), %%xmm1;\n\t" 00587 "movss %%xmm1, %%xmm2;\n\t" 00588 "cmpss $1, %%xmm0, %%xmm2;\n\t" 00589 "andps %%xmm2, %%xmm0;\n\t" 00590 "andps %%xmm2, %%xmm1;\n\t" 00591 "subss %%xmm1, %%xmm0;\n\t" 00592 "movss %%xmm0, (%%rbx);\n\t" 00593 "add $4, %%rsi;\n\t" 00594 "add $4, %%rdi;\n\t" 00595 "add $4, %%rbx;\n\t" 00596 "loop .DB3;\n\t" 00597 ".DB2:;\n\t" 00598 : 00599 :"S"(a), "D"(b), "b"(result), "c"(rcx), "d"(rdx) 00600 :"memory" 00601 ); 00602 } 00603 00604 00605 //###################################################################### 00606 // speedup ~= 3 00607 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz) 00608 { 00609 int32 rcx=sz>>3; 00610 int32 rdx=sz&0x7; 00611 asm ( 00612 "or %%rcx, %%rcx;\n\t" 00613 "jz .DC0;\n\t" 00614 ".DC1:;\n\t" 00615 "movdqu 0(%%rsi), %%xmm0;\n\t" //xmm0= a3 a2 a1 a0 00616 "movdqu 0(%%rdi), %%xmm1;\n\t" //xmm1= b3 b2 b1 b0 00617 "movdqu 16(%%rsi), %%xmm3;\n\t"//xmm3= a7 a6 a5 a4 00618 "movdqu 
16(%%rdi), %%xmm4;\n\t"//xmm4= b7 b6 b5 b4 00619 "movdqu %%xmm0, %%xmm2;\n\t" //xmm2= a3 a2 a1 a0 00620 "movdqu %%xmm3, %%xmm5;\n\t" //xmm5= a7 a6 a5 a4 00621 "pcmpgtd %%xmm1, %%xmm2;\n\t" //xmm2=(a3>b3)(a2>b2)(a1>b1)(a0>b0) 00622 "pcmpgtd %%xmm4, %%xmm5;\n\t" //xmm5=(a7>b7)(a6>b6)(b5>a5)(a4>b4) 00623 "psubd %%xmm1, %%xmm0;\n\t" //xmm0=(a3-b3)(a2-b2)(a1-b1)(a0-b0) 00624 "psubd %%xmm4, %%xmm3;\n\t" //xmm3=(a7-b7)(a6-b6)(a5-b5)(a4-b4) 00625 "pand %%xmm2, %%xmm0;\n\t" 00626 "pand %%xmm5, %%xmm3;\n\t" 00627 "movdqu %%xmm0, (%%rbx);\n\t" 00628 "movdqu %%xmm3, 16(%%rbx);\n\t" 00629 "add $32, %%rsi;\n\t" 00630 "add $32, %%rdi;\n\t" 00631 "add $32, %%rbx;\n\t" 00632 "loop .DC1;\n\t" 00633 ".DC0:;\n\t" 00634 "mov %%rdx, %%rcx;\n\t" 00635 "or %%rcx, %%rcx;\n\t" 00636 "jz .DC2;\n\t" 00637 ".DC3:;\n\t" 00638 "movsd 0(%%rsi), %%xmm0;\n\t" 00639 "movsd 0(%%rdi), %%xmm1;\n\t" 00640 "movdqu %%xmm0, %%xmm2;\n\t" 00641 "pcmpgtd %%xmm1, %%xmm2;\n\t" 00642 "psubd %%xmm1, %%xmm0;\n\t" 00643 "pand %%xmm2, %%xmm0;\n\t" 00644 "movsd %%xmm0, (%%rbx);\n\t" 00645 "add $4, %%rsi;\n\t" 00646 "add $4, %%rdi;\n\t" 00647 "add $4, %%rbx;\n\t" 00648 "loop .DC3;\n\t" 00649 ".DC2:;\n\t" 00650 : 00651 :"S"(a), "D"(b), "c"(rcx), "d"(rdx), "b"(c) 00652 :"memory" 00653 ); 00654 } 00655 00656 00657 //###################################################################### 00658 // speedup ~= 4-5 00659 void sse_binaryReverse(const byte *a, byte *result, const byte val, const 00660 int32 sz) 00661 { 00662 static unsigned int rcx=(sz>>7); 00663 static unsigned int rdx=sz&0x7f; 00664 00665 byte pVal[16]; 00666 00667 memset(result, val, 16); 00668 00669 asm ( 00670 "or %%rcx, %%rcx;\n\t" 00671 "jz .FA0;\n\t" 00672 ".FA1:;\n\t" 00673 "movdqu 0(%%rbx), %%xmm0;\n\t" 00674 "movdqu 0(%%rbx), %%xmm1;\n\t" 00675 "movdqu %%xmm0, %%xmm2;\n\t" 00676 "movdqu %%xmm1, %%xmm3;\n\t" 00677 "movdqu %%xmm0, %%xmm4;\n\t" 00678 "movdqu %%xmm1, %%xmm5;\n\t" 00679 "movdqu %%xmm0, %%xmm6;\n\t" 00680 "movdqu %%xmm1, %%xmm7;\n\t" 00681 
"psubb (%%rsi), %%xmm0;\n\t" 00682 "psubb 16(%%rsi), %%xmm1;\n\t" 00683 "psubb 32(%%rsi), %%xmm2;\n\t" 00684 "psubb 48(%%rsi), %%xmm3;\n\t" 00685 "psubb 64(%%rsi), %%xmm4;\n\t" 00686 "psubb 80(%%rsi), %%xmm5;\n\t" 00687 "psubb 96(%%rsi), %%xmm6;\n\t" 00688 "psubb 112(%%rsi), %%xmm7;\n\t" 00689 "movdqu %%xmm0, (%%rdi);\n\t" 00690 "movdqu %%xmm1, 16(%%rdi);\n\t" 00691 "movdqu %%xmm2, 32(%%rdi);\n\t" 00692 "movdqu %%xmm3, 48(%%rdi);\n\t" 00693 "movdqu %%xmm4, 64(%%rdi);\n\t" 00694 "movdqu %%xmm5, 80(%%rdi);\n\t" 00695 "movdqu %%xmm6, 96(%%rdi);\n\t" 00696 "movdqu %%xmm7, 112(%%rdi);\n\t" 00697 "add $128, %%rdi;\n\t" 00698 "add $128, %%rsi;\n\t" 00699 "loop .FA1;\n\t" 00700 ".FA0:;\n\t" 00701 "mov %%rdx, %%rcx;\n\t" 00702 "or %%rcx, %%rcx;\n\t" 00703 "jz .FA2;\n\t" 00704 "movb (%%rbx), %%dl;\n\t" 00705 ".FA3:;\n\t" 00706 "movb %%dl, %%dh;\n\t" 00707 "movb (%%rsi), %%al;\n\t" 00708 "subb %%al, %%dh;\n\t" 00709 "movb %%dh, (%%rdi);\n\t" 00710 "inc %%rsi;\n\t" 00711 "inc %%rdi;\n\t" 00712 "loop .FA3;\n\t" 00713 ".FA2:;\n\t" 00714 : 00715 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00716 :"memory","rax" 00717 ); 00718 } 00719 00720 00721 //###################################################################### 00722 // speedup ~= 2 00723 void sse_binaryReverse(const float *a, float *result, const float val, 00724 const int sz) 00725 { 00726 static unsigned int rcx = sz>>5; 00727 static unsigned int rdx = sz&0x1f; 00728 int i; 00729 float pVal[16]; 00730 00731 for(i=0;i<16;++i) 00732 pVal[i] = val; 00733 00734 00735 asm ( 00736 "or %%rcx, %%rcx;\n\t" 00737 "jz .FB4;\n\t" 00738 ".FB2:;\n\t" 00739 "movups (%%rbx), %%xmm0;\n\t" 00740 "movups (%%rbx), %%xmm1;\n\t" 00741 "movups %%xmm0, %%xmm2;\n\t" 00742 "movups %%xmm1, %%xmm3;\n\t" 00743 "movups %%xmm0, %%xmm4;\n\t" 00744 "movups %%xmm1, %%xmm5;\n\t" 00745 "movups %%xmm0, %%xmm6;\n\t" 00746 "movups %%xmm1, %%xmm7;\n\t" 00747 "psubq (%%rsi), %%xmm0;\n\t" 00748 "psubq 16(%%rsi), %%xmm1;\n\t" 00749 "psubq 32(%%rsi), 
%%xmm2;\n\t" 00750 "psubq 48(%%rsi), %%xmm3;\n\t" 00751 "psubq 64(%%rsi), %%xmm4;\n\t" 00752 "psubq 80(%%rsi), %%xmm5;\n\t" 00753 "psubq 96(%%rsi), %%xmm6;\n\t" 00754 "psubq 112(%%rsi), %%xmm7;\n\t" 00755 "movups %%xmm0, 0(%%rdi);\n\t" 00756 "movups %%xmm1, 16(%%rdi);\n\t" 00757 "movups %%xmm2, 32(%%rdi);\n\t" 00758 "movups %%xmm3, 48(%%rdi);\n\t" 00759 "movups %%xmm4, 64(%%rdi);\n\t" 00760 "movups %%xmm5, 80(%%rdi);\n\t" 00761 "movups %%xmm6, 96(%%rdi);\n\t" 00762 "movups %%xmm7,112(%%rdi);\n\t" 00763 "add $128, %%rsi;\n\t" 00764 "add $128, %%rdi;\n\t" 00765 "loop .FB2;\n\t" 00766 ".FB4:\n\t" 00767 "or %%rdx, %%rdx;\n\t" 00768 "jz .FB1;\n\t" 00769 "mov %%rdx, %%rcx;\n\t" 00770 ".FB3:;\n\t" 00771 "movss 0(%%rbx), %%xmm0;\n\t" 00772 "subss (%%rsi), %%xmm0;\n\t" 00773 "movups %%xmm0, (%%rdi);\n\t" 00774 "add $16, %%rsi;\n\t" 00775 "add $16, %%rdi;\n\t" 00776 "loop .FB3;\n\t" 00777 ".FB1:;\n\t" 00778 : 00779 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00780 :"memory","rax" 00781 ); 00782 } 00783 00784 00785 00786 //###################################################################### 00787 00788 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val, 00789 const int32 sz) 00790 { 00791 int32 rcx=sz>>5; 00792 int32 rdx=sz&31; 00793 int32 pVal[16]; 00794 int i; 00795 00796 for(i=0;i<16;++i) pVal[i] = val; 00797 00798 asm ( 00799 "or %%rcx, %%rcx;\n\t" 00800 "jz .FC4;\n\t" 00801 ".FC2:;\n\t" 00802 "movdqu (%%rbx), %%xmm0;\n\t" 00803 "movdqu (%%rbx), %%xmm1;\n\t" 00804 "movdqu %%xmm0, %%xmm2;\n\t" 00805 "movdqu %%xmm1, %%xmm3;\n\t" 00806 "movdqu %%xmm0, %%xmm4;\n\t" 00807 "movdqu %%xmm1, %%xmm5;\n\t" 00808 "movdqu %%xmm0, %%xmm6;\n\t" 00809 "movdqu %%xmm1, %%xmm7;\n\t" 00810 "psubd (%%rsi), %%xmm0;\n\t" 00811 "psubd 16(%%rsi), %%xmm1;\n\t" 00812 "psubd 32(%%rsi), %%xmm2;\n\t" 00813 "psubd 48(%%rsi), %%xmm3;\n\t" 00814 "psubd 64(%%rsi), %%xmm4;\n\t" 00815 "psubd 80(%%rsi), %%xmm5;\n\t" 00816 "psubd 96(%%rsi), %%xmm6;\n\t" 00817 "psubd 112(%%rsi), 
%%xmm7;\n\t" 00818 "movdqu %%xmm0, 0(%%rdi);\n\t" 00819 "movdqu %%xmm1, 16(%%rdi);\n\t" 00820 "movdqu %%xmm2, 32(%%rdi);\n\t" 00821 "movdqu %%xmm3, 48(%%rdi);\n\t" 00822 "movdqu %%xmm4, 64(%%rdi);\n\t" 00823 "movdqu %%xmm5, 80(%%rdi);\n\t" 00824 "movdqu %%xmm6, 96(%%rdi);\n\t" 00825 "movdqu %%xmm7,112(%%rdi);\n\t" 00826 "add $128, %%rsi;\n\t" 00827 "add $128, %%rdi;\n\t" 00828 "loop .FC2;\n\t" 00829 ".FC4:;\n\t" 00830 "or %%rdx, %%rdx;\n\t" 00831 "jz .FC1;\n\t" 00832 "mov %%rdx, %%rcx;\n\t" 00833 ".FC3:;\n\t" 00834 "movdqu 0(%%rbx), %%xmm0;\n\t" 00835 "psubd (%%rsi), %%xmm0;\n\t" 00836 "movups %%xmm0, (%%rdi);\n\t" 00837 "add $16, %%rsi;\n\t" 00838 "add $16, %%rdi;\n\t" 00839 "loop .FC3;\n\t" 00840 ".FC1:;\n\t" 00841 : 00842 :"S"(a), "D"(result), "b"(pVal),"c"(rcx),"d"(rdx) 00843 :"memory","rax" 00844 ); 00845 } 00846 00847 00848 00849 //###################################################################### 00850 00851 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz) 00852 { 00853 int32 rcx=sz>>4; 00854 int32 rdx=sz&0xf; 00855 00856 asm( 00857 "or %%rcx, %%rcx;\n\t" 00858 "jz .GA4;\n\t" 00859 "pxor %%xmm0, %%xmm0;\n\t" 00860 ".GA2:;\n\t" 00861 "movdqu 0(%%rsi), %%xmm1;\n\t" 00862 "movdqa %%xmm1, %%xmm2;\n\t" 00863 "movdqa %%xmm1, %%xmm3;\n\t" 00864 "movdqa %%xmm1, %%xmm4;\n\t" 00865 "psrldq $4, %%xmm2;\n\t" 00866 "psrldq $8, %%xmm3;\n\t" 00867 "psrldq $12, %%xmm4;\n\t" 00868 "punpcklbw %%xmm0, %%xmm1;\n\t" 00869 "punpcklbw %%xmm0, %%xmm2;\n\t" 00870 "punpcklbw %%xmm0, %%xmm3;\n\t" 00871 "punpcklbw %%xmm0, %%xmm4;\n\t" 00872 "punpcklbw %%xmm0, %%xmm1;\n\t" 00873 "punpcklbw %%xmm0, %%xmm2;\n\t" 00874 "punpcklbw %%xmm0, %%xmm3;\n\t" 00875 "punpcklbw %%xmm0, %%xmm4;\n\t" 00876 "movdqu %%xmm1, (%%rdi);\n\t" 00877 "movdqu %%xmm2, 16(%%rdi);\n\t" 00878 "movdqu %%xmm3, 32(%%rdi);\n\t" 00879 "movdqu %%xmm4, 48(%%rdi);\n\t" 00880 "add $16, %%rsi;\n\t" 00881 "add $64, %%rdi;\n\t" 00882 "loop .GA2;\n\t" 00883 ".GA4:;\n\t" 00884 "or %%rdx, %%rdx;\n\t" 00885 "jz 
.GA1;\n\t" 00886 "mov %%rdx, %%rcx;\n\t" 00887 ".GA3:;\n\t" 00888 "xor %%rax, %%rax;\n\t" 00889 "movb (%%rsi), %%al;\n\t" 00890 "mov %%rax, (%%rdi);\n\t" 00891 "inc %%rsi;\n\t" 00892 "add $4, %%rdi;\n\t" 00893 "loop .GA3;\n\t" 00894 ".GA1:;" 00895 : 00896 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 00897 :"memory" 00898 ); 00899 00900 00901 } 00902 00903 #endif 00904 00905 #ifdef INVT_USE_SSE2 00906 00907 //###################################################################### 00908 // speedup ~= 1.5 00909 void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz) 00910 { 00911 int32 rcx=sz>>4; 00912 int32 rdx=sz&0xf; 00913 00914 asm( 00915 "or %%rcx, %%rcx;\n\t" 00916 "jz .GB4;\n\t" 00917 ".GB2:;\n\t" 00918 "pxor %%xmm0, %%xmm0;\n\t" 00919 "movdqu 0(%%rsi), %%xmm1;\n\t" 00920 "movdqu 4(%%rsi), %%xmm2;\n\t" 00921 "movdqu 8(%%rsi), %%xmm3;\n\t" 00922 "movdqu 12(%%rsi), %%xmm4;\n\t" 00923 "punpcklbw %%xmm0, %%xmm1;\n\t" 00924 "punpcklbw %%xmm0, %%xmm2;\n\t" 00925 "punpcklbw %%xmm0, %%xmm3;\n\t" 00926 "punpcklbw %%xmm0, %%xmm4;\n\t" 00927 "punpcklbw %%xmm0, %%xmm1;\n\t" 00928 "punpcklbw %%xmm0, %%xmm2;\n\t" 00929 "punpcklbw %%xmm0, %%xmm3;\n\t" 00930 "punpcklbw %%xmm0, %%xmm4;\n\t" 00931 "cvtdq2ps %%xmm1, %%xmm1;\n\t" 00932 "cvtdq2ps %%xmm2, %%xmm2;\n\t" 00933 "movups %%xmm1, (%%rdi);\n\t" 00934 "movups %%xmm2, 16(%%rdi);\n\t" 00935 "cvtdq2ps %%xmm3, %%xmm3;\n\t" 00936 "cvtdq2ps %%xmm4, %%xmm4;\n\t" 00937 "movups %%xmm3, 32(%%rdi);\n\t" 00938 "movups %%xmm4, 48(%%rdi);\n\t" 00939 "add $16, %%rsi;\n\t" 00940 "add $64, %%rdi;\n\t" 00941 "loop .GB2;\n\t" 00942 ".GB4:;\n\t" 00943 "or %%rdx, %%rdx;\n\t" 00944 "jz .GB1;\n\t" 00945 "mov %%rdx, %%rcx;\n\t" 00946 ".GB3:;\n\t" 00947 "xor %%rax, %%rax;\n\t" 00948 "movb (%%rsi), %%al;\n\t" 00949 "movd %%rax, %%xmm0;\n\t" 00950 "cvtdq2ps %%xmm0, %%xmm1;\n\t" 00951 "movss %%xmm1, (%%rdi);\n\t" 00952 "inc %%rsi;\n\t" 00953 "add $4, %%rdi;\n\t" 00954 "loop .GB3;\n\t" 00955 ".GB1:;" 00956 : 00957 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 
00958 :"memory" 00959 ); 00960 } 00961 00962 00963 00964 //###################################################################### 00965 // speedup ~= 1.15 00966 void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz) 00967 { 00968 int32 rcx=sz>>3; 00969 int32 rdx=sz&0x7; 00970 00971 asm( 00972 "or %%rcx, %%rcx;\n\t" 00973 "jz .GC4;\n\t" 00974 ".GC2:;\n\t" 00975 "pxor %%xmm0, %%xmm0;\n\t" 00976 "movdqu 0(%%rsi), %%xmm1;\n\t" 00977 "movdqu 2(%%rsi), %%xmm2;\n\t" 00978 "movdqu 4(%%rsi), %%xmm3;\n\t" 00979 "movdqu 6(%%rsi), %%xmm4;\n\t" 00980 "punpcklbw %%xmm0, %%xmm1;\n\t" 00981 "punpcklbw %%xmm0, %%xmm2;\n\t" 00982 "punpcklbw %%xmm0, %%xmm3;\n\t" 00983 "punpcklbw %%xmm0, %%xmm4;\n\t" 00984 "punpcklbw %%xmm0, %%xmm1;\n\t" 00985 "punpcklbw %%xmm0, %%xmm2;\n\t" 00986 "punpcklbw %%xmm0, %%xmm3;\n\t" 00987 "punpcklbw %%xmm0, %%xmm4;\n\t" 00988 "cvtdq2pd %%xmm1, %%xmm1;\n\t" 00989 "cvtdq2pd %%xmm2, %%xmm2;\n\t" 00990 "movupd %%xmm1, (%%rdi);\n\t" 00991 "movupd %%xmm2, 16(%%rdi);\n\t" 00992 "cvtdq2pd %%xmm3, %%xmm3;\n\t" 00993 "cvtdq2pd %%xmm4, %%xmm4;\n\t" 00994 "movupd %%xmm3, 32(%%rdi);\n\t" 00995 "movupd %%xmm4, 48(%%rdi);\n\t" 00996 "add $8, %%rsi;\n\t" 00997 "add $64, %%rdi;\n\t" 00998 "loop .GC2;\n\t" 00999 ".GC4:;\n\t" 01000 "or %%rdx, %%rdx;\n\t" 01001 "jz .GC1;\n\t" 01002 "mov %%rdx, %%rcx;\n\t" 01003 ".GC3:;\n\t" 01004 "xor %%rax, %%rax;\n\t" 01005 "movb (%%rsi), %%al;\n\t" 01006 "movd %%rax, %%xmm0;\n\t" 01007 "cvtdq2pd %%xmm0, %%xmm1;\n\t" 01008 "movsd %%xmm1, (%%rdi);\n\t" 01009 "inc %%rsi;\n\t" 01010 "add $8, %%rdi;\n\t" 01011 "loop .GC3;\n\t" 01012 ".GC1:;" 01013 : 01014 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 01015 :"memory" 01016 ); 01017 01018 } 01019 01020 01021 01022 //###################################################################### 01023 01024 void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz) 01025 { 01026 int32 rcx=sz>>5; 01027 int32 rdx=sz&0x1f; 01028 01029 asm( 01030 "or %%rcx, %%rcx;\n\t" 01031 "jz .GD4;\n\t" 01032 
".GD2:;\n\t" 01033 "movdqu 0(%%rsi), %%xmm0;\n\t" 01034 "movdqu 16(%%rsi), %%xmm1;\n\t" 01035 "movdqu 32(%%rsi), %%xmm2;\n\t" 01036 "movdqu 48(%%rsi), %%xmm3;\n\t" 01037 "movdqu 64(%%rsi), %%xmm4;\n\t" 01038 "movdqu 80(%%rsi), %%xmm5;\n\t" 01039 "movdqu 96(%%rsi), %%xmm6;\n\t" 01040 "movdqu 112(%%rsi), %%xmm7;\n\t" 01041 "cvtdq2ps %%xmm0, %%xmm0;\n\t" 01042 "cvtdq2ps %%xmm1, %%xmm1;\n\t" 01043 "cvtdq2ps %%xmm2, %%xmm2;\n\t" 01044 "cvtdq2ps %%xmm3, %%xmm3;\n\t" 01045 "cvtdq2ps %%xmm4, %%xmm4;\n\t" 01046 "cvtdq2ps %%xmm5, %%xmm5;\n\t" 01047 "cvtdq2ps %%xmm6, %%xmm6;\n\t" 01048 "cvtdq2ps %%xmm7, %%xmm7;\n\t" 01049 "movups %%xmm0, 0(%%rdi);\n\t" 01050 "movups %%xmm1, 16(%%rdi);\n\t" 01051 "movups %%xmm2, 32(%%rdi);\n\t" 01052 "movups %%xmm3, 48(%%rdi);\n\t" 01053 "movups %%xmm4, 64(%%rdi);\n\t" 01054 "movups %%xmm5, 80(%%rdi);\n\t" 01055 "movups %%xmm6, 96(%%rdi);\n\t" 01056 "movups %%xmm7, 112(%%rdi);\n\t" 01057 "add $128, %%rsi;\n\t" 01058 "add $128, %%rdi;\n\t" 01059 "dec %%rcx;\n\t" 01060 "jnz .GD2;\n\t" 01061 ".GD4:;\n\t" 01062 "or %%rdx, %%rdx;\n\t" 01063 "jz .GD1;\n\t" 01064 "mov %%rdx, %%rcx;\n\t" 01065 ".GD3:;\n\t" 01066 "movsd (%%rsi), %%xmm0;\n\t" 01067 "cvtdq2ps %%xmm0, %%xmm0;\n\t" 01068 "movss %%xmm0, (%%rdi);\n\t" 01069 "add $4, %%rsi;\n\t" 01070 "add $4, %%rdi;\n\t" 01071 "loop .GD3;\n\t" 01072 ".GD1:;" 01073 : 01074 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 01075 :"memory" 01076 ); 01077 01078 } 01079 01080 //###################################################################### 01081 // speedup ~= 1.2 01082 void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz) 01083 { 01084 int32 rcx=sz>>4; 01085 int32 rdx=sz&0xf; 01086 01087 asm( 01088 "or %%rcx, %%rcx;\n\t" 01089 "jz .GE4;\n\t" 01090 ".GE2:;\n\t" 01091 "movdqu 0(%%rsi), %%xmm0;\n\t" 01092 "movdqu 8(%%rsi), %%xmm1;\n\t" 01093 "movdqu 16(%%rsi), %%xmm2;\n\t" 01094 "movdqu 24(%%rsi), %%xmm3;\n\t" 01095 "movdqu 32(%%rsi), %%xmm4;\n\t" 01096 "movdqu 40(%%rsi), %%xmm5;\n\t" 01097 "movdqu 
48(%%rsi), %%xmm6;\n\t" 01098 "movdqu 56(%%rsi), %%xmm7;\n\t" 01099 "cvtdq2pd %%xmm0, %%xmm0;\n\t" 01100 "cvtdq2pd %%xmm1, %%xmm1;\n\t" 01101 "cvtdq2pd %%xmm2, %%xmm2;\n\t" 01102 "cvtdq2pd %%xmm3, %%xmm3;\n\t" 01103 "cvtdq2pd %%xmm4, %%xmm4;\n\t" 01104 "cvtdq2pd %%xmm5, %%xmm5;\n\t" 01105 "cvtdq2pd %%xmm6, %%xmm6;\n\t" 01106 "cvtdq2pd %%xmm7, %%xmm7;\n\t" 01107 "movups %%xmm0, 0(%%rdi);\n\t" 01108 "movups %%xmm1, 16(%%rdi);\n\t" 01109 "movups %%xmm2, 32(%%rdi);\n\t" 01110 "movups %%xmm3, 48(%%rdi);\n\t" 01111 "movups %%xmm4, 64(%%rdi);\n\t" 01112 "movups %%xmm5, 80(%%rdi);\n\t" 01113 "movups %%xmm6, 96(%%rdi);\n\t" 01114 "movups %%xmm7, 112(%%rdi);\n\t" 01115 "add $64, %%rsi;\n\t" 01116 "add $128, %%rdi;\n\t" 01117 "dec %%rcx;\n\t" 01118 "jnz .GE2;\n\t" 01119 ".GE4:;\n\t" 01120 "or %%rdx, %%rdx;\n\t" 01121 "jz .GE1;\n\t" 01122 "mov %%rdx, %%rcx;\n\t" 01123 ".GE3:;\n\t" 01124 "movsd (%%rsi), %%xmm0;\n\t" 01125 "cvtdq2pd %%xmm0, %%xmm0;\n\t" 01126 "movsd %%xmm0, (%%rdi);\n\t" 01127 "add $4, %%rsi;\n\t" 01128 "add $8, %%rdi;\n\t" 01129 "loop .GE3;\n\t" 01130 ".GE1:;" 01131 : 01132 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 01133 :"memory" 01134 ); 01135 01136 } 01137 01138 //###################################################################### 01139 void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz) 01140 { 01141 int32 rcx=sz; 01142 int32 rdx=sz; 01143 01144 asm ( 01145 "or %%rcx, %%rcx;\n\t" 01146 "jz .GF1;\n\t" 01147 ".GF2:;\n\t" 01148 "movdqu 0(%%rsi), %%xmm0;\n\t" 01149 "movdqu 8(%%rsi), %%xmm1;\n\t" 01150 "movdqu 16(%%rsi), %%xmm2;\n\t" 01151 "movdqu 24(%%rsi), %%xmm3;\n\t" 01152 "movdqu 32(%%rsi), %%xmm4;\n\t" 01153 "movdqu 40(%%rsi), %%xmm5;\n\t" 01154 "movdqu 48(%%rsi), %%xmm6;\n\t" 01155 "movdqu 56(%%rsi), %%xmm7;\n\t" 01156 "cvtps2dq %%xmm0, %%xmm0;\n\t" 01157 "cvtps2dq %%xmm1, %%xmm1;\n\t" 01158 "cvtps2dq %%xmm2, %%xmm2;\n\t" 01159 "cvtps2dq %%xmm3, %%xmm3;\n\t" 01160 "cvtps2dq %%xmm4, %%xmm4;\n\t" 01161 "cvtps2dq %%xmm5, %%xmm5;\n\t" 01162 
"cvtps2dq %%xmm6, %%xmm6;\n\t" 01163 "cvtps2dq %%xmm7, %%xmm7;\n\t" 01164 "movdqu %%xmm0, 0(%%rdi);\n\t" 01165 "movdqu %%xmm1, 16(%%rdi);\n\t" 01166 "movdqu %%xmm2, 32(%%rdi);\n\t" 01167 "movdqu %%xmm3, 48(%%rdi);\n\t" 01168 "movdqu %%xmm4, 64(%%rdi);\n\t" 01169 "movdqu %%xmm5, 80(%%rdi);\n\t" 01170 "movdqu %%xmm6, 96(%%rdi);\n\t" 01171 "movdqu %%xmm7, 112(%%rdi);\n\t" 01172 "add $64, %%rsi;\n\t" 01173 "add $128, %%rdi;\n\t" 01174 "dec %%rcx;\n\t" 01175 "jnz .GF2;\n\t" 01176 ".GF4:;\n\t" 01177 "or %%rdx, %%rdx;\n\t" 01178 "jz .GF1;\n\t" 01179 "mov %%rdx, %%rcx;\n\t" 01180 ".GF3:;\n\t" 01181 "movsd (%%rsi), %%xmm0;\n\t" 01182 "cvtps2dq %%xmm0, %%xmm0;\n\t" 01183 "movsd %%xmm0, (%%rdi);\n\t" 01184 "add $4, %%rsi;\n\t" 01185 "add $8, %%rdi;\n\t" 01186 "loop .GF3;\n\t" 01187 ".GF1:;" 01188 : 01189 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 01190 :"memory" 01191 ); 01192 01193 } 01194 01195 01196 01197 //###################################################################### 01198 void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz) 01199 { 01200 int32 rcx=sz>>4; 01201 int32 rdx=sz&0xf; 01202 01203 asm( 01204 "or %%rcx, %%rcx;\n\t" 01205 "jz .GG4;\n\t" 01206 ".GG2:;\n\t" 01207 "movups 0(%%rsi), %%xmm0;\n\t" 01208 "movups 8(%%rsi), %%xmm1;\n\t" 01209 "movups 16(%%rsi), %%xmm2;\n\t" 01210 "movups 24(%%rsi), %%xmm3;\n\t" 01211 "movups 32(%%rsi), %%xmm4;\n\t" 01212 "movups 40(%%rsi), %%xmm5;\n\t" 01213 "movups 48(%%rsi), %%xmm6;\n\t" 01214 "movups 56(%%rsi), %%xmm7;\n\t" 01215 "cvtps2pd %%xmm0, %%xmm0;\n\t" 01216 "cvtps2pd %%xmm1, %%xmm1;\n\t" 01217 "cvtps2pd %%xmm2, %%xmm2;\n\t" 01218 "cvtps2pd %%xmm3, %%xmm3;\n\t" 01219 "cvtps2pd %%xmm4, %%xmm4;\n\t" 01220 "cvtps2pd %%xmm5, %%xmm5;\n\t" 01221 "cvtps2pd %%xmm6, %%xmm6;\n\t" 01222 "cvtps2pd %%xmm7, %%xmm7;\n\t" 01223 "movupd %%xmm0, 0(%%rdi);\n\t" 01224 "movupd %%xmm1, 16(%%rdi);\n\t" 01225 "movupd %%xmm2, 32(%%rdi);\n\t" 01226 "movupd %%xmm3, 48(%%rdi);\n\t" 01227 "movupd %%xmm4, 64(%%rdi);\n\t" 01228 "movupd 
%%xmm5, 80(%%rdi);\n\t" 01229 "movupd %%xmm6, 96(%%rdi);\n\t" 01230 "movupd %%xmm7, 112(%%rdi);\n\t" 01231 "add $64, %%rsi;\n\t" 01232 "add $128, %%rdi;\n\t" 01233 "dec %%rcx;\n\t" 01234 "jnz .GG2;\n\t" 01235 ".GG4:;\n\t" 01236 "or %%rdx, %%rdx;\n\t" 01237 "jz .GG1;\n\t" 01238 "mov %%rdx, %%rcx;\n\t" 01239 ".GG3:;\n\t" 01240 "movsd (%%rsi), %%xmm0;\n\t" 01241 "cvtps2pd %%xmm0, %%xmm0;\n\t" 01242 "movsd %%xmm0, (%%rdi);\n\t" 01243 "add $4, %%rsi;\n\t" 01244 "add $8, %%rdi;\n\t" 01245 "loop .GG3;\n\t" 01246 ".GG1:;" 01247 : 01248 :"S"(a), "D"(b), "c"(rcx),"d"(rdx) 01249 :"memory" 01250 ); 01251 } 01252 01253 #endif 01254 01255 #ifdef INVT_USE_SSE 01256 01257 //###################################################################### 01258 void sse_lowPass3x(const float *a, float *b, const int h, const int w) 01259 { 01260 const float coeffs[] = { 3.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0, 4.0}; 01261 int rdx = (w-2)/12; 01262 int rax = (w-2)%12; 01263 01264 asm ( 01265 // "movups 16(%%rbx), %%xmm7;\n\t" 01266 "or %%rcx, %%rcx;\n\t" 01267 "jz .HA1;\n\t" 01268 ".HA2:;\n\t" 01269 01270 // *dptr++ = (sptr[0]+sptr[0]+sptr[1])/3.0 01271 "movss 0(%%rsi), %%xmm1;\n\t" // xmm1 <- sptr[0] 01272 "movss 4(%%rsi), %%xmm2;\n\t" // xmm2 <- sptr[1] 01273 "addss %%xmm1, %%xmm1;\n\t" // xmm2 <- sptr[0] + sptr[0] 01274 "addss %%xmm1, %%xmm2;\n\t" // xmm2 <- xmm2 + sptr[1] 01275 "divss (%%rbx), %%xmm2;\n\t" // xmm2 <- xmm2/3.0 01276 "movss %%xmm2, (%%rdi);\n\t" // *dptr <- xmm2 01277 "add $4, %%rdi;\n\t" // ++dptr 01278 01279 // for (int i = 0; i < w - 2; i ++) 01280 "or %%rdx, %%rdx;\n\t" 01281 "jz .HA4;\n\t" 01282 01283 "push %%rdx;\n\t" 01284 ".HA3:;\n\t" 01285 "movups 00(%%rsi), %%xmm0;\n\t" 01286 "movups 04(%%rsi), %%xmm1;\n\t" 01287 "movups 8(%%rsi), %%xmm2;\n\t" 01288 "movups 16(%%rsi), %%xmm3;\n\t" 01289 "movups 20(%%rsi), %%xmm4;\n\t" 01290 "movups 24(%%rsi), %%xmm5;\n\t" 01291 "movups 32(%%rsi), %%xmm6;\n\t" 01292 "movups 36(%%rsi), %%xmm7;\n\t" 01293 "addps %%xmm1, %%xmm0;\n\t" 01294 
//######################################################################
/*! Horizontal 3-tap low-pass filter applied to every row (SSE).

  @param a source image, h rows of w floats, row-major
  @param b destination image, same layout as a
  @param h number of rows
  @param w number of columns

  Per row:
    leftmost point   (2*s[0] + s[1]) / 3
    interior points  (s[i-1] + 2*s[i] + s[i+1]) / 4
    rightmost point  (s[w-2] + 2*s[w-1]) / 3
  Interior points are computed 12 at a time with packed SSE, the rest
  one at a time. Rows narrower than 2 pixels are copied unchanged. */
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0) return; // empty image

  if (w < 2){
    memcpy(b, a, w*h*sizeof(b[0]));
    return; // nothing to smooth
  }

  // divps with a memory operand needs a 16-byte aligned address
  const float coeffs[] __attribute__((aligned(16))) =
    { 3.0F, 1.0F, 1.0F, 1.0F, 4.0F, 4.0F, 4.0F, 4.0F };

  // 64-bit counters: the asm operates on full 64-bit registers
  // (long is 64 bits on the LP64 targets this file is written for).
  long rows = h;                   // rows still to process
  const long nvec = (w-2)/12;      // packed iterations per row
  const long ntail = (w-2)%12;     // scalar interior points per row

  __asm__ __volatile__(
       "or %%rcx, %%rcx;\n\t"
       "jz .HA1_%=;\n\t"
       ".HA2_%=:;\n\t"

       // *dptr++ = (sptr[0]+sptr[0]+sptr[1])/3.0
       "movss 0(%%rsi), %%xmm1;\n\t"   // xmm1 <- sptr[0]
       "movss 4(%%rsi), %%xmm2;\n\t"   // xmm2 <- sptr[1]
       "addss %%xmm1, %%xmm1;\n\t"     // xmm1 <- sptr[0] + sptr[0]
       "addss %%xmm1, %%xmm2;\n\t"     // xmm2 <- xmm1 + sptr[1]
       "divss (%%rbx), %%xmm2;\n\t"    // xmm2 <- xmm2/3.0
       "movss %%xmm2, (%%rdi);\n\t"    // *dptr <- xmm2
       "add $4, %%rdi;\n\t"            // ++dptr

       // interior points, 12 per iteration; r8 is the per-row counter
       // so that rdx keeps its value for the next row
       "mov %%rdx, %%r8;\n\t"
       "or %%r8, %%r8;\n\t"
       "jz .HA4_%=;\n\t"
       ".HA3_%=:;\n\t"
       "movups 0(%%rsi), %%xmm0;\n\t"
       "movups 4(%%rsi), %%xmm1;\n\t"
       "movups 8(%%rsi), %%xmm2;\n\t"
       "movups 16(%%rsi), %%xmm3;\n\t"
       "movups 20(%%rsi), %%xmm4;\n\t"
       "movups 24(%%rsi), %%xmm5;\n\t"
       "movups 32(%%rsi), %%xmm6;\n\t"
       "movups 36(%%rsi), %%xmm7;\n\t"
       "addps %%xmm1, %%xmm0;\n\t"
       "addps %%xmm4, %%xmm3;\n\t"
       "addps %%xmm1, %%xmm0;\n\t"
       "addps %%xmm4, %%xmm3;\n\t"
       "movups 40(%%rsi), %%xmm1;\n\t"
       "addps %%xmm7, %%xmm6;\n\t"
       "addps %%xmm2, %%xmm0;\n\t"
       "addps %%xmm1, %%xmm6;\n\t"
       "addps %%xmm5, %%xmm3;\n\t"
       "addps %%xmm7, %%xmm6;\n\t"
       "divps 16(%%rbx), %%xmm0;\n\t"
       "divps 16(%%rbx), %%xmm3;\n\t"
       "divps 16(%%rbx), %%xmm6;\n\t"
       "movups %%xmm0, (%%rdi);\n\t"
       "movups %%xmm3, 16(%%rdi);\n\t"
       "movups %%xmm6, 32(%%rdi);\n\t"
       "add $48, %%rsi;\n\t"
       "add $48, %%rdi;\n\t"
       "dec %%r8;\n\t"
       "jnz .HA3_%=;\n\t"
       ".HA4_%=:;\n\t"

       // remaining interior points, one per iteration
       "mov %%rax, %%r8;\n\t"
       "or %%r8, %%r8;\n\t"
       "jz .HA6_%=;\n\t"
       ".HA5_%=:;\n\t"
       "movss 0(%%rsi), %%xmm0;\n\t"
       "movss 4(%%rsi), %%xmm1;\n\t"
       "movss 8(%%rsi), %%xmm2;\n\t"
       "addps %%xmm1, %%xmm0;\n\t"
       "addps %%xmm1, %%xmm2;\n\t"
       "addps %%xmm2, %%xmm0;\n\t"
       "divss 16(%%rbx), %%xmm0;\n\t"
       "movss %%xmm0, (%%rdi);\n\t"
       "add $4, %%rsi;\n\t"
       "add $4, %%rdi;\n\t"
       "dec %%r8;\n\t"
       "jnz .HA5_%=;\n\t"

       // *dptr++ = (sptr[0]+sptr[1]+sptr[1])/3.0
       ".HA6_%=:;\n\t"
       "movss (%%rsi), %%xmm1;\n\t"    // xmm1 <- sptr[0]
       "movss 4(%%rsi), %%xmm2;\n\t"   // xmm2 <- sptr[1]
       "addss %%xmm2, %%xmm2;\n\t"     // xmm2 <- sptr[1] + sptr[1]
       "addss %%xmm1, %%xmm2;\n\t"     // xmm2 <- xmm2 + sptr[0]
       "divss 0(%%rbx), %%xmm2;\n\t"   // xmm2 <- xmm2/3.0

       "movss %%xmm2, (%%rdi);\n\t"    // *dptr <- xmm2
       "add $4, %%rdi;\n\t"            // ++dptr
       "add $8, %%rsi;\n\t"            // sptr += 2
       "dec %%rcx;\n\t"
       "jnz .HA2_%=;\n\t"
       ".HA1_%=:;\n\t"
       // pointers and the row counter are changed by the asm, so they
       // are read-write operands; %= keeps the labels unique per copy
       : "+S"(a), "+D"(b), "+c"(rows)
       : "a"(ntail), "d"(nvec), "b"(coeffs)
       : "r8", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
         "xmm7", "cc", "memory"
       );
}
//######################################################################
/*! Vertical 3-tap low-pass filter applied to every column.

  @param a source image, h rows of w floats, row-major
  @param b destination image, same layout as a
  @param h number of rows
  @param w number of columns

  Per column:
    top row      (2*s[0] + s[w]) / 3
    middle rows  (s[-w] + 2*s[0] + s[w]) / 4
    bottom row   (s[-w] + 2*s[0]) / 3
  Images with fewer than 2 rows are copied unchanged.

  The earlier inline-asm version of this routine used only scalar SSE
  instructions, so plain C does the same arithmetic. The additions are
  kept in the same order as that version. */
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  if (h < 2){
    memcpy(b, a, w*h*sizeof(b[0]));
    return; // nothing to smooth
  }

  const float *sptr = a;
  float *dptr = b;

  // top row: sptr walks over row 0
  for (int i = 0; i < w; i ++)
    {
      *dptr++ = ((sptr[0] + sptr[0]) + sptr[w]) / 3.0F;
      sptr++;
    }
  sptr -= w;  // go back to top-left

  // middle rows: sptr points one row above the row being written
  for (int j = 0; j < h - 2; j ++)
    for (int i = 0; i < w; i ++)
      {
        *dptr++ = ((sptr[0] + sptr[w]) + (sptr[2*w] + sptr[w])) / 4.0F;
        sptr++;
      }

  // bottom row: sptr walks over row h-2
  for (int i = 0; i < w; i ++)
    {
      *dptr++ = (sptr[0] + (sptr[w] + sptr[w])) / 3.0F;
      sptr++;
    }
}
//######################################################################
/*! Horizontal 5-tap (1 4 6 4 1) low-pass filter applied to every row (SSE).

  @param src  source image, h rows of w floats, row-major
  @param dest destination image, same layout as src
  @param h    number of rows
  @param w    number of columns

  Border points use the truncated kernel renormalised to the taps that
  fall inside the row (see the per-case comments). Rows narrower than 2
  pixels are copied unchanged; widths 2 and 3 are handled in C; wider
  rows use the SSE kernel, 4 interior points at a time. */
void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
{
  if (h <= 0 || w <= 0) return; // empty image

  const float *sptr= src;
  float *dptr= dest;

  if(w<2)
    {
      memcpy(dest,src,h*w*sizeof(dest[0]));
      return;
    }

  if (w == 2) //////////////////////////////////////////////////
    {
      for (int j = 0; j < h; j ++)
        {
          // leftmost point  [ (6^) 4 ] / 10
          *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);

          // rightmost point  [ 4^ (6) ] / 10
          *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);

          sptr += 2;  // sptr back to same position as dptr
        }
      return;
    }

  if (w == 3) //////////////////////////////////////////////////
    {
      for (int j = 0; j < h; j ++)
        {
          // leftmost point  [ (6^) 4 1 ] / 11
          *dptr++ = sptr[0] * (6.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (1.0F / 11.0F);

          // middle point  [ 4^ (6) 4 ] / 14
          *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
            sptr[1] * (6.0F / 14.0F);

          // rightmost point  [ 1^ 4 (6) ] / 11
          *dptr++ = sptr[0] * (1.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (6.0F / 11.0F);

          sptr += 3;  // sptr back to same position as dptr
        }
      return;
    }

  // general case, w >= 4 ////////////////////////////////////////
  // Byte offsets used by the asm are given on the right.
  const float coeffs[] = {6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0,  //   0
                          4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0,  //  16
                          1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,  //  32
                          4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0,  //  48
                          6.0/16.0, 6.0/16.0, 6.0/16.0, 6.0/16.0,  //  64
                          1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0,  //  80
                          1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0   //  96
  };

  // 64-bit counters: the asm operates on full 64-bit registers
  // (long is 64 bits on the LP64 targets this file is written for).
  long rows = h;                  // rows still to process
  const long ntail = (w-4)&3;     // scalar interior points per row
  const long nvec = (w-4)>>2;     // packed iterations per row

  __asm__ __volatile__(
       "or %%rcx, %%rcx;\n\t"          // rcx <- h
       "jz .HG6_%=;\n\t"
       ".HG0_%=:;\n\t"
       // two leftmost points
       "movss (%%rsi), %%xmm0;\n\t"    // xmm0 <- s[0]
       "movss 4(%%rsi), %%xmm2;\n\t"   // xmm2 <- s[1]
       "movss 8(%%rsi), %%xmm4;\n\t"   // xmm4 <- s[2]
       "movss 12(%%rsi), %%xmm6;\n\t"  // xmm6 <- s[3]
       "movss %%xmm0, %%xmm1;\n\t"     // xmm1 <- s[0]
       "movss %%xmm2, %%xmm3;\n\t"     // xmm3 <- s[1]
       "movss %%xmm4, %%xmm5;\n\t"     // xmm5 <- s[2]
       "mulss (%%rbx), %%xmm0;\n\t"    // xmm0 <- 6.0/11.0*s[0]
       "mulss 4(%%rbx), %%xmm2;\n\t"   // xmm2 <- 4.0/11.0*s[1]
       "mulss 8(%%rbx), %%xmm4;\n\t"   // xmm4 <- 1.0/11.0*s[2]
       "addss %%xmm5, %%xmm1;\n\t"     // xmm1 <- s[2]+s[0]
       "mulss 16(%%rbx), %%xmm1;\n\t"  // xmm1 <- (s2+s0)*4.0/15.0
       "mulss 20(%%rbx), %%xmm3;\n\t"  // xmm3 <- 6.0/15.0*s[1]
       "mulss 24(%%rbx), %%xmm6;\n\t"  // xmm6 <- 1.0/15.0*s[3]
       "addss %%xmm2, %%xmm0;\n\t"
       "addss %%xmm3, %%xmm1;\n\t"
       "addss %%xmm4, %%xmm0;\n\t"
       "addss %%xmm6, %%xmm1;\n\t"
       "movss %%xmm0, (%%rdi);\n\t"
       "movss %%xmm1, 4(%%rdi);\n\t"
       "add $8, %%rdi;\n\t"

       // interior points, 4 per iteration; r8 is the per-row counter
       // so that rdx keeps its value for the next row
       "mov %%rdx, %%r8;\n\t"          // r8 <- (w-4)/4
       "or %%r8, %%r8;\n\t"
       "jz .HG5_%=;\n\t"
       "movups 32(%%rbx), %%xmm5;\n\t" // xmm5 <- 1.0/16.0 x4
       "movups 48(%%rbx), %%xmm6;\n\t" // xmm6 <- 4.0/16.0 x4
       "movups 64(%%rbx), %%xmm7;\n\t" // xmm7 <- 6.0/16.0 x4
       ".HG1_%=:;\n\t"
       "movups 0(%%rsi), %%xmm0;\n\t"  // xmm0 <- s0 s1 s2 s3
       "movups 4(%%rsi), %%xmm1;\n\t"  // xmm1 <- s1 s2 s3 s4
       "movups 8(%%rsi), %%xmm2;\n\t"  // xmm2 <- s2 s3 s4 s5
       "movups 12(%%rsi), %%xmm3;\n\t" // xmm3 <- s3 s4 s5 s6
       "movups 16(%%rsi), %%xmm4;\n\t" // xmm4 <- s4 s5 s6 s7
       "addps %%xmm4, %%xmm0;\n\t"
       "addps %%xmm3, %%xmm1;\n\t"
       "mulps %%xmm5, %%xmm0;\n\t"
       "mulps %%xmm6, %%xmm1;\n\t"
       "mulps %%xmm7, %%xmm2;\n\t"
       "addps %%xmm1, %%xmm0;\n\t"
       "addps %%xmm2, %%xmm0;\n\t"
       "movups %%xmm0, (%%rdi);\n\t"
       "add $16, %%rsi;\n\t"
       "add $16, %%rdi;\n\t"
       "dec %%r8;\n\t"
       "jnz .HG1_%=;\n\t"

       // remaining interior points, one per iteration
       ".HG5_%=:;\n\t"
       "mov %%rax, %%r8;\n\t"          // r8 <- (w-4)%4
       "or %%r8, %%r8;\n\t"
       "jz .HG3_%=;\n\t"
       "movups 32(%%rbx), %%xmm5;\n\t"
       "movups 48(%%rbx), %%xmm6;\n\t"
       "movups 64(%%rbx), %%xmm7;\n\t"
       ".HG2_%=:;\n\t"
       "movss (%%rsi), %%xmm0;\n\t"
       "movss 4(%%rsi), %%xmm1;\n\t"
       "movss 8(%%rsi), %%xmm2;\n\t"
       "movss 12(%%rsi), %%xmm3;\n\t"
       "movss 16(%%rsi), %%xmm4;\n\t"
       "mulss %%xmm5, %%xmm0;\n\t"
       "mulss %%xmm6, %%xmm1;\n\t"
       "mulss %%xmm7, %%xmm2;\n\t"
       "mulss %%xmm6, %%xmm3;\n\t"
       "mulss %%xmm5, %%xmm4;\n\t"
       "addss %%xmm1, %%xmm0;\n\t"
       "addss %%xmm3, %%xmm2;\n\t"
       "addss %%xmm4, %%xmm0;\n\t"
       "addss %%xmm2, %%xmm0;\n\t"
       "add $4, %%rsi;\n\t"
       "movss %%xmm0, (%%rdi);\n\t"
       "add $4, %%rdi;\n\t"
       "dec %%r8;\n\t"
       "jnz .HG2_%=;\n\t"

       // two rightmost points; rsi points at s[w-4]
       ".HG3_%=:;\n\t"
       "movss (%%rsi), %%xmm0;\n\t"    // xmm0 <- s0
       "movss 4(%%rsi), %%xmm1;\n\t"   // xmm1 <- s1
       "movss 8(%%rsi), %%xmm2;\n\t"   // xmm2 <- s2
       "movss 12(%%rsi), %%xmm3;\n\t"  // xmm3 <- s3
       "movss %%xmm1, %%xmm4;\n\t"     // xmm4 <- s1
       "movss %%xmm2, %%xmm5;\n\t"     // xmm5 <- s2
       "movss %%xmm3, %%xmm6;\n\t"     // xmm6 <- s3
       "addss %%xmm1, %%xmm3;\n\t"     // xmm3 <- s1+s3
       "mulss 80(%%rbx), %%xmm0;\n\t"  // xmm0 <- 1.0/15.0*s0
       "mulss 84(%%rbx), %%xmm3;\n\t"  // xmm3 <- 4.0/15.0*(s1+s3)
       "mulss 88(%%rbx), %%xmm2;\n\t"  // xmm2 <- 6.0/15.0*s2
       "addss %%xmm3, %%xmm0;\n\t"
       "addss %%xmm2, %%xmm0;\n\t"
       "movss %%xmm0, (%%rdi);\n\t"
       "mulss 96(%%rbx), %%xmm4;\n\t"  // xmm4 <- 1.0/11.0*s1
       "mulss 100(%%rbx), %%xmm5;\n\t" // xmm5 <- 4.0/11.0*s2
       "mulss 104(%%rbx), %%xmm6;\n\t" // xmm6 <- 6.0/11.0*s3
       "addss %%xmm5, %%xmm4;\n\t"
       "addss %%xmm6, %%xmm4;\n\t"
       "movss %%xmm4, 4(%%rdi);\n\t"
       "add $16, %%rsi;\n\t"           // sptr <- start of next row
       "add $8, %%rdi;\n\t"
       "dec %%rcx;\n\t"
       "jnz .HG0_%=;\n\t"
       ".HG6_%=:;\n\t"
       // pointers and the row counter are changed by the asm, so they
       // are read-write operands; %= keeps the labels unique per copy
       : "+S"(sptr), "+D"(dptr), "+c"(rows)
       : "a"(ntail), "b"(coeffs), "d"(nvec)
       : "r8", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
         "xmm7", "cc", "memory"
       );
}
//######################################################################
/*! Vertical 5-tap (1 4 6 4 1) low-pass filter applied to every column.

  @param src  source image, h rows of w floats, row-major
  @param dest destination image, same layout as src
  @param h    number of rows
  @param w    number of columns

  Border rows use the truncated kernel renormalised to the taps that
  fall inside the image (see the per-case comments). Images with fewer
  than 2 rows are copied unchanged. Every row is scanned horizontally,
  so memory is walked in storage order.

  The general case (h >= 4) is plain C; an SSE kernel for it used to be
  present only as disabled code, which left dest unwritten. */
void sse_lowPass5y(const float *src, float *dest, const int h,
                   const int w)
{
  if (h <= 0 || w <= 0) return; // empty image

  if (h < 2){
    memcpy(dest, src, h*w*sizeof(dest[0]));
    return; // nothing to smooth
  }

  const float *sptr= src;
  float *dptr= dest;

  // ########## vertical pass  (even though we scan horiz for speedup)
  const int w2 = w * 2, w3 = w * 3, w4 = w * 4; // row offsets

  if (h == 2) //////////////////////////////////////////////////
    {
      // topmost points  ( [ (6^) 4 ] / 10 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[0] * (6.0F / 10.0F) +
            sptr[w] * (4.0F / 10.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // bottommost points  ( [ 4^ (6) ] / 10 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[0] * (4.0F / 10.0F) +
            sptr[w] * (6.0F / 10.0F);
          sptr++;
        }
    }
  else if (h == 3) //////////////////////////////////////////////////
    {
      // topmost points  ( [ (6^) 4 1 ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (1.0F / 11.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // middle points  ( [ 4^ (6) 4 ] / 14 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) +
            sptr[ w] * (6.0F / 14.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (6.0F / 11.0F);
          sptr++;
        }
    }
  else  ///////////////////////////////// general case for height >= 4
    {
      // topmost points  ( [ (6^) 4 1 ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (1.0F / 11.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // second topmost points  ( [ 4^ (6) 4 1 ] / 15 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 15.0F) +
            sptr[ w] * (6.0F / 15.0F) +
            sptr[w3] * (1.0F / 15.0F);
          sptr++;
        }
      sptr -= w;  // go back to top-left

      // far from the borders  ( [ 1^ 4 (6) 4 1 ] / 16 )^T
      // sptr points two rows above the row being written and simply
      // keeps advancing from one row to the next
      for (int j = 0; j < h - 4; j ++)
        for (int i = 0; i < w; i ++)
          {
            *dptr++ = (sptr[ 0] + sptr[w4]) * (1.0F / 16.0F) +
              (sptr[ w] + sptr[w3]) * (4.0F / 16.0F) +
              sptr[w2] * (6.0F / 16.0F);
            sptr++;
          }

      // second bottommost points  ( [ 1^ 4 (6) 4 ] / 15 )^T
      // sptr is at row h-4 here and at row h-3 after this loop
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 15.0F) +
            (sptr[ w] + sptr[w3]) * (4.0F / 15.0F) +
            sptr[w2] * (6.0F / 15.0F);
          sptr++;
        }

      // bottommost points  ( [ 1^ 4 (6) ] / 11 )^T
      for (int i = 0; i < w; i ++)
        {
          *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
            sptr[ w] * (4.0F / 11.0F) +
            sptr[w2] * (6.0F / 11.0F);
          sptr++;
        }
    }
}
%%xmm1;\n\t" // xmm1 <- 00 00 v y1 02035 "cvtpi2ps %%mm2, %%xmm2;\n\t" // xmm2 <- 00 00 y3 y2 02036 02037 // 01 01 01 01 02038 "movaps %%xmm0, %%xmm3;\n\t" 02039 02040 // 00 00 00 00 02041 "movaps %%xmm1, %%xmm4;\n\t" 02042 02043 // 00 00 00 00 02044 "movaps %%xmm2, %%xmm5;\n\t" 02045 02046 // 01 01 01 01 02047 "movaps %%xmm2, %%xmm6;\n\t" 02048 02049 "shufps $0x55, %%xmm3, %%xmm3;\n\t"// xmm3 <- y0 y0 y0 y0 02050 "shufps $00, %%xmm4, %%xmm4;\n\t" // xmm4 <- y1 y1 y1 y1 02051 "shufps $0x00, %%xmm5, %%xmm5;\n\t"// xmm5 <- y2 y2 y2 y2 02052 "shufps $0x55, %%xmm6, %%xmm6;\n\t"// xmm6 <- y3 y3 y3 y3 02053 02054 // 00 00 00 00 02055 "shufps $0, %%xmm0, %%xmm0;\n\t" // xmm0 <- u u u u 02056 // 01 01 01 01 02057 "shufps $0x55, %%xmm1, %%xmm1;\n\t" // xmm1 <- v v v v 02058 02059 "subps 32(%%rdx), %%xmm0;\n\t" 02060 "subps 32(%%rdx), %%xmm1;\n\t" 02061 02062 "mulps (%%rdx), %%xmm0;\n\t" 02063 "mulps 16(%%rdx),%%xmm1;\n\t" 02064 02065 "addps %%xmm0, %%xmm3;\n\t" 02066 "addps %%xmm0, %%xmm4;\n\t" 02067 "addps %%xmm0, %%xmm5;\n\t" 02068 "addps %%xmm0, %%xmm6;\n\t" 02069 02070 "addps %%xmm1, %%xmm3;\n\t" // xmm3 <- xx b0 g0 r0 02071 "addps %%xmm1, %%xmm4;\n\t" // xmm4 <- xx b1 g1 r1 02072 "addps %%xmm1, %%xmm5;\n\t" // xmm5 <- xx b2 g2 r2 02073 "addps %%xmm1, %%xmm6;\n\t" // xmm6 <- xx b3 g3 r3 02074 02075 "cvtps2pi %%xmm3, %%mm0;\n\t" //mm0 <- g0 r0 02076 "movhlps %%xmm3, %%xmm3;\n\t" //xmm3 <- g0 r0 xx b0 02077 "cvtps2pi %%xmm3, %%mm1;\n\t" //mm1 <- xx b0 02078 "packssdw %%mm1, %%mm0;\n\t" //mm0<- xx b0 g0 r0 02079 02080 "cvtps2pi %%xmm4, %%mm2;\n\t" //mm2 <- g1 r1 02081 "movhlps %%xmm4, %%xmm4;\n\t" //xmm4 <- g1 r1 xx b1 02082 "cvtps2pi %%xmm4, %%mm3;\n\t" //mm3 <- xx b1 02083 "packssdw %%mm3, %%mm2;\n\t" //mm2<- xx b1 g1 r1 02084 02085 "cvtps2pi %%xmm5, %%mm4;\n\t" //mm4 <- g2 r2 02086 "movhlps %%xmm5, %%xmm5;\n\t" //xmm5 <- g2 r2 xx b2 02087 "cvtps2pi %%xmm5, %%mm5;\n\t" //mm5 <- xx b2 02088 "packssdw %%mm5, %%mm4;\n\t" //mm4<- xx b2 g2 r2 02089 02090 "cvtps2pi %%xmm6, 
%%mm6;\n\t" //mm6 <- g3 r3 02091 "movhlps %%xmm6, %%xmm6;\n\t" //xmm3 <- g3 r3 xx b3 02092 "cvtps2pi %%xmm6, %%mm7;\n\t" //mm7 <- xx b3 02093 "packssdw %%mm7, %%mm6;\n\t" //mm6<- xx b3 g3 r3 02094 02095 "pxor %%mm1, %%mm1;\n\t" 02096 "pcmpgtw %%mm0, %%mm1;\n\t" 02097 "pandn %%mm0, %%mm1;\n\t" 02098 02099 "pxor %%mm3, %%mm3;\n\t" 02100 "pcmpgtw %%mm2, %%mm3;\n\t" 02101 "pandn %%mm2, %%mm3;\n\t" 02102 02103 "pxor %%mm5, %%mm5;\n\t" 02104 "pcmpgtw %%mm4, %%mm5;\n\t" 02105 "pandn %%mm4, %%mm5;\n\t" 02106 02107 "pxor %%mm7, %%mm7;\n\t" 02108 "pcmpgtw %%mm6, %%mm7;\n\t" 02109 "pandn %%mm6, %%mm7;\n\t" 02110 02111 "packuswb %%mm1, %%mm1;\n\t" //mm0<- xx xx xx xx xx b0 g0 r0 02112 "packuswb %%mm3, %%mm3;\n\t" //mm2<- xx xx xx xx xx b1 g1 r1 02113 "packuswb %%mm5, %%mm5;\n\t" //mm4<- xx xx xx xx xx b2 g2 r2 02114 "packuswb %%mm7, %%mm7;\n\t" //mm6<- xx xx xx xx xx b3 g3 r3 02115 02116 "push %%rcx;\n\t" 02117 "push %%rdx;\n\t" 02118 "movd %%mm1, %%rax;\n\t" // rax <- xx b0 g0 r0 02119 "movd %%mm3, %%rbx;\n\t" // rbx <- xx b1 g1 r1 02120 "movd %%mm5, %%rcx;\n\t" // rcx <- xx b2 g2 r2 02121 "movd %%mm7, %%rdx;\n\t" // rdx <- xx b3 g3 r3 02122 "movw %%ax, (%%rdi);\n\t" 02123 "movw %%bx,3(%%rdi);\n\t" 02124 "movw %%cx,6(%%rdi);\n\t" 02125 "movw %%dx,9(%%rdi);\n\t" 02126 "shr $8, %%rax;\n\t" 02127 "shr $8, %%rbx;\n\t" 02128 "shr $8, %%rcx;\n\t" 02129 "shr $8, %%rdx;\n\t" 02130 "movb %%ah, 2(%%rdi);\n\t" 02131 "movb %%bh, 5(%%rdi);\n\t" 02132 "movb %%ch, 8(%%rdi);\n\t" 02133 "movb %%dh,11(%%rdi);\n\t" 02134 "pop %%rdx;\n\t" 02135 "pop %%rcx;\n\t" 02136 02137 "add $12,%%rdi;\n\t" 02138 "dec %%rcx;\n\t" 02139 "add $6, %%rsi;\n\t" 02140 "jmp .JA0;\n\t" 02141 ".JA1:;\n\t" 02142 "emms;\n\t" 02143 : 02144 :"S"(src),"D"(dest),"c"(rcx),"d"(coeffs) 02145 :"rax","rbx","memory" 02146 ); 02147 02148 } 02149 02150 02151 02152 02153 void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w) 02154 { 02155 02156 for (int j = 0; j < h; j ++) 02157 { 02158 // leftmost points 02159 
*dptr++ = sptr[0] * (70.0F / 163.0F) + 02160 sptr[1] * (56.0F / 163.0F) + 02161 sptr[2] * (28.0F / 163.0F) + 02162 sptr[3] * ( 8.0F / 163.0F) + 02163 sptr[4] * ( 1.0F / 163.0F); 02164 *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) + 02165 sptr[1] * (70.0F / 219.0F) + 02166 sptr[3] * (28.0F / 219.0F) + 02167 sptr[4] * ( 8.0F / 219.0F) + 02168 sptr[5] * ( 1.0F / 219.0F); 02169 *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) + 02170 (sptr[1] + sptr[3]) * (56.0F / 247.0F) + 02171 sptr[2] * (70.0F / 247.0F) + 02172 sptr[5] * ( 8.0F / 247.0F) + 02173 sptr[6] * ( 1.0F / 247.0F); 02174 *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) + 02175 (sptr[1] + sptr[5]) * (28.0F / 255.0F) + 02176 (sptr[2] + sptr[4]) * (56.0F / 255.0F) + 02177 sptr[3] * (70.0F / 255.0F) + 02178 sptr[7] * ( 1.0F / 255.0F); 02179 02180 // far from the borders 02181 for (int i = 0; i < w - 8; i ++) 02182 { 02183 *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) + 02184 (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) + 02185 (sptr[2] + sptr[6]) * (28.0F / 256.0F) + 02186 (sptr[3] + sptr[5]) * (56.0F / 256.0F) + 02187 sptr[4] * (70.0F / 256.0F); 02188 sptr ++; 02189 } 02190 02191 // rightmost points 02192 *dptr++ = sptr[0] * ( 1.0F / 255.0F) + 02193 (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) + 02194 (sptr[2] + sptr[6]) * (28.0F / 255.0F) + 02195 (sptr[3] + sptr[5]) * (56.0F / 255.0F) + 02196 sptr[4] * (70.0F / 255.0F); 02197 sptr ++; 02198 *dptr++ = sptr[0] * ( 1.0F / 247.0F) + 02199 sptr[1] * ( 8.0F / 247.0F) + 02200 (sptr[2] + sptr[6]) * (28.0F / 247.0F) + 02201 (sptr[3] + sptr[5]) * (56.0F / 247.0F) + 02202 sptr[4] * (70.0F / 247.0F); 02203 sptr ++; 02204 *dptr++ = sptr[0] * ( 1.0F / 219.0F) + 02205 sptr[1] * ( 8.0F / 219.0F) + 02206 sptr[2] * (28.0F / 219.0F) + 02207 (sptr[3] + sptr[5]) * (56.0F / 219.0F) + 02208 sptr[4] * (70.0F / 219.0F); 02209 sptr ++; 02210 *dptr++ = sptr[0] * ( 1.0F / 163.0F) + 02211 sptr[1] * ( 8.0F / 163.0F) + 02212 sptr[2] * (28.0F / 163.0F) + 02213 sptr[3] * (56.0F / 163.0F) + 02214 
sptr[4] * (70.0F / 163.0F); 02215 sptr += 5; // sptr back to same as dptr (start of next line) 02216 } 02217 } 02218 #endif 02219 02220 //############################################################################ 02221 /* So things look consistent in everyone's emacs... */ 02222 /* Local Variables: */ 02223 /* indent-tabs-mode: nil */ 02224 /* End: */ 02225 02226 02227