00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "Util/mmx-sse.H"
00040 #include "Util/log.H"
00041
00042
00043 typedef int int32;
00044 typedef unsigned char byte;
00045 typedef float float32;
00046
00047 #ifdef INVT_CPU_OPTERON
00048
00049 #ifdef INVT_USE_SSE
00050
00051
//! Element-wise absolute difference of two double arrays.
/*! Stores |a[i] - b[i]| in diff[i] for every i in [0, sz).
    @param a    first input, at least sz elements
    @param b    second input, at least sz elements
    @param diff output, at least sz elements
    @param sz   element count; nothing is written when sz <= 0 */
void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz)
{
  // No function-local statics here: the trip count must be derived from
  // sz on every call, not frozen at the value seen by the first call.
  // The loop is deliberately simple so the compiler's vectorizer can map
  // it onto packed subtract/max instructions.
  for (int32 i = 0; i < sz; ++i)
    {
      const double x = a[i];
      const double y = b[i];
      diff[i] = (x > y) ? (x - y) : (y - x);
    }
}
00104 #endif
00105
00106 #ifdef INVT_USE_MMXSSE2
00107
00108
//! Element-wise absolute difference of two float arrays.
/*! Stores |a[i] - b[i]| in diff[i] for every i in [0, sz).
    @param a    first input, at least sz elements
    @param b    second input, at least sz elements
    @param diff output, at least sz elements
    @param sz   element count; nothing is written when sz <= 0 */
void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz)
{
  // The count is taken from sz on every call; caching it in a static
  // would pin it to the length of the very first call.
  for (int32 i = 0; i < sz; ++i)
    {
      const float x = a[i];
      const float y = b[i];
      diff[i] = (x > y) ? (x - y) : (y - x);
    }
}
00162
00163
00164
00165
00166
//! Element-wise absolute difference of two 32-bit integer arrays.
/*! Stores |a[i] - b[i]| in diff[i] for every i in [0, sz), comparing the
    operands as signed values.
    @param a    first input, at least sz elements
    @param b    second input, at least sz elements
    @param diff output, at least sz elements
    @param sz   element count; nothing is written when sz <= 0 */
void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz)
{
  // Each element is handled as one full 32-bit lane (16-bit saturating
  // word arithmetic gives wrong answers beyond 16-bit magnitudes) and
  // exactly one element is read and written per step.
  for (int32 i = 0; i < sz; ++i)
    {
      const int32 x = a[i];
      const int32 y = b[i];
      const unsigned int hi = (unsigned int)((x > y) ? x : y);
      const unsigned int lo = (unsigned int)((x > y) ? y : x);
      // Subtract in unsigned arithmetic so a distance larger than INT_MAX
      // wraps instead of being signed-overflow undefined behaviour.
      diff[i] = (int32)(hi - lo);
    }
}
00220
00221
00222
00223
//! Element-wise absolute difference of two byte arrays.
/*! Stores |a[i] - b[i]| in diff[i] for every i in [0, sz).
    @param a    first input, at least sz elements
    @param b    second input, at least sz elements
    @param diff output, at least sz elements
    @param sz   element count; nothing is written when sz <= 0 */
void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz)
{
  // The count comes from sz on every call (no cached statics).
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      // Larger minus smaller always fits in a byte.
      diff[i] = (byte)((x > y) ? (x - y) : (y - x));
    }
}
00277 #endif
00278
00279 #ifdef INVT_USE_SSE
00280
00281
//! Sum of a double array.
/*! Writes a[0] + ... + a[sz-1] to *sum (0.0 when sz <= 0).
    @param a   input, at least sz elements
    @param sum receives the total; must not be null
    @param sz  element count */
void sse_sum(const double *a, double *sum, const int32 sz)
{
  // Exactly sz elements are read, one per step, and the count is taken
  // from sz on every call rather than from a cached static.
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += a[i];
  *sum = total;
}
00329 #endif
00330
00331 #ifdef INVT_USE_MMXSSE2
00332
00333
//! Sum of a float array, accumulated in double precision.
/*! Writes a[0] + ... + a[sz-1] to *sum (0.0 when sz <= 0).
    @param a   input, at least sz elements
    @param sum receives the total; must not be null
    @param sz  element count */
void sse2_sum(const float *a, double *sum, const int32 sz)
{
  // Each float is widened to double before it is added, and exactly sz
  // elements are read; the count is recomputed from sz on every call.
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];
  *sum = total;
}
00381
00382
00383
00384
//! Sum of a 32-bit integer array, accumulated in double precision.
/*! Writes a[0] + ... + a[sz-1] to *sum (0.0 when sz <= 0).
    @param a   input, at least sz elements
    @param sum receives the total; must not be null
    @param sz  element count */
void sse2_sum(const int32 *a, double *sum, const int32 sz)
{
  // Short inputs (fewer than one vector block) and sz == 0 need no special
  // casing: the loop simply runs sz times, reading one element per step.
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];
  *sum = total;
}
00431
00432
00433
00434
//! Sum of a byte array, returned as a double.
/*! Writes a[0] + ... + a[sz-1] to *sum (0.0 when sz <= 0).
    @param a   input, at least sz elements
    @param sum receives the total; must not be null
    @param sz  element count */
void sse2_sum(const byte *a, double *sum, const int32 sz)
{
  // The accumulator always starts from zero, including for inputs shorter
  // than one 32-byte block. With a 32-bit count the total stays far below
  // 2^53, so accumulating directly in double is exact.
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];
  *sum = total;
}
00493 #endif
00494
00495 #ifdef INVT_USE_SSE
00496
00497
//! Element-wise difference of two byte arrays, clamped at zero.
/*! Stores a[i] - b[i] in result[i] when a[i] > b[i], and 0 otherwise,
    for every i in [0, sz).
    @param a      minuend, at least sz elements
    @param b      subtrahend, at least sz elements
    @param result output, at least sz elements
    @param sz     element count; nothing is written when sz <= 0 */
void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
{
  // Every element, including the trailing ones that do not fill a whole
  // vector block, gets the same saturating subtraction, and exactly sz
  // elements are touched.
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      result[i] = (byte)((x > y) ? (x - y) : 0);
    }
}
00550
00551
00552
00553
//! Element-wise difference of two float arrays, clamped at zero.
/*! Stores a[i] - b[i] in result[i] when b[i] < a[i], and 0 otherwise
    (including when either operand is NaN), for every i in [0, sz).
    @param a      minuend, at least sz elements
    @param b      subtrahend, at least sz elements
    @param result output, at least sz elements
    @param sz     element count; nothing is written when sz <= 0 */
void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result,
                     const int32 sz)
{
  // One element per step, so all sz elements are covered regardless of
  // how sz relates to any vector block size.
  for (int32 i = 0; i < sz; ++i)
    {
      const float32 x = a[i];
      const float32 y = b[i];
      result[i] = (y < x) ? (x - y) : 0.0f;
    }
}
00605
00606
00607
00608
//! Element-wise difference of two 32-bit integer arrays, clamped at zero.
/*! Stores a[i] - b[i] in c[i] when a[i] > b[i] (signed comparison), and 0
    otherwise, for every i in [0, sz).
    @param a  minuend, at least sz elements
    @param b  subtrahend, at least sz elements
    @param c  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz)
{
  // Exactly one 4-byte element is read and written per step, so the last
  // store never spills past c[sz - 1].
  for (int32 i = 0; i < sz; ++i)
    {
      const int32 x = a[i];
      const int32 y = b[i];
      // Unsigned subtraction gives the wrapping result of a packed 32-bit
      // subtract without signed-overflow undefined behaviour.
      c[i] = (x > y) ? (int32)((unsigned int)x - (unsigned int)y) : 0;
    }
}
00657
00658
00659
00660
//! Subtracts every byte of an array from a constant.
/*! Stores (val - a[i]) modulo 256 in result[i] for every i in [0, sz).
    @param a      input, at least sz elements
    @param result output, at least sz elements
    @param val    constant to subtract each element from
    @param sz     element count; nothing is written when sz <= 0 */
void sse_binaryReverse(const byte *a, byte *result, const byte val, const
                       int32 sz)
{
  // val is used directly, so no broadcast scratch buffer has to be filled
  // and result is never written beyond its first sz elements.  The count
  // is taken from sz on every call.
  for (int32 i = 0; i < sz; ++i)
    result[i] = (byte)(val - a[i]);
}
00721
00722
00723
00724
//! Subtracts every float of an array from a constant.
/*! Stores val - a[i] in result[i] for every i in [0, sz).
    @param a      input, at least sz elements
    @param result output, at least sz elements
    @param val    constant to subtract each element from
    @param sz     element count; nothing is written when sz <= 0 */
void sse_binaryReverse(const float *a, float *result, const float val,
                       const int sz)
{
  // A floating-point subtraction is applied to each 4-byte element in
  // turn; the count is taken from sz on every call.
  for (int i = 0; i < sz; ++i)
    result[i] = val - a[i];
}
00785
00786
00787
00788
00789
//! Subtracts every 32-bit integer of an array from a constant.
/*! Stores val - a[i] (wrapping on overflow) in result[i] for every i in
    [0, sz).
    @param a      input, at least sz elements
    @param result output, at least sz elements
    @param val    constant to subtract each element from
    @param sz     element count; nothing is written when sz <= 0 */
void sse_binaryReverse(const int32 *a, int32 *result, const int32 val,
                       const int32 sz)
{
  // One element per step: the trailing elements that do not fill a whole
  // vector block are handled individually, so neither buffer is accessed
  // beyond index sz - 1.
  for (int32 i = 0; i < sz; ++i)
    {
      // Unsigned arithmetic reproduces packed-subtract wraparound without
      // signed-overflow undefined behaviour.
      result[i] = (int32)((unsigned int)val - (unsigned int)a[i]);
    }
}
00848
00849
00850
00851
00852
//! Widens an array of bytes to 32-bit integers.
/*! Stores the value of a[i] (0..255) in b[i] for every i in [0, sz).
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz)
{
  // Each step writes exactly one 4-byte element, so the final store stays
  // inside b[0 .. sz-1].
  for (int32 i = 0; i < sz; ++i)
    b[i] = (int32)a[i];
}
00904
00905 #endif
00906
00907 #ifdef INVT_USE_MMXSSE2
00908
00909
00910
//! Converts an array of bytes to floats.
/*! Stores the value of a[i] (0..255) as a float in b[i] for every i in
    [0, sz).
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
{
  // Only a[0 .. sz-1] is read: there are no wide loads that could reach
  // past the end of the input buffer.
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float32)a[i];
}
00963
00964
00965
00966
00967
//! Converts an array of bytes to doubles.
/*! Stores the value of a[i] (0..255) as a double in b[i] for every i in
    [0, sz).
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
{
  // Only a[0 .. sz-1] is read: there are no wide loads that could reach
  // past the end of the input buffer.
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01021
01022
01023
01024
01025
//! Converts an array of 32-bit integers to floats.
/*! Stores a[i] converted to float in b[i] for every i in [0, sz).  Values
    whose magnitude exceeds 2^24 may not be exactly representable and are
    rounded by the conversion.
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
{
  // Exactly one 4-byte element is read per step, so the last read stays
  // inside a[0 .. sz-1].
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float)a[i];
}
01081
01082
01083
//! Converts an array of 32-bit integers to doubles.
/*! Stores a[i] converted to double in b[i] for every i in [0, sz); the
    conversion is exact for every 32-bit value.
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
{
  // Exactly one 4-byte element is read per step, so no load reaches past
  // a[sz - 1].
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01139
01140
// Converts one float to int with round-to-nearest, ties to even, which is
// what the cvtps2dq instruction produces under the default rounding mode.
// NaN and values outside the int range yield INT_MIN, the value that
// instruction returns for unrepresentable inputs.
static int sse2_round_float_to_int(const float x)
{
  if (!(x >= -2147483648.0f && x < 2147483648.0f))
    return -2147483647 - 1;            // branch is also taken for NaN

  int n = (int)x;                      // truncates toward zero
  // Exact: only |x| < 2^23 can carry a fraction, and there both x and n
  // are exactly representable with room to spare.
  const float frac = x - (float)n;
  if (frac > 0.5f || (frac == 0.5f && (n & 1)))
    ++n;
  else if (frac < -0.5f || (frac == -0.5f && (n & 1)))
    --n;
  return n;
}

//! Converts an array of floats to 32-bit integers, rounding to nearest.
/*! Stores a[i] rounded to the nearest integer (ties to even) in b[i] for
    every i in [0, sz).  NaN and out-of-range inputs become INT_MIN.
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
{
  // Input and output elements are both 4 bytes wide, so both arrays are
  // walked with the same index, once over the sz elements.
  for (int32 i = 0; i < sz; ++i)
    b[i] = sse2_round_float_to_int(a[i]);
}
01196
01197
01198
01199
//! Widens an array of floats to doubles.
/*! Stores a[i] converted to double in b[i] for every i in [0, sz); the
    conversion is exact.
    @param a  input, at least sz elements
    @param b  output, at least sz elements
    @param sz element count; nothing is written when sz <= 0 */
void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
{
  // Exactly one 4-byte element is read per step, so no load reaches past
  // a[sz - 1].
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01254
01255 #endif
01256
01257 #ifdef INVT_USE_SSE
01258
01259
/*
 * Horizontal 3-tap low-pass filter over an h x w float image.
 *
 *   a : source image, h rows of w floats stored contiguously
 *   b : destination image of the same size (must not overlap a)
 *   h : number of rows
 *   w : number of columns
 *
 * Interior pixels get (left + 2*centre + right) / 4.  The first and last
 * pixel of every row use the truncated kernel (2*edge + neighbour) / 3.
 * Rows narrower than two pixels are copied unchanged; nothing is done when
 * h or w is not positive.
 *
 * Written in portable C: the previous hand-written x86-64 assembly pushed
 * onto the stack inside the asm statement and modified registers it had
 * declared as inputs.
 */
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      memcpy(b, a, (size_t)w * (size_t)h * sizeof(b[0]));
      return;
    }

  const float *sptr = a;
  float *dptr = b;

  for (int j = 0; j < h; j ++)
    {
      /* leftmost pixel: (2*a0 + a1) / 3 */
      *dptr++ = (sptr[0] + sptr[0] + sptr[1]) / 3.0F;

      /* interior pixels: (a0 + 2*a1 + a2) / 4 */
      for (int i = 0; i < w - 2; i ++)
        {
          *dptr++ = (sptr[0] + sptr[1] + sptr[1] + sptr[2]) / 4.0F;
          sptr ++;
        }

      /* rightmost pixel: (a[w-2] + 2*a[w-1]) / 3 */
      *dptr++ = (sptr[1] + sptr[1] + sptr[0]) / 3.0F;

      /* sptr sits on column w-2; move to the start of the next row */
      sptr += 2;
    }
}
01355
01356
01357
01358
01359
01360
/*
 * Vertical 3-tap low-pass filter over an h x w float image.
 *
 *   a : source image, h rows of w floats stored contiguously
 *   b : destination image of the same size (must not overlap a)
 *   h : number of rows
 *   w : number of columns
 *
 * Interior rows get (above + 2*centre + below) / 4.  The top and bottom
 * rows use the truncated kernel (2*edge + neighbour) / 3.  Images with
 * fewer than two rows are copied unchanged; nothing is done when h or w is
 * not positive.
 *
 * Written in portable C: the previous x86-64 assembly used rcx without
 * declaring it, lacked a "memory" clobber although it writes b, and pushed
 * onto the stack inside the asm statement.
 */
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (h < 2){
    memcpy(b, a, (size_t)w * (size_t)h * sizeof(b[0]));
    return;
  }

  const float *sptr = a;
  float *dptr = b;

  /* top row: (2*row0 + row1) / 3 */
  for (int i = 0; i < w; i ++)
    dptr[i] = (sptr[i] + sptr[i] + sptr[w + i]) / 3.0F;
  dptr += w;

  /* interior rows; sptr points at the row above the one being produced */
  for (int j = 0; j < h - 2; j ++)
    {
      for (int i = 0; i < w; i ++)
        dptr[i] = ((sptr[i] + sptr[w + i]) +
                   (sptr[2 * w + i] + sptr[w + i])) / 4.0F;
      sptr += w;
      dptr += w;
    }

  /* bottom row: (row[h-2] + 2*row[h-1]) / 3; sptr is now at row h-2 */
  for (int i = 0; i < w; i ++)
    dptr[i] = (sptr[w + i] + sptr[w + i] + sptr[i]) / 3.0F;
}
01445
01446
01447
01448
01449 void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
01450 {
01451 const float *sptr= src;
01452 float *dptr= dest;
01453
01454 if(w<2)
01455 {
01456 memcpy(dest,src,h*w*sizeof(dest[0]));
01457 return;
01458 }
01459
01460 if (w == 2)
01461 for (int j = 0; j < h; j ++)
01462 {
01463
01464 *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);
01465
01466
01467 *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);
01468
01469 sptr += 2;
01470 }
01471 else if (w == 3)
01472 for (int j = 0; j < h; j ++)
01473 {
01474
01475 *dptr++ = sptr[0] * (6.0F / 11.0F) +
01476 sptr[1] * (4.0F / 11.0F) +
01477 sptr[2] * (1.0F / 11.0F);
01478
01479
01480 *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
01481 sptr[1] * (6.0F / 14.0F);
01482
01483
01484 *dptr++ = sptr[0] * (1.0F / 11.0F) +
01485 sptr[1] * (4.0F / 11.0F) +
01486 sptr[2] * (6.0F / 11.0F);
01487
01488 sptr += 3;
01489 }
01490 else
01491 if(w>3)
01492 {
01493 const float coeffs[] = {6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0,
01494 4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0,
01495 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
01496 4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0,
01497 6.0/16.0, 6.0/16.0, 6.0/16.0, 6.0/16.0,
01498 1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0,
01499 1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0
01500 };
01501
01502 int rax= (w-4)&3;
01503 int rdx= (w-4)>>2;
01504
01505 asm(
01506 "or %%rcx, %%rcx;\n\t"
01507 "jz .HG6;\n\t"
01508 ".HG0:;\n\t"
01509 "movss (%%rsi), %%xmm0;\n\t"
01510 "movss 4(%%rsi), %%xmm2;\n\t"
01511 "movss 8(%%rsi), %%xmm4;\n\t"
01512 "movss 12(%%rsi), %%xmm6;\n\t"
01513 "movss %%xmm0, %%xmm1;\n\t"
01514 "movss %%xmm2, %%xmm3;\n\t"
01515 "movss %%xmm4, %%xmm5;\n\t"
01516 "mulss (%%rbx), %%xmm0;\n\t"
01517 "mulss 4(%%rbx), %%xmm2;\n\t"
01518 "mulss 8(%%rbx), %%xmm4;\n\t"
01519 "addss %%xmm5, %%xmm1;\n\t"
01520 "mulss 16(%%rbx), %%xmm1;\n\t"
01521 "mulss 20(%%rbx), %%xmm3;\n\t"
01522 "mulss 24(%%rbx), %%xmm6;\n\t"
01523 "addss %%xmm2, %%xmm0;\n\t"
01524 "addss %%xmm3, %%xmm1;\n\t"
01525 "addss %%xmm4, %%xmm0;\n\t"
01526 "addss %%xmm6, %%xmm1;\n\t"
01527 "movss %%xmm0, (%%rdi);\n\t"
01528 "movss %%xmm1, 4(%%rdi);\n\t"
01529 "add $8, %%rdi;\n\t"
01530
01531 "or %%rdx, %%rdx;\n\t"
01532 "jz .HG5;\n\t"
01533
01534 "push %%rdx;\n\t"
01535 "movups 32(%%rbx), %%xmm5;\n\t"
01536 "movups 48(%%rbx), %%xmm6;\n\t"
01537 "movups 64(%%rbx), %%xmm7;\n\t"
01538 ".HG1:;\n\t"
01539 "movups 0(%%rsi), %%xmm0;\n\t"
01540 "movups 04(%%rsi), %%xmm1;\n\t"
01541 "movups 8(%%rsi), %%xmm2;\n\t"
01542 "movups 12(%%rsi), %%xmm3;\n\t"
01543 "movups 16(%%rsi), %%xmm4;\n\t"
01544 "addps %%xmm4, %%xmm0;\n\t"
01545 "addps %%xmm3, %%xmm1;\n\t"
01546 "mulps %%xmm5, %%xmm0;\n\t"
01547 "mulps %%xmm6, %%xmm1;\n\t"
01548 "mulps %%xmm7, %%xmm2;\n\t"
01549 "addps %%xmm1, %%xmm0;\n\t"
01550 "addps %%xmm2, %%xmm0;\n\t"
01551 "movups %%xmm0, (%%rdi);\n\t"
01552 "add $16, %%rsi;\n\t"
01553 "add $16, %%rdi;\n\t"
01554 "dec %%rdx;\n\t"
01555 "jnz .HG1;\n\t"
01556 "pop %%rdx;\n\t"
01557
01558 ".HG5:;\n\t"
01559 "or %%rax, %%rax;\n\t"
01560 "jz .HG3;\n\t"
01561 "push %%rax;\n\t"
01562 "movups 32(%%rbx), %%xmm5;\n\t"
01563 "movups 48(%%rbx), %%xmm6;\n\t"
01564 "movups 64(%%rbx), %%xmm7;\n\t"
01565 ".HG2:;\n\t"
01566 "movss (%%rsi), %%xmm0;\n\t"
01567 "movss 4(%%rsi), %%xmm1;\n\t"
01568 "movss 8(%%rsi), %%xmm2;\n\t"
01569 "movss 12(%%rsi), %%xmm3;\n\t"
01570 "movss 16(%%rsi), %%xmm4;\n\t"
01571 "mulss %%xmm5 , %%xmm0;\n\t"
01572 "mulss %%xmm6 , %%xmm1;\n\t"
01573 "mulss %%xmm7 , %%xmm2;\n\t"
01574 "mulss %%xmm6 , %%xmm3;\n\t"
01575 "mulss %%xmm5 , %%xmm4;\n\t"
01576 "addss %%xmm1, %%xmm0;\n\t"
01577 "addss %%xmm3, %%xmm2;\n\t"
01578 "addss %%xmm4, %%xmm0;\n\t"
01579 "addss %%xmm2, %%xmm0;\n\t"
01580 "add $4, %%rsi;\n\t"
01581 "movss %%xmm0, (%%rdi);\n\t"
01582 "add $4, %%rdi;\n\t"
01583 "dec %%rax;\n\t"
01584 "jnz .HG2;\n\t"
01585 "pop %%rax;\n\t"
01586 ".HG3:;\n\t"
01587 "movss (%%rsi), %%xmm0;\n\t"
01588 "movss 4(%%rsi), %%xmm1;\n\t"
01589 "movss 8(%%rsi), %%xmm2;\n\t"
01590 "movss 12(%%rsi), %%xmm3;\n\t"
01591 "movss %%xmm1, %%xmm4;\n\t"
01592 "movss %%xmm2, %%xmm5;\n\t"
01593 "movss %%xmm3, %%xmm6;\n\t"
01594 "addps %%xmm1, %%xmm3;\n\t"
01595 "mulss 80(%%rbx), %%xmm0;\n\t"
01596 "mulss 84(%%rbx), %%xmm3;\n\t"
01597 "mulss 88(%%rbx), %%xmm2;\n\t"
01598 "addss %%xmm3, %%xmm0;\n\t"
01599 "addss %%xmm2, %%xmm0;\n\t"
01600 "movss %%xmm0, (%%rdi);\n\t"
01601 "mulss 96(%%rbx), %%xmm4;\n\t"
01602 "mulss 100(%%rbx), %%xmm5;\n\t"
01603 "mulss 104(%%rbx), %%xmm6;\n\t"
01604 "addss %%xmm5, %%xmm4;\n\t"
01605 "addss %%xmm6, %%xmm4;\n\t"
01606 "movss %%xmm4, 4(%%rdi);\n\t"
01607 "add $16, %%rsi;\n\t"
01608 "add $8, %%rdi;\n\t"
01609 "dec %%rcx;\n\t"
01610 "jnz .HG0;\n\t"
01611 ".HG6:;\n\t"
01612 :
01613 :"S"(sptr),"D"(dptr),"a"(rax),"b"(coeffs),"c"(h),"d"(rdx)
01614 :"memory"
01615 );
01616 }
01617
01618 }
01619
01620
01621
01622
01623
/*
 * Placeholder for the vertical 5-tap low-pass filter in this build
 * configuration.
 *
 * The function has no statements: it reads nothing from src and leaves
 * dest untouched, whatever h and w are.
 *
 * NOTE(review): presumably meant to be the vertical counterpart of
 * sse_lowPass5x -- confirm whether any caller relies on dest being filled.
 */
void sse_lowPass5y(const float *src, float *dest, const int h,
                   const int w)
{
  /* Intentionally empty; the casts only mark the parameters as unused. */
  (void)src;
  (void)dest;
  (void)h;
  (void)w;
}
02003
02004
02005
02006
/*
 * Colour-space conversion with x86-64 MMX/SSE inline assembly (going by the
 * name: packed YUV 4:1:1 to packed RGB).
 *
 *   src    : source bytes, consumed in groups of 6
 *   dest   : destination bytes, 12 written per source group (4 pixels x 3)
 *   nbpix2 : size of the job; nbpix2/6 groups are processed
 *
 * Per group, with source bytes b0..b5: b0 and b3 have 128 subtracted and
 * are multiplied by the first two rows of coeffs; b1, b2, b4 and b5 are
 * each broadcast and added to both products, giving one 3-byte pixel each.
 * Results are clamped to 0..255.
 *
 * NOTE(review): b0/b3 look like the U/V samples and b1,b2,b4,b5 like the
 * four Y samples -- confirm against the capture format.
 * NOTE(review): "mov (%rsi), %rax" loads 8 bytes although only 6 belong to
 * the group, so the last group reads 2 bytes beyond it.
 * NOTE(review): the asm uses push/pop, changes rsi/rdi/rcx although they
 * are input-only operands, and does not list the mm/xmm registers as
 * clobbers; subps/mulps with a memory operand also need coeffs to be
 * 16-byte aligned -- verify these against the GCC and ABI rules.
 */
02007 void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest,
02008 const int nbpix2)
02009 {
02010 int rcx=nbpix2/6;
02011 /* three rows of four floats, addressed at byte offsets 0, 16 and 32 */
02012 const float coeffs[] = {
02013 0.0F, -0.198242F, 1.014648F, 0.0F,
02014 0.700195F, -0.29052F, 0.0F, 0.0F,
02015 128.0F, 128.0F, 128.0F, 128.0F
02016 };
02017
02018 asm (
02019 ".JA0:;\n\t"
02020 "or %%rcx, %%rcx;\n\t"
02021 "jz .JA1;\n\t"
02022 "pxor %%mm7, %%mm7;\n\t"
02023 "xor %%rax, %%rax;\n\t"
02024 "xor %%rbx, %%rbx;\n\t"
02025 "mov (%%rsi), %%rax;\n\t"
02026 "movw 4(%%rsi), %%bx;\n\t"
02027 "movd %%rax, %%mm0;\n\t"
02028 "movd %%rax, %%mm1;\n\t"
02029 "movd %%rbx, %%mm2;\n\t"
02030 "psrlq $16, %%mm1;\n\t"
02031 "punpcklbw %%mm7, %%mm0;\n\t"
02032 "punpcklbw %%mm7, %%mm1;\n\t"
02033 "punpcklbw %%mm7, %%mm2;\n\t"
02034 "punpcklwd %%mm7, %%mm0;\n\t"
02035 "punpcklwd %%mm7, %%mm1;\n\t"
02036 "punpcklwd %%mm7, %%mm2;\n\t"
02037 /* mm0 = (b0,b1), mm1 = (b2,b3), mm2 = (b4,b5) as 32-bit ints; to float */
02038 "cvtpi2ps %%mm0, %%xmm0;\n\t"
02039 "cvtpi2ps %%mm1, %%xmm1;\n\t"
02040 "cvtpi2ps %%mm2, %%xmm2;\n\t"
02041
02042 /* xmm3 will hold b1 in every lane (after the shufps below) */
02043 "movaps %%xmm0, %%xmm3;\n\t"
02044
02045 /* xmm4 will hold b2 in every lane */
02046 "movaps %%xmm1, %%xmm4;\n\t"
02047
02048 /* xmm5 will hold b4 in every lane */
02049 "movaps %%xmm2, %%xmm5;\n\t"
02050
02051 /* xmm6 will hold b5 in every lane */
02052 "movaps %%xmm2, %%xmm6;\n\t"
02053
02054 "shufps $0x55, %%xmm3, %%xmm3;\n\t"
02055 "shufps $00, %%xmm4, %%xmm4;\n\t"
02056 "shufps $0x00, %%xmm5, %%xmm5;\n\t"
02057 "shufps $0x55, %%xmm6, %%xmm6;\n\t"
02058
02059 /* xmm0 = b0 in every lane */
02060 "shufps $0, %%xmm0, %%xmm0;\n\t"
02061 /* xmm1 = b3 in every lane */
02062 "shufps $0x55, %%xmm1, %%xmm1;\n\t"
02063 /* subtract 128 (third coeffs row) from b0 and b3 */
02064 "subps 32(%%rdx), %%xmm0;\n\t"
02065 "subps 32(%%rdx), %%xmm1;\n\t"
02066 /* scale by the first and second coeffs rows */
02067 "mulps (%%rdx), %%xmm0;\n\t"
02068 "mulps 16(%%rdx),%%xmm1;\n\t"
02069 /* add both scaled terms to each of the four broadcast values */
02070 "addps %%xmm0, %%xmm3;\n\t"
02071 "addps %%xmm0, %%xmm4;\n\t"
02072 "addps %%xmm0, %%xmm5;\n\t"
02073 "addps %%xmm0, %%xmm6;\n\t"
02074
02075 "addps %%xmm1, %%xmm3;\n\t"
02076 "addps %%xmm1, %%xmm4;\n\t"
02077 "addps %%xmm1, %%xmm5;\n\t"
02078 "addps %%xmm1, %%xmm6;\n\t"
02079 /* per pixel: four floats -> four 32-bit ints -> four signed 16-bit words */
02080 "cvtps2pi %%xmm3, %%mm0;\n\t"
02081 "movhlps %%xmm3, %%xmm3;\n\t"
02082 "cvtps2pi %%xmm3, %%mm1;\n\t"
02083 "packssdw %%mm1, %%mm0;\n\t"
02084
02085 "cvtps2pi %%xmm4, %%mm2;\n\t"
02086 "movhlps %%xmm4, %%xmm4;\n\t"
02087 "cvtps2pi %%xmm4, %%mm3;\n\t"
02088 "packssdw %%mm3, %%mm2;\n\t"
02089
02090 "cvtps2pi %%xmm5, %%mm4;\n\t"
02091 "movhlps %%xmm5, %%xmm5;\n\t"
02092 "cvtps2pi %%xmm5, %%mm5;\n\t"
02093 "packssdw %%mm5, %%mm4;\n\t"
02094
02095 "cvtps2pi %%xmm6, %%mm6;\n\t"
02096 "movhlps %%xmm6, %%xmm6;\n\t"
02097 "cvtps2pi %%xmm6, %%mm7;\n\t"
02098 "packssdw %%mm7, %%mm6;\n\t"
02099 /* zero every negative word: mask = (0 > x), result = ~mask & x */
02100 "pxor %%mm1, %%mm1;\n\t"
02101 "pcmpgtw %%mm0, %%mm1;\n\t"
02102 "pandn %%mm0, %%mm1;\n\t"
02103
02104 "pxor %%mm3, %%mm3;\n\t"
02105 "pcmpgtw %%mm2, %%mm3;\n\t"
02106 "pandn %%mm2, %%mm3;\n\t"
02107
02108 "pxor %%mm5, %%mm5;\n\t"
02109 "pcmpgtw %%mm4, %%mm5;\n\t"
02110 "pandn %%mm4, %%mm5;\n\t"
02111
02112 "pxor %%mm7, %%mm7;\n\t"
02113 "pcmpgtw %%mm6, %%mm7;\n\t"
02114 "pandn %%mm6, %%mm7;\n\t"
02115 /* saturate the words to unsigned bytes (upper bound 255) */
02116 "packuswb %%mm1, %%mm1;\n\t"
02117 "packuswb %%mm3, %%mm3;\n\t"
02118 "packuswb %%mm5, %%mm5;\n\t"
02119 "packuswb %%mm7, %%mm7;\n\t"
02120 /* rcx and rdx are borrowed as scratch for the stores, then restored */
02121 "push %%rcx;\n\t"
02122 "push %%rdx;\n\t"
02123 "movd %%mm1, %%rax;\n\t"
02124 "movd %%mm3, %%rbx;\n\t"
02125 "movd %%mm5, %%rcx;\n\t"
02126 "movd %%mm7, %%rdx;\n\t"
02127 "movw %%ax, (%%rdi);\n\t"
02128 "movw %%bx,3(%%rdi);\n\t"
02129 "movw %%cx,6(%%rdi);\n\t"
02130 "movw %%dx,9(%%rdi);\n\t"
02131 "shr $8, %%rax;\n\t"
02132 "shr $8, %%rbx;\n\t"
02133 "shr $8, %%rcx;\n\t"
02134 "shr $8, %%rdx;\n\t"
02135 "movb %%ah, 2(%%rdi);\n\t"
02136 "movb %%bh, 5(%%rdi);\n\t"
02137 "movb %%ch, 8(%%rdi);\n\t"
02138 "movb %%dh,11(%%rdi);\n\t"
02139 "pop %%rdx;\n\t"
02140 "pop %%rcx;\n\t"
02141 /* next group: 12 bytes written, 6 bytes read */
02142 "add $12,%%rdi;\n\t"
02143 "dec %%rcx;\n\t"
02144 "add $6, %%rsi;\n\t"
02145 "jmp .JA0;\n\t"
02146 ".JA1:;\n\t"
02147 "emms;\n\t"
02148 :
02149 :"S"(src),"D"(dest),"c"(rcx),"d"(coeffs)
02150 :"rax","rbx","memory"
02151 );
02152
02153 }
02154
02155
02156
02157
02158 void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w)
02159 {
02160
02161 for (int j = 0; j < h; j ++)
02162 {
02163
02164 *dptr++ = sptr[0] * (70.0F / 163.0F) +
02165 sptr[1] * (56.0F / 163.0F) +
02166 sptr[2] * (28.0F / 163.0F) +
02167 sptr[3] * ( 8.0F / 163.0F) +
02168 sptr[4] * ( 1.0F / 163.0F);
02169 *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) +
02170 sptr[1] * (70.0F / 219.0F) +
02171 sptr[3] * (28.0F / 219.0F) +
02172 sptr[4] * ( 8.0F / 219.0F) +
02173 sptr[5] * ( 1.0F / 219.0F);
02174 *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) +
02175 (sptr[1] + sptr[3]) * (56.0F / 247.0F) +
02176 sptr[2] * (70.0F / 247.0F) +
02177 sptr[5] * ( 8.0F / 247.0F) +
02178 sptr[6] * ( 1.0F / 247.0F);
02179 *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) +
02180 (sptr[1] + sptr[5]) * (28.0F / 255.0F) +
02181 (sptr[2] + sptr[4]) * (56.0F / 255.0F) +
02182 sptr[3] * (70.0F / 255.0F) +
02183 sptr[7] * ( 1.0F / 255.0F);
02184
02185
02186 for (int i = 0; i < w - 8; i ++)
02187 {
02188 *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) +
02189 (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) +
02190 (sptr[2] + sptr[6]) * (28.0F / 256.0F) +
02191 (sptr[3] + sptr[5]) * (56.0F / 256.0F) +
02192 sptr[4] * (70.0F / 256.0F);
02193 sptr ++;
02194 }
02195
02196
02197 *dptr++ = sptr[0] * ( 1.0F / 255.0F) +
02198 (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) +
02199 (sptr[2] + sptr[6]) * (28.0F / 255.0F) +
02200 (sptr[3] + sptr[5]) * (56.0F / 255.0F) +
02201 sptr[4] * (70.0F / 255.0F);
02202 sptr ++;
02203 *dptr++ = sptr[0] * ( 1.0F / 247.0F) +
02204 sptr[1] * ( 8.0F / 247.0F) +
02205 (sptr[2] + sptr[6]) * (28.0F / 247.0F) +
02206 (sptr[3] + sptr[5]) * (56.0F / 247.0F) +
02207 sptr[4] * (70.0F / 247.0F);
02208 sptr ++;
02209 *dptr++ = sptr[0] * ( 1.0F / 219.0F) +
02210 sptr[1] * ( 8.0F / 219.0F) +
02211 sptr[2] * (28.0F / 219.0F) +
02212 (sptr[3] + sptr[5]) * (56.0F / 219.0F) +
02213 sptr[4] * (70.0F / 219.0F);
02214 sptr ++;
02215 *dptr++ = sptr[0] * ( 1.0F / 163.0F) +
02216 sptr[1] * ( 8.0F / 163.0F) +
02217 sptr[2] * (28.0F / 163.0F) +
02218 sptr[3] * (56.0F / 163.0F) +
02219 sptr[4] * (70.0F / 163.0F);
02220 sptr += 5;
02221 }
02222 }
02223 #endif
02224
02225
02226
02227
02228
02229
02230
02231 #endif
02232
02233 #ifndef INVT_CPU_OPTERON
02234
02235 #ifdef INVT_USE_SSE
02236
02237
/*
 * Element-wise absolute difference of two double arrays.
 *
 *   a, b : input arrays of sz doubles
 *   diff : output array, diff[i] = |a[i] - b[i]|
 *   sz   : number of elements; nothing is written when sz <= 0
 *
 * Written in portable C.  The previous inline assembly kept its loop
 * counters in function-level statics, so they were computed from the sz of
 * the first call only and reused for every later call; it also addressed
 * memory through 32-bit registers.
 */
void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const double d = a[i] - b[i];
      diff[i] = (d < 0.0) ? -d : d;
    }
}
02290 #endif
02291
02292 #ifdef INVT_USE_MMXSSE2
02293
02294
/*
 * Element-wise absolute difference of two float arrays.
 *
 *   a, b : input arrays of sz floats
 *   diff : output array, diff[i] = |a[i] - b[i]|
 *   sz   : number of elements; nothing is written when sz <= 0
 *
 * Written in portable C.  The previous inline assembly kept its loop
 * counters in function-level statics, so they were computed from the sz of
 * the first call only and reused for every later call; it also addressed
 * memory through 32-bit registers.
 */
void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const float d = a[i] - b[i];
      diff[i] = (d < 0.0F) ? -d : d;
    }
}
02348
02349
02350
02351
02352
/*
 * Element-wise absolute difference of two signed 32-bit integer arrays.
 *
 *   a, b : input arrays of sz integers
 *   diff : output array, diff[i] = |a[i] - b[i]|
 *   sz   : number of elements; nothing is written when sz <= 0
 *
 * Written in portable C.  The previous inline assembly processed the data
 * with 16-bit saturating instructions (psubusw / pmaxsw), compared the
 * leftover elements as unsigned values, and kept its loop counters in
 * function-level statics that were only initialised on the first call.
 */
void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const int32 x = a[i];
      const int32 y = b[i];

      /* Subtract in unsigned arithmetic so that a difference larger than
         INT_MAX wraps instead of overflowing a signed int. */
      const unsigned int gap = (x > y)
        ? (unsigned int)x - (unsigned int)y
        : (unsigned int)y - (unsigned int)x;

      diff[i] = (int32)gap;
    }
}
02406
02407
02408
02409
/*
 * Element-wise absolute difference of two unsigned byte arrays.
 *
 *   a, b : input arrays of sz bytes
 *   diff : output array, diff[i] = |a[i] - b[i]|
 *   sz   : number of elements; nothing is written when sz <= 0
 *
 * Written in portable C.  The previous inline assembly kept its loop
 * counters in function-level statics that were only initialised on the
 * first call, used eax without declaring it, and addressed memory through
 * 32-bit registers.
 */
void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      diff[i] = (x > y) ? (byte)(x - y) : (byte)(y - x);
    }
}
02463 #endif
02464
02465 #ifdef INVT_USE_SSE
02466
02467
/*
 * Sum of a double array.
 *
 *   a   : input array of sz doubles
 *   sum : receives the total; set to 0.0 when sz <= 0
 *   sz  : number of elements
 *
 * Written in portable C.  The previous inline assembly loaded two doubles
 * for every leftover element (reading past the end of the array and adding
 * the extra values), and kept its loop counters in function-level statics
 * that were only initialised on the first call.
 */
void sse_sum(const double *a, double *sum, const int32 sz)
{
  double total = 0.0;

  for (int32 i = 0; i < sz; ++i)
    total += a[i];

  *sum = total;
}
02515 #endif
02516
02517 #ifdef INVT_USE_MMXSSE2
02518
02519
/*
 * Sum of a float array, accumulated in double precision.
 *
 *   a   : input array of sz floats
 *   sum : receives the total; set to 0.0 when sz <= 0
 *   sz  : number of elements
 *
 * Written in portable C.  The previous inline assembly converted two floats
 * for every leftover element (reading past the end of the array and adding
 * the extra values), and kept its loop counters in function-level statics
 * that were only initialised on the first call.
 */
void sse2_sum(const float *a, double *sum, const int32 sz)
{
  double total = 0.0;

  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];

  *sum = total;
}
02567
02568
02569
02570
/*
 * Sum of a signed 32-bit integer array, accumulated in double precision.
 *
 *   a   : input array of sz integers
 *   sum : receives the total; set to 0.0 when sz <= 0
 *   sz  : number of elements
 *
 * Written in portable C.  The previous inline assembly entered its
 * unrolled loop even when fewer than eight elements were given (the test
 * for a zero block count was never branched on), converted two integers
 * for every leftover element, and kept its loop counters in function-level
 * statics that were only initialised on the first call.
 */
void sse2_sum(const int32 *a, double *sum, const int32 sz)
{
  double total = 0.0;

  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];

  *sum = total;
}
02617
02618
02619
02620
/*
 * Sum of an unsigned byte array.
 *
 *   a   : input array of sz bytes
 *   sum : receives the total as a double; set to 0.0 when sz <= 0
 *   sz  : number of elements
 *
 * Written in portable C.  The previous inline assembly cleared its
 * accumulator only when at least 32 elements were given, kept its loop
 * counters in function-level statics that were only initialised on the
 * first call, and used 32-bit-only push/pop instructions.
 */
void sse2_sum(const byte *a, double *sum, const int32 sz)
{
  /* A 64-bit integer accumulator cannot overflow for any int-sized count. */
  unsigned long long total = 0;

  for (int32 i = 0; i < sz; ++i)
    total += a[i];

  *sum = (double)total;
}
02679 #endif
02680
02681 #ifdef INVT_USE_SSE
02682
02683
/*
 * Element-wise clamped difference of two unsigned byte arrays.
 *
 *   a, b   : input arrays of sz bytes
 *   result : output array, result[i] = a[i] - b[i] when a[i] > b[i], else 0
 *   sz     : number of elements; nothing is written when sz <= 0
 *
 * Written in portable C.  The previous inline assembly handled the
 * elements left after the 64-byte blocks incorrectly: the remainder mask
 * was 0x7f instead of 0x3f, and the scalar loop compared and subtracted
 * the low byte of the result pointer instead of the byte loaded from b.
 * It also wrote memory without a "memory" clobber.
 */
void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      result[i] = (x > y) ? (byte)(x - y) : (byte)0;
    }
}
02736
02737
02738
02739
02740 void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result,
02741 const int32 sz)
02742 {
02743 int32 ecx=sz>>5;
02744 int32 edx=sz&0x1f;
02745
02746 asm (
02747 "orl %%ecx, %%ecx;\n\t"
02748 "jz .DB0;\n\t"
02749 ".DB1:;\n\t"
02750 "movups 0(%%esi), %%xmm0;\n\t"
02751 "movups 0(%%edi), %%xmm1;\n\t"
02752 "movups 16(%%esi), %%xmm2;\n\t"
02753 "movups 16(%%edi), %%xmm3;\n\t"
02754 "movups %%xmm1, %%xmm6;\n\t"
02755 "movups %%xmm3, %%xmm7;\n\t"
02756 "cmpps $1, %%xmm0, %%xmm6;\n\t"
02757 "cmpps $1, %%xmm2, %%xmm7;\n\t"
02758 "subps %%xmm1, %%xmm0;\n\t"
02759 "subps %%xmm3, %%xmm2;\n\t"
02760 "andps %%xmm6, %%xmm0;\n\t"
02761 "andps %%xmm7, %%xmm2;\n\t"
02762 "movups %%xmm0, (%%ebx);\n\t"
02763 "movups %%xmm2, 16(%%ebx);\n\t"
02764 "addl $32, %%esi;\n\t"
02765 "addl $32, %%edi;\n\t"
02766 "addl $32, %%ebx;\n\t"
02767 "loop .DB1;\n\t"
02768 ".DB0:;\n\t"
02769 "movl %%edx, %%ecx;\n\t"
02770 "orl %%ecx, %%ecx;\n\t"
02771 "jz .DB2;\n\t"
02772 ".DB3:;\n\t"
02773 "movss (%%esi), %%xmm0;\n\t"
02774 "movss (%%edi), %%xmm1;\n\t"
02775 "movss %%xmm1, %%xmm2;\n\t"
02776 "cmpss $1, %%xmm0, %%xmm2;\n\t"
02777 "andps %%xmm2, %%xmm0;\n\t"
02778 "andps %%xmm2, %%xmm1;\n\t"
02779 "subss %%xmm1, %%xmm0;\n\t"
02780 "movss %%xmm0, (%%ebx);\n\t"
02781 "addl $4, %%esi;\n\t"
02782 "addl $4, %%edi;\n\t"
02783 "addl $4, %%ebx;\n\t"
02784 "loop .DB3;\n\t"
02785 ".DB2:;\n\t"
02786 :
02787 :"S"(a), "D"(b), "b"(result), "c"(ecx), "d"(edx)
02788 :"memory"
02789 );
02790 }
02791
02792
02793
02794
02795 void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz)
02796 {
02797 int32 ecx=sz>>3;
02798 int32 edx=sz&0x7;
02799 asm (
02800 "orl %%ecx, %%ecx;\n\t"
02801 "jz .DC0;\n\t"
02802 ".DC1:;\n\t"
02803 "movdqu 0(%%esi), %%xmm0;\n\t"
02804 "movdqu 0(%%edi), %%xmm1;\n\t"
02805 "movdqu 16(%%esi), %%xmm3;\n\t"
02806 "movdqu 16(%%edi), %%xmm4;\n\t"
02807 "movdqu %%xmm0, %%xmm2;\n\t"
02808 "movdqu %%xmm3, %%xmm5;\n\t"
02809 "pcmpgtd %%xmm1, %%xmm2;\n\t"
02810 "pcmpgtd %%xmm4, %%xmm5;\n\t"
02811 "psubd %%xmm1, %%xmm0;\n\t"
02812 "psubd %%xmm4, %%xmm3;\n\t"
02813 "pand %%xmm2, %%xmm0;\n\t"
02814 "pand %%xmm5, %%xmm3;\n\t"
02815 "movdqu %%xmm0, (%%ebx);\n\t"
02816 "movdqu %%xmm3, 16(%%ebx);\n\t"
02817 "addl $32, %%esi;\n\t"
02818 "addl $32, %%edi;\n\t"
02819 "addl $32, %%ebx;\n\t"
02820 "loop .DC1;\n\t"
02821 ".DC0:;\n\t"
02822 "movl %%edx, %%ecx;\n\t"
02823 "orl %%ecx, %%ecx;\n\t"
02824 "jz .DC2;\n\t"
02825 ".DC3:;\n\t"
02826 "movd 0(%%esi), %%xmm0;\n\t"
02827 "movd 0(%%edi), %%xmm1;\n\t"
02828 "movdqu %%xmm0, %%xmm2;\n\t"
02829 "pcmpgtd %%xmm1, %%xmm2;\n\t"
02830 "psubd %%xmm1, %%xmm0;\n\t"
02831 "pand %%xmm2, %%xmm0;\n\t"
02832 "movd %%xmm0, (%%ebx);\n\t"
02833 "addl $4, %%esi;\n\t"
02834 "addl $4, %%edi;\n\t"
02835 "addl $4, %%ebx;\n\t"
02836 "loop .DC3;\n\t"
02837 ".DC2:;\n\t"
02838 :
02839 :"S"(a), "D"(b), "c"(ecx), "d"(edx), "b"(c)
02840 :"memory"
02841 );
02842 }
02843
02844
02845
02846
02847 void sse_binaryReverse(const byte *a, byte *result, const byte val, const
02848 int32 sz)
02849 {
02850 static unsigned int ecx=(sz>>7);
02851 static unsigned int edx=sz&0x7f;
02852
02853 byte pVal[16];
02854
02855 memset(result, val, 16);
02856
02857 asm (
02858 "orl %%ecx, %%ecx;\n\t"
02859 "jz .FA0;\n\t"
02860 ".FA1:;\n\t"
02861 "movdqu 0(%%ebx), %%xmm0;\n\t"
02862 "movdqu 0(%%ebx), %%xmm1;\n\t"
02863 "movdqu %%xmm0, %%xmm2;\n\t"
02864 "movdqu %%xmm1, %%xmm3;\n\t"
02865 "movdqu %%xmm0, %%xmm4;\n\t"
02866 "movdqu %%xmm1, %%xmm5;\n\t"
02867 "movdqu %%xmm0, %%xmm6;\n\t"
02868 "movdqu %%xmm1, %%xmm7;\n\t"
02869 "psubb (%%esi), %%xmm0;\n\t"
02870 "psubb 16(%%esi), %%xmm1;\n\t"
02871 "psubb 32(%%esi), %%xmm2;\n\t"
02872 "psubb 48(%%esi), %%xmm3;\n\t"
02873 "psubb 64(%%esi), %%xmm4;\n\t"
02874 "psubb 80(%%esi), %%xmm5;\n\t"
02875 "psubb 96(%%esi), %%xmm6;\n\t"
02876 "psubb 112(%%esi), %%xmm7;\n\t"
02877 "movdqu %%xmm0, (%%edi);\n\t"
02878 "movdqu %%xmm1, 16(%%edi);\n\t"
02879 "movdqu %%xmm2, 32(%%edi);\n\t"
02880 "movdqu %%xmm3, 48(%%edi);\n\t"
02881 "movdqu %%xmm4, 64(%%edi);\n\t"
02882 "movdqu %%xmm5, 80(%%edi);\n\t"
02883 "movdqu %%xmm6, 96(%%edi);\n\t"
02884 "movdqu %%xmm7, 112(%%edi);\n\t"
02885 "addl $128, %%edi;\n\t"
02886 "addl $128, %%esi;\n\t"
02887 "loop .FA1;\n\t"
02888 ".FA0:;\n\t"
02889 "movl %%edx, %%ecx;\n\t"
02890 "orl %%ecx, %%ecx;\n\t"
02891 "jz .FA2;\n\t"
02892 "movb (%%ebx), %%dl;\n\t"
02893 ".FA3:;\n\t"
02894 "movb %%dl, %%dh;\n\t"
02895 "movb (%%esi), %%al;\n\t"
02896 "subb %%al, %%dh;\n\t"
02897 "movb %%dh, (%%edi);\n\t"
02898 "incl %%esi;\n\t"
02899 "incl %%edi;\n\t"
02900 "loop .FA3;\n\t"
02901 ".FA2:;\n\t"
02902 :
02903 :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx)
02904 :"memory","eax"
02905 );
02906 }
02907
02908
02909
02910
/**
 * Float "reverse": result[i] = val - a[i] for i in [0, sz).
 *
 * Sixteen floats per iteration are handled with SSE subps against a
 * register holding val in every lane; the remaining sz % 16 elements
 * are handled in C. Source data is loaded with movups, so a and result
 * need no particular alignment.
 *
 * @param a       input array of sz floats
 * @param result  output array of sz floats
 * @param val     value each input element is subtracted from
 * @param sz      element count; nothing is written when sz <= 0
 */
void sse_binaryReverse(const float *a, float *result, const float val,
                       const int sz)
{
  if (sz <= 0)
    return;

  const float *src = a;
  float *dst = result;
  /* Plain locals: the counters must be recomputed on every call. */
  int blocks = sz >> 4;
  const int rest = sz & 0xf;

  if (blocks > 0)
    {
      const float pVal[4] = { val, val, val, val };

      __asm__ __volatile__ (
        "movups  %[val], %%xmm7\n\t"
        "1:\n\t"
        "movups  (%[src]), %%xmm4\n\t"
        "movups  16(%[src]), %%xmm5\n\t"
        "movaps  %%xmm7, %%xmm0\n\t"
        "movaps  %%xmm7, %%xmm1\n\t"
        "subps   %%xmm4, %%xmm0\n\t"     /* floating-point val - a */
        "subps   %%xmm5, %%xmm1\n\t"
        "movups  32(%[src]), %%xmm4\n\t"
        "movups  48(%[src]), %%xmm5\n\t"
        "movaps  %%xmm7, %%xmm2\n\t"
        "movaps  %%xmm7, %%xmm3\n\t"
        "subps   %%xmm4, %%xmm2\n\t"
        "subps   %%xmm5, %%xmm3\n\t"
        "movups  %%xmm0, (%[dst])\n\t"
        "movups  %%xmm1, 16(%[dst])\n\t"
        "movups  %%xmm2, 32(%[dst])\n\t"
        "movups  %%xmm3, 48(%[dst])\n\t"
        "add     $64, %[src]\n\t"
        "add     $64, %[dst]\n\t"
        "dec     %[blk]\n\t"
        "jnz     1b\n\t"
        : [src] "+r" (src), [dst] "+r" (dst), [blk] "+r" (blocks)
        : [val] "m" (pVal)
        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3",
          "xmm4", "xmm5", "xmm7"
      );
    }

  /* One element (4 bytes) per step; never writes past result[sz-1]. */
  for (int i = 0; i < rest; ++i)
    dst[i] = val - src[i];
}
02971
02972
02973
02974
02975
02976 void sse_binaryReverse(const int32 *a, int32 *result, const int32 val,
02977 const int32 sz)
02978 {
02979 int32 ecx=sz>>5;
02980 int32 edx=sz&31;
02981 int32 pVal[16];
02982 int i;
02983
02984 for(i=0;i<16;++i) pVal[i] = val;
02985
02986 asm (
02987 "orl %%ecx, %%ecx;\n\t"
02988 "jz .FC4;\n\t"
02989 ".FC2:;\n\t"
02990 "movdqu (%%ebx), %%xmm0;\n\t"
02991 "movdqu (%%ebx), %%xmm1;\n\t"
02992 "movdqu %%xmm0, %%xmm2;\n\t"
02993 "movdqu %%xmm1, %%xmm3;\n\t"
02994 "movdqu %%xmm0, %%xmm4;\n\t"
02995 "movdqu %%xmm1, %%xmm5;\n\t"
02996 "movdqu %%xmm0, %%xmm6;\n\t"
02997 "movdqu %%xmm1, %%xmm7;\n\t"
02998 "psubd (%%esi), %%xmm0;\n\t"
02999 "psubd 16(%%esi), %%xmm1;\n\t"
03000 "psubd 32(%%esi), %%xmm2;\n\t"
03001 "psubd 48(%%esi), %%xmm3;\n\t"
03002 "psubd 64(%%esi), %%xmm4;\n\t"
03003 "psubd 80(%%esi), %%xmm5;\n\t"
03004 "psubd 96(%%esi), %%xmm6;\n\t"
03005 "psubd 112(%%esi), %%xmm7;\n\t"
03006 "movdqu %%xmm0, 0(%%edi);\n\t"
03007 "movdqu %%xmm1, 16(%%edi);\n\t"
03008 "movdqu %%xmm2, 32(%%edi);\n\t"
03009 "movdqu %%xmm3, 48(%%edi);\n\t"
03010 "movdqu %%xmm4, 64(%%edi);\n\t"
03011 "movdqu %%xmm5, 80(%%edi);\n\t"
03012 "movdqu %%xmm6, 96(%%edi);\n\t"
03013 "movdqu %%xmm7,112(%%edi);\n\t"
03014 "addl $128, %%esi;\n\t"
03015 "addl $128, %%edi;\n\t"
03016 "loop .FC2;\n\t"
03017 ".FC4:;\n\t"
03018 "orl %%edx, %%edx;\n\t"
03019 "jz .FC1;\n\t"
03020 "movl %%edx, %%ecx;\n\t"
03021 ".FC3:;\n\t"
03022 "movdqu 0(%%ebx), %%xmm0;\n\t"
03023 "psubd (%%esi), %%xmm0;\n\t"
03024 "movups %%xmm0, (%%edi);\n\t"
03025 "addl $16, %%esi;\n\t"
03026 "addl $16, %%edi;\n\t"
03027 "loop .FC3;\n\t"
03028 ".FC1:;\n\t"
03029 :
03030 :"S"(a), "D"(result), "b"(pVal),"c"(ecx),"d"(edx)
03031 :"memory","eax"
03032 );
03033 }
03034
03035
03036
03037
03038
03039 void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz)
03040 {
03041 int32 ecx=sz>>4;
03042 int32 edx=sz&0xf;
03043
03044 asm(
03045 "orl %%ecx, %%ecx;\n\t"
03046 "jz .GA4;\n\t"
03047 "pxor %%xmm0, %%xmm0;\n\t"
03048 ".GA2:;\n\t"
03049 "movdqu 0(%%esi), %%xmm1;\n\t"
03050 "movdqa %%xmm1, %%xmm2;\n\t"
03051 "movdqa %%xmm1, %%xmm3;\n\t"
03052 "movdqa %%xmm1, %%xmm4;\n\t"
03053 "psrldq $4, %%xmm2;\n\t"
03054 "psrldq $8, %%xmm3;\n\t"
03055 "psrldq $12, %%xmm4;\n\t"
03056 "punpcklbw %%xmm0, %%xmm1;\n\t"
03057 "punpcklbw %%xmm0, %%xmm2;\n\t"
03058 "punpcklbw %%xmm0, %%xmm3;\n\t"
03059 "punpcklbw %%xmm0, %%xmm4;\n\t"
03060 "punpcklbw %%xmm0, %%xmm1;\n\t"
03061 "punpcklbw %%xmm0, %%xmm2;\n\t"
03062 "punpcklbw %%xmm0, %%xmm3;\n\t"
03063 "punpcklbw %%xmm0, %%xmm4;\n\t"
03064 "movdqu %%xmm1, (%%edi);\n\t"
03065 "movdqu %%xmm2, 16(%%edi);\n\t"
03066 "movdqu %%xmm3, 32(%%edi);\n\t"
03067 "movdqu %%xmm4, 48(%%edi);\n\t"
03068 "addl $16, %%esi;\n\t"
03069 "addl $64, %%edi;\n\t"
03070 "loop .GA2;\n\t"
03071 ".GA4:;\n\t"
03072 "orl %%edx, %%edx;\n\t"
03073 "jz .GA1;\n\t"
03074 "mov %%edx, %%ecx;\n\t"
03075 ".GA3:;\n\t"
03076 "xorl %%eax, %%eax;\n\t"
03077 "movb (%%esi), %%al;\n\t"
03078 "movl %%eax, (%%edi);\n\t"
03079 "incl %%esi;\n\t"
03080 "addl $4, %%edi;\n\t"
03081 "loop .GA3;\n\t"
03082 ".GA1:;"
03083 :
03084 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03085 :"memory"
03086 );
03087
03088
03089 }
03090
03091 #endif
03092
03093 #ifdef INVT_USE_MMXSSE2
03094
03095
03096
03097 void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
03098 {
03099 int32 ecx=sz>>4;
03100 int32 edx=sz&0xf;
03101
03102 asm(
03103 "orl %%ecx, %%ecx;\n\t"
03104 "jz .GB4;\n\t"
03105 ".GB2:;\n\t"
03106 "pxor %%xmm0, %%xmm0;\n\t"
03107 "movdqu 0(%%esi), %%xmm1;\n\t"
03108 "movdqu 4(%%esi), %%xmm2;\n\t"
03109 "movdqu 8(%%esi), %%xmm3;\n\t"
03110 "movdqu 12(%%esi), %%xmm4;\n\t"
03111 "punpcklbw %%xmm0, %%xmm1;\n\t"
03112 "punpcklbw %%xmm0, %%xmm2;\n\t"
03113 "punpcklbw %%xmm0, %%xmm3;\n\t"
03114 "punpcklbw %%xmm0, %%xmm4;\n\t"
03115 "punpcklbw %%xmm0, %%xmm1;\n\t"
03116 "punpcklbw %%xmm0, %%xmm2;\n\t"
03117 "punpcklbw %%xmm0, %%xmm3;\n\t"
03118 "punpcklbw %%xmm0, %%xmm4;\n\t"
03119 "cvtdq2ps %%xmm1, %%xmm1;\n\t"
03120 "cvtdq2ps %%xmm2, %%xmm2;\n\t"
03121 "movups %%xmm1, (%%edi);\n\t"
03122 "movups %%xmm2, 16(%%edi);\n\t"
03123 "cvtdq2ps %%xmm3, %%xmm3;\n\t"
03124 "cvtdq2ps %%xmm4, %%xmm4;\n\t"
03125 "movups %%xmm3, 32(%%edi);\n\t"
03126 "movups %%xmm4, 48(%%edi);\n\t"
03127 "addl $16, %%esi;\n\t"
03128 "addl $64, %%edi;\n\t"
03129 "loop .GB2;\n\t"
03130 ".GB4:;\n\t"
03131 "orl %%edx, %%edx;\n\t"
03132 "jz .GB1;\n\t"
03133 "movl %%edx, %%ecx;\n\t"
03134 ".GB3:;\n\t"
03135 "xorl %%eax, %%eax;\n\t"
03136 "movb (%%esi), %%al;\n\t"
03137 "movd %%eax, %%xmm0;\n\t"
03138 "cvtdq2ps %%xmm0, %%xmm1;\n\t"
03139 "movss %%xmm1, (%%edi);\n\t"
03140 "incl %%esi;\n\t"
03141 "addl $4, %%edi;\n\t"
03142 "loop .GB3;\n\t"
03143 ".GB1:;"
03144 :
03145 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03146 :"memory"
03147 );
03148 }
03149
03150
03151
03152
03153
03154 void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
03155 {
03156 int32 ecx=sz>>3;
03157 int32 edx=sz&0x7;
03158
03159 asm(
03160 "orl %%ecx, %%ecx;\n\t"
03161 "jz .GC4;\n\t"
03162 ".GC2:;\n\t"
03163 "pxor %%xmm0, %%xmm0;\n\t"
03164 "movdqu 0(%%esi), %%xmm1;\n\t"
03165 "movdqu 2(%%esi), %%xmm2;\n\t"
03166 "movdqu 4(%%esi), %%xmm3;\n\t"
03167 "movdqu 6(%%esi), %%xmm4;\n\t"
03168 "punpcklbw %%xmm0, %%xmm1;\n\t"
03169 "punpcklbw %%xmm0, %%xmm2;\n\t"
03170 "punpcklbw %%xmm0, %%xmm3;\n\t"
03171 "punpcklbw %%xmm0, %%xmm4;\n\t"
03172 "punpcklbw %%xmm0, %%xmm1;\n\t"
03173 "punpcklbw %%xmm0, %%xmm2;\n\t"
03174 "punpcklbw %%xmm0, %%xmm3;\n\t"
03175 "punpcklbw %%xmm0, %%xmm4;\n\t"
03176 "cvtdq2pd %%xmm1, %%xmm1;\n\t"
03177 "cvtdq2pd %%xmm2, %%xmm2;\n\t"
03178 "movupd %%xmm1, (%%edi);\n\t"
03179 "movupd %%xmm2, 16(%%edi);\n\t"
03180 "cvtdq2pd %%xmm3, %%xmm3;\n\t"
03181 "cvtdq2pd %%xmm4, %%xmm4;\n\t"
03182 "movupd %%xmm3, 32(%%edi);\n\t"
03183 "movupd %%xmm4, 48(%%edi);\n\t"
03184 "addl $8, %%esi;\n\t"
03185 "addl $64, %%edi;\n\t"
03186 "loop .GC2;\n\t"
03187 ".GC4:;\n\t"
03188 "orl %%edx, %%edx;\n\t"
03189 "jz .GC1;\n\t"
03190 "movl %%edx, %%ecx;\n\t"
03191 ".GC3:;\n\t"
03192 "xorl %%eax, %%eax;\n\t"
03193 "movb (%%esi), %%al;\n\t"
03194 "movd %%eax, %%xmm0;\n\t"
03195 "cvtdq2pd %%xmm0, %%xmm1;\n\t"
03196 "movsd %%xmm1, (%%edi);\n\t"
03197 "incl %%esi;\n\t"
03198 "addl $8, %%edi;\n\t"
03199 "loop .GC3;\n\t"
03200 ".GC1:;"
03201 :
03202 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03203 :"memory"
03204 );
03205
03206 }
03207
03208
03209
03210
03211
03212 void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
03213 {
03214 int32 ecx=sz>>5;
03215 int32 edx=sz&0x1f;
03216
03217 asm(
03218 "orl %%ecx, %%ecx;\n\t"
03219 "jz .GD4;\n\t"
03220 ".GD2:;\n\t"
03221 "movdqu 0(%%esi), %%xmm0;\n\t"
03222 "movdqu 16(%%esi), %%xmm1;\n\t"
03223 "movdqu 32(%%esi), %%xmm2;\n\t"
03224 "movdqu 48(%%esi), %%xmm3;\n\t"
03225 "movdqu 64(%%esi), %%xmm4;\n\t"
03226 "movdqu 80(%%esi), %%xmm5;\n\t"
03227 "movdqu 96(%%esi), %%xmm6;\n\t"
03228 "movdqu 112(%%esi), %%xmm7;\n\t"
03229 "cvtdq2ps %%xmm0, %%xmm0;\n\t"
03230 "cvtdq2ps %%xmm1, %%xmm1;\n\t"
03231 "cvtdq2ps %%xmm2, %%xmm2;\n\t"
03232 "cvtdq2ps %%xmm3, %%xmm3;\n\t"
03233 "cvtdq2ps %%xmm4, %%xmm4;\n\t"
03234 "cvtdq2ps %%xmm5, %%xmm5;\n\t"
03235 "cvtdq2ps %%xmm6, %%xmm6;\n\t"
03236 "cvtdq2ps %%xmm7, %%xmm7;\n\t"
03237 "movups %%xmm0, 0(%%edi);\n\t"
03238 "movups %%xmm1, 16(%%edi);\n\t"
03239 "movups %%xmm2, 32(%%edi);\n\t"
03240 "movups %%xmm3, 48(%%edi);\n\t"
03241 "movups %%xmm4, 64(%%edi);\n\t"
03242 "movups %%xmm5, 80(%%edi);\n\t"
03243 "movups %%xmm6, 96(%%edi);\n\t"
03244 "movups %%xmm7, 112(%%edi);\n\t"
03245 "addl $128, %%esi;\n\t"
03246 "addl $128, %%edi;\n\t"
03247 "decl %%ecx;\n\t"
03248 "jnz .GD2;\n\t"
03249 ".GD4:;\n\t"
03250 "orl %%edx, %%edx;\n\t"
03251 "jz .GD1;\n\t"
03252 "movl %%edx, %%ecx;\n\t"
03253 ".GD3:;\n\t"
03254 "movd (%%esi), %%xmm0;\n\t"
03255 "cvtdq2ps %%xmm0, %%xmm0;\n\t"
03256 "movss %%xmm0, (%%edi);\n\t"
03257 "addl $4, %%esi;\n\t"
03258 "addl $4, %%edi;\n\t"
03259 "loop .GD3;\n\t"
03260 ".GD1:;"
03261 :
03262 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03263 :"memory"
03264 );
03265
03266 }
03267
03268
03269
03270 void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
03271 {
03272 int32 ecx=sz>>4;
03273 int32 edx=sz&0xf;
03274
03275 asm(
03276 "orl %%ecx, %%ecx;\n\t"
03277 "jz .GE4;\n\t"
03278 ".GE2:;\n\t"
03279 "movdqu 0(%%esi), %%xmm0;\n\t"
03280 "movdqu 8(%%esi), %%xmm1;\n\t"
03281 "movdqu 16(%%esi), %%xmm2;\n\t"
03282 "movdqu 24(%%esi), %%xmm3;\n\t"
03283 "movdqu 32(%%esi), %%xmm4;\n\t"
03284 "movdqu 40(%%esi), %%xmm5;\n\t"
03285 "movdqu 48(%%esi), %%xmm6;\n\t"
03286 "movdqu 56(%%esi), %%xmm7;\n\t"
03287 "cvtdq2pd %%xmm0, %%xmm0;\n\t"
03288 "cvtdq2pd %%xmm1, %%xmm1;\n\t"
03289 "cvtdq2pd %%xmm2, %%xmm2;\n\t"
03290 "cvtdq2pd %%xmm3, %%xmm3;\n\t"
03291 "cvtdq2pd %%xmm4, %%xmm4;\n\t"
03292 "cvtdq2pd %%xmm5, %%xmm5;\n\t"
03293 "cvtdq2pd %%xmm6, %%xmm6;\n\t"
03294 "cvtdq2pd %%xmm7, %%xmm7;\n\t"
03295 "movups %%xmm0, 0(%%edi);\n\t"
03296 "movups %%xmm1, 16(%%edi);\n\t"
03297 "movups %%xmm2, 32(%%edi);\n\t"
03298 "movups %%xmm3, 48(%%edi);\n\t"
03299 "movups %%xmm4, 64(%%edi);\n\t"
03300 "movups %%xmm5, 80(%%edi);\n\t"
03301 "movups %%xmm6, 96(%%edi);\n\t"
03302 "movups %%xmm7, 112(%%edi);\n\t"
03303 "addl $64, %%esi;\n\t"
03304 "addl $128, %%edi;\n\t"
03305 "decl %%ecx;\n\t"
03306 "jnz .GE2;\n\t"
03307 ".GE4:;\n\t"
03308 "orl %%edx, %%edx;\n\t"
03309 "jz .GE1;\n\t"
03310 "movl %%edx, %%ecx;\n\t"
03311 ".GE3:;\n\t"
03312 "movd (%%esi), %%xmm0;\n\t"
03313 "cvtdq2pd %%xmm0, %%xmm0;\n\t"
03314 "movsd %%xmm0, (%%edi);\n\t"
03315 "addl $4, %%esi;\n\t"
03316 "addl $8, %%edi;\n\t"
03317 "loop .GE3;\n\t"
03318 ".GE1:;"
03319 :
03320 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03321 :"memory"
03322 );
03323
03324 }
03325
03326
03327 void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
03328 {
03329 int32 ecx=sz;
03330 int32 edx=sz;
03331
03332 asm (
03333 "orl %%ecx, %%ecx;\n\t"
03334 "jz .GF1;\n\t"
03335 ".GF2:;\n\t"
03336 "movdqu 0(%%esi), %%xmm0;\n\t"
03337 "movdqu 8(%%esi), %%xmm1;\n\t"
03338 "movdqu 16(%%esi), %%xmm2;\n\t"
03339 "movdqu 24(%%esi), %%xmm3;\n\t"
03340 "movdqu 32(%%esi), %%xmm4;\n\t"
03341 "movdqu 40(%%esi), %%xmm5;\n\t"
03342 "movdqu 48(%%esi), %%xmm6;\n\t"
03343 "movdqu 56(%%esi), %%xmm7;\n\t"
03344 "cvtps2dq %%xmm0, %%xmm0;\n\t"
03345 "cvtps2dq %%xmm1, %%xmm1;\n\t"
03346 "cvtps2dq %%xmm2, %%xmm2;\n\t"
03347 "cvtps2dq %%xmm3, %%xmm3;\n\t"
03348 "cvtps2dq %%xmm4, %%xmm4;\n\t"
03349 "cvtps2dq %%xmm5, %%xmm5;\n\t"
03350 "cvtps2dq %%xmm6, %%xmm6;\n\t"
03351 "cvtps2dq %%xmm7, %%xmm7;\n\t"
03352 "movdqu %%xmm0, 0(%%edi);\n\t"
03353 "movdqu %%xmm1, 16(%%edi);\n\t"
03354 "movdqu %%xmm2, 32(%%edi);\n\t"
03355 "movdqu %%xmm3, 48(%%edi);\n\t"
03356 "movdqu %%xmm4, 64(%%edi);\n\t"
03357 "movdqu %%xmm5, 80(%%edi);\n\t"
03358 "movdqu %%xmm6, 96(%%edi);\n\t"
03359 "movdqu %%xmm7, 112(%%edi);\n\t"
03360 "addl $64, %%esi;\n\t"
03361 "addl $128, %%edi;\n\t"
03362 "decl %%ecx;\n\t"
03363 "jnz .GF2;\n\t"
03364 ".GF4:;\n\t"
03365 "orl %%edx, %%edx;\n\t"
03366 "jz .GF1;\n\t"
03367 "movl %%edx, %%ecx;\n\t"
03368 ".GF3:;\n\t"
03369 "movd (%%esi), %%xmm0;\n\t"
03370 "cvtps2dq %%xmm0, %%xmm0;\n\t"
03371 "movd %%xmm0, (%%edi);\n\t"
03372 "addl $4, %%esi;\n\t"
03373 "addl $8, %%edi;\n\t"
03374 "loop .GF3;\n\t"
03375 ".GF1:;"
03376 :
03377 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03378 :"memory"
03379 );
03380
03381 }
03382
03383
03384
03385
03386 void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
03387 {
03388 int32 ecx=sz>>4;
03389 int32 edx=sz&0xf;
03390
03391 asm(
03392 "orl %%ecx, %%ecx;\n\t"
03393 "jz .GG4;\n\t"
03394 ".GG2:;\n\t"
03395 "movups 0(%%esi), %%xmm0;\n\t"
03396 "movups 8(%%esi), %%xmm1;\n\t"
03397 "movups 16(%%esi), %%xmm2;\n\t"
03398 "movups 24(%%esi), %%xmm3;\n\t"
03399 "movups 32(%%esi), %%xmm4;\n\t"
03400 "movups 40(%%esi), %%xmm5;\n\t"
03401 "movups 48(%%esi), %%xmm6;\n\t"
03402 "movups 56(%%esi), %%xmm7;\n\t"
03403 "cvtps2pd %%xmm0, %%xmm0;\n\t"
03404 "cvtps2pd %%xmm1, %%xmm1;\n\t"
03405 "cvtps2pd %%xmm2, %%xmm2;\n\t"
03406 "cvtps2pd %%xmm3, %%xmm3;\n\t"
03407 "cvtps2pd %%xmm4, %%xmm4;\n\t"
03408 "cvtps2pd %%xmm5, %%xmm5;\n\t"
03409 "cvtps2pd %%xmm6, %%xmm6;\n\t"
03410 "cvtps2pd %%xmm7, %%xmm7;\n\t"
03411 "movupd %%xmm0, 0(%%edi);\n\t"
03412 "movupd %%xmm1, 16(%%edi);\n\t"
03413 "movupd %%xmm2, 32(%%edi);\n\t"
03414 "movupd %%xmm3, 48(%%edi);\n\t"
03415 "movupd %%xmm4, 64(%%edi);\n\t"
03416 "movupd %%xmm5, 80(%%edi);\n\t"
03417 "movupd %%xmm6, 96(%%edi);\n\t"
03418 "movupd %%xmm7, 112(%%edi);\n\t"
03419 "addl $64, %%esi;\n\t"
03420 "addl $128, %%edi;\n\t"
03421 "decl %%ecx;\n\t"
03422 "jnz .GG2;\n\t"
03423 ".GG4:;\n\t"
03424 "orl %%edx, %%edx;\n\t"
03425 "jz .GG1;\n\t"
03426 "movl %%edx, %%ecx;\n\t"
03427 ".GG3:;\n\t"
03428 "movd (%%esi), %%xmm0;\n\t"
03429 "cvtps2pd %%xmm0, %%xmm0;\n\t"
03430 "movsd %%xmm0, (%%edi);\n\t"
03431 "addl $4, %%esi;\n\t"
03432 "addl $8, %%edi;\n\t"
03433 "loop .GG3;\n\t"
03434 ".GG1:;"
03435 :
03436 :"S"(a), "D"(b), "c"(ecx),"d"(edx)
03437 :"memory"
03438 );
03439 }
03440
03441 #endif
03442
03443 #ifdef INVT_USE_SSE
03444
03445
/**
 * Horizontal 3-tap low-pass filter of an h x w float image stored row
 * by row.
 *
 *   interior:  b[x]   = (a[x-1] + 2*a[x] + a[x+1]) / 4
 *   left:      b[0]   = (2*a[0] + a[1]) / 3
 *   right:     b[w-1] = (a[w-2] + 2*a[w-1]) / 3
 *
 * Each row is processed by one asm block: left border, groups of 12
 * interior pixels with packed SSE, leftover interior pixels one at a
 * time, then the right border.
 *
 * @param a  source image, h*w floats (no alignment requirement)
 * @param b  destination image, h*w floats; must not overlap a
 * @param h  number of rows
 * @param w  number of columns; rows narrower than 2 are copied unchanged
 */
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  /* 16-byte aligned because divps takes it as a memory operand. */
  static const float coeffs[8] __attribute__((aligned(16))) =
    { 3.0f, 1.0f, 1.0f, 1.0f, 4.0f, 4.0f, 4.0f, 4.0f };

  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      /* Nothing to smooth; the border formulas need two pixels. */
      memcpy(b, a, w * h * sizeof(b[0]));
      return;
    }

  const int blocks = (w - 2) / 12;   /* 12 interior pixels per packed step */
  const int rest = (w - 2) % 12;     /* interior pixels left for the scalar loop */
  const float *src = a;
  float *dst = b;

  for (int row = 0; row < h; ++row)
    {
      int blk = blocks;
      int rem = rest;

      __asm__ __volatile__ (
        /* left border; src stays on a[0], dst moves to b[1] */
        "movss   (%[src]), %%xmm1\n\t"
        "movss   4(%[src]), %%xmm2\n\t"
        "addss   %%xmm1, %%xmm1\n\t"
        "addss   %%xmm1, %%xmm2\n\t"
        "divss   (%[k]), %%xmm2\n\t"
        "movss   %%xmm2, (%[dst])\n\t"
        "add     $4, %[dst]\n\t"

        "test    %[blk], %[blk]\n\t"
        "jz      2f\n\t"
        "1:\n\t"
        "movups  (%[src]), %%xmm0\n\t"
        "movups  4(%[src]), %%xmm1\n\t"
        "movups  8(%[src]), %%xmm2\n\t"
        "movups  16(%[src]), %%xmm3\n\t"
        "movups  20(%[src]), %%xmm4\n\t"
        "movups  24(%[src]), %%xmm5\n\t"
        "movups  32(%[src]), %%xmm6\n\t"
        "movups  36(%[src]), %%xmm7\n\t"
        "addps   %%xmm1, %%xmm0\n\t"
        "addps   %%xmm4, %%xmm3\n\t"
        "addps   %%xmm1, %%xmm0\n\t"
        "addps   %%xmm4, %%xmm3\n\t"
        "movups  40(%[src]), %%xmm1\n\t"
        "addps   %%xmm7, %%xmm6\n\t"
        "addps   %%xmm2, %%xmm0\n\t"
        "addps   %%xmm1, %%xmm6\n\t"
        "addps   %%xmm5, %%xmm3\n\t"
        "addps   %%xmm7, %%xmm6\n\t"
        "divps   16(%[k]), %%xmm0\n\t"
        "divps   16(%[k]), %%xmm3\n\t"
        "divps   16(%[k]), %%xmm6\n\t"
        "movups  %%xmm0, (%[dst])\n\t"
        "movups  %%xmm3, 16(%[dst])\n\t"
        "movups  %%xmm6, 32(%[dst])\n\t"
        "add     $48, %[src]\n\t"
        "add     $48, %[dst]\n\t"
        "dec     %[blk]\n\t"
        "jnz     1b\n\t"
        "2:\n\t"

        "test    %[rem], %[rem]\n\t"
        "jz      4f\n\t"
        "3:\n\t"
        "movss   (%[src]), %%xmm0\n\t"
        "movss   4(%[src]), %%xmm1\n\t"
        "movss   8(%[src]), %%xmm2\n\t"
        "addss   %%xmm1, %%xmm0\n\t"
        "addss   %%xmm1, %%xmm2\n\t"
        "addss   %%xmm2, %%xmm0\n\t"
        "divss   16(%[k]), %%xmm0\n\t"
        "movss   %%xmm0, (%[dst])\n\t"
        "add     $4, %[src]\n\t"
        "add     $4, %[dst]\n\t"
        "dec     %[rem]\n\t"
        "jnz     3b\n\t"
        "4:\n\t"

        /* right border; src is on a[w-2] here */
        "movss   (%[src]), %%xmm1\n\t"
        "movss   4(%[src]), %%xmm2\n\t"
        "addss   %%xmm2, %%xmm2\n\t"
        "addss   %%xmm1, %%xmm2\n\t"
        "divss   (%[k]), %%xmm2\n\t"
        "movss   %%xmm2, (%[dst])\n\t"
        "add     $4, %[dst]\n\t"
        "add     $8, %[src]\n\t"            /* start of the next row */
        : [src] "+r" (src), [dst] "+r" (dst), [blk] "+r" (blk), [rem] "+r" (rem)
        : [k] "r" (coeffs)
        : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3",
          "xmm4", "xmm5", "xmm6", "xmm7"
      );
    }
}
03541
03542
03543
03544
03545
03546
/**
 * Vertical 3-tap low-pass filter of an h x w float image stored row by
 * row.
 *
 *   interior rows:  b[y] = (a[y-1] + 2*a[y] + a[y+1]) / 4
 *   first row:      b[0]   = (2*a[0] + a[1]) / 3
 *   last row:       b[h-1] = (a[h-2] + 2*a[h-1]) / 3
 *
 * Every output pixel is computed with scalar SSE instructions; rows are
 * addressed through a byte stride of w * sizeof(float).
 *
 * @param a  source image, h*w floats
 * @param b  destination image, h*w floats; must not overlap a
 * @param h  number of rows; images with fewer than 2 rows are copied
 * @param w  number of columns
 */
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  static const float three = 3.0f;
  static const float four = 4.0f;

  if (h <= 0 || w <= 0)
    return;

  if (h < 2)
    {
      memcpy(b, a, w * h * sizeof(b[0]));
      return;
    }

  /* Byte distance between vertically adjacent pixels. `long` is as wide
     as a pointer on the ILP32 and LP64 x86 ABIs this asm targets. */
  const long stride = (long) w * (long) sizeof(float);
  const float *src = a;
  float *dst = b;
  int n = w;

  /* First row: (2*row0 + row1) / 3. */
  __asm__ __volatile__ (
    "1:\n\t"
    "movss  (%[src]), %%xmm0\n\t"
    "movss  (%[src],%[stride]), %%xmm1\n\t"
    "addss  %%xmm0, %%xmm0\n\t"
    "addss  %%xmm1, %%xmm0\n\t"
    "divss  %[three], %%xmm0\n\t"
    "movss  %%xmm0, (%[dst])\n\t"
    "add    $4, %[src]\n\t"
    "add    $4, %[dst]\n\t"
    "dec    %[n]\n\t"
    "jnz    1b\n\t"
    : [src] "+r" (src), [dst] "+r" (dst), [n] "+r" (n)
    : [stride] "r" (stride), [three] "m" (three)
    : "memory", "cc", "xmm0", "xmm1"
  );

  /* Interior rows read the rows above, at and below the output row, so
     the source window restarts at row 0 and slides down one row per
     pass (the inner loop advances src by exactly w floats). */
  src = a;
  for (int row = 1; row < h - 1; ++row)
    {
      n = w;
      __asm__ __volatile__ (
        "1:\n\t"
        "movss  (%[src]), %%xmm0\n\t"
        "movss  (%[src],%[stride]), %%xmm1\n\t"
        "movss  (%[src],%[stride],2), %%xmm2\n\t"
        "addss  %%xmm1, %%xmm0\n\t"
        "addss  %%xmm1, %%xmm2\n\t"
        "addss  %%xmm2, %%xmm0\n\t"
        "divss  %[four], %%xmm0\n\t"
        "movss  %%xmm0, (%[dst])\n\t"
        "add    $4, %[src]\n\t"
        "add    $4, %[dst]\n\t"
        "dec    %[n]\n\t"
        "jnz    1b\n\t"
        : [src] "+r" (src), [dst] "+r" (dst), [n] "+r" (n)
        : [stride] "r" (stride), [four] "m" (four)
        : "memory", "cc", "xmm0", "xmm1", "xmm2"
      );
    }

  /* Last row: src is on row h-2 here; (row[h-2] + 2*row[h-1]) / 3. */
  n = w;
  __asm__ __volatile__ (
    "1:\n\t"
    "movss  (%[src]), %%xmm0\n\t"
    "movss  (%[src],%[stride]), %%xmm1\n\t"
    "addss  %%xmm1, %%xmm1\n\t"
    "addss  %%xmm1, %%xmm0\n\t"
    "divss  %[three], %%xmm0\n\t"
    "movss  %%xmm0, (%[dst])\n\t"
    "add    $4, %[src]\n\t"
    "add    $4, %[dst]\n\t"
    "dec    %[n]\n\t"
    "jnz    1b\n\t"
    : [src] "+r" (src), [dst] "+r" (dst), [n] "+r" (n)
    : [stride] "r" (stride), [three] "m" (three)
    : "memory", "cc", "xmm0", "xmm1"
  );
}
03631
03632
03633
03634
/**
 * Horizontal 5-tap low-pass filter of an h x w float image stored row
 * by row.
 *
 * Interior pixels use the kernel [1 4 6 4 1] / 16. Near the borders the
 * kernel is truncated and renormalised:
 *   b[0]   = (6*a[0] + 4*a[1] + a[2]) / 11
 *   b[1]   = (4*(a[0] + a[2]) + 6*a[1] + a[3]) / 15
 *   b[w-2] = (a[w-4] + 4*(a[w-3] + a[w-1]) + 6*a[w-2]) / 15
 *   b[w-1] = (a[w-3] + 4*a[w-2] + 6*a[w-1]) / 11
 * Widths 2 and 3 use dedicated C formulas; widths below 2 are copied.
 * For w >= 4 each row is processed by one asm block: two left-border
 * pixels, groups of four interior pixels with packed SSE, leftover
 * interior pixels one at a time, then two right-border pixels.
 *
 * @param src   source image, h*w floats (no alignment requirement)
 * @param dest  destination image, h*w floats; must not overlap src
 * @param h     number of rows
 * @param w     number of columns
 */
void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
{
  const float *sptr = src;
  float *dptr = dest;

  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      memcpy(dest, src, h * w * sizeof(dest[0]));
      return;
    }

  if (w == 2)
    {
      for (int j = 0; j < h; j++)
        {
          /* leftmost pixel: [(6^) 4] / 10 */
          *dptr++ = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);

          /* rightmost pixel: [4 (6^)] / 10 */
          *dptr++ = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);

          sptr += 2;
        }
    }
  else if (w == 3)
    {
      for (int j = 0; j < h; j++)
        {
          /* leftmost pixel: [(6^) 4 1] / 11 */
          *dptr++ = sptr[0] * (6.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (1.0F / 11.0F);

          /* middle pixel: [4 (6^) 4] / 14 */
          *dptr++ = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
            sptr[1] * (6.0F / 14.0F);

          /* rightmost pixel: [1 4 (6^)] / 11 */
          *dptr++ = sptr[0] * (1.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (6.0F / 11.0F);

          sptr += 3;
        }
    }
  else
    {
      /* Byte offsets used by the asm below:
           0.. 8  left pixel 0 weights      16..24  left pixel 1 weights
          32, 48, 64  packed 1/16, 4/16, 6/16 for the interior
          80..88  right pixel w-2 weights   96..104 right pixel w-1 weights */
      static const float coeffs[] = {
        6.0/11.0, 4.0/11.0, 1.0/11.0, 4.0/15.0,
        4.0/15.0, 6.0/15.0, 1.0/15.0, 1.0/16.0,
        1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
        4.0/16.0, 4.0/16.0, 4.0/16.0, 4.0/16.0,
        6.0/16.0, 6.0/16.0, 6.0/16.0, 6.0/16.0,
        1.0/15.0, 4.0/15.0, 6.0/15.0, 1.0/15.0,
        1.0/11.0, 4.0/11.0, 6.0/11.0, 1.0/11.0
      };

      const int rest = (w - 4) & 3;     /* interior pixels for the scalar loop */
      const int blocks = (w - 4) >> 2;  /* groups of 4 interior pixels */

      for (int row = 0; row < h; ++row)
        {
          int blk = blocks;
          int rem = rest;

          __asm__ __volatile__ (
            /* two left-border pixels; sptr stays on a[0] */
            "movss   (%[s]), %%xmm0\n\t"
            "movss   4(%[s]), %%xmm2\n\t"
            "movss   8(%[s]), %%xmm4\n\t"
            "movss   12(%[s]), %%xmm6\n\t"
            "movss   %%xmm0, %%xmm1\n\t"
            "movss   %%xmm2, %%xmm3\n\t"
            "movss   %%xmm4, %%xmm5\n\t"
            "mulss   (%[k]), %%xmm0\n\t"
            "mulss   4(%[k]), %%xmm2\n\t"
            "mulss   8(%[k]), %%xmm4\n\t"
            "addss   %%xmm5, %%xmm1\n\t"
            "mulss   16(%[k]), %%xmm1\n\t"
            "mulss   20(%[k]), %%xmm3\n\t"
            "mulss   24(%[k]), %%xmm6\n\t"
            "addss   %%xmm2, %%xmm0\n\t"
            "addss   %%xmm3, %%xmm1\n\t"
            "addss   %%xmm4, %%xmm0\n\t"
            "addss   %%xmm6, %%xmm1\n\t"
            "movss   %%xmm0, (%[d])\n\t"
            "movss   %%xmm1, 4(%[d])\n\t"
            "add     $8, %[d]\n\t"

            /* interior weights; xmm5/xmm6 were scratch above */
            "movups  32(%[k]), %%xmm5\n\t"
            "movups  48(%[k]), %%xmm6\n\t"
            "movups  64(%[k]), %%xmm7\n\t"

            "test    %[blk], %[blk]\n\t"
            "jz      2f\n\t"
            "1:\n\t"
            "movups  (%[s]), %%xmm0\n\t"
            "movups  4(%[s]), %%xmm1\n\t"
            "movups  8(%[s]), %%xmm2\n\t"
            "movups  12(%[s]), %%xmm3\n\t"
            "movups  16(%[s]), %%xmm4\n\t"
            "addps   %%xmm4, %%xmm0\n\t"
            "addps   %%xmm3, %%xmm1\n\t"
            "mulps   %%xmm5, %%xmm0\n\t"
            "mulps   %%xmm6, %%xmm1\n\t"
            "mulps   %%xmm7, %%xmm2\n\t"
            "addps   %%xmm1, %%xmm0\n\t"
            "addps   %%xmm2, %%xmm0\n\t"
            "movups  %%xmm0, (%[d])\n\t"
            "add     $16, %[s]\n\t"
            "add     $16, %[d]\n\t"
            "dec     %[blk]\n\t"
            "jnz     1b\n\t"
            "2:\n\t"

            "test    %[rem], %[rem]\n\t"
            "jz      4f\n\t"
            "3:\n\t"
            "movss   (%[s]), %%xmm0\n\t"
            "movss   4(%[s]), %%xmm1\n\t"
            "movss   8(%[s]), %%xmm2\n\t"
            "movss   12(%[s]), %%xmm3\n\t"
            "movss   16(%[s]), %%xmm4\n\t"
            "mulss   %%xmm5, %%xmm0\n\t"
            "mulss   %%xmm6, %%xmm1\n\t"
            "mulss   %%xmm7, %%xmm2\n\t"
            "mulss   %%xmm6, %%xmm3\n\t"
            "mulss   %%xmm5, %%xmm4\n\t"
            "addss   %%xmm1, %%xmm0\n\t"
            "addss   %%xmm3, %%xmm2\n\t"
            "addss   %%xmm4, %%xmm0\n\t"
            "addss   %%xmm2, %%xmm0\n\t"
            "add     $4, %[s]\n\t"
            "movss   %%xmm0, (%[d])\n\t"
            "add     $4, %[d]\n\t"
            "dec     %[rem]\n\t"
            "jnz     3b\n\t"
            "4:\n\t"

            /* two right-border pixels; sptr is on a[w-4] here */
            "movss   (%[s]), %%xmm0\n\t"
            "movss   4(%[s]), %%xmm1\n\t"
            "movss   8(%[s]), %%xmm2\n\t"
            "movss   12(%[s]), %%xmm3\n\t"
            "movss   %%xmm1, %%xmm4\n\t"
            "movss   %%xmm2, %%xmm5\n\t"
            "movss   %%xmm3, %%xmm6\n\t"
            "addss   %%xmm1, %%xmm3\n\t"
            "mulss   80(%[k]), %%xmm0\n\t"
            "mulss   84(%[k]), %%xmm3\n\t"
            "mulss   88(%[k]), %%xmm2\n\t"
            "addss   %%xmm3, %%xmm0\n\t"
            "addss   %%xmm2, %%xmm0\n\t"
            "movss   %%xmm0, (%[d])\n\t"
            "mulss   96(%[k]), %%xmm4\n\t"
            "mulss   100(%[k]), %%xmm5\n\t"
            "mulss   104(%[k]), %%xmm6\n\t"
            "addss   %%xmm5, %%xmm4\n\t"
            "addss   %%xmm6, %%xmm4\n\t"
            "movss   %%xmm4, 4(%[d])\n\t"
            "add     $16, %[s]\n\t"         /* start of the next row */
            "add     $8, %[d]\n\t"
            : [s] "+r" (sptr), [d] "+r" (dptr), [blk] "+r" (blk), [rem] "+r" (rem)
            : [k] "r" (coeffs)
            : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3",
              "xmm4", "xmm5", "xmm6", "xmm7"
          );
        }
    }
}
03805
03806
03807
03808
03809
// Vertical (column-direction) 5-tap low-pass filter over a row-major float
// image of h rows by w columns: every output sample is a weighted sum of the
// input samples in the same column of neighbouring rows.
//
// Weights used, as (row offset -> weight) relative to the output row:
//   interior rows      : 1/16, 4/16, 6/16, 4/16, 1/16 over rows -2..+2
//   second / next-last : 4/15, 6/15, 4/15, 1/15 (the row that would fall
//                        outside the image is dropped)
//   first / last       : 6/11, 4/11, 1/11
//   h == 3 middle row  : 4/14, 6/14, 4/14
//   h == 2             : 6/10, 4/10
//
//   src   input image, h*w floats
//   dest  output image, h*w floats
//   h     number of rows
//   w     number of columns (row stride, in floats)
//
// h < 2 is a plain copy; h == 2 and h == 3 are handled in scalar C; h >= 4
// runs the 32-bit x86 SSE assembly below.
03810 void sse_lowPass5y(const float *src, float *dest, const int h,
03811 const int w)
03812 {
// Fewer than two rows: nothing to blend, copy the h*w floats through.
03813 if (h < 2){
03814 memcpy(dest, src, h*w*sizeof(dest[0]));
03815 return;
03816 }
03817
03818 const float *sptr= src;
03819 float *dptr= dest;
03820
03821
// Offset (in floats) of the row two below the current one.
03822 const int w2 = w * 2;
03823
03824
03825 if (h == 2)
03826 {
03827
// Output row 0: 6/10 of row 0 plus 4/10 of row 1.
03828 for (int i = 0; i < w; i ++)
03829 {
03830 *dptr++ = sptr[0] * (6.0F / 10.0F) +
03831 sptr[w] * (4.0F / 10.0F);
03832 sptr++;
03833 }
// Rewind to the start of row 0; dptr keeps advancing into output row 1.
03834 sptr -= w;
03835
03836
// Output row 1: 4/10 of row 0 plus 6/10 of row 1.
03837 for (int i = 0; i < w; i ++)
03838 {
03839 *dptr++ = sptr[0] * (4.0F / 10.0F) +
03840 sptr[w] * (6.0F / 10.0F);
03841 sptr++;
03842 }
03843 }
03844 else if (h == 3)
03845 {
03846
// Output row 0: rows 0,1,2 weighted 6/11, 4/11, 1/11.
03847 for (int i = 0; i < w; i ++)
03848 {
03849 *dptr++ = sptr[ 0] * (6.0F / 11.0F) +
03850 sptr[ w] * (4.0F / 11.0F) +
03851 sptr[w2] * (1.0F / 11.0F);
03852 sptr++;
03853 }
// Rewind to row 0 for the next output row.
03854 sptr -= w;
03855
03856
// Output row 1: rows 0 and 2 weighted 4/14 each, row 1 weighted 6/14.
03857 for (int i = 0; i < w; i ++)
03858 {
03859 *dptr++ = (sptr[ 0] + sptr[w2]) * (4.0F / 14.0F) +
03860 sptr[ w] * (6.0F / 14.0F);
03861 sptr++;
03862 }
// Rewind to row 0 for the next output row.
03863 sptr -= w;
03864
03865
// Output row 2: rows 0,1,2 weighted 1/11, 4/11, 6/11.
03866 for (int i = 0; i < w; i ++)
03867 {
03868 *dptr++ = sptr[ 0] * (1.0F / 11.0F) +
03869 sptr[ w] * (4.0F / 11.0F) +
03870 sptr[w2] * (6.0F / 11.0F);
03871 sptr++;
03872 }
03873 }
03874 else
03875 {
03876
03877
// Filter weights, each replicated four times so that one 16-byte load fills
// all four lanes of an XMM register. Byte offsets used by the assembly:
//   0: 6/11   16: 4/11   32: 1/11
//  48: 4/15   64: 6/15   80: 1/15
//  96: 1/16  112: 4/16  128: 6/16
03878 static const float coeffs[] = {
03879 6.0/11.0, 6.0/11.0, 6.0/11.0, 6.0/11.0,
03880 4.0/11.0, 4.0/11.0, 4.0/11.0, 4.0/11.0,
03881 1.0/11.0, 1.0/11.0, 1.0/11.0, 1.0/11.0,
03882 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F, 4.0F/15.0F,
03883 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F, 6.0F/15.0F,
03884 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F, 1.0F/15.0F,
03885 1.0/16.0, 1.0/16.0, 1.0/16.0, 1.0/16.0,
03886 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F, 4.0F/16.0F,
03887 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F, 6.0F/16.0F
03888 };
03889
// Loop counts handed to the assembly in the registers they are named after:
// ecx = number of interior output rows (all but the two first and two last),
// edx = number of 4-float groups per row, eax = leftover floats per row.
03890 int ecx=h-4;
03891 int edx=w>>2;
03892 int eax=w&3;
03893
// Register roles inside the asm statement:
//   esi = source pointer      edi = destination pointer   ebx = coeffs
//   ecx = interior row count  edx = w>>2                  eax = w&3
//   ebp = row stride in bytes (4*w), saved on entry and restored on exit
// Every pass over a row has a packed loop (4 floats per iteration, movups)
// followed by a scalar loop (1 float per iteration, movss) for the leftover.
// Loop counters are pushed before each loop and popped after it, so eax, ecx
// and edx keep their initial values between passes.
//
// NOTE(review): this statement has no clobber list although it stores through
// edi, advances esi/edi and overwrites xmm0-xmm7; the asm statement that ends
// just before this function declares "memory". Confirm whether the omission
// here is intentional.
// NOTE(review): operand %0 is the memory operand "m"(w) and is read after
// "pushl %ebp"; if the compiler addresses w relative to esp the offset would
// be stale by 4 bytes - verify against the build flags in use.
// NOTE(review): the .IAn labels are not unique-per-expansion; presumably this
// relies on the function never being inlined or cloned - confirm.
03894 asm (
// Save ebp, then ebp = w * 4 (two doublings): distance between rows in bytes.
03895 "pushl %%ebp;\n\t"
03896 "movl %0, %%ebp;\n\t"
03897 "addl %%ebp, %%ebp;\n\t"
03898 "addl %%ebp, %%ebp;\n\t"
03899
03900
// ---- Output row 0: 6/11*row0 + 4/11*row1 + 1/11*row2 ----
// esi is pushed here and popped at .IA3 so the next pass starts at row 0 again.
03901 "movups (%%ebx), %%xmm4;\n\t"
03902 "movups 16(%%ebx), %%xmm5;\n\t"
03903 "movups 32(%%ebx), %%xmm6;\n\t"
03904 "pushl %%esi;\n\t"
03905 "orl %%edx, %%edx;\n\t"
03906 "jz .IA1;\n\t"
03907 ".align 4;\n\t"
03908 "pushl %%edx;\n\t"
// Packed loop: four columns per iteration.
03909 ".IA0:;\n\t"
03910 ".align 4;\n\t"
03911 "movups (%%esi), %%xmm0;\n\t"
03912 "movups (%%esi,%%ebp,1), %%xmm1;\n\t"
03913 "movups (%%esi,%%ebp,2), %%xmm2;\n\t"
03914 "mulps %%xmm4, %%xmm0;\n\t"
03915 "mulps %%xmm5, %%xmm1;\n\t"
03916 "mulps %%xmm6, %%xmm2;\n\t"
03917 "addps %%xmm1, %%xmm0;\n\t"
03918 "addps %%xmm2, %%xmm0;\n\t"
03919 "movups %%xmm0, (%%edi);\n\t"
03920 "addl $16, %%esi;\n\t"
03921 "addl $16, %%edi;\n\t"
03922 "decl %%edx;\n\t"
03923 "jnz .IA0;\n\t"
03924 "popl %%edx;\n\t"
03925 ".IA1:;\n\t"
03926 ".align 4;\n\t"
03927 "orl %%eax, %%eax;\n\t"
03928 "jz .IA3;\n\t"
03929 "pushl %%eax;\n\t"
// Scalar loop: the remaining w&3 columns, one at a time.
03930 ".IA2:;\n\t"
03931 ".align 4;\n\t"
03932 "movss (%%esi), %%xmm0;\n\t"
03933 "movss (%%esi,%%ebp,1), %%xmm1;\n\t"
03934 "movss (%%esi,%%ebp,2), %%xmm2;\n\t"
03935 "mulss %%xmm4, %%xmm0;\n\t"
03936 "mulss %%xmm5, %%xmm1;\n\t"
03937 "mulss %%xmm6, %%xmm2;\n\t"
03938 "addss %%xmm1, %%xmm0;\n\t"
03939 "addss %%xmm2, %%xmm0;\n\t"
03940 "movss %%xmm0, (%%edi);\n\t"
03941 "addl $4, %%esi;\n\t"
03942 "addl $4, %%edi;\n\t"
03943 "decl %%eax;\n\t"
03944 "jnz .IA2;\n\t"
03945 "popl %%eax;\n\t"
03946 ".IA3:;\n\t"
03947 "popl %%esi;\n\t"
03948
03949
// ---- Output row 1: 4/15*(row0 + row2) + 6/15*row1 + 1/15*row3 ----
// esi is again saved and restored, so it still points at row 0 afterwards.
03950 "movups 48(%%ebx), %%xmm4;\n\t"
03951 "movups 64(%%ebx), %%xmm5;\n\t"
03952 "movups 80(%%ebx), %%xmm6;\n\t"
03953 "pushl %%esi;\n\t"
03954 "orl %%edx, %%edx;\n\t"
03955 "jz .IA5;\n\t"
03956 "pushl %%edx;\n\t"
03957 "pushl %%eax;\n\t"
// eax is borrowed to hold 3*stride (offset of row 3); edx is the loop counter.
03958 "movl %%ebp, %%eax;\n\t"
03959 "addl %%ebp, %%eax;\n\t"
03960 "addl %%ebp, %%eax;\n\t"
03961 ".IA4:;\n\t"
03962 "movups (%%esi), %%xmm0;\n\t"
03963 "movups (%%esi,%%ebp,1), %%xmm1;\n\t"
03964 "movups (%%esi,%%ebp,2), %%xmm2;\n\t"
03965 "movups (%%esi,%%eax,1), %%xmm3;\n\t"
03966 "addps %%xmm2, %%xmm0;\n\t"
03967 "mulps %%xmm4, %%xmm0;\n\t"
03968 "mulps %%xmm5, %%xmm1;\n\t"
03969 "mulps %%xmm6, %%xmm3;\n\t"
03970 "addps %%xmm1, %%xmm0;\n\t"
03971 "addps %%xmm3, %%xmm0;\n\t"
03972 "movups %%xmm0, (%%edi);\n\t"
03973 "addl $16, %%esi;\n\t"
03974 "addl $16, %%edi;\n\t"
03975 "decl %%edx;\n\t"
03976 "jnz .IA4;\n\t"
03977 "popl %%eax;\n\t"
03978 "popl %%edx;\n\t"
03979 ".IA5:;\n\t"
03980 "orl %%eax, %%eax;\n\t"
03981 "jz .IA7;\n\t"
03982 "pushl %%eax;\n\t"
03983 "pushl %%edx;\n\t"
// Roles swap for the scalar loop: edx holds 3*stride, eax is the loop counter.
03984 "movl %%ebp, %%edx;\n\t"
03985 "addl %%ebp, %%edx;\n\t"
03986 "addl %%ebp, %%edx;\n\t"
03987 ".IA6:;\n\t"
03988 "movss (%%esi), %%xmm0;\n\t"
03989 "movss (%%esi,%%ebp,1), %%xmm1;\n\t"
03990 "movss (%%esi,%%ebp,2), %%xmm2;\n\t"
03991 "movss (%%esi,%%edx,1), %%xmm3;\n\t"
03992 "addss %%xmm2, %%xmm0;\n\t"
03993 "mulss %%xmm4, %%xmm0;\n\t"
03994 "mulss %%xmm5, %%xmm1;\n\t"
03995 "mulss %%xmm6, %%xmm3;\n\t"
03996 "addss %%xmm1, %%xmm0;\n\t"
03997 "addss %%xmm3, %%xmm0;\n\t"
03998 "movss %%xmm0, (%%edi);\n\t"
03999 "addl $4, %%esi;\n\t"
04000 "addl $4, %%edi;\n\t"
04001 "decl %%eax;\n\t"
04002 "jnz .IA6;\n\t"
04003 "popl %%edx;\n\t"
04004 "popl %%eax;\n\t"
04005 ".IA7:;\n\t"
04006 "popl %%esi;\n\t"
04007
04008
04009
// ---- Interior rows (ecx = h-4 of them; skipped entirely when ecx is 0) ----
// 1/16*(row0 + row4) + 4/16*(row1 + row3) + 6/16*row2, rows counted from esi.
// esi is NOT restored between rows: each pass advances it by 16*edx + 4*eax
// = 4*w bytes, i.e. exactly one row, which slides the 5-row window down.
04010 "orl %%ecx, %%ecx;\n\t"
04011 "jz .IA29;\n\t"
04012 "pushl %%ecx;\n\t"
04013 "movups 96(%%ebx), %%xmm5;\n\t"
04014 "movups 112(%%ebx), %%xmm6;\n\t"
04015 "movups 128(%%ebx), %%xmm7;\n\t"
04016 ".IA8:;\n\t"
04017 "orl %%edx, %%edx;\n\t"
04018 "jz .IA10;\n\t"
04019 "pushl %%edx;\n\t"
04020 "pushl %%eax;\n\t"
// eax = 3*stride for the packed loop.
04021 "movl %%ebp, %%eax;\n\t"
04022 "addl %%ebp, %%eax;\n\t"
04023 "addl %%ebp, %%eax;\n\t"
04024 ".IA9:;\n\t"
04025 "movups (%%esi), %%xmm0;\n\t"
04026 "movups (%%esi,%%ebp,1), %%xmm1;\n\t"
04027 "movups (%%esi,%%ebp,2), %%xmm2;\n\t"
04028 "movups (%%esi,%%eax,1), %%xmm3;\n\t"
04029 "movups (%%esi,%%ebp,4), %%xmm4;\n\t"
04030 "addps %%xmm3, %%xmm1;\n\t"
04031 "addps %%xmm4, %%xmm0;\n\t"
04032 "mulps %%xmm6, %%xmm1;\n\t"
04033 "mulps %%xmm5, %%xmm0;\n\t"
04034 "mulps %%xmm7, %%xmm2;\n\t"
04035 "addps %%xmm1, %%xmm0;\n\t"
04036 "addps %%xmm2, %%xmm0;\n\t"
04037 "addl $16, %%esi;\n\t"
04038 "movups %%xmm0, (%%edi);\n\t"
04039 "addl $16, %%edi;\n\t"
04040 "decl %%edx;\n\t"
04041 "jnz .IA9;\n\t"
04042 "popl %%eax;\n\t"
04043 "popl %%edx;\n\t"
04044 ".IA10:;\n\t"
04045 "orl %%eax, %%eax;\n\t"
04046 "jz .IA12;\n\t"
04047 "pushl %%eax;\n\t"
04048 "pushl %%edx;\n\t"
// edx = 3*stride for the scalar loop.
04049 "movl %%ebp, %%edx;\n\t"
04050 "addl %%ebp, %%edx;\n\t"
04051 "addl %%ebp, %%edx;\n\t"
04052 ".IA11:;\n\t"
04053 "movss (%%esi), %%xmm0;\n\t"
04054 "movss (%%esi,%%ebp,1), %%xmm1;\n\t"
04055 "movss (%%esi,%%ebp,2), %%xmm2;\n\t"
04056 "movss (%%esi,%%edx,1), %%xmm3;\n\t"
04057 "movss (%%esi,%%ebp,4), %%xmm4;\n\t"
04058 "addss %%xmm3, %%xmm1;\n\t"
04059 "addss %%xmm4, %%xmm0;\n\t"
04060 "mulss %%xmm6, %%xmm1;\n\t"
04061 "mulss %%xmm5, %%xmm0;\n\t"
04062 "mulss %%xmm7, %%xmm2;\n\t"
04063 "addss %%xmm1, %%xmm0;\n\t"
04064 "addss %%xmm2, %%xmm0;\n\t"
04065 "addl $4, %%esi;\n\t"
04066 "movss %%xmm0, (%%edi);\n\t"
04067 "addl $4, %%edi;\n\t"
04068 "decl %%eax;\n\t"
04069 "jnz .IA11;\n\t"
04070 "popl %%edx;\n\t"
04071 "popl %%eax;\n\t"
04072 ".IA12:;\n\t"
04073 "decl %%ecx;\n\t"
04074 "jnz .IA8;\n\t"
04075 "popl %%ecx;\n\t"
04076 ".IA29:;\n\t"
04077
04078
// ---- Next-to-last output row ----
// 1/15*row0 + 4/15*(row1 + row3) + 6/15*row2, rows counted from esi.
// esi is not saved here, so it ends one row further down for the final pass.
04079 "movups 48(%%ebx), %%xmm4;\n\t"
04080 "movups 64(%%ebx), %%xmm5;\n\t"
04081 "movups 80(%%ebx), %%xmm6;\n\t"
04082 "orl %%edx, %%edx;\n\t"
04083 "jz .IA14;\n\t"
04084 "pushl %%edx;\n\t"
04085 "pushl %%eax;\n\t"
04086 "movl %%ebp, %%eax;\n\t"
04087 "addl %%ebp, %%eax;\n\t"
04088 "addl %%ebp, %%eax;\n\t"
04089 ".IA13:;\n\t"
04090 "movups (%%esi), %%xmm0;\n\t"
04091 "movups (%%esi,%%ebp,1), %%xmm1;\n\t"
04092 "movups (%%esi,%%ebp,2), %%xmm2;\n\t"
04093 "movups (%%esi,%%eax,1),%%xmm3;\n\t"
04094 "addps %%xmm3, %%xmm1;\n\t"
04095 "mulps %%xmm6, %%xmm0;\n\t"
04096 "mulps %%xmm5, %%xmm2;\n\t"
04097 "mulps %%xmm4, %%xmm1;\n\t"
04098 "addps %%xmm2, %%xmm0;\n\t"
04099 "addps %%xmm1, %%xmm0;\n\t"
04100 "movups %%xmm0, (%%edi);\n\t"
04101 "addl $16, %%esi;\n\t"
04102 "addl $16, %%edi;\n\t"
04103 "decl %%edx;\n\t"
04104 "jnz .IA13;\n\t"
04105 "popl %%eax;\n\t"
04106 "popl %%edx;\n\t"
04107 ".IA14:;\n\t"
04108 "orl %%eax, %%eax;\n\t"
04109 "jz .IA16;\n\t"
04110 "pushl %%eax;\n\t"
04111 "pushl %%edx;\n\t"
04112 "movl %%ebp, %%edx;\n\t"
04113 "addl %%ebp, %%edx;\n\t"
04114 "addl %%ebp, %%edx;\n\t"
04115 ".IA15:;\n\t"
04116 "movss (%%esi), %%xmm0;\n\t"
04117 "movss (%%esi, %%ebp,1), %%xmm1;\n\t"
04118 "movss (%%esi, %%ebp,2), %%xmm2;\n\t"
04119 "movss (%%esi, %%edx,1), %%xmm3;\n\t"
04120 "addss %%xmm3, %%xmm1;\n\t"
04121 "mulss %%xmm6, %%xmm0;\n\t"
04122 "mulss %%xmm5, %%xmm2;\n\t"
04123 "mulss %%xmm4, %%xmm1;\n\t"
04124 "addss %%xmm2, %%xmm0;\n\t"
04125 "addss %%xmm1, %%xmm0;\n\t"
04126 "movss %%xmm0, (%%edi);\n\t"
04127 "addl $4, %%esi;\n\t"
04128 "addl $4, %%edi;\n\t"
04129 "decl %%eax;\n\t"
04130 "jnz .IA15;\n\t"
04131 "popl %%edx;\n\t"
04132 "popl %%eax;\n\t"
04133 ".IA16:;\n\t"
04134
04135
// ---- Last output row: 1/11*row0 + 4/11*row1 + 6/11*row2, rows from esi ----
// Same table rows as the first pass, loaded in reverse order.
04136 "movups 32(%%ebx), %%xmm4;\n\t"
04137 "movups 16(%%ebx), %%xmm5;\n\t"
04138 "movups (%%ebx), %%xmm6;\n\t"
04139 "orl %%edx, %%edx;\n\t"
04140 "jz .IA18;\n\t"
04141 "pushl %%edx;\n\t"
04142 ".IA17:;\n\t"
04143 "movups (%%esi), %%xmm0;\n\t"
04144 "movups (%%esi,%%ebp,1), %%xmm1;\n\t"
04145 "movups (%%esi,%%ebp,2), %%xmm2;\n\t"
04146 "mulps %%xmm4, %%xmm0;\n\t"
04147 "mulps %%xmm5, %%xmm1;\n\t"
04148 "mulps %%xmm6, %%xmm2;\n\t"
04149 "addps %%xmm1, %%xmm0;\n\t"
04150 "addps %%xmm2, %%xmm0;\n\t"
04151 "movups %%xmm0, (%%edi);\n\t"
04152 "addl $16, %%esi;\n\t"
04153 "addl $16, %%edi;\n\t"
04154 "decl %%edx;\n\t"
04155 "jnz .IA17;\n\t"
04156 "popl %%edx;\n\t"
04157 ".IA18:;\n\t"
04158 "orl %%eax, %%eax;\n\t"
04159 "jz .IA20;\n\t"
04160 "pushl %%eax;\n\t"
04161 ".IA19:;\n\t"
04162 "movss (%%esi), %%xmm0;\n\t"
04163 "movss (%%esi,%%ebp,1), %%xmm1;\n\t"
04164 "movss (%%esi,%%ebp,2), %%xmm2;\n\t"
04165 "mulss %%xmm4, %%xmm0;\n\t"
04166 "mulss %%xmm5, %%xmm1;\n\t"
04167 "mulss %%xmm6, %%xmm2;\n\t"
04168 "addss %%xmm1, %%xmm0;\n\t"
04169 "addss %%xmm2, %%xmm0;\n\t"
04170 "movss %%xmm0, (%%edi);\n\t"
04171 "addl $4, %%esi;\n\t"
04172 "addl $4, %%edi;\n\t"
04173 "decl %%eax;\n\t"
04174 "jnz .IA19;\n\t"
04175 "popl %%eax;\n\t"
04176 ".IA20:;\n\t"
04177
// Restore the ebp saved at the top of the statement.
04178 "popl %%ebp;\n\t"
04179 :
// Inputs only: %0 = w (memory), then fixed registers esi, edi, eax, ebx, ecx, edx.
04180 :"m"(w),"S"(sptr),"D"(dptr),"a"(eax),"b"(coeffs),"c"(ecx),"d"(edx)
04181 );
04182
04183 }
04184 }
04185
04186
04187
04188
// Converts packed 6-byte input groups at src into 12-byte output groups at
// dest (four 3-byte pixels per group), using 32-bit x86 MMX/SSE assembly.
//
// For each group, bytes 0 and 3 are offset by -128 and scaled by the
// coefficient table to form a per-channel correction; bytes 1, 2, 4 and 5
// each become one output pixel as (byte + correction) in three channels,
// clamped to 0..255.
// Presumably the input is YUV 4:1:1 laid out as U Y0 Y1 V Y2 Y3 and the
// output is interleaved R,G,B - inferred from the function name and the
// coefficient layout; confirm against the caller.
//
//   src     input bytes; 6 * (nbpix2/6) bytes are read
//   dest    output bytes; 12 * (nbpix2/6) bytes are written
//   nbpix2  only nbpix2/6 is used, as the number of groups to process
//           (presumably the input length in bytes - TODO confirm)
//
// Nothing in this body depends on a 640x480 image size.
04189 void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest,
04190 const int nbpix2)
04191 {
// Number of 6-byte groups to convert.
04192 int ecx=nbpix2/6;
04193
// Three rows of four floats, addressed from edx in the assembly:
//   row 0 (offset  0): per-channel weights applied to (byte0 - 128)
//   row 1 (offset 16): per-channel weights applied to (byte3 - 128)
//   row 2 (offset 32): the 128 offset, replicated in all four lanes
// Lane 3 of rows 0 and 1 is zero; that lane is never stored.
// NOTE(review): rows are used as memory operands of subps/mulps, which
// require 16-byte aligned addresses; no alignment attribute is visible on
// this array - confirm how alignment is guaranteed.
04194 const float coeffs[] = {
04195 0.0F, -0.198242F, 1.014648F, 0.0F,
04196 0.700195F, -0.29052F, 0.0F, 0.0F,
04197 128.0F, 128.0F, 128.0F, 128.0F
04198 };
04199
// Register roles: esi = src, edi = dest, ecx = groups left, edx = coeffs;
// eax/ebx are scratch (declared clobbered).
04200 asm (
// Loop head: exit when no groups remain.
04201 ".JA0:;\n\t"
04202 "orl %%ecx, %%ecx;\n\t"
04203 "jz .JA1;\n\t"
// mm7 = 0, used as the zero source for the unpack (zero-extend) steps.
04204 "pxor %%mm7, %%mm7;\n\t"
04205 "xorl %%eax, %%eax;\n\t"
04206 "xorl %%ebx, %%ebx;\n\t"
// Load the 6 input bytes: eax = bytes 0..3, bx = bytes 4..5.
04207 "movl (%%esi), %%eax;\n\t"
04208 "movw 4(%%esi), %%bx;\n\t"
// mm0 low bytes = byte0,byte1; mm1 (after the shift) = byte2,byte3;
// mm2 = byte4,byte5.
04209 "movd %%eax, %%mm0;\n\t"
04210 "movd %%eax, %%mm1;\n\t"
04211 "movd %%ebx, %%mm2;\n\t"
04212 "psrlq $16, %%mm1;\n\t"
// Zero-extend bytes -> words -> dwords; each mm register ends up holding
// its two bytes as two 32-bit integers.
04213 "punpcklbw %%mm7, %%mm0;\n\t"
04214 "punpcklbw %%mm7, %%mm1;\n\t"
04215 "punpcklbw %%mm7, %%mm2;\n\t"
04216 "punpcklwd %%mm7, %%mm0;\n\t"
04217 "punpcklwd %%mm7, %%mm1;\n\t"
04218 "punpcklwd %%mm7, %%mm2;\n\t"
04219
// Integers -> floats in the low two lanes: xmm0 = {byte0, byte1},
// xmm1 = {byte2, byte3}, xmm2 = {byte4, byte5}.
04220 "cvtpi2ps %%mm0, %%xmm0;\n\t"
04221 "cvtpi2ps %%mm1, %%xmm1;\n\t"
04222 "cvtpi2ps %%mm2, %%xmm2;\n\t"
04223
04224
// Copies that will be broadcast into the four per-pixel base values.
04225 "movaps %%xmm0, %%xmm3;\n\t"
04226
04227
04228 "movaps %%xmm1, %%xmm4;\n\t"
04229
04230
04231 "movaps %%xmm2, %%xmm5;\n\t"
04232
04233
04234 "movaps %%xmm2, %%xmm6;\n\t"
04235
// Broadcast: xmm3 = byte1 in all lanes, xmm4 = byte2, xmm5 = byte4,
// xmm6 = byte5 (0x55 selects lane 1, 0 selects lane 0).
04236 "shufps $0x55, %%xmm3, %%xmm3;\n\t"
04237 "shufps $00, %%xmm4, %%xmm4;\n\t"
04238 "shufps $0x00, %%xmm5, %%xmm5;\n\t"
04239 "shufps $0x55, %%xmm6, %%xmm6;\n\t"
04240
04241
// xmm0 = byte0 in all lanes, xmm1 = byte3 in all lanes.
04242 "shufps $0, %%xmm0, %%xmm0;\n\t"
04243
04244 "shufps $0x55, %%xmm1, %%xmm1;\n\t"
04245
// Remove the 128 offset from byte0 and byte3 ...
04246 "subps 32(%%edx), %%xmm0;\n\t"
04247 "subps 32(%%edx), %%xmm1;\n\t"
04248
// ... and scale them by coefficient rows 0 and 1 respectively.
04249 "mulps (%%edx), %%xmm0;\n\t"
04250 "mulps 16(%%edx),%%xmm1;\n\t"
04251
// Add both corrections to each of the four per-pixel base values.
04252 "addps %%xmm0, %%xmm3;\n\t"
04253 "addps %%xmm0, %%xmm4;\n\t"
04254 "addps %%xmm0, %%xmm5;\n\t"
04255 "addps %%xmm0, %%xmm6;\n\t"
04256
04257 "addps %%xmm1, %%xmm3;\n\t"
04258 "addps %%xmm1, %%xmm4;\n\t"
04259 "addps %%xmm1, %%xmm5;\n\t"
04260 "addps %%xmm1, %%xmm6;\n\t"
04261
// Floats -> 32-bit ints (low pair directly, high pair after movhlps), then
// pack the four ints of each pixel into four signed 16-bit words.
04262 "cvtps2pi %%xmm3, %%mm0;\n\t"
04263 "movhlps %%xmm3, %%xmm3;\n\t"
04264 "cvtps2pi %%xmm3, %%mm1;\n\t"
04265 "packssdw %%mm1, %%mm0;\n\t"
04266
04267 "cvtps2pi %%xmm4, %%mm2;\n\t"
04268 "movhlps %%xmm4, %%xmm4;\n\t"
04269 "cvtps2pi %%xmm4, %%mm3;\n\t"
04270 "packssdw %%mm3, %%mm2;\n\t"
04271
04272 "cvtps2pi %%xmm5, %%mm4;\n\t"
04273 "movhlps %%xmm5, %%xmm5;\n\t"
04274 "cvtps2pi %%xmm5, %%mm5;\n\t"
04275 "packssdw %%mm5, %%mm4;\n\t"
04276
04277 "cvtps2pi %%xmm6, %%mm6;\n\t"
04278 "movhlps %%xmm6, %%xmm6;\n\t"
04279 "cvtps2pi %%xmm6, %%mm7;\n\t"
04280 "packssdw %%mm7, %%mm6;\n\t"
04281
// Clamp negatives to zero: build a mask of words where 0 > value, then keep
// the value only where the mask is clear. Results land in mm1, mm3, mm5, mm7.
04282 "pxor %%mm1, %%mm1;\n\t"
04283 "pcmpgtw %%mm0, %%mm1;\n\t"
04284 "pandn %%mm0, %%mm1;\n\t"
04285
04286 "pxor %%mm3, %%mm3;\n\t"
04287 "pcmpgtw %%mm2, %%mm3;\n\t"
04288 "pandn %%mm2, %%mm3;\n\t"
04289
04290 "pxor %%mm5, %%mm5;\n\t"
04291 "pcmpgtw %%mm4, %%mm5;\n\t"
04292 "pandn %%mm4, %%mm5;\n\t"
04293
04294 "pxor %%mm7, %%mm7;\n\t"
04295 "pcmpgtw %%mm6, %%mm7;\n\t"
04296 "pandn %%mm6, %%mm7;\n\t"
04297
// Words -> unsigned bytes with saturation; low dword of each register now
// holds the pixel's four channel bytes.
04298 "packuswb %%mm1, %%mm1;\n\t"
04299 "packuswb %%mm3, %%mm3;\n\t"
04300 "packuswb %%mm5, %%mm5;\n\t"
04301 "packuswb %%mm7, %%mm7;\n\t"
04302
// ecx and edx are needed as scratch for the stores; preserve the loop
// counter and the coefficient pointer around them.
04303 "pushl %%ecx;\n\t"
04304 "pushl %%edx;\n\t"
04305 "movd %%mm1, %%eax;\n\t"
04306 "movd %%mm3, %%ebx;\n\t"
04307 "movd %%mm5, %%ecx;\n\t"
04308 "movd %%mm7, %%edx;\n\t"
// Store channel bytes 0 and 1 of pixels 0..3 at offsets 0, 3, 6, 9.
04309 "movw %%ax, (%%edi);\n\t"
04310 "movw %%bx,3(%%edi);\n\t"
04311 "movw %%cx,6(%%edi);\n\t"
04312 "movw %%dx,9(%%edi);\n\t"
// After shifting right by 8, ah/bh/ch/dh hold channel byte 2 of each pixel;
// store it at offsets 2, 5, 8, 11. The fourth byte of each pixel is dropped.
04313 "shrl $8, %%eax;\n\t"
04314 "shrl $8, %%ebx;\n\t"
04315 "shrl $8, %%ecx;\n\t"
04316 "shrl $8, %%edx;\n\t"
04317 "movb %%ah, 2(%%edi);\n\t"
04318 "movb %%bh, 5(%%edi);\n\t"
04319 "movb %%ch, 8(%%edi);\n\t"
04320 "movb %%dh,11(%%edi);\n\t"
04321 "popl %%edx;\n\t"
04322 "popl %%ecx;\n\t"
04323
// Advance: 12 output bytes written, 6 input bytes consumed, one group done.
04324 "addl $12,%%edi;\n\t"
04325 "decl %%ecx;\n\t"
04326 "addl $6, %%esi;\n\t"
04327 "jmp .JA0;\n\t"
04328 ".JA1:;\n\t"
// Leave MMX state so later x87 floating-point code works.
04329 "emms;\n\t"
04330 :
04331 :"S"(src),"D"(dest),"c"(ecx),"d"(coeffs)
// NOTE(review): esi, edi and ecx are modified although listed as inputs
// only, and the mm/xmm registers used are not listed - confirm this is safe
// with the compilers in use.
04332 :"eax","ebx","memory"
04333 );
04334
04335 }
04336
04337
04338
04339
// Horizontal 9-tap binomial low-pass filter.
//
// Filters each of the h rows of a row-major image that is w floats wide with
// the kernel [1 8 28 56 70 56 28 8 1]. Taps that would fall outside the row
// are dropped and the remaining weights are renormalised by their sum, which
// is where the border divisors 163, 219, 247 and 255 come from (256 is the
// full-kernel sum used in the interior).
//
//   sptr  first input sample;  h*w floats are read
//   dptr  first output sample; h*w floats are written
//   h     number of rows;    nothing is done when h <= 0
//   w     samples per row;   nothing is done when w <= 0
//
// Filtering in place is not supported: outputs are written while neighbouring
// inputs of the same row are still needed. Despite its name this routine is
// plain scalar code.
void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (w < 8)
    {
      // The unrolled border code below reads up to sptr[7] and assumes at
      // least 8 samples per row; narrower rows would run past both buffers.
      // Use a direct evaluation of the same truncated, renormalised kernel.
      static const float kernel[9] = {
        1.0F, 8.0F, 28.0F, 56.0F, 70.0F, 56.0F, 28.0F, 8.0F, 1.0F
      };

      for (int j = 0; j < h; j ++)
        {
          for (int x = 0; x < w; x ++)
            {
              float acc = 0.0F;
              float norm = 0.0F;
              for (int k = -4; k <= 4; k ++)
                {
                  const int xx = x + k;
                  if (xx < 0 || xx >= w)
                    continue;  // tap lies outside the row
                  acc += sptr[xx] * kernel[k + 4];
                  norm += kernel[k + 4];
                }
              *dptr++ = acc / norm;
            }
          sptr += w;
        }
      return;
    }

  for (int j = 0; j < h; j ++)
    {
      // Leftmost four outputs: 5, 6, 7 and 8 taps available.
      *dptr++ = sptr[0] * (70.0F / 163.0F) +
        sptr[1] * (56.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * ( 8.0F / 163.0F) +
        sptr[4] * ( 1.0F / 163.0F);
      *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) +
        sptr[1] * (70.0F / 219.0F) +
        sptr[3] * (28.0F / 219.0F) +
        sptr[4] * ( 8.0F / 219.0F) +
        sptr[5] * ( 1.0F / 219.0F);
      *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) +
        (sptr[1] + sptr[3]) * (56.0F / 247.0F) +
        sptr[2] * (70.0F / 247.0F) +
        sptr[5] * ( 8.0F / 247.0F) +
        sptr[6] * ( 1.0F / 247.0F);
      *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) +
        (sptr[1] + sptr[5]) * (28.0F / 255.0F) +
        (sptr[2] + sptr[4]) * (56.0F / 255.0F) +
        sptr[3] * (70.0F / 255.0F) +
        sptr[7] * ( 1.0F / 255.0F);

      // Interior: full symmetric kernel; sptr points 4 samples left of the
      // output position, so sptr[4] is the centre tap.
      for (int i = 0; i < w - 8; i ++)
        {
          *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) +
            (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) +
            (sptr[2] + sptr[6]) * (28.0F / 256.0F) +
            (sptr[3] + sptr[5]) * (56.0F / 256.0F) +
            sptr[4] * (70.0F / 256.0F);
          sptr ++;
        }

      // Rightmost four outputs: 8, 7, 6 and 5 taps available.
      *dptr++ = sptr[0] * ( 1.0F / 255.0F) +
        (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 255.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 255.0F) +
        sptr[4] * (70.0F / 255.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 247.0F) +
        sptr[1] * ( 8.0F / 247.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 247.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 247.0F) +
        sptr[4] * (70.0F / 247.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 219.0F) +
        sptr[1] * ( 8.0F / 219.0F) +
        sptr[2] * (28.0F / 219.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 219.0F) +
        sptr[4] * (70.0F / 219.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 163.0F) +
        sptr[1] * ( 8.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * (56.0F / 163.0F) +
        sptr[4] * (70.0F / 163.0F);
      sptr += 5;  // skip the last five samples: start of the next row
    }
}
04405 #endif
04406
04407
04408
04409
04410
04411
04412
04413 #endif
04414