00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "Util/mmx-sse.H"
00040
00041
/* Local shorthand types used by the routines in this file. */
00042 typedef int int32;            /* signed integer used for element counts and integer data; assumes int is 32 bits wide -- TODO confirm per target */
00043 typedef unsigned char byte;   /* unsigned character type used for byte-sized data */
00044 typedef float float32;        /* alias of float; presumably a 32-bit IEEE single -- verify per target */
00045
00046
00047 #ifdef INVT_USE_SSE
00048
00049
/* Element-wise absolute difference of two double arrays.
 *
 * For every i in [0, sz): diff[i] = |a[i] - b[i]|.
 *
 * a, b  input arrays holding at least sz elements each
 * diff  output array holding at least sz elements
 * sz    number of elements to process; values <= 0 process nothing
 *
 * Written as a plain loop: the element count is read on every call and no
 * memory outside the first sz elements is touched.  An optimising compiler
 * can vectorise this loop for the target instruction set.
 */
void sse_absDiff(const double *a, const double *b, double *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const double d = a[i] - b[i];
      diff[i] = (d < 0.0) ? -d : d;
    }
}
00102 #endif
00103
00104 #ifdef INVT_USE_SSE2
00105
00106
/* Element-wise absolute difference of two float arrays.
 *
 * For every i in [0, sz): diff[i] = |a[i] - b[i]|.
 *
 * a, b  input arrays holding at least sz elements each
 * diff  output array holding at least sz elements
 * sz    number of elements to process; values <= 0 process nothing
 *
 * The element count is evaluated on every call, so consecutive calls with
 * different sizes each process exactly sz elements.
 */
void sse2_absDiff(const float *a, const float *b, float *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const float d = a[i] - b[i];
      diff[i] = (d < 0.0f) ? -d : d;
    }
}
00160
00161
00162
00163
00164
/* Element-wise absolute difference of two 32-bit integer arrays.
 *
 * For every i in [0, sz): diff[i] = |a[i] - b[i]|, comparing the operands
 * as signed values.
 *
 * a, b  input arrays holding at least sz elements each
 * diff  output array holding at least sz elements
 * sz    number of elements to process; values <= 0 process nothing
 *
 * The subtraction is carried out in unsigned arithmetic so that operands
 * at opposite ends of the range cannot trigger signed-overflow undefined
 * behaviour; a difference larger than the int32 maximum wraps.
 */
void sse2_absDiff(const int32 *a, const int32 *b, int32 *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const int32 x = a[i];
      const int32 y = b[i];
      diff[i] = (x > y) ? (int32)((unsigned int)x - (unsigned int)y)
                        : (int32)((unsigned int)y - (unsigned int)x);
    }
}
00218
00219
00220
00221
/* Element-wise absolute difference of two byte arrays.
 *
 * For every i in [0, sz): diff[i] = |a[i] - b[i]|, with the bytes treated
 * as unsigned values, so the result always fits in a byte.
 *
 * a, b  input arrays holding at least sz elements each
 * diff  output array holding at least sz elements
 * sz    number of elements to process; values <= 0 process nothing
 */
void sse2_absDiff(const byte *a, const byte *b, byte *diff, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      diff[i] = (byte)((x > y) ? (x - y) : (y - x));
    }
}
00275 #endif
00276
00277 #ifdef INVT_USE_SSE
00278
00279
/* Sum of the elements of a double array.
 *
 * Writes a[0] + a[1] + ... + a[sz-1] to *sum; writes 0.0 when sz <= 0.
 *
 * a    input array holding at least sz elements
 * sum  receives the total (always written)
 * sz   number of elements to add
 *
 * Elements are added in index order and exactly sz elements are read.
 */
void sse_sum(const double *a, double *sum, const int32 sz)
{
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += a[i];
  *sum = total;
}
00327 #endif
00328
00329 #ifdef INVT_USE_SSE2
00330
00331
/* Sum of the elements of a float array, accumulated in double precision.
 *
 * Writes the total of a[0..sz-1] to *sum; writes 0.0 when sz <= 0.
 *
 * a    input array holding at least sz elements
 * sum  receives the total (always written)
 * sz   number of elements to add
 *
 * Each element is widened to double before it is added, and exactly sz
 * elements are read.
 */
void sse2_sum(const float *a, double *sum, const int32 sz)
{
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];
  *sum = total;
}
00379
00380
00381
00382
/* Sum of the elements of a 32-bit integer array, returned as a double.
 *
 * Writes the total of a[0..sz-1] to *sum; writes 0.0 when sz <= 0.
 *
 * a    input array holding at least sz elements
 * sum  receives the total (always written)
 * sz   number of elements to add
 *
 * Each element is converted to double before it is added, so the running
 * total cannot overflow an integer type.  Arrays shorter than one unrolled
 * chunk are handled like any other length.
 */
void sse2_sum(const int32 *a, double *sum, const int32 sz)
{
  double total = 0.0;
  for (int32 i = 0; i < sz; ++i)
    total += (double)a[i];
  *sum = total;
}
00429
00430
00431
00432
/* Sum of the elements of a byte array, returned as a double.
 *
 * Writes the total of a[0..sz-1] (bytes taken as unsigned values) to *sum;
 * writes 0.0 when sz <= 0.
 *
 * a    input array holding at least sz elements
 * sum  receives the total (always written)
 * sz   number of elements to add
 *
 * The total is accumulated in a 64-bit unsigned integer, which cannot
 * overflow for any int32 element count, and converted to double once at
 * the end.
 */
void sse2_sum(const byte *a, double *sum, const int32 sz)
{
  unsigned long long total = 0;
  for (int32 i = 0; i < sz; ++i)
    total += a[i];
  *sum = (double)total;
}
00491 #endif
00492
00493 #ifdef INVT_USE_SSE
00494
00495
/* Element-wise difference of two byte arrays, clamped at zero.
 *
 * For every i in [0, sz): result[i] = a[i] - b[i] when a[i] > b[i],
 * otherwise 0 (unsigned saturating subtraction).
 *
 * a, b    input arrays holding at least sz elements each
 * result  output array holding at least sz elements
 * sz      number of elements to process; values <= 0 process nothing
 *
 * Every element, including those past the last full 64-byte chunk, gets
 * the same saturating treatment.
 */
void sse_clampedDiff(const byte *a, const byte *b, byte *result, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const byte x = a[i];
      const byte y = b[i];
      result[i] = (byte)((x > y) ? (x - y) : 0);
    }
}
00548
00549
00550
00551
/* Element-wise difference of two float arrays, clamped at zero.
 *
 * For every i in [0, sz): result[i] = a[i] - b[i] when a[i] > b[i],
 * otherwise 0.  A comparison involving NaN is false and therefore
 * yields 0.
 *
 * a, b    input arrays holding at least sz elements each
 * result  output array holding at least sz elements
 * sz      number of elements to process; values <= 0 process nothing
 */
void sse_clampedDiff(const float32 *a, const float32 *b, float32 *result,
                     const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    result[i] = (a[i] > b[i]) ? (a[i] - b[i]) : 0.0f;
}
00603
00604
00605
00606
/* Element-wise difference of two 32-bit integer arrays, clamped at zero.
 *
 * For every i in [0, sz): c[i] = a[i] - b[i] when a[i] > b[i] (signed
 * comparison), otherwise 0.
 *
 * a, b  input arrays holding at least sz elements each
 * c     output array holding at least sz elements
 * sz    number of elements to process; values <= 0 process nothing
 *
 * The subtraction is done in unsigned arithmetic so that it wraps instead
 * of invoking signed-overflow undefined behaviour.  Exactly sz elements
 * are written.
 */
void sse_clampedDiff(const int32 *a, const int32 *b, int32 *c, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    {
      const int32 x = a[i];
      const int32 y = b[i];
      c[i] = (x > y) ? (int32)((unsigned int)x - (unsigned int)y) : 0;
    }
}
00655
00656
00657
00658
/* Subtracts every byte of an array from a constant.
 *
 * For every i in [0, sz): result[i] = val - a[i], computed modulo 256.
 *
 * a       input array holding at least sz elements
 * result  output array holding at least sz elements
 * val     constant each element is subtracted from
 * sz      number of elements to process; values <= 0 process nothing
 *
 * Only result[0..sz-1] is written.
 */
void sse_binaryReverse(const byte *a, byte *result, const byte val, const
                       int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    result[i] = (byte)(val - a[i]);
}
00719
00720
00721
00722
/* Subtracts every element of a float array from a constant.
 *
 * For every i in [0, sz): result[i] = val - a[i], using floating-point
 * subtraction.
 *
 * a       input array holding at least sz elements
 * result  output array holding at least sz elements
 * val     constant each element is subtracted from
 * sz      number of elements to process; values <= 0 process nothing
 */
void sse_binaryReverse(const float *a, float *result, const float val,
                       const int sz)
{
  for (int i = 0; i < sz; ++i)
    result[i] = val - a[i];
}
00783
00784
00785
00786
00787
/* Subtracts every element of a 32-bit integer array from a constant.
 *
 * For every i in [0, sz): result[i] = val - a[i].
 *
 * a       input array holding at least sz elements
 * result  output array holding at least sz elements
 * val     constant each element is subtracted from
 * sz      number of elements to process; values <= 0 process nothing
 *
 * The subtraction is done in unsigned arithmetic so that it wraps instead
 * of invoking signed-overflow undefined behaviour.  Exactly sz elements
 * are read and written.
 */
void sse_binaryReverse(const int32 *a, int32 *result, const int32 val,
                       const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    result[i] = (int32)((unsigned int)val - (unsigned int)a[i]);
}
00846
00847
00848
00849
00850
/* Widens a byte array to 32-bit integers.
 *
 * For every i in [0, sz): b[i] = a[i], zero-extended.
 *
 * a   input array holding at least sz bytes
 * b   output array holding at least sz integers
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Each output element is stored as exactly one int32, so nothing past
 * b[sz-1] is written.
 */
void sse_cvt_byte_to_int(const byte *a, int32 *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (int32)a[i];
}
00902
00903 #endif
00904
00905 #ifdef INVT_USE_SSE2
00906
00907
00908
/* Converts a byte array to single-precision floats.
 *
 * For every i in [0, sz): b[i] = (float) a[i], with the byte taken as an
 * unsigned value in [0, 255].
 *
 * a   input array holding at least sz bytes
 * b   output array holding at least sz floats
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Only a[0..sz-1] is read.
 */
void sse2_cvt_byte_to_float(const byte *a, float32 *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float32)a[i];
}
00961
00962
00963
00964
00965
/* Converts a byte array to double-precision floats.
 *
 * For every i in [0, sz): b[i] = (double) a[i], with the byte taken as an
 * unsigned value in [0, 255].
 *
 * a   input array holding at least sz bytes
 * b   output array holding at least sz doubles
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Only a[0..sz-1] is read.
 */
void sse2_cvt_byte_to_double(const byte *a, double *b, int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01019
01020
01021
01022
01023
/* Converts a 32-bit integer array to single-precision floats.
 *
 * For every i in [0, sz): b[i] = (float) a[i].  Integers that have no
 * exact float representation are rounded by the integer-to-float
 * conversion.
 *
 * a   input array holding at least sz integers
 * b   output array holding at least sz floats
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Only a[0..sz-1] is read.
 */
void sse2_cvt_int_to_float(const int32 *a, float *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (float)a[i];
}
01079
01080
01081
/* Converts a 32-bit integer array to double-precision floats.
 *
 * For every i in [0, sz): b[i] = (double) a[i].
 *
 * a   input array holding at least sz integers
 * b   output array holding at least sz doubles
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Only a[0..sz-1] is read.
 */
void sse2_cvt_int_to_double(const int32 *a, double *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01137
01138
/* Converts one float to int, rounding to the nearest integer with ties
 * going to the even neighbour.  This is the result CVTPS2DQ produces under
 * the default MXCSR rounding mode.  NaN and values outside the int range
 * yield the most negative int (0x80000000), the instruction's "integer
 * indefinite" result.
 */
static int sse2_round_float_to_int(const float v)
{
  /* the negated test also catches NaN, for which every comparison is false */
  if (!(v >= -2147483648.0f && v < 2147483648.0f))
    return -2147483647 - 1;

  int whole = (int)v;                   /* truncates toward zero */
  const float frac = v - (float)whole;  /* exact; lies strictly inside (-1, 1) */

  if (frac > 0.5f || (frac == 0.5f && (whole & 1)))
    ++whole;
  else if (frac < -0.5f || (frac == -0.5f && (whole & 1)))
    --whole;
  return whole;
}

/* Converts a float array to integers, rounding to nearest (ties to even).
 *
 * For every i in [0, sz): b[i] = a[i] rounded to the nearest integer.
 * NaN and values that do not fit in an int produce the most negative int.
 *
 * a   input array holding at least sz floats
 * b   output array holding at least sz ints
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Exactly sz elements are read and sz elements are written.
 */
void sse2_cvt_float_to_int(const float *a, int *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = sse2_round_float_to_int(a[i]);
}
01194
01195
01196
01197
/* Widens a float array to double precision.
 *
 * For every i in [0, sz): b[i] = (double) a[i]; the conversion is exact.
 *
 * a   input array holding at least sz floats
 * b   output array holding at least sz doubles
 * sz  number of elements to convert; values <= 0 convert nothing
 *
 * Only a[0..sz-1] is read.
 */
void sse2_cvt_float_to_double(const float *a, double *b, const int32 sz)
{
  for (int32 i = 0; i < sz; ++i)
    b[i] = (double)a[i];
}
01252
01253 #endif
01254
01255 #ifdef INVT_USE_SSE
01256
01257
// Horizontal 3-tap low-pass filter over an image stored as h consecutive
// rows of w floats.
//
// a : source image (h * w floats)
// b : destination image (h * w floats)
// h : number of rows
// w : number of columns
//
// Per row:
//   b[0]     = (2*a[0]   + a[1])   / 3
//   b[i]     = (a[i-1] + 2*a[i] + a[i+1]) / 4      for 0 < i < w-1
//   b[w-1]   = (a[w-2] + 2*a[w-1]) / 3
//
// Rows narrower than 2 pixels cannot be filtered and are copied unchanged;
// non-positive sizes are a no-op. Written as plain loops (like
// sse_lowPass9x) so no inline-asm register or stack assumptions are
// needed; results may differ from the former asm in the last bit because
// the summation order is now the same for every interior pixel.
void sse_lowPass3x(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      memcpy(b, a, (size_t)w * h * sizeof(b[0]));
      return;
    }

  const float *src = a;
  float *dst = b;

  for (int j = 0; j < h; ++j)
    {
      // leftmost point: the missing left neighbour is dropped and the
      // remaining weights (2, 1) are renormalised by 3
      dst[0] = (src[0] + src[0] + src[1]) / 3.0F;

      // interior points: full [1 2 1] / 4 kernel
      for (int i = 1; i < w - 1; ++i)
        dst[i] = (src[i - 1] + src[i] + src[i] + src[i + 1]) / 4.0F;

      // rightmost point: mirror image of the leftmost one
      dst[w - 1] = (src[w - 1] + src[w - 1] + src[w - 2]) / 3.0F;

      src += w;
      dst += w;
    }
}
01353
01354
01355
01356
01357
01358
// Vertical 3-tap low-pass filter over an image stored as h consecutive
// rows of w floats.
//
// a : source image (h * w floats)
// b : destination image (h * w floats)
// h : number of rows
// w : number of columns
//
// Per column (rN = value in row N):
//   top row      : (2*r0 + r1) / 3
//   interior row : (r[j-1] + 2*r[j] + r[j+1]) / 4
//   bottom row   : (r[h-2] + 2*r[h-1]) / 3
//
// Images with fewer than 2 rows are copied unchanged; non-positive sizes
// are a no-op. The additions are grouped exactly as the former asm grouped
// them, so with single-precision SSE arithmetic the values are unchanged.
void sse_lowPass3y(const float *a, float *b, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (h < 2)
    {
      memcpy(b, a, (size_t)w * h * sizeof(b[0]));
      return;
    }

  // top row: no row above, weights (2, 1) renormalised by 3
  for (int i = 0; i < w; ++i)
    b[i] = ((a[i] + a[i]) + a[i + w]) / 3.0F;

  // interior rows 1 .. h-2: full [1 2 1] / 4 kernel
  const float *up = a;     // row j-1
  float *out = b + w;      // row j of the destination
  for (int j = 1; j < h - 1; ++j)
    {
      const float *mid = up + w;
      const float *down = mid + w;
      for (int i = 0; i < w; ++i)
        out[i] = ((up[i] + mid[i]) + (down[i] + mid[i])) / 4.0F;
      up = mid;
      out += w;
    }

  // bottom row: 'up' now addresses row h-2, 'out' row h-1
  const float *last = up + w;
  for (int i = 0; i < w; ++i)
    out[i] = (up[i] + (last[i] + last[i])) / 3.0F;
}
01443
01444
01445
01446
// Horizontal 5-tap low-pass filter ([1 4 6 4 1] / 16) over an image stored
// as h consecutive rows of w floats.
//
// src  : source image (h * w floats)
// dest : destination image (h * w floats)
// h    : number of rows
// w    : number of columns
//
// Near the row ends the taps that fall outside the row are dropped and the
// remaining weights are renormalised (denominators 11 and 15; 10 and 14
// for the 2- and 3-pixel-wide special cases). Rows of width 1 are copied
// unchanged; non-positive sizes are a no-op.
//
// The general case is written as plain loops (like sse_lowPass9x) rather
// than inline asm, so it does not depend on stack or register assumptions;
// results may differ from the former asm in the last bit.
void sse_lowPass5x(const float *src, float *dest, const int h, const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (w < 2)
    {
      memcpy(dest, src, (size_t)h * w * sizeof(dest[0]));
      return;
    }

  const float *sptr = src;
  float *dptr = dest;

  for (int j = 0; j < h; ++j, sptr += w, dptr += w)
    {
      if (w == 2)
        {
          // only two taps are available for either pixel
          dptr[0] = sptr[0] * (6.0F / 10.0F) + sptr[1] * (4.0F / 10.0F);
          dptr[1] = sptr[0] * (4.0F / 10.0F) + sptr[1] * (6.0F / 10.0F);
          continue;
        }

      if (w == 3)
        {
          dptr[0] = sptr[0] * (6.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (1.0F / 11.0F);
          dptr[1] = (sptr[0] + sptr[2]) * (4.0F / 14.0F) +
            sptr[1] * (6.0F / 14.0F);
          dptr[2] = sptr[0] * (1.0F / 11.0F) +
            sptr[1] * (4.0F / 11.0F) +
            sptr[2] * (6.0F / 11.0F);
          continue;
        }

      // general case, w >= 4

      // two leftmost points: 0 resp. 1 left neighbours available
      dptr[0] = sptr[0] * (6.0F / 11.0F) +
        sptr[1] * (4.0F / 11.0F) +
        sptr[2] * (1.0F / 11.0F);
      dptr[1] = (sptr[0] + sptr[2]) * (4.0F / 15.0F) +
        sptr[1] * (6.0F / 15.0F) +
        sptr[3] * (1.0F / 15.0F);

      // interior points: full symmetric kernel
      for (int i = 2; i < w - 2; ++i)
        dptr[i] = (sptr[i - 2] + sptr[i + 2]) * (1.0F / 16.0F) +
          (sptr[i - 1] + sptr[i + 1]) * (4.0F / 16.0F) +
          sptr[i] * (6.0F / 16.0F);

      // two rightmost points: mirror images of the leftmost ones
      dptr[w - 2] = sptr[w - 4] * (1.0F / 15.0F) +
        (sptr[w - 3] + sptr[w - 1]) * (4.0F / 15.0F) +
        sptr[w - 2] * (6.0F / 15.0F);
      dptr[w - 1] = sptr[w - 3] * (1.0F / 11.0F) +
        sptr[w - 2] * (4.0F / 11.0F) +
        sptr[w - 1] * (6.0F / 11.0F);
    }
}
01617
01618
01619
01620
01621
// Vertical 5-tap low-pass filter ([1 4 6 4 1] / 16) over an image stored
// as h consecutive rows of w floats.
//
// src  : source image (h * w floats)
// dest : destination image (h * w floats)
// h    : number of rows
// w    : number of columns
//
// Near the top and bottom the taps that fall outside the image are dropped
// and the remaining weights are renormalised (denominators 11 and 15; 10
// and 14 for the 2- and 3-row special cases). Images with fewer than 2
// rows are copied unchanged; non-positive sizes are a no-op.
//
// The general case (h >= 4) is implemented here in plain C with the same
// weights as the 11/15/16 coefficient table this branch used to declare;
// that branch previously produced no output at all.
void sse_lowPass5y(const float *src, float *dest, const int h,
                   const int w)
{
  if (h <= 0 || w <= 0)
    return;

  if (h < 2){
    memcpy(dest, src, (size_t)h * w * sizeof(dest[0]));
    return;
  }

  // offsets of the rows 1, 2, 3 and 4 below the row a pointer addresses
  const int w1 = w;
  const int w2 = w * 2;
  const int w3 = w * 3;
  const int w4 = w * 4;

  if (h == 2)
    {
      // top row
      for (int i = 0; i < w; i ++)
        dest[i] = src[i] * (6.0F / 10.0F) +
          src[i + w1] * (4.0F / 10.0F);

      // bottom row
      for (int i = 0; i < w; i ++)
        dest[i + w1] = src[i] * (4.0F / 10.0F) +
          src[i + w1] * (6.0F / 10.0F);
    }
  else if (h == 3)
    {
      // top row
      for (int i = 0; i < w; i ++)
        dest[i] = src[i] * (6.0F / 11.0F) +
          src[i + w1] * (4.0F / 11.0F) +
          src[i + w2] * (1.0F / 11.0F);

      // middle row
      for (int i = 0; i < w; i ++)
        dest[i + w1] = (src[i] + src[i + w2]) * (4.0F / 14.0F) +
          src[i + w1] * (6.0F / 14.0F);

      // bottom row
      for (int i = 0; i < w; i ++)
        dest[i + w2] = src[i] * (1.0F / 11.0F) +
          src[i + w1] * (4.0F / 11.0F) +
          src[i + w2] * (6.0F / 11.0F);
    }
  else
    {
      // general case, h >= 4
      const float *sptr = src;   // top row of the current 5-row window
      float *dptr = dest;        // destination row being written

      // row 0: no rows above
      for (int i = 0; i < w; i ++)
        dptr[i] = sptr[i] * (6.0F / 11.0F) +
          sptr[i + w1] * (4.0F / 11.0F) +
          sptr[i + w2] * (1.0F / 11.0F);
      dptr += w;

      // row 1: one row above
      for (int i = 0; i < w; i ++)
        dptr[i] = (sptr[i] + sptr[i + w2]) * (4.0F / 15.0F) +
          sptr[i + w1] * (6.0F / 15.0F) +
          sptr[i + w3] * (1.0F / 15.0F);
      dptr += w;

      // rows 2 .. h-3: full symmetric kernel; sptr is two rows above the
      // row being produced
      for (int j = 0; j < h - 4; j ++)
        {
          for (int i = 0; i < w; i ++)
            dptr[i] = (sptr[i] + sptr[i + w4]) * (1.0F / 16.0F) +
              (sptr[i + w1] + sptr[i + w3]) * (4.0F / 16.0F) +
              sptr[i + w2] * (6.0F / 16.0F);
          sptr += w;
          dptr += w;
        }

      // sptr now addresses row h-4, so the last source row is sptr + w3

      // row h-2: one row below
      for (int i = 0; i < w; i ++)
        dptr[i] = sptr[i] * (1.0F / 15.0F) +
          (sptr[i + w1] + sptr[i + w3]) * (4.0F / 15.0F) +
          sptr[i + w2] * (6.0F / 15.0F);
      dptr += w;

      // row h-1: no rows below
      for (int i = 0; i < w; i ++)
        dptr[i] = sptr[i + w1] * (1.0F / 11.0F) +
          sptr[i + w2] * (4.0F / 11.0F) +
          sptr[i + w3] * (6.0F / 11.0F);
    }
}
01998
01999
02000
02001
// Round a channel value to the nearest integer (ties to even) and clamp it
// to 0..255. Negative values and NaN map to 0.
static byte yuv411_round_to_byte(const float v)
{
  if (!(v > 0.0F))
    return 0;
  if (v >= 255.0F)
    return 255;

  int n = (int)v;                     // floor, because v > 0
  const float frac = v - (float)n;    // exact: v and n are within 1 of each other
  if (frac > 0.5F || (frac == 0.5F && (n & 1)))
    ++n;
  return (byte)n;
}

// Convert packed 6-byte source groups into four 3-byte output pixels each.
//
// src    : source bytes; nbpix2 / 6 complete groups are consumed
// dest   : destination bytes; 12 bytes are written per group
// nbpix2 : size of the input in bytes (only whole groups of 6 are used);
//          nothing is done when it is smaller than 6
//
// Within a group, bytes 1, 2, 4 and 5 are the per-pixel base values and
// bytes 0 and 3 are shared by all four pixels (presumably Y0 Y1 Y2 Y3 and
// U, V of a YUV 4:1:1 stream, per the function name). Each output channel
// c of each pixel is
//
//   base + (byte0 - 128) * ucoeff[c] + (byte3 - 128) * vcoeff[c]
//
// rounded to nearest (ties to even) and saturated to 0..255. This matches
// what the former MMX/SSE sequence computed under the default MXCSR
// rounding mode, but reads exactly 6 source bytes per group and needs no
// inline asm.
void sse_yuv411_to_rgb_mode_640x480(const byte *src, byte *dest,
                                    const int nbpix2)
{
  // per-output-channel weights of (byte0 - 128) and (byte3 - 128)
  static const float ucoeff[3] = { 0.0F, -0.198242F, 1.014648F };
  static const float vcoeff[3] = { 0.700195F, -0.29052F, 0.0F };

  // positions of the four per-pixel base values inside a 6-byte group
  static const int base_pos[4] = { 1, 2, 4, 5 };

  for (int n = nbpix2 / 6; n > 0; --n)
    {
      const float u = (float)src[0] - 128.0F;
      const float v = (float)src[3] - 128.0F;

      float upart[3];
      float vpart[3];
      for (int c = 0; c < 3; ++c)
        {
          upart[c] = u * ucoeff[c];
          vpart[c] = v * vcoeff[c];
        }

      for (int p = 0; p < 4; ++p)
        {
          const float y = (float)src[base_pos[p]];
          for (int c = 0; c < 3; ++c)
            dest[3 * p + c] = yuv411_round_to_byte((y + upart[c]) + vpart[c]);
        }

      src += 6;
      dest += 12;
    }
}
02149
02150
02151
02152
// Horizontal 9-tap low-pass filter ([1 8 28 56 70 56 28 8 1] / 256) over
// an image stored as h consecutive rows of w floats.
//
// sptr : source image (h * w floats)
// dptr : destination image (h * w floats)
// h    : number of rows
// w    : number of columns
//
// For the four pixels nearest each row end, the taps that fall outside the
// row are dropped and the remaining weights are renormalised by their sum
// (163, 219, 247, 255).
//
// The unrolled border code reads sptr[0..7] and writes 8 border outputs
// per row, so it is only valid for w >= 8. Narrower rows are handled by a
// generic loop that applies the same "drop and renormalise" rule to
// whichever taps are in range.
void sse_lowPass9x(const float *sptr, float *dptr, const int h, const int w)
{
  if (w < 8)
    {
      static const float taps[9] = {
        1.0F, 8.0F, 28.0F, 56.0F, 70.0F, 56.0F, 28.0F, 8.0F, 1.0F
      };

      for (int j = 0; j < h; j ++)
        {
          for (int i = 0; i < w; i ++)
            {
              float acc = 0.0F;
              float norm = 0.0F;   // sum of the weights actually used
              for (int k = -4; k <= 4; k ++)
                {
                  const int x = i + k;
                  if (x < 0 || x >= w)
                    continue;
                  acc += sptr[x] * taps[k + 4];
                  norm += taps[k + 4];
                }
              // norm >= 70: the centre tap is always in range
              *dptr++ = acc / norm;
            }
          sptr += w;
        }
      return;
    }

  for (int j = 0; j < h; j ++)
    {
      // four leftmost points: 0, 1, 2 and 3 left neighbours available
      *dptr++ = sptr[0] * (70.0F / 163.0F) +
        sptr[1] * (56.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * ( 8.0F / 163.0F) +
        sptr[4] * ( 1.0F / 163.0F);
      *dptr++ = (sptr[0] + sptr[2]) * (56.0F / 219.0F) +
        sptr[1] * (70.0F / 219.0F) +
        sptr[3] * (28.0F / 219.0F) +
        sptr[4] * ( 8.0F / 219.0F) +
        sptr[5] * ( 1.0F / 219.0F);
      *dptr++ = (sptr[0] + sptr[4]) * (28.0F / 247.0F) +
        (sptr[1] + sptr[3]) * (56.0F / 247.0F) +
        sptr[2] * (70.0F / 247.0F) +
        sptr[5] * ( 8.0F / 247.0F) +
        sptr[6] * ( 1.0F / 247.0F);
      *dptr++ = (sptr[0] + sptr[6]) * ( 8.0F / 255.0F) +
        (sptr[1] + sptr[5]) * (28.0F / 255.0F) +
        (sptr[2] + sptr[4]) * (56.0F / 255.0F) +
        sptr[3] * (70.0F / 255.0F) +
        sptr[7] * ( 1.0F / 255.0F);

      // interior points: full symmetric kernel; sptr addresses the
      // leftmost tap, the output pixel is sptr[4]
      for (int i = 0; i < w - 8; i ++)
        {
          *dptr++ = (sptr[0] + sptr[8]) * ( 1.0F / 256.0F) +
            (sptr[1] + sptr[7]) * ( 8.0F / 256.0F) +
            (sptr[2] + sptr[6]) * (28.0F / 256.0F) +
            (sptr[3] + sptr[5]) * (56.0F / 256.0F) +
            sptr[4] * (70.0F / 256.0F);
          sptr ++;
        }

      // four rightmost points: 3, 2, 1 and 0 right neighbours available
      *dptr++ = sptr[0] * ( 1.0F / 255.0F) +
        (sptr[1] + sptr[7]) * ( 8.0F / 255.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 255.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 255.0F) +
        sptr[4] * (70.0F / 255.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 247.0F) +
        sptr[1] * ( 8.0F / 247.0F) +
        (sptr[2] + sptr[6]) * (28.0F / 247.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 247.0F) +
        sptr[4] * (70.0F / 247.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 219.0F) +
        sptr[1] * ( 8.0F / 219.0F) +
        sptr[2] * (28.0F / 219.0F) +
        (sptr[3] + sptr[5]) * (56.0F / 219.0F) +
        sptr[4] * (70.0F / 219.0F);
      sptr ++;
      *dptr++ = sptr[0] * ( 1.0F / 163.0F) +
        sptr[1] * ( 8.0F / 163.0F) +
        sptr[2] * (28.0F / 163.0F) +
        sptr[3] * (56.0F / 163.0F) +
        sptr[4] * (70.0F / 163.0F);
      sptr += 5;   // skip the five taps just used: start of the next row
    }
}
02218 #endif
02219
02220
02221
02222
02223
02224
02225
02226
02227