X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fx86%2Faf_afir.asm;h=2cc09709a2772c39913c5ccc09b02105fe9f160a;hb=c67d2a287502845baadf986a9c63e6117a25be3f;hp=849d85e70fba83161ef73b7430537f21f5e2c6e4;hpb=f8377ffce35251bba043aeda5d81df0d411a0595;p=ffmpeg diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm index 849d85e70fb..2cc09709a27 100644 --- a/libavfilter/x86/af_afir.asm +++ b/libavfilter/x86/af_afir.asm @@ -27,10 +27,9 @@ SECTION .text ; void ff_fcmul_add(float *sum, const float *t, const float *c, int len) ;------------------------------------------------------------------------------ -INIT_XMM sse3 +%macro FCMUL_ADD 0 cglobal fcmul_add, 4,4,6, sum, t, c, len shl lend, 3 - add lend, mmsize*2 add tq, lenq add cq, lenq add sumq, lenq @@ -41,20 +40,30 @@ ALIGN 16 movsldup m3, [tq + lenq+mmsize] movaps m1, [cq + lenq] movaps m4, [cq + lenq+mmsize] - mulps m0, m1 - mulps m3, m4 - shufps m1, m1, 0xb1 - shufps m4, m4, 0xb1 + mulps m0, m0, m1 + mulps m3, m3, m4 + shufps m1, m1, m1, 0xb1 + shufps m4, m4, m4, 0xb1 movshdup m2, [tq + lenq] movshdup m5, [tq + lenq+mmsize] - mulps m2, m1 - mulps m5, m4 - addsubps m0, m2 - addsubps m3, m5 - addps m0, [sumq + lenq] - addps m3, [sumq + lenq+mmsize] + mulps m2, m2, m1 + mulps m5, m5, m4 + addsubps m0, m0, m2 + addsubps m3, m3, m5 + addps m0, m0, [sumq + lenq] + addps m3, m3, [sumq + lenq+mmsize] movaps [sumq + lenq], m0 movaps [sumq + lenq+mmsize], m3 add lenq, mmsize*2 jl .loop - REP_RET + movss xm0, [tq + lenq] + mulss xm0, [cq + lenq] + addss xm0, [sumq + lenq] + movss [sumq + lenq], xm0 + RET +%endmacro + +INIT_XMM sse3 +FCMUL_ADD +INIT_YMM avx +FCMUL_ADD