; void ff_fcmul_add(float *sum, const float *t, const float *c, int len)
;------------------------------------------------------------------------------
; NOTE(review): this region is a unified-diff fragment, not plain assembly.
; A leading '-' marks a removed line, '+' an added line, and unmarked lines
; are unchanged context. The comments below describe the resulting code
; (context and '+' lines).
;
; fcmul_add accumulates an element-wise product of t[] and c[] into sum[].
; The movsldup / movshdup / shufps 0xb1 / addsubps sequence computes, for
; each adjacent pair of floats, (a*c - b*d, a*d + b*c), i.e. a complex
; multiply if the arrays hold interleaved (re, im) pairs -- TODO confirm the
; layout against the C reference.
;
; NOTE(review): no `.loop:` label, no load into m0 and no negation of lenq
; are visible in this excerpt, although `jl .loop`, `mulps m0, ...` and the
; count-up loop rely on them -- presumably context lines that were dropped;
; verify against the full file.
-INIT_XMM sse3
; The body becomes a macro so it can be emitted for more than one
; instruction set (it is expanded after %endmacro).
+%macro FCMUL_ADD 0
; cglobal arguments, per x86inc convention (confirm): 4 args, 4 GPRs,
; 6 vector registers (m0-m5 are used below), then the argument names.
cglobal fcmul_add, 4,4,6, sum, t, c, len
; len *= 8; presumably bytes per pair of 4-byte floats -- TODO confirm.
shl lend, 3
- add lend, mmsize*2
; Advance all three pointers by the scaled length; the loop addresses
; memory as [ptr + lenq].
add tq, lenq
add cq, lenq
add sumq, lenq
; m3 = even-indexed floats of the second t vector, each duplicated (a,a,...).
; m0 is presumably loaded the same way from [tq + lenq]; not visible here.
movsldup m3, [tq + lenq+mmsize]
; m1/m4 = two consecutive vectors of c (aligned loads).
movaps m1, [cq + lenq]
movaps m4, [cq + lenq+mmsize]
- mulps m0, m1
- mulps m3, m4
- shufps m1, m1, 0xb1
- shufps m4, m4, 0xb1
; Three-operand spellings replace the two-operand ones above; the macro is
; instantiated for both sse3 and avx.
; m0/m3 = (a*c, a*d, ...)
+ mulps m0, m0, m1
+ mulps m3, m3, m4
; 0xb1 swaps the two floats of every adjacent pair: (c,d) -> (d,c).
+ shufps m1, m1, m1, 0xb1
+ shufps m4, m4, m4, 0xb1
; m2/m5 = odd-indexed floats of t, each duplicated (b,b,...).
movshdup m2, [tq + lenq]
movshdup m5, [tq + lenq+mmsize]
- mulps m2, m1
- mulps m5, m4
- addsubps m0, m2
- addsubps m3, m5
- addps m0, [sumq + lenq]
- addps m3, [sumq + lenq+mmsize]
; m2/m5 = (b*d, b*c, ...)
+ mulps m2, m2, m1
+ mulps m5, m5, m4
; addsubps subtracts in even lanes and adds in odd lanes:
; result = (a*c - b*d, a*d + b*c, ...)
+ addsubps m0, m0, m2
+ addsubps m3, m3, m5
; Accumulate onto the existing contents of sum.
+ addps m0, m0, [sumq + lenq]
+ addps m3, m3, [sumq + lenq+mmsize]
; Write both result vectors back to sum (aligned stores).
movaps [sumq + lenq], m0
movaps [sumq + lenq+mmsize], m3
; Step by two vectors per iteration; keep looping while lenq is negative.
add lenq, mmsize*2
jl .loop
- REP_RET
; Scalar tail added by this change: one more single-float
; sum[i] += t[i] * c[i] at the offset where the loop stopped. This is a
; plain real multiply, not a complex one -- presumably a trailing real-only
; element; confirm with the caller.
+ movss xm0, [tq + lenq]
+ mulss xm0, [cq + lenq]
+ addss xm0, [sumq + lenq]
+ movss [sumq + lenq], xm0
+ RET
+%endmacro
+
; Expand the FCMUL_ADD macro twice: once after INIT_XMM sse3 and once after
; INIT_YMM avx. Per x86inc convention (confirm) each INIT_* line selects the
; register width (and thus mmsize) and the instruction-set suffix used for
; the emitted function.
+INIT_XMM sse3
+FCMUL_ADD
+INIT_YMM avx
+FCMUL_ADD