From e59bd089868603c09faa2893fe68b426c07b46b6 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 19 Dec 2014 22:18:42 -0500 Subject: [PATCH] vp9/x86: slightly simplify 44/48/84/88 h stores. --- libavcodec/x86/vp9lpf.asm | 88 ++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm index c62ac462b07..64d3b268cf6 100644 --- a/libavcodec/x86/vp9lpf.asm +++ b/libavcodec/x86/vp9lpf.asm @@ -724,34 +724,34 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3, dst, stride, mstride, dst2, stri SBUTTERFLY bw, 2, 3, 8 SBUTTERFLY wd, 0, 2, 8 SBUTTERFLY wd, 1, 3, 8 - SBUTTERFLY dq, 0, 4, 8 - SBUTTERFLY dq, 1, 5, 8 - SBUTTERFLY dq, 2, 6, 8 - SBUTTERFLY dq, 3, 7, 8 movd [P7], m0 - punpckhqdq m0, m8 - movd [P6], m0 - movd [Q0], m1 - punpckhqdq m1, m9 - movd [Q1], m1 movd [P3], m2 - punpckhqdq m2, m10 - movd [P2], m2 + movd [Q0], m1 movd [Q4], m3 - punpckhqdq m3, m11 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P6], m0 + movd [P2], m2 + movd [Q1], m1 movd [Q5], m3 - movd [P5], m4 - punpckhqdq m4, m12 - movd [P4], m4 - movd [Q2], m5 - punpckhqdq m5, m13 - movd [Q3], m5 - movd [P1], m6 - punpckhqdq m6, m14 - movd [P0], m6 - movd [Q6], m7 - punpckhqdq m7, m8 - movd [Q7], m7 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P5], m0 + movd [P1], m2 + movd [Q2], m1 + movd [Q6], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P4], m0 + movd [P0], m2 + movd [Q3], m1 + movd [Q7], m3 %else ; the following code do a transpose of 8 full lines to 16 half ; lines (high part). It is inlined to avoid the need of a staging area @@ -776,30 +776,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3, dst, stride, mstride, dst2, stri SBUTTERFLY dq, 1, 5, 8 SBUTTERFLY dq, 2, 6, 8 SBUTTERFLY dq, 3, 7, 8 - movh [P7], m0 - punpckhqdq m0, m8 - movh [P6], m0 - movh [Q0], m1 - punpckhqdq m1, m9 - movh [Q1], m1 - movh [P3], m2 - punpckhqdq m2, m10 - movh [P2], m2 - movh [Q4], m3 - punpckhqdq m3, m11 - movh [Q5], m3 - movh [P5], m4 - punpckhqdq m4, m12 - movh [P4], m4 - movh [Q2], m5 - punpckhqdq m5, m13 - movh [Q3], m5 - movh [P1], m6 - punpckhqdq m6, m14 - movh [P0], m6 - movh [Q6], m7 - punpckhqdq m7, m8 - movh [Q7], m7 + movh [P7], m0 + movhps [P6], m0 + movh [Q0], m1 + movhps [Q1], m1 + movh [P3], m2 + movhps [P2], m2 + movh [Q4], m3 + movhps [Q5], m3 + movh [P5], m4 + movhps [P4], m4 + movh [Q2], m5 + movhps [Q3], m5 + movh [P1], m6 + movhps [P0], m6 + movh [Q6], m7 + movhps [Q7], m7 %endif %endif -- 2.39.5