From: Henrik Gramner
Date: Sat, 20 Feb 2016 19:31:22 +0000 (+0100)
Subject: x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=23d1d8e89be2d99f5c6924a6055fc80d69429503;p=x264

x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)

Only used in 4:2:2. Both 8-bit and high bit-depth implemented.

Approximate performance improvement compared to C on Ivy Bridge:

                        x86-32  x86-64
idct_dequant_2x4_dc       2.1x    1.7x
idct_dequant_2x4_dconly   2.7x    2.0x

Helps more on 32-bit due to the C versions being register starved.
---

diff --git a/common/quant.c b/common/quant.c
index 75325d0d..312f7cd9 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -486,6 +486,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->dequant_4x4 = x264_dequant_4x4_sse2;
         pf->dequant_8x8 = x264_dequant_8x8_sse2;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
         pf->denoise_dct = x264_denoise_dct_sse2;
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
@@ -532,6 +534,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     }
     if( cpu&X264_CPU_AVX )
     {
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
         pf->denoise_dct = x264_denoise_dct_avx;
     }
     if( cpu&X264_CPU_XOP )
@@ -618,6 +622,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
         }
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
         pf->denoise_dct = x264_denoise_dct_sse2;
         pf->decimate_score15 = x264_decimate_score15_sse2;
@@ -680,6 +686,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_avx;
             pf->dequant_8x8 = x264_dequant_8x8_avx;
         }
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
         pf->denoise_dct = x264_denoise_dct_avx;
     }
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index f7c7cc03..2dc02493 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -829,6 +829,150 @@ INIT_YMM avx2
 DEQUANT_DC w, pmullw
 %endif
 
+%macro PEXTRW 4
+    %if cpuflag(sse4)
+        pextrw %1, %2, %3
+    %else
+        ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
+        %if %3
+            pextrw %4d, %2, %3
+        %else
+            movd %4d, %2
+        %endif
+        mov %1, %4w
+    %endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+;-----------------------------------------------------------------------------
+
+%macro DEQUANT_2x4_DC 1
+%ifidn %1, dconly
+    DECLARE_REG_TMP 6,3,2
+    %define %%args dct, dmf, qp
+%else
+    DECLARE_REG_TMP 6,4,3
+    %define %%args dct, dct4x4, dmf, qp
+%endif
+
+%if ARCH_X86_64 == 0
+    DECLARE_REG_TMP 2,0,1
+%endif
+
+cglobal idct_dequant_2x4_%1, 0,3,5, %%args
+    movifnidn t2d, qpm
+    imul  t0d, t2d, 0x2b
+    shr   t0d, 8          ; qp / 6
+    lea   t1d, [t0*5]
+    sub   t2d, t0d
+    sub   t2d, t1d        ; qp % 6
+    shl   t2d, 6          ; 16 * sizeof(int)
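+    ; Two divisionless tricks here: qp/6 is computed above as (qp*0x2b)>>8,
+    ; i.e. multiplication by a fixed-point reciprocal of 6 (exact for
+    ; 0 <= qp < 128, well beyond the valid QP range), and qp%6 follows as
+    ; qp - 5*(qp/6) - qp/6 via the lea/sub pair. The imul by -0xffff below
+    ; computes dmf - (dmf<<16), packing dmf into the low word and -dmf into
+    ; the high word of a single GPR in one instruction, ready to be splatted
+    ; into the +/- multiplier pattern that lets pmaddwd fuse the final
+    ; butterfly stage with the dequant multiply.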
+%if ARCH_X86_64
+    imul  t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
+%else
+    mov   dctq, dctmp
+    add   t2, dmfmp
+    imul  t2d, [t2], -0xffff
+%endif
+%if HIGH_BIT_DEPTH
+    mova  m0, [dctq]
+    mova  m1, [dctq+16]
+    SUMSUB_BA d, 1, 0, 2    ; 16-bit intermediate precision is enough for the first two sumsub steps,
+    packssdw   m1, m0       ; and by packing to words we can use pmaddwd instead of pmulld later.
+%else
+    movq  m0, [dctq]
+    movq  m1, [dctq+8]
+    SUMSUB_BA w, 1, 0, 2
+    punpcklqdq m1, m0       ; a0 a1 a2 a3 a4 a5 a6 a7
+%endif
+    pshufd     m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
+    movd       m3, t2d
+    pshuflw    m3, m3, q1000 ; +  +  +  -
+    SUMSUB_BA  w, 0, 1, 2
+    punpcklqdq m3, m3        ; +  +  +  -  +  +  +  -
+    pshufd     m1, m1, q0022
+    sub        t0d, 6
+    jl .rshift
+    movd       m2, t0d
+    psllw      m3, m2
+    pmaddwd    m0, m3
+    pmaddwd    m1, m3
+    jmp .end
+.rshift:
+    neg        t0d
+    movd       m2, t0d
+    pcmpeqd    m4, m4
+    pmaddwd    m0, m3
+    pmaddwd    m1, m3
+    pslld      m4, m2
+    psrad      m4, 1
+    psubd      m0, m4        ; + (1 << (5-qp/6))
+    psubd      m1, m4
+    psrad      m0, m2
+    psrad      m1, m2
+.end:
+%ifidn %1, dconly
+%if HIGH_BIT_DEPTH
+    mova  [dctq], m0
+    mova  [dctq+16], m1
+%else
+    packssdw m0, m1
+    mova  [dctq], m0
+%endif
+%else
+    movifnidn dct4x4q, dct4x4mp
+%if HIGH_BIT_DEPTH
+    movd   [dct4x4q+0*64], m0
+%if cpuflag(sse4)
+    pextrd [dct4x4q+1*64], m0, 1
+    add    dct4x4q, 4*64
+    pextrd [dct4x4q-2*64], m0, 2
+    pextrd [dct4x4q-1*64], m0, 3
+    movd   [dct4x4q+0*64], m1
+    pextrd [dct4x4q+1*64], m1, 1
+    pextrd [dct4x4q+2*64], m1, 2
+    pextrd [dct4x4q+3*64], m1, 3
+%else
+    MOVHL  m2, m0
+    psrlq  m0, 32
+    movd   [dct4x4q+1*64], m0
+    add    dct4x4q, 4*64
+    movd   [dct4x4q-2*64], m2
+    psrlq  m2, 32
+    movd   [dct4x4q-1*64], m2
+    movd   [dct4x4q+0*64], m1
+    MOVHL  m2, m1
+    psrlq  m1, 32
+    movd   [dct4x4q+1*64], m1
+    movd   [dct4x4q+2*64], m2
+    psrlq  m2, 32
+    movd   [dct4x4q+3*64], m2
+%endif
+%else
+    PEXTRW [dct4x4q+0*32], m0, 0, eax
+    PEXTRW [dct4x4q+1*32], m0, 2, eax
+    PEXTRW [dct4x4q+2*32], m0, 4, eax
+    PEXTRW [dct4x4q+3*32], m0, 6, eax
+    add    dct4x4q, 4*32
+    PEXTRW [dct4x4q+0*32], m1, 0, eax
+    PEXTRW [dct4x4q+1*32], m1, 2, eax
+    PEXTRW [dct4x4q+2*32], m1, 4, eax
+    PEXTRW [dct4x4q+3*32], m1, 6, eax
+%endif
+%endif
+    RET
+%endmacro
+
+; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
+INIT_XMM sse2
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+INIT_XMM avx
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+
 ; t4 is eax for return value.
 %if ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 91a4dcb0..c8c4c867 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -72,6 +72,10 @@ void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i
 void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
 int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
 int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
 int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
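---

For reference, a minimal C model of the math the new asm implements. This is
a sketch rather than x264's actual C fallback: the function name is
hypothetical, the output ordering is the one produced by the shuffles in the
asm above, and the rounded right shift assumes arithmetic >> on negative
operands, which is what psrad provides. It computes a 2x4 Hadamard over the
eight chroma DC coefficients, then dequantizes by dequant_mf[qp%6][0] with a
left shift when qp/6 >= 6 or a rounded right shift otherwise.

    #include <stdint.h>

    typedef int32_t dctcoef; /* high bit-depth build; int16_t for 8-bit */

    static void idct_dequant_2x4_dconly_model( dctcoef dct[8],
                                               int dequant_mf[6][16], int i_qp )
    {
        int s[4], t[4], b[8];
        /* stage 1: butterflies between the two halves of the DC block */
        for( int i = 0; i < 4; i++ )
        {
            s[i] = dct[i] + dct[i+4];
            t[i] = dct[i] - dct[i+4];
        }
        /* stages 2+3: 4-point Hadamard on each half; in the asm, pmaddwd
         * with the (+ + + -) dmf pattern performs the last of these stages
         * and the dequant multiply in a single instruction */
        b[0] = s[0]+s[1]+s[2]+s[3]; b[1] = s[0]-s[1]+s[2]-s[3];
        b[2] = t[0]+t[1]+t[2]+t[3]; b[3] = t[0]-t[1]+t[2]-t[3];
        b[4] = t[0]+t[1]-t[2]-t[3]; b[5] = t[0]-t[1]-t[2]+t[3];
        b[6] = s[0]+s[1]-s[2]-s[3]; b[7] = s[0]-s[1]-s[2]+s[3];

        int dmf = dequant_mf[i_qp%6][0];
        int shift = i_qp/6 - 6;
        for( int i = 0; i < 8; i++ )
        {
            if( shift >= 0 ) /* qp/6 >= 6: plain left shift, no rounding */
                dct[i] = b[i] * (dmf << shift);
            else             /* qp/6 < 6: rounded arithmetic right shift */
                dct[i] = (b[i] * dmf + (1 << (-shift-1))) >> -shift;
        }
    }

The dc variant computes the same eight values but scatters them to
dct4x4[0][0] through dct4x4[7][0], one DC per 4x4 block (64 bytes apart in
high bit depth, 32 bytes in 8-bit), which is what the pextrd/PEXTRW store
sequences at the end of the macro do.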