From eeb9b66ddb0f27d8baaa8efa9597613e61140836 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <henrik@gramner.com>
Date: Sat, 27 Feb 2016 20:34:39 +0100
Subject: [PATCH] x86: dct2x4dc asm

Only used in 4:2:2. MMX2 version implemented for 8-bit, SSE2 and AVX
versions implemented for high bit-depth.

2.5x faster on 32-bit and 1.6x faster on 64-bit compared to C on Ivy Bridge.
---
 common/dct.c         |  3 ++
 common/x86/dct-a.asm | 72 ++++++++++++++++++++++++++++++++++++++++++++
 common/x86/dct.h     |  4 +++
 3 files changed, 79 insertions(+)

diff --git a/common/dct.c b/common/dct.c
index 2740a317..9e2e9555 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -576,6 +576,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add4x4_idct     = x264_add4x4_idct_sse2;
         dctf->dct4x4dc        = x264_dct4x4dc_sse2;
         dctf->idct4x4dc       = x264_idct4x4dc_sse2;
+        dctf->dct2x4dc        = x264_dct2x4dc_sse2;
         dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
         dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
         dctf->add8x8_idct     = x264_add8x8_idct_sse2;
@@ -597,6 +598,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add4x4_idct     = x264_add4x4_idct_avx;
         dctf->dct4x4dc        = x264_dct4x4dc_avx;
         dctf->idct4x4dc       = x264_idct4x4dc_avx;
+        dctf->dct2x4dc        = x264_dct2x4dc_avx;
         dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
         dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
         dctf->add8x8_idct     = x264_add8x8_idct_avx;
@@ -633,6 +635,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     if( cpu&X264_CPU_MMX2 )
     {
         dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
+        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
         dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
     }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 004014fe..454f53f1 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -209,6 +209,78 @@ cglobal idct4x4dc, 1,1
     RET
 %endif ; HIGH_BIT_DEPTH
 
+;-----------------------------------------------------------------------------
+; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
+;-----------------------------------------------------------------------------
+%if WIN64
+    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
+%else
+    DECLARE_REG_TMP 2 ; t0 = r2, the third GPR requested by "cglobal dct2x4dc, 2,3" (r0 and r1 hold the arguments)
+%endif
+
+%macro INSERT_COEFF 3 ; dst, src, imm: put the coefficient at src into element imm of dst, then zero it in memory (expects t0 == 0)
+    %if %3
+        %if HIGH_BIT_DEPTH
+            %if cpuflag(sse4)
+                pinsrd %1, %2, %3 ; direct dword insert, no temporary needed
+            %elif %3 == 2
+                movd       m2, %2 ; dst is untouched here: element 2 is parked in m2 until the imm == 3 step merges it
+            %elif %3 == 1
+                punpckldq  %1, %2 ; interleave so that src becomes dword 1 of dst; the upper half is replaced at imm == 3
+            %else
+                punpckldq  m2, %2 ; low qword of m2 = { element 2, element 3 }
+                punpcklqdq %1, m2 ; dst = { element 0, element 1, element 2, element 3 }
+            %endif
+        %else
+            %if %3 == 2
+                punpckldq  %1, %2 ; src lands in word 2; words 1 and 3 are filled afterwards by the pinsrw steps (call order 0, 2, 1, 3)
+            %else
+                pinsrw %1, %2, %3 ; direct word insert
+            %endif
+        %endif
+    %else
+        movd %1, %2 ; element 0: start a fresh dst (32-bit load, upper part of the register zeroed)
+    %endif
+    %if HIGH_BIT_DEPTH
+        mov %2, t0d ; clear the 32-bit coefficient just consumed
+    %else
+        mov %2, t0w ; clear the 16-bit coefficient just consumed
+    %endif
+%endmacro
+
+%macro DCT2x4DC 2 ; element suffix (w or d), SBUTTERFLY interleave suffix (wd or dq)
+cglobal dct2x4dc, 2,3
+    xor          t0d, t0d ; t0 = 0: the value INSERT_COEFF stores back over each coefficient it reads
+    INSERT_COEFF  m0, [r1+0*16*SIZEOF_DCTCOEF], 0 ; dct4x4[0][0]
+    INSERT_COEFF  m0, [r1+1*16*SIZEOF_DCTCOEF], 2 ; dct4x4[1][0]
+    add           r1, 4*16*SIZEOF_DCTCOEF ; r1 = &dct4x4[4]; remaining blocks are addressed relative to it
+    INSERT_COEFF  m0, [r1-2*16*SIZEOF_DCTCOEF], 1 ; dct4x4[2][0]
+    INSERT_COEFF  m0, [r1-1*16*SIZEOF_DCTCOEF], 3 ; dct4x4[3][0] -> m0 = first coefficient of blocks { 0, 2, 1, 3 }
+    INSERT_COEFF  m1, [r1+0*16*SIZEOF_DCTCOEF], 0 ; dct4x4[4][0]
+    INSERT_COEFF  m1, [r1+1*16*SIZEOF_DCTCOEF], 2 ; dct4x4[5][0]
+    INSERT_COEFF  m1, [r1+2*16*SIZEOF_DCTCOEF], 1 ; dct4x4[6][0]
+    INSERT_COEFF  m1, [r1+3*16*SIZEOF_DCTCOEF], 3 ; dct4x4[7][0] -> m1 = first coefficient of blocks { 4, 6, 5, 7 }
+    SUMSUB_BA     %1, 1, 0, 2 ; NOTE(review): SUMSUB_BA and SBUTTERFLY are defined outside this hunk - presumably sum/difference and interleave steps
+    SBUTTERFLY    %2, 1, 0, 2
+    SUMSUB_BA     %1, 0, 1, 2
+    SBUTTERFLY    %2, 0, 1, 2
+    SUMSUB_BA     %1, 1, 0, 2
+    pshuf%1       m0, m0, q1032 ; NOTE(review): q1032 presumably swaps the low and high halves of m0 - confirm against x86inc
+    mova        [r0], m1 ; first mmsize bytes of dct[]
+    mova [r0+mmsize], m0 ; following mmsize bytes of dct[]
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+DCT2x4DC d, dq ; dword elements; INSERT_COEFF takes its movd/punpck path here
+INIT_XMM avx
+DCT2x4DC d, dq ; same arguments; INSERT_COEFF takes its pinsrd path if cpuflag(sse4) holds for avx
+%else
+INIT_MMX mmx2
+DCT2x4DC w, wd ; word elements in MMX registers
+%endif
+
 %if HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 35b33845..ded790ff 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -77,6 +77,10 @@ void x264_idct4x4dc_mmx      ( int16_t d[16] );
 void x264_idct4x4dc_sse2     ( int32_t d[16] );
 void x264_idct4x4dc_avx      ( int32_t d[16] );
 
+void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* asm is only assembled when HIGH_BIT_DEPTH is 0 */
+void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* asm is only assembled when HIGH_BIT_DEPTH is set */
+void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* asm is only assembled when HIGH_BIT_DEPTH is set */
+
 void x264_sub8x8_dct8_mmx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_mmx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct8_sse2   ( dctcoef dct   [64], pixel *pix1, pixel *pix2 );
-- 
2.39.5