git.sesse.net Git - x264/commitdiff
x86: dct2x4dc asm
author: Henrik Gramner <henrik@gramner.com>
Sat, 27 Feb 2016 19:34:39 +0000 (20:34 +0100)
committer: Henrik Gramner <henrik@gramner.com>
Tue, 12 Apr 2016 15:10:39 +0000 (17:10 +0200)
Only used in 4:2:2. MMX2 version implemented for 8-bit, SSE2 and AVX
versions implemented for high bit-depth.

2.5x faster on 32-bit and 1.6x faster on 64-bit compared to C on Ivy Bridge.

common/dct.c
common/x86/dct-a.asm
common/x86/dct.h

index 2740a3171b5e64e63d0b83b89ed191c1f320387b..9e2e955504b977586f638fc2ca2ab31cf3eae8da 100644 (file)
@@ -576,6 +576,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add4x4_idct     = x264_add4x4_idct_sse2;
         dctf->dct4x4dc        = x264_dct4x4dc_sse2;
         dctf->idct4x4dc       = x264_idct4x4dc_sse2;
+        dctf->dct2x4dc        = x264_dct2x4dc_sse2;
         dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
         dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
         dctf->add8x8_idct     = x264_add8x8_idct_sse2;
@@ -597,6 +598,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add4x4_idct     = x264_add4x4_idct_avx;
         dctf->dct4x4dc        = x264_dct4x4dc_avx;
         dctf->idct4x4dc       = x264_idct4x4dc_avx;
+        dctf->dct2x4dc        = x264_dct2x4dc_avx;
         dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
         dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
         dctf->add8x8_idct     = x264_add8x8_idct_avx;
@@ -633,6 +635,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     if( cpu&X264_CPU_MMX2 )
     {
         dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
+        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
         dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
     }
index 004014feb830b5022c4b6bcc952c521c4bb5741a..454f53f1f5c4c65f8641dbde2721ab6195c17bc9 100644 (file)
@@ -209,6 +209,78 @@ cglobal idct4x4dc, 1,1
     RET
 %endif ; HIGH_BIT_DEPTH
 
+;-----------------------------------------------------------------------------
+; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
+;-----------------------------------------------------------------------------
+%if WIN64 ; choose which numbered register backs the scratch name t0 used by dct2x4dc
+    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
+%else
+    DECLARE_REG_TMP 2 ; all other targets: t0 is backed by register number 2
+%endif ; WIN64
+
+%macro INSERT_COEFF 3 ; dst, src, imm: put the coefficient at src into lane imm of dst, then store t0 over src. May clobber m2.
+    %if %3 ; imm != 0: merge into a dst that already holds lane 0
+        %if HIGH_BIT_DEPTH ; dword-sized coefficients
+            %if cpuflag(sse4)
+                pinsrd %1, %2, %3 ; single-instruction dword insert into lane imm
+            %elif %3 == 2
+                movd       m2, %2 ; no pinsrd: park lane 2 in m2, dst is untouched until the imm == 3 call
+            %elif %3 == 1
+                punpckldq  %1, %2 ; dst = { dst[0], src[0], dst[1], src[1] }; only the low two dwords are kept later
+            %else ; imm == 3 (relies on the imm == 2 and imm == 1 calls having run first)
+                punpckldq  m2, %2 ; m2 low qword = { parked lane 2, src[0] }
+                punpcklqdq %1, m2 ; dst = { dst low qword, m2 low qword } = lanes 0..3
+            %endif
+        %else ; word-sized coefficients
+            %if %3 == 2
+                punpckldq  %1, %2 ; low dword of src lands in the second dword of dst (word lanes 2-3)
+            %else
+                pinsrw %1, %2, %3 ; word insert into lane imm
+            %endif
+        %endif
+    %else ; imm == 0: first lane, start dst from scratch
+        movd %1, %2
+    %endif
+    %if HIGH_BIT_DEPTH ; overwrite the coefficient just read with t0, using the coefficient's width
+        mov %2, t0d
+    %else
+        mov %2, t0w
+    %endif
+%endmacro
+
+%macro DCT2x4DC 2 ; %1 = element size suffix (w or d), %2 = SBUTTERFLY interleave suffix (wd or dq)
+cglobal dct2x4dc, 2,3 ; r0 = dct (output), r1 = dct4x4 (input; the first coefficient of each block is overwritten)
+    xor          t0d, t0d ; t0 = 0: the value INSERT_COEFF stores back over each coefficient it reads
+    INSERT_COEFF  m0, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 0 -> m0 lane 0
+    INSERT_COEFF  m0, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 1 -> m0 lane 2
+    add           r1, 4*16*SIZEOF_DCTCOEF ; rebase r1 onto block 4 (presumably to keep the displacements below small)
+    INSERT_COEFF  m0, [r1-2*16*SIZEOF_DCTCOEF], 1 ; block 2 -> m0 lane 1
+    INSERT_COEFF  m0, [r1-1*16*SIZEOF_DCTCOEF], 3 ; block 3 -> m0 lane 3
+    INSERT_COEFF  m1, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 4 -> m1 lane 0
+    INSERT_COEFF  m1, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 5 -> m1 lane 2
+    INSERT_COEFF  m1, [r1+2*16*SIZEOF_DCTCOEF], 1 ; block 6 -> m1 lane 1
+    INSERT_COEFF  m1, [r1+3*16*SIZEOF_DCTCOEF], 3 ; block 7 -> m1 lane 3
+    SUMSUB_BA     %1, 1, 0, 2 ; stage 1 of 3; NOTE(review): presumably m1 = sum, m0 = difference, m2 = temp - confirm against the helper
+    SBUTTERFLY    %2, 1, 0, 2 ; re-pair lanes of m1/m0 for the next stage (m2 passed as temp)
+    SUMSUB_BA     %1, 0, 1, 2 ; stage 2, register roles swapped relative to stage 1
+    SBUTTERFLY    %2, 0, 1, 2 ; second interleave, again with roles swapped
+    SUMSUB_BA     %1, 1, 0, 2 ; stage 3
+    pshuf%1       m0, m0, q1032 ; swap the two lane pairs of m0 before it is stored
+    mova        [r0], m1 ; first mmsize bytes of the output
+    mova [r0+mmsize], m0 ; second mmsize bytes of the output
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH ; dword coefficients: emit XMM versions
+INIT_XMM sse2
+DCT2x4DC d, dq
+INIT_XMM avx
+DCT2x4DC d, dq
+%else ; word coefficients: emit the MMX version only
+INIT_MMX mmx2
+DCT2x4DC w, wd
+%endif ; HIGH_BIT_DEPTH
+
 %if HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
index 35b338453093c41712751b67e9e54b8d93b4f46e..ded790ff830285b66fc2e8ac768a393749717ba7 100644 (file)
@@ -77,6 +77,10 @@ void x264_idct4x4dc_mmx      ( int16_t d[16] );
 void x264_idct4x4dc_sse2     ( int32_t d[16] );
 void x264_idct4x4dc_avx      ( int32_t d[16] );
 
+void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is off */
+void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is on */
+void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] ); /* assembled only when HIGH_BIT_DEPTH is on */
+
 void x264_sub8x8_dct8_mmx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_mmx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct8_sse2   ( dctcoef dct   [64], pixel *pix1, pixel *pix2 );