From eeb9b66ddb0f27d8baaa8efa9597613e61140836 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sat, 27 Feb 2016 20:34:39 +0100 Subject: [PATCH] x86: dct2x4dc asm Only used in 4:2:2. MMX2 version implemented for 8-bit, SSE2 and AVX versions implemented for high bit-depth. 2.5x faster on 32-bit and 1.6x faster on 64-bit compared to C on Ivy Bridge. --- common/dct.c | 3 ++ common/x86/dct-a.asm | 72 ++++++++++++++++++++++++++++++++++++++++++++ common/x86/dct.h | 4 +++ 3 files changed, 79 insertions(+) diff --git a/common/dct.c b/common/dct.c index 2740a317..9e2e9555 100644 --- a/common/dct.c +++ b/common/dct.c @@ -576,6 +576,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add4x4_idct = x264_add4x4_idct_sse2; dctf->dct4x4dc = x264_dct4x4dc_sse2; dctf->idct4x4dc = x264_idct4x4dc_sse2; + dctf->dct2x4dc = x264_dct2x4dc_sse2; dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; @@ -597,6 +598,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add4x4_idct = x264_add4x4_idct_avx; dctf->dct4x4dc = x264_dct4x4dc_avx; dctf->idct4x4dc = x264_idct4x4dc_avx; + dctf->dct2x4dc = x264_dct2x4dc_avx; dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; dctf->add8x8_idct = x264_add8x8_idct_avx; @@ -633,6 +635,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) if( cpu&X264_CPU_MMX2 ) { dctf->dct4x4dc = x264_dct4x4dc_mmx2; + dctf->dct2x4dc = x264_dct2x4dc_mmx2; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2; } diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 004014fe..454f53f1 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -209,6 +209,78 @@ cglobal idct4x4dc, 1,1 RET %endif ; HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) 
+;-----------------------------------------------------------------------------
+%if WIN64
+    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
+%else
+    DECLARE_REG_TMP 2
+%endif
+
+%macro INSERT_COEFF 3 ; dst, src, imm: place the coefficient at src into element imm of dst, then overwrite src with t0
+    %if %3
+        %if HIGH_BIT_DEPTH
+            %if cpuflag(sse4)
+                pinsrd %1, %2, %3 ; SSE4: insert the dword directly at index imm
+            %elif %3 == 2
+                movd m2, %2 ; no SSE4, imm==2: park the value in m2; the imm==3 case below merges it into dst
+            %elif %3 == 1
+                punpckldq %1, %2 ; imm==1: dword interleave puts src at dword 1 of dst
+            %else
+                punpckldq m2, %2 ; imm==3: m2 low qword = { parked imm==2 value, src }
+                punpcklqdq %1, m2 ; dst high qword = m2 low qword, filling elements 2 and 3 (relies on imm==2 being issued first)
+            %endif
+        %else
+            %if %3 == 2
+                punpckldq %1, %2 ; dword interleave: src's low word lands at word 2 of dst
+            %else
+                pinsrw %1, %2, %3 ; insert the word at index imm (used for imm 1 and 3)
+            %endif
+        %endif
+    %else
+        movd %1, %2 ; imm==0: start dst from src
+    %endif
+    %if HIGH_BIT_DEPTH
+        mov %2, t0d ; store t0 over the 32-bit source coefficient (DCT2x4DC zeroes t0 before the first use)
+    %else
+        mov %2, t0w ; store t0 over the 16-bit source coefficient
+    %endif
+%endmacro
+
+%macro DCT2x4DC 2 ; %1 = element suffix for SUMSUB_BA/pshuf (d or w), %2 = SBUTTERFLY suffix (dq or wd)
+cglobal dct2x4dc, 2,3 ; r0 is written as the output (dct), r1 is read as the input (dct4x4)
+    xor t0d, t0d ; t0 = 0; INSERT_COEFF stores it back over every coefficient it reads
+    INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0 ; first coefficient of block 0 (blocks are 16 coefficients apart)
+    INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 1
+    add r1, 4*16*SIZEOF_DCTCOEF ; rebase r1 to block 4; the offsets below are relative to it
+    INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1 ; block 2
+    INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3 ; block 3 -> m0 = { b0, b2, b1, b3 }
+    INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 4
+    INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 5
+    INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1 ; block 6
+    INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3 ; block 7 -> m1 = { b4, b6, b5, b7 }
+    SUMSUB_BA %1, 1, 0, 2 ; three SUMSUB_BA stages with SBUTTERFLY interleaves between them; both macros are defined outside this hunk
+    SBUTTERFLY %2, 1, 0, 2
+    SUMSUB_BA %1, 0, 1, 2
+    SBUTTERFLY %2, 0, 1, 2
+    SUMSUB_BA %1, 1, 0, 2
+    pshuf%1 m0, m0, q1032 ; swap the low and high halves of m0 before storing
+    mova [r0], m1 ; first mmsize bytes of the output
+    mova [r0+mmsize], m0 ; second mmsize bytes of the output
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+DCT2x4DC d, dq ; high bit-depth: dword elements, XMM registers
+INIT_XMM avx
+DCT2x4DC d, dq
+%else
+INIT_MMX mmx2
+DCT2x4DC w, wd ; 8-bit: word elements, MMX registers
+%endif
+
 %if HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 35b33845..ded790ff 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -77,6 +77,10 @@ void x264_idct4x4dc_mmx ( int16_t d[16] );
 void x264_idct4x4dc_sse2 ( int32_t d[16] );
 void x264_idct4x4dc_avx ( int32_t d[16] );
 
+void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
+void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
+void 
x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] ); + void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); -- 2.39.5