From: Henrik Gramner
Date: Sat, 20 Feb 2016 19:31:22 +0000 (+0100)
Subject: x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=23d1d8e89be2d99f5c6924a6055fc80d69429503;p=x264

x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)

Only used in 4:2:2. Both 8-bit and high bit-depth implemented.

Approximate performance improvement compared to C on Ivy Bridge:

                        x86-32  x86-64
idct_dequant_2x4_dc       2.1x    1.7x
idct_dequant_2x4_dconly   2.7x    2.0x

Helps more on 32-bit due to the C versions being register starved.
---

diff --git a/common/quant.c b/common/quant.c
index 75325d0d..312f7cd9 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -486,6 +486,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->dequant_4x4 = x264_dequant_4x4_sse2;
         pf->dequant_8x8 = x264_dequant_8x8_sse2;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
         pf->denoise_dct = x264_denoise_dct_sse2;
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
@@ -532,6 +534,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     }
     if( cpu&X264_CPU_AVX )
     {
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
         pf->denoise_dct = x264_denoise_dct_avx;
     }
     if( cpu&X264_CPU_XOP )
@@ -618,6 +622,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
         }
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
         pf->denoise_dct = x264_denoise_dct_sse2;
         pf->decimate_score15 = x264_decimate_score15_sse2;
@@ -680,6 +686,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_avx;
             pf->dequant_8x8 = x264_dequant_8x8_avx;
         }
+        pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+        pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
         pf->denoise_dct = x264_denoise_dct_avx;
     }
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index f7c7cc03..2dc02493 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -829,6 +829,150 @@ INIT_YMM avx2
 DEQUANT_DC w, pmullw
 %endif
 
+%macro PEXTRW 4
+    %if cpuflag(sse4)
+        pextrw %1, %2, %3
+    %else
+        ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
+        %if %3
+            pextrw %4d, %2, %3
+        %else
+            movd %4d, %2
+        %endif
+        mov %1, %4w
+    %endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+;-----------------------------------------------------------------------------
+
+%macro DEQUANT_2x4_DC 1
+%ifidn %1, dconly
+    DECLARE_REG_TMP 6,3,2
+    %define %%args dct, dmf, qp
+%else
+    DECLARE_REG_TMP 6,4,3
+    %define %%args dct, dct4x4, dmf, qp
+%endif
+
+%if ARCH_X86_64 == 0
+    DECLARE_REG_TMP 2,0,1
+%endif
+
+cglobal idct_dequant_2x4_%1, 0,3,5, %%args
+    movifnidn t2d, qpm
+    imul  t0d, t2d, 0x2b
+    shr   t0d, 8          ; qp / 6
+    lea   t1d, [t0*5]
+    sub   t2d, t0d
+    sub   t2d, t1d        ; qp % 6
+    shl   t2d, 6          ; 16 * sizeof(int)
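+    ; Two divisionless tricks here: qp/6 is computed above as (qp*0x2b)>>8,
+    ; i.e. multiplication by a fixed-point reciprocal of 6 (exact for
+    ; 0 <= qp < 128, well beyond the valid QP range), and qp%6 follows as
+    ; qp - 5*(qp/6) - qp/6 via the lea/sub pair. The imul by -0xffff below
+    ; computes dmf - (dmf<<16), packing dmf into the low word and -dmf into
+    ; the high word of a single GPR in one instruction, ready to be splatted
+    ; into the +/- multiplier pattern that lets pmaddwd fuse the final
+    ; butterfly stage with the dequant multiply.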
+%if ARCH_X86_64
+    imul  t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
+%else
+    mov   dctq, dctmp
+    add   t2, dmfmp
+    imul  t2d, [t2], -0xffff
+%endif
+%if HIGH_BIT_DEPTH
+    mova  m0, [dctq]
+    mova  m1, [dctq+16]
+    SUMSUB_BA d, 1, 0, 2    ; 16-bit intermediate precision is enough for the first two sumsub steps,
+    packssdw   m1, m0       ; and by packing to words we can use pmaddwd instead of pmulld later.
+%else
+    movq  m0, [dctq]
+    movq  m1, [dctq+8]
+    SUMSUB_BA w, 1, 0, 2
+    punpcklqdq m1, m0       ; a0 a1 a2 a3 a4 a5 a6 a7
+%endif
+    pshufd     m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
+    movd       m3, t2d
+    pshuflw    m3, m3, q1000 ; +  +  +  -
+    SUMSUB_BA  w, 0, 1, 2
+    punpcklqdq m3, m3        ; +  +  +  -  +  +  +  -
+    pshufd     m1, m1, q0022
+    sub        t0d, 6
+    jl .rshift
+    movd       m2, t0d
+    psllw      m3, m2
+    pmaddwd    m0, m3
+    pmaddwd    m1, m3
+    jmp .end
+.rshift:
+    neg        t0d
+    movd       m2, t0d
+    pcmpeqd    m4, m4
+    pmaddwd    m0, m3
+    pmaddwd    m1, m3
+    pslld      m4, m2
+    psrad      m4, 1
+    psubd      m0, m4        ; + (1 << (5-qp/6))
+    psubd      m1, m4
+    psrad      m0, m2
+    psrad      m1, m2
+.end:
+%ifidn %1, dconly
+%if HIGH_BIT_DEPTH
+    mova  [dctq], m0
+    mova  [dctq+16], m1
+%else
+    packssdw m0, m1
+    mova  [dctq], m0
+%endif
+%else
+    movifnidn dct4x4q, dct4x4mp
+%if HIGH_BIT_DEPTH
+    movd   [dct4x4q+0*64], m0
+%if cpuflag(sse4)
+    pextrd [dct4x4q+1*64], m0, 1
+    add    dct4x4q, 4*64
+    pextrd [dct4x4q-2*64], m0, 2
+    pextrd [dct4x4q-1*64], m0, 3
+    movd   [dct4x4q+0*64], m1
+    pextrd [dct4x4q+1*64], m1, 1
+    pextrd [dct4x4q+2*64], m1, 2
+    pextrd [dct4x4q+3*64], m1, 3
+%else
+    MOVHL  m2, m0
+    psrlq  m0, 32
+    movd   [dct4x4q+1*64], m0
+    add    dct4x4q, 4*64
+    movd   [dct4x4q-2*64], m2
+    psrlq  m2, 32
+    movd   [dct4x4q-1*64], m2
+    movd   [dct4x4q+0*64], m1
+    MOVHL  m2, m1
+    psrlq  m1, 32
+    movd   [dct4x4q+1*64], m1
+    movd   [dct4x4q+2*64], m2
+    psrlq  m2, 32
+    movd   [dct4x4q+3*64], m2
+%endif
+%else
+    PEXTRW [dct4x4q+0*32], m0, 0, eax
+    PEXTRW [dct4x4q+1*32], m0, 2, eax
+    PEXTRW [dct4x4q+2*32], m0, 4, eax
+    PEXTRW [dct4x4q+3*32], m0, 6, eax
+    add    dct4x4q, 4*32
+    PEXTRW [dct4x4q+0*32], m1, 0, eax
+    PEXTRW [dct4x4q+1*32], m1, 2, eax
+    PEXTRW [dct4x4q+2*32], m1, 4, eax
+    PEXTRW [dct4x4q+3*32], m1, 6, eax
+%endif
+%endif
+    RET
+%endmacro
+
+; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
+INIT_XMM sse2
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+INIT_XMM avx
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+
 ; t4 is eax for return value.
 %if ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 91a4dcb0..c8c4c867 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -72,6 +72,10 @@ void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i
 void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
 int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
 int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
 int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
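---

For reference, a minimal C model of the math the new asm implements. This is
a sketch rather than x264's actual C fallback: the function name is
hypothetical, the output ordering is the one produced by the shuffles in the
asm above, and the rounded right shift assumes arithmetic >> on negative
operands, which is what psrad provides. It computes a 2x4 Hadamard over the
eight chroma DC coefficients, then dequantizes by dequant_mf[qp%6][0] with a
left shift when qp/6 >= 6 or a rounded right shift otherwise.

    #include <stdint.h>

    typedef int32_t dctcoef; /* high bit-depth build; int16_t for 8-bit */

    static void idct_dequant_2x4_dconly_model( dctcoef dct[8],
                                               int dequant_mf[6][16], int i_qp )
    {
        int s[4], t[4], b[8];
        /* stage 1: butterflies between the two halves of the DC block */
        for( int i = 0; i < 4; i++ )
        {
            s[i] = dct[i] + dct[i+4];
            t[i] = dct[i] - dct[i+4];
        }
        /* stages 2+3: 4-point Hadamard on each half; in the asm, pmaddwd
         * with the (+ + + -) dmf pattern performs the last of these stages
         * and the dequant multiply in a single instruction */
        b[0] = s[0]+s[1]+s[2]+s[3]; b[1] = s[0]-s[1]+s[2]-s[3];
        b[2] = t[0]+t[1]+t[2]+t[3]; b[3] = t[0]-t[1]+t[2]-t[3];
        b[4] = t[0]+t[1]-t[2]-t[3]; b[5] = t[0]-t[1]-t[2]+t[3];
        b[6] = s[0]+s[1]-s[2]-s[3]; b[7] = s[0]-s[1]-s[2]+s[3];

        int dmf = dequant_mf[i_qp%6][0];
        int shift = i_qp/6 - 6;
        for( int i = 0; i < 8; i++ )
        {
            if( shift >= 0 ) /* qp/6 >= 6: plain left shift, no rounding */
                dct[i] = b[i] * (dmf << shift);
            else             /* qp/6 < 6: rounded arithmetic right shift */
                dct[i] = (b[i] * dmf + (1 << (-shift-1))) >> -shift;
        }
    }

The dc variant computes the same eight values but scatters them to
dct4x4[0][0] through dct4x4[7][0], one DC per 4x4 block (64 bytes apart in
high bit depth, 32 bytes in 8-bit), which is what the pextrd/PEXTRW store
sequences at the end of the macro do.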