From f61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6 Mon Sep 17 00:00:00 2001 From: James Darnley Date: Thu, 16 Mar 2017 14:59:48 +0100 Subject: [PATCH] avcodec/h264: add avx 8-bit h264_idct_add MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Haswell: - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext Skylake-U: - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext --- libavcodec/x86/h264_idct.asm | 33 ++++++++++++++++++++++++++++++++- libavcodec/x86/h264dsp_init.c | 3 +++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index be15afb7665..81fe793600d 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -65,7 +65,15 @@ SECTION .text IDCT4_1D w, 0, 1, 2, 3, 4, 5 mova m6, [pw_32] - TRANSPOSE4x4W 0, 1, 2, 3, 4 + %if mmsize == 8 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + %else + punpcklwd m0, m1 + punpcklwd m2, m3 + SBUTTERFLY dq, 0, 2, 4 + MOVHL m1, m0 + MOVHL m3, m2 + %endif paddw m0, m6 IDCT4_1D w, 0, 1, 2, 3, 4, 5 pxor m7, m7 @@ -1131,3 +1139,26 @@ INIT_MMX mmx IDCT_DC_DEQUANT 0 INIT_MMX sse2 IDCT_DC_DEQUANT 7 + +INIT_XMM avx + +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride + movd %3, [%7] + movd %4, [%7+%8] + psraw %1, %6 + psraw %2, %6 + punpcklbw %3, %5 + punpcklbw %4, %5 + paddw %3, %1 + paddw %4, %2 + packuswb %3, %5 + packuswb %4, %5 + movd [%7], %3 + movd [%7+%8], %4 +%endmacro + +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ + movsxdifnidn stride_q, stride_d + IDCT4_ADD dst_q, block_q, stride_q +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 0643b373627..8ba085f5e88 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ int stride); IDCT_ADD_FUNC(, 8, mmx) +IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) IDCT_ADD_FUNC(_dc, 8, mmxext) IDCT_ADD_FUNC(_dc, 10, mmxext) @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; } + + c->h264_idct_add = ff_h264_idct_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.39.2