X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fh264_idct_10bit.asm;h=b7d51051d3eb0d9ed128737c9cca2cea0d06f92e;hb=e44b58924fe7b180bf8b0c277c15d1a58210a0e9;hp=ad923f94f6a3b6f625c3040517e64cff5a0b5a61;hpb=6860b4081d046558c44b1b42f22022ea341a2a73;p=ffmpeg diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index ad923f94f6a..b7d51051d3e 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -22,7 +22,7 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** -%include "x86util.asm" +%include "libavutil/x86/x86util.asm" SECTION_RODATA @@ -32,7 +32,7 @@ pd_32: times 4 dd 32 SECTION .text ;----------------------------------------------------------------------------- -; void h264_idct_add(pixel *dst, dctcoef *block, int stride) +; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- %macro STORE_DIFFx2 6 psrad %1, 6 @@ -66,6 +66,10 @@ SECTION .text paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [%2+ 0], m5 + mova [%2+16], m5 + mova [%2+32], m5 + mova [%2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, %1, %3 lea %1, [%1+%3*2] STORE_DIFFx2 m2, m3, m4, m5, %1, %3 @@ -79,13 +83,13 @@ cglobal h264_idct_add_10, 3,3 INIT_XMM sse2 IDCT_ADD_10 -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD_10 -%endif ;----------------------------------------------------------------------------- -; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) +; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- ;;;;;;; NO FATE SAMPLES TRIGGER THIS %macro ADD4x4IDCT 0 @@ -100,6 +104,10 @@ add4x4_idct %+ SUFFIX: paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 + mova [r2+ 0], m5 + mova [r2+16], m5 + mova [r2+32], m5 + mova [r2+48], m5 STORE_DIFFx2 m0, m1, m4, m5, r5, r3 lea r5, [r5+r3*2] STORE_DIFFx2 m2, m3, m4, m5, r5, r3 @@ -109,11 +117,9 @@ add4x4_idct %+ SUFFIX: INIT_XMM sse2 ALIGN 16 ADD4x4IDCT -%if HAVE_AVX_EXTERNAL INIT_XMM avx ALIGN 16 ADD4x4IDCT -%endif %macro ADD16_OP 2 cmp byte [r4+%2], 0 @@ -149,13 +155,11 @@ cglobal h264_idct_add16_10, 5,6 INIT_XMM sse2 IDCT_ADD16_10 -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD16_10 -%endif ;----------------------------------------------------------------------------- -; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) +; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- %macro IDCT_DC_ADD_OP_10 3 pxor m5, m5 @@ -184,9 +188,10 @@ IDCT_ADD16_10 mova [%1+%3 ], m4 %endmacro -INIT_MMX mmx2 +INIT_MMX mmxext cglobal h264_idct_dc_add_10,3,3 movd m0, [r1] + mov dword [r1], 0 paddd m0, [pd_32] psrad m0, 6 lea r1, [r2*3] @@ -196,14 +201,14 @@ cglobal h264_idct_dc_add_10,3,3 RET ;----------------------------------------------------------------------------- -; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) +; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- %macro IDCT8_DC_ADD 0 -cglobal h264_idct8_dc_add_10,3,3,7 - mov r1d, [r1] - add r1, 32 - sar r1, 6 - movd m0, r1d +cglobal h264_idct8_dc_add_10,3,4,7 + movd m0, [r1] + mov dword[r1], 0 + paddd m0, [pd_32] + psrad m0, 6 lea r1, [r2*3] SPLATW m0, m0, 0 mova m6, [pw_pixel_max] @@ -215,13 +220,13 @@ cglobal h264_idct8_dc_add_10,3,3,7 INIT_XMM sse2 IDCT8_DC_ADD -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_DC_ADD -%endif ;----------------------------------------------------------------------------- -; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) +; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- %macro AC 1 .ac%1: @@ -255,6 +260,8 @@ idct_dc_add %+ SUFFIX: add r5, r0 movq m0, [r2+ 0] movhps m0, [r2+64] + mov dword [r2+ 0], 0 + mov dword [r2+64], 0 paddd m0, [pd_32] psrad m0, 6 pshufhw m0, m0, 0 @@ -286,14 +293,14 @@ cglobal h264_idct_add16intra_10,5,7,8 INIT_XMM sse2 IDCT_ADD16INTRA_10 -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD16INTRA_10 -%endif %assign last_block 36 ;----------------------------------------------------------------------------- -; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) +; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- %macro IDCT_ADD8 0 cglobal h264_idct_add8_10,5,8,7 @@ -323,13 +330,11 @@ cglobal h264_idct_add8_10,5,8,7 INIT_XMM sse2 IDCT_ADD8 -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT_ADD8 -%endif ;----------------------------------------------------------------------------- -; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) +; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride) ;----------------------------------------------------------------------------- %macro IDCT8_1D 2 SWAP 0, 1 @@ -473,6 +478,22 @@ h264_idct8_add1_10 %+ SUFFIX: packssdw m8, m0 paddsw m8, [r0] pxor m0, m0 + mova [r1+ 0], m0 + mova [r1+ 16], m0 + mova [r1+ 32], m0 + mova [r1+ 48], m0 + mova [r1+ 64], m0 + mova [r1+ 80], m0 + mova [r1+ 96], m0 + mova [r1+112], m0 + mova [r1+128], m0 + mova [r1+144], m0 + mova [r1+160], m0 + mova [r1+176], m0 + mova [r1+192], m0 + mova [r1+208], m0 + mova [r1+224], m0 + mova [r1+240], m0 CLIPW m8, m0, [pw_pixel_max] mova [r0], m8 mova m8, [pw_pixel_max] @@ -492,6 +513,22 @@ h264_idct8_add1_10 %+ SUFFIX: lea r3, [r0+8] IDCT8_ADD_SSE_END r0, rsp, r2 IDCT8_ADD_SSE_END r3, rsp+16, r2 + mova [r1+ 0], m7 + mova [r1+ 16], m7 + mova [r1+ 32], m7 + mova [r1+ 48], m7 + mova [r1+ 64], m7 + mova [r1+ 80], m7 + mova [r1+ 96], m7 + mova [r1+112], m7 + mova [r1+128], m7 + mova [r1+144], m7 + mova [r1+160], m7 + mova [r1+176], m7 + mova [r1+192], m7 + mova [r1+208], m7 + mova [r1+224], m7 + mova [r1+240], m7 %endif ; ARCH_X86_64 add rsp, pad @@ -500,13 +537,13 @@ h264_idct8_add1_10 %+ SUFFIX: INIT_XMM sse2 IDCT8_ADD -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_ADD -%endif ;----------------------------------------------------------------------------- -; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) +; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset, +; int16_t *block, int stride, +; const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- ;;;;;;; NO FATE SAMPLES TRIGGER THIS %macro IDCT8_ADD4_OP 2 @@ -540,7 +577,5 @@ cglobal h264_idct8_add4_10, 0,7,16 INIT_XMM sse2 IDCT8_ADD4 -%if HAVE_AVX_EXTERNAL INIT_XMM avx IDCT8_ADD4 -%endif