;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

section .text

; Private helper shared by IDCT_DC and IDCT_DC_NL.
; Computes the DC value from the first coefficient and broadcasts it to every
; 16-bit lane of m0:
;     dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; The shift is arithmetic (sar) on the sign-extended coefficient.
; %1 = bitdepth
; Expects the caller's cglobal to have declared "coeff" and "tmp".
; Clobbers tmpd, m0 and flags.
%macro HEVC_IDCT_DC_SPLAT 1
    movsx             tmpd, word [coeffq]       ; sign-extend coeffs[0]
    add               tmpd, (1 << (14 - %1)) + 1
    sar               tmpd, (15 - %1)
    movd               xm0, tmpd
    SPLATW              m0, xm0                 ; replicate the low word into all lanes
%endmacro

; void ff_hevc_idct_NxN_dc_{8,10}_{mmxext,sse2,avx2}(int16_t *coeffs)
; DC-only inverse transform: overwrites the NxN coefficient block with the
; rounded DC value. Looping variant: every iteration stores 8 vectors
; (8 * mmsize bytes).
; %1 = N (block is NxN)
; %2 = number of loops; the instantiations in this file size it so that
;      %2 * 8 * mmsize equals the N * N * 2 bytes of the block
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    HEVC_IDCT_DC_SPLAT  %3
    DEFINE_ARGS coeff, cnt                      ; tmp is dead, reuse its register as the counter
    mov               cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    add  coeffq, mmsize*8                       ; advance first, the second half is addressed backwards
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec  cntd
    jg  .loop                                   ; repeat while the counter is still positive
    RET
%endmacro

; Same operation as IDCT_DC, fully unrolled for blocks small enough to need no
; loop: stores 4 vectors, or 8 vectors when mmsize == 16.
; %1 = N (block is NxN)
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    HEVC_IDCT_DC_SPLAT  %2
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; 8-bit
INIT_MMX mmxext
IDCT_DC_NL  4,      8
IDCT_DC     8,  2,  8

INIT_XMM sse2
IDCT_DC_NL  8,      8
IDCT_DC    16,  4,  8
IDCT_DC    32, 16,  8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2,  8
IDCT_DC    32,  8,  8
%endif ;HAVE_AVX2_EXTERNAL

; 10-bit
INIT_MMX mmxext
IDCT_DC_NL  4,     10
IDCT_DC     8,  2, 10

INIT_XMM sse2
IDCT_DC_NL  8,     10
IDCT_DC    16,  4, 10
IDCT_DC    32, 16, 10

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2, 10
IDCT_DC    32,  8, 10
%endif ;HAVE_AVX2_EXTERNAL