2 ; * Provide SSE & MMX idct functions for HEVC decoding
3 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; 16 words, each (1 << 10) - 1 = 1023, the largest value that fits in 10 bits.
; The 10-bit functions in this file load this table into m6.
24 max_pixels_10: times 16 dw ((1 << 10)-1)
; 4 dwords, each (1 << (14 - 10)) + 1 = 17; in NASM expressions '-' binds
; tighter than '<<', so "1 << 14-10" is 1 << 4.
25 dc_add_10: times 4 dd ((1 << 14-10) + 1)
30 ; The idct_dc_add macros and functions were largely inspired by the x264 project's code in the h264_idct.asm file.
33 add %1w, ((1 << 14-8) + 1)
; DC_ADD_INIT_AVX2 takes two macro parameters (%1, %2).
; Steps visible here: add (1 << (14 - 8)) + 1 = 65 to the 16-bit view of %1,
; then replicate the low word of xm0 into every word lane of m0.
; NOTE(review): how %1 reaches xm0 and how %2 is used is not visible in this
; excerpt - confirm against the full macro body.
44 %macro DC_ADD_INIT_AVX2 2
; %1w += 65
45 add %1w, ((1 << 14-8) + 1)
; m0 = word 0 of xm0 broadcast to all word lanes
48 vpbroadcastw m0, xm0 ;SPLATW
76 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Declared with 3 arguments, 4 general-purpose registers and a vector-register
; count of 0 (x86inc cglobal operand order: name, args, GPRs, vector regs).
78 cglobal hevc_idct4_dc_add_8, 3, 4, 0
; Single DC_ADD_OP pass using movh, given r0 (first argument, dst),
; r2 (third argument, stride) and r3.
; NOTE(review): the setup of r3 is not visible here - confirm what it holds.
81 DC_ADD_OP movh, r0, r2, r3
84 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Declared with 3 arguments, 4 general-purpose registers and a vector-register
; count of 0 (x86inc cglobal operand order: name, args, GPRs, vector regs).
85 cglobal hevc_idct8_dc_add_8, 3, 4, 0
; Two DC_ADD_OP passes using mova, each given r0 (first argument, dst),
; r2 (third argument, stride) and r3.
; NOTE(review): whatever advances r0 between the two passes is not visible
; here - confirm against the full function body.
88 DC_ADD_OP mova, r0, r2, r3
90 DC_ADD_OP mova, r0, r2, r3
93 ; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Variant declared with only 2 arguments loaded and 3 general-purpose registers
; (x86inc cglobal operand order: name, args, GPRs, vector regs).
; NOTE(review): this defines the same symbol as the 3-argument variant earlier
; in the file; presumably the two sit in opposite branches of a conditional
; that is not visible here - confirm.
94 cglobal hevc_idct4_dc_add_8, 2, 3, 0
; Single DC_ADD_OP pass using movh. r1 and r2 occupy the operand positions
; that the 3-argument variant fills with r2 and r3.
; NOTE(review): the instructions that prepare r1 and r2 are not visible here.
98 DC_ADD_OP movh, r0, r1, r2
101 ; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Variant declared with only 2 arguments loaded and 3 general-purpose registers
; (x86inc cglobal operand order: name, args, GPRs, vector regs).
; NOTE(review): same symbol as the 3-argument variant earlier in the file;
; presumably selected by a conditional that is not visible here - confirm.
102 cglobal hevc_idct8_dc_add_8, 2, 3, 0
; Two DC_ADD_OP passes using mova. r1 and r2 occupy the operand positions
; that the 3-argument variant fills with r2 and r3.
; NOTE(review): the setup of r1/r2 and the update of r0 between passes are
; not visible here.
106 DC_ADD_OP mova, r0, r1, r2
108 DC_ADD_OP mova, r0, r1, r2
114 ; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Declared with 3 arguments, 4 general-purpose registers and a vector-register
; count of 6 (x86inc cglobal operand order: name, args, GPRs, vector regs).
115 cglobal hevc_idct16_dc_add_8, 3, 4, 6
; Four DC_ADD_OP passes using mova, each given r0 (first argument, dst),
; r2 (third argument, stride) and r3.
; NOTE(review): the pointer updates between passes are not visible here -
; confirm against the full function body.
118 DC_ADD_OP mova, r0, r2, r3
120 DC_ADD_OP mova, r0, r2, r3
122 DC_ADD_OP mova, r0, r2, r3
124 DC_ADD_OP mova, r0, r2, r3
; Assembled only when the build defines HAVE_AVX2_EXTERNAL as non-zero.
127 %if HAVE_AVX2_EXTERNAL
129 ; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
; Declared with 3 arguments, 4 general-purpose registers and a vector-register
; count of 6 (x86inc cglobal operand order: name, args, GPRs, vector regs).
130 cglobal hevc_idct32_dc_add_8, 3, 4, 6
; Set up via the AVX2 init macro with r3 and r2 (third argument, stride).
132 DC_ADD_INIT_AVX2 r3, r2
; DC_ADD_OP is given exactly four operands (move op, r0, r2, r3), the same
; shape as every other DC_ADD_OP call in this file; no trailing comma, so no
; empty fifth operand is handed to the macro.
133 DC_ADD_OP mova, r0, r2, r3
; Second pass with the same operands.
; NOTE(review): the code that advances r0 / repeats this pass is not visible
; here - confirm against the full function body.
136 DC_ADD_OP mova, r0, r2, r3
139 %endif ;HAVE_AVX2_EXTERNAL
140 ;-----------------------------------------------------------------------------
141 ; void ff_hevc_idct4_dc_add_10(pixel *dst, int16_t *block, int stride)
142 ;-----------------------------------------------------------------------------
; IDCT_DC_ADD_OP_10 takes three macro parameters:
;   %1 = base address, %2 = offset to the next row, %3 = offset to the fourth row
;   (presumably 3 * %2 - confirm against callers).
; Visible steps: packed 16-bit add of m0 to the four rows at [%1], [%1+%2],
; [%1+%2*2] and [%1+%3], with the sums placed in m1..m4.
; NOTE(review): the rest of the macro (any clamping and the stores) is not
; visible in this excerpt.
143 %macro IDCT_DC_ADD_OP_10 3
; m1 = m0 + row 0
146 paddw m1, m0, [%1+0 ]
; m2 = m0 + row 1
147 paddw m2, m0, [%1+%2 ]
; m3 = m0 + row 2
148 paddw m3, m0, [%1+%2*2]
; m4 = m0 + row at offset %3
149 paddw m4, m0, [%1+%3 ]
; Declared with 3 arguments and 3 general-purpose registers; no vector-register
; count is given (x86inc cglobal operand order: name, args, GPRs, vector regs).
171 cglobal hevc_idct4_dc_add_10,3,3
; r1w += (1 << 4) + 1 = 17, the same value stored in the dc_add_10 table.
; NOTE(review): r1 is the second argument on entry; the instruction that puts
; the value being biased into r1 is not visible here.
173 add r1w, ((1 << 4) + 1)
; m6 = words of 1023 (10-bit maximum) from max_pixels_10.
178 mova m6, [max_pixels_10]
; One four-row pass: base r0, row offset r2, fourth-row offset r1.
; NOTE(review): r1 must have been repurposed before this point; that code is
; not visible here.
179 IDCT_DC_ADD_OP_10 r0, r2, r1
182 ;-----------------------------------------------------------------------------
183 ; void ff_hevc_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
184 ;-----------------------------------------------------------------------------
; IDCT8_DC_ADD takes no parameters; expanding it emits the
; hevc_idct8_dc_add_10 function for whatever instruction set is active at the
; expansion site.
185 %macro IDCT8_DC_ADD 0
; 3 arguments, 4 general-purpose registers, vector-register count of 7
; (x86inc cglobal operand order: name, args, GPRs, vector regs).
186 cglobal hevc_idct8_dc_add_10,3,4,7
; r1w += (1 << 4) + 1 = 17.
188 add r1w, ((1 << 4) + 1)
; m6 = words of 1023 (10-bit maximum) from max_pixels_10.
193 mova m6, [max_pixels_10]
; Two four-row passes: base r0, row offset r2, fourth-row offset r1.
; NOTE(review): the update of r0 between the passes and the setup of r1 are
; not visible in this excerpt.
194 IDCT_DC_ADD_OP_10 r0, r2, r1
196 IDCT_DC_ADD_OP_10 r0, r2, r1
; Conditional section for builds with external AVX support.
; NOTE(review): the body and the closing %endif of this conditional are not
; visible in this excerpt.
202 %if HAVE_AVX_EXTERNAL
; Conditional section for builds with external AVX2 support.
207 %if HAVE_AVX2_EXTERNAL
; 3 arguments, 4 general-purpose registers, vector-register count of 7
; (x86inc cglobal operand order: name, args, GPRs, vector regs).
209 cglobal hevc_idct16_dc_add_10,3,4,7
; r1w += (1 << 4) + 1 = 17.
211 add r1w, ((1 << 4) + 1)
; m0 = word 0 of xm0 broadcast to all word lanes.
215 vpbroadcastw m0, xm0 ;SPLATW
; m6 = words of 1023 (10-bit maximum) from max_pixels_10.
216 mova m6, [max_pixels_10]
; Four four-row passes: base r0, row offset r2, fourth-row offset r1.
; NOTE(review): the updates of r0 between passes and the setup of r1 are not
; visible in this excerpt.
217 IDCT_DC_ADD_OP_10 r0, r2, r1
219 IDCT_DC_ADD_OP_10 r0, r2, r1
221 IDCT_DC_ADD_OP_10 r0, r2, r1
223 IDCT_DC_ADD_OP_10 r0, r2, r1
; NOTE(review): the trailing label names HAVE_AVX_EXTERNAL, but the nearest
; visible opener is %if HAVE_AVX2_EXTERNAL - confirm which conditional this
; closes and whether the label should read HAVE_AVX2_EXTERNAL.
225 %endif ;HAVE_AVX_EXTERNAL