2 ; * Provide SIMD optimizations for transform_add functions for HEVC decoding
3 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
24 max_pixels_10: times 16 dw ((1 << 10)-1)
29 ; The TR_ADD macros and functions were largely inspired by the x264 project's code in the h264_idct.asm file.
30 %macro TR_ADD_MMX_4_8 0
54 ; void ff_hevc_transform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
55 cglobal hevc_transform_add4_8, 3, 4, 6
62 %macro TR_ADD_SSE_8_8 0
91 %macro TR_ADD_SSE_16_32_8 3
95 vinserti128 m2, m2, [r1+%1+32], 1
96 vinserti128 m6, m6, [r1+%1+48], 1
110 mova xm4, [r1+%1+mmsize*2 ]
111 mova xm6, [r1+%1+mmsize*2+16]
113 vinserti128 m4, m4, [r1+%1+96 ], 1
114 vinserti128 m6, m6, [r1+%1+112], 1
137 %macro TRANSFORM_ADD_8 0
138 ; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
139 cglobal hevc_transform_add8_8, 3, 4, 8
147 ; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
148 cglobal hevc_transform_add16_8, 3, 4, 7
151 TR_ADD_SSE_16_32_8 0, r0, r0+r2
152 TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
156 TR_ADD_SSE_16_32_8 0, r0, r0+r2
157 TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
161 ; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
162 cglobal hevc_transform_add32_8, 3, 4, 7
164 TR_ADD_SSE_16_32_8 0, r0, r0+16
165 TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
169 TR_ADD_SSE_16_32_8 0, r0, r0+16
170 TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
180 %if HAVE_AVX2_EXTERNAL
182 ; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
183 cglobal hevc_transform_add32_8, 3, 4, 7
186 TR_ADD_SSE_16_32_8 0, r0, r0+r2
187 TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
191 TR_ADD_SSE_16_32_8 0, r0, r0+r2
192 TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
197 ;-----------------------------------------------------------------------------
198 ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
199 ;-----------------------------------------------------------------------------
200 %macro TR_ADD_SSE_8_10 4
219 %macro TR_ADD_MMX4_10 3
230 %macro TRANS_ADD_SSE_16_10 3
249 %macro TRANS_ADD_SSE_32_10 2
269 %macro TRANS_ADD16_AVX2 4
290 %macro TRANS_ADD32_AVX2 3
313 cglobal hevc_transform_add4_10,3,4, 6
315 mova m3, [max_pixels_10]
316 TR_ADD_MMX4_10 r0, r2, r1
319 TR_ADD_MMX4_10 r0, r2, r1
322 ;-----------------------------------------------------------------------------
323 ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
324 ;-----------------------------------------------------------------------------
326 cglobal hevc_transform_add8_10,3,4,6
328 mova m5, [max_pixels_10]
331 TR_ADD_SSE_8_10 r0, r2, r3, r1
334 TR_ADD_SSE_8_10 r0, r2, r3, r1
337 cglobal hevc_transform_add16_10,3,4,6
339 mova m5, [max_pixels_10]
341 TRANS_ADD_SSE_16_10 r0, r2, r1
345 TRANS_ADD_SSE_16_10 r0, r2, r1
349 cglobal hevc_transform_add32_10,3,4,6
351 mova m5, [max_pixels_10]
353 TRANS_ADD_SSE_32_10 r0, r1
357 TRANS_ADD_SSE_32_10 r0, r1
361 %if HAVE_AVX2_EXTERNAL
364 cglobal hevc_transform_add16_10,3,4,6
366 mova m5, [max_pixels_10]
369 TRANS_ADD16_AVX2 r0, r2, r3, r1
373 TRANS_ADD16_AVX2 r0, r2, r3, r1
377 cglobal hevc_transform_add32_10,3,4,6
379 mova m5, [max_pixels_10]
381 TRANS_ADD32_AVX2 r0, r2, r1
385 TRANS_ADD32_AVX2 r0, r2, r1
388 %endif ;HAVE_AVX2_EXTERNAL