1 ; *****************************************************************************
2 ; * Provide SIMD optimizations for add_residual functions for HEVC decoding
3 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ; ******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Clamp constant for 10-bit output: pw_1023 is a packed-word table (1023 = 2^10-1
; per lane) defined elsewhere; it is loaded below via mova m_, [max_pixels_10].
%define max_pixels_10 pw_1023
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
; 8-bit bit depth, 4x4 block, MMX register width; takes no macro arguments.
%macro ADD_RES_MMX_4_8 0
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; x86inc cglobal signature: 3 C args, 3 GPRs used, 6 vector registers used.
cglobal hevc_add_residual_4_8, 3, 3, 6
; 8-bit bit depth, 8-wide rows, SSE register width; takes no macro arguments.
%macro ADD_RES_SSE_8_8 0
; One pass of the 16x16 / 32x32 8-bit add:
;   %1     = byte offset into the int16 residual buffer (r1)
;   %2, %3 = destination addresses for the rows handled this pass
%macro ADD_RES_SSE_16_32_8 3
; vinserti128 fills the high 128-bit lane with the next 16 coefficients,
; so each ymm register covers two 16-byte groups of residuals.
vinserti128 m2, m2, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
; second register pair: low lanes loaded 128 bits at a time
; (offsets scale with mmsize, so this also serves the 128-bit build)
mova xm4, [r1+%1+mmsize*2]
mova xm6, [r1+%1+mmsize*2+16]
vinserti128 m4, m4, [r1+%1+96 ], 1
vinserti128 m6, m6, [r1+%1+112], 1
; Instantiates the 8-bit add_residual entry points for the current ISA
; (<opt> suffix comes from the INIT_* invocation active at expansion time).
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
; 16-wide: each pass adds one 64-byte slice of residuals to a pair of rows.
; NOTE(review): r3 appears to hold a multiple of the stride (r2) set up in
; lines not shown here -- confirm against the full file.
ADD_RES_SSE_16_32_8 0, r0, r0+r2
ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
; 32-wide: one row is processed as two 16-pixel halves (r0 and r0+16).
ADD_RES_SSE_16_32_8 0, r0, r0+16
ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
%if HAVE_AVX2_EXTERNAL
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
; AVX2 variant: note the 128-byte residual step per pass (twice the 64-byte
; SSE step above), matching the doubled register width.
cglobal hevc_add_residual_32_8, 3, 5, 7
ADD_RES_SSE_16_32_8 0, r0, r0+r2
ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
%endif ;HAVE_AVX2_EXTERNAL
; 10-bit bit-depth helper macros. Judging by the call sites further down,
; the arguments are destination/stride GPRs followed by the residual
; pointer (r1); bodies are defined between these headers.
%macro ADD_RES_SSE_8_10 4
%macro ADD_RES_MMX_4_10 3
%macro ADD_RES_SSE_16_10 3
%macro ADD_RES_SSE_32_10 2
%macro ADD_RES_AVX2_16_10 4
%macro ADD_RES_AVX2_32_10 3
; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
cglobal hevc_add_residual_4_10, 3, 3, 6
; m3 = per-lane clamp value (1023) used by the macro to saturate results
mova m3, [max_pixels_10]
ADD_RES_MMX_4_10 r0, r2, r1
; second half of the 4x4 block; r0/r1 are presumably advanced between the
; two expansions in lines not shown here -- TODO confirm
ADD_RES_MMX_4_10 r0, r2, r1
cglobal hevc_add_residual_8_10, 3, 4, 6
mova m5, [max_pixels_10] ; clamp constant for 10-bit saturation
ADD_RES_SSE_8_10 r0, r2, r3, r1
ADD_RES_SSE_8_10 r0, r2, r3, r1
cglobal hevc_add_residual_16_10, 3, 5, 6
mova m5, [max_pixels_10]
ADD_RES_SSE_16_10 r0, r2, r1
cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
ADD_RES_SSE_32_10 r0, r1
%if HAVE_AVX2_EXTERNAL
; AVX2 variants of the 10-bit 16x16 and 32x32 entry points; same contract as
; the SSE versions above.
cglobal hevc_add_residual_16_10, 3, 5, 6
mova m5, [max_pixels_10] ; clamp constant (1023), as in the SSE versions
ADD_RES_AVX2_16_10 r0, r2, r3, r1
cglobal hevc_add_residual_32_10, 3, 5, 6
mova m5, [max_pixels_10]
ADD_RES_AVX2_32_10 r0, r2, r1
%endif ;HAVE_AVX2_EXTERNAL