; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; *****************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define max_pixels_10 pw_1023
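; pw_1023 is a vector of words holding 1023 = (1 << 10) - 1, the largest
; 10-bit sample value; the 10-bit functions below load it (the
; "mova m3/m5, [max_pixels_10]" lines) and use it as the upper clamp.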
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
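
; All of these functions implement the same operation as the C reference in
; hevcdsp_template.c: for an NxN transform block, each destination pixel is
; replaced by itself plus the corresponding residual, clipped back into the
; valid sample range. Roughly:
;
;     for (y = 0; y < N; y++, dst += stride, res += N)
;         for (x = 0; x < N; x++)
;             dst[x] = av_clip_pixel(dst[x] + res[x]);
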
%macro ADD_RES_MMX_4_8 0
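    ; Adds two 4-pixel rows per invocation using the classic
    ; unpack/add/pack pattern: the dst bytes are zero-extended to words,
    ; added to the 16-bit residuals with signed saturation (paddsw), then
    ; packed back to unsigned bytes (packuswb), which clips to [0, 255].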
INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
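    ; cglobal parameters: 3 function arguments, 3 GPRs, 6 vector registers;
    ; x86inc loads the arguments as r0 = dst, r1 = res, r2 = stride.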
%macro ADD_RES_SSE_8_8 0
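    ; XMM version of the same scheme: one invocation consumes four rows of
    ; 8 residuals (64 bytes at r1) and updates four 8-pixel rows of dst.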
%macro ADD_RES_SSE_16_32_8 3
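    ; Shared by the 16x16 and 32x32 8-bit functions. From the call sites:
    ; %1 = byte offset into the residual buffer, %2/%3 = the two dst
    ; addresses to update (two rows in the 16-wide case, the two 16-pixel
    ; halves of one row in the 32-wide SSE case).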
    vinserti128 m2, m2, [r1+%1+32], 1
    vinserti128 m6, m6, [r1+%1+48], 1
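    ; The vinserti128 pairs (here and below) are assembled only under
    ; %if cpuflag(avx2): they pull the next 32 bytes of residuals into the
    ; upper YMM lanes so each add covers twice as much data.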
    mova        xm4, [r1+%1+mmsize*2]
    mova        xm6, [r1+%1+mmsize*2+16]
    vinserti128 m4, m4, [r1+%1+96 ], 1
    vinserti128 m6, m6, [r1+%1+112], 1
%macro TRANSFORM_ADD_8 0
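    ; Emits the 8x8, 16x16 and 32x32 8-bit functions for whichever
    ; instruction set is active when the macro is instantiated, so each
    ; body below is assembled once per ISA (SSE2 and AVX in this file).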
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7

    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
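    ; Each invocation adds two 16-pixel rows; one row is 16 px * 2 B = 32 B
    ; of residuals, so rows 2-3 start at byte offset 64 and each 4-row pass
    ; consumes 128 B of the residual buffer (presumably r3 = r2*3 here).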
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7

    ADD_RES_SSE_16_32_8  0, r0,    r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
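    ; In the 32-wide SSE case %2/%3 are the left and right 16-pixel halves
    ; of a single row; one row is 32 px * 2 B = 64 B of residuals, hence
    ; the +64 offset for the second row.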
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7

    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
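    ; With YMM registers one invocation spans two full 32-pixel rows
    ; (64 B of residuals each), so the second pair starts at +128.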
%macro ADD_RES_SSE_8_10 4
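    ; 10-bit variants: dst holds uint16_t samples, so there is no
    ; unpack/pack step; residuals are added with paddw and the sums clamped
    ; to [0, max_pixels_10] (CLIPW against a zero register). The call sites
    ; below suggest %1 = dst, %2 = stride, %3 = stride*3, %4 = residuals.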
%macro ADD_RES_MMX_4_10 3

%macro ADD_RES_SSE_16_10 3

%macro ADD_RES_SSE_32_10 2

%macro ADD_RES_AVX2_16_10 4

%macro ADD_RES_AVX2_32_10 3
; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    mova               m3, [max_pixels_10]
    ADD_RES_MMX_4_10   r0, r2, r1
    ADD_RES_MMX_4_10   r0, r2, r1

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    mova               m5, [max_pixels_10]

    ADD_RES_SSE_8_10   r0, r2, r3, r1
    ADD_RES_SSE_8_10   r0, r2, r3, r1

cglobal hevc_add_residual_16_10, 3, 5, 6
    mova               m5, [max_pixels_10]

    ADD_RES_SSE_16_10  r0, r2, r1

cglobal hevc_add_residual_32_10, 3, 5, 6
    mova               m5, [max_pixels_10]

    ADD_RES_SSE_32_10  r0, r1

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    mova               m5, [max_pixels_10]

    ADD_RES_AVX2_16_10 r0, r2, r3, r1

cglobal hevc_add_residual_32_10, 3, 5, 6
    mova               m5, [max_pixels_10]

    ADD_RES_AVX2_32_10 r0, r2, r1

%endif ; HAVE_AVX2_EXTERNAL