;******************************************************************************
;* SIMD-optimized lossless video encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                    intptr_t w);
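; Illustrative scalar equivalent (not part of the build): each output byte is
; the byte-wise difference of the two sources, which the SIMD loops below
; compute regsize bytes at a time.
;     for (i = 0; i < w; i++)
;         dst[i] = src1[i] - src2[i];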
%macro DIFF_BYTES_PROLOGUE 0
%if ARCH_X86_32
cglobal diff_bytes, 3,5,2, dst, src1, src2
%else
cglobal diff_bytes, 4,5,2, dst, src1, src2, w
%endif ; ARCH_X86_32
%endmacro
; labels to jump to if w < regsize and w < 0
%macro DIFF_BYTES_LOOP_PREP 2
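    ; The loop cores below address the buffers as [ptr + i] with i negative
    ; and counting up toward zero, so this prologue's job is to advance the
    ; pointers past the SIMD-covered region and leave the negated byte count
    ; in i.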
; mov type used for src1q, dstq, first reg, second reg
%macro DIFF_BYTES_LOOP_CORE 4
    mov%1             %4, [src1q + i + regsize]
    psubb             %4, [src2q + i + regsize]
    mov%2 [regsize + dstq + i], %4
    ; SSE enforces alignment of psubb operand
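    ; (psubb with a memory source requires a 16-byte-aligned address under
    ; SSE, so src2 is loaded with movu into a register and subtracted
    ; register-to-register, while the src1 load and the store keep the
    ; caller-selected mov%1/mov%2 forms)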
    mov%1             %3, [src1q + i + regsize]
    movu              %4, [src2q + i + regsize]
    psubb             %3, %4
    mov%2 [regsize + dstq + i], %3
%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
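    ; Instantiated once per alignment combination: %1 selects the aligned or
    ; unaligned mov for the src1 loads and %2 for the dst stores, and the pair
    ; also forms the label suffix (.end_aa, .end_uu, ...) used by the callers.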
    %define regsize mmsize
    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
    and               wq, 2 * regsize - 1
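    ; wq now holds only the bytes the 2 * regsize wide main loop left over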
    ; fall back to narrower xmm
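    ; (when the native register is wider than xmm, leftovers shorter than two
    ; full registers are retried with half-width xm0/xm1 before dropping to
    ; the byte-at-a-time GPR loop below)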
    %define regsize (mmsize / 2)
    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_%1%2, .end_%1%2
    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
    and               wq, 2 * regsize - 1
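    ; whatever is still left (fewer than 2 * regsize bytes) is differenced one
    ; byte at a time through a general-purpose register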
    mov              t0b, [src1q + wq]
    sub              t0b, [src2q + wq]
    mov      [dstq + wq], t0b
    %define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    %define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
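    ; dispatch on the run-time alignment of dst and src1 (src2 is always read
    ; with movu in the SSE body, so its alignment does not matter)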
    test            dstq, regsize - 1
    jnz     .loop_uu
    test           src1q, regsize - 1
    jnz     .loop_ua
%if HAVE_AVX2_EXTERNAL
    %define regsize mmsize
    ; Directly using unaligned SSE2 version is marginally faster than
    ; branching based on arguments.
    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
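    ; with ymm registers each DIFF_BYTES_LOOP_CORE pass covers 2 * 32 bytes;
    ; shorter tails reuse the narrower-xmm and GPR fallbacks in DIFF_BYTES_BODY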
;--------------------------------------------------------------------------------------------------
; void ff_sub_left_predict(uint8_t *dst, uint8_t *src, ptrdiff_t stride, ptrdiff_t width, int height)
;--------------------------------------------------------------------------------------------------
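; Scalar sketch of the intended behaviour (illustrative only): every byte is
; predicted from its left neighbour, with 0x80 seeding the very first one, and
; dst is written packed (width bytes per row, no stride).
;     uint8_t prev = 0x80;
;     for (int j = 0; j < height; j++) {
;         for (int i = 0; i < width; i++) {
;             *dst++ = src[i] - prev;
;             prev   = src[i];
;         }
;         src += stride;
;     }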
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
    mova              m1, [pb_80] ; prev initial (0x80 in every byte)
    pinsrb            m4, m1, xd, 15
    movu              m0, [srcq + widthq]
    palignr           m2, m0, m1, 15          ; m2 = left neighbours of m0 (lane 0 from m1's last byte)
    movu              m1, [srcq + widthq + 16]
    palignr           m3, m1, m0, 15          ; m3 = left neighbours of m1
    psubb             m2, m0, m2              ; src - left
    psubb             m3, m1, m3
    movu [dstq + widthq], m2
    movu [dstq + widthq + 16], m3
    sub             dstq, xq ; dst + width