;******************************************************************************
;* SIMD-optimized lossless video encoding functions
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text
; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                    intptr_t w);
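; Scalar reference for what every variant below computes (an illustrative
; sketch, not code from this file; byte subtraction wraps modulo 256):
;     for (intptr_t i = 0; i < w; i++)
;         dst[i] = src1[i] - src2[i];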
%macro DIFF_BYTES_PROLOGUE 0
%if ARCH_X86_32
cglobal diff_bytes, 3,5,2, dst, src1, src2
%define wq r4q
DECLARE_REG_TMP 3
    mov               wq, r3mp
%else
cglobal diff_bytes, 4,5,2, dst, src1, src2, w
DECLARE_REG_TMP 4
%endif ; ARCH_X86_32
%define i t0q
%endmacro
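; The two signatures above differ because x86-32 passes every argument on the
; stack: cglobal loads only the three pointers into registers, so w has to be
; fetched separately, while on x86-64 all four arguments arrive in registers.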
; labels to jump to if w < 2 * regsize and if w < 0
%macro DIFF_BYTES_LOOP_PREP 2
    mov                i, wq
    and                i, -2 * regsize
    js                %2
    jz                %1
    add             dstq, i
    add            src1q, i
    add            src2q, i
    neg                i
%endmacro
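; After this prep, i holds the negated count of main-loop bytes and the three
; pointers have been advanced past that region, so the loops below run i up
; toward zero and use the sign flag as their termination test.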
; mov type used for src1q, dstq, first reg, second reg
%macro DIFF_BYTES_LOOP_CORE 4
%if mmsize != 16
    mov%1             %3, [src1q + i]
    mov%1             %4, [src1q + i + regsize]
    psubb             %3, [src2q + i]
    psubb             %4, [src2q + i + regsize]
    mov%2           [dstq + i], %3
    mov%2 [regsize + dstq + i], %4
%else
    ; SSE enforces alignment of the psubb memory operand, so src2 is loaded
    ; into a register with movu instead
    mov%1             %3, [src1q + i]
    movu              %4, [src2q + i]
    psubb             %3, %4
    mov%2     [dstq + i], %3
    mov%1             %3, [src1q + i + regsize]
    movu              %4, [src2q + i + regsize]
    psubb             %3, %4
    mov%2 [regsize + dstq + i], %3
%endif
%endmacro
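; The body below processes 2 * regsize bytes per main-loop iteration. Any
; remainder is handled in up to two stages: with ymm registers a narrower xmm
; pass covers most of the leftover bytes first, and the last few bytes are
; differenced one at a time in a general-purpose register.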
%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
    %define regsize mmsize
.loop_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
    add                i, 2 * regsize
    jl      .loop_%1%2
.skip_main_%1%2:
    and               wq, 2 * regsize - 1
    jz        .end_%1%2
%if mmsize > 16
    ; fall back to narrower xmm
    %define regsize (mmsize / 2)
    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_%1%2, .end_%1%2
.loop2_%1%2:
    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
    add                i, 2 * regsize
    jl     .loop2_%1%2
.setup_loop_gpr_%1%2:
    and               wq, 2 * regsize - 1
    jz        .end_%1%2
%endif
    add             dstq, wq
    add            src1q, wq
    add            src2q, wq
    neg               wq
.loop_gpr_%1%2:
    mov              t0b, [src1q + wq]
    sub              t0b, [src2q + wq]
    mov      [dstq + wq], t0b
    inc               wq
    jl  .loop_gpr_%1%2
.end_%1%2:
    RET
%endmacro
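; Instantiations: x86-32 gets an MMX version, SSE2 dispatches between three
; variants based on pointer alignment, and AVX2 always uses unaligned access.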
%if ARCH_X86_32
INIT_MMX mmx
DIFF_BYTES_PROLOGUE
    %define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    DIFF_BYTES_BODY    a, a
%undef i
%endif
INIT_XMM sse2
DIFF_BYTES_PROLOGUE
    %define regsize mmsize
    DIFF_BYTES_LOOP_PREP .skip_main_aa, .end_aa
    test             dstq, regsize - 1
    jnz         .loop_uu
    test            src1q, regsize - 1
    jnz         .loop_ua
    DIFF_BYTES_BODY    a, a
    DIFF_BYTES_BODY    u, a
    DIFF_BYTES_BODY    u, u
%undef i
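; Only dst and src1 alignment is tested: the SSE core always loads src2 with
; movu, so its alignment never matters.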
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
DIFF_BYTES_PROLOGUE
    %define regsize mmsize
    ; Directly using the unaligned variant is marginally faster than
    ; branching based on the pointers' alignment.
    DIFF_BYTES_LOOP_PREP .skip_main_uu, .end_uu
    DIFF_BYTES_BODY    u, u
%undef i
%endif
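; How the C side typically selects an entry point (a hedged sketch modeled on
; libavcodec/x86/lossless_videoencdsp_init.c; the exact cpu-flag macros used
; there are an assumption here):
;     if (EXTERNAL_SSE2(cpu_flags))
;         c->diff_bytes = ff_diff_bytes_sse2;
;     if (EXTERNAL_AVX2_FAST(cpu_flags))
;         c->diff_bytes = ff_diff_bytes_avx2;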