1 ;************************************************************************
2 ;* SIMD-optimized lossless video encoding functions
3 ;* Copyright (c) 2000, 2001 Fabrice Bellard
4 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 ;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 ;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 %include "libavutil/x86/x86util.asm"
30 ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; NOTE(review): the remainder of the C prototype comment (the width
; parameter) is elided in this excerpt.
32 %macro DIFF_BYTES_PROLOGUE 0
; Declares the diff_bytes entry point: 5 GPRs, 2 vector regs.
; Two cglobal variants exist; the preprocessor conditional selecting
; between them is elided here.  Presumably the 3-arg form is for targets
; where the width must be loaded manually (x86_32) and the 4-arg form
; receives it as the named register arg `w` -- confirm against full file.
34 cglobal diff_bytes, 3,5,2, dst, src1, src2
39 cglobal diff_bytes, 4,5,2, dst, src1, src2, w
45 ; label to jump to if w < regsize
46 %macro DIFF_BYTES_LOOP_PREP 1
; Prepares state for entering a vector loop of the current `regsize`;
; per the comment above, %1 is the label taken when fewer than regsize
; bytes remain.  The macro body is not visible in this excerpt.
56 ; mov type used for src1q, dstq, first reg, second reg
57 %macro DIFF_BYTES_LOOP_CORE 4
; One unrolled step of the diff loop: bytewise dst[i] = src1[i] - src2[i]
; (psubb), one vector register per statement.  %1/%2 select aligned vs
; unaligned mov forms ("a"/"u"); %3/%4 are the two scratch vector regs.
; Several lines of the macro body are elided in this excerpt.
;
; Path with src2 directly as a psubb memory operand (requires aligned src2):
60 mov%1 %4, [src1q + i + regsize]
62 psubb %4, [src2q + i + regsize]
64 mov%2 [regsize + dstq + i], %4
66 ; SSE enforces alignment of psubb operand
; Unaligned-src2 path: load src2 through movu into a register first,
; because an SSE psubb *memory* operand must be 16-byte aligned.  The
; psubb between %3 and %4 is elided from this view.
71 mov%1 %3, [src1q + i + regsize]
72 movu %4, [src2q + i + regsize]
74 mov%2 [regsize + dstq + i], %3
78 %macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
; Drives the full subtraction: a main loop at the current vector width,
; then progressively narrower fallbacks for the remaining tail bytes.
; Large parts of the macro body (loop labels, counters, branches) are
; elided in this excerpt.
;
; Main pass: full-width vector registers (m0/m1), 2*regsize per iteration
; (two LOOP_CORE register slots per trip -- inferred from the masks below).
79 %define regsize mmsize
81 DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
; keep only the sub-(2*regsize) remainder of the width in wq
85 and wq, 2 * regsize - 1
88 ; fall back to narrower xmm
; Half-width pass using the xmm views of the same registers.
89 %define regsize mmsize / 2
90 DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
92 DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
96 and wq, 2 * regsize - 1
; Scalar tail: one byte at a time through a GPR byte register
; (t0b), dst[w] = src1[w] - src2[w]; the store and loop control are
; elided from this view.
104 mov t0b, [src1q + wq]
105 sub t0b, [src2q + wq]
; NOTE(review): the lines below are fragments of the per-ISA diff_bytes
; instantiations (the INIT_MMX/INIT_XMM/INIT_YMM directives and the
; DIFF_BYTES_BODY invocations between them are elided).  The recurring
; `test dstq/src1q, regsize - 1` pairs presumably dispatch between
; aligned ("a") and unaligned ("u") loop variants based on the pointer
; alignment of dst/src1 -- confirm against the full file.
;
; First instantiation fragment: all-aligned (.skip_main_aa) path.
116 %define regsize mmsize
117 DIFF_BYTES_LOOP_PREP .skip_main_aa
; Second instantiation fragment: alignment-checking dispatch.
124 %define regsize mmsize
125 DIFF_BYTES_LOOP_PREP .skip_main_aa
126 test dstq, regsize - 1
128 test src1q, regsize - 1
; AVX2 build of diff_bytes, emitted only when the assembler supports it.
135 %if HAVE_AVX2_EXTERNAL
138 %define regsize mmsize
139 ; Directly using unaligned SSE2 version is marginally faster than
140 ; branching based on arguments.
141 DIFF_BYTES_LOOP_PREP .skip_main_uu
142 test dstq, regsize - 1
144 test src1q, regsize - 1