1 ;******************************************************************************
2 ;* SIMD lossless video DSP utils
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2014 Michael Niedermayer
5 ;* Copyright (c) 2017 Jokyo Images
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Constant byte tables that the routines in this file load into SIMD registers.
; Each label spells out its bytes in order: 'z' is -1 (0xFF) and a hex digit is
; that byte value (so 'b' = 11, 'd' = 13, "ef" = 14,15).
; NOTE(review): presumably pshufb control masks, where a control byte with its
; top bit set yields a zero result byte -- the shuffles that consume these
; tables are not visible here; confirm.
;
; The next label emits only the eight -1 bytes. The eight 7s its name promises
; must be supplied by the data placed directly after it, and 16-byte loads of
; this label rely on that layout -- TODO confirm that definition is present.
29 pb_zzzzzzzz77777777: times 8 db -1
31 pb_ef: times 8 db 14,15 ; 16 bytes: the pair 14,15 repeated 8 times
32 pb_67: times 8 db 6, 7 ; 16 bytes: the pair 6,7 repeated 8 times
33 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
34 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
; The two tables below name byte pairs (2,3 / 10,11 / 6,7), i.e. whole 16-bit
; elements; they are loaded by the *_int16 routines.
35 pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
36 pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
40 ;------------------------------------------------------------------------------
41 ; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
42 ; const uint8_t *diff, int w,
43 ; int *left, int *left_top)
44 ;------------------------------------------------------------------------------
; NOTE(review): with x86inc the exported symbol's CPU suffix comes from the
; INIT_* line in effect at the cglobal below, which is not visible here;
; confirm that the "_mmxext" in the prototype above still matches it.
;
; cglobal operands (x86inc convention): 6 arguments loaded into registers,
; 6 general-purpose registers used, 8 SIMD registers used, followed by the
; argument names (dstq, topq, diffq, wq, leftq, left_topq).
46 cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
; Last step of the per-byte predictor: the unsigned byte max completes the
; median selection, then the residual bytes held in m2 are added with
; wrapping byte arithmetic (paddb).
77 pmaxub m3, m5 ; median
78 paddb m3, m2 ; +residual
; r2 is the third argument register (named diffq above); it is reused as a
; scratch register for the two zero-extended byte loads below.
; NOTE(review): presumably these bytes are written back through left and
; left_top for the caller -- the stores are not visible here; confirm.
98 movzx r2d, byte [dstq-1] ; r2d = byte just before the current dst position
100 movzx r2d, byte [topq-1] ; r2d = byte just before the current top position
; ADD_LEFT_LOOP: shared loop body for the add_left_pred entry points.
; Both parameters are alignment flags for the two buffers (see the inline
; comment on the %macro line). The visible stores address the output as
; [dstq+wq+offset].
; NOTE(review): only part of the macro body is visible here, and the
; conditionals that choose between the store variants below are not shown.
113 %macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
139 movhps [dstq+wq+8], xm0 ; store the high 8 bytes of xm0 at dst+w+8
143 vextracti128 xm2, m1, 1 ; get second lane of the ymm
144 pshufb xm0, xm5 ; set all values to the last value of the first lane
; Second 16-byte half of the output, at offset +16.
; The mova form is the aligned 16-byte store. The movq + movhps pair writes
; the same 16 bytes as two 8-byte halves without an alignment requirement.
; NOTE(review): presumably the pair is the alternative used when %1 says dst
; is not aligned; confirm against the surrounding %if.
148 mova [dstq+wq+16], xm0
150 movq [dstq+wq+16], xm0
151 movhps [dstq+wq+16+8], xm0
; eax = zero-extended byte just before dstq.
; NOTE(review): presumably the int return value (last output byte) of the
; functions that expand this macro -- the RET is not visible here; confirm.
157 movzx eax, byte [dstq - 1]
168 ;------------------------------------------------------------------------------
169 ; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
170 ;------------------------------------------------------------------------------
; cglobal operands (x86inc convention): 3 arguments loaded into registers
; (dstq, srcq, wq), 3 general-purpose registers used, 7 SIMD registers used.
; "left" is named as a fourth argument but is beyond the loaded count, so the
; prologue does not place it in a register.
172 cglobal add_left_pred, 3,3,7, dst, src, w, left
; Load two of the byte tables into m4 and m3 with the aligned move.
; NOTE(review): the instructions that consume m3/m4 are not visible here.
175 mova m4, [pb_zzzz3333zzzzbbbb]
176 mova m3, [pb_zz11zz55zz99zzdd]
; ADD_LEFT_PRED_UNALIGNED: macro taking no parameters that emits the
; add_left_pred_unaligned entry point, so the same source can be expanded
; more than once (see the expansions after the macro).
181 %macro ADD_LEFT_PRED_UNALIGNED 0
; Same register budget and argument names as add_left_pred: 3 arguments
; loaded (dstq, srcq, wq), 3 GPRs, 7 SIMD registers; "left" is named only.
182 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
; Load the 16-byte tables through the VBROADCASTI128 helper.
; NOTE(review): presumably the helper replicates the 128-bit table into every
; 128-bit lane of a wider register; confirm in x86util.asm.
; pb_zzzzzzzz77777777 defines only 8 bytes itself, so this 128-bit load also
; reads the 8 bytes laid out after that label.
184 VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
185 VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
186 VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
; Alignment probes: ANDing a pointer with mmsize-1 leaves ZF=1 only when the
; pointer is a multiple of the vector size.
; NOTE(review): the branches that act on these flags are not visible here.
189 test srcq, mmsize - 1
191 test dstq, mmsize - 1
; Expansions of the macro above. The second one is assembled only when
; HAVE_AVX2_EXTERNAL is non-zero.
; NOTE(review): the INIT_* lines that select the instruction set for each
; expansion, and the closing %endif, are not visible here.
201 ADD_LEFT_PRED_UNALIGNED
203 %if HAVE_AVX2_EXTERNAL
205 ADD_LEFT_PRED_UNALIGNED
208 ;------------------------------------------------------------------------------
209 ; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
210 ;------------------------------------------------------------------------------
; Adds src bytes into dst in place (dst[i] += src[i], wrapping per byte).
; cglobal operands (x86inc convention): 3 arguments loaded (dstq, srcq, wq),
; 4 general-purpose registers used (the fourth is named "size"), 2 SIMD
; registers used.
212 cglobal add_bytes, 3,4,2, dst, src, w, size
; Vector step: handles 2*mmsize bytes at byte offset sizeq.
; mova is the aligned move, so these accesses expect srcq+sizeq and
; dstq+sizeq to be mmsize-aligned.
; NOTE(review): the loop label, counter update and any alignment handling
; are not visible here; confirm how alignment is guaranteed.
220 mova m0, [srcq + sizeq] ; m0 = first vector of src bytes
221 mova m1, [srcq + sizeq + mmsize] ; m1 = second vector of src bytes
222 paddb m0, [dstq + sizeq] ; per-byte wrapping add of the dst bytes
223 paddb m1, [dstq + sizeq + mmsize]
224 mova [dstq + sizeq], m0 ; write both sums back to dst
225 mova [dstq + sizeq + mmsize], m1
; Scalar step: one byte at offset wq. The low byte of the size register is
; reused as a temporary here, so sizeq no longer holds an offset.
; NOTE(review): presumably the tail loop for bytes the vector loop did not
; cover -- the loop control is not visible here; confirm.
235 mov sizeb, [srcq + wq] ; sizeb = src[w]
236 add [dstq + wq], sizeb ; dst[w] += src[w]
; Build-time guard on HAVE_AVX2_EXTERNAL.
; NOTE(review): the lines it guards are not visible here.
250 %if HAVE_AVX2_EXTERNAL
; ADD_HFYU_LEFT_LOOP_INT16: loop body expanded by the add_left_pred_int16
; entry points. Each parameter is the letter a or u and selects aligned or
; unaligned access for the corresponding buffer.
; NOTE(review): only one instruction of the macro body is visible here.
255 %macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
; Store the high 8 bytes of m0 at dst+w+8.
; NOTE(review): presumably the second half of a split 16-byte store used
; when dst is unaligned -- the matching low-half store is not visible here.
281 movhps [dstq+wq+8], m0
296 ;---------------------------------------------------------------------------------------------
297 ; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
298 ;---------------------------------------------------------------------------------------------
; cglobal operands (x86inc convention): 4 arguments loaded into registers
; (dstq, srcq, maskq, wq), 4 general-purpose registers used, 8 SIMD registers
; used. "left" is named as a fifth argument but is beyond the loaded count,
; so the prologue does not place it in a register.
300 cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
; m3 = byte-pair table (pairs 2,3 and 10,11; -1 elsewhere), aligned load.
303 mova m3, [pb_zzzz2323zzzzabab]
; Expand the shared loop with aligned access for both dst and src.
308 ADD_HFYU_LEFT_LOOP_INT16 a, a
; Variant of add_left_pred_int16 with the same argument list and register
; budget (4 arguments loaded, 4 GPRs, 8 SIMD registers; "left" named only).
; It contains several expansions of the shared loop that differ in the
; alignment assumed for dst and src.
311 cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
; Aligned loads of the two byte-pair tables into m4 and m3.
313 mova m4, [pb_zzzzzzzz67676767]
314 mova m3, [pb_zzzz2323zzzzabab]
; Three expansions of the loop macro; the first parameter is the dst
; alignment and the second the src alignment (a = aligned, u = unaligned).
; No (a, u) expansion appears among the visible lines.
; NOTE(review): the alignment tests, branches and labels that select one of
; these expansions at run time are not visible here.
323 ADD_HFYU_LEFT_LOOP_INT16 a, a
325 ADD_HFYU_LEFT_LOOP_INT16 u, a
327 ADD_HFYU_LEFT_LOOP_INT16 u, u
330 ;---------------------------------------------------------------------------------------------
331 ; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
332 ;---------------------------------------------------------------------------------------------
; ADD_GRADIENT_PRED: macro taking no parameters that emits the
; add_gradient_pred entry point. The routine works in place on src: the
; visible code loads from [srcq + widthq] and stores back to the same address.
333 %macro ADD_GRADIENT_PRED 0
; cglobal operands (x86inc convention): 3 arguments loaded (srcq, strideq,
; widthq), 4 general-purpose registers used (the fourth is named "tmp"),
; 5 SIMD registers used.
334 cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
340 vpbroadcastb xm1, xm1 ; replicate the lowest byte of xm1 into all 16 bytes
; tmpq = srcq + strideq, the base used to address the neighbouring row.
; NOTE(review): the comments below index that row as x-stride, so strideq is
; presumably negated before this point; that setup is not visible here.
; A and the current row use the aligned move; B is offset by one byte and
; uses the unaligned move.
351 lea tmpq, [srcq + strideq]
352 mova m2, [tmpq + widthq] ; A = src[x-stride]
353 movu m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
354 mova m4, [srcq + widthq] ; current val (src[x])
368 ; prefix sum of the current values
; NOTE(review): the subtraction forming (A - B) in m2 and the prefix-sum
; steps themselves are not visible here.
379 paddb m2, m4 ; current + (A - B)
381 paddb xm1, xm2 ; += C
382 mova [srcq + widthq], xm1 ; store the first 16 result bytes
384 pshufb xm1, xm0 ; put the last value into every byte of xm1
387 vextracti128 xm2, m2, 1 ; get second lane of the ymm
390 mova [srcq + widthq + 16], xm1 ; store the second 16 result bytes
391 pshufb xm1, xm0 ; put the last value into every byte of xm1
; Build-time guard on HAVE_AVX2_EXTERNAL.
; NOTE(review): the lines it guards, and the %endmacro closing the macro
; above, are not visible here.
403 %if HAVE_AVX2_EXTERNAL