;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern h263_loop_filter_strength

pb_zzzzzzzz77777777: times 8 db -1
                     times 8 db  7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384:            times 4 dd 16384
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
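; For reference, the operation is a plain dot product; a scalar C sketch
; (the C fallback itself lives outside this file):
;
;     static int32_t scalarproduct_int16_c(const int16_t *v1,
;                                          const int16_t *v2, int order)
;     {
;         int32_t res = 0;
;         while (order--)
;             res += *v1++ * *v2++;
;         return res;
;     }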
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
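; Scalar C sketch of the combined operation: accumulate the dot product of
; v1 and v2 while updating v1 in place with mul * v3 (semantics inferred
; from the loads and stores below):
;
;     static int32_t scalarproduct_and_madd_int16_c(int16_t *v1,
;                                                   const int16_t *v2,
;                                                   const int16_t *v3,
;                                                   int order, int mul)
;     {
;         int32_t res = 0;
;         while (order--) {
;             res   += *v1 * *v2++;
;             *v1++ += mul * *v3++;
;         }
;         return res;
;     }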
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; a linear chain of SCALARPRODUCT_LOOP blocks is faster than a branch tree
; or a jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
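; Scalar C sketch of the operation (assumptions: Q15 samples with rounding,
; and a half-length window applied mirror-symmetrically, which is why the
; second half below is processed with reversed words):
;
;     static void apply_window_int16_c(int16_t *output, const int16_t *input,
;                                      const int16_t *window, unsigned int len)
;     {
;         unsigned int i, len2 = len >> 1;
;         for (i = 0; i < len2; i++) {
;             int w = window[i];
;             output[i]       = (input[i]       * w + (1 << 14)) >> 15;
;             output[len-1-i] = (input[len-1-i] * w + (1 << 14)) >> 15;
;         }
;     }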
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
%elif cpuflag(mmxext)

%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
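; worked example: dst = src = 0x4000 (0.5 in Q15) gives
; (0x4000*0x4000 + (1<<14)) >> 15 = 0x2000 (0.25), which is exactly what
; pmulhrsw computes per lane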
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
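; Scalar sketch of that trick for one lane (a, b signed Q15; assumes the
; shift/or recombination the comment above describes):
;     int32_t  p   = a * b;                  // full 32-bit product
;     uint16_t hi  = p >> 16;                // pmulhw result
;     uint16_t lo  = p & 0xFFFF;             // pmullw result
;     uint16_t dst = (hi << 1) | (lo >> 15); // == low 16 bits of (p >> 15)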
%macro APPLY_WINDOW_INT16 1 ; %1 = bitexact version
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2

; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
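; Scalar C sketch (mid_pred() is FFmpeg's median-of-three helper); each
; output byte is the median-clipped gradient prediction plus the residual:
;
;     static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;     {
;         int i;
;         uint8_t l  = *left;
;         uint8_t lt = *left_top;
;         for (i = 0; i < w; i++) {
;             l      = mid_pred(l, top[i], (uint8_t)(l + top[i] - lt)) + diff[i];
;             lt     = top[i];
;             dst[i] = l;
;         }
;         *left     = l;
;         *left_top = lt;
;     }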
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
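; Scalar C sketch: a running prefix sum of the source bytes, modulo 256.
; The pb_zzzz3333zzzzbbbb / pb_zz11zz55zz99zzdd masks loaded below are part
; of the pshufb ladder that computes this prefix sum in log2 steps per
; vector:
;
;     static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
;                                           int w, int left)
;     {
;         int i;
;         for (i = 0; i < w; i++) {
;             left   = (left + src[i]) & 0xFF;
;             dst[i] = left;
;         }
;         return left;
;     }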
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1

cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
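; Scalar C sketch (a plain per-element clamp; len is assumed nonzero and a
; multiple of the unroll factor for the asm below):
;
;     static void vector_clip_int32_c(int32_t *dst, const int32_t *src,
;                                     int32_t min, int32_t max,
;                                     unsigned int len)
;     {
;         do {
;             int32_t v = *src++;
;             *dst++ = v < min ? min : v > max ? max : v;
;         } while (--len);
;     }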
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)

%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0

; %1 = aligned/unaligned
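; 10110001b selects source words (1,0,3,2), i.e. it swaps the two 16-bit
; words of each dword; the byte swap inside each word is presumably
; completed afterwards with 8-bit shifts and a por.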
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b

; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
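; Scalar C sketch (a straightforward per-element 32-bit byte swap):
;
;     static void bswap32_buf_c(uint32_t *dst, const uint32_t *src, int w)
;     {
;         int i;
;         for (i = 0; i < w; i++) {
;             uint32_t v = src[i];
;             dst[i] = (v >> 24) | ((v >> 8) & 0x0000FF00) |
;                      ((v << 8) & 0x00FF0000) | (v << 24);
;         }
;     }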
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]
cglobal bswap32_buf, 3,4,5

%macro H263_LOOP_FILTER 5

; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
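; Rough C sketch of the filter for one of the 8 columns crossing the
; horizontal block edge (p0/p1 above, p2/p3 below; an H.263 Annex J style
; deblocking nonlinearity):
;
;     const int strength = ff_h263_loop_filter_strength[qscale];
;     int p0 = src[-2*stride], p1 = src[-1*stride];
;     int p2 = src[ 0*stride], p3 = src[ 1*stride];
;     int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;
;     /* d is folded through a tent-shaped clip controlled by strength to
;        get d1; then p1 += d1 and p2 -= d1 (saturated to 0..255), and p0/p3
;        are pulled toward each other by at most |d1|/2. */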
cglobal h263_v_loop_filter, 3,5
    lea     r4, [h263_loop_filter_strength]
    movzx   r3d, BYTE [r4+r2]
    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d

%macro TRANSPOSE4X4 2

; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
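; The horizontal variant applies the same filter across a vertical edge:
; the pixels are transposed into the 32-byte stack buffer (TRANSPOSE4X4
; below), filtered there in place, then written back transposed.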
cglobal h263_h_loop_filter, 3,5,0,32
    lea     r4, [h263_loop_filter_strength]
    movzx   r3d, BYTE [r4+r2]
    TRANSPOSE4X4 r4, rsp+4
    H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d