;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

pb_zzzzzzzz77777777: times 8 db -1
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384:            times 4 dd 16384
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
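; Rough C equivalent (illustration only, not part of the build):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;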
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
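; Rough C equivalent (illustration only, not part of the build):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return sum;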
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
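; Rough C sketch of the rounded (bitexact) variant (illustration only, not
; part of the build; the symmetric-window handling assumed here follows the
; "reversed for the second half" comment below):
;     for (unsigned i = 0; i < len / 2; i++) {
;         int w = window[i];
;         output[i]           = (input[i]           * w + (1 << 14)) >> 15;
;         output[len - i - 1] = (input[len - i - 1] * w + (1 << 14)) >> 15;
;     }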
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
%elif cpuflag(mmxext)
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
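; A sketch of the identity being relied on (assuming 16-bit signed a, b and a
; 32-bit intermediate product):
;     (a * b) >> 15  ==  (((a * b) >> 16) << 1) + (((a * b) >> 15) & 1)
; pmulhw yields (a*b) >> 16 and pmullw yields (a*b) & 0xffff, whose top bit is
; exactly the bit that the left shift would otherwise drop.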
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
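; Rough C equivalent (illustration only, not part of the build; mid_pred()
; stands for the median of its three arguments):
;     uint8_t l = *left, tl = *left_top;
;     for (int i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;         tl     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = tl;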
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
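; Rough C equivalent (illustration only, not part of the build; a byte-wise
; running prefix sum seeded with 'left'):
;     int acc = left;
;     for (int i = 0; i < w; i++) {
;         acc    = (acc + src[i]) & 0xff;
;         dst[i] = acc;
;     }
;     return acc;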
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
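; Rough C equivalent (illustration only, not part of the build):
;     for (unsigned i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];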
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0
; %1 = aligned/unaligned
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
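; Rough C equivalent (illustration only, not part of the build): byte-swap
; each 32-bit word.
;     for (int i = 0; i < w; i++)
;         dst[i] = (src[i] >> 24) | ((src[i] >> 8) & 0xff00) |
;                  ((src[i] << 8) & 0xff0000) | (src[i] << 24);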
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]
cglobal bswap32_buf, 3,4,5