1 ;******************************************************************************
2 ;* MMX optimized DSP utils
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Constant tables.
; In the pb_* names, each character appears to describe one byte of the table:
; 'z' where the entry is -1 and a hex digit where the entry is that index
; (for example 'b' for 11 and 'd' for 13).
; NOTE(review): these look like byte-shuffle control masks -- confirm at the
; use sites.
; NOTE(review): only eight -1 bytes are defined on the next line; confirm where
; the "77777777" half of the table comes from.
26 pb_zzzzzzzz77777777: times 8 db -1
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
; Word indices listed from 7 down to 0; loaded into m5 by the ssse3 (non-Atom)
; path of apply_window_int16.
30 pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
; Four dwords of 16384 (= 1<<14).
31 pd_16384: times 4 dd 16384
; Byte indices reversed within each group of four; loaded into m2 by bswap32_buf.
32 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
; SCALARPRODUCT (no arguments): declares scalarproduct_int16 and
; scalarproduct_and_madd_int16 for the register width given by mmsize.
; All accesses below are indexed by orderq and cover two mmsize-wide chunks.
36 %macro SCALARPRODUCT 0
37 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
38 cglobal scalarproduct_int16, 3,3,3, v1, v2, order
; v1 is fetched with unaligned loads; v2 is used directly as the memory operand
; of pmaddwd (signed 16-bit multiply, adjacent products summed into 32 bits).
45 movu m0, [v1q + orderq]
46 movu m1, [v1q + orderq + mmsize]
47 pmaddwd m0, [v2q + orderq]
48 pmaddwd m1, [v2q + orderq + mmsize]
64 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; NOTE(review): cglobal is given 4 as the argument count although five names
; are listed -- presumably mul is not loaded into a register; confirm how it
; is accessed.
65 cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
; v2 and v3 are read with movu (unaligned); v1 is read with mova (aligned).
80 movu m0, [v2q + orderq]
81 movu m1, [v2q + orderq + mmsize]
82 mova m4, [v1q + orderq]
83 mova m5, [v1q + orderq + mmsize]
84 movu m2, [v3q + orderq]
85 movu m3, [v3q + orderq + mmsize]
; m2/m3 are stored back to the v1 locations that were read above, so v1 is
; updated in place.
94 mova [v1q + orderq], m2
95 mova [v1q + orderq + mmsize], m3
; SCALARPRODUCT_LOOP %1: one loop variant used by the second
; scalarproduct_and_madd_int16 definition, which instantiates it with
; %1 = 14, 12 and 10.
115 %macro SCALARPRODUCT_LOOP 1
; NOTE(review): the next two groups of loads read the same v2/v3 addresses
; into different registers (m4/m0/m5/m2 versus m0..m3), so they appear to be
; alternatives selected by a conditional -- confirm which condition applies.
121 mova m4, [v2q + orderq]
122 mova m0, [v2q + orderq + mmsize]
126 mova m5, [v3q + orderq]
127 mova m2, [v3q + orderq + mmsize]
131 mova m0, [v2q + orderq]
132 mova m1, [v2q + orderq + mmsize]
133 mova m2, [v3q + orderq]
134 mova m3, [v3q + orderq + mmsize]
; t0/t1 name the two v1 chunks as memory operands instead of registers.
136 %define t0 [v1q + orderq]
137 %define t1 [v1q + orderq + mmsize]
; m2/m3 are written to the same v1 locations that t0/t1 refer to.
152 mova [v1q + orderq], m2
153 mova [v1q + orderq + mmsize], m3
160 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; Second definition of this function; it asks cglobal for 10 vector registers
; and is built from SCALARPRODUCT_LOOP instances.
162 cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
; m4/m5 are loaded with the v2/v3 chunk at orderq ahead of the loop variants.
172 mova m4, [v2q + orderq]
173 mova m5, [v3q + orderq]
174 ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
; NOTE(review): the even arguments look like byte misalignment amounts, one
; loop body per case -- confirm against the dispatch code and the macro body.
189 SCALARPRODUCT_LOOP 14
190 SCALARPRODUCT_LOOP 12
191 SCALARPRODUCT_LOOP 10
206 ;-----------------------------------------------------------------------------
207 ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
208 ; const int16_t *window, unsigned int len)
209 ;-----------------------------------------------------------------------------
; REVERSE_WORDS takes one required and one optional argument and selects its
; implementation by CPU flags: one branch for ssse3 on non-Atom CPUs, another
; for mmxext.
211 %macro REVERSE_WORDS 1-2
212 %if cpuflag(ssse3) && notcpuflag(atom)
218 %elif cpuflag(mmxext)
; NOTE(review): the conditional below documents (dst, src, temp) operands and
; a Q15-style multiply, so it presumably belongs to the MUL16FIXED macro that
; apply_window_int16 invokes with three operands -- confirm.
; The two branches differ in rounding: the ssse3 form adds 1<<14 before the
; shift, the mmxext form does not.
224 %if cpuflag(ssse3) ; dst, src, unused
225 ; dst = ((dst * src) + (1<<14)) >> 15
227 %elif cpuflag(mmxext) ; dst, src, temp
228 ; dst = (dst * src) >> 15
229 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
230 ; in from the pmullw result.
; APPLY_WINDOW_INT16 %1: builds one flavour of ff_apply_window_int16.
; Three code variants appear below, each introduced by its own comment.
240 %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
; Two entry-point names with identical register setup.
; NOTE(review): presumably %1 chooses between them -- confirm.
242 cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
244 cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
; The fourth argument (len in the C prototype) is named offset here.
; offset2 starts one register width (mmsize bytes) below offset.
246 lea offset2q, [offsetq-mmsize]
247 %if cpuflag(ssse3) && notcpuflag(atom)
248 mova m5, [pb_revwords] ; keep the pb_revwords table in m5
255 ; This version does the 16x16->16 multiplication in-place without expanding
256 ; to 32-bit. The ssse3 version is bit-identical.
257 mova m0, [windowq+offset2q] ; window chunk at offset2
258 mova m1, [ inputq+offset2q] ; input chunk at offset2
261 pmulhrsw m0, [ inputq+offsetq ] ; rounded high multiply of m0 by the input chunk at offset
262 mova [outputq+offset2q], m1
263 mova [outputq+offsetq ], m0
265 ; This version expands 16-bit to 32-bit, multiplies by the window,
266 ; adds 16384 for rounding, right shifts 15, then repacks back to words to
267 ; save to the output. The window is reversed for the second half.
268 mova m3, [windowq+offset2q] ; window chunk at offset2
269 mova m4, [ inputq+offset2q] ; input chunk at offset2
283 mova [outputq+offset2q], m0 ; store the result for the offset2 chunk
285 mova m4, [ inputq+offsetq] ; input chunk at offset
299 mova [outputq+offsetq], m0 ; store the result for the offset chunk
301 ; This version does the 16x16->16 multiplication in-place without expanding
302 ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
303 ; therefore are not bit-identical to the C version.
304 mova m0, [windowq+offset2q] ; window chunk at offset2
305 mova m1, [ inputq+offset2q] ; input chunk at offset2
306 mova m2, [ inputq+offsetq ] ; input chunk at offset
; Both input chunks are multiplied against m0, with m3 as the third operand.
307 MUL16FIXED m1, m0, m3
309 MUL16FIXED m2, m0, m3
310 mova [outputq+offset2q], m1
311 mova [outputq+offsetq ], m2
334 ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
; Declared with zero vector registers in cglobal; the instructions below name
; the mm registers explicitly. psubb/paddb are wrapping byte operations.
336 cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
339 movd mm4, [left_topq] ; mm4 = 32 bits read through the left_top pointer
344 psubb mm0, mm4 ; t-tl
356 psubb mm0, mm4 ; t-tl
362 paddb mm4, mm3 ; t-tl+l
367 pmaxub mm3, mm5 ; median
368 paddb mm3, mm2 ; +residual
388 movzx r2d, byte [dstq-1] ; zero-extended byte just below dstq
390 movzx r2d, byte [topq-1] ; zero-extended byte just below topq
; ADD_HFYU_LEFT_LOOP: loop body shared by both add_hfyu_left_prediction
; definitions; its two flags state whether dst and src are aligned.
395 %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
425 movhps [dstq+wq+8], m0 ; store the upper 8 bytes of m0 at dst + w + 8
437 ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; First definition: loads two of the pb_* tables and uses only the
; aligned/aligned loop.
; NOTE(review): cglobal is given 3 as the argument count although left is
; also named -- presumably left is not loaded into a register; confirm.
439 cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
442 mova m4, [pb_zzzz3333zzzzbbbb]
443 mova m3, [pb_zz11zz55zz99zzdd]
446 ADD_HFYU_LEFT_LOOP 1, 1
; Second definition: additionally loads pb_zzzzzzzz77777777 into m6 and
; instantiates the loop for three alignment combinations.
449 cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
451 mova m6, [pb_zzzzzzzz77777777]
452 mova m4, [pb_zzzz3333zzzzbbbb]
453 mova m3, [pb_zz11zz55zz99zzdd]
; dst aligned, src aligned
460 ADD_HFYU_LEFT_LOOP 1, 1
; dst unaligned, src aligned
462 ADD_HFYU_LEFT_LOOP 0, 1
; dst unaligned, src unaligned
464 ADD_HFYU_LEFT_LOOP 0, 0
467 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
; The third argument (len in the prototype) is named offset here and is used
; as the index into both vectors.
469 cglobal scalarproduct_float, 3,3,2, v1, v2, offset
476 movaps xmm1, [v1q+offsetq] ; aligned 16-byte load (four floats) from v1
477 mulps xmm1, [v2q+offsetq] ; multiply element-wise by the matching v2 floats
492 ;-----------------------------------------------------------------------------
493 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
494 ; int32_t max, unsigned int len)
495 ;-----------------------------------------------------------------------------
497 ; %1 = number of xmm registers used
498 ; %2 = number of inline load/process/store loops per asm loop
499 ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
500 ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = optional suffix appended to the function name (e.g. _int)
502 %macro VECTOR_CLIP_INT32 4-5
503 cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
; m0-m3 carry the first four mmsize-wide chunks and m7-m10 the next four.
; NOTE(review): the m7-m10 lines are presumably only assembled when %3=1 --
; confirm.
; NOTE(review): every address below is written as mmsize*K*%%i, i.e. the chunk
; index K is scaled by %%i rather than offset by it. If %%i is a repeat
; counter that goes above 1 (as %2=2 would require), later repetitions would
; re-address chunk 0 and skip others -- confirm how %%i is assigned.
516 mova m0, [srcq+mmsize*0*%%i]
517 mova m1, [srcq+mmsize*1*%%i]
518 mova m2, [srcq+mmsize*2*%%i]
519 mova m3, [srcq+mmsize*3*%%i]
521 mova m7, [srcq+mmsize*4*%%i]
522 mova m8, [srcq+mmsize*5*%%i]
523 mova m9, [srcq+mmsize*6*%%i]
524 mova m10, [srcq+mmsize*7*%%i]
; CLIPD resolves to whichever CLIPD_* macro was %define'd before instantiation.
; NOTE(review): m4/m5 presumably hold the min/max bounds and m6 is scratch --
; confirm.
534 CLIPD m10, m4, m5, m6
536 mova [dstq+mmsize*0*%%i], m0
537 mova [dstq+mmsize*1*%%i], m1
538 mova [dstq+mmsize*2*%%i], m2
539 mova [dstq+mmsize*3*%%i], m3
541 mova [dstq+mmsize*4*%%i], m7
542 mova [dstq+mmsize*5*%%i], m8
543 mova [dstq+mmsize*6*%%i], m9
544 mova [dstq+mmsize*7*%%i], m10
; Pointers advance by mmsize*4*(%2+%3) bytes per loop; len drops by a quarter
; of that, matching a count of 4-byte int32_t elements.
548 add srcq, mmsize*4*(%2+%3)
549 add dstq, mmsize*4*(%2+%3)
550 sub lend, mmsize*(%2+%3)
; Instantiations. Each %define selects the CLIPD implementation used by the
; instantiations that follow it.
556 %define CLIPD CLIPD_MMX
; No xmm registers requested.
557 VECTOR_CLIP_INT32 0, 1, 0, 0
; The fifth argument makes this one vector_clip_int32_int.
559 VECTOR_CLIP_INT32 6, 1, 0, 0, _int
560 %define CLIPD CLIPD_SSE2
; %2=2 inline repetitions per loop, min/max handled as float (%4=1).
561 VECTOR_CLIP_INT32 6, 2, 0, 1
563 %define CLIPD CLIPD_SSE41
; %3=1 (8*mmsize bytes per loop); 11 registers cover m0 through m10.
565 VECTOR_CLIP_INT32 11, 1, 1, 0
567 VECTOR_CLIP_INT32 6, 1, 0, 0
570 ;-----------------------------------------------------------------------------
571 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
; NOTE(review): the prototype above stops before its last parameter; the
; cglobal line names a fourth argument, len -- confirm its C type.
573 ;-----------------------------------------------------------------------------
574 %macro VECTOR_FMUL_REVERSE 0
575 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
; len is turned into a byte offset (4 bytes per float), two registers short of
; the end.
576 lea lenq, [lend*4 - 2*mmsize]
; 256-bit form: the two 16-byte halves of each src1 chunk are loaded in swapped
; positions, then vshufps with q0123 is applied within each half.
; NOTE(review): q0123 presumably reverses the four floats of a lane, giving a
; full 8-float reversal -- confirm against the macro definition.
580 vmovaps xmm0, [src1q + 16]
581 vinsertf128 m0, m0, [src1q], 1
582 vshufps m0, m0, m0, q0123
583 vmovaps xmm1, [src1q + mmsize + 16]
584 vinsertf128 m1, m1, [src1q + mmsize], 1
585 vshufps m1, m1, m1, q0123
; Plain full-register load of the second src1 chunk.
; NOTE(review): this looks like the non-256-bit path -- confirm.
588 mova m1, [src1q + mmsize]
; m0 is paired with the src0/dst chunk at lenq + mmsize and m1 with the chunk
; at lenq.
592 mulps m0, m0, [src0q + lenq + mmsize]
593 mulps m1, m1, [src0q + lenq]
594 mova [dstq + lenq + mmsize], m0
595 mova [dstq + lenq], m1
607 ;-----------------------------------------------------------------------------
608 ; vector_fmul_add(float *dst, const float *src0, const float *src1,
609 ; const float *src2, int len)
610 ;-----------------------------------------------------------------------------
; VECTOR_FMUL_ADD: computes dst = src0 * src1 + src2 element-wise on floats,
; two mmsize-wide registers per step, with all four arrays indexed by lenq.
611 %macro VECTOR_FMUL_ADD 0
612 cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
; len is turned into a byte offset (4 bytes per float), two registers short of
; the end.
613 lea lenq, [lend*4 - 2*mmsize]
616 mova m0, [src0q + lenq]
617 mova m1, [src0q + lenq + mmsize]
618 mulps m0, m0, [src1q + lenq] ; m0 = src0 * src1
619 mulps m1, m1, [src1q + lenq + mmsize]
620 addps m0, m0, [src2q + lenq] ; m0 += src2
621 addps m1, m1, [src2q + lenq + mmsize]
622 mova [dstq + lenq], m0
623 mova [dstq + lenq + mmsize], m1
635 ;-----------------------------------------------------------------------------
636 ; void ff_butterflies_float_interleave(float *dst, const float *src0,
637 ; const float *src1, int len);
638 ;-----------------------------------------------------------------------------
; BUTTERFLIES_FLOAT_INTERLEAVE: builds ff_butterflies_float_interleave for the
; active register width; it is instantiated twice.
640 %macro BUTTERFLIES_FLOAT_INTERLEAVE 0
641 cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
; Move the source pointers forward by lenq and dst by twice that, since dst is
; addressed with 2*lenq below.
; NOTE(review): the later accesses add lenq again, so lenq is presumably
; negated and counted up toward zero -- confirm.
648 lea src0q, [src0q + lenq]
649 lea src1q, [src1q + lenq]
650 lea dstq, [ dstq + 2*lenq]
653 mova m0, [src0q + lenq]
654 mova m1, [src1q + lenq]
; 256-bit form: the 128-bit lanes of m1 and m0 are stored alternately
; (m1 low, m0 low, m1 high, m0 high) in consecutive 16-byte slots.
660 vextractf128 [dstq + 2*lenq ], m1, 0
661 vextractf128 [dstq + 2*lenq + 16], m0, 0
662 vextractf128 [dstq + 2*lenq + 32], m1, 1
663 vextractf128 [dstq + 2*lenq + 48], m0, 1
; Full-register form: m1 is stored first, then m0.
665 mova [dstq + 2*lenq ], m1
666 mova [dstq + 2*lenq + mmsize], m0
; NOTE(review): two instantiations, presumably for different instruction
; sets -- confirm.
675 BUTTERFLIES_FLOAT_INTERLEAVE
677 BUTTERFLIES_FLOAT_INTERLEAVE
679 ; %1 = aligned/unaligned
; pshuflw/pshufhw with 10110001b swap the two words of each adjacent pair, in
; the low and the high half of the register respectively.
; NOTE(review): presumably combined with a byte swap inside each word to give
; a 32-bit byte swap -- confirm.
693 pshuflw m0, m0, 10110001b
694 pshuflw m1, m1, 10110001b
695 pshufhw m0, m0, 10110001b
696 pshufhw m1, m1, 10110001b
; Same word swap applied to a single register.
721 pshuflw m0, m0, 10110001b
722 pshufhw m0, m0, 10110001b
733 ; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
; Two definitions with different vector-register counts (3 and 5); pb_bswap32
; is loaded into m2 between them.
; NOTE(review): the pb_bswap32 load presumably belongs to the 3-register
; definition -- confirm.
736 cglobal bswap32_buf, 3,4,3
738 mova m2, [pb_bswap32]
740 cglobal bswap32_buf, 3,4,5
; Entry points for the pixel copy/average helpers.
; NOTE(review): %1 is the name prefix supplied by an enclosing macro -- confirm
; which prefixes are instantiated.
806 ; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
809 cglobal %1_pixels4_l2, 6,6
849 ; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
852 cglobal %1_pixels8_l2, 6,6
892 ; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
895 cglobal %1_pixels16_l2, 6,6
938 ; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Here %1 is the name prefix and %2 a suffix appended directly after "pixels".
945 cglobal %1_pixels%2, 4,5
; The two 16-pixel variants below each request 4 vector registers.
976 ; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
977 cglobal put_pixels16, 4,5,4
995 ; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
996 cglobal avg_pixels16, 4,5,4