;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* This file is part of Libav.
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
pb_zzzzzzzz77777777: times 8 db -1
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384:            times 4 dd 16384
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
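; For reference, a rough C equivalent of the routine below (a sketch only,
; not assembled; order is assumed to be a multiple of the SIMD block width):
;     int32_t res = 0;
;     while (order--)
;         res += *v1++ * *v2++;
;     return res;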
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
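; For reference, a rough C equivalent (sketch only, not assembled; as above,
; order is assumed to be a multiple of the SIMD block width):
;     int32_t res = 0;
;     while (order--) {
;         res   += *v1 * *v2++;
;         *v1++ += mul * *v3++;
;     }
;     return res;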
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
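; For reference, a rough C equivalent of the rounded variant (sketch only,
; not assembled; the mmxext/sse2 variants below omit the rounding term):
;     for (unsigned int i = 0; i < len; i++)
;         output[i] = (input[i] * window[i] + (1 << 14)) >> 15;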
%macro REVERSE_WORDS_MMXEXT 1-2
%macro REVERSE_WORDS_SSE2 1-2
%macro REVERSE_WORDS_SSSE3 2
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
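; In C terms (illustration only, not assembled): with p = (int32_t)dst * src,
;     (int16_t)(p >> 15) == (int16_t)(((p >> 16) << 1) | ((p >> 15) & 1))
; pmulhw yields p >> 16 and pmullw yields p & 0xffff, from which bit 15 of p
; is recovered with a logical right shift.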
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
    mova    m5, [pb_revwords]
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
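; For reference, a rough C equivalent (sketch only, not assembled; mid_pred()
; is the usual median-of-three helper, parameter names as in the prototype):
;     uint8_t l = *left, lt = *left_top;
;     for (int i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;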
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
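; For reference, a rough C equivalent (sketch only, not assembled; the return
; value is the running left sample carried into the next call):
;     int acc = left;
;     for (int i = 0; i < w; i++) {
;         acc   += src[i];
;         dst[i] = acc;        /* 8-bit store, wraps modulo 256 */
;     }
;     return acc & 0xFF;       /* only the low byte is meaningful */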
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
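; For reference, a rough C equivalent (sketch only, not assembled):
;     float p = 0.0f;
;     for (int i = 0; i < len; i++)
;         p += v1[i] * v2[i];
;     return p;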
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function itself is below. It basically wraps a very simple
; dispatch:
;     w = end_x - start_x
;     if (w) {
;         if (w is too wide for the fast path)
;             jump to the slow loop functions
;         else
;             jump to the fast loop functions
;     }
; ... and then the same for left/right extend also. See below for the loop
; function implementations. Fast loops are fixed-width, slow loops are
; variable-width.
%macro EMU_EDGE_FUNC 0
cglobal emu_edge_core, 6, 9, 1
    mov     r8, r5          ; save block_h
cglobal emu_edge_core, 2, 7, 0
    mov     r5, r5m         ; block_h
; start with vertical extend (top/bottom) and body pixel copy
    sub     w_reg, r6m      ; w = end_x - start_x
    jg .slow_v_extend_loop
    mov     r2, r2m         ; linesize
    sal     w_reg, 7        ; w * 128
    lea     rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea     w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call    w_reg           ; fast top extend, body copy and bottom extend
; horizontal extend (left/right)
    mov     w_reg, r6m      ; start_x
    mov     r3, r0          ; backup of buf+block_h*linesize
    mov     r0m, r0         ; backup of buf+block_h*linesize
    jg .slow_left_extend_loop
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea     rax, [.emuedge_extend_left_2]
    lea     w_reg, [.emuedge_extend_left_2+w_reg]
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
    mov     w_reg, r7m      ; end_x
    mov     r1, r8m         ; block_w
    jz .h_extend_end        ; if (end_x == block_w) goto h_extend_end
    jg .slow_right_extend_loop
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    lea     rax, [.emuedge_extend_right_2]
    lea     r1, [.emuedge_extend_right_2+r1]
%define stack_offset 0x14
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%if (%2-%%src_off) == 8
    mov     rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
    movq    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov     vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov     vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov     valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
    mov     valw2, [r1+%%src_off]
    mov     valw3, [r1+%%src_off]
    mov     valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov     vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%if (%2-%%dst_off) == 8
    mov     [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
    movq    [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov     [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd    [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov     [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov     [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
    mov     [r0+%%dst_off], valw2
    mov     [r0+%%dst_off], valw3
    mov     [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov     [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
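; Roughly, each generated copy function behaves like this C-style sketch
; (illustration only, not assembled; n is that function's fixed width):
;     for (y = 0; y < start_y; y++, dst += linesize)            /* top extend    */
;         copy n bytes of the first src row to dst;
;     for (; y < end_y; y++, src += linesize, dst += linesize)  /* body copy     */
;         copy n bytes from src to dst;
;     for (; y < block_h; y++, dst += linesize)                 /* bottom extend */
;         copy n bytes of the last src row to dst;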
%macro VERTICAL_EXTEND 0
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
    test    r3 , r3                                ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop         ; goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n                       ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:              ; do {
    WRITE_NUM_BYTES top, %%n                       ; write bytes
    add     r0 , r2                                ; dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop       ; } while (--start_y)
.emuedge_copy_body_ %+ %%n %+ _loop:               ; do {
    READ_NUM_BYTES  body, %%n                      ; read bytes
    WRITE_NUM_BYTES body, %%n                      ; write bytes
    add     r0 , r2                                ; dst += linesize
    add     r1 , r2                                ; src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop        ; } while (--end_y)
    test    r5 , r5                                ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n               ; goto end
    sub     r1 , r2                                ; src -= linesize
    READ_NUM_BYTES  bottom, %%n                    ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES bottom, %%n                    ; write bytes
    add     r0 , r2                                ; dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop    ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
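; For example (illustrative values only): a border pixel of 0xAB becomes
; 0xAB*0x0101 = 0xABAB in the low word of the register, and when n_pixels >= 8
; mm0 holds 0xABABABABABABABAB, i.e. the same byte in every lane.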
%macro READ_V_PIXEL 2
%macro WRITE_V_PIXEL 2
    movq    [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd    [%2+%%dst_off], mm0
    mov     [%2+%%dst_off] , valw
    mov     [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov     [%2+%%dst_off], valw
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:          ; do {
    sub     r0, r2                     ; dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ; read pixels
    WRITE_V_PIXEL %%n, r0              ; write pixels
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
.emuedge_extend_right_ %+ %%n:         ; do {
    sub     r3, r2                     ; dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]    ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ; write pixels
    sub     r0, r2                     ; dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]    ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ; write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%endif ; ARCH_X86_64/32
%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; Below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 could be
; considered as well, but that path has not been optimized yet (FIXME).
%macro V_COPY_NPX 4-5
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
    V_COPY_NPX %1, rax , mov,     8
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
%macro SLOW_V_EXTEND 0
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
    push    r8              ; save old value of block_h
    jz .do_body_copy        ; if (!start_y) goto do_body_copy
    je .do_body_copy        ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
    pop     r8              ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub     buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq    [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov     [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
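; For reference, a rough C equivalent (sketch only, not assembled):
;     for (unsigned int i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];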
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m0,  m4, m5, m6
    CLIPD   m1,  m4, m5, m6
    CLIPD   m2,  m4, m5, m6
    CLIPD   m3,  m4, m5, m6
    CLIPD   m7,  m4, m5, m6
    CLIPD   m8,  m4, m5, m6
    CLIPD   m9,  m4, m5, m6
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
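; For reference, a rough C equivalent (sketch only, not assembled):
;     src1 += len - 1;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[-i];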
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea     lenq, [lend*4 - 2*mmsize]
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
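; For reference, a rough C equivalent (sketch only, not assembled):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];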
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea     lenq, [lend*4 - 2*mmsize]
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1
%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
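; For reference, a rough C equivalent (sketch only, not assembled):
;     for (int i = 0; i < len; i++) {
;         float f1 = src0[i], f2 = src1[i];
;         dst[2*i    ] = f1 + f2;
;         dst[2*i + 1] = f1 - f2;
;     }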
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
    lea     src0q, [src0q + lenq]
    lea     src1q, [src1q + lenq]
    lea     dstq,  [ dstq + 2*lenq]
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
    mova    [dstq + 2*lenq         ], m1
    mova    [dstq + 2*lenq + mmsize], m0
BUTTERFLIES_FLOAT_INTERLEAVE
%if HAVE_AVX_EXTERNAL
BUTTERFLIES_FLOAT_INTERLEAVE
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
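; For reference, a rough C equivalent (sketch only, not assembled):
;     for (int i = 0; i < w; i++) {
;         uint32_t x = src[i];
;         dst[i] = (x >> 24) | ((x >> 8) & 0x0000FF00) |
;                  ((x << 8) & 0x00FF0000) | (x << 24);
;     }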
cglobal bswap32_buf, 3,4,5
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]