;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86inc.asm"
%include "x86util.asm"
pb_zzzzzzzz77777777: times 8 db -1
                     times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
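; rough C equivalent (sketch; the asm advances both pointers to the end of
; the arrays and walks them with a negative order offset, but the result is
; the same):
;   int sum = 0;
;   for (i = 0; i < order; i++)
;       sum += v1[i] * v2[i];
;   return sum;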
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
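; rough C equivalent (sketch):
;   int sum = 0;
;   for (i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];
;       v1[i] += mul * v3[i];
;   }
;   return sum;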
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS_MMXEXT 1-2
%macro REVERSE_WORDS_SSE2 1-2
%macro REVERSE_WORDS_SSSE3 2

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
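; per element, with t = (int32_t)dst * src, the desired t >> 15 is built as
;   ((t >> 16) << 1) | ((t >> 15) & 1)
; pmulhw supplies t >> 16, and the top bit of the pmullw result supplies
; the bit that pmulhw cut off.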
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw   %1, %2
%endmacro
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
    mova    m5, [pb_revwords]
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
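; per output sample this computes, in C terms (sketch):
;   output[i] = (input[i] * window[i] + (1 << 14)) >> 15;
; with the second half of the buffer using the window words in reverse.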
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0

; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0

; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2

%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED    MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
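; per-pixel C sketch (mid_pred() returns the median of its three arguments;
; *left and *left_top carry state between calls):
;   pred     = mid_pred(left, top[i], left + top[i] - left_top);
;   dst[i]   = left = pred + diff[i];
;   left_top = top[i];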
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
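; C sketch: a running byte-wise prefix sum over src, returning the final
; accumulator (stores wrap modulo 256); the ssse3/sse4 loops below compute
; this prefix sum in log2 shuffle/add steps using the masks loaded below:
;   for (i = 0; i < w; i++) {
;       left  += src[i];
;       dst[i] = left;
;   }
;   return left & 0xff;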
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1

cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
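; C sketch:
;   float sum = 0.0f;
;   for (i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;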
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
; ... and then the same for left/right extend also. See below for the loop
; function implementations. The fast ones are fixed-width, the slow ones
; are variable-width.
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
cglobal emu_edge_core, 6, 9, 1
    mov     r8, r5  ; save block_h
%else ; ARCH_X86_32
cglobal emu_edge_core, 2, 7, 0
    mov     r5, r5m ; block_h
%endif ; ARCH_X86_64/32

; start with vertical extend (top/bottom) and body pixel copy
    sub     w_reg, r6m ; w = end_x - start_x
    jg      .slow_v_extend_loop
    mov     r2, r2m    ; linesize
    sal     w_reg, 7   ; w * 128
    lea     rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea     w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call    w_reg      ; fast top extend, body copy and bottom extend

; horizontal extend (left/right)
    mov     w_reg, r6m ; start_x
    mov     r3, r0     ; backup of buf+block_h*linesize
    mov     r0m, r0    ; backup of buf+block_h*linesize
    jg      .slow_left_extend_loop
; FIXME: we could special-case size == 1 here if that makes any speed
; difference; test me
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea     rax, [.emuedge_extend_left_2]
    lea     w_reg, [.emuedge_extend_left_2+w_reg]
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val,r7/r6=end_x,r1=block_w
    mov     w_reg, r7m ; end_x
    mov     r1, r8m    ; block_w
    jz      .h_extend_end ; if (end_x == block_w) goto h_extend_end
    jg      .slow_right_extend_loop
; FIXME: we could special-case size == 1 here if that makes any speed
; difference; test me
    lea     rax, [.emuedge_extend_right_2]
    lea     r1, [.emuedge_extend_right_2+r1]

%define stack_offset 0x14

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; data is written out in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%if (%2-%%src_off) == 8
    mov    rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
    movq   mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov    vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd   mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov    vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov    valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov    valw2, [r1+%%src_off]
%elifidn %1, body
    mov    valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov    valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov    vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov    [r0+%%dst_off], valw2
%elifidn %1, body
    mov    [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov    [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
    test   r3, r3                                ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop       ;   goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n                      ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:            ; do {
    WRITE_NUM_BYTES top, %%n                     ;   write bytes
    add    r0, r2                                ;   dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop     ; } while (--start_y)

.emuedge_copy_body_ %+ %%n %+ _loop:             ; do {
    READ_NUM_BYTES  body, %%n                    ;   read bytes
    WRITE_NUM_BYTES body, %%n                    ;   write bytes
    add    r0, r2                                ;   dst += linesize
    add    r1, r2                                ;   src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop      ; } while (--end_y)

    test   r5, r5                                ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n             ;   goto end
    sub    r1, r2                                ; src -= linesize
    READ_NUM_BYTES bottom, %%n                   ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:         ; do {
    WRITE_NUM_BYTES bottom, %%n                  ;   write bytes
    add    r0, r2                                ;   dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop  ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just separated into left/right cases because the number of pixels to
; extend is obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
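; e.g. a pixel value of 0xab is read as the word 0xabab, which is then
; broadcast across all eight bytes of mm0 when n_pixels >= 8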
%macro READ_V_PIXEL 2

%macro WRITE_V_PIXEL 2
    movq   [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd   [%2+%%dst_off], mm0
    mov    [%2+%%dst_off],   valw
    mov    [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov    [%2+%%dst_off], valw

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:          ; do {
    sub    r0, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
.emuedge_extend_right_ %+ %%n:         ; do {
    sub    r3, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    sub    r0, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%endif ; ARCH_X86_64/32

%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 as well could
; be considered, but that path has not been optimized as much (FIXME).
%macro V_COPY_NPX 4-5
    V_COPY_NPX %1, mm0,  movq,    8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
    V_COPY_NPX %1, rax,  mov,     8
    V_COPY_NPX %1, mm0,  movq,    8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1

%macro SLOW_V_EXTEND 0
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
    push   r8              ; save old value of block_h
    jz .do_body_copy       ; if (!start_y) goto do_body_copy
    je .do_body_copy       ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
    pop    r8              ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq   [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov    [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
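; C sketch (av_clip() clamps a value into the [min, max] range):
;   for (i = 0; i < len; i++)
;       dst[i] = av_clip(src[i], min, max);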
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m0,  m4, m5, m6
    CLIPD   m1,  m4, m5, m6
    CLIPD   m2,  m4, m5, m6
    CLIPD   m3,  m4, m5, m6
    CLIPD   m7,  m4, m5, m6
    CLIPD   m8,  m4, m5, m6
    CLIPD   m9,  m4, m5, m6
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)

%define SPLATD SPLATD_MMX
%define CLIPD  CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD  CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD  CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
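; C sketch:
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[len - 1 - i];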
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea     lenq, [lend*4 - 2*mmsize]
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1

;-----------------------------------------------------------------------------
; void vector_fmul_add(float *dst, const float *src0, const float *src1,
;                      const float *src2, int len)
;-----------------------------------------------------------------------------
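; C sketch:
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i] + src2[i];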
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea     lenq, [lend*4 - 2*mmsize]
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
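; C sketch:
;   for (i = 0; i < len; i++) {
;       dst[2*i    ] = src0[i] + src1[i];
;       dst[2*i + 1] = src0[i] - src1[i];
;   }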
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
    lea     src0q, [src0q + lenq]
    lea     src1q, [src1q + lenq]
    lea     dstq,  [ dstq + 2*lenq]
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
    mova    [dstq + 2*lenq         ], m1
    mova    [dstq + 2*lenq + mmsize], m0

BUTTERFLIES_FLOAT_INTERLEAVE
BUTTERFLIES_FLOAT_INTERLEAVE

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
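; C sketch (av_bswap32() is libavutil's 32-bit byte swap):
;   for (i = 0; i < w; i++)
;       dst[i] = av_bswap32(src[i]);
; the sse2 loops above swap the 16-bit halves of each dword with
; pshuflw/pshufhw and then swap the bytes within each word; the ssse3
; version below does the whole swap with a single pshufb via pb_bswap32.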
cglobal bswap32_buf, 3,4,5

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]