;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* This file is part of FFmpeg.
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86inc.asm"
%include "x86util.asm"
pb_zzzzzzzz77777777: times 8 db -1
                     times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
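; a scalar sketch of what this kernel computes (assumed reference, not
; the exact FFmpeg C code):
;   int32_t sum = 0;
;   for (int i = 0; i < order; i++)
;       sum += v1[i] * v2[i];
;   return sum;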
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
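; scalar sketch (assumed reference): a dot product fused with a
; multiply-accumulate back into v1:
;   int32_t sum = 0;
;   for (int i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];
;       v1[i] += mul * v3[i];
;   }
;   return sum;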
%macro SCALARPRODUCT_LOOP 1
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; a linear sequence of tests is faster than a branch tree or a jump table,
; because the branches taken are cyclic (i.e. predictable)
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
%macro REVERSE_WORDS_MMXEXT 1-2
%macro REVERSE_WORDS_SSE2 1-2
%macro REVERSE_WORDS_SSSE3 2
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
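; a rough scalar sketch of that pmulhw/pmullw fixup (illustration only,
; not FFmpeg's C code):
;   int32_t  p  = (int32_t)dst * src;
;   int16_t  hi = p >> 16;             // what pmulhw returns
;   uint16_t lo = p & 0xffff;          // what pmullw returns
;   result = (hi << 1) | (lo >> 15);   // == (p >> 15) truncated to 16 bits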
; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
lea offset2q, [offsetq-mmsize]
mova m5, [pb_revwords]
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
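; scalar sketch of this path (assumed to mirror the C reference):
;   for (i = 0; i < len/2; i++) {
;       int w = window[i];
;       output[i]       = (input[i]       * w + (1 << 14)) >> 15;
;       output[len-1-i] = (input[len-1-i] * w + (1 << 14)) >> 15;
;   }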
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
mova [outputq+offset2q], m0
mova m4, [ inputq+offsetq]
mova [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
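; i.e. this path computes a truncating product instead of the rounded
; one above (sketch):
;   output[i] = ((int32_t)input[i] * window[i]) >> 15;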
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movd mm4, [left_topq]
psubb mm0, mm4 ; t-tl
psubb mm0, mm4 ; t-tl
paddb mm4, mm3 ; t-tl+l
pmaxub mm3, mm5 ; median
paddb mm3, mm2 ; +residual
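; scalar sketch of the median predictor (assumed to mirror the huffyuv
; C code; all byte arithmetic is modulo 256):
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       pred = mid_pred(l, top[i], l + top[i] - tl);
;       l    = dst[i] = pred + diff[i];
;       tl   = top[i];
;   }
;   *left = l; *left_top = tl;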
movzx r2d, byte [dstq-1]
movzx r2d, byte [topq-1]
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
movhps [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
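; scalar sketch (assumed reference): a running modulo-256 prefix sum,
;   for (i = 0; i < w; i++)
;       left = dst[i] = (src[i] + left) & 0xff;
;   return left;
; the pb_* shuffle masks above let pshufb/paddb compute this prefix sum
; in log2 steps within each vector instead of byte by byte.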
jnz add_hfyu_left_prediction_ssse3.skip_prologue
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
movaps xmm1, [v1q+offsetq]
mulps xmm1, [v2q+offsetq]
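; scalar sketch:
;   float sum = 0;
;   for (i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;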
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
; ... and then the same for left/right extend also. See below for the loop
; function implementations. The fast ones are fixed-width, the slow ones
; variable-width.
%macro EMU_EDGE_FUNC 0
cglobal emu_edge_core, 6, 9, 1
mov r8, r5 ; save block_h
cglobal emu_edge_core, 2, 7, 0
mov r5, r5m ; block_h
; start with vertical extend (top/bottom) and body pixel copy
sub w_reg, r6m ; w = end_x - start_x
jg .slow_v_extend_loop
mov r2, r2m ; linesize
sal w_reg, 7 ; w * 128
lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
call w_reg ; fast top extend, body copy and bottom extend
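; (each fast vertical-extend function below occupies a fixed 128-byte
; slot, so base + w*128 selects the width-w variant; the lea above
; computes base as function 1's address minus one slot)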
; horizontal extend (left/right)
mov w_reg, r6m ; start_x
mov r3, r0 ; backup of buf+block_h*linesize
mov r0m, r0 ; backup of buf+block_h*linesize
jg .slow_left_extend_loop
; FIXME: we could special-case size == 1 here if that makes any speed
; difference; test me
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
lea rax, [.emuedge_extend_left_2]
lea w_reg, [.emuedge_extend_left_2+w_reg]
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val,r7/r6=end_x,r1=block_w
mov w_reg, r7m ; end_x
mov r1, r8m ; block_w
jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
jg .slow_right_extend_loop
; FIXME: we could special-case size == 1 here if that makes any speed
; difference; test me
lea rax, [.emuedge_extend_right_2]
lea r1, [.emuedge_extend_right_2+r1]
%define stack_offset 0x14
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8)  fills the last 8 bytes into rax
;            - else if (%2 & 8)   fills 8 bytes into mm0
;            - if (%2 & 7 == 4)   fills the last 4 bytes into rax
;            - else if (%2 & 4)   fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)   fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a
;               shorter opcode, and then the loop fits in 128 bytes)
;            - else               fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)   fills 4 bytes into ebx
;            - else if (%2 & 4)   fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)   fills 2 bytes into r6, and 1 into ebx
;            - else               fills remaining bytes into ebx
; writing data out works the same way
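; for example, READ_NUM_BYTES body, 22 on x86-64 should expand to
; roughly the following (a sketch of the rules above, not verified
; assembler output):
;   movups xmm0, [r1+0]   ; first 16 bytes
;   movd   mm0,  [r1+16]  ; next 4 bytes
;   mov    valw, [r1+20]  ; final 2 bytes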
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx
movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%if (%2-%%src_off) == 8
mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
mov valw2, [r1+%%src_off]
%elifidn %1, body
mov valw3, [r1+%%src_off]
%elifidn %1, bottom
mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx
movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%if (%2-%%dst_off) == 8
mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
mov [r0+%%dst_off], valw2
%elifidn %1, body
mov [r0+%%dst_off], valw3
%elifidn %1, bottom
mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to fixed-width line copy functions, i.e.
; they read a fixed number of pixels into fixed registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
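; in C terms, each generated width-n function does roughly (sketch):
;   for (y = 0; y < start_y; y++)               // top extend
;       memcpy(buf + y*linesize, src, n);
;   for (; y < end_y; y++, src += linesize)     // body copy
;       memcpy(buf + y*linesize, src, n);
;   for (; y < block_h; y++)                    // bottom extend
;       memcpy(buf + y*linesize, src - linesize, n);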
%macro VERTICAL_EXTEND 0
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
test r3 , r3 ; if (!start_y)
jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES top, %%n ; write bytes
add r0 , r2 ; dst += linesize
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
READ_NUM_BYTES body, %%n ; read bytes
WRITE_NUM_BYTES body, %%n ; write bytes
add r0 , r2 ; dst += linesize
add r1 , r2 ; src += linesize
jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
test r5 , r5 ; if (!block_h)
jz .emuedge_v_extend_end_ %+ %%n ; goto end
sub r1 , r2 ; src -= linesize
READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write bytes
add r0 , r2 ; dst += linesize
jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
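; e.g. for an edge byte b (sketch):
;   val = b * 0x0101;   // b duplicated in the low two bytes
;   mm0 = b x8          // b splatted to all 8 bytes, if n_pixels >= 8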
%macro READ_V_PIXEL 2
%macro WRITE_V_PIXEL 2
movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
movd [%2+%%dst_off], mm0
mov [%2+%%dst_off] , valw
mov [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
mov [%2+%%dst_off], valw
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n: ; do {
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+r1] ; read pixels
WRITE_V_PIXEL %%n, r0 ; write pixels
jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
.emuedge_extend_right_ %+ %%n: ; do {
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%endif ; ARCH_X86_64/32
%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 could be
; considered as well, but I haven't optimized that as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
V_COPY_NPX %1, rax, mov, 8
V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
V_COPY_NPX %1, vald, mov, 4
V_COPY_NPX %1, valw, mov, 2
V_COPY_NPX %1, vall, mov, 1
%macro SLOW_V_EXTEND 0
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
push r8 ; save old value of block_h
jz .do_body_copy ; if (!start_y) goto do_body_copy
je .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, dword r3m
pop r8 ; restore old value of block_h
jz .skip_bottom_extend
V_COPY_ROW bottom, r5
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
jle .left_extend_8px_loop
jge .left_extend_loop_end
.left_extend_2px_loop:
jl .left_extend_2px_loop
.left_extend_loop_end:
jnz .slow_left_extend_loop
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
sub buf_reg, linesize
READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
movq [buf_reg+r1], mm0
jge .right_extend_8px_loop
je .right_extend_loop_end
.right_extend_2px_loop:
mov [buf_reg+r1], valw
jg .right_extend_2px_loop
.right_extend_loop_end:
jnz .slow_right_extend_loop
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
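; scalar sketch of the operation:
;   for (i = 0; i < len; i++)
;       dst[i] = av_clip(src[i], min, max);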
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
mova m0, [srcq+mmsize*0*%%i]
mova m1, [srcq+mmsize*1*%%i]
mova m2, [srcq+mmsize*2*%%i]
mova m3, [srcq+mmsize*3*%%i]
mova m7, [srcq+mmsize*4*%%i]
mova m8, [srcq+mmsize*5*%%i]
mova m9, [srcq+mmsize*6*%%i]
mova m10, [srcq+mmsize*7*%%i]
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
CLIPD m2, m4, m5, m6
CLIPD m3, m4, m5, m6
CLIPD m7, m4, m5, m6
CLIPD m8, m4, m5, m6
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
mova [dstq+mmsize*0*%%i], m0
mova [dstq+mmsize*1*%%i], m1
mova [dstq+mmsize*2*%%i], m2
mova [dstq+mmsize*3*%%i], m3
mova [dstq+mmsize*4*%%i], m7
mova [dstq+mmsize*5*%%i], m8
mova [dstq+mmsize*6*%%i], m9
mova [dstq+mmsize*7*%%i], m10
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
sub lend, mmsize*(%2+%3)
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
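; scalar sketch:
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[len - 1 - i];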
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 2*mmsize]
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
vmovaps xmm1, [src1q + mmsize + 16]
vinsertf128 m1, m1, [src1q + mmsize], 1
vshufps m1, m1, m1, q0123
mova m1, [src1q + mmsize]
shufps m0, m0, q0123
shufps m1, m1, q0123
mulps m0, m0, [src0q + lenq + mmsize]
mulps m1, m1, [src0q + lenq]
mova [dstq + lenq + mmsize], m0
mova [dstq + lenq], m1
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
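; scalar sketch:
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i] + src2[i];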
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
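; scalar sketch (assumed to mirror the C reference):
;   for (i = 0; i < len; i++) {
;       float f1 = src0[i], f2 = src1[i];
;       dst[2*i]     = f1 + f2;
;       dst[2*i + 1] = f1 - f2;
;   }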
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
lea src0q, [src0q + lenq]
lea src1q, [src1q + lenq]
lea dstq, [ dstq + 2*lenq]
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
vextractf128 [dstq + 2*lenq ], m1, 0
vextractf128 [dstq + 2*lenq + 16], m0, 0
vextractf128 [dstq + 2*lenq + 32], m1, 1
vextractf128 [dstq + 2*lenq + 48], m0, 1
mova [dstq + 2*lenq ], m1
mova [dstq + 2*lenq + mmsize], m0
BUTTERFLIES_FLOAT_INTERLEAVE
BUTTERFLIES_FLOAT_INTERLEAVE
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
pshufhw m0, m0, 10110001b
pshufhw m1, m1, 10110001b
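; (10110001b selects words 1,0,3,2, i.e. swaps the two 16-bit halves of
; each 32-bit lane; a byte-level shift/or pass then completes the full
; 32-bit byte swap)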
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,5
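; scalar sketch:
;   for (i = 0; i < w; i++)
;       dst[i] = av_bswap32(src[i]);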
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
mova m2, [pb_bswap32]