;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "x86util.asm"
pb_zzzzzzzz77777777: times 8 db -1
                     times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
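; For reference, a rough scalar C equivalent of what this macro computes
; (a sketch only, not taken verbatim from the C code):
;   static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                          int order, int shift)
;   {
;       int res = 0;
;       while (order--)
;           res += (*v1++ * *v2++) >> shift;
;       return res;
;   }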
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
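; Rough scalar C equivalent (a sketch only): v1 is updated in place while the
; dot product is accumulated:
;   static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                                   const int16_t *v3, int order, int mul)
;   {
;       int res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;
;           *v1++ += mul * *v3++;
;       }
;       return res;
;   }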
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; a linear sequence of tests is faster than a branch tree or a jump table,
; because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
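; Rough C equivalent (a sketch, assuming the usual semantics here: a symmetric
; window whose first half is applied to both ends of the input, with +16384
; rounding in the bit-exact variants):
;   static void apply_window_int16_ref(int16_t *output, const int16_t *input,
;                                      const int16_t *window, unsigned int len)
;   {
;       unsigned int i, len2 = len >> 1;
;       for (i = 0; i < len2; i++) {
;           output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;           output[len - i - 1] = (input[len - i - 1] * window[i] + (1 << 14)) >> 15;
;       }
;   }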
%macro REVERSE_WORDS_MMXEXT 1-2
%macro REVERSE_WORDS_SSE2 1-2
%macro REVERSE_WORDS_SSSE3 2

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
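; In other words, with hi = (dst*src) >> 16 (pmulhw) and lo = (dst*src) & 0xffff
; (pmullw), the desired (dst*src) >> 15 equals (hi << 1) | (lo >> 15).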
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
    mova    m5, [pb_revwords]
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2

%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED    MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
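; Rough C equivalent (a sketch; mid_pred() is FFmpeg's median-of-three helper):
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       uint8_t l = *left, lt = *left_top;
;       for (int i = 0; i < w; i++) {
;           l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
;           lt     = top[i];
;           dst[i] = l;
;       }
;       *left = l;
;       *left_top = lt;
;   }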
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    movhps  [dstq+wq+8], m0

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
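; Rough C equivalent (a sketch): a running byte-wise prefix sum; the final
; accumulator is returned so the caller can continue across calls:
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                           int w, int acc)
;   {
;       for (int i = 0; i < w; i++) {
;           acc   += src[i];
;           dst[i] = acc;        /* truncated to 8 bits on store */
;       }
;       return acc;
;   }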
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]

cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    jnz add_hfyu_left_prediction_ssse3.skip_prologue

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
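; Rough C equivalent (a sketch):
;   static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }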
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function is below. It basically wraps a very simple dispatch:
; compute w = end_x - start_x, then, depending on the width,
; jump to the slow loop functions (variable width) or
; jump to the fast loop functions (fixed width, reached via a computed jump),
; ... and then the same for left/right extend also. See below for the loop
; function implementations. The fast ones are fixed-width, the slow ones variable-width.
%macro EMU_EDGE_FUNC 0
cglobal emu_edge_core, 6, 7, 1
    mov        r11, r5          ; save block_h
cglobal emu_edge_core, 2, 7, 0
    mov         r5, r5m         ; block_h

    ; start with vertical extend (top/bottom) and body pixel copy
    sub      w_reg, r6m         ; w = end_x - start_x
    jg .slow_v_extend_loop
    mov         r2, r2m         ; linesize
    sal      w_reg, 7           ; w * 128
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call     w_reg              ; fast top extend, body copy and bottom extend

    ; horizontal extend (left/right)
    mov      w_reg, r6m         ; start_x
    mov         r3, r0          ; backup of buf+block_h*linesize
    mov        r0m, r0          ; backup of buf+block_h*linesize
    jg .slow_left_extend_loop
    ; FIXME we could add an "if (size == 1)" case here if that makes any speed difference, test me
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea        rax, [.emuedge_extend_left_2]
    lea      w_reg, [.emuedge_extend_left_2+w_reg]

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
    mov      w_reg, r7m         ; end_x
    mov         r1, r8m         ; block_w
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    jg .slow_right_extend_loop
    ; FIXME we could add an "if (size == 1)" case here if that makes any speed difference, test me
    lea        rax, [.emuedge_extend_right_2]
    lea         r1, [.emuedge_extend_right_2+r1]
%define stack_offset 0x14

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out works the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%if (%2-%%src_off) == 8
    mov           rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
    mov         valw2, [r1+%%src_off]
    mov         valw3, [r1+%%src_off]
    mov         valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
    mov    [r0+%%dst_off], valw2
    mov    [r0+%%dst_off], valw3
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov    [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
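; In rough C terms each generated fixed-width function does the following
; (a sketch only, names are illustrative):
;   read_row(src);                          /* top extend: replicate 1st row */
;   for (y = 0; y < start_y; y++) { write_row(buf); buf += linesize; }
;   for (; y < end_y; y++) {                /* body: plain copy              */
;       read_row(src); write_row(buf); buf += linesize; src += linesize;
;   }
;   src -= linesize; read_row(src);         /* bottom extend: replicate last */
;   for (; y < block_h; y++) { write_row(buf); buf += linesize; }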
%macro VERTICAL_EXTEND 0
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
    test           r3, r3                       ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop      ;   goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n                    ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES top, %%n                    ;   write bytes
    add            r0, r2                       ;   dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop    ; } while (--start_y)

.emuedge_copy_body_ %+ %%n %+ _loop:            ; do {
    READ_NUM_BYTES  body, %%n                   ;   read bytes
    WRITE_NUM_BYTES body, %%n                   ;   write bytes
    add            r0, r2                       ;   dst += linesize
    add            r1, r2                       ;   src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop     ; } while (--end_y)

    test           r5, r5                       ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n            ;   goto end
    sub            r1, r2                       ; src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES bottom, %%n                 ;   write bytes
    add            r0, r2                       ;   dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just split into left/right because the number of pixels to extend is
; generally not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
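; In C terms (a sketch): for the left edge, the first valid pixel of each line
; is simply replicated across the missing columns, e.g.
;   uint8_t v = buf[start_x];
;   memset(buf, v, start_x);   /* and analogously with buf[end_x-1] on the right */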
%macro READ_V_PIXEL 2
%macro WRITE_V_PIXEL 2
    movq  [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd  [%2+%%dst_off], mm0
    mov   [%2+%%dst_off]  , valw
    mov   [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov   [%2+%%dst_off], valw

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:          ; do {
    sub            r0, r2              ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
.emuedge_extend_right_ %+ %%n:         ; do {
    sub            r3, r2              ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    sub            r0, r2              ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%endif ; ARCH_X86_64/32

%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large numbers of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers for this on x86-64 could
; also be considered, but I haven't optimized it as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
    V_COPY_NPX %1,  rax, mov,     8
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1

%macro SLOW_V_EXTEND 0
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
    push        r11             ; save old value of block_h
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
    je .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
    pop         r11             ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq   [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov    [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
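; Rough C equivalent (a sketch; av_clip() is FFmpeg's clamping helper):
;   static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
;                                     int32_t min, int32_t max, unsigned int len)
;   {
;       do {
;           *dst++ = av_clip(*src++, min, max);
;       } while (--len);
;   }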
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova      m0, [srcq+mmsize*0*%%i]
    mova      m1, [srcq+mmsize*1*%%i]
    mova      m2, [srcq+mmsize*2*%%i]
    mova      m3, [srcq+mmsize*3*%%i]
    mova      m7, [srcq+mmsize*4*%%i]
    mova      m8, [srcq+mmsize*5*%%i]
    mova      m9, [srcq+mmsize*6*%%i]
    mova     m10, [srcq+mmsize*7*%%i]
    CLIPD  m0, m4, m5, m6
    CLIPD  m1, m4, m5, m6
    CLIPD  m2, m4, m5, m6
    CLIPD  m3, m4, m5, m6
    CLIPD  m7, m4, m5, m6
    CLIPD  m8, m4, m5, m6
    CLIPD  m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
    mova  [dstq+mmsize*0*%%i], m0
    mova  [dstq+mmsize*1*%%i], m1
    mova  [dstq+mmsize*2*%%i], m2
    mova  [dstq+mmsize*3*%%i], m3
    mova  [dstq+mmsize*4*%%i], m7
    mova  [dstq+mmsize*5*%%i], m8
    mova  [dstq+mmsize*6*%%i], m9
    mova  [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)

%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
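; Rough C equivalent (a sketch of the intended semantics):
;   static void butterflies_float_interleave_ref(float *dst, const float *src0,
;                                                const float *src1, int len)
;   {
;       for (int i = 0; i < len; i++) {
;           float f0 = src0[i], f1 = src1[i];
;           dst[2*i    ] = f0 + f1;
;           dst[2*i + 1] = f0 - f1;
;       }
;   }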
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
    lea    src0q, [src0q +   lenq]
    lea    src1q, [src1q +   lenq]
    lea     dstq, [ dstq + 2*lenq]
    mova      m0, [src0q + lenq]
    mova      m1, [src1q + lenq]
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
    mova  [dstq + 2*lenq         ], m1
    mova  [dstq + 2*lenq + mmsize], m0

BUTTERFLIES_FLOAT_INTERLEAVE
BUTTERFLIES_FLOAT_INTERLEAVE

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
    pshuflw   m0, m0, 10110001b
    pshuflw   m1, m1, 10110001b
    pshufhw   m0, m0, 10110001b
    pshufhw   m1, m1, 10110001b
    pshuflw   m0, m0, 10110001b
    pshufhw   m0, m0, 10110001b

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
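; Rough C equivalent (a sketch; av_bswap32() is libavutil's 32-bit byte swap):
;   static void bswap_buf_ref(uint32_t *dst, const uint32_t *src, int w)
;   {
;       for (int i = 0; i < w; i++)
;           dst[i] = av_bswap32(src[i]);
;   }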
cglobal bswap32_buf, 3,4,5

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
    mova      m2, [pb_bswap32]