;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* This file is part of Libav.
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
pb_zzzzzzzz77777777: times 8 db -1
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384:            times 4 dd 16384
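; (The pb_* constants above are pshufb shuffle masks: each hex digit in the name
;  is the source byte index copied into that position, and 'z' bytes are -1,
;  which pshufb replaces with zero. pd_16384 is the 1<<14 rounding bias used by
;  apply_window_int16 below.)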
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
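; (reference sketch, C-like: a plain dot product,
;      int sum = 0;
;      for (i = 0; i < order; i++)
;          sum += v1[i] * v2[i];
;  the pmaddwd instructions below do the 16x16->32 multiplies and the pairwise
;  additions in a single step)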
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
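; (reference sketch, C-like: returns the dot product of v1 and v2 while also
;  updating v1 in place,
;      for (i = 0; i < order; i++) {
;          sum   += v1[i] * v2[i];
;          v1[i] += mul * v3[i];
;      }
;  which matches the v1/v2/v3 loads and the v1 stores below)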
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; a linear chain of branches is faster than a branch tree or a jump table here,
; because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
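; (reference sketch, C-like -- the rounded, bit-exact behaviour targeted here:
;      for (i = 0; i < len; i++)
;          output[i] = (input[i] * window[i] + (1 << 14)) >> 15;
;  the SIMD versions walk the buffers from both ends, with the window words
;  reversed for the second half)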
%macro REVERSE_WORDS_MMXEXT 1-2
%macro REVERSE_WORDS_SSE2 1-2
%macro REVERSE_WORDS_SSSE3 2
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
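; (i.e. with p = dst * src as the full 32-bit product:
;      p >> 15  ==  ((p >> 16) << 1) + ((p & 0xffff) >> 15)
;  pmulhw supplies p >> 16 and pmullw supplies p & 0xffff)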
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
; dst = ((dst * src) + (1<<14)) >> 15
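; (this is exactly what the ssse3 pmulhrsw instruction computes in one step,
;  which is why the third macro argument is unused below)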
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
    mova    m5, [pb_revwords]
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
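; (i.e. roughly output[i] = (input[i] * window[i]) >> 15, without the +16384
;  rounding term)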
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED    MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
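; (reference sketch, C-like -- the HuffYUV median predictor, byte arithmetic
;  wrapping mod 256:
;      l = *left; tl = *left_top;
;      for (i = 0; i < w; i++) {
;          pred = median(l, top[i], top[i] + l - tl);
;          l    = dst[i] = pred + diff[i];
;          tl   = top[i];
;      }
;      *left = l; *left_top = tl;
;  this matches the t-tl / median / +residual comments below)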
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    movhps  [dstq+wq+8], m0
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
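; (reference sketch, C-like -- a running byte-wise prefix sum:
;      for (i = 0; i < w; i++)
;          left = dst[i] = (left + src[i]) & 0xff;
;      return left;
;  the vector versions below appear to build the per-vector prefix sums with
;  shifted adds and the pb_* pshufb masks defined at the top of the file)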
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
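; (reference sketch, C-like:
;      float sum = 0;
;      for (i = 0; i < len; i++)
;          sum += v1[i] * v2[i];
;      return sum;
;  vectorised below with mulps plus an accumulate/horizontal sum)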
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; jump to the slow loop functions
; jump to the fast loop functions
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
%macro EMU_EDGE_FUNC 1
cglobal emu_edge_core_%1, 6, 7, 1
    mov    r11, r5           ; save block_h
cglobal emu_edge_core_%1, 2, 7, 0
    mov    r5, r5m           ; block_h
; start with vertical extend (top/bottom) and body pixel copy
    sub    w_reg, r6m        ; w = end_x - start_x
    jg .slow_v_extend_loop
    mov    r2, r2m           ; linesize
    sal    w_reg, 7          ; w * 128
    lea    rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea    w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call   w_reg             ; fast top extend, body copy and bottom extend
; horizontal extend (left/right)
    mov    w_reg, r6m        ; start_x
    mov    r3, r0            ; backup of buf+block_h*linesize
    mov    r0m, r0           ; backup of buf+block_h*linesize
    jg .slow_left_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea    rax, [.emuedge_extend_left_2]
    lea    w_reg, [.emuedge_extend_left_2+w_reg]
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
    mov    w_reg, r7m        ; end_x
    mov    r1, r8m           ; block_w
    jz .h_extend_end         ; if (end_x == block_w) goto h_extend_end
    jg .slow_right_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
    lea    rax, [.emuedge_extend_right_2]
    lea    r1, [.emuedge_extend_right_2+r1]
%define stack_offset 0x14
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8)  fills the last 8 bytes into rax
;            - else if (%2 & 8)   fills 8 bytes into mm0
;            - if (%2 & 7 == 4)   fills the last 4 bytes into rax
;            - else if (%2 & 4)   fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)   fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else               fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)   fills 4 bytes into ebx
;            - else if (%2 & 4)   fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)   fills 2 bytes into r6, and 1 into ebx
;            - else               fills remaining bytes into ebx
; writing data out works the same way
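; (worked example: for %2 == 23 on x86-64 this reads 16 bytes into xmm0, 4 bytes
;  into mm0, 2 bytes into r10w/r3w and the final byte into the low byte of the
;  val register, per the rules above)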
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%if (%2-%%src_off) == 8
    mov           rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
    mov         valw2, [r1+%%src_off]
    mov         valw3, [r1+%%src_off]
    mov         valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
    mov    [r0+%%dst_off], valw2
    mov    [r0+%%dst_off], valw3
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov    [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
    test   r3 , r3                              ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop      ;   goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n, %1                ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES top, %%n, %1                ;   write bytes
    add    r0 , r2                              ;   dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop    ; } while (--start_y)
.emuedge_copy_body_ %+ %%n %+ _loop:            ; do {
    READ_NUM_BYTES  body, %%n, %1               ;   read bytes
    WRITE_NUM_BYTES body, %%n, %1               ;   write bytes
    add    r0 , r2                              ;   dst += linesize
    add    r1 , r2                              ;   src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop     ; } while (--end_y)
    test   r5 , r5                              ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n            ;   goto end
    sub    r1 , r2                              ; src -= linesize
    READ_NUM_BYTES  bottom, %%n, %1             ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES bottom, %%n, %1             ;   write bytes
    add    r0 , r2                              ;   dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
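; (e.g. if the edge byte is 0xab, valw ends up as 0xabab and, for wide runs,
;  mm0 holds 0xabababababababab)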
%macro READ_V_PIXEL 3
%macro WRITE_V_PIXEL 2
    movq   [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd   [%2+%%dst_off], mm0
    mov    [%2+%%dst_off]  , valw
    mov    [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov    [%2+%%dst_off], valw
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:            ; do {
    sub    r0, r2                        ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1], %1       ;   read pixels
    WRITE_V_PIXEL %%n, r0                ;   write pixels
    jnz .emuedge_extend_left_ %+ %%n     ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
.emuedge_extend_right_ %+ %%n:           ; do {
    sub    r3, r2                        ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1  ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n         ;   write pixels
    sub    r0, r2                        ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1  ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n         ;   write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n    ; } while (--block_h)
%endif ; ARCH_X86_64/32
%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large numbers of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
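; (roughly, per line: copy 16/8 bytes at a time via xmm0/mm0/rax depending on
;  the architecture, then mop up the remaining bytes with 4/2/1-byte moves,
;  as the V_COPY_NPX invocations below spell out)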
%macro V_COPY_NPX 4-5
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
    V_COPY_NPX %1,  rax, mov,     8
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
%macro SLOW_V_EXTEND 1
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
    push   r11                  ; save old value of block_h
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
    je .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
    V_COPY_ROW body, r4, %1
    pop    r11                  ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5, %1
%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop
%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq   [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov    [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop
    SLOW_RIGHT_EXTEND %1
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
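; (reference sketch, C-like:
;      for (i = 0; i < len; i++)
;          dst[i] = av_clip(src[i], min, max);
;  min and max are broadcast into one register each, and the loop below clips
;  several vectors per iteration)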
%macro PMINSD_MMX 3 ; dst, src, tmp
%macro PMAXSD_MMX 3 ; dst, src, tmp
%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
    PMINSD_MMX %1, %3, %4
    PMAXSD_MMX %1, %2, %4
%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
%macro SPLATD_SSE2 1
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD   m0,  m4, m5, m6
    CLIPD   m1,  m4, m5, m6
    CLIPD   m2,  m4, m5, m6
    CLIPD   m3,  m4, m5, m6
    CLIPD   m7,  m4, m5, m6
    CLIPD   m8,  m4, m5, m6
    CLIPD   m9,  m4, m5, m6
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%3+%4)
    add     dstq, mmsize*4*(%3+%4)
    sub     lend, mmsize*(%3+%4)
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 mmx, 0, 1, 0
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 sse2_int, 6, 1, 0
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 sse2, 6, 2, 0
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 sse41, 11, 1, 1
VECTOR_CLIP_INT32 sse41, 6, 1, 0