1 ;******************************************************************************
2 ;* MMX optimized DSP utils
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
pb_zzzzzzzz77777777: times 8 db -1
                     times 8 db  7
29 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
30 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
31 pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
32 pd_16384: times 4 dd 16384
36 %macro SCALARPRODUCT 1
37 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
38 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
46 movu m0, [v1q + orderq]
47 movu m1, [v1q + orderq + mmsize]
48 pmaddwd m0, [v2q + orderq]
49 pmaddwd m1, [v2q + orderq + mmsize]
67 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
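; Roughly the following scalar semantics (a sketch after the C reference,
; for orientation only):
;   int32_t res = 0;
;   while (order--) {
;       res   += *v1 * *v2++;
;       *v1++ += mul * *v3++;
;   }
;   return res;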
68 cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
83 movu m0, [v2q + orderq]
84 movu m1, [v2q + orderq + mmsize]
85 mova m4, [v1q + orderq]
86 mova m5, [v1q + orderq + mmsize]
87 movu m2, [v3q + orderq]
88 movu m3, [v3q + orderq + mmsize]
97 mova [v1q + orderq], m2
98 mova [v1q + orderq + mmsize], m3
118 %macro SCALARPRODUCT_LOOP 1
124 mova m4, [v2q + orderq]
125 mova m0, [v2q + orderq + mmsize]
129 mova m5, [v3q + orderq]
130 mova m2, [v3q + orderq + mmsize]
134 mova m0, [v2q + orderq]
135 mova m1, [v2q + orderq + mmsize]
136 mova m2, [v3q + orderq]
137 mova m3, [v3q + orderq + mmsize]
139 %define t0 [v1q + orderq]
140 %define t1 [v1q + orderq + mmsize]
155 mova [v1q + orderq], m2
156 mova [v1q + orderq + mmsize], m3
163 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
164 cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
174 mova m4, [v2q + orderq]
175 mova m5, [v3q + orderq]
; linear search is faster than a branch tree or a jump table, because the branches taken are cyclic (i.e. predictable)
191 SCALARPRODUCT_LOOP 14
192 SCALARPRODUCT_LOOP 12
193 SCALARPRODUCT_LOOP 10
208 ;-----------------------------------------------------------------------------
209 ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
210 ; const int16_t *window, unsigned int len)
211 ;-----------------------------------------------------------------------------
213 %macro REVERSE_WORDS_MMXEXT 1-2
217 %macro REVERSE_WORDS_SSE2 1-2
223 %macro REVERSE_WORDS_SSSE3 2
227 ; dst = (dst * src) >> 15
; pmulhw yields (dst * src) >> 16, which drops the bottom bit of the >> 15
; result, so we have to left-shift by 1 and merge that bit back in from the
; pmullw result.
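; As a scalar sketch (illustrative only; the macro operates on packed 16-bit
; words and these variable names are made up):
;   int32_t p = (int32_t)dst * src;
;   dst = (int16_t)(p >> 15); // i.e. the pmulhw result shifted left by 1,
;                             // with bit 15 of the pmullw result merged in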
230 %macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
239 ; dst = ((dst * src) + (1<<14)) >> 15
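; (this is exactly the per-lane behaviour of the pmulhrsw instruction)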
240 %macro MUL16FIXED_SSSE3 3 ; dst, src, unused
244 %macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
245 cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
246 lea offset2q, [offsetq-mmsize]
250 mova m5, [pb_revwords]
255 ; This version expands 16-bit to 32-bit, multiplies by the window,
256 ; adds 16384 for rounding, right shifts 15, then repacks back to words to
257 ; save to the output. The window is reversed for the second half.
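; Per sample pair this is roughly (C-style sketch, assuming the symmetric
; pairing of the C reference, with i running over the first half):
;   output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;   output[len - i - 1] = (input[len - i - 1] * window[i] + (1 << 14)) >> 15;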
258 mova m3, [windowq+offset2q]
259 mova m4, [ inputq+offset2q]
273 mova [outputq+offset2q], m0
275 mova m4, [ inputq+offsetq]
289 mova [outputq+offsetq], m0
291 ; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical to the C version.
293 mova m0, [windowq+offset2q]
294 mova m1, [ inputq+offset2q]
297 pmulhrsw m0, [ inputq+offsetq ]
298 mova [outputq+offset2q], m1
299 mova [outputq+offsetq ], m0
301 ; This version does the 16x16->16 multiplication in-place without expanding
302 ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
303 ; therefore are not bit-identical to the C version.
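; (i.e. per sample: output = (input * window) >> 15, without the + (1 << 14)
; rounding term of the C reference)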
304 mova m0, [windowq+offset2q]
305 mova m1, [ inputq+offset2q]
306 mova m2, [ inputq+offsetq ]
307 MUL16FIXED m1, m0, m3
309 MUL16FIXED m2, m0, m3
310 mova [outputq+offset2q], m1
311 mova [outputq+offsetq ], m2
320 %define REVERSE_WORDS REVERSE_WORDS_MMXEXT
321 %define MUL16FIXED MUL16FIXED_MMXEXT
322 APPLY_WINDOW_INT16 mmxext, 0, 0
323 APPLY_WINDOW_INT16 mmxext_ba, 1, 0
325 %define REVERSE_WORDS REVERSE_WORDS_SSE2
326 APPLY_WINDOW_INT16 sse2, 0, 0
327 APPLY_WINDOW_INT16 sse2_ba, 1, 0
328 APPLY_WINDOW_INT16 ssse3_atom, 0, 1
329 %define REVERSE_WORDS REVERSE_WORDS_SSSE3
330 APPLY_WINDOW_INT16 ssse3, 0, 1
333 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
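; Scalar sketch of the prediction being undone here (after the C reference;
; arithmetic wraps to 8 bits, mid_pred() is the median-of-three helper):
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       l      = mid_pred(l, top[i], l + top[i] - tl) + diff[i];
;       tl     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = tl;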
334 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
337 movd mm4, [left_topq]
342 psubb mm0, mm4 ; t-tl
354 psubb mm0, mm4 ; t-tl
360 paddb mm4, mm3 ; t-tl+l
365 pmaxub mm3, mm5 ; median
366 paddb mm3, mm2 ; +residual
386 movzx r2d, byte [dstq-1]
388 movzx r2d, byte [topq-1]
393 %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
419 movhps [dstq+wq+8], m0
431 ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
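; Scalar sketch (a running byte-wise prefix sum; 'left' seeds the accumulator
; and the final accumulator value is returned):
;   acc = left;
;   for (i = 0; i < w; i++) {
;       acc    = (acc + src[i]) & 0xff;
;       dst[i] = acc;
;   }
;   return acc;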
433 cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
436 mova m4, [pb_zzzz3333zzzzbbbb]
437 mova m3, [pb_zz11zz55zz99zzdd]
443 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
445 mova m6, [pb_zzzzzzzz77777777]
446 mova m4, [pb_zzzz3333zzzzbbbb]
447 mova m3, [pb_zz11zz55zz99zzdd]
451 jnz add_hfyu_left_prediction_ssse3.skip_prologue
459 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
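; Scalar equivalent (up to the usual reordering of float additions):
;   float sum = 0.0f;
;   for (i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;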
460 cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
467 movaps xmm1, [v1q+offsetq]
468 mulps xmm1, [v2q+offsetq]
483 ; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
484 ; x86_reg start_y, x86_reg end_y, x86_reg block_h,
485 ; x86_reg start_x, x86_reg end_x, x86_reg block_w);
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; check and then, depending on the resulting width, either
;   jumps to the slow loop functions, or
;   jumps to the fast loop functions,
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width.
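; In terms of what ends up in buf (summarized from the loop comments below,
; for orientation only):
; - columns [start_x, end_x): rows [0, start_y) get a copy of the first
;   available source row, rows [start_y, end_y) get the source body rows,
;   and rows [end_y, block_h) get a copy of the last source row
; - then, for each of the block_h rows, columns [0, start_x) are filled with
;   the pixel at start_x, and columns [end_x, block_w) with the pixel at
;   end_x - 1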
500 %macro EMU_EDGE_FUNC 1
503 cglobal emu_edge_core_%1, 6, 7, 1
504 mov r11, r5 ; save block_h
507 cglobal emu_edge_core_%1, 2, 7, 0
509 mov r5, r5m ; block_h
512 ; start with vertical extend (top/bottom) and body pixel copy
sub w_reg, r6m ; w = end_x - start_x
522 jg .slow_v_extend_loop
524 mov r2, r2m ; linesize
526 sal w_reg, 7 ; w * 128
528 lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
531 lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
533 call w_reg ; fast top extend, body copy and bottom extend
536 ; horizontal extend (left/right)
537 mov w_reg, r6m ; start_x
540 mov r3, r0 ; backup of buf+block_h*linesize
543 mov r0m, r0 ; backup of buf+block_h*linesize
549 jg .slow_left_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
555 ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
556 ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
558 lea rax, [.emuedge_extend_left_2]
561 lea w_reg, [.emuedge_extend_left_2+w_reg]
565 ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
571 mov w_reg, r7m ; end_x
572 mov r1, r8m ; block_w
575 jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
577 jg .slow_right_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
583 lea rax, [.emuedge_extend_right_2]
586 lea r1, [.emuedge_extend_right_2+r1]
612 %define stack_offset 0x14
617 ; macro to read/write a horizontal number of pixels (%2) to/from registers
618 ; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
619 ; - if (%2 & 15 == 8) fills the last 8 bytes into rax
620 ; - else if (%2 & 8) fills 8 bytes into mm0
621 ; - if (%2 & 7 == 4) fills the last 4 bytes into rax
622 ; - else if (%2 & 4) fills 4 bytes into mm0-1
623 ; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
624 ; (note that we're using r3 for body/bottom because it's a shorter
625 ; opcode, and then the loop fits in 128 bytes)
626 ; - else fills remaining bytes into rax
627 ; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
628 ; - if (%2 & 7 == 4) fills 4 bytes into ebx
629 ; - else if (%2 & 4) fills 4 bytes into mm0-7
630 ; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
631 ; - else fills remaining bytes into ebx
; writing data out works the same way
633 %macro READ_NUM_BYTES 3
634 %assign %%src_off 0 ; offset in source buffer
635 %assign %%smidx 0 ; mmx register idx
636 %assign %%sxidx 0 ; xmm register idx
640 movdqu xmm %+ %%sxidx, [r1+%%src_off]
641 %assign %%src_off %%src_off+16
642 %assign %%sxidx %%sxidx+1
647 %if (%2-%%src_off) == 8
648 mov rax, [r1+%%src_off]
649 %assign %%src_off %%src_off+8
650 %endif ; (%2-%%src_off) == 8
653 %rep (%2-%%src_off)/8
654 movq mm %+ %%smidx, [r1+%%src_off]
655 %assign %%src_off %%src_off+8
656 %assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8
659 %if (%2-%%src_off) == 4
660 mov vald, [r1+%%src_off]
661 %elif (%2-%%src_off) & 4
662 movd mm %+ %%smidx, [r1+%%src_off]
663 %assign %%src_off %%src_off+4
664 %endif ; (%2-%%src_off) ==/& 4
666 %if (%2-%%src_off) == 1
667 mov vall, [r1+%%src_off]
668 %elif (%2-%%src_off) == 2
669 mov valw, [r1+%%src_off]
670 %elif (%2-%%src_off) == 3
672 mov valw2, [r1+%%src_off]
674 mov valw3, [r1+%%src_off]
676 mov valw4, [r1+%%src_off]
677 %endif ; %1 ==/!= top
678 mov vall, [r1+%%src_off+2]
679 %endif ; (%2-%%src_off) == 1/2/3
680 %endmacro ; READ_NUM_BYTES
682 %macro WRITE_NUM_BYTES 3
683 %assign %%dst_off 0 ; offset in destination buffer
684 %assign %%dmidx 0 ; mmx register idx
685 %assign %%dxidx 0 ; xmm register idx
689 movdqu [r0+%%dst_off], xmm %+ %%dxidx
690 %assign %%dst_off %%dst_off+16
691 %assign %%dxidx %%dxidx+1
696 %if (%2-%%dst_off) == 8
697 mov [r0+%%dst_off], rax
698 %assign %%dst_off %%dst_off+8
699 %endif ; (%2-%%dst_off) == 8
702 %rep (%2-%%dst_off)/8
703 movq [r0+%%dst_off], mm %+ %%dmidx
704 %assign %%dst_off %%dst_off+8
705 %assign %%dmidx %%dmidx+1
706 %endrep ; (%2-%%dst_off)/8
708 %if (%2-%%dst_off) == 4
709 mov [r0+%%dst_off], vald
710 %elif (%2-%%dst_off) & 4
711 movd [r0+%%dst_off], mm %+ %%dmidx
712 %assign %%dst_off %%dst_off+4
713 %endif ; (%2-%%dst_off) ==/& 4
715 %if (%2-%%dst_off) == 1
716 mov [r0+%%dst_off], vall
717 %elif (%2-%%dst_off) == 2
718 mov [r0+%%dst_off], valw
719 %elif (%2-%%dst_off) == 3
721 mov [r0+%%dst_off], valw2
723 mov [r0+%%dst_off], valw3
725 mov [r0+%%dst_off], valw4
726 %endif ; %1 ==/!= top
727 mov [r0+%%dst_off+2], vall
728 %endif ; (%2-%%dst_off) == 1/2/3
729 %endmacro ; WRITE_NUM_BYTES
731 ; vertical top/bottom extend and body copy fast loops
; these are function pointers to fixed-width line copy functions, i.e.
; they read a fixed number of pixels into a fixed set of registers, and write
; those out into the destination buffer
735 ; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
736 ; r6(eax/64)/r3(ebx/32)=val_reg
737 %macro VERTICAL_EXTEND 1
741 .emuedge_v_extend_ %+ %%n:
742 ; extend pixels above body
744 test r3 , r3 ; if (!start_y)
745 jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
748 je .emuedge_copy_body_ %+ %%n %+ _loop
749 %endif ; ARCH_X86_64/32
750 READ_NUM_BYTES top, %%n, %1 ; read bytes
751 .emuedge_extend_top_ %+ %%n %+ _loop: ; do {
752 WRITE_NUM_BYTES top, %%n, %1 ; write bytes
753 add r0 , r2 ; dst += linesize
758 %endif ; ARCH_X86_64/32
759 jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
762 .emuedge_copy_body_ %+ %%n %+ _loop: ; do {
763 READ_NUM_BYTES body, %%n, %1 ; read bytes
764 WRITE_NUM_BYTES body, %%n, %1 ; write bytes
765 add r0 , r2 ; dst += linesize
766 add r1 , r2 ; src += linesize
768 jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
771 test r5 , r5 ; if (!block_h)
772 jz .emuedge_v_extend_end_ %+ %%n ; goto end
773 sub r1 , r2 ; src -= linesize
774 READ_NUM_BYTES bottom, %%n, %1 ; read bytes
775 .emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
776 WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes
777 add r0 , r2 ; dst += linesize
779 jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
781 .emuedge_v_extend_end_ %+ %%n:
786 %endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
791 ; left/right (horizontal) fast extend functions
792 ; these are essentially identical to the vertical extend ones above,
; just split into left and right because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
796 ; lowest two bytes of the register (so val*0x0101), and are splatted
797 ; into each byte of mm0 as well if n_pixels >= 8
799 %macro READ_V_PIXEL 3
813 %macro WRITE_V_PIXEL 2
816 movq [%2+%%dst_off], mm0
817 %assign %%dst_off %%dst_off+8
821 movd [%2+%%dst_off], mm0
823 mov [%2+%%dst_off] , valw
824 mov [%2+%%dst_off+2], valw
826 %assign %%dst_off %%dst_off+4
829 mov [%2+%%dst_off], valw
833 ; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
838 .emuedge_extend_left_ %+ %%n: ; do {
839 sub r0, r2 ; dst -= linesize
840 READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels
841 WRITE_V_PIXEL %%n, r0 ; write pixels
843 jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
848 %endif ; ARCH_X86_64/32
851 %endmacro ; LEFT_EXTEND
853 ; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
854 %macro RIGHT_EXTEND 1
858 .emuedge_extend_right_ %+ %%n: ; do {
860 sub r3, r2 ; dst -= linesize
861 READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels
862 WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
865 sub r0, r2 ; dst -= linesize
866 READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels
867 WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
869 %endif ; ARCH_X86_64/32
870 jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
875 %endif ; ARCH_X86_64/32
880 %define stack_offset 0x10
882 %endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 could also be
; considered, but that path hasn't been optimized as much (i.e. FIXME)
890 %macro V_COPY_NPX 4-5
916 V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
918 V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
921 V_COPY_NPX %1, rax , mov, 8
924 V_COPY_NPX %1, mm0, movq, 8
925 %endif ; ARCH_X86_64/32
927 V_COPY_NPX %1, vald, mov, 4
928 V_COPY_NPX %1, valw, mov, 2
929 V_COPY_NPX %1, vall, mov, 1
939 %macro SLOW_V_EXTEND 1
941 ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
942 ; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
944 push r11 ; save old value of block_h
947 jz .do_body_copy ; if (!start_y) goto do_body_copy
948 V_COPY_ROW top, r3, %1
952 je .do_body_copy ; if (!start_y) goto do_body_copy
953 V_COPY_ROW top, dword r3m, %1
957 V_COPY_ROW body, r4, %1
960 pop r11 ; restore old value of block_h
967 jz .skip_bottom_extend
969 V_COPY_ROW bottom, r5, %1
977 %macro SLOW_LEFT_EXTEND 1
978 .slow_left_extend_loop:
979 ; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
982 READ_V_PIXEL 8, [r0+w_reg], %1
983 .left_extend_8px_loop:
987 jle .left_extend_8px_loop
990 jge .left_extend_loop_end
991 .left_extend_2px_loop:
995 jl .left_extend_2px_loop
996 .left_extend_loop_end:
998 jnz .slow_left_extend_loop
1005 %macro SLOW_RIGHT_EXTEND 1
1006 .slow_right_extend_loop:
1007 ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
1008 ; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
1017 sub buf_reg, linesize
1018 READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
1019 .right_extend_8px_loop:
1020 movq [buf_reg+r1], mm0
1023 jge .right_extend_8px_loop
1026 je .right_extend_loop_end
1027 .right_extend_2px_loop:
1029 mov [buf_reg+r1], valw
1031 jg .right_extend_2px_loop
1032 .right_extend_loop_end:
1034 jnz .slow_right_extend_loop
1045 SLOW_RIGHT_EXTEND %1
1053 ;-----------------------------------------------------------------------------
1054 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
1055 ; int32_t max, unsigned int len)
1056 ;-----------------------------------------------------------------------------
1062 %macro SPLATD_SSE2 1
1066 %macro VECTOR_CLIP_INT32 4
1067 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
1080 mova m0, [srcq+mmsize*0*%%i]
1081 mova m1, [srcq+mmsize*1*%%i]
1082 mova m2, [srcq+mmsize*2*%%i]
1083 mova m3, [srcq+mmsize*3*%%i]
1085 mova m7, [srcq+mmsize*4*%%i]
1086 mova m8, [srcq+mmsize*5*%%i]
1087 mova m9, [srcq+mmsize*6*%%i]
1088 mova m10, [srcq+mmsize*7*%%i]
1090 CLIPD m0, m4, m5, m6
1091 CLIPD m1, m4, m5, m6
1092 CLIPD m2, m4, m5, m6
1093 CLIPD m3, m4, m5, m6
1095 CLIPD m7, m4, m5, m6
1096 CLIPD m8, m4, m5, m6
1097 CLIPD m9, m4, m5, m6
1098 CLIPD m10, m4, m5, m6
1100 mova [dstq+mmsize*0*%%i], m0
1101 mova [dstq+mmsize*1*%%i], m1
1102 mova [dstq+mmsize*2*%%i], m2
1103 mova [dstq+mmsize*3*%%i], m3
1105 mova [dstq+mmsize*4*%%i], m7
1106 mova [dstq+mmsize*5*%%i], m8
1107 mova [dstq+mmsize*6*%%i], m9
1108 mova [dstq+mmsize*7*%%i], m10
1112 add srcq, mmsize*4*(%3+%4)
1113 add dstq, mmsize*4*(%3+%4)
1114 sub lend, mmsize*(%3+%4)
1120 %define SPLATD SPLATD_MMX
1121 %define CLIPD CLIPD_MMX
1122 VECTOR_CLIP_INT32 mmx, 0, 1, 0
1124 %define SPLATD SPLATD_SSE2
1125 VECTOR_CLIP_INT32 sse2_int, 6, 1, 0
1126 %define CLIPD CLIPD_SSE2
1127 VECTOR_CLIP_INT32 sse2, 6, 2, 0
1128 %define CLIPD CLIPD_SSE41
1130 VECTOR_CLIP_INT32 sse41, 11, 1, 1
1132 VECTOR_CLIP_INT32 sse41, 6, 1, 0