;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

pb_zzzzzzzz77777777: times 8 db -1
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
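; For reference, a plain-C sketch of what this kernel computes (names are
; illustrative, not the shipped C code):
;
;   int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                   int order)
;   {
;       int32_t res = 0;
;       while (order--)
;           res += *v1++ * *v2++;
;       return res;
;   }
;
; each pmaddwd produces pairwise 16x16->32 products summed in pairs; the
; partial sums accumulated in m0/m1 are reduced to a scalar after the loop.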
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
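; In C terms, the combined dot-product/multiply-accumulate is roughly
; (a sketch, reference only):
;
;   int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                            const int16_t *v3, int order,
;                                            int mul)
;   {
;       int32_t res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;
;           *v1++ += mul * *v3++;
;       }
;       return res;
;   }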
%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
%elif cpuflag(mmxext)
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    pmulhrsw m0, [ inputq+offsetq ]
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m0
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova    m3, [windowq+offset2q]
    mova    m4, [ inputq+offset2q]
    mova    [outputq+offset2q], m0
    mova    m4, [ inputq+offsetq]
    mova    [outputq+offsetq], m0
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova    m0, [windowq+offset2q]
    mova    m1, [ inputq+offset2q]
    mova    m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    MUL16FIXED m2, m0, m3
    mova    [outputq+offset2q], m1
    mova    [outputq+offsetq ], m2
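; For reference, a C sketch of the rounded windowing being vectorized here
; (assuming the symmetric half-window convention implied by the reversed
; second half above; illustrative only):
;
;   void apply_window_int16_ref(int16_t *output, const int16_t *input,
;                               const int16_t *window, unsigned int len)
;   {
;       unsigned int i, len2 = len >> 1;
;       for (i = 0; i < len2; i++) {
;           output[i]         = (input[i]         * window[i] + (1<<14)) >> 15;
;           output[len-i-1]   = (input[len-i-1]   * window[i] + (1<<14)) >> 15;
;       }
;   }
;
; the non-rounding variants omit the (1<<14) term, as noted above.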
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
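; Reference C for the median predictor (a sketch of the standard HuffYUV
; median prediction; mid_pred() picks the middle of its three arguments):
;
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF) + diff[i];
;       tl     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = tl;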
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    movhps  [dstq+wq+8], m0

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1

cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    ADD_HFYU_LEFT_LOOP 1, 1
    ADD_HFYU_LEFT_LOOP 0, 1
    ADD_HFYU_LEFT_LOOP 0, 0
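; Reference C for the left predictor (a sketch; a running byte-wise sum):
;
;   int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                    int w, int left)
;   {
;       int i;
;       for (i = 0; i < w; i++)
;           dst[i] = left += src[i];
;       return left;
;   }
;
; the pb_* shuffle masks above serve to broadcast partial sums so this
; prefix sum can be computed in a few shifted-add steps instead of serially.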
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
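; Reference C (a sketch):
;
;   float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       int   i;
;       for (i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }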
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; jump to the slow loop functions
; jump to the fast loop functions
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
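; in C-like pseudocode, the dispatch sketched above is roughly
; (illustrative; the exact fast-path cutoff is not shown in this excerpt):
;
;   w = end_x - start_x;
;   if (w > CUTOFF) slow_v_extend();   /* variable-width loop            */
;   else            fast_v_extend();   /* fixed-width, reached through a */
;                                      /* computed jump into the         */
;                                      /* per-width unrolled bodies      */
;   /* ...then the same split for the left and right extends */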
%macro EMU_EDGE_FUNC 0
cglobal emu_edge_core, 6, 9, 1
    mov     r8, r5          ; save block_h
cglobal emu_edge_core, 2, 7, 0
    mov     r5, r5m         ; block_h

; start with vertical extend (top/bottom) and body pixel copy
    sub     w_reg, r6m      ; w = end_x - start_x
    jg .slow_v_extend_loop
    mov     r2, r2m         ; linesize
    sal     w_reg, 7        ; w * 128
    lea     rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea     w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call    w_reg           ; fast top extend, body copy and bottom extend

; horizontal extend (left/right)
    mov     w_reg, r6m      ; start_x
    mov     r3, r0          ; backup of buf+block_h*linesize
    mov     r0m, r0         ; backup of buf+block_h*linesize
    jg .slow_left_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea     rax, [.emuedge_extend_left_2]
    lea     w_reg, [.emuedge_extend_left_2+w_reg]

; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
    mov     w_reg, r7m      ; end_x
    mov     r1, r8m         ; block_w
    jz .h_extend_end        ; if (end_x == block_w) goto h_extend_end
    jg .slow_right_extend_loop
; FIXME we could special-case size == 1 here if that makes any speed difference; test me
    lea     rax, [.emuedge_extend_right_2]
    lea     r1, [.emuedge_extend_right_2+r1]

%define stack_offset 0x14

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills the last 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; data is written out in the same way
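; as an illustrative example of the rules above, READ_NUM_BYTES body, 22 on
; x86-64 would expand to one 16-byte movups, one 4-byte movd into an mm
; register and one 2-byte mov into valw (one possible reading of the macro
; below; not a verbatim expansion from the source)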
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1

%if (%2-%%src_off) == 8
    mov    rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8

%rep (%2-%%src_off)/8
    movq   mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov    vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd   mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov    vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov    valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
    mov    valw2, [r1+%%src_off]
    mov    valw3, [r1+%%src_off]
    mov    valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov    vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1

%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8

%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
    mov    [r0+%%dst_off], valw2
    mov    [r0+%%dst_off], valw3
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov    [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
    test   r3, r3                    ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ;   goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n         ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n         ;   write bytes
    add    r0, r2                    ;   dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES  body, %%n        ;   read bytes
    WRITE_NUM_BYTES body, %%n        ;   write bytes
    add    r0, r2                    ;   dst += linesize
    add    r1, r2                    ;   src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
    test   r5, r5                    ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ;   goto end
    sub    r1, r2                    ; src -= linesize
    READ_NUM_BYTES  bottom, %%n      ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n      ;   write bytes
    add    r0, r2                    ;   dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
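; e.g. if the edge byte is 0xAB, the read leaves valw = 0xABAB
; (0xAB * 0x0101), and for n_pixels >= 8 that word is further unpacked so
; every byte of mm0 is 0xAB (illustrative values, following the rule above)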
%macro READ_V_PIXEL 2

%macro WRITE_V_PIXEL 2
    movq  [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd  [%2+%%dst_off], mm0
    mov   [%2+%%dst_off]  , valw
    mov   [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov   [%2+%%dst_off], valw

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:          ; do {
    sub   r0, r2                       ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
.emuedge_extend_right_ %+ %%n:         ; do {
    sub   r3, r2                       ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    sub   r0, r2                       ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]    ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%endif ; ARCH_X86_64/32

%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers on x86-64 could be
; considered as well, but I haven't optimized that path as much (i.e. FIXME)
%macro V_COPY_NPX 4-5

    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
    V_COPY_NPX %1,  rax, mov,     8
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
%macro SLOW_V_EXTEND 0
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
    push  r8                ; save old value of block_h
    jz .do_body_copy        ; if (!start_y) goto do_body_copy
    je .do_body_copy        ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
    pop   r8                ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub   buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq  [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov   [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
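; Reference C for the clip (a sketch; each value is clamped into
; [min, max]):
;
;   void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
;                              int32_t min, int32_t max, unsigned int len)
;   {
;       unsigned int i;
;       for (i = 0; i < len; i++) {
;           int32_t v = src[i];
;           if (v < min) v = min;
;           if (v > max) v = max;
;           dst[i] = v;
;       }
;   }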
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    mova     m0, [srcq+mmsize*0*%%i]
    mova     m1, [srcq+mmsize*1*%%i]
    mova     m2, [srcq+mmsize*2*%%i]
    mova     m3, [srcq+mmsize*3*%%i]
    mova     m7, [srcq+mmsize*4*%%i]
    mova     m8, [srcq+mmsize*5*%%i]
    mova     m9, [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
    CLIPD    m0, m4, m5, m6
    CLIPD    m1, m4, m5, m6
    CLIPD    m2, m4, m5, m6
    CLIPD    m3, m4, m5, m6
    CLIPD    m7, m4, m5, m6
    CLIPD    m8, m4, m5, m6
    CLIPD    m9, m4, m5, m6
    CLIPD   m10, m4, m5, m6
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)

%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
%define CLIPD CLIPD_SSE41
VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32 6, 1, 0, 0
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
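; Reference C (a sketch): multiply src0 by src1 read back-to-front:
;
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[len - i - 1];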
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
    mova      m1, [src1q + mmsize]
    shufps    m0, m0, q0123
    shufps    m1, m1, q0123
    mulps     m0, m0, [src0q + lenq + mmsize]
    mulps     m1, m1, [src0q + lenq]
    mova      [dstq + lenq + mmsize], m0
    mova      [dstq + lenq], m1

%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
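; Reference C (a sketch): elementwise multiply-add:
;
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i] + src2[i];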
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
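; Reference C (a sketch, assuming the usual sum/difference butterfly with
; interleaved output):
;
;   for (i = 0; i < len; i++) {
;       float f1 = src0[i], f2 = src1[i];
;       dst[2*i    ] = f1 + f2;
;       dst[2*i + 1] = f1 - f2;
;   }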
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
    lea    src0q, [src0q + lenq]
    lea    src1q, [src1q + lenq]
    lea     dstq, [ dstq + 2*lenq]
    mova      m0, [src0q + lenq]
    mova      m1, [src1q + lenq]
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
    mova    [dstq + 2*lenq         ], m1
    mova    [dstq + 2*lenq + mmsize], m0

BUTTERFLIES_FLOAT_INTERLEAVE
%if HAVE_AVX_EXTERNAL
BUTTERFLIES_FLOAT_INTERLEAVE
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    pshuflw   m0, m0, 10110001b
    pshuflw   m1, m1, 10110001b
    pshufhw   m0, m0, 10110001b
    pshufhw   m1, m1, 10110001b
    pshuflw   m0, m0, 10110001b
    pshufhw   m0, m0, 10110001b
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
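; Reference C (a sketch): dst[i] = av_bswap32(src[i]), i.e.
; 0x11223344 -> 0x44332211 for each dword. In the non-ssse3 path above,
; pshuflw/pshufhw with 10110001b swap the two 16-bit halves of each dword;
; the remaining byte swap within each word is done with shifts (not shown
; in this excerpt).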
%macro BSWAP32_BUF 0
cglobal bswap32_buf, 3,4,3
    mova    m2, [pb_bswap32]
cglobal bswap32_buf, 3,4,5
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
cglobal %1_pixels4_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
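; Reference C for the "_l2" (two-source) averaging (a sketch; the rounding
; matches pavgb, and the avg_ variant averages the result into dst again):
;
;   /* put: */ dst[i] = (src1[i] + src2[i] + 1) >> 1;
;   /* avg: */ dst[i] = (dst[i] + ((src1[i] + src2[i] + 1) >> 1) + 1) >> 1;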
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
cglobal %1_pixels8_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d

; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
cglobal %1_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
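; Reference C for the plain copy/average (a sketch):
;
;   for (j = 0; j < h; j++) {
;       for (i = 0; i < width; i++) {
;           /* put: */ block[i] = pixels[i];
;           /* avg: block[i] = (block[i] + pixels[i] + 1) >> 1; */
;       }
;       block  += line_size;
;       pixels += line_size;
;   }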
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal put_pixels16, 4,5,4
    movsxdifnidn r2, r2d

; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal avg_pixels16, 4,5,4
    movsxdifnidn r2, r2d