1 ;******************************************************************************
2 ;* MMX optimized DSP utils
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
26 pb_zzzzzzzz77777777: times 8 db -1
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30 pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
31 pd_16384: times 4 dd 16384
33 section .text align=16
35 %macro SCALARPRODUCT 1
36 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
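; Rough C reference for the accumulation performed below (a sketch only; the
; handling of the shift argument is omitted here, so this is not the
; authoritative implementation):
;     int32_t res = 0;
;     while (order--)
;         res += *v1++ * *v2++;
;     return res;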
37 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
45 movu m0, [v1q + orderq]
46 movu m1, [v1q + orderq + mmsize]
47 pmaddwd m0, [v2q + orderq]
48 pmaddwd m1, [v2q + orderq + mmsize]
66 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
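; Roughly equivalent C (sketch): accumulate the dot product of v1 and v2 while
; simultaneously adding mul*v3 back into v1:
;     int32_t res = 0;
;     while (order--) {
;         res   += *v1 * *v2++;
;         *v1++ += mul * *v3++;
;     }
;     return res;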
67 cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
82 movu m0, [v2q + orderq]
83 movu m1, [v2q + orderq + mmsize]
84 mova m4, [v1q + orderq]
85 mova m5, [v1q + orderq + mmsize]
86 movu m2, [v3q + orderq]
87 movu m3, [v3q + orderq + mmsize]
96 mova [v1q + orderq], m2
97 mova [v1q + orderq + mmsize], m3
117 %macro SCALARPRODUCT_LOOP 1
123 mova m4, [v2q + orderq]
124 mova m0, [v2q + orderq + mmsize]
128 mova m5, [v3q + orderq]
129 mova m2, [v3q + orderq + mmsize]
133 mova m0, [v2q + orderq]
134 mova m1, [v2q + orderq + mmsize]
135 mova m2, [v3q + orderq]
136 mova m3, [v3q + orderq + mmsize]
138 %define t0 [v1q + orderq]
139 %define t1 [v1q + orderq + mmsize]
154 mova [v1q + orderq], m2
155 mova [v1q + orderq + mmsize], m3
162 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
163 cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
173 mova m4, [v2q + orderq]
174 mova m5, [v3q + orderq]
175 ; linear is faster than a branch tree or a jump table, because the branches taken are cyclic (i.e. predictable)
190 SCALARPRODUCT_LOOP 14
191 SCALARPRODUCT_LOOP 12
192 SCALARPRODUCT_LOOP 10
207 ;-----------------------------------------------------------------------------
208 ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
209 ; const int16_t *window, unsigned int len)
210 ;-----------------------------------------------------------------------------
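; Rough C equivalent (a sketch, assuming the half-window convention implied by
; the word reversal below: window holds len/2 coefficients, each applied to the
; mirrored sample pair i and len-1-i):
;     for (i = 0; i < len / 2; i++) {
;         int w           = window[i];
;         output[i]       = (input[i]       * w + (1 << 14)) >> 15;
;         output[len-1-i] = (input[len-1-i] * w + (1 << 14)) >> 15;
;     }
; The plain mmxext/sse2 variants drop the (1 << 14) rounding term (see the
; comments in the macro body), so only the _ba and ssse3 variants match this.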
212 %macro REVERSE_WORDS_MMXEXT 1-2
216 %macro REVERSE_WORDS_SSE2 1-2
222 %macro REVERSE_WORDS_SSSE3 2
226 ; dst = (dst * src) >> 15
227 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
228 ; in from the pmullw result.
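; In C terms, for one 16-bit lane with p = a * b as int32 (a sketch of the
; identity being used):
;     hi     = p >> 16;                  /* pmulhw result */
;     lo     = p & 0xFFFF;               /* pmullw result */
;     result = (hi << 1) | (lo >> 15);   /* equals p >> 15 within the lane */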
229 %macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
238 ; dst = ((dst * src) + (1<<14)) >> 15
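; This is exactly what the SSSE3 pmulhrsw instruction computes per 16-bit lane,
; so a single instruction replaces the pmullw/pmulhw combination used above.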
239 %macro MUL16FIXED_SSSE3 3 ; dst, src, unused
243 %macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
244 cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
245 lea offset2q, [offsetq-mmsize]
249 mova m5, [pb_revwords]
254 ; This version expands 16-bit to 32-bit, multiplies by the window,
255 ; adds 16384 for rounding, right shifts 15, then repacks back to words to
256 ; save to the output. The window is reversed for the second half.
257 mova m3, [windowq+offset2q]
258 mova m4, [ inputq+offset2q]
272 mova [outputq+offset2q], m0
274 mova m4, [ inputq+offsetq]
288 mova [outputq+offsetq], m0
290 ; This version does the 16x16->16 multiplication in-place without expanding
291 ; to 32-bit. The ssse3 version is bit-identical.
292 mova m0, [windowq+offset2q]
293 mova m1, [ inputq+offset2q]
296 pmulhrsw m0, [ inputq+offsetq ]
297 mova [outputq+offset2q], m1
298 mova [outputq+offsetq ], m0
300 ; This version does the 16x16->16 multiplication in-place without expanding
301 ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
302 ; therefore are not bit-identical to the C version.
303 mova m0, [windowq+offset2q]
304 mova m1, [ inputq+offset2q]
305 mova m2, [ inputq+offsetq ]
306 MUL16FIXED m1, m0, m3
308 MUL16FIXED m2, m0, m3
309 mova [outputq+offset2q], m1
310 mova [outputq+offsetq ], m2
319 %define REVERSE_WORDS REVERSE_WORDS_MMXEXT
320 %define MUL16FIXED MUL16FIXED_MMXEXT
321 APPLY_WINDOW_INT16 mmxext, 0, 0
322 APPLY_WINDOW_INT16 mmxext_ba, 1, 0
324 %define REVERSE_WORDS REVERSE_WORDS_SSE2
325 APPLY_WINDOW_INT16 sse2, 0, 0
326 APPLY_WINDOW_INT16 sse2_ba, 1, 0
327 APPLY_WINDOW_INT16 ssse3_atom, 0, 1
328 %define REVERSE_WORDS REVERSE_WORDS_SSSE3
329 APPLY_WINDOW_INT16 ssse3, 0, 1
332 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
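; Rough C equivalent (a sketch; mid_pred() stands for the median-of-three
; helper, shown only to illustrate the data flow of the asm below):
;     l = *left; lt = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;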
333 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
336 movd mm4, [left_topq]
341 psubb mm0, mm4 ; t-tl
353 psubb mm0, mm4 ; t-tl
359 paddb mm4, mm3 ; t-tl+l
364 pmaxub mm3, mm5 ; median
365 paddb mm3, mm2 ; +residual
385 movzx r2d, byte [dstq-1]
387 movzx r2d, byte [topq-1]
392 %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
418 movhps [dstq+wq+8], m0
430 ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
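; Conceptually this is a running prefix sum of the residuals, with 'left'
; carrying the accumulator between calls (a sketch, not the exact C reference):
;     for (i = 0; i < w; i++) {
;         left  += src[i];
;         dst[i] = left;   /* uint8_t store, wraps mod 256 */
;     }
;     return left;
; The pshufb masks loaded below implement a log2-step prefix sum within each
; 16-byte vector before the running carry byte is broadcast and added on.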
432 cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
435 mova m4, [pb_zzzz3333zzzzbbbb]
436 mova m3, [pb_zz11zz55zz99zzdd]
442 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
444 mova m6, [pb_zzzzzzzz77777777]
445 mova m4, [pb_zzzz3333zzzzbbbb]
446 mova m3, [pb_zz11zz55zz99zzdd]
450 jnz add_hfyu_left_prediction_ssse3.skip_prologue
458 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
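; C reference: a plain single-precision dot product
;     float p = 0.0f;
;     for (i = 0; i < len; i++)
;         p += v1[i] * v2[i];
;     return p;
; (The SSE version accumulates four partial sums and combines them at the end,
; so float rounding may differ marginally from this scalar loop.)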
459 cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
466 movaps xmm1, [v1q+offsetq]
467 mulps xmm1, [v2q+offsetq]
482 ; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
483 ; x86_reg start_y, x86_reg end_y, x86_reg block_h,
484 ; x86_reg start_x, x86_reg end_x, x86_reg block_w);
486 ; The actual function is below. It basically wraps a very simple
487 ; w = end_x - start_x
490 ; jump to the slow loop functions
492 ; jump to the fast loop functions
496 ; ... and then the same for left/right extend also. See below for the loop
497 ; function implementations. The fast versions are fixed-width, the slow ones are variable-width
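; In C-like pseudocode the wrapper roughly does the following (a simplified
; sketch of the control flow, not of the exact register usage below):
;     w = end_x - start_x;
;     /* vertical: replicate first/last available source row, copy the body */
;     extend rows [0, start_y)       from src row start_y;
;     copy   rows [start_y, end_y);
;     extend rows [end_y, block_h)   from the last copied row;
;     /* horizontal: replicate the left/rightmost available pixel per row */
;     for (y = 0; y < block_h; y++) {
;         if (start_x)         fill buf[y][0 .. start_x-1]     with buf[y][start_x];
;         if (end_x < block_w) fill buf[y][end_x .. block_w-1] with buf[y][end_x-1];
;     }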
499 %macro EMU_EDGE_FUNC 1
502 cglobal emu_edge_core_%1, 6, 7, 1
503 mov r11, r5 ; save block_h
506 cglobal emu_edge_core_%1, 2, 7, 0
508 mov r5, r5m ; block_h
511 ; start with vertical extend (top/bottom) and body pixel copy
513 sub w_reg, r6m ; w = end_x - start_x
521 jg .slow_v_extend_loop
523 mov r2, r2m ; linesize
525 sal w_reg, 7 ; w * 128
527 lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
530 lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
532 call w_reg ; fast top extend, body copy and bottom extend
535 ; horizontal extend (left/right)
536 mov w_reg, r6m ; start_x
539 mov r3, r0 ; backup of buf+block_h*linesize
542 mov r0m, r0 ; backup of buf+block_h*linesize
548 jg .slow_left_extend_loop
551 ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
554 ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
555 ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
557 lea rax, [.emuedge_extend_left_2]
560 lea w_reg, [.emuedge_extend_left_2+w_reg]
564 ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
570 mov w_reg, r7m ; end_x
571 mov r1, r8m ; block_w
574 jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
576 jg .slow_right_extend_loop
578 ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
582 lea rax, [.emuedge_extend_right_2]
585 lea r1, [.emuedge_extend_right_2+r1]
611 %define stack_offset 0x14
616 ; macro to read/write a horizontal number of pixels (%2) to/from registers
617 ; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
618 ; - if (%2 & 15 == 8) fills the last 8 bytes into rax
619 ; - else if (%2 & 8) fills 8 bytes into mm0
620 ; - if (%2 & 7 == 4) fills the last 4 bytes into rax
621 ; - else if (%2 & 4) fills 4 bytes into mm0-1
622 ; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
623 ; (note that we're using r3 for body/bottom because it's a shorter
624 ; opcode, and then the loop fits in 128 bytes)
625 ; - else fills remaining bytes into rax
626 ; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
627 ; - if (%2 & 7 == 4) fills 4 bytes into ebx
628 ; - else if (%2 & 4) fills 4 bytes into mm0-7
629 ; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
630 ; - else fills remaining bytes into ebx
631 ; writing data out works the same way
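; For example, on x86-64 a 22-byte width (22 = 16 + 4 + 2) would expand to
; roughly one movdqu into xmm0, one movd into mm0 and one 2-byte mov into the
; val register (an illustrative sketch of the expansion, not literal output).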
632 %macro READ_NUM_BYTES 3
633 %assign %%src_off 0 ; offset in source buffer
634 %assign %%smidx 0 ; mmx register idx
635 %assign %%sxidx 0 ; xmm register idx
639 movdqu xmm %+ %%sxidx, [r1+%%src_off]
640 %assign %%src_off %%src_off+16
641 %assign %%sxidx %%sxidx+1
646 %if (%2-%%src_off) == 8
647 mov rax, [r1+%%src_off]
648 %assign %%src_off %%src_off+8
649 %endif ; (%2-%%src_off) == 8
652 %rep (%2-%%src_off)/8
653 movq mm %+ %%smidx, [r1+%%src_off]
654 %assign %%src_off %%src_off+8
655 %assign %%smidx %%smidx+1
656 %endrep ; (%2-%%src_off)/8
658 %if (%2-%%src_off) == 4
659 mov vald, [r1+%%src_off]
660 %elif (%2-%%src_off) & 4
661 movd mm %+ %%smidx, [r1+%%src_off]
662 %assign %%src_off %%src_off+4
663 %endif ; (%2-%%src_off) ==/& 4
665 %if (%2-%%src_off) == 1
666 mov vall, [r1+%%src_off]
667 %elif (%2-%%src_off) == 2
668 mov valw, [r1+%%src_off]
669 %elif (%2-%%src_off) == 3
671 mov valw2, [r1+%%src_off]
673 mov valw3, [r1+%%src_off]
675 mov valw4, [r1+%%src_off]
676 %endif ; %1 ==/!= top
677 mov vall, [r1+%%src_off+2]
678 %endif ; (%2-%%src_off) == 1/2/3
679 %endmacro ; READ_NUM_BYTES
681 %macro WRITE_NUM_BYTES 3
682 %assign %%dst_off 0 ; offset in destination buffer
683 %assign %%dmidx 0 ; mmx register idx
684 %assign %%dxidx 0 ; xmm register idx
688 movdqu [r0+%%dst_off], xmm %+ %%dxidx
689 %assign %%dst_off %%dst_off+16
690 %assign %%dxidx %%dxidx+1
695 %if (%2-%%dst_off) == 8
696 mov [r0+%%dst_off], rax
697 %assign %%dst_off %%dst_off+8
698 %endif ; (%2-%%dst_off) == 8
701 %rep (%2-%%dst_off)/8
702 movq [r0+%%dst_off], mm %+ %%dmidx
703 %assign %%dst_off %%dst_off+8
704 %assign %%dmidx %%dmidx+1
705 %endrep ; (%2-%%dst_off)/8
707 %if (%2-%%dst_off) == 4
708 mov [r0+%%dst_off], vald
709 %elif (%2-%%dst_off) & 4
710 movd [r0+%%dst_off], mm %+ %%dmidx
711 %assign %%dst_off %%dst_off+4
712 %endif ; (%2-%%dst_off) ==/& 4
714 %if (%2-%%dst_off) == 1
715 mov [r0+%%dst_off], vall
716 %elif (%2-%%dst_off) == 2
717 mov [r0+%%dst_off], valw
718 %elif (%2-%%dst_off) == 3
720 mov [r0+%%dst_off], valw2
722 mov [r0+%%dst_off], valw3
724 mov [r0+%%dst_off], valw4
725 %endif ; %1 ==/!= top
726 mov [r0+%%dst_off+2], vall
727 %endif ; (%2-%%dst_off) == 1/2/3
728 %endmacro ; WRITE_NUM_BYTES
730 ; vertical top/bottom extend and body copy fast loops
731 ; these are function pointers to fixed-width line copy functions, i.e.
732 ; they read a fixed number of pixels into a fixed set of registers, and write
733 ; those out into the destination buffer
734 ; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
735 ; r6(eax/64)/r3(ebx/32)=val_reg
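; In C terms, each generated width-%%n function is roughly (sketch):
;     if (start_y) { row = read(src);
;                    do { write(dst, row); dst += linesize; } while (--start_y); }
;     do { write(dst, read(src)); dst += linesize; src += linesize; } while (--end_y);
;     if (block_h) { src -= linesize; row = read(src);
;                    do { write(dst, row); dst += linesize; } while (--block_h); }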
736 %macro VERTICAL_EXTEND 1
740 .emuedge_v_extend_ %+ %%n:
741 ; extend pixels above body
743 test r3 , r3 ; if (!start_y)
744 jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
747 je .emuedge_copy_body_ %+ %%n %+ _loop
748 %endif ; ARCH_X86_64/32
749 READ_NUM_BYTES top, %%n, %1 ; read bytes
750 .emuedge_extend_top_ %+ %%n %+ _loop: ; do {
751 WRITE_NUM_BYTES top, %%n, %1 ; write bytes
752 add r0 , r2 ; dst += linesize
757 %endif ; ARCH_X86_64/32
758 jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
761 .emuedge_copy_body_ %+ %%n %+ _loop: ; do {
762 READ_NUM_BYTES body, %%n, %1 ; read bytes
763 WRITE_NUM_BYTES body, %%n, %1 ; write bytes
764 add r0 , r2 ; dst += linesize
765 add r1 , r2 ; src += linesize
767 jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
770 test r5 , r5 ; if (!block_h)
771 jz .emuedge_v_extend_end_ %+ %%n ; goto end
772 sub r1 , r2 ; src -= linesize
773 READ_NUM_BYTES bottom, %%n, %1 ; read bytes
774 .emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
775 WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes
776 add r0 , r2 ; dst += linesize
778 jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
780 .emuedge_v_extend_end_ %+ %%n:
785 %endif ; ARCH_X86_64/32
788 %endmacro ; VERTICAL_EXTEND
790 ; left/right (horizontal) fast extend functions
791 ; these are essentially identical to the vertical extend ones above,
792 ; just left/right separated because the number of pixels to extend is
793 ; obviously not the same on both sides.
794 ; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
795 ; lowest two bytes of the register (so val*0x0101), and are splatted
796 ; into each byte of mm0 as well if n_pixels >= 8
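; In C terms this amounts to (sketch):  v = row[x];  memset(dst, v, n_pixels);
; done 8/4/2 bytes at a time by replicating v into a word (v * 0x0101) and,
; for the wide stores, into every byte of mm0.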
798 %macro READ_V_PIXEL 3
812 %macro WRITE_V_PIXEL 2
815 movq [%2+%%dst_off], mm0
816 %assign %%dst_off %%dst_off+8
820 movd [%2+%%dst_off], mm0
822 mov [%2+%%dst_off] , valw
823 mov [%2+%%dst_off+2], valw
825 %assign %%dst_off %%dst_off+4
828 mov [%2+%%dst_off], valw
832 ; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
837 .emuedge_extend_left_ %+ %%n: ; do {
838 sub r0, r2 ; dst -= linesize
839 READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels
840 WRITE_V_PIXEL %%n, r0 ; write pixels
842 jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
847 %endif ; ARCH_X86_64/32
850 %endmacro ; LEFT_EXTEND
852 ; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
853 %macro RIGHT_EXTEND 1
857 .emuedge_extend_right_ %+ %%n: ; do {
859 sub r3, r2 ; dst -= linesize
860 READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels
861 WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
864 sub r0, r2 ; dst -= linesize
865 READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels
866 WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
868 %endif ; ARCH_X86_64/32
869 jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
874 %endif ; ARCH_X86_64/32
879 %define stack_offset 0x10
881 %endmacro ; RIGHT_EXTEND
883 ; below follow the "slow" copy/extend functions. These act on a non-fixed
884 ; width specified in a register, and run a loop to copy the full amount
885 ; of bytes. They are optimized for copying large amounts of pixels per
886 ; line, so they unconditionally splat data into mm registers to copy 8
887 ; bytes per loop iteration. Using xmm registers for this on x86-64 could be
888 ; considered as well, but that has not been optimized yet (i.e. FIXME)
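; Per row this is essentially (sketch): copy w bytes front to back using the
; widest available chunks, i.e. a loop of 16-byte movdqu (or 8-byte movq for
; mmx) copies followed by 8/4/2/1-byte tail copies, which is what the
; V_COPY_NPX invocations below expand to.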
889 %macro V_COPY_NPX 4-5
915 V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
917 V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
920 V_COPY_NPX %1, rax , mov, 8
923 V_COPY_NPX %1, mm0, movq, 8
924 %endif ; ARCH_X86_64/32
926 V_COPY_NPX %1, vald, mov, 4
927 V_COPY_NPX %1, valw, mov, 2
928 V_COPY_NPX %1, vall, mov, 1
938 %macro SLOW_V_EXTEND 1
940 ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
941 ; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
943 push r11 ; save old value of block_h
946 jz .do_body_copy ; if (!start_y) goto do_body_copy
947 V_COPY_ROW top, r3, %1
951 je .do_body_copy ; if (!start_y) goto do_body_copy
952 V_COPY_ROW top, dword r3m, %1
956 V_COPY_ROW body, r4, %1
959 pop r11 ; restore old value of block_h
966 jz .skip_bottom_extend
968 V_COPY_ROW bottom, r5, %1
976 %macro SLOW_LEFT_EXTEND 1
977 .slow_left_extend_loop:
978 ; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
981 READ_V_PIXEL 8, [r0+w_reg], %1
982 .left_extend_8px_loop:
986 jle .left_extend_8px_loop
989 jge .left_extend_loop_end
990 .left_extend_2px_loop:
994 jl .left_extend_2px_loop
995 .left_extend_loop_end:
997 jnz .slow_left_extend_loop
1004 %macro SLOW_RIGHT_EXTEND 1
1005 .slow_right_extend_loop:
1006 ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
1007 ; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
1016 sub buf_reg, linesize
1017 READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
1018 .right_extend_8px_loop:
1019 movq [buf_reg+r1], mm0
1022 jge .right_extend_8px_loop
1025 je .right_extend_loop_end
1026 .right_extend_2px_loop:
1028 mov [buf_reg+r1], valw
1030 jg .right_extend_2px_loop
1031 .right_extend_loop_end:
1033 jnz .slow_right_extend_loop
1044 SLOW_RIGHT_EXTEND %1