;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* This file is part of FFmpeg.
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
pb_zzzzzzzz77777777: times 8 db -1
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
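; Scalar sketch of what the SIMD loop below computes (illustrative only, not
; the exact FFmpeg C reference; the exact point where the shift is applied is
; an assumption here):
;     int32_t res = 0;
;     for (int i = 0; i < order; i++)
;         res += v1[i] * v2[i];
;     return res >> shift;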
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
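; Scalar sketch of the madd variant below (illustrative; the real C reference
; may differ in details): it returns the dot product of v1 and v2 while also
; updating v1 in place:
;     int32_t res = 0;
;     for (int i = 0; i < order; i++) {
;         res   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return res;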
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

%macro SCALARPRODUCT_LOOP 1
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10

; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
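; Scalar sketch of the median predictor implemented below (illustrative;
; mid_pred() stands for the median of its three arguments):
;     l = *left; lt = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = mid_pred(l, top[i], l + top[i] - lt) + diff[i];
;         lt     = top[i];
;         dst[i] = l;
;     }
;     *left = l; *left_top = lt;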
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movd    mm4, [left_topq]
    psubb   mm0, mm4 ; t-tl
    psubb   mm0, mm4 ; t-tl
    paddb   mm4, mm3 ; t-tl+l
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
    movzx   r2d, byte [dstq-1]
    movzx   r2d, byte [topq-1]

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    movhps  [dstq+wq+8], m0

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
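; Scalar sketch of the left prediction below (illustrative): a running prefix
; sum of src seeded with the incoming left value, with the usual 8-bit
; wraparound on store elided here:
;     int acc = left;
;     for (i = 0; i < w; i++) {
;         acc   += src[i];
;         dst[i] = acc;
;     }
;     return acc;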
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]

cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    jnz     add_hfyu_left_prediction_ssse3.skip_prologue

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
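; Scalar sketch of the dot product below (illustrative; the asm presumably
; requires len to be a multiple of the vector width):
;     float p = 0.0f;
;     for (int i = 0; i < len; i++)
;         p += v1[i] * v2[i];
;     return p;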
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; jump to the slow loop functions
; jump to the fast loop functions
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
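;
; A loose pseudo-code picture of that dispatch (reading of the code, not taken
; from the original comments; names mirror the function arguments):
;     w = end_x - start_x;
;     if (w)                fast_or_slow_v_extend(w);        // top/body/bottom
;     if (start_x)          fast_or_slow_left_extend(start_x);
;     if (end_x != block_w) fast_or_slow_right_extend(block_w - end_x);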
%macro EMU_EDGE_FUNC 1
cglobal emu_edge_core_%1, 6, 7, 1
    mov    r11, r5              ; save block_h
cglobal emu_edge_core_%1, 2, 7, 0
    mov     r5, r5m             ; block_h

    ; start with vertical extend (top/bottom) and body pixel copy
    sub  w_reg, r6m             ; w = end_x - start_x
    jg .slow_v_extend_loop
    mov     r2, r2m             ; linesize
    sal  w_reg, 7               ; w * 128
    lea    rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    lea  w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
    call w_reg                  ; fast top extend, body copy and bottom extend
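    ; Note on the computed call above (interpretation of the code, not an
    ; original comment): each fixed-width .emuedge_v_extend_N variant is laid
    ; out 128 bytes apart, so "sal w_reg, 7" turns w into a byte offset, and
    ; the lea subtracts one 128-byte stride from the first label so that
    ; base + w*128 lands on the entry point for width w.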
    ; horizontal extend (left/right)
    mov  w_reg, r6m             ; start_x
    mov     r3, r0              ; backup of buf+block_h*linesize
    mov    r0m, r0              ; backup of buf+block_h*linesize
    jg .slow_left_extend_loop
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
    lea    rax, [.emuedge_extend_left_2]
    lea  w_reg, [.emuedge_extend_left_2+w_reg]

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
    mov  w_reg, r7m             ; end_x
    mov     r1, r8m             ; block_w
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    jg .slow_right_extend_loop
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    lea    rax, [.emuedge_extend_right_2]
    lea     r1, [.emuedge_extend_right_2+r1]
%define stack_offset 0x14

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out works the same way
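;
; worked example (derived from the rules above; x86-64, %2 == 22 bytes):
; 16 bytes go into xmm0 via movdqu, 4 of the remaining 6 into mm0 via movd,
; and the last 2 into the low word of the val register; WRITE_NUM_BYTES then
; stores the same registers back out in the same order.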
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%if (%2-%%src_off) == 8
    mov     rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%rep (%2-%%src_off)/8
    movq    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov     vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov     vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov     valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
    mov     valw2, [r1+%%src_off]
    mov     valw3, [r1+%%src_off]
    mov     valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov     vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%if (%2-%%dst_off) == 8
    mov     [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%rep (%2-%%dst_off)/8
    movq    [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov     [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd    [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov     [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov     [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
    mov     [r0+%%dst_off], valw2
    mov     [r0+%%dst_off], valw3
    mov     [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov     [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
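;
; rough shape of each generated variant (illustrative only): repeat the first
; src row for the start_y rows above the body, copy the body rows one-to-one,
; then repeat the last body row for the remaining rows up to block_h.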
%macro VERTICAL_EXTEND 1
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
    test    r3 , r3                              ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop       ;   goto body
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n, %1                 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:            ; do {
    WRITE_NUM_BYTES top, %%n, %1                 ;   write bytes
    add     r0 , r2                              ;   dst += linesize
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop     ; } while (--start_y)

.emuedge_copy_body_ %+ %%n %+ _loop:             ; do {
    READ_NUM_BYTES  body, %%n, %1                ;   read bytes
    WRITE_NUM_BYTES body, %%n, %1                ;   write bytes
    add     r0 , r2                              ;   dst += linesize
    add     r1 , r2                              ;   src += linesize
    jnz .emuedge_copy_body_ %+ %%n %+ _loop      ; } while (--end_y)

    test    r5 , r5                              ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n             ;   goto end
    sub     r1 , r2                              ; src -= linesize
    READ_NUM_BYTES  bottom, %%n, %1              ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:         ; do {
    WRITE_NUM_BYTES bottom, %%n, %1              ;   write bytes
    add     r0 , r2                              ;   dst += linesize
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop  ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%endif ; ARCH_X86_64/32
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
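;
; illustrative example: for a left extend, READ_V_PIXEL loads the edge byte,
; forms val*0x0101 and (for n_pixels >= 8) splats it across mm0, so that
; WRITE_V_PIXEL can store 8 identical bytes per movq plus 4/2-byte tails;
; roughly a memset(row, edge_pixel, n_pixels) per output line.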
%macro READ_V_PIXEL 3

%macro WRITE_V_PIXEL 2
    movq    [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
    movd    [%2+%%dst_off], mm0
    mov     [%2+%%dst_off]  , valw
    mov     [%2+%%dst_off+2], valw
%assign %%dst_off %%dst_off+4
    mov     [%2+%%dst_off], valw

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
.emuedge_extend_left_ %+ %%n:            ; do {
    sub r0, r2                           ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1], %1       ;   read pixels
    WRITE_V_PIXEL %%n, r0                ;   write pixels
    jnz .emuedge_extend_left_ %+ %%n     ; } while (--block_h)
%endif ; ARCH_X86_64/32
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
.emuedge_extend_right_ %+ %%n:           ; do {
    sub r3, r2                           ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1  ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n         ;   write pixels
    sub r0, r2                           ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1  ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n         ;   write pixels
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n    ; } while (--block_h)
%endif ; ARCH_X86_64/32
%define stack_offset 0x10
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
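;
; loose picture of one slow-path row (illustrative, inferred from the
; V_COPY_NPX invocations below): copy as many 16- or 8-byte chunks as fit
; (the 0xFFFFFFF0/0xFFFFFFF8 masks round the width down), then finish the
; remaining 4/2/1 bytes through vald/valw/vall.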
%macro V_COPY_NPX 4-5
    V_COPY_NPX %1,  mm0,   movq,  8, 0xFFFFFFF8
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
    V_COPY_NPX %1,  rax,    mov,  8
    V_COPY_NPX %1,  mm0,   movq,  8
%endif ; ARCH_X86_64/32
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1

%macro SLOW_V_EXTEND 1
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
    push   r11                   ; save old value of block_h
    jz .do_body_copy             ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
    je .do_body_copy             ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
    V_COPY_ROW body, r4, %1
    pop    r11                   ; restore old value of block_h
    jz .skip_bottom_extend
    V_COPY_ROW bottom, r5, %1

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    jle .left_extend_8px_loop
    jge .left_extend_loop_end
.left_extend_2px_loop:
    jl .left_extend_2px_loop
.left_extend_loop_end:
    jnz .slow_left_extend_loop

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq   [buf_reg+r1], mm0
    jge .right_extend_8px_loop
    je .right_extend_loop_end
.right_extend_2px_loop:
    mov    [buf_reg+r1], valw
    jg .right_extend_2px_loop
.right_extend_loop_end:
    jnz .slow_right_extend_loop