X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fx86%2Fdsputil_yasm.asm;h=09940d147d71395cc253232fa282fdc17bd1da5b;hb=5d42ac7ffb5471666136402a553454caf3a3c989;hp=96080be34961442e90f6c56e48faea8160a32c79;hpb=b1159ad92818cd8f0885d252b0800f5960fe7241;p=ffmpeg

diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 96080be3496..09940d147d7 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -2,24 +2,25 @@
 ;* MMX optimized DSP utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of FFmpeg.
+;* This file is part of Libav.
 ;*
-;* FFmpeg is free software; you can redistribute it and/or
+;* Libav is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* FFmpeg is distributed in the hope that it will be useful,
+;* Libav is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
 pb_f: times 16 db 15
@@ -27,77 +28,11 @@ pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+pd_16384: times 4 dd 16384
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
-section .text align=16
-
-%macro PSWAPD_SSE 2
-    pshufw %1, %2, 0x4e
-%endmacro
-%macro PSWAPD_3DN1 2
-    movq %1, %2
-    psrlq %1, 32
-    punpckldq %1, %2
-%endmacro
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
-; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
-    %define lend r10d
-    mov lend, r2d
-%else
-    %define lend dword r2m
-%endif
-    mov src1q, [srcq+1*gprsize]
-    mov src2q, [srcq+2*gprsize]
-    mov src3q, [srcq+3*gprsize]
-    mov src4q, [srcq+4*gprsize]
-    mov src5q, [srcq+5*gprsize]
-    mov srcq, [srcq]
-    sub src1q, srcq
-    sub src2q, srcq
-    sub src3q, srcq
-    sub src4q, srcq
-    sub src5q, srcq
-.loop:
-    cvtps2pi mm0, [srcq]
-    cvtps2pi mm1, [srcq+src1q]
-    cvtps2pi mm2, [srcq+src2q]
-    cvtps2pi mm3, [srcq+src3q]
-    cvtps2pi mm4, [srcq+src4q]
-    cvtps2pi mm5, [srcq+src5q]
-    packssdw mm0, mm3
-    packssdw mm1, mm4
-    packssdw mm2, mm5
-    pswapd mm3, mm0
-    punpcklwd mm0, mm1
-    punpckhwd mm1, mm2
-    punpcklwd mm2, mm3
-    pswapd mm3, mm0
-    punpckldq mm0, mm2
-    punpckhdq mm2, mm1
-    punpckldq mm1, mm3
-    movq [dstq   ], mm0
-    movq [dstq+16], mm2
-    movq [dstq+ 8], mm1
-    add srcq, 8
-    add dstq, 24
-    sub lend, 2
-    jg .loop
-    emms
-    RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-%define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
-%define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
-FLOAT_TO_INT16_INTERLEAVE6 3dnow -%undef pswapd -FLOAT_TO_INT16_INTERLEAVE6 3dn2 -%undef cvtps2pi - - +SECTION_TEXT %macro SCALARPRODUCT 1 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift) @@ -131,7 +66,7 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift RET ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul +cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul shl orderq, 1 movd m7, mulm %if mmsize == 16 @@ -202,12 +137,20 @@ align 16 mova m2, [v3q + orderq] mova m3, [v3q + orderq + mmsize] %endif - pmaddwd m0, [v1q + orderq] - pmaddwd m1, [v1q + orderq + mmsize] + %define t0 [v1q + orderq] + %define t1 [v1q + orderq + mmsize] +%if ARCH_X86_64 + mova m8, t0 + mova m9, t1 + %define t0 m8 + %define t1 m9 +%endif + pmaddwd m0, t0 + pmaddwd m1, t1 pmullw m2, m7 pmullw m3, m7 - paddw m2, [v1q + orderq] - paddw m3, [v1q + orderq + mmsize] + paddw m2, t0 + paddw m3, t1 paddd m6, m0 paddd m6, m1 mova [v1q + orderq], m2 @@ -219,7 +162,7 @@ align 16 %endmacro ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul +cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul shl orderq, 1 movd m7, mulm pshuflw m7, m7, 0 @@ -263,8 +206,132 @@ SCALARPRODUCT_LOOP 0 RET +;----------------------------------------------------------------------------- +; void ff_apply_window_int16(int16_t *output, const int16_t *input, +; const int16_t *window, unsigned int len) +;----------------------------------------------------------------------------- + +%macro REVERSE_WORDS_MMXEXT 1-2 + pshufw %1, %1, 0x1B +%endmacro + +%macro REVERSE_WORDS_SSE2 1-2 + pshuflw %1, %1, 0x1B + pshufhw %1, %1, 0x1B + pshufd %1, %1, 0x4E +%endmacro + +%macro REVERSE_WORDS_SSSE3 2 + pshufb %1, %2 +%endmacro + +; dst = (dst * src) >> 15 +; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back +; in from the pmullw result. +%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp + mova %3, %1 + pmulhw %1, %2 + pmullw %3, %2 + psrlw %3, 15 + psllw %1, 1 + por %1, %3 +%endmacro -; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) +; dst = ((dst * src) + (1<<14)) >> 15 +%macro MUL16FIXED_SSSE3 3 ; dst, src, unused + pmulhrsw %1, %2 +%endmacro + +%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3 +cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 + lea offset2q, [offsetq-mmsize] +%if %2 + mova m5, [pd_16384] +%elifidn %1, ssse3 + mova m5, [pb_revwords] + ALIGN 16 +%endif +.loop: +%if %2 + ; This version expands 16-bit to 32-bit, multiplies by the window, + ; adds 16384 for rounding, right shifts 15, then repacks back to words to + ; save to the output. The window is reversed for the second half. 
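+    ; In C terms each pair of outputs produced per iteration is roughly
+    ;     output[i]       = (input[i]       * window[i] + (1 << 14)) >> 15;
+    ;     output[len-1-i] = (input[len-1-i] * window[i] + (1 << 14)) >> 15;
+    ; with i walking the first half of the buffer (the sample count doubles as
+    ; a byte offset into the int16_t arrays, landing on their midpoint) and
+    ; packssdw providing signed saturation on the way back to 16 bits.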
+ mova m3, [windowq+offset2q] + mova m4, [ inputq+offset2q] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offset2q], m0 + REVERSE_WORDS m3 + mova m4, [ inputq+offsetq] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offsetq], m0 +%elif %3 + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The ssse3 version is bit-identical. + mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + pmulhrsw m1, m0 + REVERSE_WORDS m0, m5 + pmulhrsw m0, [ inputq+offsetq ] + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m0 +%else + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The mmxext and sse2 versions do not use rounding, and + ; therefore are not bit-identical to the C version. + mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + mova m2, [ inputq+offsetq ] + MUL16FIXED m1, m0, m3 + REVERSE_WORDS m0 + MUL16FIXED m2, m0, m3 + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m2 +%endif + add offsetd, mmsize + sub offset2d, mmsize + jae .loop + REP_RET +%endmacro + +INIT_MMX +%define REVERSE_WORDS REVERSE_WORDS_MMXEXT +%define MUL16FIXED MUL16FIXED_MMXEXT +APPLY_WINDOW_INT16 mmxext, 0, 0 +APPLY_WINDOW_INT16 mmxext_ba, 1, 0 +INIT_XMM +%define REVERSE_WORDS REVERSE_WORDS_SSE2 +APPLY_WINDOW_INT16 sse2, 0, 0 +APPLY_WINDOW_INT16 sse2_ba, 1, 0 +APPLY_WINDOW_INT16 ssse3_atom, 0, 1 +%define REVERSE_WORDS REVERSE_WORDS_SSSE3 +APPLY_WINDOW_INT16 ssse3, 0, 1 + + +; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top movq mm0, [topq] movq mm2, mm0 @@ -362,23 +429,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to RET %endmacro -; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) +; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) INIT_MMX cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left .skip_prologue: - mova m5, [pb_7 GLOBAL] - mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] - mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + mova m5, [pb_7] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] movd m0, leftm psllq m0, 56 ADD_HFYU_LEFT_LOOP 1 INIT_XMM cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left - mova m5, [pb_f GLOBAL] - mova m6, [pb_zzzzzzzz77777777 GLOBAL] - mova m4, [pb_zzzz3333zzzzbbbb GLOBAL] - mova m3, [pb_zz11zz55zz99zzdd GLOBAL] + mova m5, [pb_f] + mova m6, [pb_zzzzzzzz77777777] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] movd m0, leftm pslldq m0, 15 test srcq, 15 @@ -389,3 +456,848 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left .unaligned: ADD_HFYU_LEFT_LOOP 0 + +; float scalarproduct_float_sse(const float *v1, const float *v2, int len) +cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset + neg offsetq + shl offsetq, 2 + sub v1q, offsetq + sub v2q, offsetq + xorps xmm0, xmm0 + .loop: + movaps xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + js .loop + movhlps xmm1, xmm0 + 
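+    ; finish the horizontal sum of the four partial products: xmm1 holds the
+    ; two high lanes of xmm0, the addps/shufps/addss below fold everything
+    ; into a single float in the low lane of xmm0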
+    addps xmm0, xmm1
+    movss xmm1, xmm0
+    shufps xmm0, xmm0, 1
+    addss xmm0, xmm1
+%if ARCH_X86_64 == 0
+    movd r0m, xmm0
+    fld dword r0m
+%endif
+    RET
+
+; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
+;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
+;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
+;
+; The actual function itself is below. It basically wraps a very simple
+; w = end_x - start_x
+; if (w) {
+;     if (w > 22) {
+;         jump to the slow loop functions
+;     } else {
+;         jump to the fast loop functions
+;     }
+; }
+;
+; ... and then the same for left/right extend also. See below for loop
+; function implementations. Fast are fixed-width, slow is variable-width
+
+%macro EMU_EDGE_FUNC 0
+%if ARCH_X86_64
+%define w_reg r10
+cglobal emu_edge_core, 6, 7, 1
+    mov r11, r5 ; save block_h
+%else
+%define w_reg r6
+cglobal emu_edge_core, 2, 7, 0
+    mov r4, r4m ; end_y
+    mov r5, r5m ; block_h
+%endif
+
+    ; start with vertical extend (top/bottom) and body pixel copy
+    mov w_reg, r7m
+    sub w_reg, r6m ; w = end_x - start_x
+    sub r5, r4
+%if ARCH_X86_64
+    sub r4, r3
+%else
+    sub r4, dword r3m
+%endif
+    cmp w_reg, 22
+    jg .slow_v_extend_loop
+%if ARCH_X86_32
+    mov r2, r2m ; linesize
+%endif
+    sal w_reg, 7 ; w * 128
+%ifdef PIC
+    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
+    add w_reg, rax
+%else
+    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
+%endif
+    call w_reg ; fast top extend, body copy and bottom extend
+.v_extend_end:
+
+    ; horizontal extend (left/right)
+    mov w_reg, r6m ; start_x
+    sub r0, w_reg
+%if ARCH_X86_64
+    mov r3, r0 ; backup of buf+block_h*linesize
+    mov r5, r11
+%else
+    mov r0m, r0 ; backup of buf+block_h*linesize
+    mov r5, r5m
+%endif
+    test w_reg, w_reg
+    jz .right_extend
+    cmp w_reg, 22
+    jg .slow_left_extend_loop
+    mov r1, w_reg
+    dec w_reg
+    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
+    sar w_reg, 1
+    sal w_reg, 6
+    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
+    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
+%ifdef PIC
+    lea rax, [.emuedge_extend_left_2]
+    add w_reg, rax
+%else
+    lea w_reg, [.emuedge_extend_left_2+w_reg]
+%endif
+    call w_reg
+
+    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
+.right_extend:
+%if ARCH_X86_32
+    mov r0, r0m
+    mov r5, r5m
+%endif
+    mov w_reg, r7m ; end_x
+    mov r1, r8m ; block_w
+    mov r4, r1
+    sub r1, w_reg
+    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
+    cmp r1, 22
+    jg .slow_right_extend_loop
+    dec r1
+    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
+    sar r1, 1
+    sal r1, 6
+%ifdef PIC
+    lea rax, [.emuedge_extend_right_2]
+    add r1, rax
+%else
+    lea r1, [.emuedge_extend_right_2+r1]
+%endif
+    call r1
+.h_extend_end:
+    RET
+
+%if ARCH_X86_64
+%define vall  al
+%define valh  ah
+%define valw  ax
+%define valw2 r10w
+%define valw3 r3w
+%if WIN64
+%define valw4 r4w
+%else ; unix64
+%define valw4 r3w
+%endif
+%define vald eax
+%else
+%define vall  bl
+%define valh  bh
+%define valw  bx
+%define valw2 r6w
+%define valw3 valw2
+%define valw4 valw3
+%define vald ebx
+%define stack_offset 0x14
+%endif
+
+%endmacro
+
+; macro to read/write a horizontal number of pixels (%2) to/from registers
+; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
+;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
+;            - else if (%2 & 8)  fills 8 bytes into mm0
+;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
+;
- else if (%2 & 4) fills 4 bytes into mm0-1 +; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax +; (note that we're using r3 for body/bottom because it's a shorter +; opcode, and then the loop fits in 128 bytes) +; - else fills remaining bytes into rax +; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 7 == 4) fills 4 bytes into ebx +; - else if (%2 & 4) fills 4 bytes into mm0-7 +; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx +; - else fills remaining bytes into ebx +; writing data out is in the same way +%macro READ_NUM_BYTES 2 +%assign %%src_off 0 ; offset in source buffer +%assign %%smidx 0 ; mmx register idx +%assign %%sxidx 0 ; xmm register idx + +%if cpuflag(sse) +%rep %2/16 + movups xmm %+ %%sxidx, [r1+%%src_off] +%assign %%src_off %%src_off+16 +%assign %%sxidx %%sxidx+1 +%endrep ; %2/16 +%endif + +%if ARCH_X86_64 +%if (%2-%%src_off) == 8 + mov rax, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%endif ; (%2-%%src_off) == 8 +%endif ; x86-64 + +%rep (%2-%%src_off)/8 + movq mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%assign %%smidx %%smidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%src_off) == 4 + mov vald, [r1+%%src_off] +%elif (%2-%%src_off) & 4 + movd mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+4 +%endif ; (%2-%%src_off) ==/& 4 + +%if (%2-%%src_off) == 1 + mov vall, [r1+%%src_off] +%elif (%2-%%src_off) == 2 + mov valw, [r1+%%src_off] +%elif (%2-%%src_off) == 3 +%ifidn %1, top + mov valw2, [r1+%%src_off] +%elifidn %1, body + mov valw3, [r1+%%src_off] +%elifidn %1, bottom + mov valw4, [r1+%%src_off] +%endif ; %1 ==/!= top + mov vall, [r1+%%src_off+2] +%endif ; (%2-%%src_off) == 1/2/3 +%endmacro ; READ_NUM_BYTES + +%macro WRITE_NUM_BYTES 2 +%assign %%dst_off 0 ; offset in destination buffer +%assign %%dmidx 0 ; mmx register idx +%assign %%dxidx 0 ; xmm register idx + +%if cpuflag(sse) +%rep %2/16 + movups [r0+%%dst_off], xmm %+ %%dxidx +%assign %%dst_off %%dst_off+16 +%assign %%dxidx %%dxidx+1 +%endrep ; %2/16 +%endif + +%if ARCH_X86_64 +%if (%2-%%dst_off) == 8 + mov [r0+%%dst_off], rax +%assign %%dst_off %%dst_off+8 +%endif ; (%2-%%dst_off) == 8 +%endif ; x86-64 + +%rep (%2-%%dst_off)/8 + movq [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+8 +%assign %%dmidx %%dmidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%dst_off) == 4 + mov [r0+%%dst_off], vald +%elif (%2-%%dst_off) & 4 + movd [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+4 +%endif ; (%2-%%dst_off) ==/& 4 + +%if (%2-%%dst_off) == 1 + mov [r0+%%dst_off], vall +%elif (%2-%%dst_off) == 2 + mov [r0+%%dst_off], valw +%elif (%2-%%dst_off) == 3 +%ifidn %1, top + mov [r0+%%dst_off], valw2 +%elifidn %1, body + mov [r0+%%dst_off], valw3 +%elifidn %1, bottom + mov [r0+%%dst_off], valw4 +%endif ; %1 ==/!= top + mov [r0+%%dst_off+2], vall +%endif ; (%2-%%dst_off) == 1/2/3 +%endmacro ; WRITE_NUM_BYTES + +; vertical top/bottom extend and body copy fast loops +; these are function pointers to set-width line copy functions, i.e. 
+; they read a fixed number of pixels into set registers, and write +; those out into the destination buffer +; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r6(eax/64)/r3(ebx/32)=val_reg +%macro VERTICAL_EXTEND 0 +%assign %%n 1 +%rep 22 +ALIGN 128 +.emuedge_v_extend_ %+ %%n: + ; extend pixels above body +%if ARCH_X86_64 + test r3 , r3 ; if (!start_y) + jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body +%else ; ARCH_X86_32 + cmp dword r3m, 0 + je .emuedge_copy_body_ %+ %%n %+ _loop +%endif ; ARCH_X86_64/32 + READ_NUM_BYTES top, %%n ; read bytes +.emuedge_extend_top_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES top, %%n ; write bytes + add r0 , r2 ; dst += linesize +%if ARCH_X86_64 + dec r3d +%else ; ARCH_X86_32 + dec dword r3m +%endif ; ARCH_X86_64/32 + jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) + + ; copy body pixels +.emuedge_copy_body_ %+ %%n %+ _loop: ; do { + READ_NUM_BYTES body, %%n ; read bytes + WRITE_NUM_BYTES body, %%n ; write bytes + add r0 , r2 ; dst += linesize + add r1 , r2 ; src += linesize + dec r4d + jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) + + ; copy bottom pixels + test r5 , r5 ; if (!block_h) + jz .emuedge_v_extend_end_ %+ %%n ; goto end + sub r1 , r2 ; src -= linesize + READ_NUM_BYTES bottom, %%n ; read bytes +.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES bottom, %%n ; write bytes + add r0 , r2 ; dst += linesize + dec r5d + jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) + +.emuedge_v_extend_end_ %+ %%n: +%if ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+1 +%endrep +%endmacro VERTICAL_EXTEND + +; left/right (horizontal) fast extend functions +; these are essentially identical to the vertical extend ones above, +; just left/right separated because number of pixels to extend is +; obviously not the same on both sides. 
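+; in C terms, per output line, the fast paths below boil down to roughly
+;   memset(buf, buf[start_x], start_x);                   /* left extend  */
+;   memset(buf + end_x, buf[end_x - 1], block_w - end_x); /* right extend */
+; (a loose sketch of the scalar fallback's behaviour, not code from this patch)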
+; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
+; lowest two bytes of the register (so val*0x0101), and are splatted
+; into each byte of mm0 as well if n_pixels >= 8
+
+%macro READ_V_PIXEL 2
+    mov vall, %2
+    mov valh, vall
+%if %1 >= 8
+    movd mm0, vald
+%if cpuflag(mmx2)
+    pshufw mm0, mm0, 0
+%else ; mmx
+    punpcklwd mm0, mm0
+    punpckldq mm0, mm0
+%endif ; sse
+%endif ; %1 >= 8
+%endmacro
+
+%macro WRITE_V_PIXEL 2
+%assign %%dst_off 0
+%rep %1/8
+    movq [%2+%%dst_off], mm0
+%assign %%dst_off %%dst_off+8
+%endrep
+%if %1 & 4
+%if %1 >= 8
+    movd [%2+%%dst_off], mm0
+%else ; %1 < 8
+    mov [%2+%%dst_off] , valw
+    mov [%2+%%dst_off+2], valw
+%endif ; %1 >=/< 8
+%assign %%dst_off %%dst_off+4
+%endif ; %1 & 4
+%if %1&2
+    mov [%2+%%dst_off], valw
+%endif ; %1 & 2
+%endmacro
+
+; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
+%macro LEFT_EXTEND 0
+%assign %%n 2
+%rep 11
+ALIGN 64
+.emuedge_extend_left_ %+ %%n: ; do {
+    sub r0, r2 ; dst -= linesize
+    READ_V_PIXEL %%n, [r0+r1] ; read pixels
+    WRITE_V_PIXEL %%n, r0 ; write pixels
+    dec r5
+    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
+%if ARCH_X86_64
+    ret
+%else ; ARCH_X86_32
+    rep ret
+%endif ; ARCH_X86_64/32
+%assign %%n %%n+2
+%endrep
+%endmacro ; LEFT_EXTEND
+
+; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
+%macro RIGHT_EXTEND 0
+%assign %%n 2
+%rep 11
+ALIGN 64
+.emuedge_extend_right_ %+ %%n: ; do {
+%if ARCH_X86_64
+    sub r3, r2 ; dst -= linesize
+    READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
+    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
+    dec r11
+%else ; ARCH_X86_32
+    sub r0, r2 ; dst -= linesize
+    READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
+    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
+    dec r5
+%endif ; ARCH_X86_64/32
+    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
+%if ARCH_X86_64
+    ret
+%else ; ARCH_X86_32
+    rep ret
+%endif ; ARCH_X86_64/32
+%assign %%n %%n+2
+%endrep
+
+%if ARCH_X86_32
+%define stack_offset 0x10
+%endif
+%endmacro ; RIGHT_EXTEND
+
+; below follow the "slow" copy/extend functions, these act on a non-fixed
+; width specified in a register, and run a loop to copy the full amount
+; of bytes. They are optimized for copying of large amounts of pixels per
+; line, so they unconditionally splat data into mm registers to copy 8
+; bytes per loop iteration. It could be considered to use xmm for x86-64
+; also, but I haven't optimized this as much (i.e.
FIXME) +%macro V_COPY_NPX 4-5 +%if %0 == 4 + test w_reg, %4 + jz .%1_skip_%4_px +%else ; %0 == 5 +.%1_%4_px_loop: +%endif + %3 %2, [r1+cnt_reg] + %3 [r0+cnt_reg], %2 + add cnt_reg, %4 +%if %0 == 5 + sub w_reg, %4 + test w_reg, %5 + jnz .%1_%4_px_loop +%endif +.%1_skip_%4_px: +%endmacro + +%macro V_COPY_ROW 2 +%ifidn %1, bottom + sub r1, linesize +%endif +.%1_copy_loop: + xor cnt_reg, cnt_reg +%if notcpuflag(sse) +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 +%else ; sse + V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 +%if ARCH_X86_64 +%define linesize r2 + V_COPY_NPX %1, rax , mov, 8 +%else ; ARCH_X86_32 +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8 +%endif ; ARCH_X86_64/32 +%endif ; sse + V_COPY_NPX %1, vald, mov, 4 + V_COPY_NPX %1, valw, mov, 2 + V_COPY_NPX %1, vall, mov, 1 + mov w_reg, cnt_reg +%ifidn %1, body + add r1, linesize +%endif + add r0, linesize + dec %2 + jnz .%1_copy_loop +%endmacro + +%macro SLOW_V_EXTEND 0 +.slow_v_extend_loop: +; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x +%if ARCH_X86_64 + push r11 ; save old value of block_h + test r3, r3 +%define cnt_reg r11 + jz .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, r3 +%else + cmp dword r3m, 0 +%define cnt_reg r2 + je .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, dword r3m +%endif + +.do_body_copy: + V_COPY_ROW body, r4 + +%if ARCH_X86_64 + pop r11 ; restore old value of block_h +%define cnt_reg r3 +%endif + test r5, r5 +%if ARCH_X86_64 + jz .v_extend_end +%else + jz .skip_bottom_extend +%endif + V_COPY_ROW bottom, r5 +%if ARCH_X86_32 +.skip_bottom_extend: + mov r2, r2m +%endif + jmp .v_extend_end +%endmacro + +%macro SLOW_LEFT_EXTEND 0 +.slow_left_extend_loop: +; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x + mov r4, 8 + sub r0, linesize + READ_V_PIXEL 8, [r0+w_reg] +.left_extend_8px_loop: + movq [r0+r4-8], mm0 + add r4, 8 + cmp r4, w_reg + jle .left_extend_8px_loop + sub r4, 8 + cmp r4, w_reg + jge .left_extend_loop_end +.left_extend_2px_loop: + mov [r0+r4], valw + add r4, 2 + cmp r4, w_reg + jl .left_extend_2px_loop +.left_extend_loop_end: + dec r5 + jnz .slow_left_extend_loop +%if ARCH_X86_32 + mov r2, r2m +%endif + jmp .right_extend +%endmacro + +%macro SLOW_RIGHT_EXTEND 0 +.slow_right_extend_loop: +; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h, +; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr +%if ARCH_X86_64 +%define buf_reg r3 +%define bh_reg r11 +%else +%define buf_reg r0 +%define bh_reg r5 +%endif + lea r1, [r4-8] + sub buf_reg, linesize + READ_V_PIXEL 8, [buf_reg+w_reg-1] +.right_extend_8px_loop: + movq [buf_reg+r1], mm0 + sub r1, 8 + cmp r1, w_reg + jge .right_extend_8px_loop + add r1, 8 + cmp r1, w_reg + je .right_extend_loop_end +.right_extend_2px_loop: + sub r1, 2 + mov [buf_reg+r1], valw + cmp r1, w_reg + jg .right_extend_2px_loop +.right_extend_loop_end: + dec bh_reg + jnz .slow_right_extend_loop + jmp .h_extend_end +%endmacro + +%macro emu_edge 1 +INIT_XMM %1 +EMU_EDGE_FUNC +VERTICAL_EXTEND +LEFT_EXTEND +RIGHT_EXTEND +SLOW_V_EXTEND +SLOW_LEFT_EXTEND +SLOW_RIGHT_EXTEND +%endmacro + +emu_edge sse +%if ARCH_X86_32 +emu_edge mmx +%endif + +;----------------------------------------------------------------------------- +; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, +; int32_t max, unsigned int len) 
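+;
+; Scalar equivalent (roughly): for (i = 0; i < len; i++)
+;     dst[i] = av_clip(src[i], min, max);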
+;----------------------------------------------------------------------------- + +; %1 = number of xmm registers used +; %2 = number of inline load/process/store loops per asm loop +; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop +; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) +; %5 = suffix +%macro VECTOR_CLIP_INT32 4-5 +cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len +%if %4 + cvtsi2ss m4, minm + cvtsi2ss m5, maxm +%else + movd m4, minm + movd m5, maxm +%endif + SPLATD m4 + SPLATD m5 +.loop: +%assign %%i 1 +%rep %2 + mova m0, [srcq+mmsize*0*%%i] + mova m1, [srcq+mmsize*1*%%i] + mova m2, [srcq+mmsize*2*%%i] + mova m3, [srcq+mmsize*3*%%i] +%if %3 + mova m7, [srcq+mmsize*4*%%i] + mova m8, [srcq+mmsize*5*%%i] + mova m9, [srcq+mmsize*6*%%i] + mova m10, [srcq+mmsize*7*%%i] +%endif + CLIPD m0, m4, m5, m6 + CLIPD m1, m4, m5, m6 + CLIPD m2, m4, m5, m6 + CLIPD m3, m4, m5, m6 +%if %3 + CLIPD m7, m4, m5, m6 + CLIPD m8, m4, m5, m6 + CLIPD m9, m4, m5, m6 + CLIPD m10, m4, m5, m6 +%endif + mova [dstq+mmsize*0*%%i], m0 + mova [dstq+mmsize*1*%%i], m1 + mova [dstq+mmsize*2*%%i], m2 + mova [dstq+mmsize*3*%%i], m3 +%if %3 + mova [dstq+mmsize*4*%%i], m7 + mova [dstq+mmsize*5*%%i], m8 + mova [dstq+mmsize*6*%%i], m9 + mova [dstq+mmsize*7*%%i], m10 +%endif +%assign %%i %%i+1 +%endrep + add srcq, mmsize*4*(%2+%3) + add dstq, mmsize*4*(%2+%3) + sub lend, mmsize*(%2+%3) + jg .loop + REP_RET +%endmacro + +INIT_MMX mmx +%define SPLATD SPLATD_MMX +%define CLIPD CLIPD_MMX +VECTOR_CLIP_INT32 0, 1, 0, 0 +INIT_XMM sse2 +%define SPLATD SPLATD_SSE2 +VECTOR_CLIP_INT32 6, 1, 0, 0, _int +%define CLIPD CLIPD_SSE2 +VECTOR_CLIP_INT32 6, 2, 0, 1 +INIT_XMM sse4 +%define CLIPD CLIPD_SSE41 +%ifdef m8 +VECTOR_CLIP_INT32 11, 1, 1, 0 +%else +VECTOR_CLIP_INT32 6, 1, 0, 0 +%endif + +;----------------------------------------------------------------------------- +; void ff_butterflies_float_interleave(float *dst, const float *src0, +; const float *src1, int len); +;----------------------------------------------------------------------------- + +%macro BUTTERFLIES_FLOAT_INTERLEAVE 0 +cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len +%if ARCH_X86_64 + movsxd lenq, lend +%endif + test lenq, lenq + jz .end + shl lenq, 2 + lea src0q, [src0q + lenq] + lea src1q, [src1q + lenq] + lea dstq, [ dstq + 2*lenq] + neg lenq +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + subps m2, m0, m1 + addps m0, m0, m1 + unpcklps m1, m0, m2 + unpckhps m0, m0, m2 +%if cpuflag(avx) + vextractf128 [dstq + 2*lenq ], m1, 0 + vextractf128 [dstq + 2*lenq + 16], m0, 0 + vextractf128 [dstq + 2*lenq + 32], m1, 1 + vextractf128 [dstq + 2*lenq + 48], m0, 1 +%else + mova [dstq + 2*lenq ], m1 + mova [dstq + 2*lenq + mmsize], m0 +%endif + add lenq, mmsize + jl .loop +%if mmsize == 32 + vzeroupper + RET +%endif +.end: + REP_RET +%endmacro + +INIT_XMM sse +BUTTERFLIES_FLOAT_INTERLEAVE +INIT_YMM avx +BUTTERFLIES_FLOAT_INTERLEAVE + +INIT_XMM sse2 +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSE2 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshuflw m0, m0, 10110001b + pshuflw m1, m1, 10110001b + pshufhw m0, m0, 10110001b + pshufhw m1, m1, 10110001b + mova m2, m0 + mova m3, m1 + psllw m0, 8 + psllw m1, 8 + psrlw m2, 8 + psrlw m3, 8 + por m2, m0 + por m3, m1 + mova [r0 + 0], m2 + mova [r0 + 16], m3 + add r1, 32 + add r0, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left + mov%1 m0, [r1] + pshuflw m0, m0, 10110001b + 
pshufhw m0, m0, 10110001b + mova m2, m0 + psllw m0, 8 + psrlw m2, 8 + por m2, m0 + mova [r0], m2 + add r1, 16 + add r0, 16 +%endmacro + +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,5 + mov r3, r1 + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSE2 u + jmp .left +.start_align: + BSWAP_LOOPS_SSE2 a +.left: + and r2, 3 + jz .end +.loop2: + mov r3d, [r1] + bswap r3d + mov [r0], r3d + add r1, 4 + add r0, 4 + dec r2 + jnz .loop2 +.end + RET + +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSSE3 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshufb m0, m2 + pshufb m1, m2 + mova [r0 + 0], m0 + mova [r0 + 16], m1 + add r0, 32 + add r1, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left2 + mov%1 m0, [r1] + pshufb m0, m2 + mova [r0], m0 + add r1, 16 + add r0, 16 +%endmacro + +INIT_XMM ssse3 +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,3 + mov r3, r1 + mova m2, [pb_bswap32] + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSSE3 u + jmp .left2 +.start_align: + BSWAP_LOOPS_SSSE3 a +.left2: + mov r3, r2 + and r2, 2 + jz .left1 + movq m0, [r1] + pshufb m0, m2 + movq [r0], m0 + add r1, 8 + add r0, 8 +.left1: + and r3, 1 + jz .end + mov r2d, [r1] + bswap r2d + mov [r0], r2d +.end: + RET
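
For reference, a scalar C sketch of what the two bswap32_buf kernels above
compute (illustrative only, not taken from this patch; the helper name is
made up):

    #include <stdint.h>

    /* Byte-swap w 32-bit words from src into dst. The SSE2/SSSE3 loops above
     * do the same thing eight words at a time, then handle the remainder in
     * progressively smaller steps. */
    static void bswap32_buf_ref(uint32_t *dst, const uint32_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++) {
            uint32_t x = src[i];
            dst[i] = (x >> 24) | ((x >> 8) & 0x0000ff00u) |
                     ((x << 8) & 0x00ff0000u) | (x << 24);
        }
    }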