X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libswscale%2Fx86%2Foutput.asm;h=133817cb71899b1363b20d864a7ff45087efc1b4;hb=f99195d56f4aab266926724ca1cfae822df4df16;hp=9ea4af953502b84bf117276a56e7c03660b2e9ed;hpb=fa6c7ccc20d3dc8f220af31f10a159e1b7a13b92;p=ffmpeg diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index 9ea4af95350..133817cb718 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -54,76 +54,8 @@ SECTION .text ; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple ; of 2. $offset is either 0 or 3. $dither holds 8 values. ;----------------------------------------------------------------------------- - -%macro yuv2planeX_fn 3 - -%if ARCH_X86_32 -%define cntr_reg fltsizeq -%define movsx mov -%else -%define cntr_reg r7 -%define movsx movsxd -%endif - -cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset -%if %1 == 8 || %1 == 9 || %1 == 10 - pxor m6, m6 -%endif ; %1 == 8/9/10 - -%if %1 == 8 -%if ARCH_X86_32 -%assign pad 0x2c - (stack_offset & 15) - SUB rsp, pad -%define m_dith m7 -%else ; x86-64 -%define m_dith m9 -%endif ; x86-32 - - ; create registers holding dither - movq m_dith, [ditherq] ; dither - test offsetd, offsetd - jz .no_rot -%if mmsize == 16 - punpcklqdq m_dith, m_dith -%endif ; mmsize == 16 - PALIGNR m_dith, m_dith, 3, m0 -.no_rot: -%if mmsize == 16 - punpcklbw m_dith, m6 -%if ARCH_X86_64 - punpcklwd m8, m_dith, m6 - pslld m8, 12 -%else ; x86-32 - punpcklwd m5, m_dith, m6 - pslld m5, 12 -%endif ; x86-32/64 - punpckhwd m_dith, m6 - pslld m_dith, 12 -%if ARCH_X86_32 - mova [rsp+ 0], m5 - mova [rsp+16], m_dith -%endif -%else ; mmsize == 8 - punpcklbw m5, m_dith, m6 - punpckhbw m_dith, m6 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpcklwd m3, m_dith, m6 - punpckhwd m_dith, m6 - pslld m4, 12 - pslld m5, 12 - pslld m3, 12 - pslld m_dith, 12 - mova [rsp+ 0], m4 - mova [rsp+ 8], m5 - mova [rsp+16], m3 - mova [rsp+24], m_dith -%endif ; mmsize == 8/16 -%endif ; %1 == 8 - - xor r5, r5 - -.pixelloop: +%macro yuv2planeX_mainloop 2 +.pixelloop_%2: %assign %%i 0 ; the rep here is for the 8bit output mmx case, where dither covers ; 8 pixels but we can only handle 2 pixels per register, and thus 4 @@ -150,7 +82,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset mova m2, m1 %endif ; %1 == 8/9/10/16 movsx cntr_reg, fltsizem -.filterloop_ %+ %%i: +.filterloop_%2_ %+ %%i: ; input pixels mov r6, [srcq+gprsize*cntr_reg-2*gprsize] %if %1 == 16 @@ -197,7 +129,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %endif ; %1 == 8/9/10/16 sub cntr_reg, 2 - jg .filterloop_ %+ %%i + jg .filterloop_%2_ %+ %%i %if %1 == 16 psrad m2, 31 - %1 @@ -224,7 +156,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %endif ; mmxext/sse2/sse4/avx pminsw m2, [yuv2yuvX_%1_upper] %endif ; %1 == 9/10/16 - mova [dstq+r5*2], m2 + mov%2 [dstq+r5*2], m2 %endif ; %1 == 8/9/10/16 add r5, mmsize/2 @@ -232,7 +164,87 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %assign %%i %%i+2 %endrep - jg .pixelloop + jg .pixelloop_%2 +%endmacro + +%macro yuv2planeX_fn 3 + +%if ARCH_X86_32 +%define cntr_reg fltsizeq +%define movsx mov +%else +%define cntr_reg r7 +%define movsx movsxd +%endif + +cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset +%if %1 == 8 || %1 == 9 || %1 == 10 + pxor m6, m6 +%endif ; %1 == 8/9/10 + +%if %1 == 8 +%if ARCH_X86_32 +%assign pad 0x2c - (stack_offset & 15) + SUB rsp, pad +%define m_dith m7 +%else ; x86-64 +%define m_dith m9 +%endif ; x86-32 + + ; create registers holding dither + movq m_dith, [ditherq] ; dither + test offsetd, offsetd + jz .no_rot +%if mmsize == 16 + punpcklqdq m_dith, m_dith +%endif ; mmsize == 16 + PALIGNR m_dith, m_dith, 3, m0 +.no_rot: +%if mmsize == 16 + punpcklbw m_dith, m6 +%if ARCH_X86_64 + punpcklwd m8, m_dith, m6 + pslld m8, 12 +%else ; x86-32 + punpcklwd m5, m_dith, m6 + pslld m5, 12 +%endif ; x86-32/64 + punpckhwd m_dith, m6 + pslld m_dith, 12 +%if ARCH_X86_32 + mova [rsp+ 0], m5 + mova [rsp+16], m_dith +%endif +%else ; mmsize == 8 + punpcklbw m5, m_dith, m6 + punpckhbw m_dith, m6 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m3, m_dith, m6 + punpckhwd m_dith, m6 + pslld m4, 12 + pslld m5, 12 + pslld m3, 12 + pslld m_dith, 12 + mova [rsp+ 0], m4 + mova [rsp+ 8], m5 + mova [rsp+16], m3 + mova [rsp+24], m_dith +%endif ; mmsize == 8/16 +%endif ; %1 == 8 + + xor r5, r5 + +%if mmsize == 8 || %1 == 8 + yuv2planeX_mainloop %1, a +%else ; mmsize == 16 + test dstq, 15 + jnz .unaligned + yuv2planeX_mainloop %1, a + REP_RET +.unaligned: + yuv2planeX_mainloop %1, u +%endif ; mmsize == 8/16 %if %1 == 8 %if ARCH_X86_32