X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;ds=sidebyside;f=libavfilter%2Fx86%2Fvf_blend.asm;h=680e266348a90524ddf5aa4c44ac38f23452778d;hb=3a230ce5fa10b21312236b362df9eeddd99e7ac2;hp=4916aaf251e014cba52fc5727e847045a4838519;hpb=d204b7ff610cb8dac1116e7a4fd3068733cb5adf;p=ffmpeg diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm index 4916aaf251e..680e266348a 100644 --- a/libavfilter/x86/vf_blend.asm +++ b/libavfilter/x86/vf_blend.asm @@ -2,6 +2,8 @@ ;* x86-optimized functions for blend filter ;* ;* Copyright (C) 2015 Paul B Mahol +;* Copyright (C) 2018 Henrik Gramner +;* Copyright (C) 2018 Jokyo Images ;* ;* This file is part of FFmpeg. ;* @@ -74,39 +76,36 @@ BLEND_INIT %1, 2 BLEND_END %endmacro -INIT_XMM sse2 -BLEND_SIMPLE xor, xor -BLEND_SIMPLE or, or -BLEND_SIMPLE and, and -BLEND_SIMPLE addition, addusb -BLEND_SIMPLE subtract, subusb -BLEND_SIMPLE darken, minub -BLEND_SIMPLE lighten, maxub - -BLEND_INIT grainextract, 4 - pxor m2, m2 - mova m3, [pw_128] +%macro GRAINEXTRACT 0 +BLEND_INIT grainextract, 6 + pxor m4, m4 + VBROADCASTI128 m5, [pw_128] .nextrow: mov xq, widthq - .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m3 - psubw m0, m1 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + paddw m0, m5 + paddw m1, m5 + psubw m0, m2 + psubw m1, m3 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro %macro MULTIPLY 3 ; a, b, pw_1 pmullw %1, %2 ; xxxxxxxx a * b paddw %1, %3 - mova %2, %1 - psrlw %2, 8 + psrlw %2, %1, 8 paddw %1, %2 psrlw %1, 8 ; 00xx00xx a * b / 255 %endmacro @@ -118,92 +117,112 @@ BLEND_END pxor %1, %4 ; 00xx00xx 255 - x / 255 %endmacro -BLEND_INIT multiply, 4 - pxor m2, m2 - mova m3, [pw_1] +%macro BLEND_MULTIPLY 0 +BLEND_INIT multiply, 6 + pxor m4, m4 + VBROADCASTI128 m5, [pw_1] .nextrow: mov xq, widthq .loop: - ; word - ; |--| - movh m0, [topq + xq] ; 0000xxxx - movh m1, [bottomq + xq] - punpcklbw m0, m2 ; 00xx00xx - punpcklbw m1, m2 - - MULTIPLY m0, m1, m3 - - packuswb m0, m0 ; 0000xxxx - movh [dstq + xq], m0 - add xq, mmsize / 2 - + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + MULTIPLY m0, m2, m5 + MULTIPLY m1, m3, m5 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro -BLEND_INIT screen, 5 - pxor m2, m2 - mova m3, [pw_1] - mova m4, [pw_255] +%macro BLEND_SCREEN 0 +BLEND_INIT screen, 7 + pxor m4, m4 + + VBROADCASTI128 m5, [pw_1] + VBROADCASTI128 m6, [pw_255] .nextrow: mov xq, widthq .loop: - movh m0, [topq + xq] ; 0000xxxx - movh m1, [bottomq + xq] - punpcklbw m0, m2 ; 00xx00xx - punpcklbw m1, m2 - - SCREEN m0, m1, m3, m4 - - packuswb m0, m0 ; 0000xxxx - movh [dstq + xq], m0 - add xq, mmsize / 2 - + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + SCREEN m0, m2, m5, m6 + SCREEN m1, m3, m5, m6 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro +%macro AVERAGE 0 BLEND_INIT average, 3 - pxor m2, m2 + pcmpeqb m2, m2 + .nextrow: mov xq, widthq - .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m1 - psrlw m0, 1 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 +.loop: + movu m0, [topq + xq] + movu m1, [bottomq + xq] + pxor m0, m2 + pxor m1, m2 + pavgb m0, m1 + pxor m0, m2 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro -BLEND_INIT grainmerge, 4 - pxor m2, m2 - mova m3, [pw_128] + +%macro GRAINMERGE 0 +BLEND_INIT grainmerge, 6 + pxor m4, m4 + + VBROADCASTI128 m5, [pw_128] .nextrow: mov xq, widthq .loop: - movh m0, [topq + xq] - movh m1, [bottomq + xq] - punpcklbw m0, m2 - punpcklbw m1, m2 - paddw m0, m1 - psubw m0, m3 - packuswb m0, m0 - movh [dstq + xq], m0 - add xq, mmsize / 2 + movu m1, [topq + xq] + movu m3, [bottomq + xq] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + paddw m0, m2 + paddw m1, m3 + psubw m0, m5 + psubw m1, m5 + + packuswb m0, m1 + mova [dstq + xq], m0 + add xq, mmsize jl .loop BLEND_END +%endmacro +%macro HARDMIX 0 BLEND_INIT hardmix, 5 - mova m2, [pb_255] - mova m3, [pb_128] - mova m4, [pb_127] + VBROADCASTI128 m2, [pb_255] + VBROADCASTI128 m3, [pb_128] + VBROADCASTI128 m4, [pb_127] .nextrow: mov xq, widthq @@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5 add xq, mmsize jl .loop BLEND_END +%endmacro +%macro DIVIDE 0 BLEND_INIT divide, 4 pxor m2, m2 mova m3, [ps_255] @@ -247,9 +268,11 @@ BLEND_INIT divide, 4 jl .loop BLEND_END +%endmacro +%macro PHOENIX 0 BLEND_INIT phoenix, 4 - mova m3, [pb_255] + VBROADCASTI128 m3, [pb_255] .nextrow: mov xq, widthq @@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4 add xq, mmsize jl .loop BLEND_END +%endmacro %macro BLEND_ABS 0 BLEND_INIT difference, 5 @@ -291,7 +315,7 @@ BLEND_END BLEND_INIT extremity, 8 pxor m2, m2 - mova m4, [pw_255] + VBROADCASTI128 m4, [pw_255] .nextrow: mov xq, widthq @@ -315,7 +339,7 @@ BLEND_END BLEND_INIT negation, 8 pxor m2, m2 - mova m4, [pw_255] + VBROADCASTI128 m4, [pw_255] .nextrow: mov xq, widthq @@ -341,6 +365,43 @@ BLEND_END %endmacro INIT_XMM sse2 +BLEND_SIMPLE xor, xor +BLEND_SIMPLE or, or +BLEND_SIMPLE and, and +BLEND_SIMPLE addition, addusb +BLEND_SIMPLE subtract, subusb +BLEND_SIMPLE darken, minub +BLEND_SIMPLE lighten, maxub +GRAINEXTRACT +BLEND_MULTIPLY +BLEND_SCREEN +AVERAGE +GRAINMERGE +HARDMIX +PHOENIX +DIVIDE + BLEND_ABS + INIT_XMM ssse3 BLEND_ABS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +BLEND_SIMPLE xor, xor +BLEND_SIMPLE or, or +BLEND_SIMPLE and, and +BLEND_SIMPLE addition, addusb +BLEND_SIMPLE subtract, subusb +BLEND_SIMPLE darken, minub +BLEND_SIMPLE lighten, maxub +GRAINEXTRACT +BLEND_MULTIPLY +BLEND_SCREEN +AVERAGE +GRAINMERGE +HARDMIX +PHOENIX + +BLEND_ABS +%endif