X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fx86%2Fvf_hflip.asm;h=285618954f219d785d1e1ace7d14dc3fe920815c;hb=6d7c63588c81ba61b75701702b8680bd0063f36c;hp=89d6d0c69bb1f70ed028a02e255351580b2f754c;hpb=374f818bfbc5f7ad3a88f6a17770abb14abec4d1;p=ffmpeg

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 89d6d0c69bb..285618954f2 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -29,11 +29,16 @@ pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
 
 SECTION .text
 
-INIT_XMM ssse3
-cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x
-    mova    m0, [pb_flip_byte]
-    xor     xq, xq
+;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
+%macro HFLIP 3
+cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+    VBROADCASTI128    m0, [pb_flip_%1]
+    xor       xq, xq
+%if %3 == 1
     movsxdifnidn wq, wd
+%else ; short
+    add       wd, wd
+%endif
     mov     rq, wq
     and     rq, 2 * mmsize - 1
     cmp     wq, 2 * mmsize
@@ -42,8 +47,13 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
-        movu    m1, [srcq + xq - mmsize + 1]
-        movu    m2, [srcq + xq - 2 * mmsize + 1]
+%if mmsize == 32
+        vpermq  m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
+        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%else
+        movu    m1, [srcq + xq - mmsize + %3]
+        movu    m2, [srcq + xq - 2 * mmsize + %3]
+%endif
         pshufb  m1, m0
         pshufb  m2, m0
         neg     xq
@@ -53,55 +63,28 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x
         cmp     xq, wq
         jl .loop0
 
-    cmp     rq, 0
-    je      .end
-    add     wq, rq
+    cmp   rq, 0
+    je    .end
+    add   wq, rq
 
     .loop1:
         neg    xq
-        mov    rb, [srcq + xq]
+        mov    r%2, [srcq + xq]
         neg    xq
-        mov    [dstq + xq], rb
-        add    xq, 1
+        mov    [dstq + xq], r%2
+        add    xq, %3
        cmp    xq, wq
        jl .loop1
    .end:
-RET
-
-cglobal hflip_short, 3, 5, 3, src, dst, w, r, x
-    mova    m0, [pb_flip_short]
-    xor     xq, xq
-    add     wd, wd
-    mov     rq, wq
-    and     rq, 2 * mmsize - 1
-    cmp     wq, 2 * mmsize
-    jl .loop1
-    sub     wq, rq
+        RET
+%endmacro
 
-    .loop0:
-        neg     xq
-        movu    m1, [srcq + xq - mmsize + 2]
-        movu    m2, [srcq + xq - 2 * mmsize + 2]
-        pshufb  m1, m0
-        pshufb  m2, m0
-        neg     xq
-        movu    [dstq + xq         ], m1
-        movu    [dstq + xq + mmsize], m2
-        add     xq, mmsize * 2
-        cmp     xq, wq
-        jl .loop0
-
-    cmp     rq, 0
-    je      .end
-    add     wq, rq
+INIT_XMM ssse3
+HFLIP byte, b, 1
+HFLIP short, w, 2
 
-    .loop1:
-        neg    xq
-        mov    rw, [srcq + xq]
-        neg    xq
-        mov    [dstq + xq], rw
-        add    xq, 2
-        cmp    xq, wq
-        jl .loop1
-    .end:
-RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
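
For context, the routines patched above implement a simple per-row mirror: the caller passes src pointing at the last sample of the input row and dst pointing at the first sample of the output row, so every output sample x receives src[-x]. The sketch below is an illustrative scalar reference only, not part of the patch; the _ref function names are mine (FFmpeg keeps its own C fallbacks in libavfilter/vf_hflip.c), but the loop body matches what the assembly's byte-at-a-time .loop1 tail does.

    #include <stdint.h>

    /* Scalar reference for hflip_byte/hflip_short above (illustrative only).
     * src points at the rightmost sample of the input row, dst at the
     * leftmost sample of the output row, w is the number of samples. */
    static void hflip_byte_ref(const uint8_t *src, uint8_t *dst, int w)
    {
        for (int x = 0; x < w; x++)
            dst[x] = src[-x];   /* what .loop1 does one byte at a time */
    }

    static void hflip_short_ref(const uint16_t *src, uint16_t *dst, int w)
    {
        for (int x = 0; x < w; x++)
            dst[x] = src[-x];   /* same pattern with 2-byte samples */
    }

The vectorized .loop0 does the same copy 2 * mmsize bytes per iteration: it loads a block ending at src[-x], reverses it with pshufb against the flip mask, and stores it at dst[x], leaving the remaining w mod (2 * mmsize) bytes to .loop1. On AVX2, pshufb only shuffles within each 128-bit lane, which is why the patch switches the mask load to VBROADCASTI128 (same 16-byte mask in both lanes) and adds vpermq ..., 0x4e to swap the two lanes at load time; the lane swap plus the in-lane byte reversal flips the full 32-byte register.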