X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a2.asm;h=c58aba4f1d1cff9e73e2e38856b049a4a811dfbc;hb=c82c7374938f4342971adf8b2495c3a1bbe621c4;hp=00fcc8f84f498dee91c2d9ad92ff8cab836c68c8;hpb=aa47955a0ec65218e8bb967d36689069baca5fd1;p=x264 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 00fcc8f8..c58aba4f 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2005-2012 x264 project +;* Copyright (C) 2005-2016 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser @@ -30,17 +30,44 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -filt_mul20: times 16 db 20 -filt_mul15: times 8 db 1, -5 -filt_mul51: times 8 db -5, 1 -hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 -deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 +pw_1024: times 16 dw 1024 +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 + +%if HIGH_BIT_DEPTH +copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +v210_mask: times 4 dq 0xc00ffc003ff003ff +v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 +v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 +; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register +v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 + dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 + +deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 +deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 +%else +copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14 +deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 + db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 + +deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 +deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 +%endif ; !HIGH_BIT_DEPTH + +mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6 + db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14 +mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14 + +pf_256: times 4 dd 256.0 +pf_inv256: times 4 dd 0.00390625 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff -pf_inv256: times 8 dd 0.00390625 pad10: times 8 dw 10*PIXEL_MAX pad20: times 8 dw 20*PIXEL_MAX @@ -51,15 +78,22 @@ tap1: times 4 dw 1, -5 tap2: times 4 dw 20, 20 tap3: times 4 dw -5, 1 +pw_0xc000: times 8 dw 0xc000 +pw_31: times 8 dw 31 +pd_4: times 4 dd 4 + SECTION .text cextern pb_0 cextern pw_1 +cextern pw_8 cextern pw_16 cextern pw_32 +cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max +cextern pw_0to15 cextern pd_ffff %macro LOAD_ADD 4 @@ -120,19 +154,24 @@ cextern pd_ffff paddw %4, %6 %endmacro -%macro FILT_PACK 4-6 b - paddw %1, %4 - paddw %2, %4 -%if %0 == 6 - psubusw %1, %6 - psubusw %2, %6 - psrlw %1, %3 - psrlw %2, %3 +%macro FILT_PACK 3-5 +%if cpuflag(ssse3) + pmulhrsw %1, %3 + pmulhrsw %2, %3 +%else + paddw %1, %3 + paddw %2, %3 +%if %0 == 5 + psubusw %1, %5 + psubusw %2, %5 + psrlw %1, %4 + psrlw %2, %4 %else - psraw %1, %3 - psraw %2, %3 + psraw %1, %4 + psraw %2, %4 +%endif %endif -%ifnidn w, %5 +%if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif 
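; With the pw_1024/pw_512 constants passed in by the SSSE3 callers, pmulhrsw computes
; (x*(1<<(15-s)) + 0x4000) >> 15, which equals the rounded shift (x + (1<<(s-1))) >> s
; performed by the paddw+psraw fallback, saving one instruction per register.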
%endmacro @@ -148,14 +187,11 @@ cextern pd_ffff %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_FILTER 0 cglobal hpel_filter_v, 5,6,11 - FIX_STRIDES r3d, r4d -%if WIN64 - movsxd r4, r4d -%endif + FIX_STRIDES r3, r4 lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -172,7 +208,7 @@ cglobal hpel_filter_v, 5,6,11 %define s30 [pad30] %endif add r0, r4 - lea r2, [r2+r4] + add r2, r4 neg r4 mova m7, [pw_pixel_max] pxor m0, m0 @@ -199,22 +235,22 @@ cglobal hpel_filter_v, 5,6,11 mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 - FILT_PACK m1, m4, 5, m6, w, s10 + FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 mova [r0+r4+mmsize], m4 add r4, 2*mmsize jl .loop - REP_RET + RET ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,10 add r2, r2 add r0, r2 - lea r1, [r1+r2] + add r1, r2 neg r2 mova m0, [tap1] mova m7, [tap3] @@ -255,10 +291,10 @@ cglobal hpel_filter_c, 3,3,10 mova [r0+r2], m1 add r2, mmsize jl .loop - REP_RET + RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width ); +; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h, 3,4,8 %define src r1+r2 @@ -291,14 +327,14 @@ cglobal hpel_filter_h, 3,4,8 FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 - FILT_PACK m1, m4, 1, m7, w + FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 mova [r0+r2+mmsize], m4 add r2, mmsize*2 jl .loop - REP_RET + RET %endmacro ; HPEL_FILTER INIT_MMX mmx2 @@ -310,12 +346,9 @@ HPEL_FILTER %if HIGH_BIT_DEPTH == 0 %macro HPEL_V 1 ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_v, 5,6,%1 -%if WIN64 - movsxd r4, r4d -%endif lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -348,30 +381,38 @@ cglobal hpel_filter_v, 5,6,%1 paddw m4, m5 paddw m1, m3 paddw m4, m6 + mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 + mova m7, [pw_16] %endif - mova m7, [pw_16] +%if mmsize==32 + mova [r2+r4*2], xm1 + mova [r2+r4*2+mmsize/2], xm4 + vextracti128 [r2+r4*2+mmsize], m1, 1 + vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 +%else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 - FILT_PACK m1, m4, 5, m7 +%endif + FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize jl .loop - REP_RET + RET %endmacro 
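For reference, the vertical half-pel filter produced by hpel_filter_v / HPEL_V above is the usual 6-tap {1,-5,20,20,-5,1} kernel with rounded 5-bit normalization. A scalar C sketch of the 8-bit path follows; it is illustrative only (the function and helper names are invented here), and like the asm it keeps the unrounded 16-bit intermediate in buf for reuse by hpel_filter_c:

#include <stdint.h>

static inline int clip_pixel( int x )                 /* what packuswb effectively does */
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

static void hpel_filter_v_ref( uint8_t *dst, const uint8_t *src, int16_t *buf,
                               intptr_t stride, intptr_t width )
{
    for( intptr_t x = 0; x < width; x++ )
    {
        int a = src[x-2*stride] + src[x+3*stride];    /* outer taps:   1,  1 */
        int b = src[x-1*stride] + src[x+2*stride];    /* middle taps: -5, -5 */
        int c = src[x+0*stride] + src[x+1*stride];    /* inner taps:  20, 20 */
        int v = a - 5*b + 20*c;
        buf[x] = (int16_t)v;                          /* kept for the centre filter pass */
        dst[x] = (uint8_t)clip_pixel( (v + 16) >> 5 );
    }
}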
;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal hpel_filter_c_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -391,16 +432,17 @@ cglobal hpel_filter_c_mmx2, 3,3 paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 6, m7 + FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop - REP_RET + RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_mmx2, 3,3 +INIT_MMX mmx2 +cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 @@ -435,17 +477,15 @@ cglobal hpel_filter_h_mmx2, 3,3 paddw m6, m7 ; a1 movq m7, [pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop - REP_RET - -INIT_XMM + RET %macro HPEL_C 0 ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,9 add r0, r2 @@ -453,29 +493,33 @@ cglobal hpel_filter_c, 3,3,9 neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 +%if cpuflag(ssse3) + mova m7, [pw_512] +%else mova m7, [pw_32] - %define tpw_32 m7 +%endif + %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] - %define tpw_32 m8 + %define pw_rnd m8 %else - %define tpw_32 [pw_32] + %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer... 
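; (The mmsize==32 path below, used by the AVX2 version added further down, keeps the
; unaligned-load approach of the removed misalign variant instead of palignr, since
; vpalignr on ymm registers only shifts within each 128-bit lane.)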
-%if cpuflag(misalign) +%if mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] - mova m6, [src] - movu m3, [src+12] - movu m2, [src+14] - mova m1, [src+16] + mova m6, [src+0] + movu m3, [src-4+mmsize] + movu m2, [src-2+mmsize] + mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4] paddw m6, [src+2] - paddw m3, [src+22] - paddw m2, [src+20] - paddw m1, [src+18] + paddw m3, [src+6+mmsize] + paddw m2, [src+4+mmsize] + paddw m1, [src+2+mmsize] FILT_H2 m4, m5, m6, m3, m2, m1 %else mova m0, [src-16] @@ -505,17 +549,21 @@ cglobal hpel_filter_c, 3,3,9 paddw m6, m0 FILT_H m3, m5, m6 %endif - FILT_PACK m4, m3, 6, tpw_32 - movntps [r0+r2], m4 - add r2, 16 + FILT_PACK m4, m3, pw_rnd, 6 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + movnta [r0+r2], m4 + add r2, mmsize jl .loop - REP_RET + RET %endmacro ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- -cglobal hpel_filter_h_sse2, 3,3,8 +INIT_XMM sse2 +cglobal hpel_filter_h, 3,3,8 add r0, r2 add r1, r2 neg r2 @@ -554,14 +602,14 @@ cglobal hpel_filter_h_sse2, 3,3,8 paddw m6, m7 ; c1 mova m7, [pw_1] ; FIXME xmm8 FILT_H2 m1, m2, m3, m4, m5, m6 - FILT_PACK m1, m4, 1, m7 + FILT_PACK m1, m4, m7, 1 movntps [r0+r2], m1 add r2, 16 jl .loop - REP_RET + RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_H 0 cglobal hpel_filter_h, 3,3 @@ -571,7 +619,7 @@ cglobal hpel_filter_h, 3,3 %define src r1+r2 mova m0, [src-16] mova m1, [src] - mova m7, [pw_16] + mova m7, [pw_1024] .loop: mova m2, [src+16] ; Using unaligned loads instead of palignr is marginally slower on SB and significantly @@ -593,21 +641,19 @@ cglobal hpel_filter_h, 3,3 paddw m3, m1 paddw m4, m5 paddw m4, m6 - FILT_PACK m3, m4, 5, m7 + FILT_PACK m3, m4, m7, 5 pshufb m3, [hpel_shuf] mova m1, m2 movntps [r0+r2], m3 add r2, 16 jl .loop - REP_RET + RET %endmacro INIT_MMX mmx2 HPEL_V 0 INIT_XMM sse2 HPEL_V 8 -INIT_XMM sse2, misalign -HPEL_C %if ARCH_X86_64 == 0 INIT_XMM sse2 HPEL_C @@ -619,6 +665,45 @@ INIT_XMM avx HPEL_C HPEL_V 0 HPEL_H +INIT_YMM avx2 +HPEL_V 8 +HPEL_C + +INIT_YMM avx2 +cglobal hpel_filter_h, 3,3,8 + add r0, r2 + add r1, r2 + neg r2 + %define src r1+r2 + mova m5, [filt_mul15] + mova m6, [filt_mul20] + mova m7, [filt_mul51] +.loop: + movu m0, [src-2] + movu m1, [src-1] + movu m2, [src+2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m7 + paddw m0, m2 + + mova m2, [src+0] + movu m3, [src+1] + movu m4, [src+3] + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m7 + paddw m0, m2 + paddw m1, m3 + paddw m1, m4 + + mova m2, [pw_1024] + FILT_PACK m0, m1, m2, 5 + pshufb m0, [hpel_shuf] + movnta [r0+r2], m0 + add r2, mmsize + jl .loop + RET %endif %if ARCH_X86_64 @@ -626,9 +711,9 @@ HPEL_H ;The optimum prefetch distance is difficult to determine in checkasm: ;any prefetch seems slower than not prefetching. ;In real use, the prefetch seems to be a slight win. - ;+16 is picked somewhat arbitrarily here based on the fact that even one + ;+mmsize is picked somewhat arbitrarily here based on the fact that even one ;loop iteration is going to take longer than the prefetch. 
- prefetcht0 [r1+r2*2+16] + prefetcht0 [r1+r2*2+mmsize] %if cpuflag(ssse3) mova m1, [r3] mova m2, [r3+r2] @@ -661,31 +746,48 @@ HPEL_H packuswb %3, %4 FILT_V2 m1, m2, m3, m4, m5, m6 %endif - add r3, 16 - add r1, 16 + add r3, mmsize + add r1, mmsize +%if mmsize==32 + vinserti128 %1, m1, xm4, 1 + vperm2i128 %2, m1, m4, q0301 +%else mova %1, m1 mova %2, m4 - FILT_PACK m1, m4, 5, m15 +%endif + FILT_PACK m1, m4, m15, 5 movntps [r8+r4+%5], m1 %endmacro -%macro FILT_C 4 - PALIGNR m1, %2, %1, 12, m2 - PALIGNR m2, %2, %1, 14, %1 +%macro FILT_C 3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-4), m3 + PALIGNR m2, %2, %1, (mmsize-2), m3 +%if mmsize==32 + vperm2i128 %1, %3, %2, q0003 +%endif PALIGNR m3, %3, %2, 4, %1 PALIGNR m4, %3, %2, 2, %1 paddw m3, m2 +%if mmsize==32 + mova m2, %1 +%endif mova %1, %3 - PALIGNR %3, %2, 6, m2 + PALIGNR %3, %3, %2, 6, m2 paddw m4, %2 paddw %3, m1 FILT_H %3, m3, m4 %endmacro %macro DO_FILT_C 4 - FILT_C %1, %2, %3, 6 - FILT_C %2, %1, %4, 6 - FILT_PACK %3, %4, 6, m15 + FILT_C %1, %2, %3 + FILT_C %2, %1, %4 + FILT_PACK %3, %4, m15, 6 +%if mmsize==32 + vpermq %3, %3, q3120 +%endif movntps [r5+r4], %3 %endmacro @@ -699,8 +801,14 @@ HPEL_H %endmacro %macro DO_FILT_H 3 - PALIGNR m1, %2, %1, 14, m3 - PALIGNR m2, %2, %1, 15, m3 +%if mmsize==32 + vperm2i128 m3, %2, %1, q0003 +%endif + PALIGNR m1, %2, %1, (mmsize-2), m3 + PALIGNR m2, %2, %1, (mmsize-1), m3 +%if mmsize==32 + vperm2i128 m3, %3, %2, q0003 +%endif PALIGNR m4, %3, %2, 1 , m3 PALIGNR m5, %3, %2, 2 , m3 PALIGNR m6, %3, %2, 3 , m3 @@ -716,14 +824,14 @@ HPEL_H paddw m2, m4 paddw m1, m5 paddw m2, m6 - FILT_PACK m1, m2, 5, m15 + FILT_PACK m1, m2, m15, 5 pshufb m1, [hpel_shuf] %else ; ssse3, avx ADD8TO16 m1, m6, m12, m3, m0 ; a ADD8TO16 m2, m5, m12, m3, m0 ; b ADD8TO16 %2, m4, m12, m3, m0 ; c FILT_V2 m1, m2, %2, m6, m5, m4 - FILT_PACK m1, m6, 5, m15 + FILT_PACK m1, m6, m15, 5 %endif movntps [r0+r4], m1 mova %2, %3 @@ -732,17 +840,13 @@ HPEL_H %macro HPEL 0 ;----------------------------------------------------------------------------- ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; uint8_t *src, int stride, int width, int height) +; uint8_t *src, intptr_t stride, int width, int height ) ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 -%if WIN64 - movsxd r4, r4d - movsxd r5, r5d -%endif mov r7, r3 - sub r5, 16 + sub r5d, mmsize mov r8, r1 - and r7, 15 + and r7, mmsize-1 sub r3, r7 add r0, r5 add r8, r5 @@ -754,13 +858,14 @@ cglobal hpel_filter, 7,9,16 sub r3, r2 sub r3, r2 mov r4, r7 - mova m15, [pw_16] %if cpuflag(ssse3) mova m0, [filt_mul51] mova m12, [filt_mul15] mova m14, [filt_mul20] + mova m15, [pw_1024] %else pxor m0, m0 + mova m15, [pw_16] %endif ;ALIGN 16 .loopy: @@ -768,16 +873,24 @@ cglobal hpel_filter, 7,9,16 DO_FILT_V m8, m7, m13, m12, 0 ;ALIGN 16 .loopx: - DO_FILT_V m6, m5, m11, m12, 16 + DO_FILT_V m6, m5, m11, m12, mmsize .lastx: +%if cpuflag(ssse3) + psrlw m15, 1 ; pw_512 +%else paddw m15, m15 ; pw_32 +%endif DO_FILT_C m9, m8, m7, m6 - psrlw m15, 1 ; pw_16 - movdqa m7, m5 +%if cpuflag(ssse3) + paddw m15, m15 ; pw_1024 +%else + psrlw m15, 1 ; pw_16 +%endif + mova m7, m5 DO_FILT_H m10, m13, m11 - add r4, 16 + add r4, mmsize jl .loopx - cmp r4, 16 + cmp r4, mmsize jl .lastx ; setup regs for next y sub r4, r7 @@ -800,6 +913,8 @@ INIT_XMM ssse3 HPEL INIT_XMM avx HPEL +INIT_YMM avx2 +HPEL %endif ; ARCH_X86_64 %undef movntq @@ -807,65 +922,90 @@ HPEL %undef sfence %endif ; !HIGH_BIT_DEPTH +%macro 
PREFETCHNT_ITER 2 ; src, bytes/iteration + %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? + %rep (%2+63) / 64 ; assume 64 byte cache lines + prefetchnta [%1+%%i] + %assign %%i %%i + 64 + %endrep +%endmacro + ;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, int i_dst, -; pixel *src, int i_src, int w, int h) +; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- -; assumes i_dst and w are multiples of 16, and i_dst>w -INIT_MMX -cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1d, r3d, r4d - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d +; assumes i_dst and w are multiples of mmsize, and i_dst>w +%macro PLANE_COPY_CORE 1 ; swap +%if %1 +cglobal plane_copy_swap_core, 6,7 + mova m4, [copy_swap_shuf] +%else +cglobal plane_copy_core, 6,7 +%endif + FIX_STRIDES r1, r3 +%if %1 && HIGH_BIT_DEPTH + shl r4d, 2 +%elif %1 || HIGH_BIT_DEPTH + add r4d, r4d +%else movsxdifnidn r4, r4d - sub r1, r4 - sub r3, r4 +%endif + add r0, r4 + add r2, r4 + neg r4 .loopy: - mov r6d, r4d - sub r6d, 63 + lea r6, [r4+4*mmsize] +%if %1 + test r6d, r6d + jg .skip +%endif .loopx: - prefetchnta [r2+256] - movq m0, [r2 ] - movq m1, [r2+ 8] - movntq [r0 ], m0 - movntq [r0+ 8], m1 - movq m2, [r2+16] - movq m3, [r2+24] - movntq [r0+16], m2 - movntq [r0+24], m3 - movq m4, [r2+32] - movq m5, [r2+40] - movntq [r0+32], m4 - movntq [r0+40], m5 - movq m6, [r2+48] - movq m7, [r2+56] - movntq [r0+48], m6 - movntq [r0+56], m7 - add r2, 64 - add r0, 64 - sub r6d, 64 - jg .loopx - prefetchnta [r2+256] - add r6d, 63 - jle .end16 -.loop16: - movq m0, [r2 ] - movq m1, [r2+8] - movntq [r0 ], m0 - movntq [r0+8], m1 - add r2, 16 - add r0, 16 - sub r6d, 16 - jg .loop16 -.end16: + PREFETCHNT_ITER r2+r6, 4*mmsize + movu m0, [r2+r6-4*mmsize] + movu m1, [r2+r6-3*mmsize] + movu m2, [r2+r6-2*mmsize] + movu m3, [r2+r6-1*mmsize] +%if %1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 +%endif + movnta [r0+r6-4*mmsize], m0 + movnta [r0+r6-3*mmsize], m1 + movnta [r0+r6-2*mmsize], m2 + movnta [r0+r6-1*mmsize], m3 + add r6, 4*mmsize + jle .loopx +.skip: + PREFETCHNT_ITER r2+r6, 4*mmsize + sub r6, 4*mmsize + jz .end +.loop_end: + movu m0, [r2+r6] +%if %1 + pshufb m0, m4 +%endif + movnta [r0+r6], m0 + add r6, mmsize + jl .loop_end +.end: add r0, r1 add r2, r3 - dec r5d + dec r5d jg .loopy sfence - emms RET +%endmacro +INIT_XMM sse +PLANE_COPY_CORE 0 +INIT_XMM ssse3 +PLANE_COPY_CORE 1 +INIT_YMM avx +PLANE_COPY_CORE 0 +INIT_YMM avx2 +PLANE_COPY_CORE 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH @@ -951,22 +1091,19 @@ cglobal plane_copy_core_mmx2, 6,7 %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_interleave_core( uint8_t *dst, int i_dst, -; uint8_t *srcu, int i_srcu, -; uint8_t *srcv, int i_srcv, int w, int h ) +; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, +; uint8_t *srcu, intptr_t i_srcu, +; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w -cglobal plane_copy_interleave_core, 7,9 - FIX_STRIDES r1d, r3d, r5d, r6d +cglobal plane_copy_interleave_core, 6,9 + mov r6d, r6m %if HIGH_BIT_DEPTH - mov r1m, r1d - mov r3m, r3d - mov r6m, r6d -%endif - movsxdifnidn r1, r1d - 
movsxdifnidn r3, r3d - movsxdifnidn r5, r5d - movsxdifnidn r6, r6d + FIX_STRIDES r1, r3, r5, r6d + movifnidn r1mp, r1 + movifnidn r3mp, r3 + mov r6m, r6d +%endif lea r0, [r0+r6*2] add r2, r6 add r4, r6 @@ -1021,10 +1158,10 @@ cglobal plane_copy_interleave_core, 7,9 RET ;----------------------------------------------------------------------------- -; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height ) +; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 - FIX_STRIDES r1d + FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a @@ -1033,7 +1170,7 @@ cglobal store_interleave_chroma, 5,5 lea r0, [r0+r1*2] sub r4d, 2 jg .loop - REP_RET + RET %endmacro ; PLANE_INTERLEAVE %macro DEINTERLEAVE_START 0 @@ -1048,20 +1185,17 @@ cglobal store_interleave_chroma, 5,5 %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_deinterleave( pixel *dstu, int i_dstu, -; pixel *dstv, int i_dstv, -; pixel *src, int i_src, int w, int h ) +; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, +; pixel *dstv, intptr_t i_dstv, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- cglobal plane_copy_deinterleave, 6,7 DEINTERLEAVE_START mov r6d, r6m - FIX_STRIDES r1d, r3d, r5d, r6d + FIX_STRIDES r1, r3, r5, r6d %if HIGH_BIT_DEPTH mov r6m, r6d %endif - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d - movsxdifnidn r5, r5d add r0, r6 add r2, r6 lea r4, [r4+r6*2] @@ -1078,14 +1212,14 @@ cglobal plane_copy_deinterleave, 6,7 add r4, r5 dec dword r7m jg .loopy - REP_RET + RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a @@ -1093,14 +1227,14 @@ cglobal load_deinterleave_chroma_fenc, 4,4 lea r1, [r1+r2*2] sub r3d, 2 jg .loop - REP_RET + RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a @@ -1108,9 +1242,166 @@ cglobal load_deinterleave_chroma_fdec, 4,4 lea r1, [r1+r2*2] sub r3d, 2 jg .loop - REP_RET + RET %endmacro ; PLANE_DEINTERLEAVE +%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2 +%if cpuflag(ssse3) + mova m3, [deinterleave_rgb_shuf+(%1-3)*16] +%endif +%%loopy: + mov %8, r6 + mov %9, %6 +%%loopx: + movu m0, [%8] + movu m1, [%8+%1*mmsize/4] +%if 
cpuflag(ssse3) + pshufb m0, m3 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + pshufb m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%elif %1 == 3 + psrldq m2, m0, 6 + punpcklqdq m0, m1 ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5 + psrldq m1, 6 + punpcklqdq m2, m1 ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7 + psrlq m3, m0, 24 + psrlq m4, m2, 24 + punpckhbw m1, m0, m3 ; b4 b5 g4 g5 r4 r5 + punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1 + punpckhbw m3, m2, m4 ; b6 b7 g6 g7 r6 r7 + punpcklbw m2, m4 ; b2 b3 g2 g3 r2 r3 + punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%else + pshufd m3, m0, q2301 + pshufd m4, m1, q2301 + punpckhbw m2, m0, m3 ; b2 b3 g2 g3 r2 r3 + punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1 + punpckhbw m3, m1, m4 ; b6 b7 g6 g7 r6 r7 + punpcklbw m1, m4 ; b4 b5 g4 g5 r4 r5 + punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%endif + punpckldq m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7 + punpckhdq m0, m1 ; r0 r1 r2 r3 r4 r5 r6 r7 + movh [r0+%9], m2 + movhps [r2+%9], m2 + movh [r4+%9], m0 + add %8, %1*mmsize/2 + add %9, mmsize/2 + jl %%loopx + add r0, %2 + add r2, %3 + add r4, %4 + add r6, %5 + dec %7d + jg %%loopy +%endmacro + +%macro PLANE_DEINTERLEAVE_RGB 0 +;----------------------------------------------------------------------------- +; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta, +; pixel *dstb, intptr_t i_dstb, +; pixel *dstc, intptr_t i_dstc, +; pixel *src, intptr_t i_src, int pw, int w, int h ) +;----------------------------------------------------------------------------- +%if ARCH_X86_64 +cglobal plane_copy_deinterleave_rgb, 8,12 + %define %%args r1, r3, r5, r7, r8, r9, r10, r11 + mov r8d, r9m + mov r9d, r10m + add r0, r8 + add r2, r8 + add r4, r8 + neg r8 +%else +cglobal plane_copy_deinterleave_rgb, 1,7 + %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5 + mov r1, r9m + mov r2, r2m + mov r4, r4m + mov r6, r6m + add r0, r1 + add r2, r1 + add r4, r1 + neg r1 + mov r9m, r1 + mov r1, r10m +%endif + cmp dword r8m, 4 + je .pw4 + PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR + jmp .ret +.pw4: + PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA +.ret: + REP_RET +%endmacro + +%if HIGH_BIT_DEPTH == 0 +INIT_XMM sse2 +PLANE_DEINTERLEAVE_RGB +INIT_XMM ssse3 +PLANE_DEINTERLEAVE_RGB +%endif ; !HIGH_BIT_DEPTH + +%macro PLANE_DEINTERLEAVE_V210 0 +;----------------------------------------------------------------------------- +; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty, +; uint16_t *dstc, intptr_t i_dstc, +; uint32_t *src, intptr_t i_src, int w, int h ) +;----------------------------------------------------------------------------- +%if ARCH_X86_64 +cglobal plane_copy_deinterleave_v210, 8,10,7 +%define src r8 +%define org_w r9 +%define h r7d +%else +cglobal plane_copy_deinterleave_v210, 7,7,7 +%define src r4m +%define org_w r6m +%define h dword r7m +%endif + FIX_STRIDES r1, r3, r6d + shl r5, 2 + add r0, r6 + add r2, r6 + neg r6 + mov src, r4 + mov org_w, r6 + mova m2, [v210_mask] + mova m3, [v210_luma_shuf] + mova m4, [v210_chroma_shuf] + mova m5, [v210_mult] ; also functions as vpermd index for avx2 + pshufd m6, m5, q1102 + +ALIGN 16 +.loop: + movu m1, [r4] + pandn m0, m2, m1 + pand m1, m2 + pshufb m0, m3 + pshufb m1, m4 + pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ + pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ +%if mmsize == 32 + vpermd m0, m5, m0 + vpermd m1, m5, m1 +%endif + movu [r0+r6], m0 + movu [r2+r6], m1 + add r4, 
mmsize + add r6, 3*mmsize/4 + jl .loop + add r0, r1 + add r2, r3 + add src, r5 + mov r4, src + mov r6, org_w + dec h + jg .loop + RET +%endmacro ; PLANE_DEINTERLEAVE_V210 + %if HIGH_BIT_DEPTH INIT_MMX mmx2 PLANE_INTERLEAVE @@ -1119,9 +1410,14 @@ PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE +INIT_XMM ssse3 +PLANE_DEINTERLEAVE_V210 INIT_XMM avx PLANE_INTERLEAVE PLANE_DEINTERLEAVE +PLANE_DEINTERLEAVE_V210 +INIT_YMM avx2 +PLANE_DEINTERLEAVE_V210 %else INIT_MMX mmx2 PLANE_INTERLEAVE @@ -1141,115 +1437,110 @@ PLANE_DEINTERLEAVE ;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal memcpy_aligned_mmx, 3,3 - test r2d, 16 - jz .copy32start - movq mm0, [r1 + r2 - 16] - movq mm1, [r1 + r2 - 8] - movq [r0 + r2 - 16], mm0 - movq [r0 + r2 - 8], mm1 - sub r2d, 16 -.copy32start - test r2d, r2d - jz .ret -.copy32: - movq mm0, [r1 + r2 - 32] - movq mm1, [r1 + r2 - 24] - movq mm2, [r1 + r2 - 16] - movq mm3, [r1 + r2 - 8] - movq [r0 + r2 - 32], mm0 - movq [r0 + r2 - 24], mm1 - movq [r0 + r2 - 16], mm2 - movq [r0 + r2 - 8], mm3 - sub r2d, 32 - jg .copy32 -.ret - REP_RET - -;----------------------------------------------------------------------------- -; void *memcpy_aligned( void *dst, const void *src, size_t n ); -;----------------------------------------------------------------------------- -cglobal memcpy_aligned_sse2, 3,3 +%macro MEMCPY 0 +cglobal memcpy_aligned, 3,3 +%if mmsize == 16 test r2d, 16 - jz .copy32 - movdqa xmm0, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm0 + jz .copy2 + mova m0, [r1+r2-16] + mova [r0+r2-16], m0 sub r2d, 16 -.copy32: - test r2d, 32 - jz .copy64start - movdqa xmm0, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm0 - movdqa xmm1, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm1 - sub r2d, 32 -.copy64start +.copy2: +%endif + test r2d, 2*mmsize + jz .copy4start + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + sub r2d, 2*mmsize +.copy4start: test r2d, r2d jz .ret -.copy64: - movdqa xmm0, [r1 + r2 - 64] - movdqa [r0 + r2 - 64], xmm0 - movdqa xmm1, [r1 + r2 - 48] - movdqa [r0 + r2 - 48], xmm1 - movdqa xmm2, [r1 + r2 - 32] - movdqa [r0 + r2 - 32], xmm2 - movdqa xmm3, [r1 + r2 - 16] - movdqa [r0 + r2 - 16], xmm3 - sub r2d, 64 - jg .copy64 +.copy4: + mova m0, [r1+r2-1*mmsize] + mova m1, [r1+r2-2*mmsize] + mova m2, [r1+r2-3*mmsize] + mova m3, [r1+r2-4*mmsize] + mova [r0+r2-1*mmsize], m0 + mova [r0+r2-2*mmsize], m1 + mova [r0+r2-3*mmsize], m2 + mova [r0+r2-4*mmsize], m3 + sub r2d, 4*mmsize + jg .copy4 .ret: REP_RET +%endmacro + +INIT_MMX mmx +MEMCPY +INIT_XMM sse +MEMCPY ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- -%macro MEMZERO 0 +%macro MEMZERO 1 cglobal memzero_aligned, 2,2 add r0, r1 neg r1 +%if mmsize == 8 pxor m0, m0 +%else + xorps m0, m0 +%endif .loop: %assign i 0 -%rep 8 +%rep %1 mova [r0 + r1 + i], m0 %assign i i+mmsize %endrep - add r1, mmsize*8 + add r1, mmsize*%1 jl .loop - REP_RET + RET %endmacro INIT_MMX mmx -MEMZERO -INIT_XMM sse2 -MEMZERO - - +MEMZERO 8 +INIT_XMM sse +MEMZERO 8 +INIT_YMM avx +MEMZERO 4 %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void integral_init4h( uint16_t *sum, uint8_t 
*pix, int stride ) +; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal integral_init4h_sse4, 3,4 +%macro INTEGRAL_INIT4H 0 +cglobal integral_init4h, 3,4 lea r3, [r0+r2*2] add r1, r2 neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova xm0, [r1+r2] + mova xm1, [r1+r2+16] +%if mmsize==32 + vinserti128 m0, m0, [r1+r2+ 8], 1 + vinserti128 m1, m1, [r1+r2+24], 1 +%else palignr m1, m0, 8 +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + paddw m1, [r0+r2*2+mmsize] + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop - REP_RET + RET +%endmacro + +INIT_XMM sse4 +INTEGRAL_INIT4H +INIT_YMM avx2 +INTEGRAL_INIT4H %macro INTEGRAL_INIT8H 0 cglobal integral_init8h, 3,4 @@ -1258,36 +1549,45 @@ cglobal integral_init8h, 3,4 neg r2 pxor m4, m4 .loop: - movdqa m0, [r1+r2] - movdqa m1, [r1+r2+16] + mova xm0, [r1+r2] + mova xm1, [r1+r2+16] +%if mmsize==32 + vinserti128 m0, m0, [r1+r2+ 8], 1 + vinserti128 m1, m1, [r1+r2+24], 1 + mpsadbw m2, m0, m4, 100100b + mpsadbw m3, m1, m4, 100100b +%else palignr m1, m0, 8 - mpsadbw m2, m0, m4, 4 - mpsadbw m3, m1, m4, 4 + mpsadbw m2, m0, m4, 100b + mpsadbw m3, m1, m4, 100b +%endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] - paddw m1, [r0+r2*2+16] + paddw m1, [r0+r2*2+mmsize] paddw m0, m2 paddw m1, m3 - movdqa [r3+r2*2 ], m0 - movdqa [r3+r2*2+16], m1 - add r2, 16 + mova [r3+r2*2 ], m0 + mova [r3+r2*2+mmsize], m1 + add r2, mmsize jl .loop - REP_RET + RET %endmacro INIT_XMM sse4 INTEGRAL_INIT8H INIT_XMM avx INTEGRAL_INIT8H +INIT_YMM avx2 +INTEGRAL_INIT8H %endif ; !HIGH_BIT_DEPTH %macro INTEGRAL_INIT_8V 0 ;----------------------------------------------------------------------------- -; void integral_init8v( uint16_t *sum8, int stride ) +; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 - shl r1, 1 + add r1, r1 add r0, r1 lea r2, [r0+r1*8] neg r1 @@ -1300,19 +1600,21 @@ cglobal integral_init8v, 3,3 mova [r0+r1+mmsize], m1 add r1, 2*mmsize jl .loop - REP_RET + RET %endmacro INIT_MMX mmx INTEGRAL_INIT_8V INIT_XMM sse2 INTEGRAL_INIT_8V +INIT_YMM avx2 +INTEGRAL_INIT_8V ;----------------------------------------------------------------------------- -; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride ) +; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal integral_init4v_mmx, 3,5 +INIT_MMX mmx +cglobal integral_init4v, 3,5 shl r2, 1 lea r3, [r0+r2*4] lea r4, [r0+r2*8] @@ -1331,10 +1633,10 @@ cglobal integral_init4v_mmx, 3,5 mova [r1+r2-8], m3 sub r2, 8 jge .loop - REP_RET + RET -INIT_XMM -cglobal integral_init4v_sse2, 3,5 +INIT_XMM sse2 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1357,9 +1659,10 @@ cglobal integral_init4v_sse2, 3,5 mova [r1+r2], m3 add r2, 16 jl .loop - REP_RET + RET -cglobal integral_init4v_ssse3, 3,5 +INIT_XMM ssse3 +cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 @@ -1382,7 +1685,29 @@ cglobal integral_init4v_ssse3, 3,5 mova [r1+r2], m3 add r2, 16 jl .loop - REP_RET + RET + +INIT_YMM avx2 +cglobal integral_init4v, 3,5 + add r2, r2 + add r0, r2 + add r1, r2 + lea r3, [r0+r2*4] + lea r4, [r0+r2*8] + neg r2 
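; Per element x (offsets in uint16_t units; the +8 byte loads below are +4 elements):
;   sum4[x] = sum8[x + 4*stride] - sum8[x]
;   sum8[x] = sum8[x + 8*stride] + sum8[x + 8*stride + 4] - sum8[x] - sum8[x + 4]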
+.loop: + mova m2, [r0+r2] + movu m1, [r4+r2+8] + paddw m0, m2, [r0+r2+8] + paddw m1, [r4+r2] + mova m3, [r3+r2] + psubw m1, m0 + psubw m3, m2 + mova [r0+r2], m1 + mova [r1+r2], m3 + add r2, 32 + jl .loop + RET %macro FILT8x4 7 mova %3, [r0+%7] @@ -1391,12 +1716,54 @@ cglobal integral_init4v_ssse3, 3,5 pavgb %4, [r0+r5*2+%7] PALIGNR %1, %3, 1, m6 PALIGNR %2, %4, 1, m6 +%if cpuflag(xop) + pavgb %1, %3 + pavgb %2, %4 +%else pavgb %1, %3 pavgb %2, %4 psrlw %5, %1, 8 psrlw %6, %2, 8 pand %1, m7 pand %2, m7 +%endif +%endmacro + +%macro FILT32x4U 4 + mova m1, [r0+r5] + pavgb m0, m1, [r0] + movu m3, [r0+r5+1] + pavgb m2, m3, [r0+1] + pavgb m1, [r0+r5*2] + pavgb m3, [r0+r5*2+1] + pavgb m0, m2 + pavgb m1, m3 + + mova m3, [r0+r5+mmsize] + pavgb m2, m3, [r0+mmsize] + movu m5, [r0+r5+1+mmsize] + pavgb m4, m5, [r0+1+mmsize] + pavgb m3, [r0+r5*2+mmsize] + pavgb m5, [r0+r5*2+1+mmsize] + pavgb m2, m4 + pavgb m3, m5 + + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m2, m1, m3 + vpermq m0, m0, q3120 + vpermq m1, m4, q3120 + vpermq m2, m2, q3120 + vpermq m3, m5, q3120 + mova [%1], m0 + mova [%2], m1 + mova [%3], m2 + mova [%4], m3 %endmacro %macro FILT16x2 4 @@ -1408,12 +1775,17 @@ cglobal integral_init4v_ssse3, 3,5 pavgb %1, m3 PALIGNR m3, m2, 1, m6 pavgb m3, m2 +%if cpuflag(xop) + vpperm m5, m3, %1, m7 + vpperm m3, m3, %1, m6 +%else psrlw m5, m3, 8 psrlw m4, %1, 8 pand m3, m7 pand %1, m7 packuswb m3, %1 packuswb m5, m4 +%endif mova [%2], m3 mova [%3], m5 mova %1, m2 @@ -1470,12 +1842,17 @@ cglobal integral_init4v_ssse3, 3,5 pavgw %1, m3 PALIGNR m3, m2, 2, m6 pavgw m3, m2 +%if cpuflag(xop) + vpperm m5, m3, %1, m7 + vpperm m3, m3, %1, m6 +%else psrld m5, m3, 16 psrld m4, %1, 16 pand m3, m7 pand %1, m7 packssdw m3, %1 packssdw m5, m4 +%endif mova [%2], m3 mova [%3], m5 mova %1, m2 @@ -1483,17 +1860,18 @@ cglobal integral_init4v_ssse3, 3,5 ;----------------------------------------------------------------------------- ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; int src_stride, int dst_stride, int width, int height ) +; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ;----------------------------------------------------------------------------- %macro FRAME_INIT_LOWRES 0 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise %if HIGH_BIT_DEPTH shl dword r6m, 1 - FIX_STRIDES r5d + FIX_STRIDES r5 shl dword r7m, 1 %endif -%if WIN64 - movsxd r5, r5d +%if mmsize >= 16 + add dword r7m, mmsize-1 + and dword r7m, ~(mmsize-1) %endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m @@ -1521,8 +1899,13 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, PUSH r6 %define src_gap [rsp] %if HIGH_BIT_DEPTH +%if cpuflag(xop) + mova m6, [deinterleave_shuf32a] + mova m7, [deinterleave_shuf32b] +%else pcmpeqw m7, m7 psrld m7, 16 +%endif .vloop: mov r6d, r7m %ifnidn cpuname, mmx2 @@ -1547,57 +1930,49 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, sub r6d, mmsize jg .hloop %else ; !HIGH_BIT_DEPTH -%if mmsize == 16 - ; adjust for the odd end case - mov r6d, r7m - and r6d, 8 - sub r1, r6 - sub r2, r6 - sub r3, r6 - sub r4, r6 - add dst_gap, r6d -%endif ; mmsize +%if cpuflag(avx2) + mova m7, [deinterleave_shuf] +%elif cpuflag(xop) + mova m6, [deinterleave_shuf32a] + mova m7, [deinterleave_shuf32b] +%else pcmpeqb m7, m7 psrlw m7, 8 +%endif .vloop: 
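; Each pass through .vloop/.hloop emits one row of the four lowres planes: every output
; pixel is a 2x2 box average of the source (vertical pavgb, then horizontal pavgb, i.e.
; ((a+c+1)>>1 + (b+d+1)>>1 + 1)>>1), with dsth, dstv and dstc taking the same average
; shifted by one source pixel in x, in y, and in both, relative to dst0.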
mov r6d, r7m %ifnidn cpuname, mmx2 +%if mmsize <= 16 mova m0, [r0] mova m1, [r0+r5] pavgb m0, m1 pavgb m1, [r0+r5*2] %endif -%if mmsize == 16 - test r6d, 8 - jz .hloop - sub r0, 16 - FILT8x4 m0, m1, m2, m3, m4, m5, 0 - packuswb m0, m4 - packuswb m1, m5 - movq [r1], m0 - movhps [r2], m0 - movq [r3], m1 - movhps [r4], m1 - mova m0, m2 - mova m1, m3 - sub r6d, 8 - jz .skip -%endif ; mmsize +%endif .hloop: sub r0, mmsize*2 sub r1, mmsize sub r2, mmsize sub r3, mmsize sub r4, mmsize -%ifdef m8 +%if mmsize==32 + FILT32x4U r1, r2, r3, r4 +%elifdef m8 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize mova m8, m0 mova m9, m1 FILT8x4 m2, m3, m0, m1, m4, m5, 0 +%if cpuflag(xop) + vpperm m4, m2, m8, m7 + vpperm m2, m2, m8, m6 + vpperm m5, m3, m9, m7 + vpperm m3, m3, m9, m6 +%else packuswb m2, m8 packuswb m3, m9 packuswb m4, m10 packuswb m5, m11 +%endif mova [r1], m2 mova [r2], m4 mova [r3], m3 @@ -1636,70 +2011,80 @@ INIT_XMM sse2 FRAME_INIT_LOWRES INIT_XMM ssse3 FRAME_INIT_LOWRES +INIT_XMM avx +FRAME_INIT_LOWRES +INIT_XMM xop +FRAME_INIT_LOWRES +%if HIGH_BIT_DEPTH==0 +INIT_YMM avx2 +FRAME_INIT_LOWRES +%endif ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) ;----------------------------------------------------------------------------- %macro MBTREE 0 -cglobal mbtree_propagate_cost, 7,7,7 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - pxor xmm4, xmm4 - movss xmm6, [r5] - shufps xmm6, xmm6, 0 - mulps xmm6, [pf_inv256] - movdqa xmm5, [pw_3fff] +cglobal mbtree_propagate_cost, 6,6,7 + movss m6, [r5] + mov r5d, r6m + lea r0, [r0+r5*2] + add r5d, r5d + add r1, r5 + add r2, r5 + add r3, r5 + add r4, r5 + neg r5 + pxor m4, m4 + shufps m6, m6, 0 + mova m5, [pw_3fff] .loop: - movq xmm2, [r2+r6] ; intra - movq xmm0, [r4+r6] ; invq - movq xmm3, [r3+r6] ; inter - movq xmm1, [r1+r6] ; prop - punpcklwd xmm2, xmm4 - punpcklwd xmm0, xmm4 - pmaddwd xmm0, xmm2 - pand xmm3, xmm5 - punpcklwd xmm1, xmm4 - punpcklwd xmm3, xmm4 + movq m2, [r2+r5] ; intra + movq m0, [r4+r5] ; invq + movq m3, [r3+r5] ; inter + movq m1, [r1+r5] ; prop + pand m3, m5 + pminsw m3, m2 + punpcklwd m2, m4 + punpcklwd m0, m4 + pmaddwd m0, m2 + punpcklwd m1, m4 + punpcklwd m3, m4 %if cpuflag(fma4) - cvtdq2ps xmm0, xmm0 - cvtdq2ps xmm1, xmm1 - vfmaddps xmm0, xmm0, xmm6, xmm1 - cvtdq2ps xmm1, xmm2 - psubd xmm2, xmm3 - cvtdq2ps xmm2, xmm2 - rcpps xmm3, xmm1 - mulps xmm1, xmm3 - mulps xmm0, xmm2 - addps xmm2, xmm3, xmm3 - vfnmaddps xmm3, xmm1, xmm3, xmm2 - mulps xmm0, xmm3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + fmaddps m0, m0, m6, m1 + cvtdq2ps m1, m2 + psubd m2, m3 + cvtdq2ps m2, m2 + rcpps m3, m1 + mulps m1, m3 + mulps m0, m2 + addps m2, m3, m3 + fnmaddps m3, m1, m3, m2 + mulps m0, m3 %else - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm6 ; intra*invq*fps_factor>>8 - cvtdq2ps xmm1, xmm1 ; prop - addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8) - cvtdq2ps xmm1, xmm2 ; intra - psubd xmm2, xmm3 ; intra - inter - cvtdq2ps xmm2, xmm2 ; intra - inter - rcpps xmm3, xmm1 ; 1 / intra 1st approximation - mulps xmm1, xmm3 ; intra * (1/intra 1st approx) - mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2 - mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) - addps xmm3, xmm3 ; 2 * (1/intra 1st approx) - subps xmm3, xmm1 ; 2nd approximation for 1/intra - mulps xmm0, xmm3 ; / intra -%endif - cvtps2dq xmm0, xmm0 - movdqa 
[r0+r6*2], xmm0 - add r6, 8 + cvtdq2ps m0, m0 + mulps m0, m6 ; intra*invq*fps_factor>>8 + cvtdq2ps m1, m1 ; prop + addps m0, m1 ; prop + (intra*invq*fps_factor>>8) + cvtdq2ps m1, m2 ; intra + psubd m2, m3 ; intra - inter + cvtdq2ps m2, m2 ; intra - inter + rcpps m3, m1 ; 1 / intra 1st approximation + mulps m1, m3 ; intra * (1/intra 1st approx) + mulps m1, m3 ; intra * (1/intra 1st approx)^2 + mulps m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m1 ; 2nd approximation for 1/intra + mulps m0, m3 ; / intra +%endif + cvtps2dq m0, m0 + packssdw m0, m0 + movh [r0+r5], m0 + add r5, 8 jl .loop - REP_RET + RET %endmacro INIT_XMM sse2 @@ -1708,50 +2093,283 @@ MBTREE INIT_XMM fma4 MBTREE -%macro INT16_TO_FLOAT 1 - vpunpckhwd xmm4, xmm%1, xmm7 - vpunpcklwd xmm%1, xmm7 - vinsertf128 ymm%1, ymm%1, xmm4, 1 - vcvtdq2ps ymm%1, ymm%1 +%macro INT16_UNPACK 1 + punpckhwd xm4, xm%1, xm7 + punpcklwd xm%1, xm7 + vinsertf128 m%1, m%1, xm4, 1 +%endmacro + +; FIXME: align loads to 16 bytes +%macro MBTREE_AVX 0 +cglobal mbtree_propagate_cost, 6,6,8-cpuflag(avx2) + vbroadcastss m6, [r5] + mov r5d, r6m + lea r0, [r0+r5*2] + add r5d, r5d + add r1, r5 + add r2, r5 + add r3, r5 + add r4, r5 + neg r5 + mova xm5, [pw_3fff] +%if notcpuflag(avx2) + pxor xm7, xm7 +%endif +.loop: +%if cpuflag(avx2) + pmovzxwd m0, [r2+r5] ; intra + pmovzxwd m1, [r4+r5] ; invq + pmovzxwd m2, [r1+r5] ; prop + pand xm3, xm5, [r3+r5] ; inter + pmovzxwd m3, xm3 + pminsd m3, m0 + pmaddwd m1, m0 + psubd m4, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m4, m4 + fmaddps m1, m1, m6, m2 + rcpps m3, m0 + mulps m2, m0, m3 + mulps m1, m4 + addps m4, m3, m3 + fnmaddps m4, m2, m3, m4 + mulps m1, m4 +%else + movu xm0, [r2+r5] + movu xm1, [r4+r5] + movu xm2, [r1+r5] + pand xm3, xm5, [r3+r5] + pminsw xm3, xm0 + INT16_UNPACK 0 + INT16_UNPACK 1 + INT16_UNPACK 2 + INT16_UNPACK 3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m1, m0 + subps m4, m0, m3 + mulps m1, m6 ; intra*invq*fps_factor>>8 + addps m1, m2 ; prop + (intra*invq*fps_factor>>8) + rcpps m3, m0 ; 1 / intra 1st approximation + mulps m2, m0, m3 ; intra * (1/intra 1st approx) + mulps m2, m3 ; intra * (1/intra 1st approx)^2 + mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m2 ; 2nd approximation for 1/intra + mulps m1, m3 ; / intra +%endif + vcvtps2dq m1, m1 + vextractf128 xm2, m1, 1 + packssdw xm1, xm2 + mova [r0+r5], xm1 + add r5, 16 + jl .loop + RET %endmacro -; FIXME: align loads/stores to 16 bytes INIT_YMM avx -cglobal mbtree_propagate_cost, 7,7,8 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - vmovdqa xmm5, [pw_3fff] - vbroadcastss ymm6, [r5] - vmulps ymm6, ymm6, [pf_inv256] - vpxor xmm7, xmm7 +MBTREE_AVX +INIT_YMM avx2 +MBTREE_AVX + +%macro MBTREE_PROPAGATE_LIST 0 +;----------------------------------------------------------------------------- +; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs, +; int16_t *output, int bipred_weight, int mb_y, int len ) +;----------------------------------------------------------------------------- +cglobal mbtree_propagate_list_internal, 4,6,8 + movh m6, [pw_0to15] ; mb_x + movd m7, r5m + pshuflw m7, m7, 0 + punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y + movd m7, r4m + SPLATW m7, m7 ; bipred_weight + psllw m7, 9 ; bipred_weight << 9 + + mov r5d, r6m + xor r4d, r4d .loop: - 
vmovdqu xmm0, [r2+r6] ; intra - vmovdqu xmm1, [r4+r6] ; invq - vmovdqu xmm2, [r1+r6] ; prop - vpand xmm3, xmm5, [r3+r6] ; inter - INT16_TO_FLOAT 0 - INT16_TO_FLOAT 1 - INT16_TO_FLOAT 2 - INT16_TO_FLOAT 3 - vmulps ymm1, ymm1, ymm0 - vsubps ymm4, ymm0, ymm3 - vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8 - vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8) - vrcpps ymm3, ymm0 ; 1 / intra 1st approximation - vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx) - vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2 - vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) - vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx) - vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra - vmulps ymm1, ymm1, ymm3 ; / intra - vcvtps2dq ymm1, ymm1 - vmovdqu [r0+r6*2], ymm1 - add r6, 16 + mova m3, [r1+r4*2] + movu m4, [r2+r4*2] + mova m5, [pw_0xc000] + pand m4, m5 + pcmpeqw m4, m5 + pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 +%if cpuflag(avx) + pblendvb m5, m3, m5, m4 +%else + pand m5, m4 + pandn m4, m3 + por m5, m4 ; if( lists_used == 3 ) + ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 +%endif + + movu m0, [r0+r4*4] ; x,y + movu m1, [r0+r4*4+mmsize] + + psraw m2, m0, 5 + psraw m3, m1, 5 + mova m4, [pd_4] + paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} + paddw m6, m4 ; {mbx, mby} += {4, 0} + paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} + paddw m6, m4 ; {mbx, mby} += {4, 0} + + mova [r3+mmsize*0], m2 + mova [r3+mmsize*1], m3 + + mova m3, [pw_31] + pand m0, m3 ; x &= 31 + pand m1, m3 ; y &= 31 + packuswb m0, m1 + psrlw m1, m0, 3 + pand m0, m3 ; x + SWAP 1, 3 + pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw + + mova m3, [pw_32] + psubw m3, m0 ; 32 - x + mova m4, [pw_1024] + psubw m4, m1 ; (32 - y) << 5 + + pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5 + pmullw m4, m0 ; idx1weight = (32-y)*x << 5 + pmullw m0, m1 ; idx3weight = y*x << 5 + pmullw m1, m3 ; idx2weight = y*(32-x) << 5 + + ; avoid overflow in the input to pmulhrsw + psrlw m3, m2, 15 + psubw m2, m3 ; idx0weight -= (idx0weight == 32768) + + pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10 + pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10 + pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10 + pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10 + + SBUTTERFLY wd, 2, 4, 3 + SBUTTERFLY wd, 1, 0, 3 + mova [r3+mmsize*2], m2 + mova [r3+mmsize*3], m4 + mova [r3+mmsize*4], m1 + mova [r3+mmsize*5], m0 + add r4d, mmsize/2 + add r3, mmsize*6 + cmp r4d, r5d jl .loop - vzeroupper + REP_RET +%endmacro + +INIT_XMM ssse3 +MBTREE_PROPAGATE_LIST +INIT_XMM avx +MBTREE_PROPAGATE_LIST + +%macro MBTREE_FIX8 0 +;----------------------------------------------------------------------------- +; void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) +;----------------------------------------------------------------------------- +cglobal mbtree_fix8_pack, 3,4 +%if mmsize == 32 + vbroadcastf128 m2, [pf_256] + vbroadcasti128 m3, [mbtree_fix8_pack_shuf] +%else + movaps m2, [pf_256] + mova m3, [mbtree_fix8_pack_shuf] +%endif + sub r2d, mmsize/2 + movsxdifnidn r2, r2d + lea r1, [r1+4*r2] + lea r0, [r0+2*r2] + neg r2 + jg .skip_loop +.loop: + mulps m0, m2, [r1+4*r2] + mulps m1, m2, [r1+4*r2+mmsize] + cvttps2dq m0, m0 + cvttps2dq m1, m1 + packssdw m0, m1 + pshufb m0, m3 +%if mmsize == 32 + vpermq m0, m0, q3120 +%endif + mova [r0+2*r2], m0 + add r2, mmsize/2 + jle 
.loop +.skip_loop: + sub r2, mmsize/2 + jz .end + ; Do the remaining values in scalar in order to avoid overreading src. +.scalar: + mulss xm0, xm2, [r1+4*r2+2*mmsize] + cvttss2si r3d, xm0 + rol r3w, 8 + mov [r0+2*r2+mmsize], r3w + inc r2 + jl .scalar +.end: RET + +;----------------------------------------------------------------------------- +; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count ) +;----------------------------------------------------------------------------- +cglobal mbtree_fix8_unpack, 3,4 +%if mmsize == 32 + vbroadcastf128 m2, [pf_inv256] +%else + movaps m2, [pf_inv256] + mova m4, [mbtree_fix8_unpack_shuf+16] +%endif + mova m3, [mbtree_fix8_unpack_shuf] + sub r2d, mmsize/2 + movsxdifnidn r2, r2d + lea r1, [r1+2*r2] + lea r0, [r0+4*r2] + neg r2 + jg .skip_loop +.loop: +%if mmsize == 32 + vbroadcasti128 m0, [r1+2*r2] + vbroadcasti128 m1, [r1+2*r2+16] + pshufb m0, m3 + pshufb m1, m3 +%else + mova m1, [r1+2*r2] + pshufb m0, m1, m3 + pshufb m1, m4 +%endif + psrad m0, 16 ; sign-extend + psrad m1, 16 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + mulps m0, m2 + mulps m1, m2 + movaps [r0+4*r2], m0 + movaps [r0+4*r2+mmsize], m1 + add r2, mmsize/2 + jle .loop +.skip_loop: + sub r2, mmsize/2 + jz .end +.scalar: + movzx r3d, word [r1+2*r2+mmsize] + rol r3w, 8 + movsx r3d, r3w + ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on + ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design. + cvtsi2ss xm0, xm2, r3d + mulss xm0, xm2 + movss [r0+4*r2+2*mmsize], xm0 + inc r2 + jl .scalar +.end: + RET +%endmacro + +INIT_XMM ssse3 +MBTREE_FIX8 +INIT_YMM avx2 +MBTREE_FIX8
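To summarize what the mbtree kernels above compute, here is a scalar C sketch. It is illustrative only: the _ref names are invented, exact rounding differs slightly (the asm uses cvtps2dq/packssdw saturation and an rcpps+Newton reciprocal), and the propagate-cost model assumes the caller now pre-scales *fps_factor, since the old pf_inv256 multiply is gone from that routine.

#include <stdint.h>

static void mbtree_propagate_cost_ref( int16_t *dst, const uint16_t *propagate_in,
                                       const uint16_t *intra_costs, const uint16_t *inter_costs,
                                       const uint16_t *inv_qscales, const float *fps_factor, int len )
{
    float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        int intra = intra_costs[i];                  /* assumed nonzero */
        int inter = inter_costs[i] & 0x3fff;         /* pw_3fff strips the lists_used flags */
        if( inter > intra )
            inter = intra;                           /* pminsw / pminsd */
        float amount = propagate_in[i] + (float)intra * inv_qscales[i] * fps;
        int v = (int)( amount * (intra - inter) / intra );
        dst[i] = (int16_t)( v > 32767 ? 32767 : v < -32768 ? -32768 : v );  /* packssdw */
    }
}

/* fix8 is a big-endian Q8.8 fixed-point format: pack scales by 256 (pf_256) and
 * byte-swaps each 16-bit word, unpack byte-swaps back and scales by 1/256 (pf_inv256). */
static void mbtree_fix8_pack_ref( uint16_t *dst, const float *src, int count )
{
    for( int i = 0; i < count; i++ )
    {
        int v = (int)(src[i] * 256.0f);                           /* cvttps2dq truncates */
        v = v > 32767 ? 32767 : v < -32768 ? -32768 : v;          /* packssdw saturates  */
        uint16_t u = (uint16_t)v;
        dst[i] = (uint16_t)( (u << 8) | (u >> 8) );               /* rol r3w, 8          */
    }
}

static void mbtree_fix8_unpack_ref( float *dst, const uint16_t *src, int count )
{
    for( int i = 0; i < count; i++ )
    {
        uint16_t swapped = (uint16_t)( (src[i] << 8) | (src[i] >> 8) );
        dst[i] = (int16_t)swapped * (1.0f/256.0f);                /* psrad 16 sign-extends */
    }
}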