X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpixel-a.asm;h=9791e8665ea635edc8c7a93de77b774cb6ae85b3;hb=7c860f075ccd14fb7891d5fc6c9eab1a37ea555d;hp=619af7c4fff11a125eedf86d324a44db010a2c49;hpb=b63a73da3add660358a4bad1a590c2d4ed466dc4;p=x264 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 619af7c4..9791e866 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2011 x264 project +;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz @@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -85,6 +89,7 @@ intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 +ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 @@ -120,9 +125,29 @@ transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -130,83 +155,83 @@ cextern pb_1 cextern pw_1 cextern pw_8 cextern pw_16 -cextern pw_64 +cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD ;============================================================================= -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 
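    ; Reference sketch of what SSD_ONE computes (illustrative names only,
    ; not part of the patch):
    ;     ssd = 0;
    ;     for( y = 0; y < %2; y++ )
    ;         for( x = 0; x < %1; x++ )
    ;             d = pix1[y*stride1+x] - pix2[y*stride2+x], ssd += d*d;
    ; below, psubw forms the differences, pmaddwd squares them and pairs the
    ; products into dword lanes, and HADDD reduces the accumulator to the
    ; return value.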
+ FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 -.loop +.loop: mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif - lea r0, [r0+r1*2*num_rows] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] - lea r2, [r2+r3*2*num_rows] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, [r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 - dec r4 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - dec r4 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro @@ -217,17 +242,20 @@ SSD_ONE 4, 16 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 -SSD_16_MMX 16, 8 -SSD_16_MMX 16, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %macro SSD_LOAD_FULL 5 mova m1, [t0+%1] mova m2, [t2+%2] @@ -287,6 +315,23 @@ SSD_ONE 16, 16 punpcklbw m%2, m%4 %endmacro +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 @@ -361,7 +406,7 @@ SSD_ONE 16, 16 %endmacro ;----------------------------------------------------------------------------- -; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int ) +; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD 2 %if %1 != %2 @@ -377,7 +422,7 @@ cglobal pixel_ssd_%1x%2, 0,0,0 %else .startloop: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 PROLOGUE 0,0,8 %else @@ -409,8 +454,15 @@ ALIGN 16 %endif dec al jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else HADDD m0, m1 movd eax, m0 +%endif RET %endif %endmacro @@ -462,11 +514,16 @@ 
SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2, +; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; The maximum width this function can handle without risk of overflow is given @@ -477,7 +534,7 @@ SSD 8, 4 ; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane ; distortion levels it will take much more than that though. ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7,7 shl r4d, 2 @@ -500,7 +557,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7 psubw m1, [r2+r6+mmsize] PSHUFLW m0, m0, q3120 PSHUFLW m1, m1, q3120 -%if mmsize==16 +%if mmsize >= 16 pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif @@ -510,8 +567,13 @@ cglobal pixel_ssd_nv12_core, 6,7,7 paddd m3, m1 add r6, 2*mmsize jl .loopx -%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the - ; equation above, putting the width limit at 8208 +%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled + jz .no_overread + psubd m3, m1 +.no_overread: +%endif +%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the + ; equation above, putting the width limit at 8208 punpckhdq m0, m2, m6 punpckhdq m1, m3, m6 punpckldq m2, m6 @@ -539,9 +601,13 @@ cglobal pixel_ssd_nv12_core, 6,7,7 jg .loopy mov r3, r6m mov r4, r7m -%if mmsize==16 - movq [r3], m4 - movhps [r4], m4 +%if mmsize == 32 + vextracti128 xm0, m4, 1 + paddq xm4, xm0 +%endif +%if mmsize >= 16 + movq [r3], xm4 + movhps [r4], xm4 %else ; fixup for mmx2 SBUTTERFLY dq, 4, 5, 0 mova m0, m4 @@ -558,9 +624,9 @@ cglobal pixel_ssd_nv12_core, 6,7,7 %endmacro ; SSD_NV12 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, +; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; This implementation can potentially overflow on image widths >= 11008 (or @@ -569,7 +635,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7 ;----------------------------------------------------------------------------- %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7 - shl r4d, 1 + add r4d, r4d add r0, r4 add r2, r4 pxor m3, m3 @@ -579,10 +645,15 @@ cglobal pixel_ssd_nv12_core, 6,7 mov r6, r4 neg r6 .loopx: - mova m0, [r0+r6] +%if mmsize == 32 ; only 16-byte alignment is guaranteed + movu m2, [r0+r6] + movu m1, [r2+r6] +%else + mova m2, [r0+r6] mova m1, [r2+r6] - psubusb m0, m1 - psubusb m1, [r0+r6] +%endif + psubusb m0, m2, m1 + psubusb m1, m2 por m0, m1 psrlw m2, m0, 8 pand m0, m5 @@ -592,19 +663,28 @@ cglobal pixel_ssd_nv12_core, 6,7 paddd m4, m2 add r6, mmsize jl .loopx +%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled + jz .no_overread + pcmpeqb xm1, xm1 + pandn m0, m1, m0 ; zero the lower half + pandn m2, m1, m2 + psubd m3, m0 + psubd m4, m2 +.no_overread: +%endif add r0, r1 add r2, r3 dec r5d jg .loopy mov r3, r6m mov r4, r7m - mova m5, 
[sq_0f] HADDD m3, m0 HADDD m4, m0 - pand m3, m5 - pand m4, m5 - movq [r3], m3 - movq [r4], m4 + pxor xm0, xm0 + punpckldq xm3, xm0 + punpckldq xm4, xm0 + movq [r3], xm3 + movq [r4], xm4 RET %endmacro ; SSD_NV12 %endif ; !HIGH_BIT_DEPTH @@ -615,6 +695,8 @@ INIT_XMM sse2 SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_YMM avx2 +SSD_NV12 ;============================================================================= ; variance @@ -623,17 +705,17 @@ SSD_NV12 %macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%else +%elif mmsize < 32 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro %macro VAR_END 2 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 && %1*%2 == 256 HADDUW m5, m2 %else @@ -642,12 +724,13 @@ SSD_NV12 %else ; !HIGH_BIT_DEPTH HADDW m5, m2 %endif ; HIGH_BIT_DEPTH - movd eax, m5 HADDD m6, m1 +%if ARCH_X86_64 + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 movd edx, m6 -%ifdef ARCH_X86_64 - shl rdx, 32 - add rax, rdx %endif RET %endmacro @@ -670,7 +753,7 @@ SSD_NV12 %macro VAR_2ROW 2 mov r2d, %2 .loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+mmsize] mova m3, [r0+%1] @@ -687,7 +770,7 @@ SSD_NV12 %else add r0, r1 %endif -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 punpcklbw m3, m7 punpckhbw m4, m7 %endif ; !HIGH_BIT_DEPTH @@ -697,7 +780,7 @@ SSD_NV12 %endmacro ;----------------------------------------------------------------------------- -; int pixel_var_wxh( uint8_t *, int ) +; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_var_16x16, 2,3 @@ -718,7 +801,7 @@ cglobal pixel_var_8x8, 2,3 VAR_2ROW r1, 4 VAR_END 8, 8 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 @@ -751,7 +834,7 @@ INIT_XMM xop VAR %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 VAR_START 1 @@ -805,22 +888,51 @@ INIT_XMM avx VAR INIT_XMM xop VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET %endif ; !HIGH_BIT_DEPTH -%macro VAR2_END 1 - HADDW m5, m7 - movd r1d, m5 +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 imul r1d, r1d - HADDD m6, m1 + HADDD %3, xm1 shr r1d, %1 - movd eax, m6 - mov [r4], eax + movd eax, %3 + movd [r4], %3 sub eax, r1d ; sqr - (sum * sum >> shift) RET %endmacro ;----------------------------------------------------------------------------- -; int pixel_var2_8x8( pixel *, int, pixel *, int, int * ) +; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * ) ;----------------------------------------------------------------------------- %macro VAR2_8x8_MMX 2 cglobal pixel_var2_8x%1, 5,6 @@ -828,7 +940,7 @@ cglobal pixel_var2_8x%1, 5,6 VAR_START 0 mov r5d, %1 .loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+mmsize] psubw m0, [r2] @@ -855,10 +967,10 @@ cglobal pixel_var2_8x%1, 5,6 add r2, r3 dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 
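; the mmx2 var2 kernels are only built for x86_32: x86_64 guarantees SSE2,
; so the SSE2 versions below always take over there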
INIT_MMX mmx2 VAR2_8x8_MMX 8, 6 VAR2_8x8_MMX 16, 7 @@ -869,7 +981,7 @@ cglobal pixel_var2_8x%1, 5,6,8 VAR_START 1 mov r5d, %1/2 .loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+r1*2] mova m2, [r2] @@ -893,14 +1005,14 @@ cglobal pixel_var2_8x%1, 5,6,8 lea r2, [r2+r3*2*SIZEOF_PIXEL] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM sse2 VAR2_8x8_SSE2 8, 6 VAR2_8x8_SSE2 16, 7 -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %macro VAR2_8x8_SSSE3 2 cglobal pixel_var2_8x%1, 5,6,8 pxor m5, m5 ; sum @@ -942,7 +1054,7 @@ cglobal pixel_var2_8x%1, 5,6,8 lea r2, [r2+r3*2] dec r5d jg .loop - VAR2_END %2 + VAR2_END %2, m5, m6 %endmacro INIT_XMM ssse3 @@ -952,6 +1064,48 @@ INIT_XMM xop VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 +%macro VAR2_8x8_AVX2 2 +cglobal pixel_var2_8x%1, 5,6,6 + pxor m3, m3 ; sum + pxor m4, m4 ; sum squared + mova m5, [hsub_mul] + mov r5d, %1/4 +.loop: + movq xm0, [r0] + movq xm1, [r2] + vinserti128 m0, m0, [r0+r1], 1 + vinserti128 m1, m1, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m1 + movq xm1, [r0] + movq xm2, [r2] + vinserti128 m1, m1, [r0+r1], 1 + vinserti128 m2, m2, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddw m3, m0 + paddw m3, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m4, m0 + paddd m4, m1 + dec r5d + jg .loop + vextracti128 xm0, m3, 1 + vextracti128 xm1, m4, 1 + paddw xm3, xm0 + paddd xm4, xm1 + VAR2_END %2, xm3, xm4 +%endmacro + +INIT_YMM avx2 +VAR2_8x8_AVX2 8, 6 +VAR2_8x8_AVX2 16, 7 + %endif ; !HIGH_BIT_DEPTH ;============================================================================= @@ -962,7 +1116,7 @@ VAR2_8x8_SSSE3 16, 7 %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 @@ -1023,7 +1177,7 @@ VAR2_8x8_SSSE3 16, 7 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro -%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0 +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] @@ -1059,13 +1213,59 @@ VAR2_8x8_SSSE3 16, 7 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
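; rows are loaded duplicated within each 128-bit lane so that pmaddubsw
; against the +1/-1 pattern in %7 (an hmul_* table) folds the first
; horizontal sum/difference stage of the Hadamard transform into the load,
; on top of the fenc/fdec subtraction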
+ LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 - %xdefine %%n n%1 + %xdefine %%n nn%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] @@ -1080,8 +1280,9 @@ VAR2_8x8_SSSE3 16, 7 SWAP %%n, 4 %endmacro +; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 -%ifidn %1, sse2 +%if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 @@ -1095,7 +1296,7 @@ VAR2_8x8_SSSE3 16, 7 %else SWAP %8, %2 %endif -%ifidn %1, sse2 +%if %1 paddw m%8, m%4 %else HADAMARD 1, max, %3, %5, %6, %7 @@ -1110,7 +1311,7 @@ VAR2_8x8_SSSE3 16, 7 %endmacro %macro SATD_END_MMX 0 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 movd eax, m0 %else ; !HIGH_BIT_DEPTH @@ -1128,7 +1329,7 @@ VAR2_8x8_SSSE3 16, 7 ; for small blocks on x86_32, modify pixel pointer instead. ;----------------------------------------------------------------------------- -; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int ) +; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_satd_16x4_internal @@ -1154,7 +1355,7 @@ pixel_satd_8x4_internal_mmx2: paddw m0, m1 ret -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro SATD_MxN_MMX 3 cglobal pixel_satd_%1x%2, 4,7 SATD_START_MMX @@ -1182,7 +1383,7 @@ SATD_MxN_MMX 16, 8, 4 SATD_MxN_MMX 8, 16, 8 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 cglobal pixel_satd_16x16, 4,6 SATD_START_MMX pxor m0, m0 @@ -1250,41 +1451,93 @@ cglobal pixel_satd_4x4, 4,6 SATD_4x4_MMX m0, 0, 0 SATD_END_MMX -%macro SATD_START_SSE2 2 -%if cpuflag(ssse3) +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else mova %2, [hmul_8p] +%endif %endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro -%macro SATD_END_SSE2 1 - HADDW %1, m7 +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif movd eax, %1 RET %endmacro +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif +%endmacro + %macro BACKUP_POINTERS 0 -%ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 +%if ARCH_X86_64 +%if WIN64 + PUSH r7 +%endif + mov r6, r0 + mov r7, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 -%ifdef ARCH_X86_64 - lea r0, [r10+8] - lea r2, [r11+8] +%if ARCH_X86_64 + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] +%if WIN64 + POP r7 +%endif %else mov r0, r0mp mov r2, r2mp - add r0, 8 - add r2, 8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL %endif %endmacro -%macro SATD_4x8_SSE 2 +%macro SATD_4x8_SSE 3 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] + movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, [r2+2*r3] + psubw 
m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, [r2+r3] movd m6, [r2+2*r3] @@ -1301,7 +1554,7 @@ cglobal pixel_satd_4x4, 4,6 JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m3, [hmul_4p] DIFFOP 0, 4, 1, 5, 3 %else @@ -1319,20 +1572,23 @@ cglobal pixel_satd_4x4, 4,6 JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 -%if cpuflag(ssse3) && %1==1 +%if %1==0 && %2==1 mova m4, [hmul_4p] DIFFOP 2, 6, 3, 5, 4 %else DIFFOP 2, 6, 3, 5, 7 %endif - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2 +%endif ; HIGH_BIT_DEPTH + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 %endmacro ;----------------------------------------------------------------------------- -; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int ) +; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 -%if cpuflag(ssse3) +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if vertical==0 || HIGH_BIT_DEPTH cglobal pixel_satd_4x4, 4, 6, 6 SATD_START_MMX mova m4, [hmul_4p] @@ -1351,55 +1607,57 @@ cglobal pixel_satd_4x4, 4, 6, 6 cglobal pixel_satd_4x8, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap + SATD_4x8_SSE vertical, 0, swap HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_4x16, 4, 6, 8 SATD_START_MMX -%if cpuflag(ssse3) +%if vertical==0 mova m7, [hmul_4p] %endif - SATD_4x8_SSE 0, swap - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - SATD_4x8_SSE 1, add + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_8x8_internal - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret -%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) cglobal pixel_satd_16x4_internal LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] - ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2? 
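    ; (in the updated macro the first argument states this explicitly:
    ;  0 = horizontal mode, reusing the sumsub loads; 1 = vertical sse2 mode)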
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10 - SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10 + ; always use horizontal mode here + SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret cglobal pixel_satd_16x8, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal @@ -1418,14 +1676,15 @@ cglobal pixel_satd_16x8, 4,6,8 SATD_END_SSE2 m6 cglobal pixel_satd_16x16, 4,6,8 - SATD_START_SSE2 m6, m7 + SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal + SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal - SATD_END_SSE2 m6 + SATD_END_SSE2 m6, m7 %endif cglobal pixel_satd_8x16, 4,6,8 @@ -1446,14 +1705,14 @@ cglobal pixel_satd_8x4, 4,6,8 %endmacro ; SATDS_SSE2 %macro SA8D_INTER 0 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define lh m10 %define rh m0 %else %define lh m0 %define rh [esp+48] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 paddd lh, rh %else @@ -1462,21 +1721,18 @@ cglobal pixel_satd_8x4, 4,6,8 %endmacro %macro SA8D 0 -%ifdef HIGH_BIT_DEPTH - %define vertical 1 -%else ; sse2 doesn't seem to like the horizontal way of doing things - %define vertical (cpuflags == cpuflags_sse2) -%endif +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- -; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int ) +; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal - lea r10, [r0+4*r1] - lea r11, [r2+4*r3] + lea r6, [r0+4*r1] + lea r7, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 - LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 %if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 @@ -1488,7 +1744,7 @@ cglobal pixel_sa8d_8x8_internal SAVE_MM_PERMUTATION ret -cglobal pixel_sa8d_8x8, 4,6,12 +cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1496,7 +1752,7 @@ cglobal pixel_sa8d_8x8, 4,6,12 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 @@ -1506,7 +1762,7 @@ cglobal pixel_sa8d_8x8, 4,6,12 shr eax, 1 RET -cglobal pixel_sa8d_16x16, 4,6,12 +cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1516,7 +1772,7 @@ cglobal pixel_sa8d_16x16, 4,6,12 call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 @@ -1531,7 +1787,7 @@ cglobal pixel_sa8d_16x16, 4,6,12 call pixel_sa8d_8x8_internal ; pix[8*stride] SA8D_INTER SWAP 0, 10 -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd eax, m0 @@ -1600,7 +1856,7 @@ cglobal pixel_sa8d_8x8, 4,7 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 @@ -1623,7 +1879,7 @@ cglobal pixel_sa8d_16x16, 4,7 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH 
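    ; widen the per-8x8 word sums to dwords right away: at 10-bit depth the
    ; running total of four 8x8 sa8d blocks can overflow packed 16-bit words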
HADDUW m0, m1 %endif mova [esp+48], m0 @@ -1643,7 +1899,7 @@ cglobal pixel_sa8d_16x16, 4,7 %endif mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH SA8D_INTER %else ; !HIGH_BIT_DEPTH paddusw m0, [esp+64-mmsize] @@ -1672,6 +1928,170 @@ cglobal pixel_sa8d_16x16, 4,7 %endif ; !ARCH_X86_64 %endmacro ; SA8D +;============================================================================= +; SA8D_SATD +;============================================================================= + +; %1: vertical/horizontal mode +; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) +; m10: satd result +; m6, m11-15: tmp regs +%macro SA8D_SATD_8x4 5 +%if %1 + LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY wd, %2, %3, 6 + SBUTTERFLY wd, %4, %5, 6 + HADAMARD2_2D %2, %4, %3, %5, 6, dq + + mova m12, m%2 + mova m13, m%3 + mova m14, m%4 + mova m15, m%5 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY qdq, 12, 13, 6 + HADAMARD 0, amax, 12, 13, 6 + SBUTTERFLY qdq, 14, 15, 6 + paddw m10, m12 + HADAMARD 0, amax, 14, 15, 6 + paddw m10, m14 +%else + LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD4_V %2, %3, %4, %5, 6 + + pabsw m12, m%2 ; doing the abs first is a slight advantage + pabsw m14, m%4 + pabsw m13, m%3 + pabsw m15, m%5 + HADAMARD 1, max, 12, 14, 6, 11 + paddw m10, m12 + HADAMARD 1, max, 13, 15, 6, 11 + paddw m10, m13 +%endif +%endmacro ; SA8D_SATD_8x4 + +; %1: add spilled regs? +; %2: spill regs? +%macro SA8D_SATD_ACCUM 2 +%if HIGH_BIT_DEPTH + pmaddwd m10, [pw_1] + HADDUWD m0, m1 +%if %1 + paddd m10, temp1 + paddd m0, temp0 +%endif +%if %2 + mova temp1, m10 + pxor m10, m10 +%endif +%elif %1 + paddw m0, temp0 +%endif +%if %2 + mova temp0, m0 +%endif +%endmacro + +%macro SA8D_SATD 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) +cglobal pixel_sa8d_satd_8x8_internal + SA8D_SATD_8x4 vertical, 0, 1, 2, 3 + SA8D_SATD_8x4 vertical, 4, 5, 8, 9 + +%if vertical ; sse2-style + HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax + HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax +%else ; complete sa8d + SUMSUB_BADC w, 0, 4, 1, 5, 12 + HADAMARD 2, sumsub, 0, 4, 12, 11 + HADAMARD 2, sumsub, 1, 5, 12, 11 + SUMSUB_BADC w, 2, 8, 3, 9, 12 + HADAMARD 2, sumsub, 2, 8, 12, 11 + HADAMARD 2, sumsub, 3, 9, 12, 11 + HADAMARD 1, amax, 0, 4, 12, 11 + HADAMARD 1, amax, 1, 5, 12, 4 + HADAMARD 1, amax, 2, 8, 12, 4 + HADAMARD 1, amax, 3, 9, 12, 4 +%endif + + ; create sa8d sub results + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + + SAVE_MM_PERMUTATION + ret + +;------------------------------------------------------------------------------- +; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) +;------------------------------------------------------------------------------- +cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize + %define temp0 [rsp+0*mmsize] + %define temp1 [rsp+1*mmsize] + FIX_STRIDES r1, r3 +%if vertical==0 + mova m7, [hmul_8p] +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor m10, m10 + +%if mmsize==32 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m10, 1 + paddw xm0, xm1 + paddw xm10, xm2 +%else + lea r6, [r2+8*SIZEOF_PIXEL] + lea r7, [r0+8*SIZEOF_PIXEL] + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + + mov r0, r7 + mov r2, r6 + + 
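    ; left 8x16 column done (the 8x8 routine advances the row pointers as it
    ; goes); restore the pointers saved above to start on the column at +8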
call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 +%endif + +; xop already has fast horizontal sums +%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 + pmaddwd xm10, [pw_1] + HADDUWD xm0, xm1 + phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 + pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 + paddd xm0, xm1 ; sa8d sa8d satd satd + movd r0d, xm0 + pextrd eax, xm0, 2 +%else +%if HIGH_BIT_DEPTH + HADDD xm0, xm1 + HADDD xm10, xm2 +%else + HADDUW xm0, xm1 + HADDW xm10, xm2 +%endif + movd r0d, xm0 + movd eax, xm10 +%endif + add r0d, 1 + shl rax, 32 + shr r0d, 1 + or rax, r0 + RET +%endmacro ; SA8D_SATD + ;============================================================================= ; INTRA SATD ;============================================================================= @@ -1688,7 +2108,7 @@ cglobal pixel_sa8d_16x16, 4,7 ; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+, ; and are only retained for old cpus. %macro INTRA_SA8D_SSE2 0 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- @@ -1794,6 +2214,12 @@ cglobal intra_sa8d_x3_8x8, 3,3,14 INIT_MMX cglobal hadamard_load ; not really a global, but otherwise cycles get attributed to the wrong function in profiling +%if HIGH_BIT_DEPTH + mova m0, [r0+0*FENC_STRIDEB] + mova m1, [r0+1*FENC_STRIDEB] + mova m2, [r0+2*FENC_STRIDEB] + mova m3, [r0+3*FENC_STRIDEB] +%else pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] @@ -1803,24 +2229,31 @@ cglobal hadamard_load punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 +%endif HADAMARD4_2D 0, 1, 2, 3, 4 SAVE_MM_PERMUTATION ret %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp %ifidn %1, top - movd %3, [r1+%2-FDEC_STRIDE] +%if HIGH_BIT_DEPTH + mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] +%else + movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] pxor %5, %5 punpcklbw %3, %5 +%endif %else ; left %ifnidn %2, 0 - shl %2d, 5 ; log(FDEC_STRIDE) + shl %2d, 5 ; log(FDEC_STRIDEB) %endif - movd %3, [r1+%2-4+1*FDEC_STRIDE] - pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0 - pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2 - pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3 + movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB] + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3 +%if HIGH_BIT_DEPTH == 0 psrlw %3, 8 +%endif %ifnidn %2, 0 shr %2d, 5 %endif @@ -1859,19 +2292,6 @@ cglobal hadamard_load %8 %3, %6 %endmacro -%macro CLEAR_SUMS 0 -%ifdef ARCH_X86_64 - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 -%else - pxor m7, m7 - movq [sums+0], m7 - movq [sums+8], m7 - movq [sums+16], m7 -%endif -%endmacro - ; in: m1..m3 ; out: m7 ; clobber: m4..m6 @@ -1907,15 +2327,16 @@ cglobal hadamard_load ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_4x4, 3,3 -%ifdef ARCH_X86_64 +%if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %else - ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned - SUB esp, 16 - %define top_1d esp+8 - %define left_1d esp + ; WIN64: stack is 16 byte aligned 
because abi says so + ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB rsp, 16 + %define top_1d rsp+8 + %define left_1d rsp %endif call hadamard_load @@ -1937,50 +2358,52 @@ cglobal intra_satd_x3_4x4, 3,3 movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd -%ifndef ARCH_X86_64 - ADD esp, 16 +%if UNIX64 == 0 + ADD rsp, 16 %endif RET -%ifdef ARCH_X86_64 - %define t0 r10 - %define t2 r11 -%else - %define t0 r0 - %define t2 r2 -%endif - ;----------------------------------------------------------------------------- ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_16x16, 0,5 - %assign stack_pad 88 + ((stack_offset+88+gprsize)&15) + %assign stack_pad 120 + ((stack_offset+120+gprsize)&15) ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad -%define sums rsp+64 ; size 24 +%define sums rsp+64 ; size 56 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 movifnidn r1, r1mp - CLEAR_SUMS + + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%if HIGH_BIT_DEPTH + mova [sums+24], m7 + mova [sums+32], m7 + mova [sums+40], m7 + mova [sums+48], m7 +%endif ; 1D hadamards - mov t0d, 12 - movd m6, [pw_64] + mov r3d, 12 + movd m6, [pw_32] .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m1, m2, m3 - paddw m6, m0 - paddw m6, m1 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m1, m2, m3 + pavgw m0, m1 + paddw m6, m0 + sub r3d, 4 jge .loop_edge - psrlw m6, 3 - pand m6, [sw_f0] ; dc + psrlw m6, 2 + pand m6, [sw_f0] ; dc ; 2D hadamards - movifnidn r0, r0mp - mov r3, -4 + movifnidn r0, r0mp + mov r3, -4 .loop_y: - mov r4, -4 + mov r4, -4 .loop_x: call hadamard_load @@ -1988,38 +2411,79 @@ cglobal intra_satd_x3_16x16, 0,5 SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)] pavgw m4, m7 pavgw m5, m7 - paddw m0, [sums+0] ; i16x16_v satd - paddw m4, [sums+8] ; i16x16_h satd + paddw m0, [sums+ 0] ; i16x16_v satd + paddw m4, [sums+ 8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd - movq [sums+0], m0 - movq [sums+8], m4 - movq [sums+16], m5 + mova [sums+ 0], m0 + mova [sums+ 8], m4 + mova [sums+16], m5 - add r0, 4 + add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-16 +%if HIGH_BIT_DEPTH + psrld m7, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m7 + psrld m7, m0, 16 + pslld m0, 16 + psrld m0, 16 + paddd m0, m7 + paddd m4, [sums+32] + paddd m0, [sums+24] + mova [sums+32], m4 + mova [sums+24], m0 + pxor m7, m7 + punpckhwd m3, m5, m7 + punpcklwd m5, m7 + paddd m3, [sums+48] + paddd m5, [sums+40] + mova [sums+48], m3 + mova [sums+40], m5 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%endif + add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL inc r3 jl .loop_y ; horizontal sum movifnidn r2, r2mp - movq m2, [sums+16] - movq m1, [sums+8] - movq m0, [sums+0] - movq m7, m2 - SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd +%if HIGH_BIT_DEPTH + mova m1, m5 + paddd m5, m3 + HADDD m5, m7 ; DC satd + HADDD m4, m7 ; H satd + HADDD m0, m7 ; the part of V satd that doesn't overlap with DC + psrld m0, 1 + psrlq m1, 32 ; DC[1] + paddd m0, m3 ; DC[2] + psrlq m3, 32 ; DC[3] + paddd m0, m1 + paddd m0, m3 +%else + mova m7, m5 + SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 - paddd m0, m2 + paddd 
m0, m5 psubd m0, m7 - movd [r2+8], m2 ; i16x16_dc satd - movd [r2+4], m1 ; i16x16_h satd - movd [r2+0], m0 ; i16x16_v satd - ADD rsp, stack_pad +%endif + movd [r2+8], m5 ; i16x16_dc satd + movd [r2+4], m4 ; i16x16_h satd + movd [r2+0], m0 ; i16x16_v satd + ADD rsp, stack_pad RET +%if ARCH_X86_64 + %define t0 r6 +%else + %define t0 r2 +%endif + ;----------------------------------------------------------------------------- ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- @@ -2031,32 +2495,35 @@ cglobal intra_satd_x3_8x8c, 0,6 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 movifnidn r1, r1mp - CLEAR_SUMS + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 ; 1D hadamards - mov t0d, 4 + mov r3d, 4 .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m0, m1, m2 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m0, m1, m2 + sub r3d, 4 jge .loop_edge ; dc - movzx t2d, word [left_1d+0] + movzx t0d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word [top_1d+8] - lea t2d, [t2 + r3 + 16] + lea t0d, [t0 + r3 + 16] lea r3d, [r4 + r5 + 16] - shr t2d, 1 + shr t0d, 1 shr r3d, 1 add r4d, 8 add r5d, 8 - and t2d, -16 ; tl + and t0d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr - mov [dc_1d+ 0], t2d ; tl + mov [dc_1d+ 0], t0d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br @@ -2082,10 +2549,10 @@ cglobal intra_satd_x3_8x8c, 0,6 movq [sums+8], m4 movq [sums+0], m5 - add r0, 4 + add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-8 + add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL add r5, 8 inc r3 jl .loop_y @@ -2095,10 +2562,18 @@ cglobal intra_satd_x3_8x8c, 0,6 movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 +%if HIGH_BIT_DEPTH + psrlq m7, 16 + HADDW m7, m3 + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd + psrld m2, 1 + paddd m2, m7 +%else psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 +%endif movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd @@ -2310,7 +2785,7 @@ cglobal intra_sad_x9_4x4, 3,4,9 %assign pad 0xc0-gprsize-(stack_offset&15) %define pred_buf rsp sub rsp, pad -%ifdef ARCH_X86_64 +%if ARCH_X86_64 INTRA_X9_PRED intrax9a, m8 %else INTRA_X9_PRED intrax9a, [rsp+0xa0] @@ -2345,7 +2820,7 @@ cglobal intra_sad_x9_4x4, 3,4,9 paddd m2, m3 paddd m4, m5 paddd m6, m7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 SWAP 7, 8 pxor m8, m8 %define %%zero m8 @@ -2385,7 +2860,7 @@ cglobal intra_sad_x9_4x4, 3,4,9 RET %endif ; cpuflag -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) ;----------------------------------------------------------------------------- @@ -2466,7 +2941,7 @@ ALIGN 16 psubw m1, m9 psubw m2, m10 psubw m3, m11 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2574,7 +3049,7 @@ ALIGN 16 psubw m2, [fenc_buf+0x20] .satd_8x4b: psubw m3, [fenc_buf+0x30] - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2597,7 +3072,7 @@ cglobal intra_sad_x9_8x8, 5,6,9 %define fenc13 m5 %define fenc46 m6 %define fenc57 m7 -%ifdef 
ARCH_X86_64 +%if ARCH_X86_64 %define tmp m8 %assign padbase 0x0 %else @@ -2953,7 +3428,7 @@ cglobal intra_sad_x9_8x8, 5,6,9 ADD rsp, pad RET -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) ;----------------------------------------------------------------------------- @@ -3254,7 +3729,7 @@ ALIGN 16 ; out: [tmp]=hadamard4, m0=satd INIT_MMX mmx2 cglobal hadamard_ac_4x4 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+r1] mova m2, [r0+r1*2] @@ -3296,7 +3771,7 @@ cglobal hadamard_ac_2x2max ABSW2 m1, m3, m1, m3, m4, m5 HADAMARD 0, max, 0, 2, 4, 5 HADAMARD 0, max, 1, 3, 4, 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 paddd m6, m0 @@ -3309,13 +3784,13 @@ cglobal hadamard_ac_2x2max ret %macro AC_PREP 2 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pmaddwd %1, %2 %endif %endmacro %macro AC_PADD 3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH AC_PREP %2, %3 paddd %1, %2 %else @@ -3325,7 +3800,7 @@ cglobal hadamard_ac_2x2max cglobal hadamard_ac_8x8 mova m6, [mask_ac4] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m7, [pw_1] %else pxor m7, m7 @@ -3347,7 +3822,7 @@ cglobal hadamard_ac_8x8 AC_PADD m5, m0, m7 sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pxor m6, m6 %endif %rep 3 @@ -3362,7 +3837,7 @@ cglobal hadamard_ac_8x8 ABSW2 m1, m3, m1, m3, m4, m5 ABSW2 m0, m2, m0, m2, m4, m5 HADAMARD 0, max, 1, 3, 4, 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pand m0, [mask_ac4] pmaddwd m1, m7 pmaddwd m0, m7 @@ -3386,7 +3861,7 @@ cglobal hadamard_ac_8x8 %macro HADAMARD_AC_WXH_SUM_MMX 2 mova m1, [rsp+1*mmsize] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+2*mmsize] paddd m1, [rsp+3*mmsize] @@ -3459,7 +3934,7 @@ cglobal pixel_hadamard_ac_%1x%2, 2,4 movd edx, m0 movd eax, m1 shr edx, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif @@ -3473,7 +3948,7 @@ HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 %macro LOAD_INC_8x4W_SSE2 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movu m%1, [r0] movu m%2, [r0+r1] movu m%3, [r0+r1*2] @@ -3508,18 +3983,18 @@ HADAMARD_AC_WXH_MMX 8, 8 ; in: r0=pix, r1=stride, r2=stride*3 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4 cglobal hadamard_ac_8x8 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define spill0 m8 %define spill1 m9 %define spill2 m10 %else %define spill0 [rsp+gprsize] - %define spill1 [rsp+gprsize+16] - %define spill2 [rsp+gprsize+32] + %define spill1 [rsp+gprsize+mmsize] + %define spill2 [rsp+gprsize+mmsize*2] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %define vertical 1 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) %define vertical 0 ;LOAD_INC loads sumsubs mova m7, [hmul_8p] @@ -3578,17 +4053,14 @@ cglobal hadamard_ac_8x8 AC_PADD m1, m2, [pw_1] ABSW m2, m7, m7 AC_PADD m1, m3, [pw_1] - mova m3, m7 AC_PADD m1, m2, [pw_1] - mova m2, m6 + paddw m3, m7, spill2 psubw m7, spill2 - paddw m3, spill2 - mova [rsp+gprsize+32], m1 ; save satd - mova m1, m5 + mova [rsp+gprsize+mmsize*2], m1 ; save satd + paddw m2, m6, spill1 psubw m6, spill1 - paddw m2, spill1 + paddw m1, m5, spill0 psubw m5, spill0 - paddw m1, spill0 %assign %%x 2 %if vertical %assign %%x 4 @@ -3602,7 +4074,7 @@ cglobal hadamard_ac_8x8 AC_PREP m2, [pw_1] AC_PADD m2, m3, [pw_1] AC_PADD m2, m1, [pw_1] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH paddd m2, m2 %else paddw m2, m2 @@ 
-3612,20 +4084,22 @@ cglobal hadamard_ac_8x8 ABSW m0, m0, m7 AC_PADD m2, m4, [pw_1] AC_PADD m2, m0, [pw_1] - mova [rsp+gprsize+16], m2 ; save sa8d + mova [rsp+gprsize+mmsize], m2 ; save sa8d SWAP 0, 2 SAVE_MM_PERMUTATION ret HADAMARD_AC_WXH_SSE2 16, 16 -HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 16, 8 +%if mmsize <= 16 +HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 8, 8 +%endif %endmacro ; HADAMARD_AC_SSE2 %macro HADAMARD_AC_WXH_SUM_SSE2 2 mova m1, [rsp+2*mmsize] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+3*mmsize] paddd m1, [rsp+4*mmsize] @@ -3637,68 +4111,75 @@ HADAMARD_AC_WXH_SSE2 8, 8 paddd m1, [rsp+8*mmsize] psrld m0, 1 %endif - HADDD m0, m2 - HADDD m1, m3 + HADDD xm0, xm2 + HADDD xm1, xm3 %else ; !HIGH_BIT_DEPTH -%if %1*%2 >= 128 +%if %1*%2*16/mmsize >= 128 paddusw m0, [rsp+3*mmsize] paddusw m1, [rsp+4*mmsize] %endif -%if %1*%2 == 256 +%if %1*%2*16/mmsize == 256 paddusw m0, [rsp+5*mmsize] paddusw m1, [rsp+6*mmsize] paddusw m0, [rsp+7*mmsize] paddusw m1, [rsp+8*mmsize] psrlw m0, 1 %endif - HADDUW m0, m2 - HADDW m1, m3 +%if mmsize==32 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + paddusw xm0, xm2 + paddusw xm1, xm3 +%endif + HADDUW xm0, xm2 + HADDW xm1, xm3 %endif ; HIGH_BIT_DEPTH %endmacro ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 2 -cglobal pixel_hadamard_ac_%1x%2, 2,3,11 - %assign pad 16-gprsize-(stack_offset&15) +cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 FIX_STRIDES r1 - sub rsp, 48+pad - lea r2, [r1*3] + mov r3, rsp + and rsp, ~(mmsize-1) + sub rsp, mmsize*3 + lea r2, [r1*3] call hadamard_ac_8x8 %if %2==16 %define ysub r2 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif -%if %1==16 +%if %1==16 && mmsize <= 16 neg ysub - sub rsp, 32 - lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] + sub rsp, mmsize*2 + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8 %if %2==16 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %endif HADAMARD_AC_WXH_SUM_SSE2 %1, %2 - movd edx, m0 - movd eax, m1 - shr edx, 2 - (%1*%2 >> 8) + movd edx, xm0 + movd eax, xm1 + shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif - add rsp, 16+%1*%2/2+pad + mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 ; instantiate satds -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 cextern pixel_sa8d_8x8_internal_mmx2 INIT_MMX mmx2 SA8D @@ -3715,17 +4196,30 @@ SA8D INIT_XMM sse2 SA8D SATDS_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_SA8D_SSE2 +%endif INIT_MMX mmx2 INTRA_X3_MMX -%endif INIT_XMM sse2 HADAMARD_AC_SSE2 +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%endif + %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 @@ -3734,14 +4228,17 @@ INIT_XMM ssse3 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 INIT_MMX ssse3 INTRA_X3_MMX %endif @@ -3752,15 +4249,24 @@ INIT_XMM 
sse4 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SATDS_SSE2 SA8D -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif @@ -3770,22 +4276,341 @@ HADAMARD_AC_SSE2 INIT_XMM xop SATDS_SSE2 SA8D -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. %endif HADAMARD_AC_SSE2 + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] +%endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret + +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw 
xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET + +cglobal intra_sad_x9_8x8, 5,7,8 + %define pred(i,j) [rsp+i*0x40+j*0x20] + + mov r6, rsp + and rsp, ~31 + sub rsp, 0x240 + movu m5, [r0+0*FENC_STRIDE] + movu m6, [r0+4*FENC_STRIDE] + punpcklqdq m5, [r0+2*FENC_STRIDE] + punpcklqdq m6, [r0+6*FENC_STRIDE] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + + vpbroadcastq m0, [r2+16] + psadbw m4, m0, m5 + psadbw m2, m0, m6 + mova pred(0,0), m0 + mova pred(0,1), m0 + paddw m4, m2 + + vpbroadcastq m1, [r2+7] + pshufb m3, m1, [off(intra8x9_h1)] + pshufb m2, m1, [off(intra8x9_h3)] + mova pred(1,0), m3 + mova pred(1,1), m2 + psadbw m3, m5 + psadbw m2, m6 + paddw m3, m2 + + lea r5, [rsp+0x100] + %define pred(i,j) [r5+i*0x40+j*0x20-0x100] + + ; combine the first two + pslldq m3, 2 + por m4, m3 + + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + mova pred(2,1), m0 + psadbw m3, m0, m5 + psadbw m2, m0, m6 + paddw m3, m2 + + pslldq m3, 4 + por m4, m3 + + vbroadcasti128 m0, [r2+16] + vbroadcasti128 m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 + PRED4x4_LOWPASS m0, m1, m2, m0, m7 + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,0), m1 + mova pred(3,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova [r4], xm4 + + ; for later + vinserti128 m7, m3, xm0, 1 + + vbroadcasti128 m2, [r2+8] + vbroadcasti128 m0, [r2+7] + vbroadcasti128 m1, [r2+6] + pavgb m3, m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,0), m1 + mova pred(4,1), m2 + psadbw m4, m1, m5 + psadbw m2, m6 + paddw m4, m2 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] + + vpblendd m2, m3, m0, 11110011b + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + mova pred(5,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 2 + por m4, m1 + + psrldq m2, m3, 4 + pblendw m2, m0, q3330 + punpcklbw m0, m3 + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m0, [off(intra8x9_hd3)] + mova pred(6,0), m1 + mova pred(6,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 4 + por m4, m1 + + pshufb m1, m7, [off(intra8x9_vl1)] + pshufb m2, m7, [off(intra8x9_vl3)] + mova pred(7,0), m1 + mova pred(7,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova xm3, [r4] + SBUTTERFLY qdq, 3, 4, 7 + paddw xm3, xm4 + + pslldq m1, m0, 1 + vpbroadcastd m0, [r2+7] + palignr m0, m1, 1 + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu3)] + mova pred(8,0), m1 + mova pred(8,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + vextracti128 xm2, m1, 1 + paddw xm1, xm2 + movhlps xm2, xm1 + paddw xm1, xm2 + movd r2d, xm1 + + paddw xm3, [r3] + mova [r4], xm3 + add r2w, word [r3+16] + mov [r4+16], r2w + + phminposuw xm3, xm3 + movd r3d, xm3 + add r2d, 8<<16 + cmp r3w, r2w + cmovg r3d, r2d + + mov r2d, r3d + shr r3, 16 + shl r3, 6 + add r1, 4*FDEC_STRIDE + mova xm0, [rsp+r3+0x00] + mova xm1, [rsp+r3+0x10] + mova xm2, [rsp+r3+0x20] + mova xm3, [rsp+r3+0x30] + movq [r1+FDEC_STRIDE*-4], xm0 + movhps [r1+FDEC_STRIDE*-2], xm0 + movq [r1+FDEC_STRIDE*-3], xm1 + movhps 
[r1+FDEC_STRIDE*-1], xm1 + movq [r1+FDEC_STRIDE* 0], xm2 + movhps [r1+FDEC_STRIDE* 2], xm2 + movq [r1+FDEC_STRIDE* 1], xm3 + movhps [r1+FDEC_STRIDE* 3], xm3 + mov rsp, r6 + mov eax, r2d + RET +%endif ; HIGH_BIT_DEPTH + ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) +; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, +; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- %macro SSIM_ITER 1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movdqu m5, [r0+(%1&1)*r1] movdqu m6, [r2+(%1&1)*r3] %else @@ -3808,13 +4633,8 @@ HADAMARD_AC_SSE2 pmaddwd m7, m5, m6 pmaddwd m5, m5 pmaddwd m6, m6 -%if %1==0 - SWAP 3, 5 - SWAP 4, 7 -%else - paddd m3, m5 - paddd m4, m7 -%endif + ACCUM paddd, 3, 5, %1 + ACCUM paddd, 4, 7, %1 paddd m3, m6 %endmacro @@ -3841,7 +4661,7 @@ cglobal pixel_ssim_4x4x2_core, 4,4,8 punpckhdq m5, m3, m4 punpckldq m3, m4 -%ifdef UNIX64 +%if UNIX64 %define t0 r4 %else %define t0 rax @@ -3934,7 +4754,7 @@ cglobal pixel_ssim_end4, 3,3,7 addps m0, m4 pshuflw m4, m0, q0032 addss m0, m4 -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 movd r0m, m0 fld dword r0m %endif @@ -3946,17 +4766,86 @@ SSIM INIT_XMM avx SSIM +;----------------------------------------------------------------------------- +; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +;----------------------------------------------------------------------------- +%macro ASD8 0 +cglobal pixel_asd8, 5,5 + pxor m0, m0 + pxor m1, m1 +.loop: +%if HIGH_BIT_DEPTH + paddw m0, [r0] + paddw m1, [r2] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + paddw m0, [r0] + paddw m1, [r2+4*r3] + lea r2, [r2+4*r3] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%else + movq m2, [r0] + movq m3, [r2] + movhps m2, [r0+r1] + movhps m3, [r2+r3] + lea r0, [r0+2*r1] + psadbw m2, m1 + psadbw m3, m1 + movq m4, [r0] + movq m5, [r2+2*r3] + lea r2, [r2+2*r3] + movhps m4, [r0+r1] + movhps m5, [r2+r3] + lea r0, [r0+2*r1] + paddw m0, m2 + psubw m0, m3 + psadbw m4, m1 + psadbw m5, m1 + lea r2, [r2+2*r3] + paddw m0, m4 + psubw m0, m5 +%endif + sub r4d, 4 + jg .loop +%if HIGH_BIT_DEPTH + psubw m0, m1 + HADDW m0, m1 + ABSD m1, m0 +%else + movhlps m1, m0 + paddw m0, m1 + ABSW m1, m0 +%endif + movd eax, m1 + RET +%endmacro + +INIT_XMM sse2 +ASD8 +INIT_XMM ssse3 +ASD8 +%if HIGH_BIT_DEPTH +INIT_XMM xop +ASD8 +%endif + ;============================================================================= ; Successive Elimination ADS ;============================================================================= %macro ADS_START 0 -%ifdef WIN64 - movsxd r5, r5d +%if UNIX64 + movsxd r5, r5d +%else + mov r5d, r5m %endif - mov r0d, r5d - lea r6, [r4+r5+15] - and r6, ~15; + mov r0d, r5d + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) shl r2d, 1 %endmacro @@ -3964,10 +4853,19 @@ SSIM add r1, 8*%1 add r3, 8*%1 add r6, 4*%1 - sub r0d, 4*%1 + sub r0d, 4*%1 jg .loop WIN64_RESTORE_XMM rsp - jmp ads_mvs +%if mmsize==32 + vzeroupper +%endif + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) +%if cpuflag(ssse3) + jmp ads_mvs_ssse3 +%else + jmp ads_mvs_mmx +%endif %endmacro 
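; Scalar sketch of the ads kernels below (illustrative only; offsets read
; from the vector code): for each candidate position x in [0,width)
;     ads = |enc_dc[0]-sums[x]|       + |enc_dc[1]-sums[x+8]|
;         + |enc_dc[2]-sums[x+delta]| + |enc_dc[3]-sums[x+delta+8]|
;         + cost_mvx[x]
; (ads2 and ads1 are the two- and one-term variants). A byte that is nonzero
; iff ads < thresh is written to a scratch mask buffer at r6, and ads_mvs
; then compacts that mask into the list of passing candidate indices in mvs.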
 ;-----------------------------------------------------------------------------
@@ -3975,193 +4873,227 @@ SSIM
 ;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
-cglobal pixel_ads4, 6,7
-    movq     mm6, [r0]
-    movq     mm4, [r0+8]
-    pshufw   mm7, mm6, 0
-    pshufw   mm6, mm6, q2222
-    pshufw   mm5, mm4, 0
-    pshufw   mm4, mm4, q2222
+cglobal pixel_ads4, 5,7
+    mova     m6, [r0]
+    mova     m4, [r0+8]
+    pshufw   m7, m6, 0
+    pshufw   m6, m6, q2222
+    pshufw   m5, m4, 0
+    pshufw   m4, m4, q2222
     ADS_START
 .loop:
-    movq     mm0, [r1]
-    movq     mm1, [r1+16]
-    psubw    mm0, mm7
-    psubw    mm1, mm6
-    ABSW     mm0, mm0, mm2
-    ABSW     mm1, mm1, mm3
-    movq     mm2, [r1+r2]
-    movq     mm3, [r1+r2+16]
-    psubw    mm2, mm5
-    psubw    mm3, mm4
-    paddw    mm0, mm1
-    ABSW     mm2, mm2, mm1
-    ABSW     mm3, mm3, mm1
-    paddw    mm0, mm2
-    paddw    mm0, mm3
-    pshufw   mm1, r6m, 0
-    paddusw  mm0, [r3]
-    psubusw  mm1, mm0
-    packsswb mm1, mm1
-    movd     [r6], mm1
+    movu     m0, [r1]
+    movu     m1, [r1+16]
+    psubw    m0, m7
+    psubw    m1, m6
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    movu     m2, [r1+r2]
+    movu     m3, [r1+r2+16]
+    psubw    m2, m5
+    psubw    m3, m4
+    paddw    m0, m1
+    ABSW     m2, m2, m1
+    ABSW     m3, m3, m1
+    paddw    m0, m2
+    paddw    m0, m3
+    pshufw   m1, r6m, 0
+    paddusw  m0, [r3]
+    psubusw  m1, m0
+    packsswb m1, m1
+    movd     [r6], m1
     ADS_END 1

-cglobal pixel_ads2, 6,7
-    movq     mm6, [r0]
-    pshufw   mm5, r6m, 0
-    pshufw   mm7, mm6, 0
-    pshufw   mm6, mm6, q2222
+cglobal pixel_ads2, 5,7
+    mova     m6, [r0]
+    pshufw   m5, r6m, 0
+    pshufw   m7, m6, 0
+    pshufw   m6, m6, q2222
     ADS_START
 .loop:
-    movq     mm0, [r1]
-    movq     mm1, [r1+r2]
-    psubw    mm0, mm7
-    psubw    mm1, mm6
-    ABSW     mm0, mm0, mm2
-    ABSW     mm1, mm1, mm3
-    paddw    mm0, mm1
-    paddusw  mm0, [r3]
-    movq     mm4, mm5
-    psubusw  mm4, mm0
-    packsswb mm4, mm4
-    movd     [r6], mm4
+    movu     m0, [r1]
+    movu     m1, [r1+r2]
+    psubw    m0, m7
+    psubw    m1, m6
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    paddw    m0, m1
+    paddusw  m0, [r3]
+    mova     m4, m5
+    psubusw  m4, m0
+    packsswb m4, m4
+    movd     [r6], m4
     ADS_END 1

-cglobal pixel_ads1, 6,7
-    pshufw   mm7, [r0], 0
-    pshufw   mm6, r6m, 0
+cglobal pixel_ads1, 5,7
+    pshufw   m7, [r0], 0
+    pshufw   m6, r6m, 0
     ADS_START
 .loop:
-    movq     mm0, [r1]
-    movq     mm1, [r1+8]
-    psubw    mm0, mm7
-    psubw    mm1, mm7
-    ABSW     mm0, mm0, mm2
-    ABSW     mm1, mm1, mm3
-    paddusw  mm0, [r3]
-    paddusw  mm1, [r3+8]
-    movq     mm4, mm6
-    movq     mm5, mm6
-    psubusw  mm4, mm0
-    psubusw  mm5, mm1
-    packsswb mm4, mm5
-    movq     [r6], mm4
+    movu     m0, [r1]
+    movu     m1, [r1+8]
+    psubw    m0, m7
+    psubw    m1, m7
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    paddusw  m0, [r3]
+    paddusw  m1, [r3+8]
+    mova     m4, m6
+    mova     m5, m6
+    psubusw  m4, m0
+    psubusw  m5, m1
+    packsswb m4, m5
+    mova     [r6], m4
    ADS_END 2

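+; Shared driver logic, roughly: ADS_START places the byte mask buffer just past
+; mvs[width], aligned to mmsize; each .loop iteration above and below scores
+; 4*n candidates (n being the ADS_END argument) and stores that many mask
+; bytes; ADS_END then tail-jumps into ads_mvs, which compacts the indices of
+; the surviving candidates into mvs[] and returns their count.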
 %macro ADS_XMM 0
-cglobal pixel_ads4, 6,7,12
-    movdqa   xmm4, [r0]
-    pshuflw  xmm7, xmm4, 0
-    pshuflw  xmm6, xmm4, q2222
-    pshufhw  xmm5, xmm4, 0
-    pshufhw  xmm4, xmm4, q2222
-    punpcklqdq xmm7, xmm7
-    punpcklqdq xmm6, xmm6
-    punpckhqdq xmm5, xmm5
-    punpckhqdq xmm4, xmm4
-%ifdef ARCH_X86_64
-    pshuflw  xmm8, r6m, 0
-    punpcklqdq xmm8, xmm8
+%if mmsize==32
+cglobal pixel_ads4, 5,7,8
+    vpbroadcastw m7, [r0+ 0]
+    vpbroadcastw m6, [r0+ 4]
+    vpbroadcastw m5, [r0+ 8]
+    vpbroadcastw m4, [r0+12]
+%else
+cglobal pixel_ads4, 5,7,12
+    mova     m4, [r0]
+    pshuflw  m7, m4, q0000
+    pshuflw  m6, m4, q2222
+    pshufhw  m5, m4, q0000
+    pshufhw  m4, m4, q2222
+    punpcklqdq m7, m7
+    punpcklqdq m6, m6
+    punpckhqdq m5, m5
+    punpckhqdq m4, m4
+%endif
+%if ARCH_X86_64 && mmsize == 16
+    movd     m8, r6m
+    SPLATW   m8, m8
     ADS_START
-    movdqu   xmm10, [r1]
-    movdqu   xmm11, [r1+r2]
+    movu     m10, [r1]
+    movu     m11, [r1+r2]
 .loop:
-    psubw    xmm0, xmm10, xmm7
-    movdqu   xmm10, [r1+16]
-    psubw    xmm1, xmm10, xmm6
-    ABSW     xmm0, xmm0, xmm2
-    ABSW     xmm1, xmm1, xmm3
-    psubw    xmm2, xmm11, xmm5
-    movdqu   xmm11, [r1+r2+16]
-    paddw    xmm0, xmm1
-    psubw    xmm3, xmm11, xmm4
-    movdqu   xmm9, [r3]
-    ABSW     xmm2, xmm2, xmm1
-    ABSW     xmm3, xmm3, xmm1
-    paddw    xmm0, xmm2
-    paddw    xmm0, xmm3
-    paddusw  xmm0, xmm9
-    psubusw  xmm1, xmm8, xmm0
-    packsswb xmm1, xmm1
-    movq     [r6], xmm1
+    psubw    m0, m10, m7
+    movu     m10, [r1+16]
+    psubw    m1, m10, m6
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    psubw    m2, m11, m5
+    movu     m11, [r1+r2+16]
+    paddw    m0, m1
+    psubw    m3, m11, m4
+    movu     m9, [r3]
+    ABSW     m2, m2, m1
+    ABSW     m3, m3, m1
+    paddw    m0, m2
+    paddw    m0, m3
+    paddusw  m0, m9
+    psubusw  m1, m8, m0
 %else
     ADS_START
 .loop:
-    movdqu   xmm0, [r1]
-    movdqu   xmm1, [r1+16]
-    psubw    xmm0, xmm7
-    psubw    xmm1, xmm6
-    ABSW     xmm0, xmm0, xmm2
-    ABSW     xmm1, xmm1, xmm3
-    movdqu   xmm2, [r1+r2]
-    movdqu   xmm3, [r1+r2+16]
-    psubw    xmm2, xmm5
-    psubw    xmm3, xmm4
-    paddw    xmm0, xmm1
-    ABSW     xmm2, xmm2, xmm1
-    ABSW     xmm3, xmm3, xmm1
-    paddw    xmm0, xmm2
-    paddw    xmm0, xmm3
-    movd     xmm1, r6m
-    movdqu   xmm2, [r3]
-    pshuflw  xmm1, xmm1, 0
-    punpcklqdq xmm1, xmm1
-    paddusw  xmm0, xmm2
-    psubusw  xmm1, xmm0
-    packsswb xmm1, xmm1
-    movq     [r6], xmm1
+    movu     m0, [r1]
+    movu     m1, [r1+16]
+    psubw    m0, m7
+    psubw    m1, m6
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    movu     m2, [r1+r2]
+    movu     m3, [r1+r2+16]
+    psubw    m2, m5
+    psubw    m3, m4
+    paddw    m0, m1
+    ABSW     m2, m2, m1
+    ABSW     m3, m3, m1
+    paddw    m0, m2
+    paddw    m0, m3
+    movu     m2, [r3]
+%if mmsize==32
+    vpbroadcastw m1, r6m
+%else
+    movd     m1, r6m
+    pshuflw  m1, m1, 0
+    punpcklqdq m1, m1
+%endif
+    paddusw  m0, m2
+    psubusw  m1, m0
 %endif ; ARCH
-    ADS_END 2
+    packsswb m1, m1
+%if mmsize==32
+    vpermq   m1, m1, q3120
+    mova     [r6], xm1
+%else
+    movh     [r6], m1
+%endif
+    ADS_END mmsize/8

-cglobal pixel_ads2, 6,7,8
-    movq     xmm6, [r0]
-    movd     xmm5, r6m
-    pshuflw  xmm7, xmm6, 0
-    pshuflw  xmm6, xmm6, q2222
-    pshuflw  xmm5, xmm5, 0
-    punpcklqdq xmm7, xmm7
-    punpcklqdq xmm6, xmm6
-    punpcklqdq xmm5, xmm5
+cglobal pixel_ads2, 5,7,8
+%if mmsize==32
+    vpbroadcastw m7, [r0+0]
+    vpbroadcastw m6, [r0+4]
+    vpbroadcastw m5, r6m
+%else
+    movq     m6, [r0]
+    movd     m5, r6m
+    pshuflw  m7, m6, 0
+    pshuflw  m6, m6, q2222
+    pshuflw  m5, m5, 0
+    punpcklqdq m7, m7
+    punpcklqdq m6, m6
+    punpcklqdq m5, m5
+%endif
     ADS_START
 .loop:
-    movdqu   xmm0, [r1]
-    movdqu   xmm1, [r1+r2]
-    psubw    xmm0, xmm7
-    psubw    xmm1, xmm6
-    movdqu   xmm4, [r3]
-    ABSW     xmm0, xmm0, xmm2
-    ABSW     xmm1, xmm1, xmm3
-    paddw    xmm0, xmm1
-    paddusw  xmm0, xmm4
-    psubusw  xmm1, xmm5, xmm0
-    packsswb xmm1, xmm1
-    movq     [r6], xmm1
-    ADS_END 2
+    movu     m0, [r1]
+    movu     m1, [r1+r2]
+    psubw    m0, m7
+    psubw    m1, m6
+    movu     m4, [r3]
+    ABSW     m0, m0, m2
+    ABSW     m1, m1, m3
+    paddw    m0, m1
+    paddusw  m0, m4
+    psubusw  m1, m5, m0
+    packsswb m1, m1
+%if mmsize==32
+    vpermq   m1, m1, q3120
+    mova     [r6], xm1
+%else
+    movh     [r6], m1
+%endif
+    ADS_END mmsize/8

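+; In the mmsize==32 (AVX2) paths, packsswb packs within each 128-bit lane, so
+; the mask bytes come out lane-interleaved; the vpermq with q3120 restores
+; linear byte order before the store (ads4/ads2 then keep only the low xmm).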
-cglobal pixel_ads1, 6,7,8
-    movd     xmm7, [r0]
-    movd     xmm6, r6m
-    pshuflw  xmm7, xmm7, 0
-    pshuflw  xmm6, xmm6, 0
-    punpcklqdq xmm7, xmm7
-    punpcklqdq xmm6, xmm6
+cglobal pixel_ads1, 5,7,8
+%if mmsize==32
+    vpbroadcastw m7, [r0]
+    vpbroadcastw m6, r6m
+%else
+    movd     m7, [r0]
+    movd     m6, r6m
+    pshuflw  m7, m7, 0
+    pshuflw  m6, m6, 0
+    punpcklqdq m7, m7
+    punpcklqdq m6, m6
+%endif
     ADS_START
 .loop:
-    movdqu   xmm0, [r1]
-    movdqu   xmm1, [r1+16]
-    psubw    xmm0, xmm7
-    psubw    xmm1, xmm7
-    movdqu   xmm2, [r3]
-    movdqu   xmm3, [r3+16]
-    ABSW     xmm0, xmm0, xmm4
-    ABSW     xmm1, xmm1, xmm5
-    paddusw  xmm0, xmm2
-    paddusw  xmm1, xmm3
-    psubusw  xmm4, xmm6, xmm0
-    psubusw  xmm5, xmm6, xmm1
-    packsswb xmm4, xmm5
-    movdqa   [r6], xmm4
-    ADS_END 4
+    movu     m0, [r1]
+    movu     m1, [r1+mmsize]
+    psubw    m0, m7
+    psubw    m1, m7
+    movu     m2, [r3]
+    movu     m3, [r3+mmsize]
+    ABSW     m0, m0, m4
+    ABSW     m1, m1, m5
+    paddusw  m0, m2
+    paddusw  m1, m3
+    psubusw  m4, m6, m0
+    psubusw  m5, m6, m1
+    packsswb m4, m5
+%if mmsize==32
+    vpermq   m4, m4, q3120
+%endif
+    mova     [r6], m4
+    ADS_END mmsize/4
 %endmacro

 INIT_XMM sse2
@@ -4170,6 +5102,8 @@ INIT_XMM ssse3
 ADS_XMM
 INIT_XMM avx
 ADS_XMM
+INIT_YMM avx2
+ADS_XMM

 ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
 ; {
@@ -4194,11 +5128,9 @@ ADS_XMM
     inc      r1d
 %endmacro

-INIT_MMX
+INIT_MMX mmx
 cglobal pixel_ads_mvs, 0,7,0
-ads_mvs:
-    lea      r6, [r4+r5+15]
-    and      r6, ~15;
+ads_mvs_mmx:
 ; mvs = r4
 ; masks = r6
 ; width = r5
@@ -4214,7 +5146,7 @@ ALIGN 16
     jge .end
 .loopi:
     mov      r2, [r6+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
     test     r2, r2
 %else
     mov      r3, r2
@@ -4226,7 +5158,7 @@ ALIGN 16
     TEST 1
     TEST 2
     TEST 3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
     shr      r2, 32
 %else
     mov      r2d, [r6+r1]
@@ -4240,3 +5172,36 @@ ALIGN 16
 .end:
     movifnidn eax, r0d
     RET
+
+INIT_XMM ssse3
+cglobal pixel_ads_mvs, 0,7,0
+ads_mvs_ssse3:
+    mova     m3, [pw_8]
+    mova     m4, [pw_76543210]
+    pxor     m5, m5
+    add      r5, r6
+    xor      r0d, r0d ; nmv
+    mov      [r5], r0d
+%ifdef PIC
+    lea      r1, [$$]
+    %define GLOBAL +r1-$$
+%else
+    %define GLOBAL
+%endif
+.loop:
+    movh     m0, [r6]
+    pcmpeqb  m0, m5
+    pmovmskb r2d, m0
+    xor      r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
+    movzx    r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
+    add      r2d, r2d
+    ; shuffle counters based on mv mask
+    pshufb   m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
+    movu     [r4+r0*2], m2
+    add      r0d, r3d
+    paddw    m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+    add      r6, 8
+    cmp      r6, r5
+    jl .loop
+    movifnidn eax, r0d
+    RET
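+; Rough C-level sketch of the SSSE3 ads_mvs loop above (names illustrative):
+;     for( i = 0; i < width; i += 8 )
+;     {
+;         m = bitmask of (masks[i+j] != 0) for j = 0..7;        // pcmpeqb + pmovmskb
+;         store_8xU16( &mvs[nmv], pshufb( {i..i+7}, lut[m] ) ); // compaction via ads_mvs_shuffle
+;         nmv += popcount( m );                                 // popcnt_table lookup
+;     }
+;     return nmv;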