X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpixel-a.asm;h=9791e8665ea635edc8c7a93de77b774cb6ae85b3;hb=7c860f075ccd14fb7891d5fc6c9eab1a37ea555d;hp=b094c9c247b199fb9b8f69c46de9f1e318533c75;hpb=389b401a99f2f33b41db7d74904b3ff7509d79e5;p=x264 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index b094c9c2..9791e866 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2011 x264 project +;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz @@ -32,8 +32,17 @@ %include "x86util.asm" SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 @@ -46,12 +55,7 @@ ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif -mask_ac4: dw 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1 -mask_ac8: dw 0, -1, -1, -1, -1, -1, -1, -1 hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 -hmul_8p: times 8 db 1 - times 4 db 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 @@ -67,6 +71,7 @@ intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9 intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3 intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1 intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1 +intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0 pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007 pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007 intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15 @@ -77,14 +82,72 @@ intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0 intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11 intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8 intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9 -intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1 +intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3 +intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1 +intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1 intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 +intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 + +ALIGN 32 +intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 +intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 +intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 +intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 +intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14 +intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 
9,10,11,12,13,14,15 +intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13 +intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12 +intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14 +intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14 +intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12 +intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12 +intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10 +intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8 +intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2 +intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0 +intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15 +intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15 +pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003 +pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001 + +transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 +transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 -sq_0f: dq 0xffffffff, 0 pd_f0: times 4 dd 0xffff0000 +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep +%endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 +%assign x x+1 +%endrep + SECTION .text cextern pb_0 @@ -92,103 +155,107 @@ cextern pb_1 cextern pw_1 cextern pw_8 cextern pw_16 -cextern pw_64 +cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz +cextern pd_1 cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD ;============================================================================= -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int ) +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 -cglobal pixel_ssd_%1x%2, 4,5,6 - mov r4, %1*%2/mmsize +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif pxor m0, m0 -.loop +.loop: 
mova m1, [r0] -%if %1 <= mmsize/2 - mova m3, [r0+r1*2] - %define offset r3*2 - %define num_rows 2 -%else - mova m3, [r0+mmsize] - %define offset mmsize - %define num_rows 1 -%endif + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] psubw m1, [r2] - psubw m3, [r2+offset] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, [r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif pmaddwd m1, m1 + pmaddwd m2, m2 pmaddwd m3, m3 - dec r4 - lea r0, [r0+r1*2*num_rows] - lea r2, [r2+r3*2*num_rows] + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 paddd m0, m1 paddd m0, m3 +%if %%n > 1 + dec r4d jg .loop +%endif HADDD m0, m5 - movd eax, m0 - RET -%endmacro - -%macro SSD_16_MMX 2 -cglobal pixel_ssd_%1x%2, 4,5 - mov r4, %1*%2/mmsize/2 - pxor m0, m0 -.loop - mova m1, [r0] - mova m2, [r2] - mova m3, [r0+mmsize] - mova m4, [r2+mmsize] - mova m5, [r0+mmsize*2] - mova m6, [r2+mmsize*2] - mova m7, [r0+mmsize*3] - psubw m1, m2 - psubw m3, m4 - mova m2, [r2+mmsize*3] - psubw m5, m6 - pmaddwd m1, m1 - psubw m7, m2 - pmaddwd m3, m3 - pmaddwd m5, m5 - dec r4 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - pmaddwd m7, m7 - paddd m1, m3 - paddd m5, m7 - paddd m0, m1 - paddd m0, m5 - jg .loop - HADDD m0, m7 - movd eax, m0 + movd eax, xm0 RET %endmacro INIT_MMX mmx2 SSD_ONE 4, 4 SSD_ONE 4, 8 +SSD_ONE 4, 16 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 -SSD_16_MMX 16, 8 -SSD_16_MMX 16, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %macro SSD_LOAD_FULL 5 mova m1, [t0+%1] mova m2, [t2+%2] @@ -248,6 +315,23 @@ SSD_ONE 16, 16 punpcklbw m%2, m%4 %endmacro +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 +%if %5 + lea t0, [t0+2*t1] +%endif +%endmacro + +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 @@ -322,7 +406,7 @@ SSD_ONE 16, 16 %endmacro ;----------------------------------------------------------------------------- -; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int ) +; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD 2 %if %1 != %2 @@ -338,7 +422,7 @@ cglobal pixel_ssd_%1x%2, 0,0,0 %else .startloop: -%ifdef ARCH_X86_64 +%if ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 PROLOGUE 0,0,8 %else @@ -370,8 +454,15 @@ ALIGN 16 %endif dec al jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else HADDD m0, m1 movd eax, m0 +%endif RET %endif %endmacro @@ -417,11 +508,22 @@ INIT_MMX ssse3 SSD 4, 4 SSD 4, 8 SSD 4, 16 +INIT_XMM xop +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2, +; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; The maximum width this function can handle without risk of overflow is given @@ -432,7 
+534,7 @@ SSD 4, 16 ; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane ; distortion levels it will take much more than that though. ;----------------------------------------------------------------------------- -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7,7 shl r4d, 2 @@ -455,7 +557,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7 psubw m1, [r2+r6+mmsize] PSHUFLW m0, m0, q3120 PSHUFLW m1, m1, q3120 -%if mmsize==16 +%if mmsize >= 16 pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif @@ -465,8 +567,13 @@ cglobal pixel_ssd_nv12_core, 6,7,7 paddd m3, m1 add r6, 2*mmsize jl .loopx -%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the - ; equation above, putting the width limit at 8208 +%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled + jz .no_overread + psubd m3, m1 +.no_overread: +%endif +%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the + ; equation above, putting the width limit at 8208 punpckhdq m0, m2, m6 punpckhdq m1, m3, m6 punpckldq m2, m6 @@ -494,9 +601,13 @@ cglobal pixel_ssd_nv12_core, 6,7,7 jg .loopy mov r3, r6m mov r4, r7m -%if mmsize==16 - movq [r3], m4 - movhps [r4], m4 +%if mmsize == 32 + vextracti128 xm0, m4, 1 + paddq xm4, xm0 +%endif +%if mmsize >= 16 + movq [r3], xm4 + movhps [r4], xm4 %else ; fixup for mmx2 SBUTTERFLY dq, 4, 5, 0 mova m0, m4 @@ -513,9 +624,9 @@ cglobal pixel_ssd_nv12_core, 6,7,7 %endmacro ; SSD_NV12 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, +; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; This implementation can potentially overflow on image widths >= 11008 (or @@ -524,7 +635,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7 ;----------------------------------------------------------------------------- %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7 - shl r4d, 1 + add r4d, r4d add r0, r4 add r2, r4 pxor m3, m3 @@ -534,32 +645,46 @@ cglobal pixel_ssd_nv12_core, 6,7 mov r6, r4 neg r6 .loopx: - mova m0, [r0+r6] +%if mmsize == 32 ; only 16-byte alignment is guaranteed + movu m2, [r0+r6] + movu m1, [r2+r6] +%else + mova m2, [r0+r6] mova m1, [r2+r6] - psubusb m0, m1 - psubusb m1, [r0+r6] +%endif + psubusb m0, m2, m1 + psubusb m1, m2 por m0, m1 psrlw m2, m0, 8 - add r6, mmsize pand m0, m5 pmaddwd m2, m2 pmaddwd m0, m0 paddd m3, m0 paddd m4, m2 + add r6, mmsize jl .loopx +%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled + jz .no_overread + pcmpeqb xm1, xm1 + pandn m0, m1, m0 ; zero the lower half + pandn m2, m1, m2 + psubd m3, m0 + psubd m4, m2 +.no_overread: +%endif add r0, r1 add r2, r3 dec r5d jg .loopy mov r3, r6m mov r4, r7m - mova m5, [sq_0f] HADDD m3, m0 HADDD m4, m0 - pand m3, m5 - pand m4, m5 - movq [r3], m3 - movq [r4], m4 + pxor xm0, xm0 + punpckldq xm3, xm0 + punpckldq xm4, xm0 + movq [r3], xm3 + movq [r4], xm4 RET %endmacro ; SSD_NV12 %endif ; !HIGH_BIT_DEPTH @@ -570,6 +695,8 @@ INIT_XMM sse2 SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_YMM avx2 +SSD_NV12 ;============================================================================= ; variance @@ -578,17 +705,17 @@ SSD_NV12 %macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] 
-%else +%elif mmsize < 32 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro %macro VAR_END 2 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if mmsize == 8 && %1*%2 == 256 HADDUW m5, m2 %else @@ -597,12 +724,13 @@ SSD_NV12 %else ; !HIGH_BIT_DEPTH HADDW m5, m2 %endif ; HIGH_BIT_DEPTH - movd eax, m5 HADDD m6, m1 +%if ARCH_X86_64 + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 movd edx, m6 -%ifdef ARCH_X86_64 - shl rdx, 32 - add rax, rdx %endif RET %endmacro @@ -625,7 +753,7 @@ SSD_NV12 %macro VAR_2ROW 2 mov r2d, %2 .loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+mmsize] mova m3, [r0+%1] @@ -642,38 +770,38 @@ SSD_NV12 %else add r0, r1 %endif -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 punpcklbw m3, m7 punpckhbw m4, m7 %endif ; !HIGH_BIT_DEPTH - dec r2d VAR_CORE + dec r2d jg .loop %endmacro ;----------------------------------------------------------------------------- -; int pixel_var_wxh( uint8_t *, int ) +; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal pixel_var_16x16_mmx2, 2,3 +INIT_MMX mmx2 +cglobal pixel_var_16x16, 2,3 FIX_STRIDES r1 VAR_START 0 VAR_2ROW 8*SIZEOF_PIXEL, 16 VAR_END 16, 16 -cglobal pixel_var_8x16_mmx2, 2,3 +cglobal pixel_var_8x16, 2,3 FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 8 VAR_END 8, 16 -cglobal pixel_var_8x8_mmx2, 2,3 +cglobal pixel_var_8x8, 2,3 FIX_STRIDES r1 VAR_START 0 VAR_2ROW r1, 4 VAR_END 8, 8 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 @@ -702,9 +830,11 @@ INIT_XMM sse2 VAR INIT_XMM avx VAR +INIT_XMM xop +VAR %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 VAR_START 1 @@ -756,30 +886,61 @@ INIT_XMM sse2 VAR INIT_XMM avx VAR +INIT_XMM xop +VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET %endif ; !HIGH_BIT_DEPTH -%macro VAR2_END 0 - HADDW m5, m7 - movd r1d, m5 +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 imul r1d, r1d - HADDD m6, m1 - shr r1d, 6 - movd eax, m6 - mov [r4], eax + HADDD %3, xm1 + shr r1d, %1 + movd eax, %3 + movd [r4], %3 sub eax, r1d ; sqr - (sum * sum >> shift) RET %endmacro ;----------------------------------------------------------------------------- -; int pixel_var2_8x8( pixel *, int, pixel *, int, int * ) +; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal pixel_var2_8x8_mmx2, 5,6 +%macro VAR2_8x8_MMX 2 +cglobal pixel_var2_8x%1, 5,6 FIX_STRIDES r1, r3 VAR_START 0 - mov r5d, 8 + mov r5d, %1 .loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+mmsize] psubw m0, [r2] @@ -806,15 +967,21 @@ cglobal pixel_var2_8x8_mmx2, 5,6 add r2, r3 dec r5d jg .loop - VAR2_END - RET + VAR2_END %2, m5, m6 +%endmacro + +%if ARCH_X86_64 == 0 +INIT_MMX mmx2 +VAR2_8x8_MMX 8, 6 +VAR2_8x8_MMX 16, 7 +%endif -INIT_XMM -cglobal pixel_var2_8x8_sse2, 5,6,8 +%macro VAR2_8x8_SSE2 2 +cglobal pixel_var2_8x%1, 5,6,8 VAR_START 1 - mov r5d, 4 + mov r5d, %1/2 
.loop: -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+r1*2] mova m2, [r2] @@ -838,15 +1005,20 @@ cglobal pixel_var2_8x8_sse2, 5,6,8 lea r2, [r2+r3*2*SIZEOF_PIXEL] dec r5d jg .loop - VAR2_END - RET + VAR2_END %2, m5, m6 +%endmacro + +INIT_XMM sse2 +VAR2_8x8_SSE2 8, 6 +VAR2_8x8_SSE2 16, 7 -%ifndef HIGH_BIT_DEPTH -cglobal pixel_var2_8x8_ssse3, 5,6,8 +%if HIGH_BIT_DEPTH == 0 +%macro VAR2_8x8_SSSE3 2 +cglobal pixel_var2_8x%1, 5,6,8 pxor m5, m5 ; sum pxor m6, m6 ; sum squared mova m7, [hsub_mul] - mov r5d, 2 + mov r5d, %1/4 .loop: movq m0, [r0] movq m2, [r2] @@ -882,8 +1054,58 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 lea r2, [r2+r3*2] dec r5d jg .loop - VAR2_END - RET + VAR2_END %2, m5, m6 +%endmacro + +INIT_XMM ssse3 +VAR2_8x8_SSSE3 8, 6 +VAR2_8x8_SSSE3 16, 7 +INIT_XMM xop +VAR2_8x8_SSSE3 8, 6 +VAR2_8x8_SSSE3 16, 7 + +%macro VAR2_8x8_AVX2 2 +cglobal pixel_var2_8x%1, 5,6,6 + pxor m3, m3 ; sum + pxor m4, m4 ; sum squared + mova m5, [hsub_mul] + mov r5d, %1/4 +.loop: + movq xm0, [r0] + movq xm1, [r2] + vinserti128 m0, m0, [r0+r1], 1 + vinserti128 m1, m1, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m1 + movq xm1, [r0] + movq xm2, [r2] + vinserti128 m1, m1, [r0+r1], 1 + vinserti128 m2, m2, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddw m3, m0 + paddw m3, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m4, m0 + paddd m4, m1 + dec r5d + jg .loop + vextracti128 xm0, m3, 1 + vextracti128 xm1, m4, 1 + paddw xm3, xm0 + paddd xm4, xm1 + VAR2_END %2, xm3, xm4 +%endmacro + +INIT_YMM avx2 +VAR2_8x8_AVX2 8, 6 +VAR2_8x8_AVX2 16, 7 + %endif ; !HIGH_BIT_DEPTH ;============================================================================= @@ -894,7 +1116,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 @@ -955,7 +1177,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro -%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0 +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] @@ -991,13 +1213,59 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
+ LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 - %xdefine %%n n%1 + %xdefine %%n nn%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] @@ -1012,8 +1280,9 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 SWAP %%n, 4 %endmacro +; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 -%ifidn %1, sse2 +%if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 @@ -1027,7 +1296,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 %else SWAP %8, %2 %endif -%ifidn %1, sse2 +%if %1 paddw m%8, m%4 %else HADAMARD 1, max, %3, %5, %6, %7 @@ -1042,7 +1311,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 %endmacro %macro SATD_END_MMX 0 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 movd eax, m0 %else ; !HIGH_BIT_DEPTH @@ -1060,7 +1329,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8 ; for small blocks on x86_32, modify pixel pointer instead. 
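For reference, the pixel_satd_* routines below all compute the sum of absolute transformed differences: the 4x4 (or 8x4) difference block is put through an unnormalized 2-D Hadamard transform, and the absolute values of the coefficients are summed and halved; larger block sizes accumulate those sub-blocks, which is why the asm is built around 8x4/16x4 internal helpers. A minimal scalar C sketch of the 4x4 case, with illustrative names and strides rather than x264's exact C reference:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch of the SATD metric: 2-D 4x4 Hadamard transform of the
     * difference block, then the halved sum of absolute coefficients.
     * Helper name and stride types are illustrative only. */
    static int satd_4x4_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2 )
    {
        int d[4][4], t[4][4];
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[y*stride1+x] - pix2[y*stride2+x];

        for( int y = 0; y < 4; y++ )          /* horizontal 1-D Hadamard */
        {
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            t[y][0] = a0 + a2; t[y][2] = a0 - a2;
            t[y][1] = a1 + a3; t[y][3] = a1 - a3;
        }
        int sum = 0;
        for( int x = 0; x < 4; x++ )          /* vertical 1-D Hadamard + abs */
        {
            int a0 = t[0][x] + t[1][x], a1 = t[0][x] - t[1][x];
            int a2 = t[2][x] + t[3][x], a3 = t[2][x] - t[3][x];
            sum += abs(a0+a2) + abs(a0-a2) + abs(a1+a3) + abs(a1-a3);
        }
        return sum >> 1;   /* x264's SATD is the halved coefficient sum */
    }

The asm reaches the same halved sum without an explicit final shift: the last butterfly-plus-abs stage is replaced by a max of absolute values (|a+b| + |a-b| = 2*max(|a|,|b|), the "amax" variants of HADAMARD), which is why no >>1 appears in SATD_END_*.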
;----------------------------------------------------------------------------- -; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int ) +; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_satd_16x4_internal @@ -1086,7 +1355,7 @@ pixel_satd_8x4_internal_mmx2: paddw m0, m1 ret -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %macro SATD_MxN_MMX 3 cglobal pixel_satd_%1x%2, 4,7 SATD_START_MMX @@ -1114,7 +1383,7 @@ SATD_MxN_MMX 16, 8, 4 SATD_MxN_MMX 8, 16, 8 %endif ; HIGH_BIT_DEPTH -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 cglobal pixel_satd_16x16, 4,6 SATD_START_MMX pxor m0, m0 @@ -1159,6 +1428,17 @@ cglobal pixel_satd_8x4, 4,6 call pixel_satd_8x4_internal_mmx2 SATD_END_MMX +cglobal pixel_satd_4x16, 4,6 + SATD_START_MMX + SATD_4x4_MMX m0, 0, 1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 + SATD_4x4_MMX m1, 0, 0 + paddw m0, m1 + SATD_END_MMX + cglobal pixel_satd_4x8, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 @@ -1171,66 +1451,93 @@ cglobal pixel_satd_4x4, 4,6 SATD_4x4_MMX m0, 0, 0 SATD_END_MMX -%macro SATD_START_SSE2 2 -%if cpuflag(ssse3) +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else mova %2, [hmul_8p] +%endif %endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro -%macro SATD_END_SSE2 1 - HADDW %1, m7 +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif movd eax, %1 RET %endmacro +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif +%endmacro + %macro BACKUP_POINTERS 0 -%ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 +%if ARCH_X86_64 +%if WIN64 + PUSH r7 +%endif + mov r6, r0 + mov r7, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 -%ifdef ARCH_X86_64 - lea r0, [r10+8] - lea r2, [r11+8] +%if ARCH_X86_64 + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] +%if WIN64 + POP r7 +%endif %else mov r0, r0mp mov r2, r2mp - add r0, 8 - add r2, 8 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL %endif %endmacro -;----------------------------------------------------------------------------- -; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SATDS_SSE2 0 -%if cpuflag(ssse3) -cglobal pixel_satd_4x4, 4, 6, 6 - SATD_START_MMX - mova m4, [hmul_4p] - LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] - LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] - LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] - LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] - DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 - HADAMARD 0, sumsub, 0, 1, 2, 3 - HADAMARD 4, sumsub, 0, 1, 2, 3 - HADAMARD 1, amax, 0, 1, 2, 3 - HADDW m0, m1 - movd eax, m0 - RET -%endif - -cglobal pixel_satd_4x8, 4, 6, 8 - SATD_START_MMX -%if cpuflag(ssse3) - mova m7, [hmul_4p] -%endif +%macro SATD_4x8_SSE 3 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] + movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, [r2+2*r3] + psubw m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, 
[r2+r3] movd m6, [r2+2*r3] @@ -1247,7 +1554,12 @@ cglobal pixel_satd_4x8, 4, 6, 8 JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 +%if %1==0 && %2==1 + mova m3, [hmul_4p] + DIFFOP 0, 4, 1, 5, 3 +%else DIFFOP 0, 4, 1, 5, 7 +%endif movd m5, [r2] add r2, r5 movd m3, [r0] @@ -1260,40 +1572,92 @@ cglobal pixel_satd_4x8, 4, 6, 8 JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 +%if %1==0 && %2==1 + mova m4, [hmul_4p] + DIFFOP 2, 6, 3, 5, 4 +%else DIFFOP 2, 6, 3, 5, 7 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap - HADDW m6, m1 - movd eax, m6 +%endif +%endif ; HIGH_BIT_DEPTH + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SATDS_SSE2 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if vertical==0 || HIGH_BIT_DEPTH +cglobal pixel_satd_4x4, 4, 6, 6 + SATD_START_MMX + mova m4, [hmul_4p] + LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] + LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] + LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] + LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + HADAMARD 0, sumsub, 0, 1, 2, 3 + HADAMARD 4, sumsub, 0, 1, 2, 3 + HADAMARD 1, amax, 0, 1, 2, 3 + HADDW m0, m1 + movd eax, m0 + RET +%endif + +cglobal pixel_satd_4x8, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + HADDW m7, m1 + movd eax, m7 + RET + +cglobal pixel_satd_4x16, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 RET cglobal pixel_satd_8x8_internal - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: - LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret -%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) cglobal pixel_satd_16x4_internal LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] - ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2? 
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10 - SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10 + ; always use horizontal mode here + SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret cglobal pixel_satd_16x8, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,12 SATD_START_SSE2 m10, m7 -%if notcpuflag(ssse3) +%if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal @@ -1312,14 +1676,15 @@ cglobal pixel_satd_16x8, 4,6,8 SATD_END_SSE2 m6 cglobal pixel_satd_16x16, 4,6,8 - SATD_START_SSE2 m6, m7 + SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal + SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal - SATD_END_SSE2 m6 + SATD_END_SSE2 m6, m7 %endif cglobal pixel_satd_8x16, 4,6,8 @@ -1340,14 +1705,14 @@ cglobal pixel_satd_8x4, 4,6,8 %endmacro ; SATDS_SSE2 %macro SA8D_INTER 0 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 %define lh m10 %define rh m0 %else %define lh m0 %define rh [esp+48] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 paddd lh, rh %else @@ -1356,36 +1721,22 @@ cglobal pixel_satd_8x4, 4,6,8 %endmacro %macro SA8D 0 -%ifdef HIGH_BIT_DEPTH - %define vertical 1 -%else ; sse2 doesn't seem to like the horizontal way of doing things - %define vertical (cpuflags == cpuflags_sse2) -%endif +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- -; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int ) +; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal - lea r10, [r0+4*r1] - lea r11, [r2+4*r3] + lea r6, [r0+4*r1] + lea r7, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 - LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 %if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 - HADAMARD4_V 0, 1, 2, 8, 6 - HADAMARD4_V 4, 5, 3, 9, 6 - SUMSUB_BADC w, 0, 4, 1, 5, 6 - HADAMARD 2, sumsub, 0, 4, 6, 11 - HADAMARD 2, sumsub, 1, 5, 6, 11 - SUMSUB_BADC w, 2, 3, 8, 9, 6 - HADAMARD 2, sumsub, 2, 3, 6, 11 - HADAMARD 2, sumsub, 8, 9, 6, 11 - HADAMARD 1, amax, 0, 4, 6, 11 - HADAMARD 1, amax, 1, 5, 6, 4 - HADAMARD 1, amax, 2, 3, 6, 4 - HADAMARD 1, amax, 8, 9, 6, 4 + HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 %endif paddw m0, m1 paddw m0, m2 @@ -1393,7 +1744,7 @@ cglobal pixel_sa8d_8x8_internal SAVE_MM_PERMUTATION ret -cglobal pixel_sa8d_8x8, 4,6,12 +cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1401,7 +1752,7 @@ cglobal pixel_sa8d_8x8, 4,6,12 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 @@ -1411,7 +1762,7 @@ cglobal pixel_sa8d_8x8, 4,6,12 shr eax, 1 RET -cglobal pixel_sa8d_16x16, 4,6,12 +cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] @@ -1421,7 +1772,7 @@ cglobal pixel_sa8d_16x16, 4,6,12 call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 @@ -1436,7 +1787,7 @@ cglobal pixel_sa8d_16x16, 4,6,12 call 
pixel_sa8d_8x8_internal ; pix[8*stride] SA8D_INTER SWAP 0, 10 -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd eax, m0 @@ -1505,7 +1856,7 @@ cglobal pixel_sa8d_8x8, 4,7 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 @@ -1528,7 +1879,7 @@ cglobal pixel_sa8d_16x16, 4,7 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 @@ -1548,7 +1899,7 @@ cglobal pixel_sa8d_16x16, 4,7 %endif mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH SA8D_INTER %else ; !HIGH_BIT_DEPTH paddusw m0, [esp+64-mmsize] @@ -1577,6 +1928,170 @@ cglobal pixel_sa8d_16x16, 4,7 %endif ; !ARCH_X86_64 %endmacro ; SA8D +;============================================================================= +; SA8D_SATD +;============================================================================= + +; %1: vertical/horizontal mode +; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) +; m10: satd result +; m6, m11-15: tmp regs +%macro SA8D_SATD_8x4 5 +%if %1 + LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY wd, %2, %3, 6 + SBUTTERFLY wd, %4, %5, 6 + HADAMARD2_2D %2, %4, %3, %5, 6, dq + + mova m12, m%2 + mova m13, m%3 + mova m14, m%4 + mova m15, m%5 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY qdq, 12, 13, 6 + HADAMARD 0, amax, 12, 13, 6 + SBUTTERFLY qdq, 14, 15, 6 + paddw m10, m12 + HADAMARD 0, amax, 14, 15, 6 + paddw m10, m14 +%else + LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD4_V %2, %3, %4, %5, 6 + + pabsw m12, m%2 ; doing the abs first is a slight advantage + pabsw m14, m%4 + pabsw m13, m%3 + pabsw m15, m%5 + HADAMARD 1, max, 12, 14, 6, 11 + paddw m10, m12 + HADAMARD 1, max, 13, 15, 6, 11 + paddw m10, m13 +%endif +%endmacro ; SA8D_SATD_8x4 + +; %1: add spilled regs? +; %2: spill regs? 
+%macro SA8D_SATD_ACCUM 2 +%if HIGH_BIT_DEPTH + pmaddwd m10, [pw_1] + HADDUWD m0, m1 +%if %1 + paddd m10, temp1 + paddd m0, temp0 +%endif +%if %2 + mova temp1, m10 + pxor m10, m10 +%endif +%elif %1 + paddw m0, temp0 +%endif +%if %2 + mova temp0, m0 +%endif +%endmacro + +%macro SA8D_SATD 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) +cglobal pixel_sa8d_satd_8x8_internal + SA8D_SATD_8x4 vertical, 0, 1, 2, 3 + SA8D_SATD_8x4 vertical, 4, 5, 8, 9 + +%if vertical ; sse2-style + HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax + HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax +%else ; complete sa8d + SUMSUB_BADC w, 0, 4, 1, 5, 12 + HADAMARD 2, sumsub, 0, 4, 12, 11 + HADAMARD 2, sumsub, 1, 5, 12, 11 + SUMSUB_BADC w, 2, 8, 3, 9, 12 + HADAMARD 2, sumsub, 2, 8, 12, 11 + HADAMARD 2, sumsub, 3, 9, 12, 11 + HADAMARD 1, amax, 0, 4, 12, 11 + HADAMARD 1, amax, 1, 5, 12, 4 + HADAMARD 1, amax, 2, 8, 12, 4 + HADAMARD 1, amax, 3, 9, 12, 4 +%endif + + ; create sa8d sub results + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + + SAVE_MM_PERMUTATION + ret + +;------------------------------------------------------------------------------- +; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) +;------------------------------------------------------------------------------- +cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize + %define temp0 [rsp+0*mmsize] + %define temp1 [rsp+1*mmsize] + FIX_STRIDES r1, r3 +%if vertical==0 + mova m7, [hmul_8p] +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor m10, m10 + +%if mmsize==32 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m10, 1 + paddw xm0, xm1 + paddw xm10, xm2 +%else + lea r6, [r2+8*SIZEOF_PIXEL] + lea r7, [r0+8*SIZEOF_PIXEL] + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + + mov r0, r7 + mov r2, r6 + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 +%endif + +; xop already has fast horizontal sums +%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 + pmaddwd xm10, [pw_1] + HADDUWD xm0, xm1 + phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 + pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 + paddd xm0, xm1 ; sa8d sa8d satd satd + movd r0d, xm0 + pextrd eax, xm0, 2 +%else +%if HIGH_BIT_DEPTH + HADDD xm0, xm1 + HADDD xm10, xm2 +%else + HADDUW xm0, xm1 + HADDW xm10, xm2 +%endif + movd r0d, xm0 + movd eax, xm10 +%endif + add r0d, 1 + shl rax, 32 + shr r0d, 1 + or rax, r0 + RET +%endmacro ; SA8D_SATD + ;============================================================================= ; INTRA SATD ;============================================================================= @@ -1590,12 +2105,14 @@ cglobal pixel_sa8d_16x16, 4,7 paddw %3, %5 %endmacro +; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+, +; and are only retained for old cpus. 
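To make the x3 convention concrete: these helpers (and the x9 versions that replace them) score one source block against the vertical, horizontal and DC predictions built from the reconstructed neighbours in fdec, and store the three costs as res[0..2] = {V, H, DC}, matching the store order in the code below. A scalar sketch of the 4x4 SATD variant, reusing the satd_4x4_ref() helper sketched earlier; FENC_STRIDE/FDEC_STRIDE are x264's fixed cache strides and the remaining names are illustrative:

    #include <stdint.h>
    #include <string.h>

    #define FENC_STRIDE 16   /* x264's fixed strides for the fenc/fdec caches */
    #define FDEC_STRIDE 32

    /* Sketch of the x3 interface: SATD of the source block against the V, H
     * and DC predictions, written to res[0..2] = {V, H, DC}. */
    static void intra_satd_x3_4x4_ref( const uint8_t *fenc, const uint8_t *fdec,
                                       int res[3] )
    {
        uint8_t pred[4*4];

        for( int y = 0; y < 4; y++ )                  /* vertical prediction */
            memcpy( pred + 4*y, fdec - FDEC_STRIDE, 4 );
        res[0] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );

        for( int y = 0; y < 4; y++ )                  /* horizontal prediction */
            memset( pred + 4*y, fdec[y*FDEC_STRIDE - 1], 4 );
        res[1] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );

        int dc = 4;                                   /* DC: rounded average */
        for( int i = 0; i < 4; i++ )
            dc += fdec[i - FDEC_STRIDE] + fdec[i*FDEC_STRIDE - 1];
        memset( pred, dc >> 3, 4*4 );
        res[2] = satd_4x4_ref( fenc, FENC_STRIDE, pred, 4 );
    }

The asm below avoids building the predictions explicitly and instead works in the transform domain (hence the 1-D hadamards of the edge pixels combined with the 2-D hadamard of the source), but the three returned costs are the same.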
%macro INTRA_SA8D_SSE2 0 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- -cglobal intra_sa8d_x3_8x8, 3,3,16 +cglobal intra_sa8d_x3_8x8, 3,3,14 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] @@ -1622,23 +2139,15 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 paddusw m8, m10 paddusw m9, m11 ABSW2 m10, m11, m6, m7, m6, m7 - ABSW m15, m1, m1 + ABSW m13, m1, m1 paddusw m10, m11 paddusw m8, m9 - paddusw m15, m10 - paddusw m15, m8 + paddusw m13, m10 + paddusw m13, m8 ; 1D hadamard of edges movq m8, [r1+7] movq m9, [r1+16] -%if cpuflag(ssse3) - punpcklwd m8, m8 - pshufb m9, [intrax3_shuf] - pmaddubsw m8, [pb_pppm] - pmaddubsw m9, [pb_pppm] - HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm] - HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm] -%else ; sse2 pxor m10, m10 punpcklbw m8, m10 punpcklbw m9, m10 @@ -1652,7 +2161,6 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 pmullw m11, [pw_pmpmpmpm] paddw m8, m10 paddw m9, m11 -%endif ; differences paddw m10, m8, m9 @@ -1664,8 +2172,8 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 psubw m8, m0 psubw m10, m0 ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum - paddusw m14, m8, m15 - paddusw m15, m10 + paddusw m8, m13 + paddusw m13, m10 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 @@ -1674,7 +2182,7 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose psllw m9, 3 ; top edge - psrldq m2, m15, 2 ; 8x7 sum + psrldq m2, m13, 2 ; 8x7 sum psubw m0, m9 ; 8x1 sum ABSW m0, m0, m9 paddusw m2, m0 @@ -1682,21 +2190,21 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 ; 3x HADDW movdqa m7, [pw_1] pmaddwd m2, m7 - pmaddwd m14, m7 - pmaddwd m15, m7 - punpckhdq m3, m2, m14 - punpckldq m2, m14 - pshufd m5, m15, q3311 + pmaddwd m8, m7 + pmaddwd m13, m7 + punpckhdq m3, m2, m8 + punpckldq m2, m8 + pshufd m5, m13, q3311 paddd m2, m3 - paddd m5, m15 - punpckhqdq m3, m2, m5 + paddd m5, m13 + punpckhqdq m0, m2, m5 punpcklqdq m2, m5 - pavgw m3, m2 - pxor m0, m0 - pavgw m3, m0 - movq [r2], m3 ; i8x8_v, i8x8_h - psrldq m3, 8 - movd [r2+8], m3 ; i8x8_dc + pavgw m0, m2 + pxor m1, m1 + pavgw m0, m1 + movq [r2], m0 ; i8x8_v, i8x8_h + psrldq m0, 8 + movd [r2+8], m0 ; i8x8_dc RET %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 @@ -1706,6 +2214,12 @@ cglobal intra_sa8d_x3_8x8, 3,3,16 INIT_MMX cglobal hadamard_load ; not really a global, but otherwise cycles get attributed to the wrong function in profiling +%if HIGH_BIT_DEPTH + mova m0, [r0+0*FENC_STRIDEB] + mova m1, [r0+1*FENC_STRIDEB] + mova m2, [r0+2*FENC_STRIDEB] + mova m3, [r0+3*FENC_STRIDEB] +%else pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] @@ -1715,24 +2229,31 @@ cglobal hadamard_load punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 +%endif HADAMARD4_2D 0, 1, 2, 3, 4 SAVE_MM_PERMUTATION ret %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp %ifidn %1, top - movd %3, [r1+%2-FDEC_STRIDE] +%if HIGH_BIT_DEPTH + mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] +%else + movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] pxor %5, %5 punpcklbw %3, %5 +%endif %else ; left %ifnidn %2, 0 - shl %2d, 5 ; log(FDEC_STRIDE) + shl %2d, 5 ; log(FDEC_STRIDEB) %endif - movd %3, [r1+%2-4+1*FDEC_STRIDE] - pinsrw %3, [r1+%2-2+0*FDEC_STRIDE], 0 - pinsrw %3, [r1+%2-2+2*FDEC_STRIDE], 2 - pinsrw %3, [r1+%2-2+3*FDEC_STRIDE], 3 + movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB] + pinsrw %3, 
[r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3 +%if HIGH_BIT_DEPTH == 0 psrlw %3, 8 +%endif %ifnidn %2, 0 shr %2d, 5 %endif @@ -1771,19 +2292,6 @@ cglobal hadamard_load %8 %3, %6 %endmacro -%macro CLEAR_SUMS 0 -%ifdef ARCH_X86_64 - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 -%else - pxor m7, m7 - movq [sums+0], m7 - movq [sums+8], m7 - movq [sums+16], m7 -%endif -%endmacro - ; in: m1..m3 ; out: m7 ; clobber: m4..m6 @@ -1819,15 +2327,16 @@ cglobal hadamard_load ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_4x4, 3,3 -%ifdef ARCH_X86_64 +%if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %else - ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned - SUB esp, 16 - %define top_1d esp+8 - %define left_1d esp + ; WIN64: stack is 16 byte aligned because abi says so + ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB rsp, 16 + %define top_1d rsp+8 + %define left_1d rsp %endif call hadamard_load @@ -1849,50 +2358,52 @@ cglobal intra_satd_x3_4x4, 3,3 movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd -%ifndef ARCH_X86_64 - ADD esp, 16 +%if UNIX64 == 0 + ADD rsp, 16 %endif RET -%ifdef ARCH_X86_64 - %define t0 r10 - %define t2 r11 -%else - %define t0 r0 - %define t2 r2 -%endif - ;----------------------------------------------------------------------------- ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_16x16, 0,5 - %assign stack_pad 88 + ((stack_offset+88+gprsize)&15) + %assign stack_pad 120 + ((stack_offset+120+gprsize)&15) ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad -%define sums rsp+64 ; size 24 +%define sums rsp+64 ; size 56 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 movifnidn r1, r1mp - CLEAR_SUMS + + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%if HIGH_BIT_DEPTH + mova [sums+24], m7 + mova [sums+32], m7 + mova [sums+40], m7 + mova [sums+48], m7 +%endif ; 1D hadamards - mov t0d, 12 - movd m6, [pw_64] + mov r3d, 12 + movd m6, [pw_32] .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m1, m2, m3 - paddw m6, m0 - paddw m6, m1 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m1, m2, m3 + pavgw m0, m1 + paddw m6, m0 + sub r3d, 4 jge .loop_edge - psrlw m6, 3 - pand m6, [sw_f0] ; dc + psrlw m6, 2 + pand m6, [sw_f0] ; dc ; 2D hadamards - movifnidn r0, r0mp - mov r3, -4 + movifnidn r0, r0mp + mov r3, -4 .loop_y: - mov r4, -4 + mov r4, -4 .loop_x: call hadamard_load @@ -1900,38 +2411,79 @@ cglobal intra_satd_x3_16x16, 0,5 SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)] pavgw m4, m7 pavgw m5, m7 - paddw m0, [sums+0] ; i16x16_v satd - paddw m4, [sums+8] ; i16x16_h satd + paddw m0, [sums+ 0] ; i16x16_v satd + paddw m4, [sums+ 8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd - movq [sums+0], m0 - movq [sums+8], m4 - movq [sums+16], m5 + mova [sums+ 0], m0 + mova [sums+ 8], m4 + mova [sums+16], m5 - add r0, 4 + add r0, 
4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-16 +%if HIGH_BIT_DEPTH + psrld m7, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m7 + psrld m7, m0, 16 + pslld m0, 16 + psrld m0, 16 + paddd m0, m7 + paddd m4, [sums+32] + paddd m0, [sums+24] + mova [sums+32], m4 + mova [sums+24], m0 + pxor m7, m7 + punpckhwd m3, m5, m7 + punpcklwd m5, m7 + paddd m3, [sums+48] + paddd m5, [sums+40] + mova [sums+48], m3 + mova [sums+40], m5 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%endif + add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL inc r3 jl .loop_y ; horizontal sum movifnidn r2, r2mp - movq m2, [sums+16] - movq m1, [sums+8] - movq m0, [sums+0] - movq m7, m2 - SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd +%if HIGH_BIT_DEPTH + mova m1, m5 + paddd m5, m3 + HADDD m5, m7 ; DC satd + HADDD m4, m7 ; H satd + HADDD m0, m7 ; the part of V satd that doesn't overlap with DC + psrld m0, 1 + psrlq m1, 32 ; DC[1] + paddd m0, m3 ; DC[2] + psrlq m3, 32 ; DC[3] + paddd m0, m1 + paddd m0, m3 +%else + mova m7, m5 + SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 - paddd m0, m2 + paddd m0, m5 psubd m0, m7 - movd [r2+8], m2 ; i16x16_dc satd - movd [r2+4], m1 ; i16x16_h satd - movd [r2+0], m0 ; i16x16_v satd - ADD rsp, stack_pad +%endif + movd [r2+8], m5 ; i16x16_dc satd + movd [r2+4], m4 ; i16x16_h satd + movd [r2+0], m0 ; i16x16_v satd + ADD rsp, stack_pad RET +%if ARCH_X86_64 + %define t0 r6 +%else + %define t0 r2 +%endif + ;----------------------------------------------------------------------------- ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- @@ -1943,32 +2495,35 @@ cglobal intra_satd_x3_8x8c, 0,6 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 movifnidn r1, r1mp - CLEAR_SUMS + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 ; 1D hadamards - mov t0d, 4 + mov r3d, 4 .loop_edge: - SCALAR_HADAMARD left, t0, m0, m1 - SCALAR_HADAMARD top, t0, m0, m1, m2 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m0, m1, m2 + sub r3d, 4 jge .loop_edge ; dc - movzx t2d, word [left_1d+0] + movzx t0d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word [top_1d+8] - lea t2d, [t2 + r3 + 16] + lea t0d, [t0 + r3 + 16] lea r3d, [r4 + r5 + 16] - shr t2d, 1 + shr t0d, 1 shr r3d, 1 add r4d, 8 add r5d, 8 - and t2d, -16 ; tl + and t0d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr - mov [dc_1d+ 0], t2d ; tl + mov [dc_1d+ 0], t0d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br @@ -1994,10 +2549,10 @@ cglobal intra_satd_x3_8x8c, 0,6 movq [sums+8], m4 movq [sums+0], m5 - add r0, 4 + add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-8 + add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL add r5, 8 inc r3 jl .loop_y @@ -2007,10 +2562,18 @@ cglobal intra_satd_x3_8x8c, 0,6 movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 +%if HIGH_BIT_DEPTH + psrlq m7, 16 + HADDW m7, m3 + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd + psrld m2, 1 + paddd m2, m7 +%else psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 +%endif movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd @@ -2021,12 +2584,20 @@ cglobal intra_satd_x3_8x8c, 0,6 %macro PRED4x4_LOWPASS 5 +%ifid %5 + pavgb %5, %2, %3 + pxor %3, %2 + pand %3, [pb_1] + psubusb %5, %3 + pavgb %1, %4, %5 +%else mova %5, %2 pavgb %2, %3 pxor %3, %5 
pand %3, [pb_1] psubusb %2, %3 pavgb %1, %4, %2 +%endif %endmacro %macro INTRA_X9_PRED 2 @@ -2081,18 +2652,20 @@ cglobal intra_satd_x3_8x8c, 0,6 %endmacro ; INTRA_X9_PRED %macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp - pshufb m%1, [intrax9b_vh1] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3 + pshufb m2, m%1, [intrax9b_vh1] + pshufb m3, m%1, [intrax9b_vh2] + mova [pred_buf+0x60], m2 + mova [pred_buf+0x70], m3 + pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3 pmaddubsw m%1, [hmul_4p] pshufhw m0, m%1, q2301 pshuflw m0, m0, q2301 psignw m%1, [pw_pmpmpmpm] paddw m0, m%1 psllw m0, 2 ; hadamard(top), hadamard(left) - mova m1, m0 - mova m2, m0 movhlps m3, m0 - pshufb m1, [intrax9b_v1] - pshufb m2, [intrax9b_v2] + pshufb m1, m0, [intrax9b_v1] + pshufb m2, m0, [intrax9b_v2] paddw m0, m3 psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated? pavgw m0, [pw_16] @@ -2102,6 +2675,13 @@ cglobal intra_satd_x3_8x8c, 0,6 ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef. HADAMARD 0, sumsub, %2, %3, %4, %5 HADAMARD 1, sumsub, %2, %3, %4, %5 + movd r3d, m0 + shr r3d, 4 + imul r3d, 0x01010101 + mov [pred_buf+0x80], r3d + mov [pred_buf+0x88], r3d + mov [pred_buf+0x90], r3d + mov [pred_buf+0x98], r3d psubw m3, m%2 psubw m0, m%2 psubw m1, m%2 @@ -2122,17 +2702,23 @@ cglobal intra_satd_x3_8x8c, 0,6 %endif movhlps m2, m1 paddw m1, m2 +%if cpuflag(xop) + vphaddwq m3, m3 + vphaddwq m1, m1 + packssdw m1, m3 +%else phaddw m1, m3 pmaddwd m1, [pw_1] ; v, _, h, dc +%endif %endmacro ; INTRA_X9_VHDC -%macro INTRA_X9_END 1 +%macro INTRA_X9_END 2 %if cpuflag(sse4) phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu movd eax, m0 add eax, 1<<16 - cmp ax, r1w - cmovge eax, r1d + cmp ax, r3w + cmovge eax, r3d %else %if %1 ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index @@ -2158,22 +2744,58 @@ cglobal intra_satd_x3_8x8c, 0,6 ; 1<<16: increment index to match intra4x4_pred_e. 
couldn't do this before because it had to fit in 3 bits ; 1<<12: undo sign manipulation lea eax, [rax+r2+(1<<16)+(1<<12)] - cmp ax, r1w - cmovge eax, r1d + cmp ax, r3w + cmovge eax, r3d %endif ; cpuflag + + ; output the predicted samples + mov r3d, eax + shr r3d, 16 +%ifdef PIC + lea r2, [%2_lut] + movzx r2d, byte [r2+r3] +%else + movzx r2d, byte [%2_lut+r3] +%endif +%if %1 ; sad + movq mm0, [pred_buf+r2] + movq mm1, [pred_buf+r2+16] + movd [r1+0*FDEC_STRIDE], mm0 + movd [r1+2*FDEC_STRIDE], mm1 + psrlq mm0, 32 + psrlq mm1, 32 + movd [r1+1*FDEC_STRIDE], mm0 + movd [r1+3*FDEC_STRIDE], mm1 +%else ; satd +%assign i 0 +%rep 4 + mov r3d, [pred_buf+r2+8*i] + mov [r1+i*FDEC_STRIDE], r3d +%assign i i+1 +%endrep +%endif %endmacro ; INTRA_X9_END %macro INTRA_X9 0 ;----------------------------------------------------------------------------- ; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) ;----------------------------------------------------------------------------- -cglobal intra_sad_x9_4x4, 3,3,9 -%ifdef ARCH_X86_64 +%if notcpuflag(xop) +cglobal intra_sad_x9_4x4, 3,4,9 + %assign pad 0xc0-gprsize-(stack_offset&15) + %define pred_buf rsp + sub rsp, pad +%if ARCH_X86_64 INTRA_X9_PRED intrax9a, m8 %else - sub rsp, 0x1c - INTRA_X9_PRED intrax9a, [rsp] + INTRA_X9_PRED intrax9a, [rsp+0xa0] %endif + mova [rsp+0x00], m2 + mova [rsp+0x10], m3 + mova [rsp+0x20], m4 + mova [rsp+0x30], m5 + mova [rsp+0x40], m6 + mova [rsp+0x50], m7 %if cpuflag(sse4) movd m0, [r0+0*FENC_STRIDE] pinsrd m0, [r0+1*FENC_STRIDE], 1 @@ -2198,32 +2820,34 @@ cglobal intra_sad_x9_4x4, 3,3,9 paddd m2, m3 paddd m4, m5 paddd m6, m7 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 SWAP 7, 8 pxor m8, m8 %define %%zero m8 %else - mova m7, [rsp] + mova m7, [rsp+0xa0] %define %%zero [pb_0] %endif - mova m3, m7 - mova m5, m7 + pshufb m3, m7, [intrax9a_vh1] + pshufb m5, m7, [intrax9a_vh2] pshufb m7, [intrax9a_dc] - pshufb m3, [intrax9a_vh1] psadbw m7, %%zero - pshufb m5, [intrax9a_vh2] psrlw m7, 2 + mova [rsp+0x60], m3 + mova [rsp+0x70], m5 psadbw m3, m0 pavgw m7, %%zero pshufb m7, %%zero psadbw m5, m1 + movq [rsp+0x80], m7 + movq [rsp+0x90], m7 psadbw m0, m7 paddd m3, m5 psadbw m1, m7 paddd m0, m1 - movzx r1d, word [r2] + movzx r3d, word [r2] movd r0d, m3 ; v - add r1d, r0d + add r3d, r0d punpckhqdq m3, m0 ; h, dc shufps m3, m2, q2020 psllq m6, 32 @@ -2231,18 +2855,26 @@ cglobal intra_sad_x9_4x4, 3,3,9 movu m0, [r2+2] packssdw m3, m4 paddw m0, m3 - INTRA_X9_END 1 -%ifndef ARCH_X86_64 - add rsp, 0x1c -%endif + INTRA_X9_END 1, intrax9a + add rsp, pad RET +%endif ; cpuflag -%ifdef ARCH_X86_64 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) ;----------------------------------------------------------------------------- -cglobal intra_satd_x9_4x4, 3,3,16 +cglobal intra_satd_x9_4x4, 3,4,16 + %assign pad 0xb0-gprsize-(stack_offset&15) + %define pred_buf rsp + sub rsp, pad INTRA_X9_PRED intrax9b, m15 + mova [rsp+0x00], m2 + mova [rsp+0x10], m3 + mova [rsp+0x20], m4 + mova [rsp+0x30], m5 + mova [rsp+0x40], m6 + mova [rsp+0x50], m7 movd m8, [r0+0*FENC_STRIDE] movd m9, [r0+1*FENC_STRIDE] movd m10, [r0+2*FENC_STRIDE] @@ -2286,7 +2918,7 @@ cglobal intra_satd_x9_4x4, 3,3,16 INTRA_X9_VHDC 15, 8, 10, 6, 7 ; find minimum movu m0, [r2+2] - movd r1d, m1 + movd r3d, m1 palignr m5, m1, 8 %if notcpuflag(sse4) pshufhw m0, m0, q3120 ; compensate for different order in unpack @@ -2294,8 +2926,9 @@ cglobal intra_satd_x9_4x4, 3,3,16 
packssdw m5, m4 paddw m0, m5 movzx r0d, word [r2] - add r1d, r0d - INTRA_X9_END 0 + add r3d, r0d + INTRA_X9_END 0, intrax9b + add rsp, pad RET RESET_MM_PERMUTATION ALIGN 16 @@ -2308,7 +2941,7 @@ ALIGN 16 psubw m1, m9 psubw m2, m10 psubw m3, m11 - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap + SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2319,13 +2952,19 @@ ALIGN 16 ret %else ; !ARCH_X86_64 -cglobal intra_satd_x9_4x4, 3,3,8 - sub rsp, 0x9c - INTRA_X9_PRED intrax9b, [rsp+0x80] - mova [rsp+0x40], m4 - mova [rsp+0x50], m5 - mova [rsp+0x60], m6 - mova [rsp+0x70], m7 +cglobal intra_satd_x9_4x4, 3,4,8 + %assign pad 0x120-gprsize-(stack_offset&15) + %define fenc_buf rsp + %define pred_buf rsp+0x40 + %define spill rsp+0xe0 + sub rsp, pad + INTRA_X9_PRED intrax9b, [spill+0x20] + mova [pred_buf+0x00], m2 + mova [pred_buf+0x10], m3 + mova [pred_buf+0x20], m4 + mova [pred_buf+0x30], m5 + mova [pred_buf+0x40], m6 + mova [pred_buf+0x50], m7 movd m4, [r0+0*FENC_STRIDE] movd m5, [r0+1*FENC_STRIDE] movd m6, [r0+2*FENC_STRIDE] @@ -2339,10 +2978,10 @@ cglobal intra_satd_x9_4x4, 3,3,8 pmaddubsw m5, m7 pmaddubsw m6, m7 pmaddubsw m0, m7 - mova [rsp+0x00], m4 - mova [rsp+0x10], m5 - mova [rsp+0x20], m6 - mova [rsp+0x30], m0 + mova [fenc_buf+0x00], m4 + mova [fenc_buf+0x10], m5 + mova [fenc_buf+0x20], m6 + mova [fenc_buf+0x30], m0 movddup m0, m2 pshufd m1, m2, q3232 movddup m2, m3 @@ -2355,49 +2994,47 @@ cglobal intra_satd_x9_4x4, 3,3,8 psubw m1, m5 psubw m2, m6 call .satd_8x4b ; ddr, ddl - mova m3, [rsp+0x50] - mova m1, [rsp+0x40] + mova m3, [pred_buf+0x30] + mova m1, [pred_buf+0x20] movddup m2, m3 movhlps m3, m3 - movq [rsp+0x48], m0 + movq [spill+0x08], m0 movddup m0, m1 movhlps m1, m1 call .satd_8x4 ; vr, vl - mova m3, [rsp+0x70] - mova m1, [rsp+0x60] + mova m3, [pred_buf+0x50] + mova m1, [pred_buf+0x40] movddup m2, m3 movhlps m3, m3 - movq [rsp+0x50], m0 + movq [spill+0x10], m0 movddup m0, m1 movhlps m1, m1 call .satd_8x4 ; hd, hu - movq [rsp+0x58], m0 - mova m1, [rsp+0x80] - mova m4, [rsp+0x00] - mova m5, [rsp+0x20] + movq [spill+0x18], m0 + mova m1, [spill+0x20] + mova m4, [fenc_buf+0x00] + mova m5, [fenc_buf+0x20] mova m2, [pw_ppmmppmm] psignw m4, m2 psignw m5, m2 - paddw m4, [rsp+0x10] - paddw m5, [rsp+0x30] + paddw m4, [fenc_buf+0x10] + paddw m5, [fenc_buf+0x30] INTRA_X9_VHDC 1, 4, 5, 6, 7 ; find minimum movu m0, [r2+2] - movd r1d, m1 - movhlps m1, m1 - movhps m1, [rsp+0x48] + movd r3d, m1 + punpckhqdq m1, [spill+0x00] + packssdw m1, [spill+0x10] %if cpuflag(sse4) - pshufd m2, [rsp+0x50], q3120 - packssdw m1, m2 + pshufhw m1, m1, q3120 %else - packssdw m1, [rsp+0x50] pshufhw m0, m0, q3120 %endif paddw m0, m1 movzx r0d, word [r2] - add r1d, r0d - INTRA_X9_END 0 - add rsp, 0x9c + add r3d, r0d + INTRA_X9_END 0, intrax9b + add rsp, pad RET RESET_MM_PERMUTATION ALIGN 16 @@ -2406,12 +3043,13 @@ ALIGN 16 pmaddubsw m1, m7 pmaddubsw m2, m7 pmaddubsw m3, m7 - psubw m0, [rsp+0x00+gprsize] - psubw m1, [rsp+0x10+gprsize] - psubw m2, [rsp+0x20+gprsize] + %xdefine fenc_buf fenc_buf+gprsize + psubw m0, [fenc_buf+0x00] + psubw m1, [fenc_buf+0x10] + psubw m2, [fenc_buf+0x20] .satd_8x4b: - psubw m3, [rsp+0x30+gprsize] - SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap + psubw m3, [fenc_buf+0x30] + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap pmaddwd m0, [pw_1] %if cpuflag(sse4) pshufd m1, m0, q0032 @@ -2425,11 +3063,673 @@ ALIGN 16 +%macro INTRA8_X9 0 +;----------------------------------------------------------------------------- +; int intra_sad_x9_8x8( uint8_t 
*fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) +;----------------------------------------------------------------------------- +cglobal intra_sad_x9_8x8, 5,6,9 + %define fenc02 m4 + %define fenc13 m5 + %define fenc46 m6 + %define fenc57 m7 +%if ARCH_X86_64 + %define tmp m8 + %assign padbase 0x0 +%else + %define tmp [rsp] + %assign padbase 0x10 +%endif + %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15) + %define pred(i,j) [rsp+i*0x40+j*0x10+padbase] + + SUB rsp, pad + movq fenc02, [r0+FENC_STRIDE* 0] + movq fenc13, [r0+FENC_STRIDE* 1] + movq fenc46, [r0+FENC_STRIDE* 4] + movq fenc57, [r0+FENC_STRIDE* 5] + movhps fenc02, [r0+FENC_STRIDE* 2] + movhps fenc13, [r0+FENC_STRIDE* 3] + movhps fenc46, [r0+FENC_STRIDE* 6] + movhps fenc57, [r0+FENC_STRIDE* 7] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + +; v + movddup m0, [r2+16] + mova pred(0,0), m0 + psadbw m1, m0, fenc02 + mova pred(0,1), m0 + psadbw m2, m0, fenc13 + mova pred(0,2), m0 + psadbw m3, m0, fenc46 + mova pred(0,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+0], m0 + +; h + movq m0, [r2+7] + pshufb m1, m0, [off(intra8x9_h1)] + pshufb m2, m0, [off(intra8x9_h2)] + mova pred(1,0), m1 + psadbw m1, fenc02 + mova pred(1,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m3, m0, [off(intra8x9_h3)] + pshufb m2, m0, [off(intra8x9_h4)] + mova pred(1,2), m3 + psadbw m3, fenc46 + mova pred(1,3), m2 + psadbw m2, fenc57 + paddw m1, m3 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+2], m1 + + lea r5, [rsp+padbase+0x100] + %define pred(i,j) [r5+i*0x40+j*0x10-0x100] + +; dc + movhps m0, [r2+16] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + psadbw m1, m0, fenc02 + mova pred(2,1), m0 + psadbw m2, m0, fenc13 + mova pred(2,2), m0 + psadbw m3, m0, fenc46 + mova pred(2,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+4], m0 + +; ddl +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB +; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC +; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD +; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE +; Ft8 Ft9 FtA FtB FtC FtD FtE FtF + mova m0, [r2+16] + movu m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___ + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl2)] + mova pred(3,0), m1 + psadbw m1, fenc02 + mova pred(3,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl4)] + mova pred(3,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+6], m1 + +; vl +; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB + pshufb m1, m3, [off(intra8x9_vl1)] + pshufb m2, m0, [off(intra8x9_vl2)] + pshufb m3, m3, [off(intra8x9_vl3)] + pshufb m0, m0, [off(intra8x9_vl4)] + mova pred(7,0), m1 + 
psadbw m1, fenc02 + mova pred(7,1), m2 + psadbw m2, fenc13 + mova pred(7,2), m3 + psadbw m3, fenc46 + mova pred(7,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(sse4) + pextrw [r4+14], m0, 0 +%else + movd r5d, m0 + mov [r4+14], r5w + lea r5, [rsp+padbase+0x100] +%endif + +; ddr +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 +; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 +; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 +; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 +; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt + movu m2, [r2+8] + movu m0, [r2+7] + movu m1, [r2+6] + pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr2)] + mova pred(4,0), m1 + psadbw m1, fenc02 + mova pred(4,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr4)] + mova pred(4,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+8], m1 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0] + +; vr +; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 +; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 +; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 +; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 + movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + psadbw m1, fenc02 + mova pred(5,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr2)] + mova pred(5,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr4)] + mova pred(5,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+10], m1 + +; hd +; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3 +; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1 +; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt +; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 +; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 +; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 +; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 + pshufd m2, m3, q0001 +%if cpuflag(sse4) + pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___ +%else + movss m1, m0, m2 + SWAP 1, 2 +%endif + punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___ + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m2, [off(intra8x9_hd2)] + mova pred(6,0), m1 + psadbw m1, fenc02 + mova pred(6,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hd3)] + pshufb m3, m0, [off(intra8x9_hd4)] + mova pred(6,2), m2 + psadbw m2, fenc46 + mova pred(6,3), m3 + psadbw m3, fenc57 + paddw m1, m2 + paddw m1, m3 + movhlps m2, m1 + paddw m1, m2 + ; don't just store to [r4+12]. 
this is too close to the load of dqword [r4] and would cause a forwarding stall + pslldq m1, 12 + SWAP 3, 1 + +; hu +; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 +; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 +; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 +; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 +; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 +; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 +; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +%if cpuflag(sse4) + pinsrb m0, [r2+7], 15 ; Gl7 +%else + movd m1, [r2+7] + pslldq m0, 1 + palignr m1, m0, 1 + SWAP 0, 1 +%endif + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu2)] + mova pred(8,0), m1 + psadbw m1, fenc02 + mova pred(8,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hu3)] + pshufb m0, m0, [off(intra8x9_hu4)] + mova pred(8,2), m2 + psadbw m2, fenc46 + mova pred(8,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m1, m0 + movhlps m2, m1 + paddw m1, m2 + movd r2d, m1 + + movu m0, [r3] + por m3, [r4] + paddw m0, m3 + mova [r4], m0 + movzx r5d, word [r3+16] + add r2d, r5d + mov [r4+16], r2w + +%if cpuflag(sse4) + phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl + movd eax, m0 +%else + ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index + paddusw m0, m0 + paddusw m0, m0 + paddw m0, [off(pw_s00112233)] + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + pminsw m0, m1 + movd eax, m0 + ; repack with 3 bit index + xor eax, 0x80008000 + movzx r3d, ax + shr eax, 15 + add r3d, r3d + or eax, 1 + cmp eax, r3d + cmovg eax, r3d + ; reverse to phminposuw order + mov r3d, eax + and eax, 7 + shr r3d, 3 + shl eax, 16 + or eax, r3d +%endif + add r2d, 8<<16 + cmp ax, r2w + cmovg eax, r2d + + mov r2d, eax + shr r2d, 16 + shl r2d, 6 + add r1, 4*FDEC_STRIDE + mova m0, [rsp+padbase+r2+0x00] + mova m1, [rsp+padbase+r2+0x10] + mova m2, [rsp+padbase+r2+0x20] + mova m3, [rsp+padbase+r2+0x30] + movq [r1+FDEC_STRIDE*-4], m0 + movhps [r1+FDEC_STRIDE*-2], m0 + movq [r1+FDEC_STRIDE*-3], m1 + movhps [r1+FDEC_STRIDE*-1], m1 + movq [r1+FDEC_STRIDE* 0], m2 + movhps [r1+FDEC_STRIDE* 2], m2 + movq [r1+FDEC_STRIDE* 1], m3 + movhps [r1+FDEC_STRIDE* 3], m3 + ADD rsp, pad + RET + +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) +;----------------------------------------------------------------------------- +cglobal intra_sa8d_x9_8x8, 5,6,16 + %assign pad 0x2c0+0x10-gprsize-(stack_offset&15) + %define fenc_buf rsp + %define pred_buf rsp+0x80 + SUB rsp, pad + mova m15, [hmul_8p] + pxor m8, m8 +%assign %%i 0 +%rep 8 + movddup m %+ %%i, [r0+%%i*FENC_STRIDE] + pmaddubsw m9, m %+ %%i, m15 + punpcklbw m %+ %%i, m8 + mova [fenc_buf+%%i*0x10], m9 +%assign %%i %%i+1 +%endrep + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+0x80] + %define off(m) (r0+m-(intra8x9_h1+0x80)) + lea r5, [pred_buf+0x80] + +; v, h, dc + HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 + pabsw m11, m1 +%assign %%i 2 +%rep 6 + pabsw m8, m %+ %%i + paddw m11, m8 +%assign %%i %%i+1 +%endrep + + ; 1D hadamard of edges + movq m8, [r2+7] + movddup m9, [r2+16] + mova [r5-0x80], m9 + mova [r5-0x70], m9 + mova [r5-0x60], m9 + mova [r5-0x50], m9 + punpcklwd m8, m8 + pshufb m9, [intrax3_shuf] + pmaddubsw m8, [pb_pppm] + pmaddubsw m9, [pb_pppm] + HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm] + HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm] + + ; dc + paddw m10, m8, m9 + paddw m10, [pw_8] + 
pand m10, [sw_f0] + psrlw m12, m10, 4 + psllw m10, 2 + pxor m13, m13 + pshufb m12, m13 + mova [r5+0x00], m12 + mova [r5+0x10], m12 + mova [r5+0x20], m12 + mova [r5+0x30], m12 + + ; differences + psllw m8, 3 ; left edge + psubw m8, m0 + psubw m10, m0 + pabsw m8, m8 ; 1x8 sum + pabsw m10, m10 + paddw m8, m11 + paddw m11, m10 + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + punpckldq m0, m2 + punpckldq m4, m6 + punpcklqdq m0, m4 ; transpose + psllw m9, 3 ; top edge + psrldq m10, m11, 2 ; 8x7 sum + psubw m0, m9 ; 8x1 sum + pabsw m0, m0 + paddw m10, m0 + + phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow + psrlw m11, 1 + psrlw m10, 1 + +; store h + movq m3, [r2+7] + pshufb m0, m3, [off(intra8x9_h1)] + pshufb m1, m3, [off(intra8x9_h2)] + pshufb m2, m3, [off(intra8x9_h3)] + pshufb m3, m3, [off(intra8x9_h4)] + mova [r5-0x40], m0 + mova [r5-0x30], m1 + mova [r5-0x20], m2 + mova [r5-0x10], m3 + +; ddl + mova m8, [r2+16] + movu m2, [r2+17] + pslldq m1, m8, 1 + pavgb m9, m8, m2 + PRED4x4_LOWPASS m8, m1, m2, m8, m3 + pshufb m0, m8, [off(intra8x9_ddl1)] + pshufb m1, m8, [off(intra8x9_ddl2)] + pshufb m2, m8, [off(intra8x9_ddl3)] + pshufb m3, m8, [off(intra8x9_ddl4)] + add r5, 0x40 + call .sa8d + phaddd m11, m0 + +; vl + pshufb m0, m9, [off(intra8x9_vl1)] + pshufb m1, m8, [off(intra8x9_vl2)] + pshufb m2, m9, [off(intra8x9_vl3)] + pshufb m3, m8, [off(intra8x9_vl4)] + add r5, 0x100 + call .sa8d + phaddd m10, m11 + mova m12, m0 + +; ddr + movu m2, [r2+8] + movu m8, [r2+7] + movu m1, [r2+6] + pavgb m9, m2, m8 + PRED4x4_LOWPASS m8, m1, m2, m8, m3 + pshufb m0, m8, [off(intra8x9_ddr1)] + pshufb m1, m8, [off(intra8x9_ddr2)] + pshufb m2, m8, [off(intra8x9_ddr3)] + pshufb m3, m8, [off(intra8x9_ddr4)] + sub r5, 0xc0 + call .sa8d + mova m11, m0 + + add r0, 0x100 + %define off(m) (r0+m-(intra8x9_h1+0x180)) + +; vr + movsd m2, m9, m8 + pshufb m0, m2, [off(intra8x9_vr1)] + pshufb m1, m8, [off(intra8x9_vr2)] + pshufb m2, m2, [off(intra8x9_vr3)] + pshufb m3, m8, [off(intra8x9_vr4)] + add r5, 0x40 + call .sa8d + phaddd m11, m0 + +; hd +%if cpuflag(sse4) + pshufd m1, m9, q0001 + pblendw m1, m8, q3330 +%else + pshufd m2, m9, q0001 + movss m1, m8, m2 +%endif + punpcklbw m8, m9 + pshufb m0, m1, [off(intra8x9_hd1)] + pshufb m1, m1, [off(intra8x9_hd2)] + pshufb m2, m8, [off(intra8x9_hd3)] + pshufb m3, m8, [off(intra8x9_hd4)] + add r5, 0x40 + call .sa8d + phaddd m0, m12 + phaddd m11, m0 + +; hu +%if cpuflag(sse4) + pinsrb m8, [r2+7], 15 +%else + movd m9, [r2+7] + pslldq m8, 1 + palignr m9, m8, 1 + SWAP 8, 9 +%endif + pshufb m0, m8, [off(intra8x9_hu1)] + pshufb m1, m8, [off(intra8x9_hu2)] + pshufb m2, m8, [off(intra8x9_hu3)] + pshufb m3, m8, [off(intra8x9_hu4)] + add r5, 0x80 + call .sa8d + + pmaddwd m0, [pw_1] + phaddw m10, m11 + movhlps m1, m0 + paddw m0, m1 + pshuflw m1, m0, q0032 + pavgw m0, m1 + pxor m2, m2 + pavgw m10, m2 + movd r2d, m0 + + movu m0, [r3] + paddw m0, m10 + mova [r4], m0 + movzx r5d, word [r3+16] + add r2d, r5d + mov [r4+16], r2w + +%if cpuflag(sse4) + phminposuw m0, m0 + movd eax, m0 +%else + ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index + paddusw m0, m0 + paddw m0, [off(pw_s00001111)] + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + mova m2, m0 + pminsw m0, m1 + pcmpgtw m2, m1 ; 2nd index bit + movd r3d, m0 + movd r4d, m2 + ; repack with 3 bit index + xor r3d, 0x80008000 + and r4d, 0x00020002 + movzx eax, r3w + movzx r5d, r4w + shr r3d, 16 + shr r4d, 16 + lea eax, [rax*4+r5] + lea r3d, [ r3*4+r4+1] + cmp 
eax, r3d + cmovg eax, r3d + ; reverse to phminposuw order + mov r3d, eax + and eax, 7 + shr r3d, 3 + shl eax, 16 + or eax, r3d +%endif + add r2d, 8<<16 + cmp ax, r2w + cmovg eax, r2d + + mov r2d, eax + shr r2d, 16 + shl r2d, 6 + add r1, 4*FDEC_STRIDE + mova m0, [pred_buf+r2+0x00] + mova m1, [pred_buf+r2+0x10] + mova m2, [pred_buf+r2+0x20] + mova m3, [pred_buf+r2+0x30] + movq [r1+FDEC_STRIDE*-4], m0 + movhps [r1+FDEC_STRIDE*-2], m0 + movq [r1+FDEC_STRIDE*-3], m1 + movhps [r1+FDEC_STRIDE*-1], m1 + movq [r1+FDEC_STRIDE* 0], m2 + movhps [r1+FDEC_STRIDE* 2], m2 + movq [r1+FDEC_STRIDE* 1], m3 + movhps [r1+FDEC_STRIDE* 3], m3 + ADD rsp, pad + RET + +ALIGN 16 +.sa8d: + %xdefine mret m0 + %xdefine fenc_buf fenc_buf+gprsize + mova [r5+0x00], m0 + mova [r5+0x10], m1 + mova [r5+0x20], m2 + mova [r5+0x30], m3 + movddup m4, m0 + movddup m5, m1 + movddup m6, m2 + movddup m7, m3 + punpckhqdq m0, m0 + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + psubw m0, [fenc_buf+0x00] + psubw m1, [fenc_buf+0x10] + pmaddubsw m2, m15 + pmaddubsw m3, m15 + psubw m2, [fenc_buf+0x20] + psubw m3, [fenc_buf+0x30] + pmaddubsw m4, m15 + pmaddubsw m5, m15 + psubw m4, [fenc_buf+0x40] + psubw m5, [fenc_buf+0x50] + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psubw m6, [fenc_buf+0x60] + psubw m7, [fenc_buf+0x70] + HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14 + paddw m0, m1 + paddw m0, m2 + paddw mret, m0, m3 + ret +%endif ; ARCH_X86_64 +%endmacro ; INTRA8_X9 + ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 ; out: [tmp]=hadamard4, m0=satd INIT_MMX mmx2 cglobal hadamard_ac_4x4 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+r1] mova m2, [r0+r1*2] @@ -2471,7 +3771,7 @@ cglobal hadamard_ac_2x2max ABSW2 m1, m3, m1, m3, m4, m5 HADAMARD 0, max, 0, 2, 4, 5 HADAMARD 0, max, 1, 3, 4, 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 paddd m6, m0 @@ -2484,13 +3784,13 @@ cglobal hadamard_ac_2x2max ret %macro AC_PREP 2 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pmaddwd %1, %2 %endif %endmacro %macro AC_PADD 3 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH AC_PREP %2, %3 paddd %1, %2 %else @@ -2500,7 +3800,7 @@ cglobal hadamard_ac_2x2max cglobal hadamard_ac_8x8 mova m6, [mask_ac4] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH mova m7, [pw_1] %else pxor m7, m7 @@ -2522,7 +3822,7 @@ cglobal hadamard_ac_8x8 AC_PADD m5, m0, m7 sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pxor m6, m6 %endif %rep 3 @@ -2537,7 +3837,7 @@ cglobal hadamard_ac_8x8 ABSW2 m1, m3, m1, m3, m4, m5 ABSW2 m0, m2, m0, m2, m4, m5 HADAMARD 0, max, 1, 3, 4, 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH pand m0, [mask_ac4] pmaddwd m1, m7 pmaddwd m0, m7 @@ -2561,7 +3861,7 @@ cglobal hadamard_ac_8x8 %macro HADAMARD_AC_WXH_SUM_MMX 2 mova m1, [rsp+1*mmsize] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+2*mmsize] paddd m1, [rsp+3*mmsize] @@ -2634,7 +3934,7 @@ cglobal pixel_hadamard_ac_%1x%2, 2,4 movd edx, m0 movd eax, m1 shr edx, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif @@ -2648,7 +3948,7 @@ HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 %macro LOAD_INC_8x4W_SSE2 5 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movu m%1, [r0] movu m%2, [r0+r1] movu m%3, [r0+r1*2] @@ -2683,18 +3983,18 @@ HADAMARD_AC_WXH_MMX 8, 8 ; in: r0=pix, r1=stride, r2=stride*3 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4 cglobal hadamard_ac_8x8 -%ifdef 
ARCH_X86_64 +%if ARCH_X86_64 %define spill0 m8 %define spill1 m9 %define spill2 m10 %else %define spill0 [rsp+gprsize] - %define spill1 [rsp+gprsize+16] - %define spill2 [rsp+gprsize+32] + %define spill1 [rsp+gprsize+mmsize] + %define spill2 [rsp+gprsize+mmsize*2] %endif -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %define vertical 1 -%elif cpuflag(ssse3) +%elif cpuflag(ssse3) && notcpuflag(atom) %define vertical 0 ;LOAD_INC loads sumsubs mova m7, [hmul_8p] @@ -2753,17 +4053,14 @@ cglobal hadamard_ac_8x8 AC_PADD m1, m2, [pw_1] ABSW m2, m7, m7 AC_PADD m1, m3, [pw_1] - mova m3, m7 AC_PADD m1, m2, [pw_1] - mova m2, m6 + paddw m3, m7, spill2 psubw m7, spill2 - paddw m3, spill2 - mova [rsp+gprsize+32], m1 ; save satd - mova m1, m5 + mova [rsp+gprsize+mmsize*2], m1 ; save satd + paddw m2, m6, spill1 psubw m6, spill1 - paddw m2, spill1 + paddw m1, m5, spill0 psubw m5, spill0 - paddw m1, spill0 %assign %%x 2 %if vertical %assign %%x 4 @@ -2777,7 +4074,7 @@ cglobal hadamard_ac_8x8 AC_PREP m2, [pw_1] AC_PADD m2, m3, [pw_1] AC_PADD m2, m1, [pw_1] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH paddd m2, m2 %else paddw m2, m2 @@ -2787,20 +4084,22 @@ cglobal hadamard_ac_8x8 ABSW m0, m0, m7 AC_PADD m2, m4, [pw_1] AC_PADD m2, m0, [pw_1] - mova [rsp+gprsize+16], m2 ; save sa8d + mova [rsp+gprsize+mmsize], m2 ; save sa8d SWAP 0, 2 SAVE_MM_PERMUTATION ret HADAMARD_AC_WXH_SSE2 16, 16 -HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 16, 8 +%if mmsize <= 16 +HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 8, 8 +%endif %endmacro ; HADAMARD_AC_SSE2 %macro HADAMARD_AC_WXH_SUM_SSE2 2 mova m1, [rsp+2*mmsize] -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+3*mmsize] paddd m1, [rsp+4*mmsize] @@ -2812,68 +4111,75 @@ HADAMARD_AC_WXH_SSE2 8, 8 paddd m1, [rsp+8*mmsize] psrld m0, 1 %endif - HADDD m0, m2 - HADDD m1, m3 + HADDD xm0, xm2 + HADDD xm1, xm3 %else ; !HIGH_BIT_DEPTH -%if %1*%2 >= 128 +%if %1*%2*16/mmsize >= 128 paddusw m0, [rsp+3*mmsize] paddusw m1, [rsp+4*mmsize] %endif -%if %1*%2 == 256 +%if %1*%2*16/mmsize == 256 paddusw m0, [rsp+5*mmsize] paddusw m1, [rsp+6*mmsize] paddusw m0, [rsp+7*mmsize] paddusw m1, [rsp+8*mmsize] psrlw m0, 1 %endif - HADDUW m0, m2 - HADDW m1, m3 +%if mmsize==32 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + paddusw xm0, xm2 + paddusw xm1, xm3 +%endif + HADDUW xm0, xm2 + HADDW xm1, xm3 %endif ; HIGH_BIT_DEPTH %endmacro ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 2 -cglobal pixel_hadamard_ac_%1x%2, 2,3,11 - %assign pad 16-gprsize-(stack_offset&15) +cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 FIX_STRIDES r1 - sub rsp, 48+pad - lea r2, [r1*3] + mov r3, rsp + and rsp, ~(mmsize-1) + sub rsp, mmsize*3 + lea r2, [r1*3] call hadamard_ac_8x8 %if %2==16 %define ysub r2 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif -%if %1==16 +%if %1==16 && mmsize <= 16 neg ysub - sub rsp, 32 - lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] + sub rsp, mmsize*2 + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8 %if %2==16 - lea r0, [r0+r1*4] - sub rsp, 32 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %endif HADAMARD_AC_WXH_SUM_SSE2 %1, %2 - movd edx, m0 - movd eax, m1 - shr edx, 2 - (%1*%2 >> 8) + movd edx, xm0 + movd eax, xm1 + shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif - add rsp, 16+%1*%2/2+pad + mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 ; 
instantiate satds -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 cextern pixel_sa8d_8x8_internal_mmx2 INIT_MMX mmx2 SA8D @@ -2890,17 +4196,30 @@ SA8D INIT_XMM sse2 SA8D SATDS_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_SA8D_SSE2 +%endif INIT_MMX mmx2 INTRA_X3_MMX -%endif INIT_XMM sse2 HADAMARD_AC_SSE2 +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%endif + %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE -%ifndef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 @@ -2909,14 +4228,17 @@ INIT_XMM ssse3 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 +INTRA8_X9 %endif %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps -%ifndef HIGH_BIT_DEPTH -INTRA_SA8D_SSE2 +%if HIGH_BIT_DEPTH == 0 INIT_MMX ssse3 INTRA_X3_MMX %endif @@ -2927,29 +4249,368 @@ INIT_XMM sse4 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 -%ifndef HIGH_BIT_DEPTH +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 +INTRA8_X9 %endif +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SATDS_SSE2 SA8D -%ifndef HIGH_BIT_DEPTH -INTRA_SA8D_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 INTRA_X9 +INTRA8_X9 %endif HADAMARD_AC_SSE2 +%define TRANS TRANS_XOP +INIT_XMM xop +SATDS_SSE2 +SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_X9 +; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. 
+%endif +HADAMARD_AC_SSE2 + + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] +%endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret + +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET + +cglobal intra_sad_x9_8x8, 5,7,8 + %define pred(i,j) [rsp+i*0x40+j*0x20] + + mov r6, rsp + and rsp, ~31 + sub rsp, 0x240 + movu m5, [r0+0*FENC_STRIDE] + movu m6, [r0+4*FENC_STRIDE] + punpcklqdq m5, [r0+2*FENC_STRIDE] + punpcklqdq m6, [r0+6*FENC_STRIDE] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + + vpbroadcastq m0, [r2+16] + psadbw m4, m0, m5 + psadbw m2, m0, m6 + mova pred(0,0), m0 + mova pred(0,1), m0 + paddw m4, m2 + + vpbroadcastq m1, [r2+7] + pshufb m3, m1, [off(intra8x9_h1)] + 
pshufb m2, m1, [off(intra8x9_h3)] + mova pred(1,0), m3 + mova pred(1,1), m2 + psadbw m3, m5 + psadbw m2, m6 + paddw m3, m2 + + lea r5, [rsp+0x100] + %define pred(i,j) [r5+i*0x40+j*0x20-0x100] + + ; combine the first two + pslldq m3, 2 + por m4, m3 + + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + mova pred(2,1), m0 + psadbw m3, m0, m5 + psadbw m2, m0, m6 + paddw m3, m2 + + pslldq m3, 4 + por m4, m3 + + vbroadcasti128 m0, [r2+16] + vbroadcasti128 m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 + PRED4x4_LOWPASS m0, m1, m2, m0, m7 + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,0), m1 + mova pred(3,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova [r4], xm4 + + ; for later + vinserti128 m7, m3, xm0, 1 + + vbroadcasti128 m2, [r2+8] + vbroadcasti128 m0, [r2+7] + vbroadcasti128 m1, [r2+6] + pavgb m3, m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,0), m1 + mova pred(4,1), m2 + psadbw m4, m1, m5 + psadbw m2, m6 + paddw m4, m2 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] + + vpblendd m2, m3, m0, 11110011b + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + mova pred(5,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 2 + por m4, m1 + + psrldq m2, m3, 4 + pblendw m2, m0, q3330 + punpcklbw m0, m3 + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m0, [off(intra8x9_hd3)] + mova pred(6,0), m1 + mova pred(6,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 4 + por m4, m1 + + pshufb m1, m7, [off(intra8x9_vl1)] + pshufb m2, m7, [off(intra8x9_vl3)] + mova pred(7,0), m1 + mova pred(7,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova xm3, [r4] + SBUTTERFLY qdq, 3, 4, 7 + paddw xm3, xm4 + + pslldq m1, m0, 1 + vpbroadcastd m0, [r2+7] + palignr m0, m1, 1 + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu3)] + mova pred(8,0), m1 + mova pred(8,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + vextracti128 xm2, m1, 1 + paddw xm1, xm2 + movhlps xm2, xm1 + paddw xm1, xm2 + movd r2d, xm1 + + paddw xm3, [r3] + mova [r4], xm3 + add r2w, word [r3+16] + mov [r4+16], r2w + + phminposuw xm3, xm3 + movd r3d, xm3 + add r2d, 8<<16 + cmp r3w, r2w + cmovg r3d, r2d + + mov r2d, r3d + shr r3, 16 + shl r3, 6 + add r1, 4*FDEC_STRIDE + mova xm0, [rsp+r3+0x00] + mova xm1, [rsp+r3+0x10] + mova xm2, [rsp+r3+0x20] + mova xm3, [rsp+r3+0x30] + movq [r1+FDEC_STRIDE*-4], xm0 + movhps [r1+FDEC_STRIDE*-2], xm0 + movq [r1+FDEC_STRIDE*-3], xm1 + movhps [r1+FDEC_STRIDE*-1], xm1 + movq [r1+FDEC_STRIDE* 0], xm2 + movhps [r1+FDEC_STRIDE* 2], xm2 + movq [r1+FDEC_STRIDE* 1], xm3 + movhps [r1+FDEC_STRIDE* 3], xm3 + mov rsp, r6 + mov eax, r2d + RET +%endif ; HIGH_BIT_DEPTH + ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) +; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t 
stride1, +; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- %macro SSIM_ITER 1 -%ifdef HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH movdqu m5, [r0+(%1&1)*r1] movdqu m6, [r2+(%1&1)*r3] %else @@ -2972,13 +4633,8 @@ HADAMARD_AC_SSE2 pmaddwd m7, m5, m6 pmaddwd m5, m5 pmaddwd m6, m6 -%if %1==0 - SWAP 3, 5 - SWAP 4, 7 -%else - paddd m3, m5 - paddd m4, m7 -%endif + ACCUM paddd, 3, 5, %1 + ACCUM paddd, 4, 7, %1 paddd m3, m6 %endmacro @@ -3005,7 +4661,7 @@ cglobal pixel_ssim_4x4x2_core, 4,4,8 punpckhdq m5, m3, m4 punpckldq m3, m4 -%ifdef UNIX64 +%if UNIX64 %define t0 r4 %else %define t0 rax @@ -3098,7 +4754,7 @@ cglobal pixel_ssim_end4, 3,3,7 addps m0, m4 pshuflw m4, m0, q0032 addss m0, m4 -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 movd r0m, m0 fld dword r0m %endif @@ -3110,17 +4766,86 @@ SSIM INIT_XMM avx SSIM +;----------------------------------------------------------------------------- +; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +;----------------------------------------------------------------------------- +%macro ASD8 0 +cglobal pixel_asd8, 5,5 + pxor m0, m0 + pxor m1, m1 +.loop: +%if HIGH_BIT_DEPTH + paddw m0, [r0] + paddw m1, [r2] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + paddw m0, [r0] + paddw m1, [r2+4*r3] + lea r2, [r2+4*r3] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%else + movq m2, [r0] + movq m3, [r2] + movhps m2, [r0+r1] + movhps m3, [r2+r3] + lea r0, [r0+2*r1] + psadbw m2, m1 + psadbw m3, m1 + movq m4, [r0] + movq m5, [r2+2*r3] + lea r2, [r2+2*r3] + movhps m4, [r0+r1] + movhps m5, [r2+r3] + lea r0, [r0+2*r1] + paddw m0, m2 + psubw m0, m3 + psadbw m4, m1 + psadbw m5, m1 + lea r2, [r2+2*r3] + paddw m0, m4 + psubw m0, m5 +%endif + sub r4d, 4 + jg .loop +%if HIGH_BIT_DEPTH + psubw m0, m1 + HADDW m0, m1 + ABSD m1, m0 +%else + movhlps m1, m0 + paddw m0, m1 + ABSW m1, m0 +%endif + movd eax, m1 + RET +%endmacro + +INIT_XMM sse2 +ASD8 +INIT_XMM ssse3 +ASD8 +%if HIGH_BIT_DEPTH +INIT_XMM xop +ASD8 +%endif + ;============================================================================= ; Successive Elimination ADS ;============================================================================= %macro ADS_START 0 -%ifdef WIN64 - movsxd r5, r5d +%if UNIX64 + movsxd r5, r5d +%else + mov r5d, r5m %endif - mov r0d, r5d - lea r6, [r4+r5+15] - and r6, ~15; + mov r0d, r5d + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) shl r2d, 1 %endmacro @@ -3128,10 +4853,19 @@ SSIM add r1, 8*%1 add r3, 8*%1 add r6, 4*%1 - sub r0d, 4*%1 + sub r0d, 4*%1 jg .loop WIN64_RESTORE_XMM rsp - jmp ads_mvs +%if mmsize==32 + vzeroupper +%endif + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) +%if cpuflag(ssse3) + jmp ads_mvs_ssse3 +%else + jmp ads_mvs_mmx +%endif %endmacro ;----------------------------------------------------------------------------- @@ -3139,193 +4873,227 @@ SSIM ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 -cglobal pixel_ads4, 6,7 - movq mm6, [r0] - movq mm4, [r0+8] - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 - pshufw mm5, mm4, 0 - pshufw mm4, mm4, q2222 +cglobal pixel_ads4, 5,7 + mova m6, [r0] + mova m4, [r0+8] + pshufw m7, m6, 0 + pshufw m6, m6, q2222 + pshufw m5, m4, 0 + pshufw m4, m4, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+16] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW 
mm1, mm1, mm3 - movq mm2, [r1+r2] - movq mm3, [r1+r2+16] - psubw mm2, mm5 - psubw mm3, mm4 - paddw mm0, mm1 - ABSW mm2, mm2, mm1 - ABSW mm3, mm3, mm1 - paddw mm0, mm2 - paddw mm0, mm3 - pshufw mm1, r6m, 0 - paddusw mm0, [r3] - psubusw mm1, mm0 - packsswb mm1, mm1 - movd [r6], mm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + pshufw m1, r6m, 0 + paddusw m0, [r3] + psubusw m1, m0 + packsswb m1, m1 + movd [r6], m1 ADS_END 1 -cglobal pixel_ads2, 6,7 - movq mm6, [r0] - pshufw mm5, r6m, 0 - pshufw mm7, mm6, 0 - pshufw mm6, mm6, q2222 +cglobal pixel_ads2, 5,7 + mova m6, [r0] + pshufw m5, r6m, 0 + pshufw m7, m6, 0 + pshufw m6, m6, q2222 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+r2] - psubw mm0, mm7 - psubw mm1, mm6 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddw mm0, mm1 - paddusw mm0, [r3] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [r6], mm4 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, [r3] + mova m4, m5 + psubusw m4, m0 + packsswb m4, m4 + movd [r6], m4 ADS_END 1 -cglobal pixel_ads1, 6,7 - pshufw mm7, [r0], 0 - pshufw mm6, r6m, 0 +cglobal pixel_ads1, 5,7 + pshufw m7, [r0], 0 + pshufw m6, r6m, 0 ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+8] - psubw mm0, mm7 - psubw mm1, mm7 - ABSW mm0, mm0, mm2 - ABSW mm1, mm1, mm3 - paddusw mm0, [r3] - paddusw mm1, [r3+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [r6], mm4 + movu m0, [r1] + movu m1, [r1+8] + psubw m0, m7 + psubw m1, m7 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddusw m0, [r3] + paddusw m1, [r3+8] + mova m4, m6 + mova m5, m6 + psubusw m4, m0 + psubusw m5, m1 + packsswb m4, m5 + mova [r6], m4 ADS_END 2 %macro ADS_XMM 0 -cglobal pixel_ads4, 6,7,12 - movdqa xmm4, [r0] - pshuflw xmm7, xmm4, 0 - pshuflw xmm6, xmm4, q2222 - pshufhw xmm5, xmm4, 0 - pshufhw xmm4, xmm4, q2222 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpckhqdq xmm5, xmm5 - punpckhqdq xmm4, xmm4 -%ifdef ARCH_X86_64 - pshuflw xmm8, r6m, 0 - punpcklqdq xmm8, xmm8 +%if mmsize==32 +cglobal pixel_ads4, 5,7,8 + vpbroadcastw m7, [r0+ 0] + vpbroadcastw m6, [r0+ 4] + vpbroadcastw m5, [r0+ 8] + vpbroadcastw m4, [r0+12] +%else +cglobal pixel_ads4, 5,7,12 + mova m4, [r0] + pshuflw m7, m4, q0000 + pshuflw m6, m4, q2222 + pshufhw m5, m4, q0000 + pshufhw m4, m4, q2222 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpckhqdq m5, m5 + punpckhqdq m4, m4 +%endif +%if ARCH_X86_64 && mmsize == 16 + movd m8, r6m + SPLATW m8, m8 ADS_START - movdqu xmm10, [r1] - movdqu xmm11, [r1+r2] + movu m10, [r1] + movu m11, [r1+r2] .loop: - psubw xmm0, xmm10, xmm7 - movdqu xmm10, [r1+16] - psubw xmm1, xmm10, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - psubw xmm2, xmm11, xmm5 - movdqu xmm11, [r1+r2+16] - paddw xmm0, xmm1 - psubw xmm3, xmm11, xmm4 - movdqu xmm9, [r3] - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - paddusw xmm0, xmm9 - psubusw xmm1, xmm8, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + psubw m0, m10, m7 + movu m10, [r1+16] + psubw m1, m10, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + psubw m2, m11, m5 + movu m11, [r1+r2+16] + paddw m0, m1 + psubw m3, m11, m4 + movu m9, [r3] + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + paddusw m0, m9 + psubusw m1, m8, m0 %else 
ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - movdqu xmm2, [r1+r2] - movdqu xmm3, [r1+r2+16] - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - ABSW xmm2, xmm2, xmm1 - ABSW xmm3, xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - movd xmm1, r6m - movdqu xmm2, [r3] - pshuflw xmm1, xmm1, 0 - punpcklqdq xmm1, xmm1 - paddusw xmm0, xmm2 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + movu m2, [r3] +%if mmsize==32 + vpbroadcastw m1, r6m +%else + movd m1, r6m + pshuflw m1, m1, 0 + punpcklqdq m1, m1 +%endif + paddusw m0, m2 + psubusw m1, m0 %endif ; ARCH - ADS_END 2 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 -cglobal pixel_ads2, 6,7,8 - movq xmm6, [r0] - movd xmm5, r6m - pshuflw xmm7, xmm6, 0 - pshuflw xmm6, xmm6, q2222 - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm5, xmm5 +cglobal pixel_ads2, 5,7,8 +%if mmsize==32 + vpbroadcastw m7, [r0+0] + vpbroadcastw m6, [r0+4] + vpbroadcastw m5, r6m +%else + movq m6, [r0] + movd m5, r6m + pshuflw m7, m6, 0 + pshuflw m6, m6, q2222 + pshuflw m5, m5, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpcklqdq m5, m5 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+r2] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - movdqu xmm4, [r3] - ABSW xmm0, xmm0, xmm2 - ABSW xmm1, xmm1, xmm3 - paddw xmm0, xmm1 - paddusw xmm0, xmm4 - psubusw xmm1, xmm5, xmm0 - packsswb xmm1, xmm1 - movq [r6], xmm1 - ADS_END 2 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + movu m4, [r3] + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, m4 + psubusw m1, m5, m0 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 -cglobal pixel_ads1, 6,7,8 - movd xmm7, [r0] - movd xmm6, r6m - pshuflw xmm7, xmm7, 0 - pshuflw xmm6, xmm6, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 +cglobal pixel_ads1, 5,7,8 +%if mmsize==32 + vpbroadcastw m7, [r0] + vpbroadcastw m6, r6m +%else + movd m7, [r0] + movd m6, r6m + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 +%endif ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm7 - movdqu xmm2, [r3] - movdqu xmm3, [r3+16] - ABSW xmm0, xmm0, xmm4 - ABSW xmm1, xmm1, xmm5 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - psubusw xmm4, xmm6, xmm0 - psubusw xmm5, xmm6, xmm1 - packsswb xmm4, xmm5 - movdqa [r6], xmm4 - ADS_END 4 + movu m0, [r1] + movu m1, [r1+mmsize] + psubw m0, m7 + psubw m1, m7 + movu m2, [r3] + movu m3, [r3+mmsize] + ABSW m0, m0, m4 + ABSW m1, m1, m5 + paddusw m0, m2 + paddusw m1, m3 + psubusw m4, m6, m0 + psubusw m5, m6, m1 + packsswb m4, m5 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + mova [r6], m4 + ADS_END mmsize/4 %endmacro INIT_XMM sse2 @@ -3334,6 +5102,8 @@ INIT_XMM ssse3 ADS_XMM INIT_XMM avx ADS_XMM +INIT_YMM avx2 +ADS_XMM ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { @@ -3358,11 +5128,9 @@ ADS_XMM inc r1d %endmacro -INIT_MMX +INIT_MMX mmx cglobal pixel_ads_mvs, 0,7,0 -ads_mvs: - lea r6, [r4+r5+15] - and r6, ~15; +ads_mvs_mmx: ; mvs = r4 ; masks = r6 ; width = 
r5 @@ -3378,11 +5146,11 @@ ALIGN 16 jge .end .loopi: mov r2, [r6+r1] -%ifdef ARCH_X86_64 +%if ARCH_X86_64 test r2, r2 %else mov r3, r2 - or r3d, [r6+r1+4] + add r3d, [r6+r1+4] %endif jz .loopi0 xor r3d, r3d @@ -3390,7 +5158,7 @@ ALIGN 16 TEST 1 TEST 2 TEST 3 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shr r2, 32 %else mov r2d, [r6+r1] @@ -3404,3 +5172,36 @@ ALIGN 16 .end: movifnidn eax, r0d RET + +INIT_XMM ssse3 +cglobal pixel_ads_mvs, 0,7,0 +ads_mvs_ssse3: + mova m3, [pw_8] + mova m4, [pw_76543210] + pxor m5, m5 + add r5, r6 + xor r0d, r0d ; nmv + mov [r5], r0d +%ifdef PIC + lea r1, [$$] + %define GLOBAL +r1-$$ +%else + %define GLOBAL +%endif +.loop: + movh m0, [r6] + pcmpeqb m0, m5 + pmovmskb r2d, m0 + xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions) + movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt + add r2d, r2d + ; shuffle counters based on mv mask + pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL] + movu [r4+r0*2], m2 + add r0d, r3d + paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7} + add r6, 8 + cmp r6, r5 + jl .loop + movifnidn eax, r0d + RET
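
A note on the mode-selection tail of intra_sad_x9_8x8 / intra_sa8d_x9_8x8 above: on CPUs without SSE4's phminposuw, the code packs each saturated SAD/SA8D+bitcost into the upper bits of a word together with a small mode index, so that a single minimum over the packed words yields both the best cost and the mode that produced it. The scalar sketch below only illustrates that packing idea; the function name is hypothetical, and the exact layout in the asm differs (a 2-bit pack followed by a 3-bit repack, with a 0x8000 sign bias so pminsw behaves like an unsigned minimum).

    #include <stdint.h>

    /* Illustrative only: pack the cost into the high bits and the candidate
     * index into the low bits, so comparing packed values compares costs
     * first, and the winning value carries its own index along with it. */
    static uint32_t pick_min_packed( const uint16_t *costs, int n, int index_bits )
    {
        uint32_t best = UINT32_MAX;
        for( int i = 0; i < n; i++ )
        {
            uint32_t v = ((uint32_t)costs[i] << index_bits) | (uint32_t)i;
            if( v < best )
                best = v;
        }
        /* cost = best >> index_bits, index = best & ((1 << index_bits) - 1) */
        return best;
    }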
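
The ssse3 ads_mvs path at the end replaces the scalar bit-testing loop with a table-driven compaction: for each group of 8 mask bytes it builds a bitmask with pcmpeqb/pmovmskb, reads the number of surviving candidates from popcnt_table, and uses pshufb with the precomputed ads_mvs_shuffle entry for that bitmask to move the surviving counters to the front in one shot. The following scalar sketch (hypothetical name, C used only as a reference, per the file's existing pseudocode comments) describes the result it is meant to match: every index whose mask byte is nonzero, i.e. whose ADS cost passed the threshold, is appended to mvs, and the count of survivors is returned.

    #include <stdint.h>

    /* Reference behaviour: compact the indices of nonzero mask bytes into
     * mvs[] and return how many candidates survived (the final nmv). */
    static int ads_mvs_ref( int16_t *mvs, const uint8_t *masks, int width )
    {
        int nmv = 0;
        for( int i = 0; i < width; i++ )
            if( masks[i] )          /* threshold test already encoded in the mask */
                mvs[nmv++] = (int16_t)i;
        return nmv;
    }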