X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpixel-a.asm;h=9791e8665ea635edc8c7a93de77b774cb6ae85b3;hb=7c860f075ccd14fb7891d5fc6c9eab1a37ea555d;hp=13550be290c37fdc32e516e7d403e61a074e190a;hpb=69e69197c424bff9e4b90eb5d608f15b59ca77b4;p=x264 diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 13550be2..9791e866 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1,11 +1,14 @@ ;***************************************************************************** -;* pixel.asm: h264 encoder library +;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2008 x264 project +;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt +;* Holger Lubitz ;* Laurent Aimar ;* Alex Izvorski +;* Fiona Glaser +;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -20,353 +23,1240 @@ ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA -pw_1: times 8 dw 1 -ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 -ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 -mask_ff: times 16 db 0xff - times 16 db 0 -mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1 -mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1 - -SECTION .text - -%macro HADDD 2 ; sum junk -%if mmsize == 16 - movhlps %2, %1 - paddd %1, %2 - pshuflw %2, %1, 0xE - paddd %1, %2 -%else - mova %2, %1 - psrlq %2, 32 - paddd %1, %2 +SECTION_RODATA 32 +hmul_16p: times 16 db 1 + times 8 db 1, -1 +hmul_8p: times 8 db 1 + times 4 db 1, -1 + times 8 db 1 + times 4 db 1, -1 +mask_ff: times 16 db 0xff + times 16 db 0 +mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 +mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 +mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 +%if BIT_DEPTH == 10 +ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 +ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 +pf_64: times 4 dd 64.0 +pf_128: times 4 dd 128.0 +%elif BIT_DEPTH == 9 +ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 +ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 +%else ; 8-bit +ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 +ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif +hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 +mask_10: times 4 dw 0, -1 +mask_1100: times 2 dd 0, -1 +pb_pppm: times 4 db 1,1,1,-1 +deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0 + +intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6 +intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4 +intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0 +intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11 +intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8 +intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9 +intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3 +intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1 +intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1 
+intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0 +pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007 +pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007 +intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15 + +intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6 +intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4 +intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0 +intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11 +intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8 +intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9 +intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3 +intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1 +intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1 +intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 +intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 +intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 + +ALIGN 32 +intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 +intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 +intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 +intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 +intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14 +intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15 +intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11 +intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13 +intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12 +intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9 +intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14 +intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14 +intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12 +intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12 +intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10 +intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8 +intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10 +intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8 +intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2 +intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0 +intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15 +intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15 +pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003 +pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001 + +transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 +transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 + +sw_f0: dq 0xfff0, 0 +pd_f0: times 4 dd 0xffff0000 + +pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 + +ads_mvs_shuffle: +%macro ADS_MVS_SHUFFLE 8 + %assign y x + %rep 8 + %rep 7 + %rotate (~y)&1 + %assign y y>>((~y)&1) + %endrep + db %1*2, %1*2+1 + %rotate 1 + %assign y y>>1 + %endrep %endmacro +%assign x 0 +%rep 256 + ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 
+%assign x x+1 +%endrep -%macro HADDW 2 - pmaddwd %1, [pw_1 GLOBAL] - HADDD %1, %2 -%endmacro +SECTION .text -%macro HADDUW 2 - mova %2, %1 - pslld %1, 16 - psrld %2, 16 - psrld %1, 16 - paddd %1, %2 - HADDD %1, %2 -%endmacro +cextern pb_0 +cextern pb_1 +cextern pw_1 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_00ff +cextern pw_ppppmmmm +cextern pw_ppmmppmm +cextern pw_pmpmpmpm +cextern pw_pmmpzzzz +cextern pd_1 +cextern hsub_mul +cextern popcnt_table ;============================================================================= ; SSD ;============================================================================= -%macro SSD_FULL 6 - mova m1, [r0+%1] - mova m2, [r2+%2] - mova m3, [r0+%3] - mova m4, [r2+%4] - - mova m5, m2 - mova m6, m4 - psubusb m2, m1 - psubusb m4, m3 - psubusb m1, m5 - psubusb m3, m6 - por m1, m2 - por m3, m4 - - mova m2, m1 - mova m4, m3 - punpcklbw m1, m7 - punpcklbw m3, m7 - punpckhbw m2, m7 - punpckhbw m4, m7 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - pmaddwd m4, m4 +%if HIGH_BIT_DEPTH +;----------------------------------------------------------------------------- +; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SSD_ONE 2 +cglobal pixel_ssd_%1x%2, 4,7,6 + FIX_STRIDES r1, r3 +%if mmsize == %1*2 + %define offset0_1 r1 + %define offset0_2 r1*2 + %define offset0_3 r5 + %define offset1_1 r3 + %define offset1_2 r3*2 + %define offset1_3 r6 + lea r5, [3*r1] + lea r6, [3*r3] +%elif mmsize == %1 + %define offset0_1 mmsize + %define offset0_2 r1 + %define offset0_3 r1+mmsize + %define offset1_1 mmsize + %define offset1_2 r3 + %define offset1_3 r3+mmsize +%elif mmsize == %1/2 + %define offset0_1 mmsize + %define offset0_2 mmsize*2 + %define offset0_3 mmsize*3 + %define offset1_1 mmsize + %define offset1_2 mmsize*2 + %define offset1_3 mmsize*3 +%endif + %assign %%n %2/(2*mmsize/%1) +%if %%n > 1 + mov r4d, %%n +%endif + pxor m0, m0 +.loop: + mova m1, [r0] + mova m2, [r0+offset0_1] + mova m3, [r0+offset0_2] + mova m4, [r0+offset0_3] + psubw m1, [r2] + psubw m2, [r2+offset1_1] + psubw m3, [r2+offset1_2] + psubw m4, [r2+offset1_3] +%if %%n > 1 + lea r0, [r0+r1*(%2/%%n)] + lea r2, [r2+r3*(%2/%%n)] +%endif + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 +%if %%n > 1 + dec r4d + jg .loop +%endif + HADDD m0, m5 + movd eax, xm0 + RET +%endmacro -%if %6 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] +INIT_MMX mmx2 +SSD_ONE 4, 4 +SSD_ONE 4, 8 +SSD_ONE 4, 16 +SSD_ONE 8, 4 +SSD_ONE 8, 8 +SSD_ONE 8, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 +INIT_XMM sse2 +SSD_ONE 8, 4 +SSD_ONE 8, 8 +SSD_ONE 8, 16 +SSD_ONE 16, 8 +SSD_ONE 16, 16 +INIT_YMM avx2 +SSD_ONE 16, 8 +SSD_ONE 16, 16 +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +%macro SSD_LOAD_FULL 5 + mova m1, [t0+%1] + mova m2, [t2+%2] + mova m3, [t0+%3] + mova m4, [t2+%4] +%if %5==1 + add t0, t1 + add t2, t3 +%elif %5==2 + lea t0, [t0+2*t1] + lea t2, [t2+2*t3] %endif - paddd m1, m2 - paddd m3, m4 +%endmacro + +%macro LOAD 5 + movh m%1, %3 + movh m%2, %4 %if %5 - paddd m0, m1 -%else - SWAP m0, m1 + lea t0, [t0+2*t1] %endif - paddd m0, m3 %endmacro -%macro SSD_HALF 6 - movh m1, [r0+%1] - movh m2, [r2+%2] - movh m3, [r0+%3] - movh m4, [r2+%4] +%macro JOIN 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] +%endif + punpcklbw m%1, m7 + punpcklbw m%3, m7 + psubw m%1, m%3 + punpcklbw m%2, m7 + punpcklbw m%4, m7 + psubw m%2, m%4 +%endmacro - punpcklbw 
m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - psubw m1, m2 - psubw m3, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 +%macro JOIN_SSE2 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] +%endif + punpcklqdq m%1, m%2 + punpcklqdq m%3, m%4 + DEINTB %2, %1, %4, %3, 7 + psubw m%2, m%4 + psubw m%1, m%3 +%endmacro -%if %6 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] +%macro JOIN_SSSE3 7 + movh m%3, %5 + movh m%4, %6 +%if %7 + lea t2, [t2+2*t3] %endif + punpcklbw m%1, m%3 + punpcklbw m%2, m%4 +%endmacro + +%macro LOAD_AVX2 5 + mova xm%1, %3 + vinserti128 m%1, m%1, %4, 1 %if %5 - paddd m0, m1 -%else - SWAP m0, m1 + lea t0, [t0+2*t1] %endif - paddd m0, m3 %endmacro -%macro SSD_QUARTER 6 - movd m1, [r0+%1] - movd m2, [r2+%2] - movd m3, [r0+%3] - movd m4, [r2+%4] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - pinsrd m1, [r0+%1], 1 - pinsrd m2, [r2+%2], 1 - pinsrd m3, [r0+%3], 1 - pinsrd m4, [r2+%4], 1 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - psubw m1, m2 - psubw m3, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 +%macro JOIN_AVX2 7 + mova xm%2, %5 + vinserti128 m%2, m%2, %6, 1 +%if %7 + lea t2, [t2+2*t3] +%endif + SBUTTERFLY bw, %1, %2, %3 +%endmacro + +%macro SSD_LOAD_HALF 5 + LOAD 1, 2, [t0+%1], [t0+%3], 1 + JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 + LOAD 3, 4, [t0+%1], [t0+%3], %5 + JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 +%endmacro -%if %6 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] +%macro SSD_CORE 7-8 +%ifidn %8, FULL + mova m%6, m%2 + mova m%7, m%4 + psubusb m%2, m%1 + psubusb m%4, m%3 + psubusb m%1, m%6 + psubusb m%3, m%7 + por m%1, m%2 + por m%3, m%4 + punpcklbw m%2, m%1, m%5 + punpckhbw m%1, m%5 + punpcklbw m%4, m%3, m%5 + punpckhbw m%3, m%5 %endif -%if %5 - paddd m0, m1 -%else - SWAP m0, m1 + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_CORE_SSE2 7-8 +%ifidn %8, FULL + DEINTB %6, %1, %7, %2, %5 + psubw m%6, m%7 + psubw m%1, m%2 + SWAP %6, %2, %1 + DEINTB %6, %3, %7, %4, %5 + psubw m%6, m%7 + psubw m%3, m%4 + SWAP %6, %4, %3 +%endif + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_CORE_SSSE3 7-8 +%ifidn %8, FULL + punpckhbw m%6, m%1, m%2 + punpckhbw m%7, m%3, m%4 + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + SWAP %6, %2, %3 + SWAP %7, %4 %endif + pmaddubsw m%1, m%5 + pmaddubsw m%2, m%5 + pmaddubsw m%3, m%5 + pmaddubsw m%4, m%5 + pmaddwd m%1, m%1 + pmaddwd m%2, m%2 + pmaddwd m%3, m%3 + pmaddwd m%4, m%4 +%endmacro + +%macro SSD_ITER 6 + SSD_LOAD_%1 %2,%3,%4,%5,%6 + SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 paddd m0, m3 %endmacro ;----------------------------------------------------------------------------- -; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ) +; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -%macro SSD 3 -cglobal x264_pixel_ssd_%1x%2_%3, 4,4 -%if %1 >= mmsize +%macro SSD 2 +%if %1 != %2 + %assign function_align 8 +%else + %assign function_align 16 +%endif +cglobal pixel_ssd_%1x%2, 0,0,0 + mov al, %1*%2/mmsize/2 + +%if %1 != %2 + jmp mangle(x264_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop) +%else + +.startloop: +%if ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3 + PROLOGUE 0,0,8 +%else + PROLOGUE 0,5 + DECLARE_REG_TMP 1,2,3,4 + mov t0, r0m + mov t1, r1m + mov t2, r2m + mov t3, r3m +%endif + +%if cpuflag(ssse3) + mova m7, [hsub_mul] +%elifidn cpuname, sse2 + mova m7, [pw_00ff] +%elif %1 >= mmsize pxor m7, m7 %endif 
-%assign i 0 -%rep %2/2 + pxor m0, m0 + +ALIGN 16 +.loop: %if %1 > mmsize - SSD_FULL 0, 0, mmsize, mmsize, i, 0 - SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1 + SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 %elif %1 == mmsize - SSD_FULL 0, 0, r1, r3, i, i<%2/2-1 + SSD_ITER FULL, 0, 0, t1, t3, 2 %else - SSD_HALF 0, 0, r1, r3, i, i<%2/2-1 + SSD_ITER HALF, 0, 0, t1, t3, 2 %endif -%assign i i+1 -%endrep + dec al + jg .loop +%if mmsize==32 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + HADDD xm0, xm1 + movd eax, xm0 +%else HADDD m0, m1 movd eax, m0 +%endif RET +%endif %endmacro -INIT_MMX -SSD 16, 16, mmx -SSD 16, 8, mmx -SSD 8, 16, mmx -SSD 8, 8, mmx -SSD 8, 4, mmx -SSD 4, 8, mmx -SSD 4, 4, mmx -INIT_XMM -SSD 16, 16, sse2 -SSD 16, 8, sse2 -SSD 8, 16, sse2 -SSD 8, 8, sse2 -SSD 8, 4, sse2 - -cglobal x264_pixel_ssd_4x8_sse4, 4,4 - SSD_QUARTER 0, 0, r1, r3, 0, 1 - SSD_QUARTER 0, 0, r1, r3, 1, 0 - HADDD m0, m1 - movd eax, m0 - RET +INIT_MMX mmx +SSD 16, 16 +SSD 16, 8 +SSD 8, 8 +SSD 8, 16 +SSD 4, 4 +SSD 8, 4 +SSD 4, 8 +SSD 4, 16 +INIT_XMM sse2slow +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +INIT_XMM sse2 +%define SSD_CORE SSD_CORE_SSE2 +%define JOIN JOIN_SSE2 +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +INIT_XMM ssse3 +%define SSD_CORE SSD_CORE_SSSE3 +%define JOIN JOIN_SSSE3 +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +INIT_XMM avx +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +INIT_MMX ssse3 +SSD 4, 4 +SSD 4, 8 +SSD 4, 16 +INIT_XMM xop +SSD 16, 16 +SSD 8, 8 +SSD 16, 8 +SSD 8, 16 +SSD 8, 4 +%define LOAD LOAD_AVX2 +%define JOIN JOIN_AVX2 +INIT_YMM avx2 +SSD 16, 16 +SSD 16, 8 +%assign function_align 16 +%endif ; !HIGH_BIT_DEPTH -cglobal x264_pixel_ssd_4x4_sse4, 4,4 - SSD_QUARTER 0, 0, r1, r3, 0, 0 - HADDD m0, m1 - movd eax, m0 +;----------------------------------------------------------------------------- +; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2, +; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) +; +; The maximum width this function can handle without risk of overflow is given +; in the following equation: (mmsize in bits) +; +; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2 +; +; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane +; distortion levels it will take much more than that though. 
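+;
+; A quick sanity check of those numbers: with 10-bit samples, 2^BIT_DEPTH - 1
+; is 1023, so for MMX (mmsize = 64 bits) the bound works out to
+; 2 * 64/32 * (2^32 - 1) / 1023^2 = 4 * 4294967295 / 1046529 ~= 16416, and
+; doubling the register width to 128 bits (XMM) doubles that to ~32832.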
+;----------------------------------------------------------------------------- +%if HIGH_BIT_DEPTH +%macro SSD_NV12 0 +cglobal pixel_ssd_nv12_core, 6,7,7 + shl r4d, 2 + FIX_STRIDES r1, r3 + add r0, r4 + add r2, r4 + xor r6, r6 + pxor m4, m4 + pxor m5, m5 + pxor m6, m6 +.loopy: + mov r6, r4 + neg r6 + pxor m2, m2 + pxor m3, m3 +.loopx: + mova m0, [r0+r6] + mova m1, [r0+r6+mmsize] + psubw m0, [r2+r6] + psubw m1, [r2+r6+mmsize] + PSHUFLW m0, m0, q3120 + PSHUFLW m1, m1, q3120 +%if mmsize >= 16 + pshufhw m0, m0, q3120 + pshufhw m1, m1, q3120 +%endif + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + add r6, 2*mmsize + jl .loopx +%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled + jz .no_overread + psubd m3, m1 +.no_overread: +%endif +%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the + ; equation above, putting the width limit at 8208 + punpckhdq m0, m2, m6 + punpckhdq m1, m3, m6 + punpckldq m2, m6 + punpckldq m3, m6 + paddq m3, m2 + paddq m1, m0 + paddq m4, m3 + paddq m4, m1 +%else ; unfortunately paddq is sse2 + ; emulate 48 bit precision for mmx2 instead + mova m0, m2 + mova m1, m3 + punpcklwd m2, m6 + punpcklwd m3, m6 + punpckhwd m0, m6 + punpckhwd m1, m6 + paddd m3, m2 + paddd m1, m0 + paddd m4, m3 + paddd m5, m1 +%endif + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + mov r3, r6m + mov r4, r7m +%if mmsize == 32 + vextracti128 xm0, m4, 1 + paddq xm4, xm0 +%endif +%if mmsize >= 16 + movq [r3], xm4 + movhps [r4], xm4 +%else ; fixup for mmx2 + SBUTTERFLY dq, 4, 5, 0 + mova m0, m4 + psrld m4, 16 + paddd m5, m4 + pslld m0, 16 + SBUTTERFLY dq, 0, 5, 4 + psrlq m0, 16 + psrlq m5, 16 + movq [r3], m0 + movq [r4], m5 +%endif RET +%endmacro ; SSD_NV12 +%endif ; HIGH_BIT_DEPTH +%if HIGH_BIT_DEPTH == 0 +;----------------------------------------------------------------------------- +; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2, +; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) +; +; This implementation can potentially overflow on image widths >= 11008 (or +; 6604 if interlaced), since it is called on blocks of height up to 12 (resp +; 20). At sane distortion levels it will take much more than that though. 
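+;
+; Both limits correspond to roughly the same amount of accumulation:
+; 11008 * 12 ~= 6604 * 20, i.e. the interlaced case simply spreads the same
+; worst-case sum over 20 rows instead of 12, hence its lower width limit.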
+;----------------------------------------------------------------------------- +%macro SSD_NV12 0 +cglobal pixel_ssd_nv12_core, 6,7 + add r4d, r4d + add r0, r4 + add r2, r4 + pxor m3, m3 + pxor m4, m4 + mova m5, [pw_00ff] +.loopy: + mov r6, r4 + neg r6 +.loopx: +%if mmsize == 32 ; only 16-byte alignment is guaranteed + movu m2, [r0+r6] + movu m1, [r2+r6] +%else + mova m2, [r0+r6] + mova m1, [r2+r6] +%endif + psubusb m0, m2, m1 + psubusb m1, m2 + por m0, m1 + psrlw m2, m0, 8 + pand m0, m5 + pmaddwd m2, m2 + pmaddwd m0, m0 + paddd m3, m0 + paddd m4, m2 + add r6, mmsize + jl .loopx +%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled + jz .no_overread + pcmpeqb xm1, xm1 + pandn m0, m1, m0 ; zero the lower half + pandn m2, m1, m2 + psubd m3, m0 + psubd m4, m2 +.no_overread: +%endif + add r0, r1 + add r2, r3 + dec r5d + jg .loopy + mov r3, r6m + mov r4, r7m + HADDD m3, m0 + HADDD m4, m0 + pxor xm0, xm0 + punpckldq xm3, xm0 + punpckldq xm4, xm0 + movq [r3], xm3 + movq [r4], xm4 + RET +%endmacro ; SSD_NV12 +%endif ; !HIGH_BIT_DEPTH + +INIT_MMX mmx2 +SSD_NV12 +INIT_XMM sse2 +SSD_NV12 +INIT_XMM avx +SSD_NV12 +INIT_YMM avx2 +SSD_NV12 ;============================================================================= ; variance ;============================================================================= -%macro VAR_START 0 +%macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared +%if HIGH_BIT_DEPTH == 0 +%if %1 + mova m7, [pw_00ff] +%elif mmsize < 32 pxor m7, m7 ; zero -%ifdef ARCH_X86_64 - %define t3d r3d -%else - %define t3d r2d %endif +%endif ; !HIGH_BIT_DEPTH %endmacro -%macro VAR_END 1 -%if mmsize == 16 - movhlps m0, m5 - paddw m5, m0 +%macro VAR_END 2 +%if HIGH_BIT_DEPTH +%if mmsize == 8 && %1*%2 == 256 + HADDUW m5, m2 +%else + HADDW m5, m2 %endif - movifnidn r2d, r2m - movd r1d, m5 - movd [r2], m5 ; return sum - imul r1d, r1d +%else ; !HIGH_BIT_DEPTH + HADDW m5, m2 +%endif ; HIGH_BIT_DEPTH HADDD m6, m1 - shr r1d, %1 - movd eax, m6 - sub eax, r1d ; sqr - (sum * sum >> shift) +%if ARCH_X86_64 + punpckldq m5, m6 + movq rax, m5 +%else + movd eax, m5 + movd edx, m6 +%endif RET %endmacro +%macro VAR_CORE 0 + paddw m5, m0 + paddw m5, m3 + paddw m5, m1 + paddw m5, m4 + pmaddwd m0, m0 + pmaddwd m3, m3 + pmaddwd m1, m1 + pmaddwd m4, m4 + paddd m6, m0 + paddd m6, m3 + paddd m6, m1 + paddd m6, m4 +%endmacro + %macro VAR_2ROW 2 - mov t3d, %2 + mov r2d, %2 .loop: +%if HIGH_BIT_DEPTH mova m0, [r0] - mova m1, m0 + mova m1, [r0+mmsize] + mova m3, [r0+%1] + mova m4, [r0+%1+mmsize] +%else ; !HIGH_BIT_DEPTH + mova m0, [r0] + punpckhbw m1, m0, m7 mova m3, [r0+%1] - mova m2, m0 - punpcklbw m0, m7 mova m4, m3 - punpckhbw m1, m7 + punpcklbw m0, m7 +%endif ; HIGH_BIT_DEPTH %ifidn %1, r1 lea r0, [r0+%1*2] %else add r0, r1 %endif - punpckhbw m4, m7 - psadbw m2, m7 - paddw m5, m2 - mova m2, m3 +%if HIGH_BIT_DEPTH == 0 punpcklbw m3, m7 - dec t3d - psadbw m2, m7 - pmaddwd m0, m0 - paddw m5, m2 - pmaddwd m1, m1 - paddd m6, m0 - pmaddwd m3, m3 - paddd m6, m1 - pmaddwd m4, m4 - paddd m6, m3 - paddd m6, m4 + punpckhbw m4, m7 +%endif ; !HIGH_BIT_DEPTH + VAR_CORE + dec r2d jg .loop %endmacro ;----------------------------------------------------------------------------- -; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * ) +; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_pixel_var_16x16_mmxext, 2,3 - VAR_START - VAR_2ROW 8, 16 - VAR_END 8 +INIT_MMX mmx2 +cglobal pixel_var_16x16, 2,3 + FIX_STRIDES r1 + 
VAR_START 0 + VAR_2ROW 8*SIZEOF_PIXEL, 16 + VAR_END 16, 16 + +cglobal pixel_var_8x16, 2,3 + FIX_STRIDES r1 + VAR_START 0 + VAR_2ROW r1, 8 + VAR_END 8, 16 -cglobal x264_pixel_var_8x8_mmxext, 2,3 - VAR_START +cglobal pixel_var_8x8, 2,3 + FIX_STRIDES r1 + VAR_START 0 VAR_2ROW r1, 4 - VAR_END 6 + VAR_END 8, 8 -INIT_XMM -cglobal x264_pixel_var_16x16_sse2, 2,3 - VAR_START +%if HIGH_BIT_DEPTH +%macro VAR 0 +cglobal pixel_var_16x16, 2,3,8 + FIX_STRIDES r1 + VAR_START 0 VAR_2ROW r1, 8 - VAR_END 8 + VAR_END 16, 16 -cglobal x264_pixel_var_8x8_sse2, 2,3 - VAR_START - mov t3d, 4 +cglobal pixel_var_8x8, 2,3,8 + lea r2, [r1*3] + VAR_START 0 + mova m0, [r0] + mova m1, [r0+r1*2] + mova m3, [r0+r1*4] + mova m4, [r0+r2*2] + lea r0, [r0+r1*8] + VAR_CORE + mova m0, [r0] + mova m1, [r0+r1*2] + mova m3, [r0+r1*4] + mova m4, [r0+r2*2] + VAR_CORE + VAR_END 8, 8 +%endmacro ; VAR + +INIT_XMM sse2 +VAR +INIT_XMM avx +VAR +INIT_XMM xop +VAR +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +%macro VAR 0 +cglobal pixel_var_16x16, 2,3,8 + VAR_START 1 + mov r2d, 8 .loop: - movh m0, [r0] - movhps m0, [r0+r1] + mova m0, [r0] + mova m3, [r0+r1] + DEINTB 1, 0, 4, 3, 7 lea r0, [r0+r1*2] - mova m1, m0 + VAR_CORE + dec r2d + jg .loop + VAR_END 16, 16 + +cglobal pixel_var_8x8, 2,4,8 + VAR_START 1 + mov r2d, 2 + lea r3, [r1*3] +.loop: + movh m0, [r0] + movh m3, [r0+r1] + movhps m0, [r0+r1*2] + movhps m3, [r0+r3] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + VAR_END 8, 8 + +cglobal pixel_var_8x16, 2,4,8 + VAR_START 1 + mov r2d, 4 + lea r3, [r1*3] +.loop: + movh m0, [r0] + movh m3, [r0+r1] + movhps m0, [r0+r1*2] + movhps m3, [r0+r3] + DEINTB 1, 0, 4, 3, 7 + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + VAR_END 8, 16 +%endmacro ; VAR + +INIT_XMM sse2 +VAR +INIT_XMM avx +VAR +INIT_XMM xop +VAR + +INIT_YMM avx2 +cglobal pixel_var_16x16, 2,4,7 + VAR_START 0 + mov r2d, 4 + lea r3, [r1*3] +.loop: + pmovzxbw m0, [r0] + pmovzxbw m3, [r0+r1] + pmovzxbw m1, [r0+r1*2] + pmovzxbw m4, [r0+r3] + lea r0, [r0+r1*4] + VAR_CORE + dec r2d + jg .loop + vextracti128 xm0, m5, 1 + vextracti128 xm1, m6, 1 + paddw xm5, xm0 + paddd xm6, xm1 + HADDW xm5, xm2 + HADDD xm6, xm1 +%if ARCH_X86_64 + punpckldq xm5, xm6 + movq rax, xm5 +%else + movd eax, xm5 + movd edx, xm6 +%endif + RET +%endif ; !HIGH_BIT_DEPTH + +%macro VAR2_END 3 + HADDW %2, xm1 + movd r1d, %2 + imul r1d, r1d + HADDD %3, xm1 + shr r1d, %1 + movd eax, %3 + movd [r4], %3 + sub eax, r1d ; sqr - (sum * sum >> shift) + RET +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * ) +;----------------------------------------------------------------------------- +%macro VAR2_8x8_MMX 2 +cglobal pixel_var2_8x%1, 5,6 + FIX_STRIDES r1, r3 + VAR_START 0 + mov r5d, %1 +.loop: +%if HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+mmsize] + psubw m0, [r2] + psubw m1, [r2+mmsize] +%else ; !HIGH_BIT_DEPTH + movq m0, [r0] + movq m1, m0 + movq m2, [r2] + movq m3, m2 punpcklbw m0, m7 - mova m2, m1 punpckhbw m1, m7 - dec t3d + punpcklbw m2, m7 + punpckhbw m3, m7 + psubw m0, m2 + psubw m1, m3 +%endif ; HIGH_BIT_DEPTH + paddw m5, m0 + paddw m5, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m6, m0 + paddd m6, m1 + add r0, r1 + add r2, r3 + dec r5d + jg .loop + VAR2_END %2, m5, m6 +%endmacro + +%if ARCH_X86_64 == 0 +INIT_MMX mmx2 +VAR2_8x8_MMX 8, 6 +VAR2_8x8_MMX 16, 7 +%endif + +%macro VAR2_8x8_SSE2 2 +cglobal pixel_var2_8x%1, 5,6,8 + VAR_START 1 + mov r5d, %1/2 +.loop: +%if 
HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r3*2] +%else ; !HIGH_BIT_DEPTH + movq m1, [r0] + movhps m1, [r0+r1] + movq m3, [r2] + movhps m3, [r2+r3] + DEINTB 0, 1, 2, 3, 7 +%endif ; HIGH_BIT_DEPTH + psubw m0, m2 + psubw m1, m3 + paddw m5, m0 + paddw m5, m1 pmaddwd m0, m0 pmaddwd m1, m1 - psadbw m2, m7 + paddd m6, m0 + paddd m6, m1 + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + dec r5d + jg .loop + VAR2_END %2, m5, m6 +%endmacro + +INIT_XMM sse2 +VAR2_8x8_SSE2 8, 6 +VAR2_8x8_SSE2 16, 7 + +%if HIGH_BIT_DEPTH == 0 +%macro VAR2_8x8_SSSE3 2 +cglobal pixel_var2_8x%1, 5,6,8 + pxor m5, m5 ; sum + pxor m6, m6 ; sum squared + mova m7, [hsub_mul] + mov r5d, %1/4 +.loop: + movq m0, [r0] + movq m2, [r2] + movq m1, [r0+r1] + movq m3, [r2+r3] + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m2 + punpcklbw m1, m3 + movq m2, [r0] + movq m3, [r2] + punpcklbw m2, m3 + movq m3, [r0+r1] + movq m4, [r2+r3] + punpcklbw m3, m4 + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + paddw m5, m0 + paddw m5, m1 paddw m5, m2 + paddw m5, m3 + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 paddd m6, m0 paddd m6, m1 - jnz .loop - VAR_END 6 + paddd m6, m2 + paddd m6, m3 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + dec r5d + jg .loop + VAR2_END %2, m5, m6 +%endmacro + +INIT_XMM ssse3 +VAR2_8x8_SSSE3 8, 6 +VAR2_8x8_SSSE3 16, 7 +INIT_XMM xop +VAR2_8x8_SSSE3 8, 6 +VAR2_8x8_SSSE3 16, 7 + +%macro VAR2_8x8_AVX2 2 +cglobal pixel_var2_8x%1, 5,6,6 + pxor m3, m3 ; sum + pxor m4, m4 ; sum squared + mova m5, [hsub_mul] + mov r5d, %1/4 +.loop: + movq xm0, [r0] + movq xm1, [r2] + vinserti128 m0, m0, [r0+r1], 1 + vinserti128 m1, m1, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m0, m1 + movq xm1, [r0] + movq xm2, [r2] + vinserti128 m1, m1, [r0+r1], 1 + vinserti128 m2, m2, [r2+r3], 1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] + punpcklbw m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddw m3, m0 + paddw m3, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m4, m0 + paddd m4, m1 + dec r5d + jg .loop + vextracti128 xm0, m3, 1 + vextracti128 xm1, m4, 1 + paddw xm3, xm0 + paddd xm4, xm1 + VAR2_END %2, xm3, xm4 +%endmacro + +INIT_YMM avx2 +VAR2_8x8_AVX2 8, 6 +VAR2_8x8_AVX2 16, 7 +%endif ; !HIGH_BIT_DEPTH ;============================================================================= ; SATD ;============================================================================= -; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower: -; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1. -; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging, -; whereas phaddw-based transform doesn't care what order the coefs end up in. +%macro JDUP 2 +%if cpuflag(sse4) + ; just use shufps on anything post conroe + shufps %1, %2, 0 +%elif cpuflag(ssse3) && notcpuflag(atom) + ; join 2x 32 bit and duplicate them + ; emulating shufps is faster on conroe + punpcklqdq %1, %2 + movsldup %1, %1 +%else + ; doesn't need to dup. 
sse2 does things by zero extending to words and full h_2d + punpckldq %1, %2 +%endif +%endmacro + +%macro HSUMSUB 5 + pmaddubsw m%2, m%5 + pmaddubsw m%1, m%5 + pmaddubsw m%4, m%5 + pmaddubsw m%3, m%5 +%endmacro + +%macro DIFF_UNPACK_SSE2 5 + punpcklbw m%1, m%5 + punpcklbw m%2, m%5 + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro DIFF_SUMSUB_SSSE3 5 + HSUMSUB %1, %2, %3, %4, %5 + psubw m%1, m%2 + psubw m%3, m%4 +%endmacro + +%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer + movd %1, %3 + movd %2, %4 + JDUP %1, %2 +%endmacro -%macro PHSUMSUB 3 - movdqa m%3, m%1 - phaddw m%1, m%2 - phsubw m%3, m%2 - SWAP %2, %3 +%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer + movddup m%3, %6 + movddup m%4, %8 + movddup m%1, %5 + movddup m%2, %7 %endmacro -%macro HADAMARD4_ROW_PHADD 5 - PHSUMSUB %1, %2, %5 - PHSUMSUB %3, %4, %5 - PHSUMSUB %1, %3, %5 - PHSUMSUB %2, %4, %5 - SWAP %3, %4 +%macro LOAD_DUP_4x8P_PENRYN 8 + ; penryn and nehalem run punpcklqdq and movddup in different units + movh m%3, %6 + movh m%4, %8 + punpcklqdq m%3, m%3 + movddup m%1, %5 + punpcklqdq m%4, m%4 + movddup m%2, %7 %endmacro -%macro HADAMARD4_1D 4 - SUMSUB_BADC %1, %2, %3, %4 - SUMSUB_BADC %1, %3, %2, %4 +%macro LOAD_SUMSUB_8x2P 9 + LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro -%macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block) - %xdefine %%n n%1 - HADAMARD4_1D m4, m5, m6, m7 - TRANSPOSE4x4W 4, 5, 6, 7, %%n - HADAMARD4_1D m4, m5, m6, m7 - ABS2 m4, m5, m3, m %+ %%n - ABS2 m6, m7, m3, m %+ %%n - paddw m6, m4 - paddw m7, m5 - pavgw m6, m7 - SWAP %%n, 6 +%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr + movddup m%1, [%7] + movddup m%2, [%7+8] + mova m%4, [%6] + movddup m%3, m%4 + punpckhqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr + movu m%4, [%7] + mova m%2, [%6] + DEINTB %1, %2, %3, %4, %5 + psubw m%1, m%3 + psubw m%2, m%4 + SUMSUB_BA w, %1, %2, %3 +%endmacro + +%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none +; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] + LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 + LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 + LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 + LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 +%endmacro + +%macro LOAD_SUMSUB_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + vbroadcasti128 m%1, [%6] + vbroadcasti128 m%3, [%7] + vbroadcasti128 m%2, [%8] + vbroadcasti128 m%4, [%9] + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
+ LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 + LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif +%endmacro + +%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer + mova xm%3, %6 + mova xm%4, %8 + mova xm%1, %5 + mova xm%2, %7 + vpermq m%3, m%3, q0011 + vpermq m%4, m%4, q0011 + vpermq m%1, m%1, q0011 + vpermq m%2, m%2, q0011 +%endmacro + +%macro LOAD_SUMSUB8_16x2P_AVX2 9 +; 2*dst, 2*tmp, mul, 4*ptr + LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 +%endmacro + +%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 +; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] + LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] + LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] +%if %10 + lea %8, [%8+4*r1] + lea %9, [%9+4*r3] +%endif %endmacro ; in: r4=3*stride1, r5=3*stride2 @@ -375,44 +1265,63 @@ cglobal x264_pixel_var_8x8_sse2, 2,3 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 - LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2] - LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2] - LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2] - LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2] + %xdefine %%n nn%1 + %assign offset %2*SIZEOF_PIXEL + LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] + LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] + LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset] + LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset] %if %3 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif - HADAMARD4x4_SUM %1 + HADAMARD4_2D 4, 5, 6, 7, 3, %%n + paddw m4, m6 + SWAP %%n, 4 %endmacro -%macro SATD_8x4_SSE2 1 - HADAMARD4_1D m0, m1, m2, m3 -%ifidn %1, ssse3_phadd - HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4 +; in: %1 = horizontal if 0, vertical if 1 +%macro SATD_8x4_SSE 8-9 +%if %1 + HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else - TRANSPOSE2x4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 + HADAMARD4_V %2, %3, %4, %5, %6 + ; doing the abs first is a slight advantage + ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 + ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 + HADAMARD 1, max, %2, %4, %6, %7 +%endif +%ifnidn %9, swap + paddw m%8, m%2 +%else + SWAP %8, %2 +%endif +%if %1 + paddw m%8, m%4 +%else + HADAMARD 1, max, %3, %5, %6, %7 + paddw m%8, m%3 %endif - ABS4 m0, m1, m2, m3, m4, m5 - paddusw m0, m1 - paddusw m2, m3 - paddusw m6, m0 - paddusw m6, m2 %endmacro %macro SATD_START_MMX 0 + FIX_STRIDES r1, r3 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 %endmacro %macro SATD_END_MMX 0 - pshufw m1, m0, 01001110b +%if HIGH_BIT_DEPTH + HADDUW m0, m1 + movd eax, m0 +%else ; !HIGH_BIT_DEPTH + pshufw m1, m0, q1032 paddw m0, m1 - pshufw m1, m0, 10110001b + pshufw m1, m0, q2301 paddw m0, m1 movd eax, m0 and eax, 0xffff +%endif ; HIGH_BIT_DEPTH RET %endmacro @@ -420,10 +1329,10 @@ cglobal x264_pixel_var_8x8_sse2, 2,3 ; for small blocks on x86_32, modify pixel pointer instead. 
;----------------------------------------------------------------------------- -; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_pixel_satd_16x4_internal_mmxext +INIT_MMX mmx2 +cglobal pixel_satd_16x4_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 @@ -434,330 +1343,570 @@ cglobal x264_pixel_satd_16x4_internal_mmxext paddw m0, m1 ret -cglobal x264_pixel_satd_8x8_internal_mmxext +cglobal pixel_satd_8x8_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 1 paddw m0, m2 paddw m0, m1 -x264_pixel_satd_8x4_internal_mmxext: +pixel_satd_8x4_internal_mmx2: SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 paddw m0, m1 ret -cglobal x264_pixel_satd_16x16_mmxext, 4,6 +%if HIGH_BIT_DEPTH +%macro SATD_MxN_MMX 3 +cglobal pixel_satd_%1x%2, 4,7 + SATD_START_MMX + pxor m0, m0 + call pixel_satd_%1x%3_internal_mmx2 + HADDUW m0, m1 + movd r6d, m0 +%rep %2/%3-1 + pxor m0, m0 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_%1x%3_internal_mmx2 + movd m2, r4 + HADDUW m0, m1 + movd r4, m0 + add r6, r4 + movd r4, m2 +%endrep + movifnidn eax, r6d + RET +%endmacro + +SATD_MxN_MMX 16, 16, 4 +SATD_MxN_MMX 16, 8, 4 +SATD_MxN_MMX 8, 16, 8 +%endif ; HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH == 0 +cglobal pixel_satd_16x16, 4,6 SATD_START_MMX pxor m0, m0 %rep 3 - call x264_pixel_satd_16x4_internal_mmxext + call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endrep - call x264_pixel_satd_16x4_internal_mmxext + call pixel_satd_16x4_internal_mmx2 HADDUW m0, m1 movd eax, m0 RET -cglobal x264_pixel_satd_16x8_mmxext, 4,6 +cglobal pixel_satd_16x8, 4,6 SATD_START_MMX pxor m0, m0 - call x264_pixel_satd_16x4_internal_mmxext + call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] - call x264_pixel_satd_16x4_internal_mmxext + call pixel_satd_16x4_internal_mmx2 SATD_END_MMX -cglobal x264_pixel_satd_8x16_mmxext, 4,6 +cglobal pixel_satd_8x16, 4,6 SATD_START_MMX pxor m0, m0 - call x264_pixel_satd_8x8_internal_mmxext + call pixel_satd_8x8_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] - call x264_pixel_satd_8x8_internal_mmxext + call pixel_satd_8x8_internal_mmx2 SATD_END_MMX +%endif ; !HIGH_BIT_DEPTH -cglobal x264_pixel_satd_8x8_mmxext, 4,6 +cglobal pixel_satd_8x8, 4,6 SATD_START_MMX pxor m0, m0 - call x264_pixel_satd_8x8_internal_mmxext + call pixel_satd_8x8_internal_mmx2 SATD_END_MMX -cglobal x264_pixel_satd_8x4_mmxext, 4,6 +cglobal pixel_satd_8x4, 4,6 SATD_START_MMX pxor m0, m0 - call x264_pixel_satd_8x4_internal_mmxext + call pixel_satd_8x4_internal_mmx2 SATD_END_MMX -cglobal x264_pixel_satd_4x8_mmxext, 4,6 +cglobal pixel_satd_4x16, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 + SATD_4x4_MMX m1, 0, 1 + paddw m0, m1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX -%macro SATD_W4 1 -INIT_MMX -cglobal x264_pixel_satd_4x4_%1, 4,6 +cglobal pixel_satd_4x8, 4,6 SATD_START_MMX - SATD_4x4_MMX m0, 0, 0 + SATD_4x4_MMX m0, 0, 1 + SATD_4x4_MMX m1, 0, 0 + paddw m0, m1 SATD_END_MMX -%endmacro -SATD_W4 mmxext +cglobal pixel_satd_4x4, 4,6 + SATD_START_MMX + SATD_4x4_MMX m0, 0, 0 + SATD_END_MMX -%macro SATD_START_SSE2 0 - pxor m6, m6 +%macro SATD_START_SSE2 2-3 0 + FIX_STRIDES r1, r3 +%if HIGH_BIT_DEPTH && %3 + pxor %2, %2 +%elif cpuflag(ssse3) && notcpuflag(atom) +%if mmsize==32 + mova %2, [hmul_16p] +%else + mova %2, [hmul_8p] +%endif +%endif lea r4, [3*r1] lea 
r5, [3*r3] + pxor %1, %1 +%endmacro + +%macro SATD_END_SSE2 1-2 +%if HIGH_BIT_DEPTH + HADDUW %1, xm0 +%if %0 == 2 + paddd %1, %2 +%endif +%else + HADDW %1, xm7 +%endif + movd eax, %1 + RET +%endmacro + +%macro SATD_ACCUM 3 +%if HIGH_BIT_DEPTH + HADDUW %1, %2 + paddd %3, %1 + pxor %1, %1 +%endif %endmacro -%macro SATD_END_SSE2 0 - psrlw m6, 1 - HADDW m6, m7 - movd eax, m6 +%macro BACKUP_POINTERS 0 +%if ARCH_X86_64 +%if WIN64 + PUSH r7 +%endif + mov r6, r0 + mov r7, r2 +%endif +%endmacro + +%macro RESTORE_AND_INC_POINTERS 0 +%if ARCH_X86_64 + lea r0, [r6+8*SIZEOF_PIXEL] + lea r2, [r7+8*SIZEOF_PIXEL] +%if WIN64 + POP r7 +%endif +%else + mov r0, r0mp + mov r2, r2mp + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL +%endif +%endmacro + +%macro SATD_4x8_SSE 3 +%if HIGH_BIT_DEPTH + movh m0, [r0+0*r1] + movh m4, [r2+0*r3] + movh m1, [r0+1*r1] + movh m5, [r2+1*r3] + movhps m0, [r0+4*r1] + movhps m4, [r2+4*r3] + movh m2, [r0+2*r1] + movh m6, [r2+2*r3] + psubw m0, m4 + movh m3, [r0+r4] + movh m4, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + movhps m1, [r0+1*r1] + movhps m5, [r2+1*r3] + movhps m2, [r0+2*r1] + movhps m6, [r2+2*r3] + psubw m1, m5 + movhps m3, [r0+r4] + movhps m4, [r2+r5] + psubw m2, m6 + psubw m3, m4 +%else ; !HIGH_BIT_DEPTH + movd m4, [r2] + movd m5, [r2+r3] + movd m6, [r2+2*r3] + add r2, r5 + movd m0, [r0] + movd m1, [r0+r1] + movd m2, [r0+2*r1] + add r0, r4 + movd m3, [r2+r3] + JDUP m4, m3 + movd m3, [r0+r1] + JDUP m0, m3 + movd m3, [r2+2*r3] + JDUP m5, m3 + movd m3, [r0+2*r1] + JDUP m1, m3 +%if %1==0 && %2==1 + mova m3, [hmul_4p] + DIFFOP 0, 4, 1, 5, 3 +%else + DIFFOP 0, 4, 1, 5, 7 +%endif + movd m5, [r2] + add r2, r5 + movd m3, [r0] + add r0, r4 + movd m4, [r2] + JDUP m6, m4 + movd m4, [r0] + JDUP m2, m4 + movd m4, [r2+r3] + JDUP m5, m4 + movd m4, [r0+r1] + JDUP m3, m4 +%if %1==0 && %2==1 + mova m4, [hmul_4p] + DIFFOP 2, 6, 3, 5, 4 +%else + DIFFOP 2, 6, 3, 5, 7 +%endif +%endif ; HIGH_BIT_DEPTH + SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 +%endmacro + +;----------------------------------------------------------------------------- +; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;----------------------------------------------------------------------------- +%macro SATDS_SSE2 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if vertical==0 || HIGH_BIT_DEPTH +cglobal pixel_satd_4x4, 4, 6, 6 + SATD_START_MMX + mova m4, [hmul_4p] + LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] + LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] + LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] + LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + HADAMARD 0, sumsub, 0, 1, 2, 3 + HADAMARD 4, sumsub, 0, 1, 2, 3 + HADAMARD 1, amax, 0, 1, 2, 3 + HADDW m0, m1 + movd eax, m0 RET -%endmacro +%endif -%macro BACKUP_POINTERS 0 -%ifdef ARCH_X86_64 - mov r10, r0 - mov r11, r2 +cglobal pixel_satd_4x8, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] %endif -%endmacro + SATD_4x8_SSE vertical, 0, swap + HADDW m7, m1 + movd eax, m7 + RET -%macro RESTORE_AND_INC_POINTERS 0 -%ifdef ARCH_X86_64 - lea r0, [r10+8] - lea r2, [r11+8] -%else - mov r0, r0m - mov r2, r2m - add r0, 8 - add r2, 8 +cglobal pixel_satd_4x16, 4, 6, 8 + SATD_START_MMX +%if vertical==0 + mova m7, [hmul_4p] %endif -%endmacro + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0+r1*2*SIZEOF_PIXEL] + lea r2, [r2+r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET -;----------------------------------------------------------------------------- -; int 
x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) -;----------------------------------------------------------------------------- -%macro SATDS_SSE2 1 -INIT_XMM -cglobal x264_pixel_satd_8x8_internal_%1 - LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 - SATD_8x4_SSE2 %1 - lea r0, [r0+4*r1] +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 +%%pixel_satd_8x4_internal: + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 + SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 + ret + +; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) +; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) +%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) +cglobal pixel_satd_16x4_internal + LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] -x264_pixel_satd_8x4_internal_%1: - LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5 -x264_pixel_satd_4x8_internal_%1: - SAVE_MM_PERMUTATION satd_4x8_internal - SATD_8x4_SSE2 %1 + lea r0, [r0+4*r1] + ; always use horizontal mode here + SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 + SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret -cglobal x264_pixel_satd_16x16_%1, 4,6 - SATD_START_SSE2 +cglobal pixel_satd_16x8, 4,6,12 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + jmp %%pixel_satd_16x8_internal + +cglobal pixel_satd_16x16, 4,6,12 + SATD_START_SSE2 m10, m7 +%if vertical + mova m7, [pw_00ff] +%endif + call pixel_satd_16x4_internal + call pixel_satd_16x4_internal +%%pixel_satd_16x8_internal: + call pixel_satd_16x4_internal + call pixel_satd_16x4_internal + SATD_END_SSE2 m10 +%else +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_SSE2 m6, m7 BACKUP_POINTERS - call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - call x264_pixel_satd_8x8_internal_%1 + call pixel_satd_8x8_internal RESTORE_AND_INC_POINTERS - call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + call pixel_satd_8x8_internal + SATD_END_SSE2 m6 -cglobal x264_pixel_satd_16x8_%1, 4,6 - SATD_START_SSE2 +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS - call x264_pixel_satd_8x8_internal_%1 + call pixel_satd_8x8_internal + call pixel_satd_8x8_internal + SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS - call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 + call pixel_satd_8x8_internal + call pixel_satd_8x8_internal + SATD_END_SSE2 m6, m7 +%endif -cglobal x264_pixel_satd_8x16_%1, 4,6 - SATD_START_SSE2 - call x264_pixel_satd_8x8_internal_%1 - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 - -cglobal x264_pixel_satd_8x8_%1, 4,6 - SATD_START_SSE2 - call x264_pixel_satd_8x8_internal_%1 - SATD_END_SSE2 - -cglobal x264_pixel_satd_8x4_%1, 4,6 - SATD_START_SSE2 - call x264_pixel_satd_8x4_internal_%1 - SATD_END_SSE2 - -cglobal x264_pixel_satd_4x8_%1, 4,6 - INIT_XMM - LOAD_MM_PERMUTATION satd_4x8_internal - %define movh movd - SATD_START_SSE2 - LOAD_DIFF m0, m7, m6, [r0], [r2] - LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3] - LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3] - LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5] - lea r0, [r0+4*r1] - lea r2, [r2+4*r3] - LOAD_DIFF m4, m7, m6, [r0], [r2] - LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3] - punpcklqdq m0, m4 - punpcklqdq m1, m5 - LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3] - LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5] - punpcklqdq m2, m4 - punpcklqdq m3, m5 - %define movh movq - 
call x264_pixel_satd_4x8_internal_%1 - SATD_END_SSE2 - -%ifdef ARCH_X86_64 +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_SSE2 m6, m7 + call pixel_satd_8x8_internal + call pixel_satd_8x8_internal + SATD_END_SSE2 m6 + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_SSE2 m6, m7 + call pixel_satd_8x8_internal + SATD_END_SSE2 m6 + +cglobal pixel_satd_8x4, 4,6,8 + SATD_START_SSE2 m6, m7 + call %%pixel_satd_8x4_internal + SATD_END_SSE2 m6 +%endmacro ; SATDS_SSE2 + +%macro SA8D_INTER 0 +%if ARCH_X86_64 + %define lh m10 + %define rh m0 +%else + %define lh m0 + %define rh [esp+48] +%endif +%if HIGH_BIT_DEPTH + HADDUW m0, m1 + paddd lh, rh +%else + paddusw lh, rh +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro SA8D 0 +; sse2 doesn't seem to like the horizontal way of doing things +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) + +%if ARCH_X86_64 ;----------------------------------------------------------------------------- -; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) +; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sa8d_8x8_internal_%1 - lea r10, [r0+4*r1] - lea r11, [r2+4*r3] - LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2 - LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11 - - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - - ABS4 m0, m1, m2, m3, m8, m9 - ABS4 m4, m5, m6, m7, m8, m9 - paddusw m0, m1 - paddusw m2, m3 - paddusw m4, m5 - paddusw m6, m7 - paddusw m0, m2 - paddusw m4, m6 - pavgw m0, m4 +cglobal pixel_sa8d_8x8_internal + lea r6, [r0+4*r1] + lea r7, [r2+4*r3] + LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 + LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 +%if vertical + HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +%else ; non-sse2 + HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 +%endif + paddw m0, m1 + paddw m0, m2 + paddw m0, m8 + SAVE_MM_PERMUTATION ret -cglobal x264_pixel_sa8d_8x8_%1, 4,6 +cglobal pixel_sa8d_8x8, 4,8,12 + FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] - call x264_pixel_sa8d_8x8_internal_%1 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%else HADDW m0, m1 +%endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 - ret + RET -cglobal x264_pixel_sa8d_16x16_%1, 4,6 +cglobal pixel_sa8d_16x16, 4,8,12 + FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] - call x264_pixel_sa8d_8x8_internal_%1 ; pix[0] - add r0, 8 - add r2, 8 +%if vertical == 0 + mova m7, [hmul_8p] +%endif + call pixel_sa8d_8x8_internal ; pix[0] + add r2, 8*SIZEOF_PIXEL + add r0, 8*SIZEOF_PIXEL +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif mova m10, m0 - call x264_pixel_sa8d_8x8_internal_%1 ; pix[8] - lea r0, [r0+8*r1] + call pixel_sa8d_8x8_internal ; pix[8] lea r2, [r2+8*r3] - paddusw m10, m0 - call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8] - sub r0, 8 - sub r2, 8 - paddusw m10, m0 - call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride] - paddusw m0, m10 + lea r0, [r0+8*r1] + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride+8] + sub r2, 8*SIZEOF_PIXEL + sub r0, 8*SIZEOF_PIXEL + SA8D_INTER + call pixel_sa8d_8x8_internal ; pix[8*stride] + SA8D_INTER + SWAP 0, 10 +%if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 +%endif movd eax, m0 add eax, 1 shr eax, 1 - ret + RET %else ; ARCH_X86_32 -cglobal x264_pixel_sa8d_8x8_internal_%1 - LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7 - movdqa [esp+4], m2 - lea r0, 
[r0+4*r1] - lea r2, [r2+4*r3] - LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2 - movdqa m2, [esp+4] - - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20] - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - -%ifidn %1, sse2 - movdqa [esp+4], m4 - movdqa [esp+20], m2 -%endif - ABS2 m6, m3, m4, m2 - ABS2 m0, m7, m4, m2 - paddusw m0, m6 - paddusw m7, m3 -%ifidn %1, sse2 - movdqa m4, [esp+4] - movdqa m2, [esp+20] -%endif - ABS2 m5, m1, m6, m3 - ABS2 m4, m2, m6, m3 - paddusw m5, m1 - paddusw m4, m2 - paddusw m0, m7 - paddusw m5, m4 - pavgw m0, m5 +%if mmsize == 16 +cglobal pixel_sa8d_8x8_internal + %define spill0 [esp+4] + %define spill1 [esp+20] + %define spill2 [esp+36] +%if vertical + LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 + HADAMARD4_2D 0, 1, 2, 3, 4 + movdqa spill0, m3 + LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 + HADAMARD4_2D 4, 5, 6, 7, 3 + HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax + movdqa m3, spill0 + paddw m0, m1 + HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax +%else ; mmsize == 8 + mova m7, [hmul_8p] + LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 + ; could do first HADAMARD4_V here to save spilling later + ; surprisingly, not a win on conroe or even p4 + mova spill0, m2 + mova spill1, m3 + mova spill2, m1 + SWAP 1, 7 + LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 + HADAMARD4_V 4, 5, 6, 7, 3 + mova m1, spill2 + mova m2, spill0 + mova m3, spill1 + mova spill0, m6 + mova spill1, m7 + HADAMARD4_V 0, 1, 2, 3, 7 + SUMSUB_BADC w, 0, 4, 1, 5, 7 + HADAMARD 2, sumsub, 0, 4, 7, 6 + HADAMARD 2, sumsub, 1, 5, 7, 6 + HADAMARD 1, amax, 0, 4, 7, 6 + HADAMARD 1, amax, 1, 5, 7, 6 + mova m6, spill0 + mova m7, spill1 + paddw m0, m1 + SUMSUB_BADC w, 2, 6, 3, 7, 4 + HADAMARD 2, sumsub, 2, 6, 4, 5 + HADAMARD 2, sumsub, 3, 7, 4, 5 + HADAMARD 1, amax, 2, 6, 4, 5 + HADAMARD 1, amax, 3, 7, 4, 5 +%endif ; sse2/non-sse2 + paddw m0, m2 + paddw m0, m3 + SAVE_MM_PERMUTATION ret -%endif ; ARCH -%endmacro ; SATDS_SSE2 - -%macro SA8D_16x16_32 1 -%ifndef ARCH_X86_64 -cglobal x264_pixel_sa8d_8x8_%1, 4,7 - mov r6, esp - and esp, ~15 - sub esp, 32 - lea r4, [3*r1] - lea r5, [3*r3] - call x264_pixel_sa8d_8x8_internal_%1 - HADDW m0, m1 - movd eax, m0 - add eax, 1 - shr eax, 1 - mov esp, r6 +%endif ; ifndef mmx2 + +cglobal pixel_sa8d_8x8, 4,7 + FIX_STRIDES r1, r3 + mov r6, esp + and esp, ~15 + sub esp, 48 + lea r4, [3*r1] + lea r5, [3*r3] + call pixel_sa8d_8x8_internal +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%else + HADDW m0, m1 +%endif ; HIGH_BIT_DEPTH + movd eax, m0 + add eax, 1 + shr eax, 1 + mov esp, r6 RET -cglobal x264_pixel_sa8d_16x16_%1, 4,7 +cglobal pixel_sa8d_16x16, 4,7 + FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 - sub esp, 48 + sub esp, 64 lea r4, [3*r1] lea r5, [3*r3] - call x264_pixel_sa8d_8x8_internal_%1 + call pixel_sa8d_8x8_internal +%if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] - mova [esp+32], m0 - call x264_pixel_sa8d_8x8_internal_%1 +%endif +%if HIGH_BIT_DEPTH + HADDUW m0, m1 +%endif + mova [esp+48], m0 + call pixel_sa8d_8x8_internal mov r0, [r6+20] mov r2, [r6+28] - add r0, 8 - add r2, 8 - paddusw m0, [esp+32] - mova [esp+32], m0 - call x264_pixel_sa8d_8x8_internal_%1 + add r0, 8*SIZEOF_PIXEL + add r2, 8*SIZEOF_PIXEL + SA8D_INTER + mova [esp+48], m0 + call pixel_sa8d_8x8_internal +%if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] -%if mmsize == 16 - paddusw m0, [esp+32] +%else + SA8D_INTER %endif - mova [esp+48-mmsize], m0 - call x264_pixel_sa8d_8x8_internal_%1 - paddusw m0, [esp+48-mmsize] + mova [esp+64-mmsize], m0 + call pixel_sa8d_8x8_internal +%if 
HIGH_BIT_DEPTH + SA8D_INTER +%else ; !HIGH_BIT_DEPTH + paddusw m0, [esp+64-mmsize] %if mmsize == 16 HADDUW m0, m1 %else - mova m2, [esp+32] + mova m2, [esp+48] pxor m7, m7 mova m1, m0 mova m3, m2 @@ -770,27 +1919,200 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,7 paddd m0, m2 HADDD m0, m1 %endif +%endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET %endif ; !ARCH_X86_64 -%endmacro ; SA8D_16x16_32 +%endmacro ; SA8D + +;============================================================================= +; SA8D_SATD +;============================================================================= + +; %1: vertical/horizontal mode +; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) +; m10: satd result +; m6, m11-15: tmp regs +%macro SA8D_SATD_8x4 5 +%if %1 + LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY wd, %2, %3, 6 + SBUTTERFLY wd, %4, %5, 6 + HADAMARD2_2D %2, %4, %3, %5, 6, dq + + mova m12, m%2 + mova m13, m%3 + mova m14, m%4 + mova m15, m%5 + HADAMARD 0, sumsub, %2, %3, 6 + HADAMARD 0, sumsub, %4, %5, 6 + SBUTTERFLY qdq, 12, 13, 6 + HADAMARD 0, amax, 12, 13, 6 + SBUTTERFLY qdq, 14, 15, 6 + paddw m10, m12 + HADAMARD 0, amax, 14, 15, 6 + paddw m10, m14 +%else + LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 + HADAMARD4_V %2, %3, %4, %5, 6 + + pabsw m12, m%2 ; doing the abs first is a slight advantage + pabsw m14, m%4 + pabsw m13, m%3 + pabsw m15, m%5 + HADAMARD 1, max, 12, 14, 6, 11 + paddw m10, m12 + HADAMARD 1, max, 13, 15, 6, 11 + paddw m10, m13 +%endif +%endmacro ; SA8D_SATD_8x4 + +; %1: add spilled regs? +; %2: spill regs? +%macro SA8D_SATD_ACCUM 2 +%if HIGH_BIT_DEPTH + pmaddwd m10, [pw_1] + HADDUWD m0, m1 +%if %1 + paddd m10, temp1 + paddd m0, temp0 +%endif +%if %2 + mova temp1, m10 + pxor m10, m10 +%endif +%elif %1 + paddw m0, temp0 +%endif +%if %2 + mova temp0, m0 +%endif +%endmacro + +%macro SA8D_SATD 0 +%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) +cglobal pixel_sa8d_satd_8x8_internal + SA8D_SATD_8x4 vertical, 0, 1, 2, 3 + SA8D_SATD_8x4 vertical, 4, 5, 8, 9 + +%if vertical ; sse2-style + HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax + HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax +%else ; complete sa8d + SUMSUB_BADC w, 0, 4, 1, 5, 12 + HADAMARD 2, sumsub, 0, 4, 12, 11 + HADAMARD 2, sumsub, 1, 5, 12, 11 + SUMSUB_BADC w, 2, 8, 3, 9, 12 + HADAMARD 2, sumsub, 2, 8, 12, 11 + HADAMARD 2, sumsub, 3, 9, 12, 11 + HADAMARD 1, amax, 0, 4, 12, 11 + HADAMARD 1, amax, 1, 5, 12, 4 + HADAMARD 1, amax, 2, 8, 12, 4 + HADAMARD 1, amax, 3, 9, 12, 4 +%endif + + ; create sa8d sub results + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + + SAVE_MM_PERMUTATION + ret + +;------------------------------------------------------------------------------- +; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) +;------------------------------------------------------------------------------- +cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize + %define temp0 [rsp+0*mmsize] + %define temp1 [rsp+1*mmsize] + FIX_STRIDES r1, r3 +%if vertical==0 + mova m7, [hmul_8p] +%endif + lea r4, [3*r1] + lea r5, [3*r3] + pxor m10, m10 + +%if mmsize==32 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m10, 1 + paddw xm0, xm1 + paddw xm10, xm2 +%else + lea r6, [r2+8*SIZEOF_PIXEL] + lea r7, [r0+8*SIZEOF_PIXEL] + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 0, 1 + call 
pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + mov r0, r7 + mov r2, r6 + + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 1 + call pixel_sa8d_satd_8x8_internal + SA8D_SATD_ACCUM 1, 0 +%endif + +; xop already has fast horizontal sums +%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 + pmaddwd xm10, [pw_1] + HADDUWD xm0, xm1 + phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 + pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 + paddd xm0, xm1 ; sa8d sa8d satd satd + movd r0d, xm0 + pextrd eax, xm0, 2 +%else +%if HIGH_BIT_DEPTH + HADDD xm0, xm1 + HADDD xm10, xm2 +%else + HADDUW xm0, xm1 + HADDW xm10, xm2 +%endif + movd r0d, xm0 + movd eax, xm10 +%endif + add r0d, 1 + shl rax, 32 + shr r0d, 1 + or rax, r0 + RET +%endmacro ; SA8D_SATD ;============================================================================= ; INTRA SATD ;============================================================================= -%macro INTRA_SA8D_SSE2 1 -%ifdef ARCH_X86_64 -INIT_XMM +%macro HSUMSUB2 8 + pshufd %4, %2, %7 + pshufd %5, %3, %7 + %1 %2, %8 + %1 %6, %8 + paddw %2, %4 + paddw %3, %5 +%endmacro + +; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+, +; and are only retained for old cpus. +%macro INTRA_SA8D_SSE2 0 +%if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res ) +; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- -cglobal x264_intra_sa8d_x3_8x8_core_%1 +cglobal intra_sa8d_x3_8x8, 3,3,14 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] @@ -809,51 +2131,49 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1 punpcklbw m5, m8 punpcklbw m6, m8 punpcklbw m7, m8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 - HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 - ; dc - movzx edi, word [r1+0] - add di, word [r1+16] - add edi, 8 - and edi, -16 - shl edi, 2 - - pxor m15, m15 - movdqa m8, m2 - movdqa m9, m3 - movdqa m10, m4 - movdqa m11, m5 - ABS4 m8, m9, m10, m11, m12, m13 + HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 + + ABSW2 m8, m9, m2, m3, m2, m3 + ABSW2 m10, m11, m4, m5, m4, m5 paddusw m8, m10 paddusw m9, m11 -%ifidn %1, ssse3 - pabsw m10, m6 - pabsw m11, m7 - pabsw m15, m1 -%else - movdqa m10, m6 - movdqa m11, m7 - movdqa m15, m1 - ABS2 m10, m11, m13, m14 - ABS1 m15, m13 -%endif + ABSW2 m10, m11, m6, m7, m6, m7 + ABSW m13, m1, m1 paddusw m10, m11 paddusw m8, m9 - paddusw m15, m10 - paddusw m15, m8 - movdqa m14, m15 ; 7x8 sum - - movdqa m8, [r1+0] ; left edge - movd m9, edi - psllw m8, 3 + paddusw m13, m10 + paddusw m13, m8 + + ; 1D hadamard of edges + movq m8, [r1+7] + movq m9, [r1+16] + pxor m10, m10 + punpcklbw m8, m10 + punpcklbw m9, m10 + HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm] + HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm] + pshuflw m10, m8, q2301 + pshuflw m11, m9, q2301 + pshufhw m10, m10, q2301 + pshufhw m11, m11, q2301 + pmullw m8, [pw_pmpmpmpm] + pmullw m11, [pw_pmpmpmpm] + paddw m8, m10 + paddw m9, m11 + + ; differences + paddw m10, m8, m9 + paddw m10, [pw_8] + pand m10, [sw_f0] + psllw m10, 2 ; dc + + psllw m8, 3 ; left edge psubw m8, m0 - psubw m9, m0 - ABS1 m8, m10 - ABS1 m9, m11 ; 1x8 sum - paddusw m14, m8 - paddusw m15, m9 + psubw m10, m0 + ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum + paddusw m8, m13 + paddusw m13, m10 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 @@ 
-861,43 +2181,45 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1 punpckldq m0, m2 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose - movdqa m1, [r1+16] ; top edge - movdqa m2, m15 - psllw m1, 3 - psrldq m2, 2 ; 8x7 sum - psubw m0, m1 ; 8x1 sum - ABS1 m0, m1 + psllw m9, 3 ; top edge + psrldq m2, m13, 2 ; 8x7 sum + psubw m0, m9 ; 8x1 sum + ABSW m0, m0, m9 paddusw m2, m0 ; 3x HADDW - movdqa m7, [pw_1 GLOBAL] + movdqa m7, [pw_1] pmaddwd m2, m7 - pmaddwd m14, m7 - pmaddwd m15, m7 - movdqa m3, m2 - punpckldq m2, m14 - punpckhdq m3, m14 - pshufd m5, m15, 0xf5 + pmaddwd m8, m7 + pmaddwd m13, m7 + punpckhdq m3, m2, m8 + punpckldq m2, m8 + pshufd m5, m13, q3311 paddd m2, m3 - paddd m5, m15 - movdqa m3, m2 + paddd m5, m13 + punpckhqdq m0, m2, m5 punpcklqdq m2, m5 - punpckhqdq m3, m5 - pavgw m3, m2 - pxor m0, m0 - pavgw m3, m0 - movq [r2], m3 ; i8x8_v, i8x8_h - psrldq m3, 8 - movd [r2+8], m3 ; i8x8_dc - ret + pavgw m0, m2 + pxor m1, m1 + pavgw m0, m1 + movq [r2], m0 ; i8x8_v, i8x8_h + psrldq m0, 8 + movd [r2+8], m0 ; i8x8_dc + RET %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 ; in: r0 = fenc ; out: m0..m3 = hadamard coefs INIT_MMX -ALIGN 16 -load_hadamard: +cglobal hadamard_load +; not really a global, but otherwise cycles get attributed to the wrong function in profiling +%if HIGH_BIT_DEPTH + mova m0, [r0+0*FENC_STRIDEB] + mova m1, [r0+1*FENC_STRIDEB] + mova m2, [r0+2*FENC_STRIDEB] + mova m3, [r0+3*FENC_STRIDEB] +%else pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] @@ -907,155 +2229,125 @@ load_hadamard: punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 - SAVE_MM_PERMUTATION load_hadamard +%endif + HADAMARD4_2D 0, 1, 2, 3, 4 + SAVE_MM_PERMUTATION ret -%macro SCALAR_SUMSUB 4 - add %1, %2 - add %3, %4 - add %2, %2 - add %4, %4 - sub %2, %1 - sub %4, %3 -%endmacro - -%macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp -%ifnidn %1, 0 - shl %1d, 5 ; log(FDEC_STRIDE) -%endif - movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE] - movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE] - movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE] - movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE] -%ifnidn %1, 0 - shr %1d, 5 -%endif - SCALAR_SUMSUB %2d, %3d, %4d, %5d - SCALAR_SUMSUB %2d, %4d, %3d, %5d - mov [left_1d+2*%1+0], %2w - mov [left_1d+2*%1+2], %3w - mov [left_1d+2*%1+4], %4w - mov [left_1d+2*%1+6], %5w -%endmacro - -%macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp - movzx %2d, byte [r1+%1-FDEC_STRIDE+0] - movzx %3d, byte [r1+%1-FDEC_STRIDE+1] - movzx %4d, byte [r1+%1-FDEC_STRIDE+2] - movzx %5d, byte [r1+%1-FDEC_STRIDE+3] - SCALAR_SUMSUB %2d, %3d, %4d, %5d - SCALAR_SUMSUB %2d, %4d, %3d, %5d - mov [top_1d+2*%1+0], %2w - mov [top_1d+2*%1+2], %3w - mov [top_1d+2*%1+4], %4w - mov [top_1d+2*%1+6], %5w +%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp +%ifidn %1, top +%if HIGH_BIT_DEPTH + mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] +%else + movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] + pxor %5, %5 + punpcklbw %3, %5 +%endif +%else ; left +%ifnidn %2, 0 + shl %2d, 5 ; log(FDEC_STRIDEB) +%endif + movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB] + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2 + pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3 +%if HIGH_BIT_DEPTH == 0 + psrlw %3, 8 +%endif +%ifnidn %2, 0 + shr %2d, 5 +%endif +%endif ; direction +%if cpuflag(ssse3) + %define %%sign psignw +%else + %define %%sign pmullw +%endif + pshufw %4, %3, q1032 + %%sign %4, [pw_ppmmppmm] + paddw %3, %4 + pshufw %4, %3, q2301 + 
%%sign %4, [pw_pmpmpmpm] + paddw %3, %4 + psllw %3, 2 + mova [%1_1d+2*%2], %3 %endmacro %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op pxor %7, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b + pshufw %4, %1, q1032 + pshufw %5, %2, q1032 + pshufw %6, %3, q1032 paddw %1, %4 paddw %2, %5 paddw %3, %6 punpcklwd %1, %7 punpcklwd %2, %7 punpcklwd %3, %7 - pshufw %4, %1, 01001110b - pshufw %5, %2, 01001110b - pshufw %6, %3, 01001110b + pshufw %4, %1, q1032 + pshufw %5, %2, q1032 + pshufw %6, %3, q1032 %8 %1, %4 %8 %2, %5 %8 %3, %6 %endmacro -%macro CLEAR_SUMS 0 -%ifdef ARCH_X86_64 - mov qword [sums+0], 0 - mov qword [sums+8], 0 - mov qword [sums+16], 0 -%else - pxor m7, m7 - movq [sums+0], m7 - movq [sums+8], m7 - movq [sums+16], m7 -%endif -%endmacro - ; in: m1..m3 ; out: m7 ; clobber: m4..m6 -%macro SUM3x4 1 -%ifidn %1, ssse3 - pabsw m4, m1 - pabsw m5, m2 - pabsw m7, m3 - paddw m4, m5 -%else - movq m4, m1 - movq m5, m2 - ABS2 m4, m5, m6, m7 - movq m7, m3 +%macro SUM3x4 0 + ABSW2 m4, m5, m1, m2, m1, m2 + ABSW m7, m3, m3 paddw m4, m5 - ABS1 m7, m6 -%endif paddw m7, m4 %endmacro -; in: m0..m3 (4x4), m7 (3x4) +; in: m0..m3 (4x4) ; out: m0 v, m4 h, m5 dc -; clobber: m6 +; clobber: m1..m3 %macro SUM4x3 3 ; dc, left, top movq m4, %2 +%ifid %1 + movq m5, %1 +%else movd m5, %1 - psllw m4, 2 +%endif psubw m4, m0 psubw m5, m0 punpcklwd m0, m1 punpcklwd m2, m3 punpckldq m0, m2 ; transpose - movq m1, %3 - psllw m1, 2 - psubw m0, m1 - ABS2 m4, m5, m2, m3 ; 1x4 sum - ABS1 m0, m1 ; 4x1 sum + psubw m0, %3 + ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum + ABSW m0, m0, m1 ; 4x1 sum %endmacro -%macro INTRA_SATDS_MMX 1 -INIT_MMX +%macro INTRA_X3_MMX 0 ;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_4x4_%1, 2,6 -%ifdef ARCH_X86_64 +cglobal intra_satd_x3_4x4, 3,3 +%if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 - %define t0 r10 - %define t0d r10d -%else - ; stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned - SUB esp, 16 - %define top_1d esp+8 - %define left_1d esp - %define t0 r2 - %define t0d r2d -%endif - - call load_hadamard - SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5 - mov t0d, r0d - SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5 - lea t0d, [t0d + r0d + 4] - and t0d, -8 - shl t0d, 1 ; dc - - SUM3x4 %1 - SUM4x3 t0d, [left_1d], [top_1d] +%else + ; WIN64: stack is 16 byte aligned because abi says so + ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned + SUB rsp, 16 + %define top_1d rsp+8 + %define left_1d rsp +%endif + + call hadamard_load + SCALAR_HADAMARD left, 0, m4, m5 + SCALAR_HADAMARD top, 0, m6, m5, m7 + paddw m6, m4 + pavgw m6, [pw_16] + pand m6, [sw_f0] ; dc + + SUM3x4 + SUM4x3 m6, [left_1d], [top_1d] paddw m4, m7 paddw m5, m7 movq m1, m5 @@ -1063,161 +2355,191 @@ cglobal x264_intra_satd_x3_4x4_%1, 2,6 paddw m0, m1 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw -%ifndef ARCH_X86_64 - mov r2, r2m -%endif movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd -%ifndef ARCH_X86_64 - ADD esp, 16 +%if UNIX64 == 0 + ADD rsp, 16 %endif RET -%ifdef ARCH_X86_64 - %define t0 r10 - %define t0d r10d - 
%define t2 r11 - %define t2w r11w - %define t2d r11d -%else - %define t0 r0 - %define t0d r0d - %define t2 r2 - %define t2w r2w - %define t2d r2d -%endif - ;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_16x16_%1, 0,7 -%ifdef ARCH_X86_64 - %assign stack_pad 88 -%else - %assign stack_pad 88 + ((stack_offset+88+4)&15) -%endif +cglobal intra_satd_x3_16x16, 0,5 + %assign stack_pad 120 + ((stack_offset+120+gprsize)&15) ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad -%define sums rsp+64 ; size 24 +%define sums rsp+64 ; size 56 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 - movifnidn r1d, r1m - CLEAR_SUMS + movifnidn r1, r1mp + + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%if HIGH_BIT_DEPTH + mova [sums+24], m7 + mova [sums+32], m7 + mova [sums+40], m7 + mova [sums+48], m7 +%endif ; 1D hadamards - xor t2d, t2d - mov t0d, 12 + mov r3d, 12 + movd m6, [pw_32] .loop_edge: - SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6 - add t2d, r3d - SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6 - add t2d, r3d - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m1, m2, m3 + pavgw m0, m1 + paddw m6, m0 + sub r3d, 4 jge .loop_edge - shr t2d, 1 - add t2d, 8 - and t2d, -16 ; dc + psrlw m6, 2 + pand m6, [sw_f0] ; dc ; 2D hadamards - movifnidn r0d, r0m - xor r3d, r3d + movifnidn r0, r0mp + mov r3, -4 .loop_y: - xor r4d, r4d + mov r4, -4 .loop_x: - call load_hadamard + call hadamard_load - SUM3x4 %1 - SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4] + SUM3x4 + SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)] pavgw m4, m7 pavgw m5, m7 - paddw m0, [sums+0] ; i16x16_v satd - paddw m4, [sums+8] ; i16x16_h satd + paddw m0, [sums+ 0] ; i16x16_v satd + paddw m4, [sums+ 8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd - movq [sums+0], m0 - movq [sums+8], m4 - movq [sums+16], m5 + mova [sums+ 0], m0 + mova [sums+ 8], m4 + mova [sums+16], m5 - add r0, 4 - inc r4d - cmp r4d, 4 + add r0, 4*SIZEOF_PIXEL + inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-16 - inc r3d - cmp r3d, 4 +%if HIGH_BIT_DEPTH + psrld m7, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m7 + psrld m7, m0, 16 + pslld m0, 16 + psrld m0, 16 + paddd m0, m7 + paddd m4, [sums+32] + paddd m0, [sums+24] + mova [sums+32], m4 + mova [sums+24], m0 + pxor m7, m7 + punpckhwd m3, m5, m7 + punpcklwd m5, m7 + paddd m3, [sums+48] + paddd m5, [sums+40] + mova [sums+48], m3 + mova [sums+40], m5 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 +%endif + add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL + inc r3 jl .loop_y ; horizontal sum - movifnidn r2d, r2m - movq m2, [sums+16] - movq m1, [sums+8] - movq m0, [sums+0] - movq m7, m2 - SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd + movifnidn r2, r2mp +%if HIGH_BIT_DEPTH + mova m1, m5 + paddd m5, m3 + HADDD m5, m7 ; DC satd + HADDD m4, m7 ; H satd + HADDD m0, m7 ; the part of V satd that doesn't overlap with DC + psrld m0, 1 + psrlq m1, 32 ; DC[1] + paddd m0, m3 ; DC[2] + psrlq m3, 32 ; DC[3] + paddd m0, m1 + paddd m0, m3 +%else + mova m7, m5 + SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 - paddd m0, m2 + paddd m0, m5 psubd m0, m7 - movd [r2+8], m2 ; i16x16_dc satd - movd 
[r2+4], m1 ; i16x16_h satd - movd [r2+0], m0 ; i16x16_v satd - ADD rsp, stack_pad +%endif + movd [r2+8], m5 ; i16x16_dc satd + movd [r2+4], m4 ; i16x16_h satd + movd [r2+0], m0 ; i16x16_v satd + ADD rsp, stack_pad RET +%if ARCH_X86_64 + %define t0 r6 +%else + %define t0 r2 +%endif + ;----------------------------------------------------------------------------- -; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- -cglobal x264_intra_satd_x3_8x8c_%1, 0,6 +cglobal intra_satd_x3_8x8c, 0,6 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, 72 %define sums rsp+48 ; size 24 %define dc_1d rsp+32 ; size 16 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 - movifnidn r1d, r1m - CLEAR_SUMS + movifnidn r1, r1mp + pxor m7, m7 + mova [sums+ 0], m7 + mova [sums+ 8], m7 + mova [sums+16], m7 ; 1D hadamards - mov t0d, 4 + mov r3d, 4 .loop_edge: - SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5 - SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5 - sub t0d, 4 + SCALAR_HADAMARD left, r3, m0, m1 + SCALAR_HADAMARD top, r3, m0, m1, m2 + sub r3d, 4 jge .loop_edge ; dc - movzx t2d, word [left_1d+0] + movzx t0d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word [top_1d+8] - add t2d, r3d - lea r3, [r4 + r5] - lea t2, [2*t2 + 8] - lea r3, [2*r3 + 8] - lea r4, [4*r4 + 8] - lea r5, [4*r5 + 8] - and t2d, -16 ; tl + lea t0d, [t0 + r3 + 16] + lea r3d, [r4 + r5 + 16] + shr t0d, 1 + shr r3d, 1 + add r4d, 8 + add r5d, 8 + and t0d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr - mov [dc_1d+ 0], t2d ; tl + mov [dc_1d+ 0], t0d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br lea r5, [dc_1d] ; 2D hadamards - movifnidn r0d, r0m - movifnidn r2d, r2m - xor r3d, r3d + movifnidn r0, r0mp + movifnidn r2, r2mp + mov r3, -2 .loop_y: - xor r4d, r4d + mov r4, -2 .loop_x: - call load_hadamard + call hadamard_load - SUM3x4 %1 - SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4] + SUM3x4 + SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)] pavgw m4, m7 pavgw m5, m7 paddw m0, [sums+16] ; i4x4_v satd @@ -1227,14 +2549,12 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6 movq [sums+8], m4 movq [sums+0], m5 - add r0, 4 - inc r4d - cmp r4d, 2 + add r0, 4*SIZEOF_PIXEL + inc r4 jl .loop_x - add r0, 4*FENC_STRIDE-8 + add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL add r5, 8 - inc r3d - cmp r3d, 2 + inc r3 jl .loop_y ; horizontal sum @@ -1242,33 +2562,1179 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6 movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 +%if HIGH_BIT_DEPTH + psrlq m7, 16 + HADDW m7, m3 + SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd + psrld m2, 1 + paddd m2, m7 +%else psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 +%endif movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd ADD rsp, 72 RET -%endmacro ; INTRA_SATDS_MMX +%endmacro ; INTRA_X3_MMX -%macro ABS_MOV_SSSE3 2 - pabsw %1, %2 -%endmacro -%macro ABS_MOV_MMX 2 - pxor %1, %1 - psubw %1, %2 - pmaxsw %1, %2 +%macro PRED4x4_LOWPASS 5 +%ifid %5 + pavgb %5, %2, %3 + pxor %3, %2 + pand %3, [pb_1] + psubusb %5, %3 + pavgb %1, %4, %5 +%else + mova %5, %2 + pavgb %2, %3 + pxor %3, %5 + pand %3, [pb_1] + psubusb %2, %3 + pavgb %1, %4, %2 +%endif %endmacro -%define ABS_MOV ABS_MOV_MMX +%macro INTRA_X9_PRED 2 +%if 
cpuflag(sse4) + movu m1, [r1-1*FDEC_STRIDE-8] + pinsrb m1, [r1+3*FDEC_STRIDE-1], 0 + pinsrb m1, [r1+2*FDEC_STRIDE-1], 1 + pinsrb m1, [r1+1*FDEC_STRIDE-1], 2 + pinsrb m1, [r1+0*FDEC_STRIDE-1], 3 +%else + movd mm0, [r1+3*FDEC_STRIDE-4] + punpcklbw mm0, [r1+2*FDEC_STRIDE-4] + movd mm1, [r1+1*FDEC_STRIDE-4] + punpcklbw mm1, [r1+0*FDEC_STRIDE-4] + punpckhwd mm0, mm1 + psrlq mm0, 32 + movq2dq m0, mm0 + movu m1, [r1-1*FDEC_STRIDE-8] + movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7 +%endif ; cpuflag + pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ + psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ + psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __ + pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __ + mova %2, m1 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __ + ; ddl ddr + ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2 + ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1 + ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0 + ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt + pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1 + pshufb m3, m0, [%1_ddlr2] ; rows 2,3 + ; hd hu + ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2 + ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3 + ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3 + ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3 + pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 + palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt + pshufb m6, m7, [%1_hdu1] + pshufb m7, m7, [%1_hdu2] + ; vr vl + ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4 + ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4 + ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5 + ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 + psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ... + palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 + pshufb m4, m5, [%1_vrl1] + pshufb m5, m5, [%1_vrl2] +%endmacro ; INTRA_X9_PRED + +%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp + pshufb m2, m%1, [intrax9b_vh1] + pshufb m3, m%1, [intrax9b_vh2] + mova [pred_buf+0x60], m2 + mova [pred_buf+0x70], m3 + pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3 + pmaddubsw m%1, [hmul_4p] + pshufhw m0, m%1, q2301 + pshuflw m0, m0, q2301 + psignw m%1, [pw_pmpmpmpm] + paddw m0, m%1 + psllw m0, 2 ; hadamard(top), hadamard(left) + movhlps m3, m0 + pshufb m1, m0, [intrax9b_v1] + pshufb m2, m0, [intrax9b_v2] + paddw m0, m3 + psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated? + pavgw m0, [pw_16] + pand m0, [sw_f0] ; dc + ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be + ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs. + ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef. 
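; The shortcut used throughout these intra x3/x9 routines: the Hadamard
; transform of a flat DC predictor is nonzero only in its DC coefficient, and
; a pure vertical (horizontal) predictor repeats the top (left) edge, so its
; transform is nonzero only in one row (column).  Schematically,
;   SATD(fenc - dc_pred) = sum|AC(fenc)| + |DC(fenc) - DC(pred)|
; since AC(pred) = 0, and the v/h modes likewise only patch one row/column.
; That is why only the 2D transform of fenc plus 1D hadamards of the edges are
; needed, rather than a separately transformed residual per prediction mode.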
+ HADAMARD 0, sumsub, %2, %3, %4, %5 + HADAMARD 1, sumsub, %2, %3, %4, %5 + movd r3d, m0 + shr r3d, 4 + imul r3d, 0x01010101 + mov [pred_buf+0x80], r3d + mov [pred_buf+0x88], r3d + mov [pred_buf+0x90], r3d + mov [pred_buf+0x98], r3d + psubw m3, m%2 + psubw m0, m%2 + psubw m1, m%2 + psubw m2, m%3 + pabsw m%3, m%3 + pabsw m3, m3 + pabsw m0, m0 + pabsw m1, m1 + pabsw m2, m2 + pavgw m3, m%3 + pavgw m0, m%3 + pavgw m1, m2 +%if cpuflag(sse4) + phaddw m3, m0 +%else + SBUTTERFLY qdq, 3, 0, 2 + paddw m3, m0 +%endif + movhlps m2, m1 + paddw m1, m2 +%if cpuflag(xop) + vphaddwq m3, m3 + vphaddwq m1, m1 + packssdw m1, m3 +%else + phaddw m1, m3 + pmaddwd m1, [pw_1] ; v, _, h, dc +%endif +%endmacro ; INTRA_X9_VHDC + +%macro INTRA_X9_END 2 +%if cpuflag(sse4) + phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu + movd eax, m0 + add eax, 1<<16 + cmp ax, r3w + cmovge eax, r3d +%else +%if %1 + ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index + psllw m0, 3 + paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu +%else + ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index + psllw m0, 2 + paddusw m0, m0 + paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu +%endif + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + pminsw m0, m1 + pshuflw m1, m0, q0001 + pminsw m0, m1 + movd eax, m0 + movsx r2d, ax + and eax, 7 + sar r2d, 3 + shl eax, 16 + ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits + ; 1<<12: undo sign manipulation + lea eax, [rax+r2+(1<<16)+(1<<12)] + cmp ax, r3w + cmovge eax, r3d +%endif ; cpuflag + + ; output the predicted samples + mov r3d, eax + shr r3d, 16 +%ifdef PIC + lea r2, [%2_lut] + movzx r2d, byte [r2+r3] +%else + movzx r2d, byte [%2_lut+r3] +%endif +%if %1 ; sad + movq mm0, [pred_buf+r2] + movq mm1, [pred_buf+r2+16] + movd [r1+0*FDEC_STRIDE], mm0 + movd [r1+2*FDEC_STRIDE], mm1 + psrlq mm0, 32 + psrlq mm1, 32 + movd [r1+1*FDEC_STRIDE], mm0 + movd [r1+3*FDEC_STRIDE], mm1 +%else ; satd +%assign i 0 +%rep 4 + mov r3d, [pred_buf+r2+8*i] + mov [r1+i*FDEC_STRIDE], r3d +%assign i i+1 +%endrep +%endif +%endmacro ; INTRA_X9_END + +%macro INTRA_X9 0 +;----------------------------------------------------------------------------- +; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) +;----------------------------------------------------------------------------- +%if notcpuflag(xop) +cglobal intra_sad_x9_4x4, 3,4,9 + %assign pad 0xc0-gprsize-(stack_offset&15) + %define pred_buf rsp + sub rsp, pad +%if ARCH_X86_64 + INTRA_X9_PRED intrax9a, m8 +%else + INTRA_X9_PRED intrax9a, [rsp+0xa0] +%endif + mova [rsp+0x00], m2 + mova [rsp+0x10], m3 + mova [rsp+0x20], m4 + mova [rsp+0x30], m5 + mova [rsp+0x40], m6 + mova [rsp+0x50], m7 +%if cpuflag(sse4) + movd m0, [r0+0*FENC_STRIDE] + pinsrd m0, [r0+1*FENC_STRIDE], 1 + movd m1, [r0+2*FENC_STRIDE] + pinsrd m1, [r0+3*FENC_STRIDE], 1 +%else + movd mm0, [r0+0*FENC_STRIDE] + punpckldq mm0, [r0+1*FENC_STRIDE] + movd mm1, [r0+2*FENC_STRIDE] + punpckldq mm1, [r0+3*FENC_STRIDE] + movq2dq m0, mm0 + movq2dq m1, mm1 +%endif + punpcklqdq m0, m0 + punpcklqdq m1, m1 + psadbw m2, m0 + psadbw m3, m1 + psadbw m4, m0 + psadbw m5, m1 + psadbw m6, m0 + psadbw m7, m1 + paddd m2, m3 + paddd m4, m5 + paddd m6, m7 +%if ARCH_X86_64 + SWAP 7, 8 + pxor m8, m8 + %define %%zero m8 +%else + mova m7, [rsp+0xa0] + %define %%zero [pb_0] +%endif + pshufb m3, m7, [intrax9a_vh1] + pshufb m5, m7, [intrax9a_vh2] + pshufb m7, [intrax9a_dc] + psadbw m7, %%zero + psrlw m7, 2 + 
mova [rsp+0x60], m3 + mova [rsp+0x70], m5 + psadbw m3, m0 + pavgw m7, %%zero + pshufb m7, %%zero + psadbw m5, m1 + movq [rsp+0x80], m7 + movq [rsp+0x90], m7 + psadbw m0, m7 + paddd m3, m5 + psadbw m1, m7 + paddd m0, m1 + movzx r3d, word [r2] + movd r0d, m3 ; v + add r3d, r0d + punpckhqdq m3, m0 ; h, dc + shufps m3, m2, q2020 + psllq m6, 32 + por m4, m6 + movu m0, [r2+2] + packssdw m3, m4 + paddw m0, m3 + INTRA_X9_END 1, intrax9a + add rsp, pad + RET +%endif ; cpuflag + +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) +;----------------------------------------------------------------------------- +cglobal intra_satd_x9_4x4, 3,4,16 + %assign pad 0xb0-gprsize-(stack_offset&15) + %define pred_buf rsp + sub rsp, pad + INTRA_X9_PRED intrax9b, m15 + mova [rsp+0x00], m2 + mova [rsp+0x10], m3 + mova [rsp+0x20], m4 + mova [rsp+0x30], m5 + mova [rsp+0x40], m6 + mova [rsp+0x50], m7 + movd m8, [r0+0*FENC_STRIDE] + movd m9, [r0+1*FENC_STRIDE] + movd m10, [r0+2*FENC_STRIDE] + movd m11, [r0+3*FENC_STRIDE] + mova m12, [hmul_8p] + pshufd m8, m8, 0 + pshufd m9, m9, 0 + pshufd m10, m10, 0 + pshufd m11, m11, 0 + pmaddubsw m8, m12 + pmaddubsw m9, m12 + pmaddubsw m10, m12 + pmaddubsw m11, m12 + movddup m0, m2 + pshufd m1, m2, q3232 + movddup m2, m3 + movhlps m3, m3 + call .satd_8x4 ; ddr, ddl + movddup m2, m5 + pshufd m3, m5, q3232 + mova m5, m0 + movddup m0, m4 + pshufd m1, m4, q3232 + call .satd_8x4 ; vr, vl + movddup m2, m7 + pshufd m3, m7, q3232 + mova m4, m0 + movddup m0, m6 + pshufd m1, m6, q3232 + call .satd_8x4 ; hd, hu +%if cpuflag(sse4) + punpckldq m4, m0 +%else + punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't +%endif + mova m1, [pw_ppmmppmm] + psignw m8, m1 + psignw m10, m1 + paddw m8, m9 + paddw m10, m11 + INTRA_X9_VHDC 15, 8, 10, 6, 7 + ; find minimum + movu m0, [r2+2] + movd r3d, m1 + palignr m5, m1, 8 +%if notcpuflag(sse4) + pshufhw m0, m0, q3120 ; compensate for different order in unpack +%endif + packssdw m5, m4 + paddw m0, m5 + movzx r0d, word [r2] + add r3d, r0d + INTRA_X9_END 0, intrax9b + add rsp, pad + RET +RESET_MM_PERMUTATION +ALIGN 16 +.satd_8x4: + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmaddubsw m2, m12 + pmaddubsw m3, m12 + psubw m0, m8 + psubw m1, m9 + psubw m2, m10 + psubw m3, m11 + SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap + pmaddwd m0, [pw_1] +%if cpuflag(sse4) + pshufd m1, m0, q0032 +%else + movhlps m1, m0 +%endif + paddd xmm0, m0, m1 ; consistent location of return value. 
only the avx version of hadamard permutes m0, so 3arg is free + ret + +%else ; !ARCH_X86_64 +cglobal intra_satd_x9_4x4, 3,4,8 + %assign pad 0x120-gprsize-(stack_offset&15) + %define fenc_buf rsp + %define pred_buf rsp+0x40 + %define spill rsp+0xe0 + sub rsp, pad + INTRA_X9_PRED intrax9b, [spill+0x20] + mova [pred_buf+0x00], m2 + mova [pred_buf+0x10], m3 + mova [pred_buf+0x20], m4 + mova [pred_buf+0x30], m5 + mova [pred_buf+0x40], m6 + mova [pred_buf+0x50], m7 + movd m4, [r0+0*FENC_STRIDE] + movd m5, [r0+1*FENC_STRIDE] + movd m6, [r0+2*FENC_STRIDE] + movd m0, [r0+3*FENC_STRIDE] + mova m7, [hmul_8p] + pshufd m4, m4, 0 + pshufd m5, m5, 0 + pshufd m6, m6, 0 + pshufd m0, m0, 0 + pmaddubsw m4, m7 + pmaddubsw m5, m7 + pmaddubsw m6, m7 + pmaddubsw m0, m7 + mova [fenc_buf+0x00], m4 + mova [fenc_buf+0x10], m5 + mova [fenc_buf+0x20], m6 + mova [fenc_buf+0x30], m0 + movddup m0, m2 + pshufd m1, m2, q3232 + movddup m2, m3 + movhlps m3, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + call .satd_8x4b ; ddr, ddl + mova m3, [pred_buf+0x30] + mova m1, [pred_buf+0x20] + movddup m2, m3 + movhlps m3, m3 + movq [spill+0x08], m0 + movddup m0, m1 + movhlps m1, m1 + call .satd_8x4 ; vr, vl + mova m3, [pred_buf+0x50] + mova m1, [pred_buf+0x40] + movddup m2, m3 + movhlps m3, m3 + movq [spill+0x10], m0 + movddup m0, m1 + movhlps m1, m1 + call .satd_8x4 ; hd, hu + movq [spill+0x18], m0 + mova m1, [spill+0x20] + mova m4, [fenc_buf+0x00] + mova m5, [fenc_buf+0x20] + mova m2, [pw_ppmmppmm] + psignw m4, m2 + psignw m5, m2 + paddw m4, [fenc_buf+0x10] + paddw m5, [fenc_buf+0x30] + INTRA_X9_VHDC 1, 4, 5, 6, 7 + ; find minimum + movu m0, [r2+2] + movd r3d, m1 + punpckhqdq m1, [spill+0x00] + packssdw m1, [spill+0x10] +%if cpuflag(sse4) + pshufhw m1, m1, q3120 +%else + pshufhw m0, m0, q3120 +%endif + paddw m0, m1 + movzx r0d, word [r2] + add r3d, r0d + INTRA_X9_END 0, intrax9b + add rsp, pad + RET +RESET_MM_PERMUTATION +ALIGN 16 +.satd_8x4: + pmaddubsw m0, m7 + pmaddubsw m1, m7 + pmaddubsw m2, m7 + pmaddubsw m3, m7 + %xdefine fenc_buf fenc_buf+gprsize + psubw m0, [fenc_buf+0x00] + psubw m1, [fenc_buf+0x10] + psubw m2, [fenc_buf+0x20] +.satd_8x4b: + psubw m3, [fenc_buf+0x30] + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap + pmaddwd m0, [pw_1] +%if cpuflag(sse4) + pshufd m1, m0, q0032 +%else + movhlps m1, m0 +%endif + paddd xmm0, m0, m1 + ret +%endif ; ARCH +%endmacro ; INTRA_X9 + + + +%macro INTRA8_X9 0 +;----------------------------------------------------------------------------- +; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) +;----------------------------------------------------------------------------- +cglobal intra_sad_x9_8x8, 5,6,9 + %define fenc02 m4 + %define fenc13 m5 + %define fenc46 m6 + %define fenc57 m7 +%if ARCH_X86_64 + %define tmp m8 + %assign padbase 0x0 +%else + %define tmp [rsp] + %assign padbase 0x10 +%endif + %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15) + %define pred(i,j) [rsp+i*0x40+j*0x10+padbase] + + SUB rsp, pad + movq fenc02, [r0+FENC_STRIDE* 0] + movq fenc13, [r0+FENC_STRIDE* 1] + movq fenc46, [r0+FENC_STRIDE* 4] + movq fenc57, [r0+FENC_STRIDE* 5] + movhps fenc02, [r0+FENC_STRIDE* 2] + movhps fenc13, [r0+FENC_STRIDE* 3] + movhps fenc46, [r0+FENC_STRIDE* 6] + movhps fenc57, [r0+FENC_STRIDE* 7] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + +; v + movddup m0, [r2+16] + mova 
pred(0,0), m0 + psadbw m1, m0, fenc02 + mova pred(0,1), m0 + psadbw m2, m0, fenc13 + mova pred(0,2), m0 + psadbw m3, m0, fenc46 + mova pred(0,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+0], m0 + +; h + movq m0, [r2+7] + pshufb m1, m0, [off(intra8x9_h1)] + pshufb m2, m0, [off(intra8x9_h2)] + mova pred(1,0), m1 + psadbw m1, fenc02 + mova pred(1,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m3, m0, [off(intra8x9_h3)] + pshufb m2, m0, [off(intra8x9_h4)] + mova pred(1,2), m3 + psadbw m3, fenc46 + mova pred(1,3), m2 + psadbw m2, fenc57 + paddw m1, m3 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+2], m1 + + lea r5, [rsp+padbase+0x100] + %define pred(i,j) [r5+i*0x40+j*0x10-0x100] + +; dc + movhps m0, [r2+16] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + psadbw m1, m0, fenc02 + mova pred(2,1), m0 + psadbw m2, m0, fenc13 + mova pred(2,2), m0 + psadbw m3, m0, fenc46 + mova pred(2,3), m0 + psadbw m0, m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + movd [r4+4], m0 + +; ddl +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB +; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC +; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD +; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE +; Ft8 Ft9 FtA FtB FtC FtD FtE FtF + mova m0, [r2+16] + movu m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___ + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl2)] + mova pred(3,0), m1 + psadbw m1, fenc02 + mova pred(3,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddl4)] + mova pred(3,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+6], m1 + +; vl +; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 +; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 +; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 +; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 +; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA +; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA +; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB +; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB + pshufb m1, m3, [off(intra8x9_vl1)] + pshufb m2, m0, [off(intra8x9_vl2)] + pshufb m3, m3, [off(intra8x9_vl3)] + pshufb m0, m0, [off(intra8x9_vl4)] + mova pred(7,0), m1 + psadbw m1, fenc02 + mova pred(7,1), m2 + psadbw m2, fenc13 + mova pred(7,2), m3 + psadbw m3, fenc46 + mova pred(7,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m0, m3 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(sse4) + pextrw [r4+14], m0, 0 +%else + movd r5d, m0 + mov [r4+14], r5w + lea r5, [rsp+padbase+0x100] +%endif + +; ddr +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 +; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 +; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 +; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 +; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt + movu m2, [r2+8] + movu m0, [r2+7] + movu m1, [r2+6] + pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr2)] + mova pred(4,0), 
m1 + psadbw m1, fenc02 + mova pred(4,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_ddr4)] + mova pred(4,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+8], m1 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0] + +; vr +; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 +; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 +; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 +; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 +; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 +; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 +; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 + movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + psadbw m1, fenc02 + mova pred(5,2), m2 + psadbw m2, fenc46 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr2)] + mova pred(5,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_vr4)] + mova pred(5,3), m2 + psadbw m2, fenc57 + paddw m1, m2 + movhlps m2, m1 + paddw m1, m2 + movd [r4+10], m1 + +; hd +; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 +; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3 +; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1 +; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt +; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 +; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 +; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 +; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 + pshufd m2, m3, q0001 +%if cpuflag(sse4) + pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___ +%else + movss m1, m0, m2 + SWAP 1, 2 +%endif + punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___ + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m2, [off(intra8x9_hd2)] + mova pred(6,0), m1 + psadbw m1, fenc02 + mova pred(6,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hd3)] + pshufb m3, m0, [off(intra8x9_hd4)] + mova pred(6,2), m2 + psadbw m2, fenc46 + mova pred(6,3), m3 + psadbw m3, fenc57 + paddw m1, m2 + paddw m1, m3 + movhlps m2, m1 + paddw m1, m2 + ; don't just store to [r4+12]. 
this is too close to the load of dqword [r4] and would cause a forwarding stall + pslldq m1, 12 + SWAP 3, 1 + +; hu +; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 +; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 +; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 +; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 +; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 +; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 +; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 +%if cpuflag(sse4) + pinsrb m0, [r2+7], 15 ; Gl7 +%else + movd m1, [r2+7] + pslldq m0, 1 + palignr m1, m0, 1 + SWAP 0, 1 +%endif + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu2)] + mova pred(8,0), m1 + psadbw m1, fenc02 + mova pred(8,1), m2 + psadbw m2, fenc13 + paddw m1, m2 + pshufb m2, m0, [off(intra8x9_hu3)] + pshufb m0, m0, [off(intra8x9_hu4)] + mova pred(8,2), m2 + psadbw m2, fenc46 + mova pred(8,3), m0 + psadbw m0, fenc57 + paddw m1, m2 + paddw m1, m0 + movhlps m2, m1 + paddw m1, m2 + movd r2d, m1 + + movu m0, [r3] + por m3, [r4] + paddw m0, m3 + mova [r4], m0 + movzx r5d, word [r3+16] + add r2d, r5d + mov [r4+16], r2w + +%if cpuflag(sse4) + phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl + movd eax, m0 +%else + ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index + paddusw m0, m0 + paddusw m0, m0 + paddw m0, [off(pw_s00112233)] + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + pminsw m0, m1 + movd eax, m0 + ; repack with 3 bit index + xor eax, 0x80008000 + movzx r3d, ax + shr eax, 15 + add r3d, r3d + or eax, 1 + cmp eax, r3d + cmovg eax, r3d + ; reverse to phminposuw order + mov r3d, eax + and eax, 7 + shr r3d, 3 + shl eax, 16 + or eax, r3d +%endif + add r2d, 8<<16 + cmp ax, r2w + cmovg eax, r2d + + mov r2d, eax + shr r2d, 16 + shl r2d, 6 + add r1, 4*FDEC_STRIDE + mova m0, [rsp+padbase+r2+0x00] + mova m1, [rsp+padbase+r2+0x10] + mova m2, [rsp+padbase+r2+0x20] + mova m3, [rsp+padbase+r2+0x30] + movq [r1+FDEC_STRIDE*-4], m0 + movhps [r1+FDEC_STRIDE*-2], m0 + movq [r1+FDEC_STRIDE*-3], m1 + movhps [r1+FDEC_STRIDE*-1], m1 + movq [r1+FDEC_STRIDE* 0], m2 + movhps [r1+FDEC_STRIDE* 2], m2 + movq [r1+FDEC_STRIDE* 1], m3 + movhps [r1+FDEC_STRIDE* 3], m3 + ADD rsp, pad + RET + +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) +;----------------------------------------------------------------------------- +cglobal intra_sa8d_x9_8x8, 5,6,16 + %assign pad 0x2c0+0x10-gprsize-(stack_offset&15) + %define fenc_buf rsp + %define pred_buf rsp+0x80 + SUB rsp, pad + mova m15, [hmul_8p] + pxor m8, m8 +%assign %%i 0 +%rep 8 + movddup m %+ %%i, [r0+%%i*FENC_STRIDE] + pmaddubsw m9, m %+ %%i, m15 + punpcklbw m %+ %%i, m8 + mova [fenc_buf+%%i*0x10], m9 +%assign %%i %%i+1 +%endrep + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+0x80] + %define off(m) (r0+m-(intra8x9_h1+0x80)) + lea r5, [pred_buf+0x80] + +; v, h, dc + HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 + pabsw m11, m1 +%assign %%i 2 +%rep 6 + pabsw m8, m %+ %%i + paddw m11, m8 +%assign %%i %%i+1 +%endrep + + ; 1D hadamard of edges + movq m8, [r2+7] + movddup m9, [r2+16] + mova [r5-0x80], m9 + mova [r5-0x70], m9 + mova [r5-0x60], m9 + mova [r5-0x50], m9 + punpcklwd m8, m8 + pshufb m9, [intrax3_shuf] + pmaddubsw m8, [pb_pppm] + pmaddubsw m9, [pb_pppm] + HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm] + HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm] + + ; dc + paddw m10, m8, m9 + paddw m10, [pw_8] + 
pand m10, [sw_f0] + psrlw m12, m10, 4 + psllw m10, 2 + pxor m13, m13 + pshufb m12, m13 + mova [r5+0x00], m12 + mova [r5+0x10], m12 + mova [r5+0x20], m12 + mova [r5+0x30], m12 + + ; differences + psllw m8, 3 ; left edge + psubw m8, m0 + psubw m10, m0 + pabsw m8, m8 ; 1x8 sum + pabsw m10, m10 + paddw m8, m11 + paddw m11, m10 + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + punpckldq m0, m2 + punpckldq m4, m6 + punpcklqdq m0, m4 ; transpose + psllw m9, 3 ; top edge + psrldq m10, m11, 2 ; 8x7 sum + psubw m0, m9 ; 8x1 sum + pabsw m0, m0 + paddw m10, m0 + + phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow + psrlw m11, 1 + psrlw m10, 1 + +; store h + movq m3, [r2+7] + pshufb m0, m3, [off(intra8x9_h1)] + pshufb m1, m3, [off(intra8x9_h2)] + pshufb m2, m3, [off(intra8x9_h3)] + pshufb m3, m3, [off(intra8x9_h4)] + mova [r5-0x40], m0 + mova [r5-0x30], m1 + mova [r5-0x20], m2 + mova [r5-0x10], m3 + +; ddl + mova m8, [r2+16] + movu m2, [r2+17] + pslldq m1, m8, 1 + pavgb m9, m8, m2 + PRED4x4_LOWPASS m8, m1, m2, m8, m3 + pshufb m0, m8, [off(intra8x9_ddl1)] + pshufb m1, m8, [off(intra8x9_ddl2)] + pshufb m2, m8, [off(intra8x9_ddl3)] + pshufb m3, m8, [off(intra8x9_ddl4)] + add r5, 0x40 + call .sa8d + phaddd m11, m0 + +; vl + pshufb m0, m9, [off(intra8x9_vl1)] + pshufb m1, m8, [off(intra8x9_vl2)] + pshufb m2, m9, [off(intra8x9_vl3)] + pshufb m3, m8, [off(intra8x9_vl4)] + add r5, 0x100 + call .sa8d + phaddd m10, m11 + mova m12, m0 + +; ddr + movu m2, [r2+8] + movu m8, [r2+7] + movu m1, [r2+6] + pavgb m9, m2, m8 + PRED4x4_LOWPASS m8, m1, m2, m8, m3 + pshufb m0, m8, [off(intra8x9_ddr1)] + pshufb m1, m8, [off(intra8x9_ddr2)] + pshufb m2, m8, [off(intra8x9_ddr3)] + pshufb m3, m8, [off(intra8x9_ddr4)] + sub r5, 0xc0 + call .sa8d + mova m11, m0 + + add r0, 0x100 + %define off(m) (r0+m-(intra8x9_h1+0x180)) + +; vr + movsd m2, m9, m8 + pshufb m0, m2, [off(intra8x9_vr1)] + pshufb m1, m8, [off(intra8x9_vr2)] + pshufb m2, m2, [off(intra8x9_vr3)] + pshufb m3, m8, [off(intra8x9_vr4)] + add r5, 0x40 + call .sa8d + phaddd m11, m0 + +; hd +%if cpuflag(sse4) + pshufd m1, m9, q0001 + pblendw m1, m8, q3330 +%else + pshufd m2, m9, q0001 + movss m1, m8, m2 +%endif + punpcklbw m8, m9 + pshufb m0, m1, [off(intra8x9_hd1)] + pshufb m1, m1, [off(intra8x9_hd2)] + pshufb m2, m8, [off(intra8x9_hd3)] + pshufb m3, m8, [off(intra8x9_hd4)] + add r5, 0x40 + call .sa8d + phaddd m0, m12 + phaddd m11, m0 + +; hu +%if cpuflag(sse4) + pinsrb m8, [r2+7], 15 +%else + movd m9, [r2+7] + pslldq m8, 1 + palignr m9, m8, 1 + SWAP 8, 9 +%endif + pshufb m0, m8, [off(intra8x9_hu1)] + pshufb m1, m8, [off(intra8x9_hu2)] + pshufb m2, m8, [off(intra8x9_hu3)] + pshufb m3, m8, [off(intra8x9_hu4)] + add r5, 0x80 + call .sa8d + + pmaddwd m0, [pw_1] + phaddw m10, m11 + movhlps m1, m0 + paddw m0, m1 + pshuflw m1, m0, q0032 + pavgw m0, m1 + pxor m2, m2 + pavgw m10, m2 + movd r2d, m0 + + movu m0, [r3] + paddw m0, m10 + mova [r4], m0 + movzx r5d, word [r3+16] + add r2d, r5d + mov [r4+16], r2w + +%if cpuflag(sse4) + phminposuw m0, m0 + movd eax, m0 +%else + ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index + paddusw m0, m0 + paddw m0, [off(pw_s00001111)] + movhlps m1, m0 + pminsw m0, m1 + pshuflw m1, m0, q0032 + mova m2, m0 + pminsw m0, m1 + pcmpgtw m2, m1 ; 2nd index bit + movd r3d, m0 + movd r4d, m2 + ; repack with 3 bit index + xor r3d, 0x80008000 + and r4d, 0x00020002 + movzx eax, r3w + movzx r5d, r4w + shr r3d, 16 + shr r4d, 16 + lea eax, [rax*4+r5] + lea r3d, [ r3*4+r4+1] + cmp 
eax, r3d + cmovg eax, r3d + ; reverse to phminposuw order + mov r3d, eax + and eax, 7 + shr r3d, 3 + shl eax, 16 + or eax, r3d +%endif + add r2d, 8<<16 + cmp ax, r2w + cmovg eax, r2d + + mov r2d, eax + shr r2d, 16 + shl r2d, 6 + add r1, 4*FDEC_STRIDE + mova m0, [pred_buf+r2+0x00] + mova m1, [pred_buf+r2+0x10] + mova m2, [pred_buf+r2+0x20] + mova m3, [pred_buf+r2+0x30] + movq [r1+FDEC_STRIDE*-4], m0 + movhps [r1+FDEC_STRIDE*-2], m0 + movq [r1+FDEC_STRIDE*-3], m1 + movhps [r1+FDEC_STRIDE*-1], m1 + movq [r1+FDEC_STRIDE* 0], m2 + movhps [r1+FDEC_STRIDE* 2], m2 + movq [r1+FDEC_STRIDE* 1], m3 + movhps [r1+FDEC_STRIDE* 3], m3 + ADD rsp, pad + RET + +ALIGN 16 +.sa8d: + %xdefine mret m0 + %xdefine fenc_buf fenc_buf+gprsize + mova [r5+0x00], m0 + mova [r5+0x10], m1 + mova [r5+0x20], m2 + mova [r5+0x30], m3 + movddup m4, m0 + movddup m5, m1 + movddup m6, m2 + movddup m7, m3 + punpckhqdq m0, m0 + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + psubw m0, [fenc_buf+0x00] + psubw m1, [fenc_buf+0x10] + pmaddubsw m2, m15 + pmaddubsw m3, m15 + psubw m2, [fenc_buf+0x20] + psubw m3, [fenc_buf+0x30] + pmaddubsw m4, m15 + pmaddubsw m5, m15 + psubw m4, [fenc_buf+0x40] + psubw m5, [fenc_buf+0x50] + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psubw m6, [fenc_buf+0x60] + psubw m7, [fenc_buf+0x70] + HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14 + paddw m0, m1 + paddw m0, m2 + paddw mret, m0, m3 + ret +%endif ; ARCH_X86_64 +%endmacro ; INTRA8_X9 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 ; out: [tmp]=hadamard4, m0=satd -cglobal x264_hadamard_ac_4x4_mmxext +INIT_MMX mmx2 +cglobal hadamard_ac_4x4 +%if HIGH_BIT_DEPTH + mova m0, [r0] + mova m1, [r0+r1] + mova m2, [r0+r1*2] + mova m3, [r0+r2] +%else ; !HIGH_BIT_DEPTH movh m0, [r0] movh m1, [r0+r1] movh m2, [r0+r1*2] @@ -1277,132 +3743,198 @@ cglobal x264_hadamard_ac_4x4_mmxext punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - TRANSPOSE4x4W 0, 1, 2, 3, 4 - HADAMARD4_1D m0, m1, m2, m3 +%endif ; HIGH_BIT_DEPTH + HADAMARD4_2D 0, 1, 2, 3, 4 mova [r3], m0 mova [r3+8], m1 mova [r3+16], m2 mova [r3+24], m3 - ABS1 m0, m4 - ABS1 m1, m4 + ABSW m0, m0, m4 + ABSW m1, m1, m4 pand m0, m6 - ABS1 m2, m4 - ABS1 m3, m4 + ABSW m2, m2, m4 + ABSW m3, m3, m4 paddw m0, m1 paddw m2, m3 paddw m0, m2 - SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext + SAVE_MM_PERMUTATION ret -cglobal x264_hadamard_ac_2x2_mmxext +cglobal hadamard_ac_2x2max mova m0, [r3+0x00] mova m1, [r3+0x20] mova m2, [r3+0x40] mova m3, [r3+0x60] - HADAMARD4_1D m0, m1, m2, m3 - ABS2 m0, m1, m4, m5 - ABS2 m2, m3, m4, m5 - SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext + sub r3, 8 + SUMSUB_BADC w, 0, 1, 2, 3, 4 + ABSW2 m0, m2, m0, m2, m4, m5 + ABSW2 m1, m3, m1, m3, m4, m5 + HADAMARD 0, max, 0, 2, 4, 5 + HADAMARD 0, max, 1, 3, 4, 5 +%if HIGH_BIT_DEPTH + pmaddwd m0, m7 + pmaddwd m1, m7 + paddd m6, m0 + paddd m6, m1 +%else ; !HIGH_BIT_DEPTH + paddw m7, m0 + paddw m7, m1 +%endif ; HIGH_BIT_DEPTH + SAVE_MM_PERMUTATION ret -cglobal x264_hadamard_ac_8x8_mmxext - mova m6, [mask_ac4 GLOBAL] +%macro AC_PREP 2 +%if HIGH_BIT_DEPTH + pmaddwd %1, %2 +%endif +%endmacro + +%macro AC_PADD 3 +%if HIGH_BIT_DEPTH + AC_PREP %2, %3 + paddd %1, %2 +%else + paddw %1, %2 +%endif ; HIGH_BIT_DEPTH +%endmacro + +cglobal hadamard_ac_8x8 + mova m6, [mask_ac4] +%if HIGH_BIT_DEPTH + mova m7, [pw_1] +%else pxor m7, m7 - call x264_hadamard_ac_4x4_mmxext - add r0, 4 +%endif ; HIGH_BIT_DEPTH + call 
hadamard_ac_4x4_mmx2 + add r0, 4*SIZEOF_PIXEL add r3, 32 mova m5, m0 - call x264_hadamard_ac_4x4_mmxext + AC_PREP m5, m7 + call hadamard_ac_4x4_mmx2 lea r0, [r0+4*r1] add r3, 64 - paddw m5, m0 - call x264_hadamard_ac_4x4_mmxext - sub r0, 4 + AC_PADD m5, m0, m7 + call hadamard_ac_4x4_mmx2 + sub r0, 4*SIZEOF_PIXEL sub r3, 32 - paddw m5, m0 - call x264_hadamard_ac_4x4_mmxext - paddw m5, m0 - sub r3, 64 + AC_PADD m5, m0, m7 + call hadamard_ac_4x4_mmx2 + AC_PADD m5, m0, m7 + sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd - call x264_hadamard_ac_2x2_mmxext - add r3, 8 - pand m6, m0 - mova m7, m1 - paddw m6, m2 - paddw m7, m3 -%rep 2 - call x264_hadamard_ac_2x2_mmxext - add r3, 8 - paddw m6, m0 - paddw m7, m1 - paddw m6, m2 - paddw m7, m3 +%if HIGH_BIT_DEPTH + pxor m6, m6 +%endif +%rep 3 + call hadamard_ac_2x2max_mmx2 %endrep - call x264_hadamard_ac_2x2_mmxext - sub r3, 24 - paddw m6, m0 + mova m0, [r3+0x00] + mova m1, [r3+0x20] + mova m2, [r3+0x40] + mova m3, [r3+0x60] + SUMSUB_BADC w, 0, 1, 2, 3, 4 + HADAMARD 0, sumsub, 0, 2, 4, 5 + ABSW2 m1, m3, m1, m3, m4, m5 + ABSW2 m0, m2, m0, m2, m4, m5 + HADAMARD 0, max, 1, 3, 4, 5 +%if HIGH_BIT_DEPTH + pand m0, [mask_ac4] + pmaddwd m1, m7 + pmaddwd m0, m7 + pmaddwd m2, m7 + paddd m6, m1 + paddd m0, m2 + paddd m6, m6 + paddd m0, m6 + SWAP 0, 6 +%else ; !HIGH_BIT_DEPTH + pand m6, m0 paddw m7, m1 paddw m6, m2 - paddw m7, m3 + paddw m7, m7 paddw m6, m7 +%endif ; HIGH_BIT_DEPTH mova [rsp+gprsize], m6 ; save sa8d - SWAP m0, m6 - SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext + SWAP 0, 6 + SAVE_MM_PERMUTATION ret +%macro HADAMARD_AC_WXH_SUM_MMX 2 + mova m1, [rsp+1*mmsize] +%if HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddd m0, [rsp+2*mmsize] + paddd m1, [rsp+3*mmsize] +%endif +%if %1*%2 == 256 + mova m2, [rsp+4*mmsize] + paddd m1, [rsp+5*mmsize] + paddd m2, [rsp+6*mmsize] + mova m3, m0 + paddd m1, [rsp+7*mmsize] + paddd m0, m2 +%endif + psrld m0, 1 + HADDD m0, m2 + psrld m1, 1 + HADDD m1, m3 +%else ; !HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddusw m0, [rsp+2*mmsize] + paddusw m1, [rsp+3*mmsize] +%endif +%if %1*%2 == 256 + mova m2, [rsp+4*mmsize] + paddusw m1, [rsp+5*mmsize] + paddusw m2, [rsp+6*mmsize] + mova m3, m0 + paddusw m1, [rsp+7*mmsize] + pxor m3, m2 + pand m3, [pw_1] + pavgw m0, m2 + psubusw m0, m3 + HADDUW m0, m2 +%else + psrlw m0, 1 + HADDW m0, m2 +%endif + psrlw m1, 1 + HADDW m1, m3 +%endif ; HIGH_BIT_DEPTH +%endmacro + %macro HADAMARD_AC_WXH_MMX 2 -cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4 +cglobal pixel_hadamard_ac_%1x%2, 2,4 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 + FIX_STRIDES r1 sub rsp, 16+128+pad lea r2, [r1*3] lea r3, [rsp+16] - call x264_hadamard_ac_8x8_mmxext + call hadamard_ac_8x8_mmx2 %if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, 16 - call x264_hadamard_ac_8x8_mmxext + call hadamard_ac_8x8_mmx2 %endif %if %1==16 neg ysub sub rsp, 16 - lea r0, [r0+ysub*4+8] + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub - call x264_hadamard_ac_8x8_mmxext + call hadamard_ac_8x8_mmx2 %if %2==16 lea r0, [r0+r1*4] sub rsp, 16 - call x264_hadamard_ac_8x8_mmxext -%endif -%endif - mova m1, [rsp+0x08] -%if %1*%2 >= 128 - paddusw m0, [rsp+0x10] - paddusw m1, [rsp+0x18] + call hadamard_ac_8x8_mmx2 %endif -%if %1*%2 == 256 - mova m2, [rsp+0x20] - paddusw m1, [rsp+0x28] - paddusw m2, [rsp+0x30] - mova m3, m0 - paddusw m1, [rsp+0x38] - pxor m3, m2 - pand m3, [pw_1 GLOBAL] - pavgw m0, m2 - psubusw m0, m3 - HADDUW m0, m2 -%else - psrlw m0, 1 - HADDW m0, m2 %endif - psrlw m1, 1 - HADDW m1, m3 + HADAMARD_AC_WXH_SUM_MMX %1, %2 movd edx, m0 movd eax, 
m1 shr edx, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif @@ -1415,259 +3947,737 @@ HADAMARD_AC_WXH_MMX 8, 16 HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 -%macro HADAMARD_AC_SSE2 1 -INIT_XMM +%macro LOAD_INC_8x4W_SSE2 5 +%if HIGH_BIT_DEPTH + movu m%1, [r0] + movu m%2, [r0+r1] + movu m%3, [r0+r1*2] + movu m%4, [r0+r2] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif +%else ; !HIGH_BIT_DEPTH + movh m%1, [r0] + movh m%2, [r0+r1] + movh m%3, [r0+r1*2] + movh m%4, [r0+r2] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif + punpcklbw m%1, m%5 + punpcklbw m%2, m%5 + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 +%endif ; HIGH_BIT_DEPTH +%endmacro + +%macro LOAD_INC_8x4W_SSSE3 5 + LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1] +%ifidn %1, 0 + lea r0, [r0+r1*4] +%endif + HSUMSUB %1, %2, %3, %4, %5 +%endmacro + +%macro HADAMARD_AC_SSE2 0 ; in: r0=pix, r1=stride, r2=stride*3 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4 -cglobal x264_hadamard_ac_8x8_%1 -%ifdef ARCH_X86_64 +cglobal hadamard_ac_8x8 +%if ARCH_X86_64 %define spill0 m8 %define spill1 m9 %define spill2 m10 %else %define spill0 [rsp+gprsize] - %define spill1 [rsp+gprsize+16] - %define spill2 [rsp+gprsize+32] + %define spill1 [rsp+gprsize+mmsize] + %define spill2 [rsp+gprsize+mmsize*2] %endif +%if HIGH_BIT_DEPTH + %define vertical 1 +%elif cpuflag(ssse3) && notcpuflag(atom) + %define vertical 0 + ;LOAD_INC loads sumsubs + mova m7, [hmul_8p] +%else + %define vertical 1 + ;LOAD_INC only unpacks to words pxor m7, m7 - movh m0, [r0] - movh m1, [r0+r1] - movh m2, [r0+r1*2] - movh m3, [r0+r2] - lea r0, [r0+r1*4] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - HADAMARD4_1D m0, m1, m2, m3 - mova spill0, m3 - SWAP m3, m7 - movh m4, [r0] - movh m5, [r0+r1] - movh m6, [r0+r1*2] - movh m7, [r0+r2] - punpcklbw m4, m3 - punpcklbw m5, m3 - punpcklbw m6, m3 - punpcklbw m7, m3 - HADAMARD4_1D m4, m5, m6, m7 - mova m3, spill0 -%ifdef ARCH_X86_64 - TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 -%else - TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1 -%endif - HADAMARD4_1D m0, m1, m2, m3 - HADAMARD4_1D m4, m5, m6, m7 +%endif + LOAD_INC_8x4W 0, 1, 2, 3, 7 +%if vertical + HADAMARD4_2D_SSE 0, 1, 2, 3, 4 +%else + HADAMARD4_V 0, 1, 2, 3, 4 +%endif mova spill0, m1 + SWAP 1, 7 + LOAD_INC_8x4W 4, 5, 6, 7, 1 +%if vertical + HADAMARD4_2D_SSE 4, 5, 6, 7, 1 +%else + HADAMARD4_V 4, 5, 6, 7, 1 + ; FIXME SWAP + mova m1, spill0 + mova spill0, m6 + mova spill1, m7 + HADAMARD 1, sumsub, 0, 1, 6, 7 + HADAMARD 1, sumsub, 2, 3, 6, 7 + mova m6, spill0 + mova m7, spill1 + mova spill0, m1 + mova spill1, m0 + HADAMARD 1, sumsub, 4, 5, 1, 0 + HADAMARD 1, sumsub, 6, 7, 1, 0 + mova m0, spill1 +%endif mova spill1, m2 mova spill2, m3 - ABS_MOV m1, m0 - ABS_MOV m2, m4 - ABS_MOV m3, m5 - paddw m1, m2 - SUMSUB_BA m0, m4 - pand m1, [mask_ac4 GLOBAL] - ABS_MOV m2, spill0 - paddw m1, m3 - ABS_MOV m3, spill1 - paddw m1, m2 - ABS_MOV m2, spill2 - paddw m1, m3 - ABS_MOV m3, m6 + ABSW m1, m0, m0 + ABSW m2, m4, m4 + ABSW m3, m5, m5 paddw m1, m2 - ABS_MOV m2, m7 - paddw m1, m3 - mova m3, m7 - paddw m1, m2 - mova m2, m6 + SUMSUB_BA w, 0, 4 +%if vertical + pand m1, [mask_ac4] +%else + pand m1, [mask_ac4b] +%endif + AC_PREP m1, [pw_1] + ABSW m2, spill0 + AC_PADD m1, m3, [pw_1] + ABSW m3, spill1 + AC_PADD m1, m2, [pw_1] + ABSW m2, spill2 + AC_PADD m1, m3, [pw_1] + ABSW m3, m6, m6 + AC_PADD m1, m2, [pw_1] + ABSW m2, m7, m7 + AC_PADD m1, m3, [pw_1] + AC_PADD m1, m2, [pw_1] + paddw m3, m7, spill2 psubw m7, spill2 - paddw m3, spill2 - mova [rsp+gprsize+32], m1 ; save 
satd - mova m1, m5 + mova [rsp+gprsize+mmsize*2], m1 ; save satd + paddw m2, m6, spill1 psubw m6, spill1 - paddw m2, spill1 + paddw m1, m5, spill0 psubw m5, spill0 - paddw m1, spill0 - mova spill1, m7 - SBUTTERFLY qdq, 0, 4, 7 - SBUTTERFLY qdq, 1, 5, 7 - SBUTTERFLY qdq, 2, 6, 7 - SUMSUB_BADC m0, m4, m1, m5 - SUMSUB_BA m2, m6 - ABS1 m0, m7 - ABS1 m1, m7 - pand m0, [mask_ac8 GLOBAL] - ABS1 m2, m7 - ABS1 m4, m7 - ABS1 m5, m7 - ABS1 m6, m7 - mova m7, spill1 - paddw m0, m4 - SBUTTERFLY qdq, 3, 7, 4 - SUMSUB_BA m3, m7 - paddw m1, m5 - ABS1 m3, m4 - ABS1 m7, m4 - paddw m2, m6 - paddw m3, m7 - paddw m0, m1 - paddw m2, m3 - paddw m0, m2 - mova [rsp+gprsize+16], m0 ; save sa8d - SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1 + %assign %%x 2 +%if vertical + %assign %%x 4 +%endif + mova spill1, m4 + HADAMARD %%x, amax, 3, 7, 4 + HADAMARD %%x, amax, 2, 6, 7, 4 + mova m4, spill1 + HADAMARD %%x, amax, 1, 5, 6, 7 + HADAMARD %%x, sumsub, 0, 4, 5, 6 + AC_PREP m2, [pw_1] + AC_PADD m2, m3, [pw_1] + AC_PADD m2, m1, [pw_1] +%if HIGH_BIT_DEPTH + paddd m2, m2 +%else + paddw m2, m2 +%endif ; HIGH_BIT_DEPTH + ABSW m4, m4, m7 + pand m0, [mask_ac8] + ABSW m0, m0, m7 + AC_PADD m2, m4, [pw_1] + AC_PADD m2, m0, [pw_1] + mova [rsp+gprsize+mmsize], m2 ; save sa8d + SWAP 0, 2 + SAVE_MM_PERMUTATION ret -HADAMARD_AC_WXH_SSE2 16, 16, %1 -HADAMARD_AC_WXH_SSE2 8, 16, %1 -HADAMARD_AC_WXH_SSE2 16, 8, %1 -HADAMARD_AC_WXH_SSE2 8, 8, %1 +HADAMARD_AC_WXH_SSE2 16, 16 +HADAMARD_AC_WXH_SSE2 16, 8 +%if mmsize <= 16 +HADAMARD_AC_WXH_SSE2 8, 16 +HADAMARD_AC_WXH_SSE2 8, 8 +%endif %endmacro ; HADAMARD_AC_SSE2 -; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) -%macro HADAMARD_AC_WXH_SSE2 3 -cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3 - %assign pad 16-gprsize-(stack_offset&15) +%macro HADAMARD_AC_WXH_SUM_SSE2 2 + mova m1, [rsp+2*mmsize] +%if HIGH_BIT_DEPTH +%if %1*%2 >= 128 + paddd m0, [rsp+3*mmsize] + paddd m1, [rsp+4*mmsize] +%endif +%if %1*%2 == 256 + paddd m0, [rsp+5*mmsize] + paddd m1, [rsp+6*mmsize] + paddd m0, [rsp+7*mmsize] + paddd m1, [rsp+8*mmsize] + psrld m0, 1 +%endif + HADDD xm0, xm2 + HADDD xm1, xm3 +%else ; !HIGH_BIT_DEPTH +%if %1*%2*16/mmsize >= 128 + paddusw m0, [rsp+3*mmsize] + paddusw m1, [rsp+4*mmsize] +%endif +%if %1*%2*16/mmsize == 256 + paddusw m0, [rsp+5*mmsize] + paddusw m1, [rsp+6*mmsize] + paddusw m0, [rsp+7*mmsize] + paddusw m1, [rsp+8*mmsize] + psrlw m0, 1 +%endif +%if mmsize==32 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + paddusw xm0, xm2 + paddusw xm1, xm3 +%endif + HADDUW xm0, xm2 + HADDW xm1, xm3 +%endif ; HIGH_BIT_DEPTH +%endmacro + +; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) +%macro HADAMARD_AC_WXH_SSE2 2 +cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 - sub rsp, 48+pad - lea r2, [r1*3] - call x264_hadamard_ac_8x8_%3 + FIX_STRIDES r1 + mov r3, rsp + and rsp, ~(mmsize-1) + sub rsp, mmsize*3 + lea r2, [r1*3] + call hadamard_ac_8x8 %if %2==16 %define ysub r2 - lea r0, [r0+r1*4] - sub rsp, 32 - call x264_hadamard_ac_8x8_%3 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 + call hadamard_ac_8x8 %endif -%if %1==16 +%if %1==16 && mmsize <= 16 neg ysub - sub rsp, 32 - lea r0, [r0+ysub*4+8] + sub rsp, mmsize*2 + lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub - call x264_hadamard_ac_8x8_%3 + call hadamard_ac_8x8 %if %2==16 - lea r0, [r0+r1*4] - sub rsp, 32 - call x264_hadamard_ac_8x8_%3 + lea r0, [r0+r1*4] + sub rsp, mmsize*2 + call hadamard_ac_8x8 %endif %endif - mova m1, [rsp+0x20] -%if %1*%2 >= 128 - paddusw m0, [rsp+0x30] - 
paddusw m1, [rsp+0x40] -%endif -%if %1*%2 == 256 - paddusw m0, [rsp+0x50] - paddusw m1, [rsp+0x60] - paddusw m0, [rsp+0x70] - paddusw m1, [rsp+0x80] - psrlw m0, 1 -%endif - HADDW m0, m2 - HADDW m1, m3 - movd edx, m0 - movd eax, m1 - shr edx, 2 - (%1*%2 >> 8) + HADAMARD_AC_WXH_SUM_SSE2 %1, %2 + movd edx, xm0 + movd eax, xm1 + shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 -%ifdef ARCH_X86_64 +%if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif - add rsp, 16+%1*%2/2+pad + mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 ; instantiate satds -%ifndef ARCH_X86_64 -cextern x264_pixel_sa8d_8x8_internal_mmxext -SA8D_16x16_32 mmxext +%if ARCH_X86_64 == 0 +cextern pixel_sa8d_8x8_internal_mmx2 +INIT_MMX mmx2 +SA8D +%endif + +%define TRANS TRANS_SSE2 +%define DIFFOP DIFF_UNPACK_SSE2 +%define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2 +%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 +%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size +%define movdqu movups +%define punpcklqdq movlhps +INIT_XMM sse2 +SA8D +SATDS_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_SA8D_SSE2 +%endif +INIT_MMX mmx2 +INTRA_X3_MMX +INIT_XMM sse2 +HADAMARD_AC_SSE2 + +%if HIGH_BIT_DEPTH == 0 +INIT_XMM ssse3,atom +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%endif + +%define DIFFOP DIFF_SUMSUB_SSSE3 +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +%if HIGH_BIT_DEPTH == 0 +%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 +%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 +%endif +INIT_XMM ssse3 +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_X9 +INTRA8_X9 +%endif +%undef movdqa ; nehalem doesn't like movaps +%undef movdqu ; movups +%undef punpcklqdq ; or movlhps +%if HIGH_BIT_DEPTH == 0 +INIT_MMX ssse3 +INTRA_X3_MMX +%endif + +%define TRANS TRANS_SSE4 +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN +INIT_XMM sse4 +SATDS_SSE2 +SA8D +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_X9 +INTRA8_X9 +%endif + +; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so +; it's effectively free. +%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE +INIT_XMM avx +SATDS_SSE2 +SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_X9 +INTRA8_X9 +%endif +HADAMARD_AC_SSE2 + +%define TRANS TRANS_XOP +INIT_XMM xop +SATDS_SSE2 +SA8D +%if ARCH_X86_64 +SA8D_SATD +%endif +%if HIGH_BIT_DEPTH == 0 +INTRA_X9 +; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. 
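; (As with the INIT_* blocks above, re-expanding INTRA_X9 here resolves the
; cpuflag(xop) conditionals inside the macros at expansion time -- e.g. the
; vphaddwq horizontal sums in INTRA_X9_VHDC -- so each ISA gets its own copy
; of the code rather than branching on CPU features at run time.)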
+%endif +HADAMARD_AC_SSE2 + + +%if HIGH_BIT_DEPTH == 0 +%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 +%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +HADAMARD_AC_SSE2 +%if ARCH_X86_64 +SA8D_SATD +%endif + +%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] + movq xm%1, [r0] + movq xm%3, [r2] + movq xm%2, [r0+r1] + movq xm%4, [r2+r3] + vinserti128 m%1, m%1, [r0+4*r1], 1 + vinserti128 m%3, m%3, [r2+4*r3], 1 + vinserti128 m%2, m%2, [r0+r4], 1 + vinserti128 m%4, m%4, [r2+r5], 1 + punpcklqdq m%1, m%1 + punpcklqdq m%3, m%3 + punpcklqdq m%2, m%2 + punpcklqdq m%4, m%4 + DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + + movq xm%3, [r0] + movq xm%5, [r2] + movq xm%4, [r0+r1] + movq xm%6, [r2+r3] + vinserti128 m%3, m%3, [r0+4*r1], 1 + vinserti128 m%5, m%5, [r2+4*r3], 1 + vinserti128 m%4, m%4, [r0+r4], 1 + vinserti128 m%6, m%6, [r2+r5], 1 + punpcklqdq m%3, m%3 + punpcklqdq m%5, m%5 + punpcklqdq m%4, m%4 + punpcklqdq m%6, m%6 + DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 +%endmacro + +%macro SATD_START_AVX2 2-3 0 + FIX_STRIDES r1, r3 +%if %3 + mova %2, [hmul_8p] + lea r4, [5*r1] + lea r5, [5*r3] +%else + mova %2, [hmul_16p] + lea r4, [3*r1] + lea r5, [3*r3] %endif + pxor %1, %1 +%endmacro + +%define TRANS TRANS_SSE4 +INIT_YMM avx2 +cglobal pixel_satd_16x8_internal + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_16x16, 4,6,8 + SATD_START_AVX2 m6, m7 + call pixel_satd_16x8_internal + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +pixel_satd_16x8_internal: + call pixel_satd_16x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_16x8, 4,6,8 + SATD_START_AVX2 m6, m7 + jmp pixel_satd_16x8_internal + +cglobal pixel_satd_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 + ret + +cglobal pixel_satd_8x16, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET + +cglobal pixel_satd_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_satd_8x8_internal + vextracti128 xm0, m6, 1 + paddw xm0, xm6 + SATD_END_SSE2 xm0 + RET -%define ABS1 ABS1_MMX -%define ABS2 ABS2_MMX -SATDS_SSE2 sse2 -SA8D_16x16_32 sse2 -INTRA_SA8D_SSE2 sse2 -INTRA_SATDS_MMX mmxext -HADAMARD_AC_SSE2 sse2 -%define ABS1 ABS1_SSSE3 -%define ABS2 ABS2_SSSE3 -%define ABS_MOV ABS_MOV_SSSE3 -SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. 
-SATDS_SSE2 ssse3 -SA8D_16x16_32 ssse3 -INTRA_SA8D_SSE2 ssse3 -INTRA_SATDS_MMX ssse3 -HADAMARD_AC_SSE2 ssse3 -SATDS_SSE2 ssse3_phadd +cglobal pixel_sa8d_8x8_internal + LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 + HADAMARD4_V 0, 1, 2, 3, 4 + HADAMARD 8, sumsub, 0, 1, 4, 5 + HADAMARD 8, sumsub, 2, 3, 4, 5 + HADAMARD 2, sumsub, 0, 1, 4, 5 + HADAMARD 2, sumsub, 2, 3, 4, 5 + HADAMARD 1, amax, 0, 1, 4, 5 + HADAMARD 1, amax, 2, 3, 4, 5 + paddw m6, m0 + paddw m6, m2 + ret +cglobal pixel_sa8d_8x8, 4,6,8 + SATD_START_AVX2 m6, m7, 1 + call pixel_sa8d_8x8_internal + vextracti128 xm1, m6, 1 + paddw xm6, xm1 + HADDW xm6, xm1 + movd eax, xm6 + add eax, 1 + shr eax, 1 + RET +cglobal intra_sad_x9_8x8, 5,7,8 + %define pred(i,j) [rsp+i*0x40+j*0x20] + + mov r6, rsp + and rsp, ~31 + sub rsp, 0x240 + movu m5, [r0+0*FENC_STRIDE] + movu m6, [r0+4*FENC_STRIDE] + punpcklqdq m5, [r0+2*FENC_STRIDE] + punpcklqdq m6, [r0+6*FENC_STRIDE] + + ; save instruction size: avoid 4-byte memory offsets + lea r0, [intra8x9_h1+128] + %define off(m) (r0+m-(intra8x9_h1+128)) + + vpbroadcastq m0, [r2+16] + psadbw m4, m0, m5 + psadbw m2, m0, m6 + mova pred(0,0), m0 + mova pred(0,1), m0 + paddw m4, m2 + + vpbroadcastq m1, [r2+7] + pshufb m3, m1, [off(intra8x9_h1)] + pshufb m2, m1, [off(intra8x9_h3)] + mova pred(1,0), m3 + mova pred(1,1), m2 + psadbw m3, m5 + psadbw m2, m6 + paddw m3, m2 + + lea r5, [rsp+0x100] + %define pred(i,j) [r5+i*0x40+j*0x20-0x100] + + ; combine the first two + pslldq m3, 2 + por m4, m3 + + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + pshufb m0, m2 + mova pred(2,0), m0 + mova pred(2,1), m0 + psadbw m3, m0, m5 + psadbw m2, m0, m6 + paddw m3, m2 + + pslldq m3, 4 + por m4, m3 + + vbroadcasti128 m0, [r2+16] + vbroadcasti128 m2, [r2+17] + pslldq m1, m0, 1 + pavgb m3, m0, m2 + PRED4x4_LOWPASS m0, m1, m2, m0, m7 + pshufb m1, m0, [off(intra8x9_ddl1)] + pshufb m2, m0, [off(intra8x9_ddl3)] + mova pred(3,0), m1 + mova pred(3,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova [r4], xm4 + + ; for later + vinserti128 m7, m3, xm0, 1 + + vbroadcasti128 m2, [r2+8] + vbroadcasti128 m0, [r2+7] + vbroadcasti128 m1, [r2+6] + pavgb m3, m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m4 + pshufb m1, m0, [off(intra8x9_ddr1)] + pshufb m2, m0, [off(intra8x9_ddr3)] + mova pred(4,0), m1 + mova pred(4,1), m2 + psadbw m4, m1, m5 + psadbw m2, m6 + paddw m4, m2 + + add r0, 256 + add r5, 0xC0 + %define off(m) (r0+m-(intra8x9_h1+256+128)) + %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] + + vpblendd m2, m3, m0, 11110011b + pshufb m1, m2, [off(intra8x9_vr1)] + pshufb m2, m2, [off(intra8x9_vr3)] + mova pred(5,0), m1 + mova pred(5,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 2 + por m4, m1 + + psrldq m2, m3, 4 + pblendw m2, m0, q3330 + punpcklbw m0, m3 + pshufb m1, m2, [off(intra8x9_hd1)] + pshufb m2, m0, [off(intra8x9_hd3)] + mova pred(6,0), m1 + mova pred(6,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 4 + por m4, m1 + + pshufb m1, m7, [off(intra8x9_vl1)] + pshufb m2, m7, [off(intra8x9_vl3)] + mova pred(7,0), m1 + mova pred(7,1), m2 + psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + + pslldq m1, 6 + por m4, m1 + vextracti128 xm1, m4, 1 + paddw xm4, xm1 + mova xm3, [r4] + SBUTTERFLY qdq, 3, 4, 7 + paddw xm3, xm4 + + pslldq m1, m0, 1 + vpbroadcastd m0, [r2+7] + palignr m0, m1, 1 + pshufb m1, m0, [off(intra8x9_hu1)] + pshufb m2, m0, [off(intra8x9_hu3)] + mova pred(8,0), m1 + mova pred(8,1), m2 
+ psadbw m1, m5 + psadbw m2, m6 + paddw m1, m2 + vextracti128 xm2, m1, 1 + paddw xm1, xm2 + movhlps xm2, xm1 + paddw xm1, xm2 + movd r2d, xm1 + + paddw xm3, [r3] + mova [r4], xm3 + add r2w, word [r3+16] + mov [r4+16], r2w + + phminposuw xm3, xm3 + movd r3d, xm3 + add r2d, 8<<16 + cmp r3w, r2w + cmovg r3d, r2d + + mov r2d, r3d + shr r3, 16 + shl r3, 6 + add r1, 4*FDEC_STRIDE + mova xm0, [rsp+r3+0x00] + mova xm1, [rsp+r3+0x10] + mova xm2, [rsp+r3+0x20] + mova xm3, [rsp+r3+0x30] + movq [r1+FDEC_STRIDE*-4], xm0 + movhps [r1+FDEC_STRIDE*-2], xm0 + movq [r1+FDEC_STRIDE*-3], xm1 + movhps [r1+FDEC_STRIDE*-1], xm1 + movq [r1+FDEC_STRIDE* 0], xm2 + movhps [r1+FDEC_STRIDE* 2], xm2 + movq [r1+FDEC_STRIDE* 1], xm3 + movhps [r1+FDEC_STRIDE* 3], xm3 + mov rsp, r6 + mov eax, r2d + RET +%endif ; HIGH_BIT_DEPTH ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- -; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, -; const uint8_t *pix2, int stride2, int sums[2][4] ) +; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, +; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 - pxor m0, m0 - pxor m1, m1 - pxor m2, m2 - pxor m3, m3 - pxor m4, m4 -%rep 4 - movq m5, [r0] - movq m6, [r2] +%macro SSIM_ITER 1 +%if HIGH_BIT_DEPTH + movdqu m5, [r0+(%1&1)*r1] + movdqu m6, [r2+(%1&1)*r3] +%else + movq m5, [r0+(%1&1)*r1] + movq m6, [r2+(%1&1)*r3] punpcklbw m5, m0 punpcklbw m6, m0 +%endif +%if %1==1 + lea r0, [r0+r1*2] + lea r2, [r2+r3*2] +%endif +%if %1==0 + movdqa m1, m5 + movdqa m2, m6 +%else paddw m1, m5 paddw m2, m6 - movdqa m7, m5 +%endif + pmaddwd m7, m5, m6 pmaddwd m5, m5 - pmaddwd m7, m6 pmaddwd m6, m6 - paddd m3, m5 - paddd m4, m7 + ACCUM paddd, 3, 5, %1 + ACCUM paddd, 4, 7, %1 paddd m3, m6 - add r0, r1 - add r2, r3 -%endrep +%endmacro + +%macro SSIM 0 +cglobal pixel_ssim_4x4x2_core, 4,4,8 + FIX_STRIDES r1, r3 + pxor m0, m0 + SSIM_ITER 0 + SSIM_ITER 1 + SSIM_ITER 2 + SSIM_ITER 3 ; PHADDW m1, m2 ; PHADDD m3, m4 - movdqa m7, [pw_1 GLOBAL] - pshufd m5, m3, 0xb1 + movdqa m7, [pw_1] + pshufd m5, m3, q2301 pmaddwd m1, m7 pmaddwd m2, m7 - pshufd m6, m4, 0xb1 + pshufd m6, m4, q2301 packssdw m1, m2 paddd m3, m5 - pshufd m1, m1, 0xd8 + pshufd m1, m1, q3120 paddd m4, m6 pmaddwd m1, m7 - movdqa m5, m3 + punpckhdq m5, m3, m4 punpckldq m3, m4 - punpckhdq m5, m4 -%ifdef ARCH_X86_64 +%if UNIX64 %define t0 r4 %else - %define t0 eax - mov t0, r4m + %define t0 rax + mov t0, r4mp %endif movq [t0+ 0], m1 movq [t0+ 8], m3 - psrldq m1, 8 - movq [t0+16], m1 + movhps [t0+16], m1 movq [t0+24], m5 RET ;----------------------------------------------------------------------------- -; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) +; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_end4_sse2, 3,3 +cglobal pixel_ssim_end4, 3,3,7 movdqa m0, [r0+ 0] movdqa m1, [r0+16] movdqa m2, [r0+32] @@ -1682,14 +4692,33 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3 paddd m1, m2 paddd m2, m3 paddd m3, m4 - movdqa m5, [ssim_c1 GLOBAL] - movdqa m6, [ssim_c2 GLOBAL] + movdqa m5, [ssim_c1] + movdqa m6, [ssim_c2] TRANSPOSE4x4D 0, 1, 2, 3, 4 ; s1=m0, s2=m1, ss=m2, 
s12=m3 +%if BIT_DEPTH == 10 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + mulps m2, [pf_64] ; ss*64 + mulps m3, [pf_128] ; s12*128 movdqa m4, m1 + mulps m4, m0 ; s1*s2 + mulps m1, m1 ; s2*s2 + mulps m0, m0 ; s1*s1 + addps m4, m4 ; s1*s2*2 + addps m0, m1 ; s1*s1 + s2*s2 + subps m2, m0 ; vars + subps m3, m4 ; covar*2 + addps m4, m5 ; s1*s2*2 + ssim_c1 + addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1 + addps m2, m6 ; vars + ssim_c2 + addps m3, m6 ; covar*2 + ssim_c2 +%else + pmaddwd m4, m1, m0 ; s1*s2 pslld m1, 16 - pmaddwd m4, m0 ; s1*s2 por m0, m1 pmaddwd m0, m0 ; s1*s1 + s2*s2 pslld m4, 1 @@ -1705,6 +4734,7 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) +%endif mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim @@ -1713,266 +4743,369 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2 %ifdef PIC - lea r3, [mask_ff + 16 GLOBAL] + lea r3, [mask_ff + 16] movdqu m1, [r3 + r2*4] %else - movdqu m1, [mask_ff + r2*4 + 16 GLOBAL] + movdqu m1, [mask_ff + r2*4 + 16] %endif pand m4, m1 .skip: movhlps m0, m4 addps m0, m4 - pshuflw m4, m0, 0xE + pshuflw m4, m0, q0032 addss m0, m4 -%ifndef ARCH_X86_64 +%if ARCH_X86_64 == 0 movd r0m, m0 fld dword r0m %endif RET +%endmacro ; SSIM +INIT_XMM sse2 +SSIM +INIT_XMM avx +SSIM + +;----------------------------------------------------------------------------- +; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +;----------------------------------------------------------------------------- +%macro ASD8 0 +cglobal pixel_asd8, 5,5 + pxor m0, m0 + pxor m1, m1 +.loop: +%if HIGH_BIT_DEPTH + paddw m0, [r0] + paddw m1, [r2] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + paddw m0, [r0] + paddw m1, [r2+4*r3] + lea r2, [r2+4*r3] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%else + movq m2, [r0] + movq m3, [r2] + movhps m2, [r0+r1] + movhps m3, [r2+r3] + lea r0, [r0+2*r1] + psadbw m2, m1 + psadbw m3, m1 + movq m4, [r0] + movq m5, [r2+2*r3] + lea r2, [r2+2*r3] + movhps m4, [r0+r1] + movhps m5, [r2+r3] + lea r0, [r0+2*r1] + paddw m0, m2 + psubw m0, m3 + psadbw m4, m1 + psadbw m5, m1 + lea r2, [r2+2*r3] + paddw m0, m4 + psubw m0, m5 +%endif + sub r4d, 4 + jg .loop +%if HIGH_BIT_DEPTH + psubw m0, m1 + HADDW m0, m1 + ABSD m1, m0 +%else + movhlps m1, m0 + paddw m0, m1 + ABSW m1, m0 +%endif + movd eax, m1 + RET +%endmacro +INIT_XMM sse2 +ASD8 +INIT_XMM ssse3 +ASD8 +%if HIGH_BIT_DEPTH +INIT_XMM xop +ASD8 +%endif ;============================================================================= ; Successive Elimination ADS ;============================================================================= -%macro ADS_START 1 ; unroll_size -%ifdef ARCH_X86_64 - %define t0 r6 - mov r10, rsp +%macro ADS_START 0 +%if UNIX64 + movsxd r5, r5d %else - %define t0 r4 - mov rbp, rsp + mov r5d, r5m %endif - mov r0d, r5m - sub rsp, r0 - sub rsp, %1*4-1 - and rsp, ~15 - mov t0, rsp + mov r0d, r5d + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) shl r2d, 1 %endmacro -%macro ADS_END 1 +%macro ADS_END 1 ; unroll_size add r1, 8*%1 add r3, 8*%1 - add t0, 4*%1 - sub r0d, 4*%1 + add r6, 4*%1 + sub r0d, 4*%1 jg .loop - jmp ads_mvs + WIN64_RESTORE_XMM rsp +%if mmsize==32 + vzeroupper +%endif + lea r6, [r4+r5+(mmsize-1)] + and r6, ~(mmsize-1) +%if cpuflag(ssse3) + jmp 
ads_mvs_ssse3 +%else + jmp ads_mvs_mmx +%endif %endmacro -%define ABS1 ABS1_MMX - ;----------------------------------------------------------------------------- -; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta, -; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) +; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta, +; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_ads4_mmxext, 4,7 - movq mm6, [r0] - movq mm4, [r0+8] - pshufw mm7, mm6, 0 - pshufw mm6, mm6, 0xAA - pshufw mm5, mm4, 0 - pshufw mm4, mm4, 0xAA - ADS_START 1 +INIT_MMX mmx2 +cglobal pixel_ads4, 5,7 + mova m6, [r0] + mova m4, [r0+8] + pshufw m7, m6, 0 + pshufw m6, m6, q2222 + pshufw m5, m4, 0 + pshufw m4, m4, q2222 + ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+16] - psubw mm0, mm7 - psubw mm1, mm6 - ABS1 mm0, mm2 - ABS1 mm1, mm3 - movq mm2, [r1+r2] - movq mm3, [r1+r2+16] - psubw mm2, mm5 - psubw mm3, mm4 - paddw mm0, mm1 - ABS1 mm2, mm1 - ABS1 mm3, mm1 - paddw mm0, mm2 - paddw mm0, mm3 -%ifdef ARCH_X86_64 - pshufw mm1, [r10+8], 0 -%else - pshufw mm1, [ebp+stack_offset+28], 0 -%endif - paddusw mm0, [r3] - psubusw mm1, mm0 - packsswb mm1, mm1 - movd [t0], mm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + pshufw m1, r6m, 0 + paddusw m0, [r3] + psubusw m1, m0 + packsswb m1, m1 + movd [r6], m1 ADS_END 1 -cglobal x264_pixel_ads2_mmxext, 4,7 - movq mm6, [r0] - pshufw mm5, r6m, 0 - pshufw mm7, mm6, 0 - pshufw mm6, mm6, 0xAA - ADS_START 1 +cglobal pixel_ads2, 5,7 + mova m6, [r0] + pshufw m5, r6m, 0 + pshufw m7, m6, 0 + pshufw m6, m6, q2222 + ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+r2] - psubw mm0, mm7 - psubw mm1, mm6 - ABS1 mm0, mm2 - ABS1 mm1, mm3 - paddw mm0, mm1 - paddusw mm0, [r3] - movq mm4, mm5 - psubusw mm4, mm0 - packsswb mm4, mm4 - movd [t0], mm4 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, [r3] + mova m4, m5 + psubusw m4, m0 + packsswb m4, m4 + movd [r6], m4 ADS_END 1 -cglobal x264_pixel_ads1_mmxext, 4,7 - pshufw mm7, [r0], 0 - pshufw mm6, r6m, 0 - ADS_START 2 +cglobal pixel_ads1, 5,7 + pshufw m7, [r0], 0 + pshufw m6, r6m, 0 + ADS_START .loop: - movq mm0, [r1] - movq mm1, [r1+8] - psubw mm0, mm7 - psubw mm1, mm7 - ABS1 mm0, mm2 - ABS1 mm1, mm3 - paddusw mm0, [r3] - paddusw mm1, [r3+8] - movq mm4, mm6 - movq mm5, mm6 - psubusw mm4, mm0 - psubusw mm5, mm1 - packsswb mm4, mm5 - movq [t0], mm4 + movu m0, [r1] + movu m1, [r1+8] + psubw m0, m7 + psubw m1, m7 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddusw m0, [r3] + paddusw m1, [r3+8] + mova m4, m6 + mova m5, m6 + psubusw m4, m0 + psubusw m5, m1 + packsswb m4, m5 + mova [r6], m4 ADS_END 2 -%macro ADS_SSE2 1 -cglobal x264_pixel_ads4_%1, 4,7 - movdqa xmm4, [r0] - pshuflw xmm7, xmm4, 0 - pshuflw xmm6, xmm4, 0xAA - pshufhw xmm5, xmm4, 0 - pshufhw xmm4, xmm4, 0xAA - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpckhqdq xmm5, xmm5 - punpckhqdq xmm4, xmm4 -%ifdef ARCH_X86_64 - pshuflw xmm8, r6m, 0 - punpcklqdq xmm8, xmm8 - ADS_START 2 - movdqu xmm10, [r1] - movdqu xmm11, [r1+r2] +%macro ADS_XMM 0 +%if mmsize==32 +cglobal pixel_ads4, 5,7,8 + vpbroadcastw m7, [r0+ 0] + vpbroadcastw m6, [r0+ 4] + vpbroadcastw m5, [r0+ 8] + vpbroadcastw m4, 
[r0+12] +%else +cglobal pixel_ads4, 5,7,12 + mova m4, [r0] + pshuflw m7, m4, q0000 + pshuflw m6, m4, q2222 + pshufhw m5, m4, q0000 + pshufhw m4, m4, q2222 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpckhqdq m5, m5 + punpckhqdq m4, m4 +%endif +%if ARCH_X86_64 && mmsize == 16 + movd m8, r6m + SPLATW m8, m8 + ADS_START + movu m10, [r1] + movu m11, [r1+r2] .loop: - movdqa xmm0, xmm10 - movdqu xmm1, [r1+16] - movdqa xmm10, xmm1 - psubw xmm0, xmm7 - psubw xmm1, xmm6 - ABS1 xmm0, xmm2 - ABS1 xmm1, xmm3 - movdqa xmm2, xmm11 - movdqu xmm3, [r1+r2+16] - movdqa xmm11, xmm3 - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - movdqu xmm9, [r3] - ABS1 xmm2, xmm1 - ABS1 xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - paddusw xmm0, xmm9 - movdqa xmm1, xmm8 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [t0], xmm1 -%else - ADS_START 2 + psubw m0, m10, m7 + movu m10, [r1+16] + psubw m1, m10, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + psubw m2, m11, m5 + movu m11, [r1+r2+16] + paddw m0, m1 + psubw m3, m11, m4 + movu m9, [r3] + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + paddusw m0, m9 + psubusw m1, m8, m0 +%else + ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - ABS1 xmm0, xmm2 - ABS1 xmm1, xmm3 - movdqu xmm2, [r1+r2] - movdqu xmm3, [r1+r2+16] - psubw xmm2, xmm5 - psubw xmm3, xmm4 - paddw xmm0, xmm1 - ABS1 xmm2, xmm1 - ABS1 xmm3, xmm1 - paddw xmm0, xmm2 - paddw xmm0, xmm3 - movd xmm1, [ebp+stack_offset+28] - movdqu xmm2, [r3] - pshuflw xmm1, xmm1, 0 - punpcklqdq xmm1, xmm1 - paddusw xmm0, xmm2 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [t0], xmm1 + movu m0, [r1] + movu m1, [r1+16] + psubw m0, m7 + psubw m1, m6 + ABSW m0, m0, m2 + ABSW m1, m1, m3 + movu m2, [r1+r2] + movu m3, [r1+r2+16] + psubw m2, m5 + psubw m3, m4 + paddw m0, m1 + ABSW m2, m2, m1 + ABSW m3, m3, m1 + paddw m0, m2 + paddw m0, m3 + movu m2, [r3] +%if mmsize==32 + vpbroadcastw m1, r6m +%else + movd m1, r6m + pshuflw m1, m1, 0 + punpcklqdq m1, m1 +%endif + paddusw m0, m2 + psubusw m1, m0 %endif ; ARCH - ADS_END 2 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 -cglobal x264_pixel_ads2_%1, 4,7 - movq xmm6, [r0] - movd xmm5, r6m - pshuflw xmm7, xmm6, 0 - pshuflw xmm6, xmm6, 0xAA - pshuflw xmm5, xmm5, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - punpcklqdq xmm5, xmm5 - ADS_START 2 +cglobal pixel_ads2, 5,7,8 +%if mmsize==32 + vpbroadcastw m7, [r0+0] + vpbroadcastw m6, [r0+4] + vpbroadcastw m5, r6m +%else + movq m6, [r0] + movd m5, r6m + pshuflw m7, m6, 0 + pshuflw m6, m6, q2222 + pshuflw m5, m5, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 + punpcklqdq m5, m5 +%endif + ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+r2] - psubw xmm0, xmm7 - psubw xmm1, xmm6 - movdqu xmm4, [r3] - ABS1 xmm0, xmm2 - ABS1 xmm1, xmm3 - paddw xmm0, xmm1 - paddusw xmm0, xmm4 - movdqa xmm1, xmm5 - psubusw xmm1, xmm0 - packsswb xmm1, xmm1 - movq [t0], xmm1 - ADS_END 2 + movu m0, [r1] + movu m1, [r1+r2] + psubw m0, m7 + psubw m1, m6 + movu m4, [r3] + ABSW m0, m0, m2 + ABSW m1, m1, m3 + paddw m0, m1 + paddusw m0, m4 + psubusw m1, m5, m0 + packsswb m1, m1 +%if mmsize==32 + vpermq m1, m1, q3120 + mova [r6], xm1 +%else + movh [r6], m1 +%endif + ADS_END mmsize/8 -cglobal x264_pixel_ads1_%1, 4,7 - movd xmm7, [r0] - movd xmm6, r6m - pshuflw xmm7, xmm7, 0 - pshuflw xmm6, xmm6, 0 - punpcklqdq xmm7, xmm7 - punpcklqdq xmm6, xmm6 - ADS_START 4 +cglobal pixel_ads1, 5,7,8 +%if mmsize==32 + vpbroadcastw m7, [r0] + 
vpbroadcastw m6, r6m +%else + movd m7, [r0] + movd m6, r6m + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + punpcklqdq m7, m7 + punpcklqdq m6, m6 +%endif + ADS_START .loop: - movdqu xmm0, [r1] - movdqu xmm1, [r1+16] - psubw xmm0, xmm7 - psubw xmm1, xmm7 - movdqu xmm2, [r3] - movdqu xmm3, [r3+16] - ABS1 xmm0, xmm4 - ABS1 xmm1, xmm5 - paddusw xmm0, xmm2 - paddusw xmm1, xmm3 - movdqa xmm4, xmm6 - movdqa xmm5, xmm6 - psubusw xmm4, xmm0 - psubusw xmm5, xmm1 - packsswb xmm4, xmm5 - movdqa [t0], xmm4 - ADS_END 4 -%endmacro - -ADS_SSE2 sse2 -%define ABS1 ABS1_SSSE3 -ADS_SSE2 ssse3 - -; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) + movu m0, [r1] + movu m1, [r1+mmsize] + psubw m0, m7 + psubw m1, m7 + movu m2, [r3] + movu m3, [r3+mmsize] + ABSW m0, m0, m4 + ABSW m1, m1, m5 + paddusw m0, m2 + paddusw m1, m3 + psubusw m4, m6, m0 + psubusw m5, m6, m1 + packsswb m4, m5 +%if mmsize==32 + vpermq m4, m4, q3120 +%endif + mova [r6], m4 + ADS_END mmsize/4 +%endmacro + +INIT_XMM sse2 +ADS_XMM +INIT_XMM ssse3 +ADS_XMM +INIT_XMM avx +ADS_XMM +INIT_YMM avx2 +ADS_XMM + +; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { ; int nmv=0, i, j; ; *(uint32_t*)(masks+width) = 0; @@ -1986,85 +5119,89 @@ ADS_SSE2 ssse3 ; } ; return nmv; ; } -cglobal x264_pixel_ads_mvs -ads_mvs: - xor eax, eax - xor esi, esi -%ifdef ARCH_X86_64 + +%macro TEST 1 + mov [r4+r0*2], r1w + test r2d, 0xff<<(%1*8) + setne r3b + add r0d, r3d + inc r1d +%endmacro + +INIT_MMX mmx +cglobal pixel_ads_mvs, 0,7,0 +ads_mvs_mmx: ; mvs = r4 - ; masks = rsp + ; masks = r6 ; width = r5 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.) - mov dword [rsp+r5], 0 + xor r0d, r0d + xor r1d, r1d + mov [r6+r5], r0d jmp .loopi +ALIGN 16 .loopi0: - add esi, 8 - cmp esi, r5d + add r1d, 8 + cmp r1d, r5d jge .end .loopi: - mov rdi, [rsp+rsi] - test rdi, rdi + mov r2, [r6+r1] +%if ARCH_X86_64 + test r2, r2 +%else + mov r3, r2 + add r3d, [r6+r1+4] +%endif jz .loopi0 - xor ecx, ecx -%macro TEST 1 - mov [r4+rax*2], si - test edi, 0xff<<(%1*8) - setne cl - add eax, ecx - inc esi -%endmacro + xor r3d, r3d TEST 0 TEST 1 TEST 2 TEST 3 - shr rdi, 32 +%if ARCH_X86_64 + shr r2, 32 +%else + mov r2d, [r6+r1] +%endif TEST 0 TEST 1 TEST 2 TEST 3 - cmp esi, r5d + cmp r1d, r5d jl .loopi .end: - mov rsp, r10 - ret + movifnidn eax, r0d + RET +INIT_XMM ssse3 +cglobal pixel_ads_mvs, 0,7,0 +ads_mvs_ssse3: + mova m3, [pw_8] + mova m4, [pw_76543210] + pxor m5, m5 + add r5, r6 + xor r0d, r0d ; nmv + mov [r5], r0d +%ifdef PIC + lea r1, [$$] + %define GLOBAL +r1-$$ %else - ; no PROLOGUE, inherit from x264_pixel_ads1 - mov ebx, [ebp+stack_offset+20] ; mvs - mov edi, [ebp+stack_offset+24] ; width - mov dword [esp+edi], 0 - push ebp - jmp .loopi -.loopi0: - add esi, 8 - cmp esi, edi - jge .end -.loopi: - mov ebp, [esp+esi+4] - mov edx, [esp+esi+8] - mov ecx, ebp - or ecx, edx - jz .loopi0 - xor ecx, ecx -%macro TEST 2 - mov [ebx+eax*2], si - test %2, 0xff<<(%1*8) - setne cl - add eax, ecx - inc esi -%endmacro - TEST 0, ebp - TEST 1, ebp - TEST 2, ebp - TEST 3, ebp - TEST 0, edx - TEST 1, edx - TEST 2, edx - TEST 3, edx - cmp esi, edi - jl .loopi -.end: - pop esp + %define GLOBAL +%endif +.loop: + movh m0, [r6] + pcmpeqb m0, m5 + pmovmskb r2d, m0 + xor r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions) + movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt + add r2d, r2d + ; shuffle counters based on mv mask + pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL] + movu [r4+r0*2], m2 + add r0d, r3d + paddw 
m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7} + add r6, 8 + cmp r6, r5 + jl .loop + movifnidn eax, r0d RET -%endif ; ARCH -
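+; For reference, a scalar C sketch (illustrative only, not part of this patch) of
+; the combined behaviour of the pixel_ads4 kernels above plus the ads_mvs
+; compaction: each candidate position whose ADS cost (sum of absolute DC
+; differences) plus its mv cost stays below the threshold gets its index
+; appended to mvs[]. Types follow the prototypes in the comments above; the
+; 16-bit saturating arithmetic used by the SIMD versions is ignored here, and
+; ads4_ref is just a hypothetical name for this sketch.
+;
+;     #include <stdint.h>
+;     #include <stdlib.h>
+;
+;     static int ads4_ref( int enc_dc[4], uint16_t *sums, int delta,
+;                          uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
+;     {
+;         int nmv = 0;
+;         for( int i = 0; i < width; i++ )
+;         {
+;             int ads = abs( enc_dc[0] - sums[i] )
+;                     + abs( enc_dc[1] - sums[i+8] )
+;                     + abs( enc_dc[2] - sums[i+delta] )
+;                     + abs( enc_dc[3] - sums[i+delta+8] )
+;                     + cost_mvx[i];
+;             if( ads < thresh )
+;                 mvs[nmv++] = i;
+;         }
+;         return nmv;
+;     }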