;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
-;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
sw_f0: dq 0xfff0, 0
-sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
+pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
+
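+; 256 pshufb controls, one 16-byte entry per 8-bit mask: the first
+; popcnt(mask) word slots pick out, in ascending order, the lanes whose
+; mask bit is set. ads_mvs_ssse3 uses this to compact the matching mv
+; indices from the pw_76543210-based lane counters.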
+ads_mvs_shuffle:
+%macro ADS_MVS_SHUFFLE 8
+ %assign y x
+ %rep 8
+ %rep 7
+ %rotate (~y)&1
+ %assign y y>>((~y)&1)
+ %endrep
+ db %1*2, %1*2+1
+ %rotate 1
+ %assign y y>>1
+ %endrep
+%endmacro
+%assign x 0
+%rep 256
+ ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
+%assign x x+1
+%endrep
+
SECTION .text
cextern pb_0
cextern pw_pmmpzzzz
cextern pd_1
cextern hsub_mul
+cextern popcnt_table
;=============================================================================
; SSD
mov r4d, %%n
%endif
pxor m0, m0
-.loop
+.loop:
mova m1, [r0]
mova m2, [r0+offset0_1]
mova m3, [r0+offset0_2]
psubw m1, [r2+r6+mmsize]
PSHUFLW m0, m0, q3120
PSHUFLW m1, m1, q3120
-%if mmsize==16
+%if mmsize >= 16
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%endif
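+; xop's pmadcswd fuses the pmaddwd and the accumulating paddd into one instruction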
+%if cpuflag(xop)
+ pmadcswd m2, m0, m0, m2
+ pmadcswd m3, m1, m1, m3
+%else
pmaddwd m0, m0
pmaddwd m1, m1
paddd m2, m0
paddd m3, m1
+%endif
add r6, 2*mmsize
jl .loopx
-%if mmsize==16 ; using HADDD would remove the mmsize/32 part from the
- ; equation above, putting the width limit at 8208
+%if mmsize == 32 ; avx2 may overread by 32 bytes, which has to be handled
+ jz .no_overread
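+    ; r6 > 0 here means the last iteration's second load ran past the end
+    ; of the row, so its pmaddwd term (m1) is backed out of the sum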
+ psubd m3, m1
+.no_overread:
+%endif
+%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
+ ; equation above, putting the width limit at 8208
punpckhdq m0, m2, m6
punpckhdq m1, m3, m6
punpckldq m2, m6
jg .loopy
mov r3, r6m
mov r4, r7m
-%if mmsize==16
- movq [r3], m4
- movhps [r4], m4
+%if mmsize == 32
+ vextracti128 xm0, m4, 1
+ paddq xm4, xm0
+%endif
+%if mmsize >= 16
+ movq [r3], xm4
+ movhps [r4], xm4
%else ; fixup for mmx2
SBUTTERFLY dq, 4, 5, 0
mova m0, m4
;-----------------------------------------------------------------------------
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7
- shl r4d, 1
+ add r4d, r4d
add r0, r4
add r2, r4
pxor m3, m3
mov r6, r4
neg r6
.loopx:
- mova m0, [r0+r6]
+%if mmsize == 32 ; only 16-byte alignment is guaranteed
+ movu m2, [r0+r6]
+ movu m1, [r2+r6]
+%else
+ mova m2, [r0+r6]
mova m1, [r2+r6]
- psubusb m0, m1
- psubusb m1, [r0+r6]
+%endif
+ psubusb m0, m2, m1
+ psubusb m1, m2
por m0, m1
psrlw m2, m0, 8
pand m0, m5
+%if cpuflag(xop)
+ pmadcswd m4, m2, m2, m4
+ pmadcswd m3, m0, m0, m3
+%else
pmaddwd m2, m2
pmaddwd m0, m0
- paddd m3, m0
paddd m4, m2
+ paddd m3, m0
+%endif
add r6, mmsize
jl .loopx
+%if mmsize == 32 ; avx2 may overread by 16 bytes, which has to be handled
+ jz .no_overread
+ pcmpeqb xm1, xm1
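+    ; xm1 = all-ones in the low 128 bits only, so pandn keeps just the
+    ; overread upper lane, which is then subtracted from the sums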
+ pandn m0, m1, m0 ; zero the lower half
+ pandn m2, m1, m2
+ psubd m3, m0
+ psubd m4, m2
+.no_overread:
+%endif
add r0, r1
add r2, r3
dec r5d
jg .loopy
mov r3, r6m
mov r4, r7m
- mova m5, [sq_0f]
HADDD m3, m0
HADDD m4, m0
- pand m3, m5
- pand m4, m5
- movq [r3], m3
- movq [r4], m4
+ pxor xm0, xm0
+ punpckldq xm3, xm0
+ punpckldq xm4, xm0
+ movq [r3], xm3
+ movq [r4], xm4
RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
SSD_NV12
INIT_XMM avx
SSD_NV12
+INIT_XMM xop
+SSD_NV12
+INIT_YMM avx2
+SSD_NV12
;=============================================================================
; variance
%endmacro
%macro VAR_END 2
-%if HIGH_BIT_DEPTH
-%if mmsize == 8 && %1*%2 == 256
+%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
HADDW m5, m2
%endif
-%else ; !HIGH_BIT_DEPTH
- HADDW m5, m2
-%endif ; HIGH_BIT_DEPTH
HADDD m6, m1
%if ARCH_X86_64
punpckldq m5, m6
mova m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
mova m0, [r0]
- punpckhbw m1, m0, m7
mova m3, [r0+%1]
- mova m4, m3
+ punpckhbw m1, m0, m7
punpcklbw m0, m7
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
-%if HIGH_BIT_DEPTH == 0
- punpcklbw m3, m7
- punpckhbw m4, m7
-%endif ; !HIGH_BIT_DEPTH
VAR_CORE
dec r2d
jg .loop
VAR
INIT_XMM xop
VAR
+%endif ; !HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal pixel_var_16x16, 2,4,7
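+    ; FIX_STRIDES doubles r1 for HIGH_BIT_DEPTH (pixels are 16-bit)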
+ FIX_STRIDES r1
VAR_START 0
mov r2d, 4
lea r3, [r1*3]
.loop:
+%if HIGH_BIT_DEPTH
+ mova m0, [r0]
+ mova m3, [r0+r1]
+ mova m1, [r0+r1*2]
+ mova m4, [r0+r3]
+%else
pmovzxbw m0, [r0]
pmovzxbw m3, [r0+r1]
pmovzxbw m1, [r0+r1*2]
pmovzxbw m4, [r0+r3]
+%endif
lea r0, [r0+r1*4]
VAR_CORE
dec r2d
movd edx, xm6
%endif
RET
-%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 3
HADDW %2, xm1
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
- %xdefine %%n n%1
+ %xdefine %%n nn%1
%assign offset %2*SIZEOF_PIXEL
LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
%macro SATDS_SSE2 0
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
-%if vertical==0 || HIGH_BIT_DEPTH
+%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
; clobber: m1..m3
%macro SUM4x3 3 ; dc, left, top
movq m4, %2
-%ifid %1
+%ifnum sizeof%1
movq m5, %1
%else
movd m5, %1
%macro PRED4x4_LOWPASS 5
-%ifid %5
+%ifnum sizeof%5
pavgb %5, %2, %3
pxor %3, %2
pand %3, [pb_1]
psignw m%1, [pw_pmpmpmpm]
paddw m0, m%1
psllw m0, 2 ; hadamard(top), hadamard(left)
- movhlps m3, m0
+ MOVHL m3, m0
pshufb m1, m0, [intrax9b_v1]
pshufb m2, m0, [intrax9b_v2]
paddw m0, m3
SBUTTERFLY qdq, 3, 0, 2
paddw m3, m0
%endif
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
%if cpuflag(xop)
vphaddwq m3, m3
movddup m0, m2
pshufd m1, m2, q3232
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
call .satd_8x4 ; ddr, ddl
movddup m2, m5
pshufd m3, m5, q3232
psubw m3, m11
SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
- pshufd m1, m0, q0032
-%else
- movhlps m1, m0
-%endif
+ MOVHL m1, m0
paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
ret
movddup m0, m2
pshufd m1, m2, q3232
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
pmaddubsw m0, m7
pmaddubsw m1, m7
pmaddubsw m2, m7
mova m3, [pred_buf+0x30]
mova m1, [pred_buf+0x20]
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
movq [spill+0x08], m0
movddup m0, m1
- movhlps m1, m1
+ punpckhqdq m1, m1
call .satd_8x4 ; vr, vl
mova m3, [pred_buf+0x50]
mova m1, [pred_buf+0x40]
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
movq [spill+0x10], m0
movddup m0, m1
- movhlps m1, m1
+ punpckhqdq m1, m1
call .satd_8x4 ; hd, hu
movq [spill+0x18], m0
mova m1, [spill+0x20]
psubw m3, [fenc_buf+0x30]
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
- pshufd m1, m0, q0032
-%else
- movhlps m1, m0
-%endif
+ MOVHL m1, m0
paddd xmm0, m0, m1
ret
%endif ; ARCH
%endmacro ; INTRA_X9
-
-
%macro INTRA8_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
movd [r4+0], m0
psadbw m2, fenc57
paddw m1, m3
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+2], m1
movhps m0, [r2+16]
pxor m2, m2
psadbw m0, m2
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
psrlw m0, 3
pavgw m0, m2
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
movd [r4+4], m0
mova pred(3,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+6], m1
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
%if cpuflag(sse4)
pextrw [r4+14], m0, 0
mova pred(4,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+8], m1
mova pred(5,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+10], m1
psadbw m3, fenc57
paddw m1, m2
paddw m1, m3
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
pslldq m1, 12
psadbw m0, fenc57
paddw m1, m2
paddw m1, m0
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd r2d, m1
paddusw m0, m0
paddusw m0, m0
paddw m0, [off(pw_s00112233)]
- movhlps m1, m0
+ MOVHL m1, m0
pminsw m0, m1
pshuflw m1, m0, q0032
pminsw m0, m1
pmaddwd m0, [pw_1]
phaddw m10, m11
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
pshuflw m1, m0, q0032
pavgw m0, m1
; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
paddusw m0, m0
paddw m0, [off(pw_s00001111)]
- movhlps m1, m0
+ MOVHL m1, m0
pminsw m0, m1
pshuflw m1, m0, q0032
mova m2, m0
mov r6, rsp
and rsp, ~31
- SUB rsp, 0x240
+ sub rsp, 0x240
movu m5, [r0+0*FENC_STRIDE]
movu m6, [r0+4*FENC_STRIDE]
punpcklqdq m5, [r0+2*FENC_STRIDE]
paddw m1, m2
vextracti128 xm2, m1, 1
paddw xm1, xm2
- movhlps xm2, xm1
+ MOVHL xm2, xm1
paddw xm1, xm2
movd r2d, xm1
;-----------------------------------------------------------------------------
; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal pixel_ssim_end4, 3,3,7
- movdqa m0, [r0+ 0]
- movdqa m1, [r0+16]
- movdqa m2, [r0+32]
- movdqa m3, [r0+48]
- movdqa m4, [r0+64]
+cglobal pixel_ssim_end4, 2,3
+ mov r2d, r2m
+ mova m0, [r0+ 0]
+ mova m1, [r0+16]
+ mova m2, [r0+32]
+ mova m3, [r0+48]
+ mova m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
paddd m1, m2
paddd m2, m3
paddd m3, m4
- movdqa m5, [ssim_c1]
- movdqa m6, [ssim_c2]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
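+; per block: ssim = (s1*s2*2+ssim_c1)*(covar*2+ssim_c2)
+;                 / ((s1*s1+s2*s2+ssim_c1)*(vars+ssim_c2)), 4 blocks at once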
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
+ mulps m4, m0, m1 ; s1*s2
+ mulps m0, m0 ; s1*s1
+ mulps m1, m1 ; s2*s2
mulps m2, [pf_64] ; ss*64
mulps m3, [pf_128] ; s12*128
- movdqa m4, m1
- mulps m4, m0 ; s1*s2
- mulps m1, m1 ; s2*s2
- mulps m0, m0 ; s1*s1
addps m4, m4 ; s1*s2*2
addps m0, m1 ; s1*s1 + s2*s2
subps m2, m0 ; vars
subps m3, m4 ; covar*2
- addps m4, m5 ; s1*s2*2 + ssim_c1
- addps m0, m5 ; s1*s1 + s2*s2 + ssim_c1
- addps m2, m6 ; vars + ssim_c2
- addps m3, m6 ; covar*2 + ssim_c2
+ movaps m1, [ssim_c1]
+ addps m4, m1 ; s1*s2*2 + ssim_c1
+ addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
+ movaps m1, [ssim_c2]
+ addps m2, m1 ; vars + ssim_c2
+ addps m3, m1 ; covar*2 + ssim_c2
%else
pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
- paddd m0, m5
- paddd m4, m5
- paddd m3, m6
- paddd m2, m6
+ mova m1, [ssim_c1]
+ paddd m0, m1
+ paddd m4, m1
+ mova m1, [ssim_c2]
+ paddd m3, m1
+ paddd m2, m1
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
+
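+; r2 = -width: the masked load below zeroes the ssim lanes past 'width'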
%ifdef PIC
lea r3, [mask_ff + 16]
- movdqu m1, [r3 + r2*4]
+ %xdefine %%mask r3
%else
- movdqu m1, [mask_ff + r2*4 + 16]
+ %xdefine %%mask mask_ff + 16
%endif
- pand m4, m1
+%if cpuflag(avx)
+ andps m4, [%%mask + r2*4]
+%else
+ movups m0, [%%mask + r2*4]
+ andps m4, m0
+%endif
+
.skip:
movhlps m0, m4
addps m0, m4
+%if cpuflag(ssse3)
+ movshdup m4, m0
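+    ; m4 = {m0[1],m0[1],m0[3],m0[3]}; lane 1 is what the addss below needs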
+%else
pshuflw m4, m0, q0032
+%endif
addss m0, m4
%if ARCH_X86_64 == 0
- movd r0m, m0
+ movss r0m, m0
fld dword r0m
%endif
RET
HADDW m0, m1
ABSD m1, m0
%else
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
ABSW m1, m0
%endif
%endif
lea r6, [r4+r5+(mmsize-1)]
and r6, ~(mmsize-1)
- jmp ads_mvs
+%if cpuflag(ssse3)
+ jmp ads_mvs_ssse3
+%else
+ jmp ads_mvs_mmx
+%endif
%endmacro
;-----------------------------------------------------------------------------
inc r1d
%endmacro
-INIT_MMX
+INIT_MMX mmx
cglobal pixel_ads_mvs, 0,7,0
-ads_mvs:
+ads_mvs_mmx:
; mvs = r4
; masks = r6
; width = r5
.end:
movifnidn eax, r0d
RET
+
+INIT_XMM ssse3
+cglobal pixel_ads_mvs, 0,7,0
+ads_mvs_ssse3:
+ mova m3, [pw_8]
+ mova m4, [pw_76543210]
+ pxor m5, m5
+ add r5, r6
+ xor r0d, r0d ; nmv
+ mov [r5], r0d
+%ifdef PIC
+ lea r1, [$$]
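+    ; rip-relative addressing can't also take an index register, so keep a base in r1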
+ %define GLOBAL +r1-$$
+%else
+ %define GLOBAL
+%endif
+.loop:
+ movh m0, [r6]
+ pcmpeqb m0, m5
+ pmovmskb r2d, m0
+    xor r2d, 0xffff ; skipping the store when r2d is zero is slower (branch mispredictions)
+ movzx r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
+ add r2d, r2d
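+    ; r2d*2 combined with the *8 scale below = mask*16, the table entry stride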
+ ; shuffle counters based on mv mask
+ pshufb m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
+ movu [r4+r0*2], m2
+ add r0d, r3d
+ paddw m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
+ add r6, 8
+ cmp r6, r5
+ jl .loop
+ movifnidn eax, r0d
+ RET