psignw m%1, [pw_pmpmpmpm]
paddw m0, m%1
psllw m0, 2 ; hadamard(top), hadamard(left)
- movhlps m3, m0
+ MOVHL m3, m0
pshufb m1, m0, [intrax9b_v1]
pshufb m2, m0, [intrax9b_v2]
paddw m0, m3
SBUTTERFLY qdq, 3, 0, 2
paddw m3, m0
%endif
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
%if cpuflag(xop)
vphaddwq m3, m3
movddup m0, m2
pshufd m1, m2, q3232
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
call .satd_8x4 ; ddr, ddl
movddup m2, m5
pshufd m3, m5, q3232
psubw m3, m11
SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
- pshufd m1, m0, q0032
-%else
- movhlps m1, m0
-%endif
+ MOVHL m1, m0
paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
ret
movddup m0, m2
pshufd m1, m2, q3232
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
pmaddubsw m0, m7
pmaddubsw m1, m7
pmaddubsw m2, m7
mova m3, [pred_buf+0x30]
mova m1, [pred_buf+0x20]
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
movq [spill+0x08], m0
movddup m0, m1
- movhlps m1, m1
+ punpckhqdq m1, m1
call .satd_8x4 ; vr, vl
mova m3, [pred_buf+0x50]
mova m1, [pred_buf+0x40]
movddup m2, m3
- movhlps m3, m3
+ punpckhqdq m3, m3
movq [spill+0x10], m0
movddup m0, m1
- movhlps m1, m1
+ punpckhqdq m1, m1
call .satd_8x4 ; hd, hu
movq [spill+0x18], m0
mova m1, [spill+0x20]
psubw m3, [fenc_buf+0x30]
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
- pshufd m1, m0, q0032
-%else
- movhlps m1, m0
-%endif
+ MOVHL m1, m0
paddd xmm0, m0, m1
ret
%endif ; ARCH
%endmacro ; INTRA_X9
-
-
%macro INTRA8_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
movd [r4+0], m0
psadbw m2, fenc57
paddw m1, m3
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+2], m1
movhps m0, [r2+16]
pxor m2, m2
psadbw m0, m2
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
psrlw m0, 3
pavgw m0, m2
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
movd [r4+4], m0
mova pred(3,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+6], m1
paddw m1, m2
paddw m0, m3
paddw m0, m1
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
%if cpuflag(sse4)
pextrw [r4+14], m0, 0
mova pred(4,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+8], m1
mova pred(5,3), m2
psadbw m2, fenc57
paddw m1, m2
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd [r4+10], m1
psadbw m3, fenc57
paddw m1, m2
paddw m1, m3
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
pslldq m1, 12
psadbw m0, fenc57
paddw m1, m2
paddw m1, m0
- movhlps m2, m1
+ MOVHL m2, m1
paddw m1, m2
movd r2d, m1
paddusw m0, m0
paddusw m0, m0
paddw m0, [off(pw_s00112233)]
- movhlps m1, m0
+ MOVHL m1, m0
pminsw m0, m1
pshuflw m1, m0, q0032
pminsw m0, m1
pmaddwd m0, [pw_1]
phaddw m10, m11
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
pshuflw m1, m0, q0032
pavgw m0, m1
; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
paddusw m0, m0
paddw m0, [off(pw_s00001111)]
- movhlps m1, m0
+ MOVHL m1, m0
pminsw m0, m1
pshuflw m1, m0, q0032
mova m2, m0
paddw m1, m2
vextracti128 xm2, m1, 1
paddw xm1, xm2
- movhlps xm2, xm1
+ MOVHL xm2, xm1
paddw xm1, xm2
movd r2d, xm1
HADDW m0, m1
ABSD m1, m0
%else
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
ABSW m1, m0
%endif
;=============================================================================
%macro SAD_END_SSE2 0
- movhlps m1, m0
+ MOVHL m1, m0
paddw m0, m1
movd eax, m0
RET
sub r2d, 2
jg .loop
.end:
- movhlps m1, m0
+ MOVHL m1, m0
;max sum: 31*16*255(pixel_max)=126480
paddd m0, m1
movd eax, m0
paddw xmm1, xmm2
paddw xmm1, xmm3
paddw xmm1, xmm4
- movhlps xmm0, xmm1
+ MOVHL xmm0, xmm1
paddw xmm1, xmm0
movd [r2], xmm1
%else
%if mmsize==16
pslldq m3, 4
por m3, m2
- movhlps m1, m3
+ MOVHL m1, m3
paddw m3, m1
movq [r2+0], m3
- movhlps m1, m4
+ MOVHL m1, m4
paddw m4, m1
%else
movd [r2+0], m2
cglobal intra_sad_x3_16x16, 3,5,6
pxor xm0, xm0
psadbw xm0, [r1-FDEC_STRIDE]
- movhlps xm1, xm0
+ MOVHL xm1, xm0
paddw xm0, xm1
movd r3d, xm0
%assign x 0
add r3d, -FENC_STRIDE
jge .vloop
punpckhqdq m5, m4, m4
- movhlps xm2, xm3
+ MOVHL xm2, xm3
paddw m4, m5 ; DC / V
paddw xm3, xm2 ; H
vextracti128 xm2, m4, 1
mov r4d, %2/2
pxor xmm0, xmm0
call r5
- movhlps xmm1, xmm0
+ MOVHL xmm1, xmm0
paddw xmm0, xmm1
movd eax, xmm0
RET
pminsw %1, %3
%endmacro
+; MOVHL dst, src: copy the high qword (bits 127:64) of src into the low qword
+; of dst, choosing the cheapest encoding the active cpuflags allow.
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2 ; dst == src: in-place high-qword broadcast, 2-operand form suffices
+ punpckhqdq %1, %2
+%elif cpuflag(avx) ; AVX: 3-operand form, no read dependency on dst
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
paddd %1, %2
%endif
%if mmsize >= 16
-%if cpuflag(xop) && sizeof%1 == 16
- vphadddq %1, %1
-%endif
- movhlps %2, %1
+ MOVHL %2, %1
paddd %1, %2
%endif
-%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(xop) && sizeof%1 == 16
+ vphadddq %1, %1
+%else
PSHUFLW %2, %1, q0032
paddd %1, %2
%endif
%macro HADDW 2 ; reg, tmp
; Horizontal add of the word elements of reg; the sum ends up as a dword in
; the low element of reg. tmp is clobbered as scratch.
%if cpuflag(xop) && sizeof%1 == 16
 vphaddwq %1, %1 ; XOP: words summed into qwords in a single instruction
- movhlps %2, %1
+ MOVHL %2, %1 ; fold high qword into low qword to finish the reduction
 paddd %1, %2
%else
- pmaddwd %1, [pw_1]
- HADDD %1, %2
+ pmaddwd %1, [pw_1] ; pairwise word->dword sums (multiply-accumulate by 1)
+ HADDD %1, %2 ; then reduce the dwords horizontally
%endif
%endmacro
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwq %1, %1
- movhlps %2, %1
+ MOVHL %2, %1
paddd %1, %2
%else
HADDUWD %1, %2