From 1637239a64f3ec9a491b91202bd37097f15a253d Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Sun, 11 Oct 2015 22:32:11 +0200
Subject: [PATCH] x86: Avoid some bypass delays and false dependencies

A bypass delay of 1-3 clock cycles may occur on some CPUs when transitioning
between int and float domains, so try to avoid that if possible.
---
 common/x86/deblock-a.asm |  6 ++--
 common/x86/mc-a.asm      | 10 +++++--
 common/x86/pixel-a.asm   | 60 +++++++++++++++++-----------------------
 common/x86/predict-a.asm |  4 +--
 common/x86/sad-a.asm     | 16 +++++------
 common/x86/x86util.asm   | 29 +++++++++++++------
 6 files changed, 66 insertions(+), 59 deletions(-)

diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index a579e6da..7d1ae16b 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -869,8 +869,8 @@ DEBLOCK_LUMA_INTRA
     mova m2, m0
     punpckldq m0, m1
     punpckhdq m2, m1
-    movhlps m1, m0
-    movhlps m3, m2
+    MOVHL m1, m0
+    MOVHL m3, m2
 %endif
 %endmacro

@@ -883,7 +883,7 @@ DEBLOCK_LUMA_INTRA
     punpcklwd m1, m2
 %else
     punpcklwd m1, m2
-    movhlps m0, m1
+    MOVHL m0, m1
 %endif
     movd %3, m0
     movd %1, m1
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 7ce396f2..54fd9534 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -2016,7 +2016,7 @@ cglobal mc_chroma
     movhps [r1+r2], xm2
 %else
     movu m0, [r3]
-    pshufb m0, xm5
+    pshufb m0, m5
 .loop4:
     movu m1, [r3+r4]
     pshufb m1, m5
@@ -2033,13 +2033,19 @@ cglobal mc_chroma
     pmulhrsw m3, shiftround
     mova m0, m4
     packuswb m1, m3
+    movd [r0], m1
+%if cpuflag(sse4)
+    pextrd [r1], m1, 1
+    pextrd [r0+r2], m1, 2
+    pextrd [r1+r2], m1, 3
+%else
     movhlps m3, m1
-    movd [r0], xm1
     movd [r0+r2], m3
     psrldq m1, 4
     psrldq m3, 4
     movd [r1], m1
     movd [r1+r2], m3
+%endif
     lea r3, [r3+r4*2]
     lea r0, [r0+r2*2]
     lea r1, [r1+r2*2]
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index ddfd7ae5..d9acdb46 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -2676,7 +2676,7 @@ cglobal intra_satd_x3_8x8c, 0,6
     psignw m%1, [pw_pmpmpmpm]
     paddw m0, m%1
     psllw m0, 2 ; hadamard(top), hadamard(left)
-    movhlps m3, m0
+    MOVHL m3, m0
     pshufb m1, m0, [intrax9b_v1]
     pshufb m2, m0, [intrax9b_v2]
     paddw m0, m3
@@ -2713,7 +2713,7 @@ cglobal intra_satd_x3_8x8c, 0,6
     SBUTTERFLY qdq, 3, 0, 2
     paddw m3, m0
 %endif
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
 %if cpuflag(xop)
     vphaddwq m3, m3
@@ -2904,7 +2904,7 @@ cglobal intra_satd_x9_4x4, 3,4,16
     movddup m0, m2
     pshufd m1, m2, q3232
     movddup m2, m3
-    movhlps m3, m3
+    punpckhqdq m3, m3
     call .satd_8x4 ; ddr, ddl
     movddup m2, m5
     pshufd m3, m5, q3232
@@ -2956,11 +2956,7 @@ ALIGN 16
     psubw m3, m11
     SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
     pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
-    pshufd m1, m0, q0032
-%else
-    movhlps m1, m0
-%endif
+    MOVHL m1, m0
     paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
     ret

@@ -2998,7 +2994,7 @@ cglobal intra_satd_x9_4x4, 3,4,8
     movddup m0, m2
     pshufd m1, m2, q3232
     movddup m2, m3
-    movhlps m3, m3
+    punpckhqdq m3, m3
     pmaddubsw m0, m7
     pmaddubsw m1, m7
     pmaddubsw m2, m7
@@ -3010,18 +3006,18 @@ cglobal intra_satd_x9_4x4, 3,4,8
     mova m3, [pred_buf+0x30]
     mova m1, [pred_buf+0x20]
     movddup m2, m3
-    movhlps m3, m3
+    punpckhqdq m3, m3
     movq [spill+0x08], m0
     movddup m0, m1
-    movhlps m1, m1
+    punpckhqdq m1, m1
     call .satd_8x4 ; vr, vl
     mova m3, [pred_buf+0x50]
     mova m1, [pred_buf+0x40]
     movddup m2, m3
-    movhlps m3, m3
+    punpckhqdq m3, m3
     movq [spill+0x10], m0
     movddup m0, m1
-    movhlps m1, m1
+    punpckhqdq m1, m1
     call .satd_8x4 ; hd, hu
     movq [spill+0x18], m0
     mova m1, [spill+0x20]
@@ -3064,18 +3060,12 @@ ALIGN 16
     psubw m3, [fenc_buf+0x30]
     SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
     pmaddwd m0, [pw_1]
-%if cpuflag(sse4)
-    pshufd m1, m0, q0032
-%else
-    movhlps m1, m0
-%endif
+    MOVHL m1, m0
     paddd xmm0, m0, m1
     ret
 %endif ; ARCH
 %endmacro ; INTRA_X9
-
-
 %macro INTRA8_X9 0
 ;-----------------------------------------------------------------------------
 ; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
 ;-----------------------------------------------------------------------------
 cglobal intra_sad_x9_8x8, 5,6,9
@@ -3122,7 +3112,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     paddw m1, m2
     paddw m0, m3
     paddw m0, m1
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     movd [r4+0], m0

@@ -3143,7 +3133,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     psadbw m2, fenc57
     paddw m1, m3
     paddw m1, m2
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     movd [r4+2], m1

@@ -3154,7 +3144,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     movhps m0, [r2+16]
     pxor m2, m2
     psadbw m0, m2
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     psrlw m0, 3
     pavgw m0, m2
@@ -3170,7 +3160,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     paddw m1, m2
     paddw m0, m3
     paddw m0, m1
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     movd [r4+4], m0

@@ -3203,7 +3193,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     mova pred(3,3), m2
     psadbw m2, fenc57
     paddw m1, m2
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     movd [r4+6], m1

@@ -3231,7 +3221,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     paddw m1, m2
     paddw m0, m3
     paddw m0, m1
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
 %if cpuflag(sse4)
     pextrw [r4+14], m0, 0
@@ -3270,7 +3260,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     mova pred(4,3), m2
     psadbw m2, fenc57
     paddw m1, m2
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     movd [r4+8], m1

@@ -3304,7 +3294,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     mova pred(5,3), m2
     psadbw m2, fenc57
     paddw m1, m2
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     movd [r4+10], m1

@@ -3340,7 +3330,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     psadbw m3, fenc57
     paddw m1, m2
     paddw m1, m3
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
     pslldq m1, 12
@@ -3378,7 +3368,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     psadbw m0, fenc57
     paddw m1, m2
     paddw m1, m0
-    movhlps m2, m1
+    MOVHL m2, m1
     paddw m1, m2
     movd r2d, m1

@@ -3398,7 +3388,7 @@ cglobal intra_sad_x9_8x8, 5,6,9
     paddusw m0, m0
     paddusw m0, m0
     paddw m0, [off(pw_s00112233)]
-    movhlps m1, m0
+    MOVHL m1, m0
     pminsw m0, m1
     pshuflw m1, m0, q0032
     pminsw m0, m1
@@ -3626,7 +3616,7 @@ cglobal intra_sa8d_x9_8x8, 5,6,16
     pmaddwd m0, [pw_1]
     phaddw m10, m11
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     pshuflw m1, m0, q0032
     pavgw m0, m1
@@ -3648,7 +3638,7 @@ cglobal intra_sa8d_x9_8x8, 5,6,16
 ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
     paddusw m0, m0
     paddw m0, [off(pw_s00001111)]
-    movhlps m1, m0
+    MOVHL m1, m0
     pminsw m0, m1
     pshuflw m1, m0, q0032
     mova m2, m0
@@ -4578,7 +4568,7 @@ cglobal intra_sad_x9_8x8, 5,7,8
     paddw m1, m2
     vextracti128 xm2, m1, 1
     paddw xm1, xm2
-    movhlps xm2, xm1
+    MOVHL xm2, xm1
     paddw xm1, xm2
     movd r2d, xm1

@@ -4842,7 +4832,7 @@ cglobal pixel_asd8, 5,5
     HADDW m0, m1
     ABSD m1, m0
 %else
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     ABSW m1, m0
 %endif
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 6ac37554..74534c29 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -1457,7 +1457,7 @@ cglobal predict_8x8_vr, 2,2
     movhps [r0-4*FDEC_STRIDE], m3
     movhps [r0-3*FDEC_STRIDE], m0
 %if cpuflag(ssse3)
-    movhlps m3, m3
+    punpckhqdq m3, m3
     pshufb m0, [shuf_vr]
     palignr m3, m0, 13
 %else
@@ -2166,7 +2166,7 @@ cglobal predict_16x16_dc_left_core, 1,1
 %else ; !HIGH_BIT_DEPTH
     pxor m0, m0
     psadbw m0, [r0 - FDEC_STRIDE]
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     paddusw m0, %1
     psrlw m0, %2 ; dc
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index d4fc8bd0..a898ad73 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -113,7 +113,7 @@ SAD 4, 4
 ;=============================================================================

 %macro SAD_END_SSE2 0
-    movhlps m1, m0
+    MOVHL m1, m0
     paddw m0, m1
     movd eax, m0
     RET
@@ -322,7 +322,7 @@ cglobal pixel_vsad_sse2, 3,3
     sub r2d, 2
     jg .loop
 .end:
-    movhlps m1, m0
+    MOVHL m1, m0
 ;max sum: 31*16*255(pixel_max)=126480
     paddd m0, m1
     movd eax, m0
@@ -520,7 +520,7 @@ cglobal intra_sad_x3_8x8c, 3,3
     paddw xmm1, xmm2
     paddw xmm1, xmm3
     paddw xmm1, xmm4
-    movhlps xmm0, xmm1
+    MOVHL xmm0, xmm1
     paddw xmm1, xmm0
     movd [r2], xmm1
 %else
@@ -692,10 +692,10 @@ cglobal intra_sad_x3_16x16, 3,5,8
 %if mmsize==16
     pslldq m3, 4
     por m3, m2
-    movhlps m1, m3
+    MOVHL m1, m3
     paddw m3, m1
     movq [r2+0], m3
-    movhlps m1, m4
+    MOVHL m1, m4
     paddw m4, m1
 %else
     movd [r2+0], m2
@@ -716,7 +716,7 @@ INIT_YMM avx2
 cglobal intra_sad_x3_16x16, 3,5,6
     pxor xm0, xm0
     psadbw xm0, [r1-FDEC_STRIDE]
-    movhlps xm1, xm0
+    MOVHL xm1, xm0
     paddw xm0, xm1
     movd r3d, xm0
 %assign x 0
@@ -748,7 +748,7 @@ cglobal intra_sad_x3_16x16, 3,5,6
     add r3d, -FENC_STRIDE
     jge .vloop
     punpckhqdq m5, m4, m4
-    movhlps xm2, xm3
+    MOVHL xm2, xm3
     paddw m4, m5 ; DC / V
     paddw xm3, xm2 ; H
     vextracti128 xm2, m4, 1
@@ -1642,7 +1642,7 @@ cglobal pixel_sad_16x%2_cache64_%1
     mov r4d, %2/2
     pxor xmm0, xmm0
     call r5
-    movhlps xmm1, xmm0
+    MOVHL xmm1, xmm0
     paddw xmm0, xmm1
     movd eax, xmm0
     RET
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 41b52df2..ea8b2cf3 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -290,6 +290,18 @@
     pminsw %1, %3
 %endmacro

+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+    punpckhqdq %1, %2
+%elif cpuflag(avx)
+    punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+    pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+    movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
 %macro HADDD 2 ; sum junk
 %if sizeof%1 == 32
 %define %2 xmm%2
@@ -298,13 +310,12 @@
     paddd %1, %2
 %endif
 %if mmsize >= 16
-%if cpuflag(xop) && sizeof%1 == 16
-    vphadddq %1, %1
-%endif
-    movhlps %2, %1
+    MOVHL %2, %1
     paddd %1, %2
 %endif
-%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(xop) && sizeof%1 == 16
+    vphadddq %1, %1
+%else
     PSHUFLW %2, %1, q0032
     paddd %1, %2
 %endif
@@ -315,11 +326,11 @@
 %macro HADDW 2 ; reg, tmp
 %if cpuflag(xop) && sizeof%1 == 16
     vphaddwq %1, %1
-    movhlps %2, %1
+    MOVHL %2, %1
     paddd %1, %2
 %else
-    pmaddwd %1, [pw_1]
-    HADDD %1, %2
+    pmaddwd  %1, [pw_1]
+    HADDD    %1, %2
 %endif
 %endmacro

@@ -337,7 +348,7 @@
 %macro HADDUW 2
 %if cpuflag(xop) && sizeof%1 == 16
     vphadduwq %1, %1
-    movhlps %2, %1
+    MOVHL %2, %1
     paddd %1, %2
 %else
     HADDUWD %1, %2
-- 
2.39.2
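
Background note (not part of the patch): movhlps executes in the floating-point
domain on most CPUs, so using it to fetch the high half of an XMM register in
otherwise integer SSE code can incur the 1-3 cycle bypass delay described in the
commit message, and its two-operand form also reads the old value of the
destination register, creating a false dependency. The following standalone
sketch contrasts the two approaches on a horizontal dword sum. It is
illustrative only: plain NASM without x264's x86inc/x86util macros, hypothetical
function names (hsum_u32_int, hsum_u32_float), and the x86-64 System V calling
convention assumed.

    ; hsum.asm - horizontal sum of the four dwords in xmm0, two ways.
    ; Build: nasm -f elf64 hsum.asm -o hsum.o
    ; C prototype for both entry points: uint32_t hsum(__m128i v);

    section .text

    ; Integer-domain version: pshufd stays in the int domain and writes xmm1
    ; without reading it, so there is no bypass delay between the shuffles and
    ; the paddd ops, and no false dependency on xmm1.
    global hsum_u32_int
    hsum_u32_int:
        pshufd  xmm1, xmm0, 0xee ; xmm1 = {d3,d2,d3,d2}: high half in low qword
        paddd   xmm0, xmm1       ; element 0 = d0+d2, element 1 = d1+d3
        pshufd  xmm1, xmm0, 0x55 ; broadcast element 1
        paddd   xmm0, xmm1       ; element 0 = d0+d1+d2+d3
        movd    eax, xmm0
        ret

    ; movhlps version: same result, but movhlps is a float-domain instruction
    ; sandwiched between integer paddd ops (potential bypass delay), and it
    ; only writes the low qword of xmm1, so it depends on whatever last wrote
    ; xmm1 (false dependency).
    global hsum_u32_float
    hsum_u32_float:
        movhlps xmm1, xmm0       ; float domain + dependency on old xmm1
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 0x55
        paddd   xmm0, xmm1
        movd    eax, xmm0
        ret

The MOVHL macro added in x86util.asm picks between these costs per CPU:
punpckhqdq when dst and src are the same register (or the 3-operand AVX form,
which has no false dependency), pshufd on SSE4-class CPUs where it is no longer
slow, and movhlps as the plain-SSE2 fallback.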