From: Jason Garrett-Glaser
Date: Thu, 28 Jul 2011 01:09:49 +0000 (-0700)
Subject: H.264: tweak some other x86 asm for Atom
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=a3bf7b864acae5921f5de53b45945770e93e6237;p=ffmpeg

H.264: tweak some other x86 asm for Atom
---

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index dd318a13990..9909fdab78d 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -456,12 +456,12 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqu (%1,%3), %%xmm1     \n\t"
         "movdqu (%1,%3,2), %%xmm2   \n\t"
         "movdqu (%1,%4), %%xmm3     \n\t"
+        "lea (%1,%3,4), %1          \n\t"
         "movdqa %%xmm0, (%2)        \n\t"
         "movdqa %%xmm1, (%2,%3)     \n\t"
         "movdqa %%xmm2, (%2,%3,2)   \n\t"
         "movdqa %%xmm3, (%2,%4)     \n\t"
         "subl $4, %0                \n\t"
-        "lea (%1,%3,4), %1          \n\t"
         "lea (%2,%3,4), %2          \n\t"
         "jnz 1b                     \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
@@ -478,6 +478,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqu (%1,%3), %%xmm1     \n\t"
         "movdqu (%1,%3,2), %%xmm2   \n\t"
         "movdqu (%1,%4), %%xmm3     \n\t"
+        "lea (%1,%3,4), %1          \n\t"
         "pavgb (%2), %%xmm0         \n\t"
         "pavgb (%2,%3), %%xmm1      \n\t"
         "pavgb (%2,%3,2), %%xmm2    \n\t"
@@ -487,7 +488,6 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
         "movdqa %%xmm2, (%2,%3,2)   \n\t"
         "movdqa %%xmm3, (%2,%4)     \n\t"
         "subl $4, %0                \n\t"
-        "lea (%1,%3,4), %1          \n\t"
         "lea (%2,%3,4), %2          \n\t"
         "jnz 1b                     \n\t"
         : "+g"(h), "+r" (pixels), "+r" (block)
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 0a37994fb9a..16cf2ec43ee 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -72,17 +72,17 @@ SECTION .text
 .next4rows
     movq         mm0, [r1 ]
     movq         mm1, [r1+r2]
+    add           r1, r4
     CHROMAMC_AVG mm0, [r0 ]
     CHROMAMC_AVG mm1, [r0+r2]
     movq      [r0 ], mm0
     movq   [r0+r2], mm1
     add           r0, r4
-    add           r1, r4
     movq         mm0, [r1 ]
     movq         mm1, [r1+r2]
+    add           r1, r4
     CHROMAMC_AVG mm0, [r0 ]
     CHROMAMC_AVG mm1, [r0+r2]
-    add           r1, r4
     movq      [r0 ], mm0
     movq   [r0+r2], mm1
     add           r0, r4
@@ -472,8 +472,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     mov          r6d, r4d
     shl          r4d, 8
     sub           r4, r6
-    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
     mov           r6, 8
+    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
     sub          r6d, r5d
     imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
     imul         r4d, r5d         ; y *(x*255+8) = y *x<<8 | y *(8-x)
@@ -481,24 +481,23 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movd          m7, r6d
     movd          m6, r4d
     movdqa        m5, [rnd_2d_%2]
+    movq          m0, [r1  ]
+    movq          m1, [r1+1]
     pshuflw       m7, m7, 0
     pshuflw       m6, m6, 0
+    punpcklbw     m0, m1
     movlhps       m7, m7
     movlhps       m6, m6
-    movq          m0, [r1  ]
-    movq          m1, [r1  +1]
-    punpcklbw     m0, m1
-    add           r1, r2

 .next2rows
-    movq          m1, [r1   ]
-    movq          m2, [r1  +1]
-    movq          m3, [r1+r2 ]
-    movq          m4, [r1+r2+1]
+    movq          m1, [r1+r2*1  ]
+    movq          m2, [r1+r2*1+1]
+    movq          m3, [r1+r2*2  ]
+    movq          m4, [r1+r2*2+1]
     lea           r1, [r1+r2*2]
     punpcklbw     m1, m2
-    punpcklbw     m3, m4
     movdqa        m2, m1
+    punpcklbw     m3, m4
     movdqa        m4, m3
     pmaddubsw     m0, m7
     pmaddubsw     m1, m6
@@ -508,8 +507,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     paddw         m2, m5
     paddw         m1, m0
     paddw         m3, m2
-    movdqa        m0, m4
     psrlw         m1, 6
+    movdqa        m0, m4
     psrlw         m3, 6
 %ifidn %1, avg
     movq          m2, [r0   ]
@@ -576,6 +575,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movq          m1, [r1+r2  ]
     movdqa        m2, m1
     movq          m3, [r1+r2*2]
+    lea           r1, [r1+r2*2]
     punpcklbw     m0, m1
     punpcklbw     m2, m3
     pmaddubsw     m0, m7
@@ -594,7 +594,6 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movhps   [r0+r2], m0
     sub          r3d, 2
     lea           r0, [r0+r2*2]
-    lea           r1, [r1+r2*2]
     jg .next2yrows
     REP_RET
 %endmacro
@@ -607,8 +606,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     mov           r6, r4
     shl          r4d, 8
     sub          r4d, r6d
-    add          r4d, 8           ; x*255+8
     mov           r6, 8
+    add          r4d, 8           ; x*255+8
     sub          r6d, r5d
     imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
     imul         r4d, r5d         ; y *(x*255+8) = y *x<<8 | y *(8-x)
@@ -616,17 +615,16 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     movd          m7, r6d
     movd          m6, r4d
     movq          m5, [pw_32]
+    movd          m0, [r1  ]
     pshufw        m7, m7, 0
+    punpcklbw     m0, [r1+1]
     pshufw        m6, m6, 0
-    movd          m0, [r1  ]
-    punpcklbw     m0, [r1  +1]
-    add           r1, r2

 .next2rows
-    movd          m1, [r1  ]
-    movd          m3, [r1+r2 ]
-    punpcklbw     m1, [r1  +1]
-    punpcklbw     m3, [r1+r2+1]
+    movd          m1, [r1+r2*1  ]
+    movd          m3, [r1+r2*2  ]
+    punpcklbw     m1, [r1+r2*1+1]
+    punpcklbw     m3, [r1+r2*2+1]
     lea           r1, [r1+r2*2]
     movq          m2, m1
     movq          m4, m3
@@ -638,8 +636,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     paddw         m2, m5
     paddw         m1, m0
     paddw         m3, m2
-    movq          m0, m4
     psrlw         m1, 6
+    movq          m0, m4
     psrlw         m3, 6
     packuswb      m1, m1
     packuswb      m3, m3
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 9831ca2cd6e..92f91acade6 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -240,17 +240,17 @@ cextern pb_A1
 ; out: m1=p0' m2=q0'
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
-    pxor    m5, m1, m2      ; p0^q0
-    pand    m5, [pb_1]      ; (p0^q0)&1
     pcmpeqb m4, m4
+    pxor    m5, m1, m2      ; p0^q0
     pxor    m3, m4
+    pand    m5, [pb_1]      ; (p0^q0)&1
     pavgb   m3, m0          ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_3]      ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pxor    m4, m1
+    pavgb   m3, [pb_3]      ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pavgb   m4, m2          ; (q0 - p0 + 256)>>1
     pavgb   m3, m5
-    paddusb m3, m4          ; d+128+33
     mova    m6, [pb_A1]
+    paddusb m3, m4          ; d+128+33
     psubusb m6, m3
     psubusb m3, [pb_A1]
     pminub  m6, m7
@@ -411,16 +411,16 @@ cglobal deblock_%2_luma_8_%1, 5,5
     LOAD_MASK r2, r3

     mov     r3, r4mp
+    pcmpeqb m3, m3
     movd    m4, [r3]           ; tc0
     punpcklbw m4, m4
     punpcklbw m4, m4           ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
     mova   [esp+%3], m4        ; tc
-    pcmpeqb m3, m3
     pcmpgtb m4, m3
+    mova    m3, [r4]           ; p2
     pand    m4, m7
     mova   [esp], m4           ; mask
-    mova    m3, [r4]           ; p2
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand    m6, m4
     pand    m4, [esp+%3]       ; tc
@@ -430,11 +430,10 @@ cglobal deblock_%2_luma_8_%1, 5,5

     mova    m4, [r0+2*r1]      ; q2
     DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
-    mova    m5, [esp]          ; mask
-    pand    m6, m5
+    pand    m6, [esp]          ; mask
     mova    m5, [esp+%3]       ; tc
-    pand    m5, m6
     psubb   m7, m6
+    pand    m5, m6
     mova    m3, [r0+r1]
     LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

@@ -482,10 +481,10 @@ cglobal deblock_h_luma_8_%1, 0,5
     ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     mov    r0, r0mp
     sub    r0, 2
-    lea    r1, [r0+r4]

     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
+    lea    r1, [r0+r4]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 4788da98e0c..37c2c904764 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -82,10 +82,10 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
     RET

 %macro IDCT8_1D 2
-    mova        m4, m5
     mova        m0, m1
-    psraw       m4, 1
     psraw       m1, 1
+    mova        m4, m5
+    psraw       m4, 1
     paddw       m4, m5
     paddw       m1, m0
     paddw       m4, m7
@@ -95,16 +95,16 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0

     psubw       m0, m3
     psubw       m5, m3
+    psraw       m3, 1
     paddw       m0, m7
     psubw       m5, m7
-    psraw       m3, 1
     psraw       m7, 1
     psubw       m0, m3
     psubw       m5, m7

-    mova        m3, m4
     mova        m7, m1
     psraw       m1, 2
+    mova        m3, m4
     psraw       m3, 2
     paddw       m3, m0
     psraw       m0, 2
@@ -113,12 +113,12 @@ cglobal h264_idct_add_8_mmx, 3, 3, 0
     psubw       m0, m4
     psubw       m7, m5

-    mova        m4, m2
     mova        m5, m6
-    psraw       m4, 1
     psraw       m6, 1
-    psubw       m4, m5
+    mova        m4, m2
+    psraw       m4, 1
     paddw       m6, m2
+    psubw       m4, m5

     mova        m2, %1
     mova        m5, %2
@@ -337,7 +337,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0
     test        r6, r6
     jz .skipblock
     mov         r6d, dword [r1+r5*4]
-    lea         r6, [r0+r6]
+    add         r6, r0
     add         word [r2], 32
     IDCT8_ADD_MMX_START r2  , rsp
     IDCT8_ADD_MMX_START r2+8, rsp+64
@@ -391,7 +391,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
     REP_RET
 .no_dc
     mov         r6d, dword [r1+r5*4]
-    lea         r6, [r0+r6]
+    add         r6, r0
     IDCT4_ADD   r6, r2, r3
 .skipblock
     inc         r5
@@ -414,7 +414,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
     test        r6, r6
     jz .skipblock
     mov         r6d, dword [r1+r5*4]
-    lea         r6, [r0+r6]
+    add         r6, r0
     IDCT4_ADD   r6, r2, r3
 .skipblock
     inc         r5
@@ -456,7 +456,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
 %define dst_regd r1d
 %endif
     mov         dst_regd, dword [r1+r5*4]
-    lea         dst_reg, [r0+dst_reg]
+    add         dst_reg, r0
     DC_ADD_MMX2_OP movh, dst_reg, r3, r6
 %ifndef ARCH_X86_64
     mov         r1, r1m
@@ -513,7 +513,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
     RET
 .no_dc
     mov         r6d, dword [r1+r5*4]
-    lea         r6, [r0+r6]
+    add         r6, r0
     add         word [r2], 32
     IDCT8_ADD_MMX_START r2  , rsp
     IDCT8_ADD_MMX_START r2+8, rsp+64
@@ -558,7 +558,7 @@ INIT_MMX
 %define dst_regd r1d
 %endif
     mov         dst_regd, dword [r1+r5*4]
-    lea         dst_reg, [r0+dst_reg]
+    add         dst_reg, r0
     DC_ADD_MMX2_OP mova, dst_reg, r3, r6
     lea         dst_reg, [dst_reg+r3*4]
     DC_ADD_MMX2_OP mova, dst_reg, r3, r6
@@ -573,7 +573,7 @@ INIT_MMX
 .no_dc
 INIT_XMM
     mov         dst_regd, dword [r1+r5*4]
-    lea         dst_reg, [r0+dst_reg]
+    add         dst_reg, r0
     IDCT8_ADD_SSE dst_reg, r2, r3, r6
 %ifndef ARCH_X86_64
     mov         r1, r1m
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index 45196625feb..5f391073792 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -497,10 +497,10 @@
 %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
     movh       %3, [%7]
     movh       %4, [%7+%8]
-    punpcklbw  %3, %5
-    punpcklbw  %4, %5
     psraw      %1, %6
     psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
     paddw      %3, %1
     paddw      %4, %2
     packuswb   %3, %5
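
--

Note on the pattern (not part of the commit): Atom (Bonnell) is a dual-issue,
strictly in-order core, so a pointer update such as lea sitting at the bottom
of a loop delays the next iteration's loads, and back-to-back dependent pairs
(load -> use, mova -> psraw) leave the second pipe idle. Each hunk above moves
the pointer update to just after the last instruction that reads the old
pointer, or interleaves an independent instruction between a producer and its
consumer. lea and mov do not write EFLAGS, so the subl/jnz and sub/jg loop
tails still work with a lea between them. The lea -> add substitutions in
h264_idct.asm likely sidestep Bonnell's early address-generation stage, where
lea needs its inputs several cycles earlier than an ALU add does.

Below is a minimal, self-contained sketch of the same hoisted-lea reordering
as applied to put_pixels16_sse2 above. The file and function names
(hoisted_lea_demo.c, copy16_rows) are invented for illustration and are not
FFmpeg API; build with: gcc -O2 -o hoisted_lea_demo hoisted_lea_demo.c (x86-64).

/* hoisted_lea_demo.c - illustrative only */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy h rows (h % 4 == 0) of 16 bytes from src to a 16-byte-aligned dst
 * whose stride is a multiple of 16.  As in the patch, the lea advancing
 * src is hoisted to right after the last movdqu that reads through src
 * instead of sitting at the bottom of the loop, so on an in-order core
 * the new address is ready before the next iteration's loads issue.
 * lea leaves the flags from subl intact, so jnz still tests the counter. */
static void copy16_rows(uint8_t *dst, const uint8_t *src,
                        intptr_t stride, int h)
{
    __asm__ volatile (
        "1:                               \n\t"
        "movdqu (%1), %%xmm0              \n\t"
        "movdqu (%1,%3), %%xmm1           \n\t"
        "movdqu (%1,%3,2), %%xmm2         \n\t"
        "movdqu (%1,%4), %%xmm3           \n\t"
        "lea (%1,%3,4), %1                \n\t" /* src += 4*stride, hoisted */
        "movdqa %%xmm0, (%2)              \n\t"
        "movdqa %%xmm1, (%2,%3)           \n\t"
        "movdqa %%xmm2, (%2,%3,2)         \n\t"
        "movdqa %%xmm3, (%2,%4)           \n\t"
        "subl $4, %0                      \n\t"
        "lea (%2,%3,4), %2                \n\t" /* flag-neutral, stays here */
        "jnz 1b                           \n\t"
        : "+g"(h), "+r"(src), "+r"(dst)
        : "r"(stride), "r"(stride * 3)
        : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
}

int main(void)
{
    _Alignas(16) uint8_t dst[16 * 8];
    uint8_t src[16 * 8];
    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)(i * 7);
    copy16_rows(dst, src, 16, 8);
    puts(memcmp(dst, src, sizeof(dst)) ? "mismatch" : "ok");
    return 0;
}

On an out-of-order core the two orderings perform identically; the hoisting
only matters on in-order parts such as Atom, which is all this commit targets.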