X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a.asm;h=608efcd97b40d70c9223ab23719dcd95baf5de07;hb=c82c7374938f4342971adf8b2495c3a1bbe621c4;hp=9f368f4b4bebb6ef590544bb99a1f152088f2c18;hpb=295f83af2afa93073d7810ab96b1d8d889a53ed2;p=x264

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 9f368f4b..608efcd9 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* mc-a.asm: x86 motion compensation
 ;*****************************************************************************
-;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2003-2016 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -34,7 +34,6 @@
 
 SECTION_RODATA 32
 
-pw_512:  times 16 dw 512
 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
 ch_shuf_adj: times 8 db 0
              times 8 db 2
@@ -50,6 +49,7 @@ cextern pw_4
 cextern pw_8
 cextern pw_32
 cextern pw_64
+cextern pw_512
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
@@ -1029,59 +1029,48 @@ cglobal pixel_avg2_w20_mmx2, 6,7
     jg     .height_loop
     RET
 
+INIT_XMM
 cglobal pixel_avg2_w16_sse2, 6,7
     sub    r4, r2
     lea    r6, [r4+r3]
 .height_loop:
-    movdqu xmm0, [r2]
-    movdqu xmm2, [r2+r3]
-    movdqu xmm1, [r2+r4]
-    movdqu xmm3, [r2+r6]
+    movu   m0, [r2]
+    movu   m2, [r2+r3]
+    movu   m1, [r2+r4]
+    movu   m3, [r2+r6]
     lea    r2, [r2+r3*2]
-    pavgb  xmm0, xmm1
-    pavgb  xmm2, xmm3
-    movdqa [r0], xmm0
-    movdqa [r0+r1], xmm2
+    pavgb  m0, m1
+    pavgb  m2, m3
+    mova   [r0], m0
+    mova   [r0+r1], m2
     lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
+    sub   r5d, 2
+    jg    .height_loop
     RET
 
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
     sub    r2, r4
     lea    r6, [r2+r3]
 .height_loop:
-    movdqu xmm0, [r4]
-    movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
-    movd   mm4, [r4+16]
-    movd   mm5, [r4+r3+16]
-    pavgb  xmm0, [r4+r2]
-    pavgb  xmm2, [r4+r6]
-%else
-    movdqu xmm1, [r4+r2]
-    movdqu xmm3, [r4+r6]
-    movd   mm4, [r4+16]
-    movd   mm5, [r4+r3+16]
-    pavgb  xmm0, xmm1
-    pavgb  xmm2, xmm3
-%endif
-    pavgb  mm4, [r4+r2+16]
-    pavgb  mm5, [r4+r6+16]
+    movu   m0, [r4]
+    movu   m2, [r4+r3]
+    movu   m1, [r4+r2]
+    movu   m3, [r4+r6]
+    movd   mm4, [r4+16]
+    movd   mm5, [r4+r3+16]
+    pavgb  m0, m1
+    pavgb  m2, m3
+    pavgb  mm4, [r4+r2+16]
+    pavgb  mm5, [r4+r6+16]
     lea    r4, [r4+r3*2]
-    movdqa [r0], xmm0
-    movd   [r0+16], mm4
-    movdqa [r0+r1], xmm2
-    movd   [r0+r1+16], mm5
+    mova   [r0], m0
+    mova   [r0+r1], m2
+    movd   [r0+16], mm4
+    movd   [r0+r1+16], mm5
    lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
+    sub   r5d, 2
+    jg    .height_loop
     RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
 
 INIT_YMM avx2
 cglobal pixel_avg2_w20, 6,7
@@ -1524,7 +1513,7 @@ cglobal prefetch_ref, 3,3
 %endmacro
 %else ; !HIGH_BIT_DEPTH
 %macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
     punpcklwd  %1, %3
 %else
     movh       %2, %3
@@ -1794,10 +1783,9 @@ ALIGN 4
 %if ARCH_X86_64 ; too many regs for x86_32
     RESET_MM_PERMUTATION
 %if WIN64
-%if xmm_regs_used > 6
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
-    %assign xmm_regs_used 6
-%endif
+    %assign stack_offset stack_offset - stack_size_padded
+    %assign stack_size_padded 0
+    %assign xmm_regs_used 0
 %endif
 .mc1dy:
     and       t2d, 7
@@ -1924,11 +1912,7 @@ ALIGN 4
 
 %macro MC_CHROMA_SSSE3 0
 cglobal mc_chroma
-%if cpuflag(avx2)
-    MC_CHROMA_START 9
-%else
-    MC_CHROMA_START 10
-%endif
+    MC_CHROMA_START 10-cpuflag(avx2)
     and        r5d, 7
     and        t2d, 7
     mov        t0d, r5d
@@ -2032,7 +2016,7 @@ cglobal mc_chroma
     movhps [r1+r2], xm2
 %else
     movu       m0, [r3]
-    pshufb     m0, xm5
+    pshufb     m0, m5
 .loop4:
     movu       m1, [r3+r4]
     pshufb     m1, m5
@@ -2049,13 +2033,19 @@ cglobal mc_chroma
     pmulhrsw   m3, shiftround
     mova       m0, m4
     packuswb   m1, m3
+    movd     [r0], m1
+%if cpuflag(sse4)
+    pextrd   [r1], m1, 1
+    pextrd [r0+r2], m1, 2
+    pextrd [r1+r2], m1, 3
+%else
     movhlps    m3, m1
-    movd     [r0], xm1
     movd  [r0+r2], m3
     psrldq     m1, 4
     psrldq     m3, 4
     movd     [r1], m1
     movd  [r1+r2], m3
+%endif
     lea        r3, [r3+r4*2]
     lea        r0, [r0+r2*2]
     lea        r1, [r1+r2*2]
@@ -2131,8 +2121,6 @@ MC_CHROMA
 %else ; !HIGH_BIT_DEPTH
 INIT_MMX mmx2
 MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
 INIT_XMM sse2
 MC_CHROMA
 INIT_XMM ssse3