X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fpredict-a.asm;h=16c29eee88bffb34b168552e245ebb49accef2f7;hb=64f4e24909924fceeea6e154d71b7dfbf586c7ea;hp=2337e8930b23ab549ab6200f776ba80d83bae966;hpb=97ad171ae33c51f48e6214abdf7c978e4dd5d2d1;p=x264 diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 2337e893..16c29eee 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2013 x264 project +;* Copyright (C) 2005-2016 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz @@ -31,7 +31,6 @@ SECTION_RODATA 32 -pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 16 dw -3 pw_m7: times 16 dw -7 @@ -56,6 +55,7 @@ cextern pw_8 cextern pw_16 cextern pw_00ff cextern pw_pixel_max +cextern pw_0to15 %macro STORE8 1 mova [r0+0*FDEC_STRIDEB], %1 @@ -1457,7 +1457,7 @@ cglobal predict_8x8_vr, 2,2 movhps [r0-4*FDEC_STRIDE], m3 movhps [r0-3*FDEC_STRIDE], m0 %if cpuflag(ssse3) - movhlps m3, m3 + punpckhqdq m3, m3 pshufb m0, [shuf_vr] palignr m3, m0, 13 %else @@ -2094,8 +2094,7 @@ PREDICT_16x16_H ;----------------------------------------------------------------------------- ; void predict_16x16_dc_core( pixel *src, int i_dc_left ) ;----------------------------------------------------------------------------- - -%macro PRED16x16_DC 2 +%macro PRED16x16_DC_MMX 2 %if HIGH_BIT_DEPTH mova m0, [r0 - FDEC_STRIDEB+ 0] paddw m0, [r0 - FDEC_STRIDEB+ 8] @@ -2124,15 +2123,15 @@ INIT_MMX mmx2 cglobal predict_16x16_dc_core, 1,2 %if ARCH_X86_64 movd m6, r1d - PRED16x16_DC m6, 5 + PRED16x16_DC_MMX m6, 5 %else - PRED16x16_DC r1m, 5 + PRED16x16_DC_MMX r1m, 5 %endif RET INIT_MMX mmx2 cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC [pw_8], 4 + PRED16x16_DC_MMX [pw_8], 4 RET INIT_MMX mmx2 @@ -2151,23 +2150,23 @@ cglobal predict_16x16_dc_left_core, 1,1 RET %endif -;----------------------------------------------------------------------------- -; void predict_16x16_dc_core( pixel *src, int i_dc_left ) -;----------------------------------------------------------------------------- - -%macro PRED16x16_DC_SSE2 2 +%macro PRED16x16_DC 2 %if HIGH_BIT_DEPTH - mova m0, [r0 - FDEC_STRIDEB+ 0] - paddw m0, [r0 - FDEC_STRIDEB+16] - HADDW m0, m2 - paddw m0, %1 - psrlw m0, %2 - SPLATW m0, m0 + mova xm0, [r0 - FDEC_STRIDEB+ 0] + paddw xm0, [r0 - FDEC_STRIDEB+16] + HADDW xm0, xm2 + paddw xm0, %1 + psrlw xm0, %2 + SPLATW m0, xm0 +%if mmsize == 32 + STORE16 m0 +%else STORE16 m0, m0 +%endif %else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] - movhlps m1, m0 + MOVHL m1, m0 paddw m0, m1 paddusw m0, %1 psrlw m0, %2 ; dc @@ -2177,28 +2176,36 @@ cglobal predict_16x16_dc_left_core, 1,1 %endif %endmacro -INIT_XMM sse2 +%macro PREDICT_16x16_DC_CORE 0 cglobal predict_16x16_dc_core, 2,2,4 - movd m3, r1m - PRED16x16_DC_SSE2 m3, 5 + movd xm3, r1m + PRED16x16_DC xm3, 5 RET cglobal predict_16x16_dc_top, 1,2 - PRED16x16_DC_SSE2 [pw_8], 4 + PRED16x16_DC [pw_8], 4 RET -INIT_XMM sse2 -%if HIGH_BIT_DEPTH cglobal predict_16x16_dc_left_core, 1,2 - movd m0, r1m - SPLATW m0, m0 + movd xm0, r1m + SPLATW m0, xm0 +%if HIGH_BIT_DEPTH && mmsize == 16 STORE16 m0, m0 - RET -%else ; !HIGH_BIT_DEPTH -cglobal predict_16x16_dc_left_core, 1,1 - movd m0, r1m - SPLATW m0, m0 +%else +%if HIGH_BIT_DEPTH == 0 packuswb m0, m0 +%endif STORE16 m0 +%endif RET +%endmacro + +INIT_XMM sse2 +PREDICT_16x16_DC_CORE +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +PREDICT_16x16_DC_CORE +%else +INIT_XMM avx2 +PREDICT_16x16_DC_CORE %endif