X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fsad16-a.asm;h=6bb94eee05472543e809b119bd5b8a3412b4949c;hb=5c3d473a966e4b013759097fb98cd4a9cb5a34f5;hp=a3a68928dde7b8a7fbdffb847768425cab7c55ee;hpb=0e69048d4f9664f1293c5eed0604522c67adaff5;p=x264

diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index a3a68928..6bb94eee 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* sad16-a.asm: x86 high depth sad functions
 ;*****************************************************************************
-;* Copyright (C) 2010-2013 x264 project
+;* Copyright (C) 2010-2015 x264 project
 ;*
 ;* Authors: Oskar Arvidsson
 ;*          Henrik Gramner
@@ -205,9 +205,6 @@ SAD 8, 8
 INIT_YMM avx2
 SAD 16, 16
 SAD 16, 8
-INIT_YMM avx2, aligned
-SAD 16, 16
-SAD 16, 8
 
 ;=============================================================================
 ; SAD x3/x4
@@ -258,14 +255,14 @@ SAD 16, 8
     HADDW m2, m5
 %endif
 %if UNIX64
-    movd [r5+0], m0
-    movd [r5+4], m1
-    movd [r5+8], m2
+    movd [r5+0], xm0
+    movd [r5+4], xm1
+    movd [r5+8], xm2
 %else
     mov r0, r5mp
-    movd [r0+0], m0
-    movd [r0+4], m1
-    movd [r0+8], m2
+    movd [r0+0], xm0
+    movd [r0+4], xm1
+    movd [r0+8], xm2
 %endif
     RET
 %endmacro
@@ -354,10 +351,10 @@ SAD 16, 8
     HADDW m3, m7
 %endif
     mov r0, r6mp
-    movd [r0+ 0], m0
-    movd [r0+ 4], m1
-    movd [r0+ 8], m2
-    movd [r0+12], m3
+    movd [r0+ 0], xm0
+    movd [r0+ 4], xm1
+    movd [r0+ 8], xm2
+    movd [r0+12], xm3
     RET
 %endmacro
@@ -453,7 +450,7 @@ cglobal pixel_vsad, 3,3
     RET
 
 ;-----------------------------------------------------------------------------
-; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
+; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
 ;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
@@ -497,29 +494,51 @@ SAD_X 3, 4, 4
 SAD_X 4, 4, 8
 SAD_X 4, 4, 4
 INIT_XMM ssse3
-%define XMM_REGS 9
+%define XMM_REGS 7
 SAD_X 3, 16, 16
 SAD_X 3, 16, 8
 SAD_X 3, 8, 16
 SAD_X 3, 8, 8
 SAD_X 3, 8, 4
+%define XMM_REGS 9
 SAD_X 4, 16, 16
 SAD_X 4, 16, 8
 SAD_X 4, 8, 16
 SAD_X 4, 8, 8
 SAD_X 4, 8, 4
 INIT_XMM sse2
+%define XMM_REGS 8
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
 %define XMM_REGS 11
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_XMM xop
+%define XMM_REGS 7
 SAD_X 3, 16, 16
 SAD_X 3, 16, 8
 SAD_X 3, 8, 16
 SAD_X 3, 8, 8
 SAD_X 3, 8, 4
+%define XMM_REGS 9
 SAD_X 4, 16, 16
 SAD_X 4, 16, 8
 SAD_X 4, 8, 16
 SAD_X 4, 8, 8
 SAD_X 4, 8, 4
+INIT_YMM avx2
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
 
 ;-----------------------------------------------------------------------------
 ; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
@@ -527,52 +546,62 @@ SAD_X 4, 8, 4
 %macro INTRA_SAD_X3_4x4 0
 cglobal intra_sad_x3_4x4, 3,3,7
+%if cpuflag(ssse3)
+    movddup m0, [r1-1*FDEC_STRIDEB]
+%else
     movq m0, [r1-1*FDEC_STRIDEB]
+    punpcklqdq m0, m0
+%endif
     movq m1, [r0+0*FENC_STRIDEB]
     movq m2, [r0+2*FENC_STRIDEB]
     pshuflw m6, m0, q1032
     paddw m6, m0
     pshuflw m5, m6, q2301
     paddw m6, m5
-    punpcklqdq m6, m6 ;A+B+C+D 8 times
-    punpcklqdq m0, m0
+    punpcklqdq m6, m6 ; A+B+C+D 8 times
     movhps m1, [r0+1*FENC_STRIDEB]
     movhps m2, [r0+3*FENC_STRIDEB]
     psubw m3, m1, m0
     psubw m0, m2
-    ABSW m3, m3, m5
-    ABSW m0, m0, m5
+    ABSW2 m3, m0, m3, m0, m4, m5
     paddw m0, m3
-    HADDW m0, m5
-    movd [r2], m0 ;V prediction cost
     movd m3, [r1+0*FDEC_STRIDEB-4]
-    movhps m3, [r1+1*FDEC_STRIDEB-8]
     movd m4, [r1+2*FDEC_STRIDEB-4]
+    movhps m3, [r1+1*FDEC_STRIDEB-8]
     movhps m4, [r1+3*FDEC_STRIDEB-8]
     pshufhw m3, m3, q3333
     pshufhw m4, m4, q3333
     pshuflw m3, m3, q1111 ; FF FF EE EE
     pshuflw m4, m4, q1111 ; HH HH GG GG
     paddw m5, m3, m4
-    pshufd m0, m5, q1032
+    paddw m6, [pw_4]
+    paddw m6, m5
+    pshufd m5, m5, q1032
     paddw m5, m6
-    paddw m5, m0
-    paddw m5, [pw_4]
     psrlw m5, 3
     psubw m6, m5, m2
     psubw m5, m1
     psubw m1, m3
     psubw m2, m4
-    ABSW m5, m5, m0
-    ABSW m6, m6, m0
-    ABSW m1, m1, m0
-    ABSW m2, m2, m0
+    ABSW2 m5, m6, m5, m6, m3, m4
+    ABSW2 m1, m2, m1, m2, m3, m4
     paddw m5, m6
     paddw m1, m2
-    HADDW m5, m0
-    HADDW m1, m2
-    movd [r2+8], m5 ;DC prediction cost
-    movd [r2+4], m1 ;H prediction cost
+%if cpuflag(ssse3)
+    phaddw m0, m1
+    movhlps m3, m5
+    paddw m5, m3
+    phaddw m0, m5
+    pmaddwd m0, [pw_1]
+    mova [r2], m0
+%else
+    HADDW m0, m3
+    HADDW m1, m3
+    HADDW m5, m3
+    movd [r2], m0 ; V prediction cost
+    movd [r2+4], m1 ; H prediction cost
+    movd [r2+8], m5 ; DC prediction cost
+%endif
     RET
 %endmacro
@@ -633,12 +662,21 @@ cglobal intra_sad_x3_8x8, 3,3,8
     INTRA_SAD_HVDC_ITER 5, q2222
     INTRA_SAD_HVDC_ITER 6, q1111
     INTRA_SAD_HVDC_ITER 7, q0000
+%if cpuflag(ssse3)
+    phaddw m2, m3 ; 2 2 2 2 3 3 3 3
+    movhlps m3, m1
+    paddw m1, m3 ; 1 1 1 1 _ _ _ _
+    phaddw m2, m1 ; 2 2 3 3 1 1 _ _
+    pmaddwd m2, [pw_1] ; 2 3 1 _
+    mova [r2], m2
+%else
     HADDW m2, m4
     HADDW m3, m4
     HADDW m1, m4
     movd [r2+0], m2
     movd [r2+4], m3
     movd [r2+8], m1
+%endif
     RET
 %endmacro
@@ -646,3 +684,44 @@ INIT_XMM sse2
 INTRA_SAD_X3_8x8
 INIT_XMM ssse3
 INTRA_SAD_X3_8x8
+
+%macro INTRA_SAD_HVDC_ITER_YMM 2
+    mova xm4, [r0+(%1-4)*FENC_STRIDEB]
+    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
+    pshufd m5, m7, %2
+    psubw m5, m4
+    pabsw m5, m5
+    ACCUM paddw, 2, 5, %1 ; H
+    psubw m5, m4, m6
+    psubw m4, m0
+    pabsw m5, m5
+    pabsw m4, m4
+    ACCUM paddw, 1, 5, %1 ; V
+    ACCUM paddw, 3, 4, %1 ; DC
+%endmacro
+
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8, 3,3,8
+    add r0, 4*FENC_STRIDEB
+    movu xm0, [r1+7*SIZEOF_PIXEL]
+    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
+    vpermq m7, m0, q0011
+    paddw xm0, xm6
+    paddw xm0, [pw_1] ; equal to +8 after HADDW
+    HADDW xm0, xm4
+    psrld xm0, 4
+    vpbroadcastw m0, xm0
+    punpcklwd m7, m7
+    INTRA_SAD_HVDC_ITER_YMM 0, q3333
+    INTRA_SAD_HVDC_ITER_YMM 1, q2222
+    INTRA_SAD_HVDC_ITER_YMM 2, q1111
+    INTRA_SAD_HVDC_ITER_YMM 3, q0000
+    phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
+    punpckhqdq m2, m3, m3
+    paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
+    phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
+    vextracti128 xm2, m1, 1
+    paddw xm1, xm2 ; 1 1 2 2 3 3 _ _
+    pmaddwd xm1, [pw_1] ; 1 2 3 _
+    mova [r2], xm1
+    RET
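For reference, a minimal scalar sketch of what the pixel_sad_xN_WxH entry points documented above compute: the sum of absolute differences between one encode block and each of three candidate blocks, written to scores[0..2] in the same order the assembly's final movd stores use. The fixed encode-block stride FENC_STRIDE_REF and the sample-unit pointer strides are assumptions made for this illustration only; this is not the project's own C reference code.

    #include <stdint.h>

    /* Assumed encode-plane stride for this sketch only; the real constant
     * (FENC_STRIDE/FENC_STRIDEB) is defined elsewhere in the x264 tree. */
    #define FENC_STRIDE_REF 16

    static int sad_wxh( const uint16_t *fenc, const uint16_t *pix,
                        intptr_t i_stride, int w, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++ )
            for( int x = 0; x < w; x++ )
            {
                int d = fenc[y*FENC_STRIDE_REF + x] - pix[y*i_stride + x];
                sum += d < 0 ? -d : d; /* absolute difference of one sample */
            }
        return sum;
    }

    /* Three candidates scored against the same source block, matching the
     * int scores[3] layout the assembly fills in its store epilogues. */
    static void sad_x3_wxh( const uint16_t *fenc, const uint16_t *pix0,
                            const uint16_t *pix1, const uint16_t *pix2,
                            intptr_t i_stride, int scores[3], int w, int h )
    {
        scores[0] = sad_wxh( fenc, pix0, i_stride, w, h );
        scores[1] = sad_wxh( fenc, pix1, i_stride, w, h );
        scores[2] = sad_wxh( fenc, pix2, i_stride, w, h );
    }

With w and h bound to the block sizes instantiated above (16x16 down to 4x4), the three stores mirror the scores[3] writes in the SAD_X epilogues.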