;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
-;* Copyright (C) 2010 x264 project
+;* Copyright (C) 2010-2016 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
+;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
SECTION .text
cextern pw_1
+cextern pw_4
+cextern pw_8
;=============================================================================
; SAD MMX
psubw m2, [r2+ 8]
psubw m3, [r2+16]
psubw m4, [r2+24]
- ABS2 m1, m2, m5, m6
- ABS2 m3, m4, m7, m5
+ ABSW2 m1, m2, m1, m2, m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m5
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m1, m2
psubw m2, [r2+8]
psubw m3, [r2+2*r3+0]
psubw m4, [r2+2*r3+8]
- ABS2 m1, m2, m5, m6
- ABS2 m3, m4, m7, m5
+ ABSW2 m1, m2, m1, m2, m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m5
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m1, m2
movu m2, [r0+2*r1]
psubw m1, [r2]
psubw m2, [r2+2*r3]
- ABS2 m1, m2, m3, m4
+ ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
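+;
+; Rough C reference for what these kernels compute. This is not the project's
+; actual C code, just the conventional SAD definition for 16-bit pixels; N, M
+; and the parameter names are placeholders, and the strides appear to be in
+; pixels since the asm scales them by sizeof(uint16_t):
+;
+;   int sad_NxM( uint16_t *pix1, intptr_t stride1, uint16_t *pix2, intptr_t stride2 )
+;   {
+;       int sum = 0;
+;       for( int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2 )
+;           for( int x = 0; x < N; x++ )
+;               sum += abs( pix1[x] - pix2[x] );
+;       return sum;
+;   }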
-%macro SAD_MMX 4
-cglobal pixel_sad_%1x%2_%4, 4,4
+%macro SAD_MMX 3
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
pxor m0, m0
-%rep %2/%3
+%if %2 == 4
SAD_INC_%3x%1P_MMX
-%endrep
+ SAD_INC_%3x%1P_MMX
+%else
+ mov r4d, %2/%3
+.loop:
+ SAD_INC_%3x%1P_MMX
+ dec r4d
+ jg .loop
+%endif
%if %1*%2 == 256
HADDUW m0, m1
%else
RET
%endmacro
-INIT_MMX
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_MMX 16, 16, 1, mmxext
-SAD_MMX 16, 8, 1, mmxext
-SAD_MMX 8, 16, 2, mmxext
-SAD_MMX 8, 8, 2, mmxext
-SAD_MMX 8, 4, 2, mmxext
-SAD_MMX 4, 8, 2, mmxext
-SAD_MMX 4, 4, 2, mmxext
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_MMX 4, 8, 2, ssse3
-SAD_MMX 4, 4, 2, ssse3
-%undef ABS1
-%undef ABS2
+INIT_MMX mmx2
+SAD_MMX 16, 16, 1
+SAD_MMX 16, 8, 1
+SAD_MMX 8, 16, 2
+SAD_MMX 8, 8, 2
+SAD_MMX 8, 4, 2
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
+INIT_MMX ssse3
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
;=============================================================================
; SAD XMM
;=============================================================================
-%macro SAD_INC_2x16P_XMM 0
+%macro SAD_INC_2ROW 1
+%if 2*%1 > mmsize
movu m1, [r2+ 0]
movu m2, [r2+16]
movu m3, [r2+2*r3+ 0]
psubw m2, [r0+16]
psubw m3, [r0+2*r1+ 0]
psubw m4, [r0+2*r1+16]
- ABS2 m1, m2, m5, m6
+ ABSW2 m1, m2, m1, m2, m5, m6
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- ABS2 m3, m4, m7, m5
+ ABSW2 m3, m4, m3, m4, m7, m5
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
-%endmacro
-
-%macro SAD_INC_2x8P_XMM 0
+%else
movu m1, [r2]
movu m2, [r2+2*r3]
psubw m1, [r0]
psubw m2, [r0+2*r1]
- ABS2 m1, m2, m3, m4
+ ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m0, m1
paddw m0, m2
+%endif
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
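+;
+; Note: SAD_INC_2ROW above handles two source rows per invocation; when a row
+; (2*%1 bytes) is wider than one vector register it splits each row across two
+; loads, otherwise one load per row suffices.  The SAD macro below fully
+; unrolls the 4-row sizes and loops for everything else, then reduces the
+; per-lane word sums with HADDW.  The "aligned" instantiations further down
+; appear to assume an aligned second pointer, taking over the role of the old
+; movdqu->movdqa redefinition.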
-%macro SAD_XMM 3
-cglobal pixel_sad_%1x%2_%3, 4,4,8
+%macro SAD 2
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
pxor m0, m0
-%rep %2/2
- SAD_INC_2x%1P_XMM
-%endrep
+%if %2 == 4
+ SAD_INC_2ROW %1
+ SAD_INC_2ROW %1
+%else
+ mov r4d, %2/2
+.loop:
+ SAD_INC_2ROW %1
+ dec r4d
+ jg .loop
+%endif
HADDW m0, m1
- movd eax, m0
+ movd eax, xm0
RET
%endmacro
-INIT_XMM
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_XMM 16, 16, sse2
-SAD_XMM 16, 8, sse2
-SAD_XMM 8, 16, sse2
-SAD_XMM 8, 8, sse2
-SAD_XMM 8, 4, sse2
-%define movdqu movdqa
-SAD_XMM 16, 16, sse2_aligned
-SAD_XMM 16, 8, sse2_aligned
-SAD_XMM 8, 16, sse2_aligned
-SAD_XMM 8, 8, sse2_aligned
-%undef movdqu
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_XMM 16, 16, ssse3
-SAD_XMM 16, 8, ssse3
-SAD_XMM 8, 16, ssse3
-SAD_XMM 8, 8, ssse3
-SAD_XMM 8, 4, ssse3
-%define movdqu movdqa
-SAD_XMM 16, 16, ssse3_aligned
-SAD_XMM 16, 8, ssse3_aligned
-SAD_XMM 8, 16, ssse3_aligned
-SAD_XMM 8, 8, ssse3_aligned
-%undef movdqu
-%undef ABS1
-%undef ABS2
+INIT_XMM sse2
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+SAD 8, 4
+INIT_XMM sse2, aligned
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+INIT_XMM ssse3
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+SAD 8, 4
+INIT_XMM ssse3, aligned
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+INIT_YMM avx2
+SAD 16, 16
+SAD 16, 8
;=============================================================================
; SAD x3/x4
psubw m0, m3
psubw m1, m3
psubw m2, m3
- ABS2 m0, m1, m4, m5
- ABS1 m2, m6
+ ABSW2 m0, m1, m0, m1, m4, m5
+ ABSW m2, m2, m6
%endmacro
%macro SAD_X3_ONE 2
psubw m3, m6
psubw m4, m6
psubw m5, m6
- ABS2 m3, m4, m7, m6
- ABS1 m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m6
+ ABSW m5, m5, m6
paddw m0, m3
paddw m1, m4
paddw m2, m5
HADDW m1, m4
HADDW m2, m5
%endif
-%ifdef UNIX64
- movd [r5+0], m0
- movd [r5+4], m1
- movd [r5+8], m2
+%if UNIX64
+ movd [r5+0], xm0
+ movd [r5+4], xm1
+ movd [r5+8], xm2
%else
mov r0, r5mp
- movd [r0+0], m0
- movd [r0+4], m1
- movd [r0+8], m2
+ movd [r0+0], xm0
+ movd [r0+4], xm1
+ movd [r0+8], xm2
%endif
RET
%endmacro
psubw m1, m4
psubw m2, m4
psubw m3, m4
- ABS2 m0, m1, m5, m6
- ABS2 m2, m3, m4, m7
+ ABSW2 m0, m1, m0, m1, m5, m6
+ ABSW2 m2, m3, m2, m3, m4, m7
%endmacro
%macro SAD_X4_ONE 2
psubw m6, m4
psubw m7, m4
psubw m8, m4
- ABS2 m5, m6, m9, m10
- ABS2 m7, m8, m9, m10
+ ABSW2 m5, m6, m5, m6, m9, m10
+ ABSW2 m7, m8, m7, m8, m9, m10
paddw m0, m5
paddw m1, m6
paddw m2, m7
paddw m3, m8
-%elifidn ABS1, ABS1_SSSE3
+%elif cpuflag(ssse3)
movu m7, [r3+%2]
psubw m5, m4
psubw m6, m4
%else ; num_mmregs == 8 && !ssse3
psubw m5, m4
psubw m6, m4
- ABS1 m5, m7
- ABS1 m6, m7
+ ABSW m5, m5, m7
+ ABSW m6, m6, m7
paddw m0, m5
paddw m1, m6
movu m5, [r3+%2]
movu m6, [r4+%2]
psubw m5, m4
psubw m6, m4
- ABS2 m5, m6, m7, m4
+ ABSW2 m5, m6, m5, m6, m7, m4
paddw m2, m5
paddw m3, m6
%endif
HADDW m3, m7
%endif
mov r0, r6mp
- movd [r0+ 0], m0
- movd [r0+ 4], m1
- movd [r0+ 8], m2
- movd [r0+12], m3
+ movd [r0+ 0], xm0
+ movd [r0+ 4], xm1
+ movd [r0+ 8], xm2
+ movd [r0+12], xm3
RET
%endmacro
%endrep
%endmacro
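+
+;-----------------------------------------------------------------------------
+; int pixel_vsad( uint16_t *src, intptr_t stride, int height )
+;-----------------------------------------------------------------------------
+; Presumed semantics, inferred from the code below (the prototype names are
+; placeholders): sum of absolute differences between vertically adjacent rows
+; of a 16-pixel-wide strip, roughly
+;
+;   int sum = 0;
+;   for( int y = 0; y < height-1; y++ )
+;       for( int x = 0; x < 16; x++ )
+;           sum += abs( src[y*stride + x] - src[(y+1)*stride + x] );
+;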
+%macro PIXEL_VSAD 0
+cglobal pixel_vsad, 3,3,8
+ mova m0, [r0]
+ mova m1, [r0+16]
+ mova m2, [r0+2*r1]
+ mova m3, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m0, m2
+ psubw m1, m3
+ ABSW2 m0, m1, m0, m1, m4, m5
+ paddw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m4, [r0]
+ mova m5, [r0+16]
+ mova m6, [r0+2*r1]
+ mova m7, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m2, m4
+ psubw m3, m5
+ psubw m4, m6
+ psubw m5, m7
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ ABSW m4, m4, m1
+ ABSW m5, m5, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m2, m6
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
+%else
+ HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
+%endif
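+ ; (presumably HADDW sums the word lanes as signed via pmaddwd, so it is only
+ ; safe while every lane stays below 32768; the 10-bit worst case above
+ ; exceeds that, hence the unsigned reduction)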
+ movd eax, m0
+ RET
+%endmacro
+INIT_XMM sse2
+PIXEL_VSAD
+INIT_XMM ssse3
+PIXEL_VSAD
+INIT_XMM xop
+PIXEL_VSAD
+
+INIT_YMM avx2
+cglobal pixel_vsad, 3,3
+ mova m0, [r0]
+ mova m1, [r0+2*r1]
+ lea r0, [r0+4*r1]
+ psubw m0, m1
+ pabsw m0, m0
+ sub r2d, 2
+ je .end
+.loop:
+ mova m2, [r0]
+ mova m3, [r0+2*r1]
+ lea r0, [r0+4*r1]
+ psubw m1, m2
+ psubw m2, m3
+ pabsw m1, m1
+ pabsw m2, m2
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1
+%else
+ HADDUW m0, m1
+%endif
+ movd eax, xm0
+ RET
+
;-----------------------------------------------------------------------------
-; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
-; uint16_t *pix2, int i_stride, int scores[3] )
+; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
+; uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
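+;
+; Illustrative only (the helper name is a placeholder): each variant behaves
+; roughly like
+;
+;   for( int i = 0; i < 3; i++ )   // 4 iterations for the x4 variants
+;       scores[i] = sad_WxH( fenc, FENC_STRIDE, pix[i], i_stride );
+;
+; i.e. the same WxH SAD against three (or four) candidate blocks that share a
+; single stride, with fenc laid out at the fixed FENC_STRIDE.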
-%macro SAD_X 4
-cglobal pixel_sad_x%1_%2x%3_%4, 6,7,XMM_REGS
+%macro SAD_X 3
+cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
-%ifdef WIN64
- movsxd STRIDE, STRIDE %+ d
-%endif
mov r6, %3/2-1
SAD_X%1_ONE_START
SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
SAD_X%1_INC_P
- dec r6
SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
+ dec r6
jg .loop
%if %1 == 4
mov r6, r6m
SAD_X%1_END %2, %3
%endmacro
-INIT_MMX
+INIT_MMX mmx2
%define XMM_REGS 0
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_X 3, 16, 16, mmxext
-SAD_X 3, 16, 8, mmxext
-SAD_X 3, 8, 16, mmxext
-SAD_X 3, 8, 8, mmxext
-SAD_X 3, 8, 4, mmxext
-SAD_X 3, 4, 8, mmxext
-SAD_X 3, 4, 4, mmxext
-SAD_X 4, 16, 16, mmxext
-SAD_X 4, 16, 8, mmxext
-SAD_X 4, 8, 16, mmxext
-SAD_X 4, 8, 8, mmxext
-SAD_X 4, 8, 4, mmxext
-SAD_X 4, 4, 8, mmxext
-SAD_X 4, 4, 4, mmxext
-%define ABS1 ABS1_SSSE3
-%define ABS2 ABS2_SSSE3
-SAD_X 3, 4, 8, ssse3
-SAD_X 3, 4, 4, ssse3
-SAD_X 4, 4, 8, ssse3
-SAD_X 4, 4, 4, ssse3
-INIT_XMM
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_MMX ssse3
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_XMM ssse3
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
%define XMM_REGS 9
-SAD_X 3, 16, 16, ssse3
-SAD_X 3, 16, 8, ssse3
-SAD_X 3, 8, 16, ssse3
-SAD_X 3, 8, 8, ssse3
-SAD_X 3, 8, 4, ssse3
-SAD_X 4, 16, 16, ssse3
-SAD_X 4, 16, 8, ssse3
-SAD_X 4, 8, 16, ssse3
-SAD_X 4, 8, 8, ssse3
-SAD_X 4, 8, 4, ssse3
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_XMM sse2
+%define XMM_REGS 8
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
%define XMM_REGS 11
-%define ABS1 ABS1_MMX
-%define ABS2 ABS2_MMX
-SAD_X 3, 16, 16, sse2
-SAD_X 3, 16, 8, sse2
-SAD_X 3, 8, 16, sse2
-SAD_X 3, 8, 8, sse2
-SAD_X 3, 8, 4, sse2
-SAD_X 4, 16, 16, sse2
-SAD_X 4, 16, 8, sse2
-SAD_X 4, 8, 16, sse2
-SAD_X 4, 8, 8, sse2
-SAD_X 4, 8, 4, sse2
-%undef ABS1
-%undef ABS2
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_XMM xop
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_YMM avx2
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
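+; Computes the SAD of the 4x4 fenc block against the V, H and DC intra
+; predictions built from the decoded neighbours in fdec; judging by the stores
+; at the end of the macro, res[0]/res[1]/res[2] hold the V/H/DC costs.
+; Rough sketch (predict_* and sad_4x4 are placeholder names):
+;
+;   res[0] = sad_4x4( fenc, predict_v(fdec) );
+;   res[1] = sad_4x4( fenc, predict_h(fdec) );
+;   res[2] = sad_4x4( fenc, predict_dc(fdec) );  // DC = (4 top + 4 left + 4) >> 3
+;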
+
+%macro INTRA_SAD_X3_4x4 0
+cglobal intra_sad_x3_4x4, 3,3,7
+%if cpuflag(ssse3)
+ movddup m0, [r1-1*FDEC_STRIDEB]
+%else
+ movq m0, [r1-1*FDEC_STRIDEB]
+ punpcklqdq m0, m0
+%endif
+ movq m1, [r0+0*FENC_STRIDEB]
+ movq m2, [r0+2*FENC_STRIDEB]
+ pshuflw m6, m0, q1032
+ paddw m6, m0
+ pshuflw m5, m6, q2301
+ paddw m6, m5
+ punpcklqdq m6, m6 ; A+B+C+D 8 times
+ movhps m1, [r0+1*FENC_STRIDEB]
+ movhps m2, [r0+3*FENC_STRIDEB]
+ psubw m3, m1, m0
+ psubw m0, m2
+ ABSW2 m3, m0, m3, m0, m4, m5
+ paddw m0, m3
+ movd m3, [r1+0*FDEC_STRIDEB-4]
+ movd m4, [r1+2*FDEC_STRIDEB-4]
+ movhps m3, [r1+1*FDEC_STRIDEB-8]
+ movhps m4, [r1+3*FDEC_STRIDEB-8]
+ pshufhw m3, m3, q3333
+ pshufhw m4, m4, q3333
+ pshuflw m3, m3, q1111 ; FF FF EE EE
+ pshuflw m4, m4, q1111 ; HH HH GG GG
+ paddw m5, m3, m4
+ paddw m6, [pw_4]
+ paddw m6, m5
+ pshufd m5, m5, q1032
+ paddw m5, m6
+ psrlw m5, 3
+ psubw m6, m5, m2
+ psubw m5, m1
+ psubw m1, m3
+ psubw m2, m4
+ ABSW2 m5, m6, m5, m6, m3, m4
+ ABSW2 m1, m2, m1, m2, m3, m4
+ paddw m5, m6
+ paddw m1, m2
+%if cpuflag(ssse3)
+ phaddw m0, m1
+ movhlps m3, m5
+ paddw m5, m3
+ phaddw m0, m5
+ pmaddwd m0, [pw_1]
+ mova [r2], m0
+%else
+ HADDW m0, m3
+ HADDW m1, m3
+ HADDW m5, m3
+ movd [r2], m0 ; V prediction cost
+ movd [r2+4], m1 ; H prediction cost
+ movd [r2+8], m5 ; DC prediction cost
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+INTRA_SAD_X3_4x4
+INIT_XMM ssse3
+INTRA_SAD_X3_4x4
+INIT_XMM avx
+INTRA_SAD_X3_4x4
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
+;-----------------------------------------------------------------------------
+
+;m0 = DC
+;m6 = V
+;m7 = H
+;m1 = DC score
+;m2 = V score
+;m3 = H score
+;m5 = temp
+;m4 = pixel row
+
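+; Same idea as the 4x4 version, but the neighbours come from the filtered
+; edge[] array: the V prediction is the 8 samples loaded from edge[16..23]
+; below, the H prediction replicates the left neighbours read starting at
+; edge[7], and DC is the rounded average of those 16 samples, (sum + 8) >> 4.
+; Going by the register map above and the stores in the SSE2 path,
+; res[0]/res[1]/res[2] again hold the V/H/DC costs.
+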
+%macro INTRA_SAD_HVDC_ITER 2
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m0
+ ABSW m4, m4, m5
+ ACCUM paddw, 1, 4, %1
+ mova m4, [r0+(%1-4)*FENC_STRIDEB]
+ psubw m4, m6
+ ABSW m4, m4, m5
+ ACCUM paddw, 2, 4, %1
+ pshufd m5, m7, %2
+ psubw m5, [r0+(%1-4)*FENC_STRIDEB]
+ ABSW m5, m5, m4
+ ACCUM paddw, 3, 5, %1
+%endmacro
+
+%macro INTRA_SAD_X3_8x8 0
+cglobal intra_sad_x3_8x8, 3,3,8
+ add r0, 4*FENC_STRIDEB
+ movu m0, [r1+7*SIZEOF_PIXEL]
+ mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
+ mova m7, m0
+ paddw m0, m6
+ punpckhwd m7, m7
+ HADDW m0, m4
+ paddw m0, [pw_8]
+ psrlw m0, 4
+ SPLATW m0, m0
+ INTRA_SAD_HVDC_ITER 0, q3333
+ INTRA_SAD_HVDC_ITER 1, q2222
+ INTRA_SAD_HVDC_ITER 2, q1111
+ INTRA_SAD_HVDC_ITER 3, q0000
+ movq m7, [r1+7*SIZEOF_PIXEL]
+ punpcklwd m7, m7
+ INTRA_SAD_HVDC_ITER 4, q3333
+ INTRA_SAD_HVDC_ITER 5, q2222
+ INTRA_SAD_HVDC_ITER 6, q1111
+ INTRA_SAD_HVDC_ITER 7, q0000
+%if cpuflag(ssse3)
+ phaddw m2, m3 ; 2 2 2 2 3 3 3 3
+ movhlps m3, m1
+ paddw m1, m3 ; 1 1 1 1 _ _ _ _
+ phaddw m2, m1 ; 2 2 3 3 1 1 _ _
+ pmaddwd m2, [pw_1] ; 2 3 1 _
+ mova [r2], m2
+%else
+ HADDW m2, m4
+ HADDW m3, m4
+ HADDW m1, m4
+ movd [r2+0], m2
+ movd [r2+4], m3
+ movd [r2+8], m1
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+INTRA_SAD_X3_8x8
+INIT_XMM ssse3
+INTRA_SAD_X3_8x8
+
+%macro INTRA_SAD_HVDC_ITER_YMM 2
+ mova xm4, [r0+(%1-4)*FENC_STRIDEB]
+ vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
+ pshufd m5, m7, %2
+ psubw m5, m4
+ pabsw m5, m5
+ ACCUM paddw, 2, 5, %1 ; H
+ psubw m5, m4, m6
+ psubw m4, m0
+ pabsw m5, m5
+ pabsw m4, m4
+ ACCUM paddw, 1, 5, %1 ; V
+ ACCUM paddw, 3, 4, %1 ; DC
+%endmacro
+
+INIT_YMM avx2
+cglobal intra_sad_x3_8x8, 3,3,8
+ add r0, 4*FENC_STRIDEB
+ movu xm0, [r1+7*SIZEOF_PIXEL]
+ vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
+ vpermq m7, m0, q0011
+ paddw xm0, xm6
+ paddw xm0, [pw_1] ; equal to +8 after HADDW
+ HADDW xm0, xm4
+ psrld xm0, 4
+ vpbroadcastw m0, xm0
+ punpcklwd m7, m7
+ INTRA_SAD_HVDC_ITER_YMM 0, q3333
+ INTRA_SAD_HVDC_ITER_YMM 1, q2222
+ INTRA_SAD_HVDC_ITER_YMM 2, q1111
+ INTRA_SAD_HVDC_ITER_YMM 3, q0000
+ phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
+ punpckhqdq m2, m3, m3
+ paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
+ phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
+ vextracti128 xm2, m1, 1
+ paddw xm1, xm2 ; 1 1 2 2 3 3 _ _
+ pmaddwd xm1, [pw_1] ; 1 2 3 _
+ mova [r2], xm1
+ RET