+
+%if HIGH_BIT_DEPTH == 0
+; 8-bit depth only: bind the generic Hadamard-metric macro templates to
+; their AVX2 (256-bit ymm) load helpers and instantiate them.
+%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
+%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+; NOTE(review): SA8D_SATD is only emitted for x86-64 — presumably it needs
+; more registers than x86-32 provides; confirm against the macro definition.
+SA8D_SATD
+%endif
+; Load two 8x8 byte blocks (pix1 at r0/stride r1, pix2 at r2/stride r3) and
+; produce four sum/difference ymm registers for SATD/SA8D.  Rows n and n+4
+; share one ymm register (low/high 128-bit lanes), and each 8-byte row is
+; duplicated into both qwords of its lane (punpcklqdq) so DIFF_SUMSUB_SSSE3
+; can apply the hmul multiplier in %7.
+; Requires r4 = 5*r1 and r5 = 5*r3 (set by SATD_START_AVX2 with %3 = 1).
+; Advances r0/r2 by two rows.
+%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
+ movq xm%1, [r0]                     ; row 0
+ movq xm%3, [r2]
+ movq xm%2, [r0+r1]                  ; row 1
+ movq xm%4, [r2+r3]
+ vinserti128 m%1, m%1, [r0+4*r1], 1  ; row 4 -> high lane
+ vinserti128 m%3, m%3, [r2+4*r3], 1
+ vinserti128 m%2, m%2, [r0+r4], 1    ; row 5 (r4 = 5*r1) -> high lane
+ vinserti128 m%4, m%4, [r2+r5], 1
+ punpcklqdq m%1, m%1                 ; duplicate each row into both qwords
+ punpcklqdq m%3, m%3
+ punpcklqdq m%2, m%2
+ punpcklqdq m%4, m%4
+ DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
+ lea r0, [r0+2*r1]                   ; step to rows 2/3 (6/7 via the +4 offsets)
+ lea r2, [r2+2*r3]
+
+ movq xm%3, [r0]                     ; row 2
+ movq xm%5, [r2]
+ movq xm%4, [r0+r1]                  ; row 3
+ movq xm%6, [r2+r3]
+ vinserti128 m%3, m%3, [r0+4*r1], 1  ; row 6
+ vinserti128 m%5, m%5, [r2+4*r3], 1
+ vinserti128 m%4, m%4, [r0+r4], 1    ; row 7
+ vinserti128 m%6, m%6, [r2+r5], 1
+ punpcklqdq m%3, m%3
+ punpcklqdq m%5, m%5
+ punpcklqdq m%4, m%4
+ punpcklqdq m%6, m%6
+ DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
+%endmacro
+
+; Common SATD/SA8D prologue.
+; %1 = accumulator register to zero, %2 = register for the hmul constant,
+; %3 (default 0) selects the load geometry: 1 = 8-pixel-wide blocks,
+; 0 = 16-pixel-wide blocks.
+%macro SATD_START_AVX2 2-3 0
+ FIX_STRIDES r1, r3
+%if %3
+ ; 8-wide path: rows n and n+4 are paired in one ymm, so the helper
+ ; strides must reach row n+5 -> 5*stride (see LOAD_SUMSUB_8x8P_AVX2)
+ mova %2, [hmul_8p]
+ lea r4, [5*r1]
+ lea r5, [5*r3]
+%else
+ ; 16-wide path: plain 4-row blocks, helper strides are 3*stride
+ mova %2, [hmul_16p]
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+%endif
+ pxor %1, %1
+%endmacro
+
+%define TRANS TRANS_SSE4
+INIT_YMM avx2
+; Accumulates the SATD of one 16x8 half into m6, two 16x4 passes.
+; In: r0/r1 = pix1/stride, r2/r3 = pix2/stride, r4/r5 = 3*stride,
+;     m7 = hmul_16p (from SATD_START_AVX2 with %3 = 0), m6 = accumulator.
+; Plain label (no cglobal prologue): reached via call/jmp only.
+cglobal pixel_satd_16x8_internal
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+; SATD of a 16x16 block: two 16x8 halves accumulated into m6, then the
+; shared tail reduces the two ymm lanes and applies the SATD epilogue.
+cglobal pixel_satd_16x16, 4,6,8
+ SATD_START_AVX2 m6, m7
+ call pixel_satd_16x8_internal
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+; Shared tail: last 16x8 half plus the horizontal reduction of m6.
+; This label must NOT reuse the name pixel_satd_16x8_internal — that symbol
+; is already defined by the cglobal above (duplicate definition), and
+; jumping to the internal directly would skip the SATD_END_SSE2 epilogue.
+pixel_satd_16x8_internal2:
+ call pixel_satd_16x8_internal
+ vextracti128 xm0, m6, 1             ; fold high lane into low
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+; SATD of a 16x16 block's lower-half geometry (16x8): tail-jump into the
+; shared reduction above so the epilogue runs exactly once.
+cglobal pixel_satd_16x8, 4,6,8
+ SATD_START_AVX2 m6, m7
+ jmp pixel_satd_16x8_internal2
+
+; Accumulates the SATD of one 8x8 block into m6.  Expects the register
+; setup of SATD_START_AVX2 with %3 = 1 (m7 = hmul_8p, r4/r5 = 5*stride).
+; Leaves r0/r2 advanced two rows (side effect of LOAD_SUMSUB_8x8P_AVX2).
+cglobal pixel_satd_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
+ ret
+
+; SATD of an 8x16 block: two 8x8 halves accumulated into m6.
+cglobal pixel_satd_8x16, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_satd_8x8_internal
+ ; the internal left r0/r2 two rows past the block start; advance a further
+ ; 2+4 rows to land on row 8 for the lower half
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1             ; fold the two 128-bit lanes
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+; SATD of a single 8x8 block.
+cglobal pixel_satd_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1           ; 8-wide setup: m7 = hmul_8p, r4/r5 = 5*stride
+ call pixel_satd_8x8_internal
+ vextracti128 xm0, m6, 1             ; fold high lane into low before reduce
+ paddw xm0, xm6
+ SATD_END_SSE2 xm0
+ RET
+
+; Accumulates the SA8D of one 8x8 block into m6: full 8x8 Hadamard
+; transform (4-point vertical pass, then 8/2/1-stride passes, the last two
+; with amax to take absolute maxima) over the sum/difference input.
+; Expects the SATD_START_AVX2 %3 = 1 register setup.
+cglobal pixel_sa8d_8x8_internal
+ LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
+ HADAMARD4_V 0, 1, 2, 3, 4
+ HADAMARD 8, sumsub, 0, 1, 4, 5
+ HADAMARD 8, sumsub, 2, 3, 4, 5
+ HADAMARD 2, sumsub, 0, 1, 4, 5
+ HADAMARD 2, sumsub, 2, 3, 4, 5
+ HADAMARD 1, amax, 0, 1, 4, 5
+ HADAMARD 1, amax, 2, 3, 4, 5
+ paddw m6, m0
+ paddw m6, m2
+ ret
+
+; SA8D of a single 8x8 block; returns the rounded half-sum in eax.
+cglobal pixel_sa8d_8x8, 4,6,8
+ SATD_START_AVX2 m6, m7, 1
+ call pixel_sa8d_8x8_internal
+ vextracti128 xm1, m6, 1             ; fold ymm lanes
+ paddw xm6, xm1
+ HADDW xm6, xm1                      ; horizontal sum of word accumulators
+ movd eax, xm6
+ add eax, 1                          ; result = (sum + 1) >> 1
+ shr eax, 1
+ RET
+
+; Compute the 8x8 SADs of all nine intra prediction modes at once, add the
+; per-mode bit costs, pick the cheapest, and write the winning prediction
+; into the decoded frame.  Returns the packed (cost | mode<<16) value in
+; eax (phminposuw layout).
+; In: r0 = fenc, r1 = fdec, r2 = edge pixels, r3 = per-mode bit costs,
+;     r4 = output cost array.
+; NOTE(review): argument roles inferred from FENC_STRIDE/FDEC_STRIDE usage
+; and the [r3]/[r4] cost traffic — confirm against the C prototype.
+; Mode identities below are taken from the shuffle-table names
+; (intra8x9_h*, _ddl*, _ddr*, _vr*, _hd*, _vl*, _hu*).
+cglobal intra_sad_x9_8x8, 5,7,8
+ %define pred(i,j) [rsp+i*0x40+j*0x20]
+
+ ; 32-byte-aligned stack scratch for the nine 64-byte predictions
+ mov r6, rsp
+ and rsp, ~31
+ SUB rsp, 0x240
+ movu m5, [r0+0*FENC_STRIDE]         ; m5/m6 = the 8 source rows,
+ movu m6, [r0+4*FENC_STRIDE]         ; interleaved as rows (0,2|4,6)/(1,3|5,7)
+ punpcklqdq m5, [r0+2*FENC_STRIDE]
+ punpcklqdq m6, [r0+6*FENC_STRIDE]
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+128]
+ %define off(m) (r0+m-(intra8x9_h1+128))
+
+ ; mode 0 (V): broadcast the top edge row
+ vpbroadcastq m0, [r2+16]
+ psadbw m4, m0, m5
+ psadbw m2, m0, m6
+ mova pred(0,0), m0
+ mova pred(0,1), m0
+ paddw m4, m2                        ; m4 collects per-mode SADs, one word each
+
+ ; mode 1 (H): replicate each left-edge pixel across its row
+ vpbroadcastq m1, [r2+7]
+ pshufb m3, m1, [off(intra8x9_h1)]
+ pshufb m2, m1, [off(intra8x9_h3)]
+ mova pred(1,0), m3
+ mova pred(1,1), m2
+ psadbw m3, m5
+ psadbw m2, m6
+ paddw m3, m2
+
+ ; address pred() via r5 from here on (shorter encodings)
+ lea r5, [rsp+0x100]
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
+
+ ; combine the first two
+ pslldq m3, 2
+ por m4, m3
+
+ ; mode 2 (DC): rounded average of the 16 edge pixels still in m0/m1
+ pxor m2, m2
+ psadbw m0, m2
+ psadbw m1, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2                        ; (sum+8)>>4 rounding via shift + pavgw
+ pshufb m0, m2                       ; broadcast the DC byte
+ mova pred(2,0), m0
+ mova pred(2,1), m0
+ psadbw m3, m0, m5
+ psadbw m2, m0, m6
+ paddw m3, m2
+
+ pslldq m3, 4
+ por m4, m3
+
+ ; mode 3 (DDL): lowpass-filtered top edge
+ vbroadcasti128 m0, [r2+16]
+ vbroadcasti128 m2, [r2+17]
+ pslldq m1, m0, 1
+ pavgb m3, m0, m2                    ; avg term, reused for VL below
+ PRED4x4_LOWPASS m0, m1, m2, m0, m7
+ pshufb m1, m0, [off(intra8x9_ddl1)]
+ pshufb m2, m0, [off(intra8x9_ddl3)]
+ mova pred(3,0), m1
+ mova pred(3,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova [r4], xm4                      ; stash SADs of modes 0-3
+
+ ; for later
+ vinserti128 m7, m3, xm0, 1          ; m7 = VL's avg/lowpass pair
+
+ ; mode 4 (DDR): lowpass over left + top-left + top edge
+ vbroadcasti128 m2, [r2+8]
+ vbroadcasti128 m0, [r2+7]
+ vbroadcasti128 m1, [r2+6]
+ pavgb m3, m2, m0
+ PRED4x4_LOWPASS m0, m1, m2, m0, m4
+ pshufb m1, m0, [off(intra8x9_ddr1)]
+ pshufb m2, m0, [off(intra8x9_ddr3)]
+ mova pred(4,0), m1
+ mova pred(4,1), m2
+ psadbw m4, m1, m5
+ psadbw m2, m6
+ paddw m4, m2
+
+ ; rebase r0/r5 again to keep all following offsets in signed-byte range
+ add r0, 256
+ add r5, 0xC0
+ %define off(m) (r0+m-(intra8x9_h1+256+128))
+ %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
+
+ ; mode 5 (VR): blend of the averaged and lowpassed edges
+ vpblendd m2, m3, m0, 11110011b
+ pshufb m1, m2, [off(intra8x9_vr1)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ mova pred(5,0), m1
+ mova pred(5,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 2
+ por m4, m1
+
+ ; mode 6 (HD)
+ psrldq m2, m3, 4
+ pblendw m2, m0, q3330
+ punpcklbw m0, m3
+ pshufb m1, m2, [off(intra8x9_hd1)]
+ pshufb m2, m0, [off(intra8x9_hd3)]
+ mova pred(6,0), m1
+ mova pred(6,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 4
+ por m4, m1
+
+ ; mode 7 (VL): uses the avg/lowpass pair saved in m7 above
+ pshufb m1, m7, [off(intra8x9_vl1)]
+ pshufb m2, m7, [off(intra8x9_vl3)]
+ mova pred(7,0), m1
+ mova pred(7,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+
+ pslldq m1, 6
+ por m4, m1
+ vextracti128 xm1, m4, 1
+ paddw xm4, xm1
+ mova xm3, [r4]
+ SBUTTERFLY qdq, 3, 4, 7
+ paddw xm3, xm4                      ; xm3 = SADs of modes 0-7
+
+ ; mode 8 (HU) handled as a scalar since phminposuw covers only 8 words
+ pslldq m1, m0, 1
+ vpbroadcastd m0, [r2+7]
+ palignr m0, m1, 1
+ pshufb m1, m0, [off(intra8x9_hu1)]
+ pshufb m2, m0, [off(intra8x9_hu3)]
+ mova pred(8,0), m1
+ mova pred(8,1), m2
+ psadbw m1, m5
+ psadbw m2, m6
+ paddw m1, m2
+ vextracti128 xm2, m1, 1
+ paddw xm1, xm2
+ movhlps xm2, xm1
+ paddw xm1, xm2
+ movd r2d, xm1                       ; r2d = SAD of mode 8
+
+ paddw xm3, [r3]                     ; add per-mode bit costs
+ mova [r4], xm3                      ; output mode 0-7 costs
+ add r2w, word [r3+16]
+ mov [r4+16], r2w                    ; output mode 8 cost
+
+ ; pick the cheapest: phminposuw yields (min_cost | index<<16)
+ phminposuw xm3, xm3
+ movd r3d, xm3
+ add r2d, 8<<16                      ; mode-8 candidate carries index 8
+ cmp r3w, r2w
+ cmovg r3d, r2d
+
+ ; copy the winning prediction (index*0x40 into the scratch) to fdec;
+ ; rows were stored interleaved (n, n+2) matching the fenc load layout
+ mov r2d, r3d
+ shr r3, 16
+ shl r3, 6
+ add r1, 4*FDEC_STRIDE
+ mova xm0, [rsp+r3+0x00]
+ mova xm1, [rsp+r3+0x10]
+ mova xm2, [rsp+r3+0x20]
+ mova xm3, [rsp+r3+0x30]
+ movq [r1+FDEC_STRIDE*-4], xm0
+ movhps [r1+FDEC_STRIDE*-2], xm0
+ movq [r1+FDEC_STRIDE*-3], xm1
+ movhps [r1+FDEC_STRIDE*-1], xm1
+ movq [r1+FDEC_STRIDE* 0], xm2
+ movhps [r1+FDEC_STRIDE* 2], xm2
+ movq [r1+FDEC_STRIDE* 1], xm3
+ movhps [r1+FDEC_STRIDE* 3], xm3
+ mov rsp, r6                         ; restore the pre-alignment stack pointer
+ mov eax, r2d                        ; return packed (cost | mode<<16)
+ RET
+%endif ; HIGH_BIT_DEPTH
+