+%endmacro ; INTRA_X3_MMX
+
+
+
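+; PRED4x4_LOWPASS dst, in1, in2, src, tmp
+; dst = (in1 + 2*src + in2 + 2) >> 2
+; The pxor/pand/psubusb sequence strips the rounding bit from the first pavgb
+; so the chained averages don't round up twice.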
+%macro PRED4x4_LOWPASS 5
+%ifid %5
+ pavgb %5, %2, %3
+ pxor %3, %2
+ pand %3, [pb_1]
+ psubusb %5, %3
+ pavgb %1, %4, %5
+%else
+ mova %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ pand %3, [pb_1]
+ psubusb %2, %3
+ pavgb %1, %4, %2
+%endif
+%endmacro
+
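+; INTRA_X9_PRED <lut prefix>, <edge spill>
+; Load the 4x4 edge pixels (4 left, left-top, 8 top), lowpass-filter them, and
+; shuffle out the six directional predictions, two modes packed per register:
+; ddl+ddr rows 0-1/2-3 in m2/m3, vr+vl in m4/m5, hd+hu in m6/m7. The unfiltered
+; edge is saved in %2 for the v/h/dc modes.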
+%macro INTRA_X9_PRED 2
+%if cpuflag(sse4)
+ movu m1, [r1-1*FDEC_STRIDE-8]
+ pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
+ pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
+ pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
+ pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
+%else
+ movd mm0, [r1+3*FDEC_STRIDE-4]
+ punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
+ movd mm1, [r1+1*FDEC_STRIDE-4]
+ punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
+ punpckhwd mm0, mm1
+ psrlq mm0, 32
+ movq2dq m0, mm0
+ movu m1, [r1-1*FDEC_STRIDE-8]
+ movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
+%endif ; cpuflag
+ pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
+ psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
+ psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
+ pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
+ mova %2, m1
+ PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
+ ; ddl ddr
+ ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
+ ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
+ ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
+ ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
+ pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
+ pshufb m3, m0, [%1_ddlr2] ; rows 2,3
+ ; hd hu
+ ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
+ ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
+ ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
+ ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
+ pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+ palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
+ pshufb m6, m7, [%1_hdu1]
+ pshufb m7, m7, [%1_hdu2]
+ ; vr vl
+ ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
+ ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
+ ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
+ ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+ psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
+ palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
+ pshufb m4, m5, [%1_vrl1]
+ pshufb m5, m5, [%1_vrl2]
+%endmacro ; INTRA_X9_PRED
+
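+; Compute the v, h, and dc mode satds in the Hadamard domain from the fenc sums.
+; The v/h predicted pixels are stored at pred_buf+0x60/0x70 and the broadcast dc
+; at pred_buf+0x80-0x98; the satds are left packed in m1 as v, _, h, dc.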
+%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
+ pshufb m2, m%1, [intrax9b_vh1]
+ pshufb m3, m%1, [intrax9b_vh2]
+ mova [pred_buf+0x60], m2
+ mova [pred_buf+0x70], m3
+ pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
+ pmaddubsw m%1, [hmul_4p]
+ pshufhw m0, m%1, q2301
+ pshuflw m0, m0, q2301
+ psignw m%1, [pw_pmpmpmpm]
+ paddw m0, m%1
+ psllw m0, 2 ; hadamard(top), hadamard(left)
+ movhlps m3, m0
+ pshufb m1, m0, [intrax9b_v1]
+ pshufb m2, m0, [intrax9b_v2]
+ paddw m0, m3
+ psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
+ pavgw m0, [pw_16]
+ pand m0, [sw_f0] ; dc
+ ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
+ ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs,
+ ; which would be faster on Conroe but slower on Penryn and Sandy Bridge, and is
+ ; too invasive to ifdef.
+ HADAMARD 0, sumsub, %2, %3, %4, %5
+ HADAMARD 1, sumsub, %2, %3, %4, %5
+ movd r3d, m0
+ shr r3d, 4
+ imul r3d, 0x01010101
+ mov [pred_buf+0x80], r3d
+ mov [pred_buf+0x88], r3d
+ mov [pred_buf+0x90], r3d
+ mov [pred_buf+0x98], r3d
+ psubw m3, m%2
+ psubw m0, m%2
+ psubw m1, m%2
+ psubw m2, m%3
+ pabsw m%3, m%3
+ pabsw m3, m3
+ pabsw m0, m0
+ pabsw m1, m1
+ pabsw m2, m2
+ pavgw m3, m%3
+ pavgw m0, m%3
+ pavgw m1, m2
+%if cpuflag(sse4)
+ phaddw m3, m0
+%else
+ SBUTTERFLY qdq, 3, 0, 2
+ paddw m3, m0
+%endif
+ movhlps m2, m1
+ paddw m1, m2
+%if cpuflag(xop)
+ vphaddwq m3, m3
+ vphaddwq m1, m1
+ packssdw m1, m3
+%else
+ phaddw m1, m3
+ pmaddwd m1, [pw_1] ; v, _, h, dc
+%endif
+%endmacro ; INTRA_X9_VHDC
+
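+; INTRA_X9_END <is sad>, <lut prefix>
+; Pick the cheapest of the eight packed mode costs in m0, compare it against the
+; vertical mode's cost in r3d, and copy the winning prediction from pred_buf to
+; fdec. Returns (mode<<16) + cost in eax.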
+%macro INTRA_X9_END 2
+%if cpuflag(sse4)
+ phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
+ movd eax, m0
+ add eax, 1<<16
+ cmp ax, r3w
+ cmovge eax, r3d
+%else
+%if %1
+ ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
+ psllw m0, 3
+ paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
+%else
+ ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
+ psllw m0, 2
+ paddusw m0, m0
+ paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
+%endif
+ movhlps m1, m0
+ pminsw m0, m1
+ pshuflw m1, m0, q0032
+ pminsw m0, m1
+ pshuflw m1, m0, q0001
+ pminsw m0, m1
+ movd eax, m0
+ movsx r2d, ax
+ and eax, 7
+ sar r2d, 3
+ shl eax, 16
+ ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
+ ; 1<<12: undo sign manipulation
+ lea eax, [rax+r2+(1<<16)+(1<<12)]
+ cmp ax, r3w
+ cmovge eax, r3d
+%endif ; cpuflag
+
+ ; output the predicted samples
+ mov r3d, eax
+ shr r3d, 16
+%ifdef PIC
+ lea r2, [%2_lut]
+ movzx r2d, byte [r2+r3]
+%else
+ movzx r2d, byte [%2_lut+r3]
+%endif
+%if %1 ; sad
+ movq mm0, [pred_buf+r2]
+ movq mm1, [pred_buf+r2+16]
+ movd [r1+0*FDEC_STRIDE], mm0
+ movd [r1+2*FDEC_STRIDE], mm1
+ psrlq mm0, 32
+ psrlq mm1, 32
+ movd [r1+1*FDEC_STRIDE], mm0
+ movd [r1+3*FDEC_STRIDE], mm1
+%else ; satd
+%assign i 0
+%rep 4
+ mov r3d, [pred_buf+r2+8*i]
+ mov [r1+i*FDEC_STRIDE], r3d
+%assign i i+1
+%endrep
+%endif
+%endmacro ; INTRA_X9_END
+
+%macro INTRA_X9 0
+;-----------------------------------------------------------------------------
+; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
+;-----------------------------------------------------------------------------
+%if notcpuflag(xop)
+cglobal intra_sad_x9_4x4, 3,4,9
+ %assign pad 0xc0-gprsize-(stack_offset&15)
+ %define pred_buf rsp
+ sub rsp, pad
+%if ARCH_X86_64
+ INTRA_X9_PRED intrax9a, m8
+%else
+ INTRA_X9_PRED intrax9a, [rsp+0xa0]
+%endif
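+ ; spill the six directional predictions so INTRA_X9_END can copy out the winner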
+ mova [rsp+0x00], m2
+ mova [rsp+0x10], m3
+ mova [rsp+0x20], m4
+ mova [rsp+0x30], m5
+ mova [rsp+0x40], m6
+ mova [rsp+0x50], m7
+%if cpuflag(sse4)
+ movd m0, [r0+0*FENC_STRIDE]
+ pinsrd m0, [r0+1*FENC_STRIDE], 1
+ movd m1, [r0+2*FENC_STRIDE]
+ pinsrd m1, [r0+3*FENC_STRIDE], 1
+%else
+ movd mm0, [r0+0*FENC_STRIDE]
+ punpckldq mm0, [r0+1*FENC_STRIDE]
+ movd mm1, [r0+2*FENC_STRIDE]
+ punpckldq mm1, [r0+3*FENC_STRIDE]
+ movq2dq m0, mm0
+ movq2dq m1, mm1
+%endif
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
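+ ; two modes are packed per prediction register (one per qword), so each psadbw
+ ; yields partial sads for both modes at once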
+ psadbw m2, m0
+ psadbw m3, m1
+ psadbw m4, m0
+ psadbw m5, m1
+ psadbw m6, m0
+ psadbw m7, m1
+ paddd m2, m3
+ paddd m4, m5
+ paddd m6, m7
+%if ARCH_X86_64
+ SWAP 7, 8
+ pxor m8, m8
+ %define %%zero m8
+%else
+ mova m7, [rsp+0xa0]
+ %define %%zero [pb_0]
+%endif
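+ ; v/h: shuffle the saved edge into both predictions; dc = (sum+4)>>3, computed
+ ; as ((sum>>2)+1)>>1 via psrlw+pavgw, then broadcast by pshufb against zero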
+ pshufb m3, m7, [intrax9a_vh1]
+ pshufb m5, m7, [intrax9a_vh2]
+ pshufb m7, [intrax9a_dc]
+ psadbw m7, %%zero
+ psrlw m7, 2
+ mova [rsp+0x60], m3
+ mova [rsp+0x70], m5
+ psadbw m3, m0
+ pavgw m7, %%zero
+ pshufb m7, %%zero
+ psadbw m5, m1
+ movq [rsp+0x80], m7
+ movq [rsp+0x90], m7
+ psadbw m0, m7
+ paddd m3, m5
+ psadbw m1, m7
+ paddd m0, m1
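+ ; v's cost goes to r3d; the other eight sads are gathered into word order
+ ; h,dc,ddl,ddr,vr,hd,vl,hu and the bitcosts from [r2+2] are added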
+ movzx r3d, word [r2]
+ movd r0d, m3 ; v
+ add r3d, r0d
+ punpckhqdq m3, m0 ; h, dc
+ shufps m3, m2, q2020
+ psllq m6, 32
+ por m4, m6
+ movu m0, [r2+2]
+ packssdw m3, m4
+ paddw m0, m3
+ INTRA_X9_END 1, intrax9a
+ add rsp, pad
+ RET
+%endif ; cpuflag
+
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
+;-----------------------------------------------------------------------------
+cglobal intra_satd_x9_4x4, 3,4,16
+ %assign pad 0xb0-gprsize-(stack_offset&15)
+ %define pred_buf rsp
+ sub rsp, pad
+ INTRA_X9_PRED intrax9b, m15
+ mova [rsp+0x00], m2
+ mova [rsp+0x10], m3
+ mova [rsp+0x20], m4
+ mova [rsp+0x30], m5
+ mova [rsp+0x40], m6
+ mova [rsp+0x50], m7
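+ ; broadcast each fenc row; pmaddubsw against hmul_8p performs the first
+ ; Hadamard stage (pairwise sums in one half, pairwise differences in the other)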
+ movd m8, [r0+0*FENC_STRIDE]
+ movd m9, [r0+1*FENC_STRIDE]
+ movd m10, [r0+2*FENC_STRIDE]
+ movd m11, [r0+3*FENC_STRIDE]
+ mova m12, [hmul_8p]
+ pshufd m8, m8, 0
+ pshufd m9, m9, 0
+ pshufd m10, m10, 0
+ pshufd m11, m11, 0
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ pmaddubsw m10, m12
+ pmaddubsw m11, m12
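+ ; each .satd_8x4 call scores two modes at once: m0-m3 get rows 0-3, each
+ ; holding both modes' 4-pixel rows side by side through the transform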
+ movddup m0, m2
+ pshufd m1, m2, q3232
+ movddup m2, m3
+ movhlps m3, m3
+ call .satd_8x4 ; ddr, ddl
+ movddup m2, m5
+ pshufd m3, m5, q3232
+ mova m5, m0
+ movddup m0, m4
+ pshufd m1, m4, q3232
+ call .satd_8x4 ; vr, vl
+ movddup m2, m7
+ pshufd m3, m7, q3232
+ mova m4, m0
+ movddup m0, m6
+ pshufd m1, m6, q3232
+ call .satd_8x4 ; hd, hu
+%if cpuflag(sse4)
+ punpckldq m4, m0
+%else
+ punpcklqdq m4, m0 ; Conroe dislikes punpckldq, and the SSSE3 INTRA_X9_END can handle arbitrary orders, whereas phminposuw can't
+%endif
+ mova m1, [pw_ppmmppmm]
+ psignw m8, m1
+ psignw m10, m1
+ paddw m8, m9
+ paddw m10, m11
+ INTRA_X9_VHDC 15, 8, 10, 6, 7
+ ; find minimum
+ movu m0, [r2+2]
+ movd r3d, m1
+ palignr m5, m1, 8
+%if notcpuflag(sse4)
+ pshufhw m0, m0, q3120 ; compensate for different order in unpack
+%endif
+ packssdw m5, m4
+ paddw m0, m5
+ movzx r0d, word [r2]
+ add r3d, r0d
+ INTRA_X9_END 0, intrax9b
+ add rsp, pad
+ RET
+RESET_MM_PERMUTATION
+ALIGN 16
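+; in: m0-m3 = rows 0-3 of two packed 4x4 predictions (raw bytes)
+; out: the two satds summed into the low two dwords of xmm0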
+.satd_8x4:
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmaddubsw m2, m12
+ pmaddubsw m3, m12
+ psubw m0, m8
+ psubw m1, m9
+ psubw m2, m10
+ psubw m3, m11
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
+ pmaddwd m0, [pw_1]
+%if cpuflag(sse4)
+ pshufd m1, m0, q0032
+%else
+ movhlps m1, m0
+%endif
+ paddd xmm0, m0, m1 ; consistent location of the return value; only the AVX version of the Hadamard permutes m0, so the 3-arg form is free
+ ret
+
+%else ; !ARCH_X86_64
+cglobal intra_satd_x9_4x4, 3,4,8
+ %assign pad 0x120-gprsize-(stack_offset&15)
+ %define fenc_buf rsp
+ %define pred_buf rsp+0x40
+ %define spill rsp+0xe0
+ sub rsp, pad
+ INTRA_X9_PRED intrax9b, [spill+0x20]
+ mova [pred_buf+0x00], m2
+ mova [pred_buf+0x10], m3
+ mova [pred_buf+0x20], m4
+ mova [pred_buf+0x30], m5
+ mova [pred_buf+0x40], m6
+ mova [pred_buf+0x50], m7
+ movd m4, [r0+0*FENC_STRIDE]
+ movd m5, [r0+1*FENC_STRIDE]
+ movd m6, [r0+2*FENC_STRIDE]
+ movd m0, [r0+3*FENC_STRIDE]
+ mova m7, [hmul_8p]
+ pshufd m4, m4, 0
+ pshufd m5, m5, 0
+ pshufd m6, m6, 0
+ pshufd m0, m0, 0
+ pmaddubsw m4, m7
+ pmaddubsw m5, m7
+ pmaddubsw m6, m7
+ pmaddubsw m0, m7
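+ ; only 8 xmm regs on x86_32, so keep the premultiplied fenc rows in memory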
+ mova [fenc_buf+0x00], m4
+ mova [fenc_buf+0x10], m5
+ mova [fenc_buf+0x20], m6
+ mova [fenc_buf+0x30], m0
+ movddup m0, m2
+ pshufd m1, m2, q3232
+ movddup m2, m3
+ movhlps m3, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ call .satd_8x4b ; ddr, ddl
+ mova m3, [pred_buf+0x30]
+ mova m1, [pred_buf+0x20]
+ movddup m2, m3
+ movhlps m3, m3
+ movq [spill+0x08], m0
+ movddup m0, m1
+ movhlps m1, m1
+ call .satd_8x4 ; vr, vl
+ mova m3, [pred_buf+0x50]
+ mova m1, [pred_buf+0x40]
+ movddup m2, m3
+ movhlps m3, m3
+ movq [spill+0x10], m0
+ movddup m0, m1
+ movhlps m1, m1
+ call .satd_8x4 ; hd, hu
+ movq [spill+0x18], m0
+ mova m1, [spill+0x20]
+ mova m4, [fenc_buf+0x00]
+ mova m5, [fenc_buf+0x20]
+ mova m2, [pw_ppmmppmm]
+ psignw m4, m2
+ psignw m5, m2
+ paddw m4, [fenc_buf+0x10]
+ paddw m5, [fenc_buf+0x30]
+ INTRA_X9_VHDC 1, 4, 5, 6, 7
+ ; find minimum
+ movu m0, [r2+2]
+ movd r3d, m1
+ punpckhqdq m1, [spill+0x00]
+ packssdw m1, [spill+0x10]
+%if cpuflag(sse4)
+ pshufhw m1, m1, q3120
+%else
+ pshufhw m0, m0, q3120
+%endif
+ paddw m0, m1
+ movzx r0d, word [r2]
+ add r3d, r0d
+ INTRA_X9_END 0, intrax9b
+ add rsp, pad
+ RET
+RESET_MM_PERMUTATION
+ALIGN 16
+.satd_8x4:
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
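+ ; the call pushed a return address, so the rsp-relative fenc_buf is now gprsize higher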
+ %xdefine fenc_buf fenc_buf+gprsize
+ psubw m0, [fenc_buf+0x00]
+ psubw m1, [fenc_buf+0x10]
+ psubw m2, [fenc_buf+0x20]
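+ ; entry for the first call, whose call site does the pmaddubsws and the first
+ ; three psubws inline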
+.satd_8x4b:
+ psubw m3, [fenc_buf+0x30]
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
+ pmaddwd m0, [pw_1]
+%if cpuflag(sse4)
+ pshufd m1, m0, q0032
+%else
+ movhlps m1, m0
+%endif
+ paddd xmm0, m0, m1
+ ret
+%endif ; ARCH
+%endmacro ; INTRA_X9
+
+
+
+%macro INTRA8_X9 0
+;-----------------------------------------------------------------------------
+; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
+;-----------------------------------------------------------------------------
+cglobal intra_sad_x9_8x8, 5,6,9
+ %define fenc02 m4
+ %define fenc13 m5
+ %define fenc46 m6
+ %define fenc57 m7
+%if ARCH_X86_64
+ %define tmp m8
+ %assign padbase 0x0
+%else
+ %define tmp [rsp]
+ %assign padbase 0x10
+%endif
+ %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
+ %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
+
+ SUB rsp, pad
+ movq fenc02, [r0+FENC_STRIDE* 0]
+ movq fenc13, [r0+FENC_STRIDE* 1]
+ movq fenc46, [r0+FENC_STRIDE* 4]
+ movq fenc57, [r0+FENC_STRIDE* 5]
+ movhps fenc02, [r0+FENC_STRIDE* 2]
+ movhps fenc13, [r0+FENC_STRIDE* 3]
+ movhps fenc46, [r0+FENC_STRIDE* 6]
+ movhps fenc57, [r0+FENC_STRIDE* 7]
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+128]
+ %define off(m) (r0+m-(intra8x9_h1+128))
+
+; v
+ movddup m0, [r2+16]
+ mova pred(0,0), m0
+ psadbw m1, m0, fenc02
+ mova pred(0,1), m0
+ psadbw m2, m0, fenc13
+ mova pred(0,2), m0
+ psadbw m3, m0, fenc46
+ mova pred(0,3), m0
+ psadbw m0, m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ movd [r4+0], m0
+
+; h
+ movq m0, [r2+7]
+ pshufb m1, m0, [off(intra8x9_h1)]
+ pshufb m2, m0, [off(intra8x9_h2)]
+ mova pred(1,0), m1
+ psadbw m1, fenc02
+ mova pred(1,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m3, m0, [off(intra8x9_h3)]
+ pshufb m2, m0, [off(intra8x9_h4)]
+ mova pred(1,2), m3
+ psadbw m3, fenc46
+ mova pred(1,3), m2
+ psadbw m2, fenc57
+ paddw m1, m3
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+2], m1
+
+ lea r5, [rsp+padbase+0x100]
+ %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
+
+; dc
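+; dc = (sum of 8 top + 8 left + 8) >> 4, computed as ((sum>>3)+1)>>1 via
+; psrlw+pavgw, then broadcast by pshufb against zero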
+ movhps m0, [r2+16]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ pshufb m0, m2
+ mova pred(2,0), m0
+ psadbw m1, m0, fenc02
+ mova pred(2,1), m0
+ psadbw m2, m0, fenc13
+ mova pred(2,2), m0
+ psadbw m3, m0, fenc46
+ mova pred(2,3), m0
+ psadbw m0, m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ movd [r4+4], m0
+
+; ddl
+; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
+; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
+; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
+; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
+; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
+; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
+; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
+; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
+ mova m0, [r2+16]
+ movu m2, [r2+17]
+ pslldq m1, m0, 1
+ pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
+ PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
+ pshufb m1, m0, [off(intra8x9_ddl1)]
+ pshufb m2, m0, [off(intra8x9_ddl2)]
+ mova pred(3,0), m1
+ psadbw m1, fenc02
+ mova pred(3,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddl3)]
+ mova pred(3,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddl4)]
+ mova pred(3,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+6], m1
+
+; vl
+; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
+; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
+; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
+; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
+; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
+; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
+; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
+; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
+ pshufb m1, m3, [off(intra8x9_vl1)]
+ pshufb m2, m0, [off(intra8x9_vl2)]
+ pshufb m3, m3, [off(intra8x9_vl3)]
+ pshufb m0, m0, [off(intra8x9_vl4)]
+ mova pred(7,0), m1
+ psadbw m1, fenc02
+ mova pred(7,1), m2
+ psadbw m2, fenc13
+ mova pred(7,2), m3
+ psadbw m3, fenc46
+ mova pred(7,3), m0
+ psadbw m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(sse4)
+ pextrw [r4+14], m0, 0
+%else
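+ ; pextrw to memory needs SSE4.1; spill through r5, then restore the pred pointer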
+ movd r5d, m0
+ mov [r4+14], r5w
+ lea r5, [rsp+padbase+0x100]
+%endif
+
+; ddr
+; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
+; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
+; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
+; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
+; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
+; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
+ movu m2, [r2+8]
+ movu m0, [r2+7]
+ movu m1, [r2+6]
+ pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+ PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+ pshufb m1, m0, [off(intra8x9_ddr1)]
+ pshufb m2, m0, [off(intra8x9_ddr2)]
+ mova pred(4,0), m1
+ psadbw m1, fenc02
+ mova pred(4,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddr3)]
+ mova pred(4,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddr4)]
+ mova pred(4,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+8], m1
+
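+ ; rebase the lut and pred pointers so the remaining offsets still fit in
+ ; one-byte displacements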
+ add r0, 256
+ add r5, 0xC0
+ %define off(m) (r0+m-(intra8x9_h1+256+128))
+ %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
+
+; vr
+; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
+; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
+; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
+; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
+; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
+ movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+ pshufb m1, m2, [off(intra8x9_vr1)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ mova pred(5,0), m1
+ psadbw m1, fenc02
+ mova pred(5,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_vr2)]
+ mova pred(5,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_vr4)]
+ mova pred(5,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+10], m1
+
+; hd
+; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
+; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
+; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
+; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
+; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
+; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
+; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
+ pshufd m2, m3, q0001
+%if cpuflag(sse4)
+ pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
+%else
+ movss m1, m0, m2
+ SWAP 1, 2
+%endif
+ punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
+ pshufb m1, m2, [off(intra8x9_hd1)]
+ pshufb m2, m2, [off(intra8x9_hd2)]
+ mova pred(6,0), m1
+ psadbw m1, fenc02
+ mova pred(6,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_hd3)]
+ pshufb m3, m0, [off(intra8x9_hd4)]
+ mova pred(6,2), m2
+ psadbw m2, fenc46
+ mova pred(6,3), m3
+ psadbw m3, fenc57
+ paddw m1, m2
+ paddw m1, m3
+ movhlps m2, m1
+ paddw m1, m2
+ ; don't store straight to [r4+12]: it's too close to the dqword load from [r4] below and would cause a store-to-load forwarding stall
+ pslldq m1, 12
+ SWAP 3, 1
+
+; hu
+; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
+; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
+; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6
+; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
+; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
+; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
+; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
+; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
+%if cpuflag(sse4)
+ pinsrb m0, [r2+7], 15 ; Gl7
+%else
+ movd m1, [r2+7]
+ pslldq m0, 1
+ palignr m1, m0, 1
+ SWAP 0, 1
+%endif
+ pshufb m1, m0, [off(intra8x9_hu1)]
+ pshufb m2, m0, [off(intra8x9_hu2)]
+ mova pred(8,0), m1
+ psadbw m1, fenc02
+ mova pred(8,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_hu3)]
+ pshufb m0, m0, [off(intra8x9_hu4)]
+ mova pred(8,2), m2
+ psadbw m2, fenc46
+ mova pred(8,3), m0
+ psadbw m0, fenc57
+ paddw m1, m2
+ paddw m1, m0
+ movhlps m2, m1
+ paddw m1, m2
+ movd r2d, m1
+
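+; add the mode bitcosts and write all nine costs to the output array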
+ movu m0, [r3]
+ por m3, [r4]
+ paddw m0, m3
+ mova [r4], m0
+ movzx r5d, word [r3+16]
+ add r2d, r5d
+ mov [r4+16], r2w