deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
+intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
+intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
+intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
+intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
+intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
+intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
+intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
+intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
+intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
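+; 0x8000-biased 3-bit mode indices used by INTRA_X9_END's packed min search; the
+; second table swaps vl/hd to match the lane order produced by the satd_x9 path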
+pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
+pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
+intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
+
+intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
+intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
+intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
+intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
+intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
+intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
+intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
+intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
+intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
+
sw_f0: dq 0xfff0, 0
+sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
-sq_0f: times 1 dq 0xffffffff
SECTION .text
+cextern pb_0
+cextern pb_1
cextern pw_1
cextern pw_8
+cextern pw_16
cextern pw_64
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
+cextern pw_pmmpzzzz
cextern hsub_mul
;=============================================================================
; SATD
;=============================================================================
-%define TRANS TRANS_SSE2
-
%macro JDUP 2
%if cpuflag(sse4)
; just use shufps on anything post conroe
ABSW m0, m0, m1 ; 4x1 sum
%endmacro
-%macro INTRA_SATDS_MMX 0
+%macro INTRA_X3_MMX 0
;-----------------------------------------------------------------------------
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
SCALAR_HADAMARD left, 0, m4, m5
SCALAR_HADAMARD top, 0, m6, m5, m7
paddw m6, m4
- psrlw m6, 1
- paddw m6, [pw_8]
+ pavgw m6, [pw_16]
pand m6, [sw_f0] ; dc
SUM3x4
movd [r2+8], m2 ; i8x8c_v satd
ADD rsp, 72
RET
-%endmacro ; INTRA_SATDS_MMX
+%endmacro ; INTRA_X3_MMX
+
+
+
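+; PRED4x4_LOWPASS %1, %2, %3, %4, %5(tmp):
+; %1 = (%2 + 2*%4 + %3 + 2) >> 2, the H.264 3-tap edge lowpass. The pxor/pand/psubusb
+; strips pavgb's round-up bit so the average of %2 and %3 truncates, which makes the
+; final pavgb against the doubled tap %4 round exactly as the spec requires.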
+%macro PRED4x4_LOWPASS 5
+ mova %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ pand %3, [pb_1]
+ psubusb %2, %3
+ pavgb %1, %4, %2
+%endmacro
+
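+; in: r1 = fdec
+; out: m2/m3 = ddl+ddr prediction rows, m4/m5 = vr+vl, m6/m7 = hd+hu (packing differs
+;      between the intrax9a/sad and intrax9b/satd tables), %2 = the shuffled source
+;      edge, used later for the v/h/dc predictions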
+%macro INTRA_X9_PRED 2
+%if cpuflag(sse4)
+ movu m1, [r1-1*FDEC_STRIDE-8]
+ pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
+ pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
+ pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
+ pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
+%else
+ movd mm0, [r1+3*FDEC_STRIDE-4]
+ punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
+ movd mm1, [r1+1*FDEC_STRIDE-4]
+ punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
+ punpckhwd mm0, mm1
+ psrlq mm0, 32
+ movq2dq m0, mm0
+ movu m1, [r1-1*FDEC_STRIDE-8]
+ movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
+%endif ; cpuflag
+ pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
+ psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
+ psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
+ pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
+ mova %2, m1
+ PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
+ ; ddl ddr
+ ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
+ ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
+ ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
+ ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
+ pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
+ pshufb m3, m0, [%1_ddlr2] ; rows 2,3
+ ; hd hu
+ ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
+ ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
+ ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
+ ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
+ pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+ palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
+ pshufb m6, m7, [%1_hdu1]
+ pshufb m7, m7, [%1_hdu2]
+ ; vr vl
+ ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
+ ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
+ ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
+ ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+ psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
+ palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
+ pshufb m4, m5, [%1_vrl1]
+ pshufb m5, m5, [%1_vrl2]
+%endmacro ; INTRA_X9_PRED
+
+%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
+ pshufb m%1, [intrax9b_vh1] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
+ pmaddubsw m%1, [hmul_4p]
+ pshufhw m0, m%1, q2301
+ pshuflw m0, m0, q2301
+ psignw m%1, [pw_pmpmpmpm]
+ paddw m0, m%1
+ psllw m0, 2 ; hadamard(top), hadamard(left)
+ mova m1, m0
+ mova m2, m0
+ movhlps m3, m0
+ pshufb m1, [intrax9b_v1]
+ pshufb m2, [intrax9b_v2]
+ paddw m0, m3
+ psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
+ pavgw m0, [pw_16]
+ pand m0, [sw_f0] ; dc
+ ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
+ ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
+ ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
+ HADAMARD 0, sumsub, %2, %3, %4, %5
+ HADAMARD 1, sumsub, %2, %3, %4, %5
+ psubw m3, m%2
+ psubw m0, m%2
+ psubw m1, m%2
+ psubw m2, m%3
+ pabsw m%3, m%3
+ pabsw m3, m3
+ pabsw m0, m0
+ pabsw m1, m1
+ pabsw m2, m2
+ pavgw m3, m%3
+ pavgw m0, m%3
+ pavgw m1, m2
+%if cpuflag(sse4)
+ phaddw m3, m0
+%else
+ SBUTTERFLY qdq, 3, 0, 2
+ paddw m3, m0
+%endif
+ movhlps m2, m1
+ paddw m1, m2
+ phaddw m1, m3
+ pmaddwd m1, [pw_1] ; v, _, h, dc
+%endmacro ; INTRA_X9_VHDC
+
+%macro INTRA_X9_END 1
+%if cpuflag(sse4)
+ phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
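+ ; min cost ends up in bits 0-15, its 3-bit lane index in bits 16-18; the 1<<16
+ ; below turns that index into the final mode number (V, mode 0, arrives via r1)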
+ movd eax, m0
+ add eax, 1<<16
+ cmp ax, r1w
+ cmovge eax, r1d
+%else
+%if %1
+ ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
+ psllw m0, 3
+ paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
+%else
+ ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
+ psllw m0, 2
+ paddusw m0, m0
+ paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
+%endif
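+ ; each word is now 0x8000 + cost*8 + mode: the 0x8000 bias makes pminsw behave as
+ ; an unsigned minimum, and is undone below by the 1<<12 added back after the sar 3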
+ movhlps m1, m0
+ pminsw m0, m1
+ pshuflw m1, m0, q0032
+ pminsw m0, m1
+ pshuflw m1, m0, q0001
+ pminsw m0, m1
+ movd eax, m0
+ movsx r2d, ax
+ and eax, 7
+ sar r2d, 3
+ shl eax, 16
+ ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
+ ; 1<<12: undo sign manipulation
+ lea eax, [rax+r2+(1<<16)+(1<<12)]
+ cmp ax, r1w
+ cmovge eax, r1d
+%endif ; cpuflag
+%endmacro ; INTRA_X9_END
+
+%macro INTRA_X9 0
+;-----------------------------------------------------------------------------
+; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
+;-----------------------------------------------------------------------------
+cglobal intra_sad_x9_4x4, 3,3,9
+%ifdef ARCH_X86_64
+ INTRA_X9_PRED intrax9a, m8
+%else
+ sub rsp, 0x1c
+ INTRA_X9_PRED intrax9a, [rsp]
+%endif
+%if cpuflag(sse4)
+ movd m0, [r0+0*FENC_STRIDE]
+ pinsrd m0, [r0+1*FENC_STRIDE], 1
+ movd m1, [r0+2*FENC_STRIDE]
+ pinsrd m1, [r0+3*FENC_STRIDE], 1
+%else
+ movd mm0, [r0+0*FENC_STRIDE]
+ punpckldq mm0, [r0+1*FENC_STRIDE]
+ movd mm1, [r0+2*FENC_STRIDE]
+ punpckldq mm1, [r0+3*FENC_STRIDE]
+ movq2dq m0, mm0
+ movq2dq m1, mm1
+%endif
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ psadbw m2, m0
+ psadbw m3, m1
+ psadbw m4, m0
+ psadbw m5, m1
+ psadbw m6, m0
+ psadbw m7, m1
+ paddd m2, m3
+ paddd m4, m5
+ paddd m6, m7
+%ifdef ARCH_X86_64
+ SWAP 7, 8
+ pxor m8, m8
+ %define %%zero m8
+%else
+ mova m7, [rsp]
+ %define %%zero [pb_0]
+%endif
+ mova m3, m7
+ mova m5, m7
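+ ; dc = (l0+l1+l2+l3 + t0+t1+t2+t3 + 4) >> 3: psadbw against zero sums the 8 edge
+ ; pixels, psrlw 2 + pavgw 0 perform the rounded shift, pshufb 0 broadcasts the byte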
+ pshufb m7, [intrax9a_dc]
+ pshufb m3, [intrax9a_vh1]
+ psadbw m7, %%zero
+ pshufb m5, [intrax9a_vh2]
+ psrlw m7, 2
+ psadbw m3, m0
+ pavgw m7, %%zero
+ pshufb m7, %%zero
+ psadbw m5, m1
+ psadbw m0, m7
+ paddd m3, m5
+ psadbw m1, m7
+ paddd m0, m1
+ movzx r1d, word [r2]
+ movd r0d, m3 ; v
+ add r1d, r0d
+ punpckhqdq m3, m0 ; h, dc
+ shufps m3, m2, q2020
+ psllq m6, 32
+ por m4, m6
+ movu m0, [r2+2]
+ packssdw m3, m4
+ paddw m0, m3
+ INTRA_X9_END 1
+%ifndef ARCH_X86_64
+ add rsp, 0x1c
+%endif
+ RET
+
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
+;-----------------------------------------------------------------------------
+cglobal intra_satd_x9_4x4, 3,3,16
+ INTRA_X9_PRED intrax9b, m15
+ movd m8, [r0+0*FENC_STRIDE]
+ movd m9, [r0+1*FENC_STRIDE]
+ movd m10, [r0+2*FENC_STRIDE]
+ movd m11, [r0+3*FENC_STRIDE]
+ mova m12, [hmul_8p]
+ pshufd m8, m8, 0
+ pshufd m9, m9, 0
+ pshufd m10, m10, 0
+ pshufd m11, m11, 0
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ pmaddubsw m10, m12
+ pmaddubsw m11, m12
+ movddup m0, m2
+ pshufd m1, m2, q3232
+ movddup m2, m3
+ movhlps m3, m3
+ call .satd_8x4 ; ddr, ddl
+ movddup m2, m5
+ pshufd m3, m5, q3232
+ mova m5, m0
+ movddup m0, m4
+ pshufd m1, m4, q3232
+ call .satd_8x4 ; vr, vl
+ movddup m2, m7
+ pshufd m3, m7, q3232
+ mova m4, m0
+ movddup m0, m6
+ pshufd m1, m6, q3232
+ call .satd_8x4 ; hd, hu
+%if cpuflag(sse4)
+ punpckldq m4, m0
+%else
+ punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
+%endif
+ mova m1, [pw_ppmmppmm]
+ psignw m8, m1
+ psignw m10, m1
+ paddw m8, m9
+ paddw m10, m11
+ INTRA_X9_VHDC 15, 8, 10, 6, 7
+ ; find minimum
+ movu m0, [r2+2]
+ movd r1d, m1
+ palignr m5, m1, 8
+%if notcpuflag(sse4)
+ pshufhw m0, m0, q3120 ; compensate for different order in unpack
+%endif
+ packssdw m5, m4
+ paddw m0, m5
+ movzx r0d, word [r2]
+ add r1d, r0d
+ INTRA_X9_END 0
+ RET
+RESET_MM_PERMUTATION
+ALIGN 16
+.satd_8x4:
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmaddubsw m2, m12
+ pmaddubsw m3, m12
+ psubw m0, m8
+ psubw m1, m9
+ psubw m2, m10
+ psubw m3, m11
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap
+ pmaddwd m0, [pw_1]
+%if cpuflag(sse4)
+ pshufd m1, m0, q0032
+%else
+ movhlps m1, m0
+%endif
+ paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
+ ret
+
+%else ; !ARCH_X86_64
+cglobal intra_satd_x9_4x4, 3,3,8
+ sub rsp, 0x9c
+ INTRA_X9_PRED intrax9b, [rsp+0x80]
+ mova [rsp+0x40], m4
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ movd m4, [r0+0*FENC_STRIDE]
+ movd m5, [r0+1*FENC_STRIDE]
+ movd m6, [r0+2*FENC_STRIDE]
+ movd m0, [r0+3*FENC_STRIDE]
+ mova m7, [hmul_8p]
+ pshufd m4, m4, 0
+ pshufd m5, m5, 0
+ pshufd m6, m6, 0
+ pshufd m0, m0, 0
+ pmaddubsw m4, m7
+ pmaddubsw m5, m7
+ pmaddubsw m6, m7
+ pmaddubsw m0, m7
+ mova [rsp+0x00], m4
+ mova [rsp+0x10], m5
+ mova [rsp+0x20], m6
+ mova [rsp+0x30], m0
+ movddup m0, m2
+ pshufd m1, m2, q3232
+ movddup m2, m3
+ movhlps m3, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ call .satd_8x4b ; ddr, ddl
+ mova m3, [rsp+0x50]
+ mova m1, [rsp+0x40]
+ movddup m2, m3
+ movhlps m3, m3
+ movq [rsp+0x48], m0
+ movddup m0, m1
+ movhlps m1, m1
+ call .satd_8x4 ; vr, vl
+ mova m3, [rsp+0x70]
+ mova m1, [rsp+0x60]
+ movddup m2, m3
+ movhlps m3, m3
+ movq [rsp+0x50], m0
+ movddup m0, m1
+ movhlps m1, m1
+ call .satd_8x4 ; hd, hu
+ movq [rsp+0x58], m0
+ mova m1, [rsp+0x80]
+ mova m4, [rsp+0x00]
+ mova m5, [rsp+0x20]
+ mova m2, [pw_ppmmppmm]
+ psignw m4, m2
+ psignw m5, m2
+ paddw m4, [rsp+0x10]
+ paddw m5, [rsp+0x30]
+ INTRA_X9_VHDC 1, 4, 5, 6, 7
+ ; find minimum
+ movu m0, [r2+2]
+ movd r1d, m1
+ movhlps m1, m1
+ movhps m1, [rsp+0x48]
+%if cpuflag(sse4)
+ pshufd m2, [rsp+0x50], q3120
+ packssdw m1, m2
+%else
+ packssdw m1, [rsp+0x50]
+ pshufhw m0, m0, q3120
+%endif
+ paddw m0, m1
+ movzx r0d, word [r2]
+ add r1d, r0d
+ INTRA_X9_END 0
+ add rsp, 0x9c
+ RET
+RESET_MM_PERMUTATION
+ALIGN 16
+.satd_8x4:
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ psubw m0, [rsp+0x00+gprsize]
+ psubw m1, [rsp+0x10+gprsize]
+ psubw m2, [rsp+0x20+gprsize]
+.satd_8x4b:
+ psubw m3, [rsp+0x30+gprsize]
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
+ pmaddwd m0, [pw_1]
+%if cpuflag(sse4)
+ pshufd m1, m0, q0032
+%else
+ movhlps m1, m0
+%endif
+ paddd xmm0, m0, m1
+ ret
+%endif ; ARCH
+%endmacro ; INTRA_X9
+
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX mmx2
-INTRA_SATDS_MMX
+INTRA_X3_MMX
%endif
INIT_XMM sse2
HADAMARD_AC_SSE2
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
+%ifndef HIGH_BIT_DEPTH
+INTRA_X9
+%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INIT_MMX ssse3
-INTRA_SATDS_MMX
+INTRA_X3_MMX
%endif
%define TRANS TRANS_SSE4
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
+%ifndef HIGH_BIT_DEPTH
+INTRA_X9
+%endif
INIT_XMM avx
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
+INTRA_X9
%endif
HADAMARD_AC_SSE2
static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
+static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
int x264_analyse_init_costs( x264_t *h, int qp )
{
h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
}
}
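+ /* Per-qp bit costs of the 9 4x4 intra modes for intra_mbcmp_x9: 17 entries centered
+  * on index 8, the predicted mode, which is free; every other mode costs 3*lambda.
+  * Rows are 32 entries wide so each qp's row starts 64-byte aligned after the ALIGN below. */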
+ uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
+ for( int i = 0; i < 17; i++ )
+ cost_i4x4_mode[i] = 3*lambda*(i!=8);
return 0;
fail:
return -1;
{
int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
+ uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
h->mb.i_cbp_luma = 0;
if( a->b_early_terminate && a->i_mbrd )
/* emulate missing topright samples */
MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
- if( !h->mb.b_lossless && predict_mode[5] >= 0 )
+ if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
{
- int satd[9];
- h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
- int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
- satd[i_pred_mode] -= 3 * lambda;
- for( int i = 2; i >= 0; i-- )
- COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
-
- /* Take analysis shortcuts: don't analyse modes that are too
- * far away direction-wise from the favored mode. */
- if( a->i_mbrd < 1 + a->b_fast_intra )
- predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
- else
- predict_mode += 3;
+ /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
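+ /* The x9 functions return (best mode << 16) | best cost, with the mode's bit cost
+  * from cost_i4x4_mode already folded into the cost. */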
+ i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
+ a->i_predict4x4[idx] = i_best >> 16;
+ i_best &= 0xffff;
}
-
- if( i_best > 0 )
+ else
{
- for( ; *predict_mode >= 0; predict_mode++ )
+ if( !h->mb.b_lossless && predict_mode[5] >= 0 )
{
- int i_satd;
- int i_mode = *predict_mode;
-
- if( h->mb.b_lossless )
- x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
+ int satd[9];
+ h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
+ int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
+ satd[i_pred_mode] -= 3 * lambda;
+ i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
+ COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
+ COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
+
+ /* Take analysis shortcuts: don't analyse modes that are too
+ * far away direction-wise from the favored mode. */
+ if( a->i_mbrd < 1 + a->b_fast_intra )
+ predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
else
- h->predict_4x4[i_mode]( p_dst_by );
+ predict_mode += 3;
+ }
- i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
- if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+ if( i_best > 0 )
+ {
+ for( ; *predict_mode >= 0; predict_mode++ )
{
- i_satd -= lambda * 3;
- if( i_satd <= 0 )
+ int i_satd;
+ int i_mode = *predict_mode;
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst_by );
+
+ i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+ if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
{
- i_best = i_satd;
- a->i_predict4x4[idx] = i_mode;
- break;
+ i_satd -= lambda * 3;
+ if( i_satd <= 0 )
+ {
+ i_best = i_satd;
+ a->i_predict4x4[idx] = i_mode;
+ break;
+ }
}
- }
- COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
+ COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
+ }
}
+ i_best += 3 * lambda;
}
- i_cost += i_best + 3 * lambda;
+ i_cost += i_best;
if( i_cost > i_satd_thresh || idx == 15 )
break;
x264_pixel_function_t pixel_c;
x264_pixel_function_t pixel_ref;
x264_pixel_function_t pixel_asm;
- x264_predict8x8_t predict_8x8[9+3];
+ x264_predict_t predict_4x4[12];
+ x264_predict8x8_t predict_8x8[12];
x264_predict_8x8_filter_t predict_8x8_filter;
ALIGNED_16( pixel edge[36] );
uint16_t cost_mv[32];
x264_pixel_init( 0, &pixel_c );
x264_pixel_init( cpu_ref, &pixel_ref );
x264_pixel_init( cpu_new, &pixel_asm );
+ x264_predict_4x4_init( 0, predict_4x4 );
x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
}
report( "pixel vsad :" );
-#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
+#define TEST_INTRA_X3( name, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
} \
}
+#define TEST_INTRA_X9( name, cmp ) \
+ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
+ { \
+ set_func_name( #name ); \
+ used_asm = 1; \
+ ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
+ for( int i=0; i<17; i++ ) \
+ bitcosts[i] = 9*(i!=8); \
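+ /* passing bitcosts+8-pred_mode makes entry [mode] the bit cost of coding 'mode'; only the predicted mode is free */ \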
+ for( int i=0; i<32; i++ ) \
+ { \
+ pixel *fenc = pbuf1+48+i*12; \
+ pixel *fdec = pbuf3+48+i*12; \
+ int pred_mode = i%9; \
+ int res_c = INT_MAX; \
+ for( int j=0; j<9; j++ ) \
+ { \
+ predict_4x4[j]( fdec ); \
+ int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+ if( cost < (uint16_t)res_c ) \
+ res_c = cost + (j<<16); \
+ } \
+ int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \
+ if( res_c != res_a ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
+ break; \
+ } \
+ } \
+ }
+
+ memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
ok = 1; used_asm = 0;
- TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
- TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 );
- TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 );
- TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
+ TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
+ TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
+ TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
+ TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
report( "intra satd_x3 :" );
- TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
- TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
- TEST_INTRA_MBCMP( intra_sad_x3_8x8 , predict_8x8 , sad [PIXEL_8x8] , 1, edge );
- TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 );
+ ok = 1; used_asm = 0;
+ TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
+ TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
+ TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
+ TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
report( "intra sad_x3 :" );
+ ok = 1; used_asm = 0;
+ TEST_INTRA_X9( intra_satd_x9_4x4, satd );
+ report( "intra satd_x9 :" );
+ ok = 1; used_asm = 0;
+ TEST_INTRA_X9( intra_sad_x9_4x4, sad );
+ report( "intra sad_x9 :" );
ok = 1; used_asm = 0;
if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )