Add AMD XOP and FMA4 support: ~10% faster Hadamard functions (SATD/SA8D/hadamard_ac) plus other improvements.
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
+ {"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
+ {"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
+
+ if( cpu & X264_CPU_AVX )
+ {
+ if( ecx&0x00000800 ) /* XOP */
+ cpu |= X264_CPU_XOP;
+ if( ecx&0x00010000 ) /* FMA4 */
+ cpu |= X264_CPU_FMA4;
+ }
}
}
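For context: XOP and FMA4 are reported in ECX of CPUID leaf 0x80000001 (bits 11 and 16), which is why the check above is gated on AVX support (both extensions use AVX encodings). A minimal standalone probe along the same lines, sketched with GCC's <cpuid.h> and not part of this patch:

#include <cpuid.h>

/* Returns bit 0 set for XOP, bit 1 set for FMA4 (illustration only;
 * x264 reads the same ECX bits through its own cpuid wrapper above). */
static int amd_xop_fma4( void )
{
    unsigned int eax, ebx, ecx, edx;
    if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
        return 0;
    return !!(ecx & (1<<11)) | (!!(ecx & (1<<16)) << 1);
}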
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
+ if( cpu&X264_CPU_XOP )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
+SATD_X_DECL7( _xop )
#endif // !HIGH_BIT_DEPTH
#endif
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
}
INIT5( ssd, _avx );
-#if ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
}
+
+ if( cpu&X264_CPU_XOP )
+ {
+ INIT7( satd, _xop );
+ INIT7( satd_x3, _xop );
+ INIT7( satd_x4, _xop );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _xop );
+ pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
+ }
+ INIT5( ssd, _xop );
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
+#if ARCH_X86_64
+ pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_xop;
+#endif
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
+ }
#endif //HAVE_MMX
#if HAVE_ARMV6
{
pf->denoise_dct = x264_denoise_dct_avx;
}
+ if( cpu&X264_CPU_XOP )
+ {
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
+ if( h->param.i_cqm_preset != X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_xop;
+ pf->dequant_8x8 = x264_dequant_8x8_xop;
+ }
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
+
+ if( cpu&X264_CPU_XOP )
+ {
+ if( h->param.i_cqm_preset != X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_xop;
+ pf->dequant_8x8 = x264_dequant_8x8_xop;
+ }
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
%include "x86inc.asm"
%include "x86util.asm"
-%macro SHUFFLE_16BIT 8
- %rep 8
- db %1*2
- db %1*2+1
- %rotate 1
- %endrep
-%endmacro
-
SECTION_RODATA
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
-pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
-pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
+pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
+pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
+pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
SCAN_4x4_FRAME
INIT_XMM avx
SCAN_4x4_FRAME
+
+INIT_XMM xop
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [r1+ 0]
+ mova m1, [r1+16]
+ vpperm m2, m0, m1, [pb_scan4frame2a]
+ vpperm m1, m0, m1, [pb_scan4frame2b]
+ mova [r0+ 0], m2
+ mova [r0+16], m1
+ RET
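For reference, the two vpperm masks perform the whole frame scan with one two-source permute per half-register. A plain-C cross-check of the word order they encode (a sketch, not part of this patch):

#include <stdint.h>

static const uint8_t scan4_frame[16] =
    { 0,4,1,2,5,8,12,9, 6,3,7,10,13,14,11,15 };

/* Same permutation as the two vpperm stores above (pb_scan4frame2a
 * selects level[0..7], pb_scan4frame2b selects level[8..15]). */
static void zigzag_scan_4x4_frame_ref( int16_t level[16], const int16_t dct[16] )
{
    for( int i = 0; i < 16; i++ )
        level[i] = dct[scan4_frame[i]];
}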
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal mbtree_propagate_cost_sse2, 7,7,7
+%macro MBTREE 0
+cglobal mbtree_propagate_cost, 7,7,7
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
pand xmm3, xmm5
punpcklwd xmm1, xmm4
punpcklwd xmm3, xmm4
+%if cpuflag(fma4)
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm1, xmm1
+ vfmaddps xmm0, xmm0, xmm6, xmm1
+ cvtdq2ps xmm1, xmm2
+ psubd xmm2, xmm3
+ cvtdq2ps xmm2, xmm2
+ rcpps xmm3, xmm1
+ mulps xmm1, xmm3
+ mulps xmm0, xmm2
+ addps xmm2, xmm3, xmm3
+ vfnmaddps xmm3, xmm1, xmm3, xmm2
+ mulps xmm0, xmm3
+%else
cvtdq2ps xmm0, xmm0
mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
cvtdq2ps xmm1, xmm1 ; prop
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
+%endif
cvtps2dq xmm0, xmm0
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
REP_RET
+%endmacro
+
+INIT_XMM sse2
+MBTREE
+; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
+INIT_XMM fma4
+MBTREE
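The rcpps/vfnmaddps pair in the fma4 branch performs one Newton-Raphson refinement of the hardware reciprocal estimate, fusing the multiply/subtract of the SSE2 path into a single op. In scalar terms (a sketch, not part of this patch):

/* x0 = rcpps(d) is a ~12-bit estimate of 1/d; one Newton-Raphson step
 * roughly doubles that precision. Above it is computed as
 * vfnmaddps: x1 = -(d*x0)*x0 + (x0+x0) = x0*(2 - d*x0). */
static inline float rcp_refine( float d, float x0 )
{
    return 2.0f*x0 - (d*x0)*x0;
}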
%macro INT16_TO_FLOAT 1
vpunpckhwd xmm4, xmm%1, xmm7
%endmacro
; FIXME: align loads/stores to 16 bytes
-cglobal mbtree_propagate_cost_avx, 7,7,8
+INIT_YMM avx
+cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
if( !(cpu&X264_CPU_AVX) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+
+ if( !(cpu&X264_CPU_FMA4) )
+ return;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
}
intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
+transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
+transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
+
sw_f0: dq 0xfff0, 0
sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
SSD 4, 4
SSD 4, 8
SSD 4, 16
+INIT_XMM xop
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal pixel_var_16x16_mmx2, 2,3
+INIT_MMX mmx2
+cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
-cglobal pixel_var_8x16_mmx2, 2,3
+cglobal pixel_var_8x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 8, 16
-cglobal pixel_var_8x8_mmx2, 2,3
+cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
VAR
INIT_XMM avx
VAR
+INIT_XMM xop
+VAR
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
VAR
INIT_XMM avx
VAR
+INIT_XMM xop
+VAR
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 0
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal pixel_var2_8x8_mmx2, 5,6
+INIT_MMX mmx2
+cglobal pixel_var2_8x8, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, 8
VAR2_END
RET
-INIT_XMM
-cglobal pixel_var2_8x8_sse2, 5,6,8
+INIT_XMM sse2
+cglobal pixel_var2_8x8, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
RET
%ifndef HIGH_BIT_DEPTH
-cglobal pixel_var2_8x8_ssse3, 5,6,8
+%macro VAR2_8x8 0
+cglobal pixel_var2_8x8, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
jg .loop
VAR2_END
RET
+%endmacro
+
+INIT_XMM ssse3
+VAR2_8x8
+INIT_XMM xop
+VAR2_8x8
+
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
paddusw m2, m0
; 3x HADDW
+%if cpuflag(xop)
+ phaddw m2, m14
+ vphadduwq m0, m15
+ movhlps m1, m0
+ vphadduwq m2, m2 ; i8x8_v, i8x8_h
+ paddd m0, m1 ; i8x8_dc
+ packusdw m2, m0 ; i8x8_v, i8x8_h, i8x8_dc
+ pxor m3, m3
+ psrlw m2, 1
+ pavgw m2, m3
+ movq [r2], m2 ; i8x8_v, i8x8_h
+ psrldq m2, 8
+ movd [r2+8], m2 ; i8x8_dc
+%else
movdqa m7, [pw_1]
pmaddwd m2, m7
pmaddwd m14, m7
movq [r2], m3 ; i8x8_v, i8x8_h
psrldq m3, 8
movd [r2+8], m3 ; i8x8_dc
+%endif
RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
psignw m%1, [pw_pmpmpmpm]
paddw m0, m%1
psllw m0, 2 ; hadamard(top), hadamard(left)
- mova m1, m0
- mova m2, m0
movhlps m3, m0
- pshufb m1, [intrax9b_v1]
- pshufb m2, [intrax9b_v2]
+ pshufb m1, m0, [intrax9b_v1]
+ pshufb m2, m0, [intrax9b_v2]
paddw m0, m3
psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
pavgw m0, [pw_16]
%endif
movhlps m2, m1
paddw m1, m2
+%if cpuflag(xop)
+ vphaddwq m3, m3
+ vphaddwq m1, m1
+ packssdw m1, m3
+%else
phaddw m1, m3
pmaddwd m1, [pw_1] ; v, _, h, dc
+%endif
%endmacro ; INTRA_X9_VHDC
%macro INTRA_X9_END 1
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
+%if notcpuflag(xop)
cglobal intra_sad_x9_4x4, 3,3,9
%ifdef ARCH_X86_64
INTRA_X9_PRED intrax9a, m8
mova m7, [rsp]
%define %%zero [pb_0]
%endif
- mova m3, m7
- mova m5, m7
+ pshufb m3, m7, [intrax9a_vh1]
+ pshufb m5, m7, [intrax9a_vh2]
pshufb m7, [intrax9a_dc]
- pshufb m3, [intrax9a_vh1]
psadbw m7, %%zero
- pshufb m5, [intrax9a_vh2]
psrlw m7, 2
psadbw m3, m0
pavgw m7, %%zero
add rsp, 0x1c
%endif
RET
+%endif
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
%endif
HADAMARD_AC_SSE2
+%define TRANS TRANS_XOP
+INIT_XMM xop
+SATDS_SSE2
+SA8D
+%ifndef HIGH_BIT_DEPTH
+INTRA_SA8D_SSE2
+INTRA_X9
+%endif
+HADAMARD_AC_SSE2
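; For reference (not part of the patch): the %define TRANS TRANS_XOP above
; routes the in-register transposes inside SATDS_SSE2/HADAMARD_AC_SSE2
; through the vpperm-based TRANS_XOP added in x86util.asm.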
+
;=============================================================================
; SSIM
;=============================================================================
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
+DECL_X1( ssd, xop )
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
+DECL_X1( satd, xop )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
+DECL_X1( sa8d, xop )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_xop ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
+int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
+INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_dc_mmx2, 1,1
+cglobal predict_4x4_dc, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
RET
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_dc_mmx2, 1,4
+cglobal predict_4x4_dc, 1,4
pxor mm7, mm7
movd mm0, [r0-FDEC_STRIDEB]
psadbw mm0, mm7
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-cglobal predict_8x8_dc_sse2, 2,2
+INIT_XMM sse2
+cglobal predict_8x8_dc, 2,2
movu m0, [r1+14]
paddw m0, [r1+32]
HADDW m0, m1
REP_RET
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_8x8_dc_mmx2, 2,2
+INIT_MMX mmx2
+cglobal predict_8x8_dc, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
STORE8x8 m0, m0
RET
%endmacro
-INIT_XMM
-PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova
-PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu
+INIT_XMM sse2
+PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
+PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
REP_RET
%endif ; !ARCH_X86_64
-INIT_XMM
+%macro PREDICT_8x8C 0
%ifdef HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core_sse2, 1,1,7
+cglobal predict_8x8c_p_core, 1,1,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
jg .loop
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core_sse2, 1,1
+cglobal predict_8x8c_p_core, 1,1
movd m0, r1m
movd m2, r2m
movd m4, r3m
movhps [r0+FDEC_STRIDE*3], m5
RET
%endif ; HIGH_BIT_DEPTH
+%endmacro
+
+INIT_XMM sse2
+PREDICT_8x8C
+INIT_XMM avx
+PREDICT_8x8C
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
-cglobal predict_16x16_p_core_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_p_core, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
%endif
%endmacro
-INIT_MMX
-cglobal predict_16x16_dc_core_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_dc_core, 1,2
%ifdef ARCH_X86_64
movd m6, r1d
PRED16x16_DC m6, 5
%endif
REP_RET
-INIT_MMX
-cglobal predict_16x16_dc_top_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
REP_RET
-INIT_MMX
+INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_mmx2, 1,2
+cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_mmx2, 1,1
+cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
%endif
%endmacro
-INIT_XMM
-cglobal predict_16x16_dc_core_sse2, 2,2,4
+INIT_XMM sse2
+cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
REP_RET
-cglobal predict_16x16_dc_top_sse2, 1,2
+cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
REP_RET
-INIT_XMM
+INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_sse2, 1,2
+cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_sse2, 1,1
+cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
SPLATW m0, m0
packuswb m0, m0
#endif //!HIGH_BIT_DEPTH
#if HAVE_X86_INLINE_ASM
+
+#define PREDICT_8x8C_P_CORE\
+ V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
+ + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
+ + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
+ + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
+ H += -4 * src[-1*FDEC_STRIDE -1];\
+ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
+ int b = ( 17 * H + 16 ) >> 5;\
+ int c = ( 17 * V + 16 ) >> 5;
+
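/* For reference (not part of this patch): with these coefficients the
 * H.264 chroma plane predictor evaluates, per pixel,
 *     pred[y][x] = clip( ( a + b*(x-3) + c*(y-3) + 16 ) >> 5 )
 * and the i00 computed in the 8-bit path below is that expression's
 * unshifted value at x = y = 0, i.e. a - 3*b - 3*c + 16. */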
#if HIGH_BIT_DEPTH
-static void x264_predict_8x8c_p_sse2( uint16_t *src )
-#else
-static void x264_predict_8x8c_p_ssse3( uint8_t *src )
-#endif
-{
- int a, b, c, i00;
- int H, V;
-#if HIGH_BIT_DEPTH
- asm (
- "movdqa %1, %%xmm0 \n"
- "pmaddwd %2, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "pshuflw $14, %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "movd %%xmm0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)
- );
-#else
- asm (
- "movq %1, %%mm0 \n"
- "pmaddubsw %2, %%mm0 \n"
- "pshufw $14, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "pshufw $1, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "movd %%mm0, %0 \n"
- "movswl %w0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
- );
-#endif
- V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
- + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
- + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
- + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
- H += -4 * src[-1*FDEC_STRIDE -1];
- a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
- b = ( 17 * H + 16 ) >> 5;
- c = ( 17 * V + 16 ) >> 5;
- i00 = a -3*b -3*c + 16;
-#if HIGH_BIT_DEPTH
- x264_predict_8x8c_p_core_sse2( src, a, b, c );
-#else
- x264_predict_8x8c_p_core_sse2( src, i00, b, c );
-#endif
+#define PREDICT_8x8_P2(cpu1, cpu2)\
+static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
+{\
+ int H, V;\
+ asm (\
+ "movdqa %1, %%xmm0 \n"\
+ "pmaddwd %2, %%xmm0 \n"\
+ "movhlps %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "pshuflw $14, %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\
+ );\
+ PREDICT_8x8C_P_CORE\
+ x264_predict_8x8c_p_core_ ## cpu2( src, a, b, c );\
}
+
+PREDICT_8x8_P2(sse2, sse2)
+PREDICT_8x8_P2( avx, avx)
+
+#else //!HIGH_BIT_DEPTH
+#define PREDICT_8x8_P2(cpu1, cpu2)\
+static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
+{\
+ int H, V;\
+ asm (\
+ "movq %1, %%mm0 \n"\
+ "pmaddubsw %2, %%mm0 \n"\
+ "pshufw $14, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "pshufw $1, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "movd %%mm0, %0 \n"\
+ "movswl %w0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\
+ );\
+ PREDICT_8x8C_P_CORE\
+ int i00 = a -3*b -3*c + 16;\
+ x264_predict_8x8c_p_core_ ## cpu2( src, i00, b, c );\
+}
+
+PREDICT_8x8_P2(ssse3, sse2)
+PREDICT_8x8_P2( avx, avx)
+#endif
#endif
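In both variants the inline asm only computes the H gradient (pmaddwd/pmaddubsw against pw/pb_m32101234); the rest is shared through PREDICT_8x8C_P_CORE. A scalar cross-check for the 8-bit case (a sketch, not part of this patch; it folds in the -4*top-left term that the macro adds separately):

/* Weighted top-row differences of the 8x8c plane predictor; src points
 * at the block origin, FDEC_STRIDE is x264's decode-buffer stride. */
static int pred8x8c_h_ref( const uint8_t *src )
{
    int H = 0;
    for( int i = 0; i < 4; i++ )
        H += (i+1) * ( src[4+i - FDEC_STRIDE] - src[2-i - FDEC_STRIDE] );
    return H;
}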
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
#endif
#else
#if ARCH_X86_64
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
#endif
#endif // HIGH_BIT_DEPTH
}
void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
;;; m4 0
mova m0, %1
%ifdef HIGH_BIT_DEPTH
- pmaddwd m0, %2
- paddd m0, m3
+ pmadcswd m0, m0, %2, m3
psrad m0, m2
%else
punpckhwd m1, m0, m4
punpcklwd m0, m4
- pmaddwd m0, %2
- pmaddwd m1, %3
- paddd m0, m3
- paddd m1, m3
+ pmadcswd m0, m0, %2, m3
+ pmadcswd m1, m1, %3, m3
psrad m0, m2
psrad m1, m2
packssdw m0, m1
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
+INIT_XMM xop
+DEQUANT 4, 4, 1
+DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
INIT_MMX mmx
INIT_XMM avx
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
+INIT_XMM xop
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
%endif
%macro DEQUANT_DC 2
pshufd m2, m2, 0
%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
- pmaddwd m0, m2
- paddd m0, m4
+ pmadcswd m0, m0, m2, m4
psrad m0, m3
mova [r0+x], m0
%assign x x+mmsize
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
+INIT_XMM xop
+DEQUANT_DC d, pmaddwd
%else
%ifndef ARCH_X86_64
INIT_MMX mmx2
void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
%assign cpuflags_sse4 (1<<7) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<8) | cpuflags_sse4
%assign cpuflags_avx (1<<9) | cpuflags_sse42
+%assign cpuflags_xop (1<<10) | cpuflags_avx
+%assign cpuflags_fma4 (1<<11) | cpuflags_avx
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0
-
; base-4 constants for shuffles
%assign i 0
%rep 256
%endrep
%undef i
%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
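; For reference (not part of the patch): with the instantiations above,
;     pmacsdd m0, m1, m2, m3
; assembles to a single vpmacsdd on XOP, and otherwise expands to
;     pmulld  m0, m1, m2   ; 3-operand form via x86inc's AVX emulation
;     paddd   m0, m3
; pmacsww and pmadcswd follow the same pattern with their mul/add pairs.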
%endmacro
%macro HADDW 2
+%if cpuflag(xop) && mmsize == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
pmaddwd %1, [pw_1]
HADDD %1, %2
+%endif
%endmacro
%macro HADDUW 2
+%if cpuflag(xop) && mmsize == 16
+ vphadduwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
HADDD %1, %2
+%endif
%endmacro
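; For reference (not part of the patch): XOP's vphaddwq/vphadduwq sum each
; run of four (un)signed words into a quadword, so a single instruction plus
; movhlps/paddd replaces the pmaddwd-by-pw_1 (HADDW) or the shift/mask fixup
; (HADDUW) followed by the HADDD reduction.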
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
%endif
%endmacro
+%macro TRANS_XOP 5-6
+%ifidn %1, d
+ vpperm m%5, m%3, m%4, [transd_shuf1]
+ vpperm m%3, m%3, m%4, [transd_shuf2]
+%elifidn %1, q
+ shufps m%5, m%3, m%4, q3131
+ shufps m%3, m%4, q2020
+%endif
+ SWAP %4, %5
+%endmacro
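; For reference (not part of the patch): vpperm is a full two-source byte
; permute, so each output register is built in one instruction here:
; transd_shuf1 gathers the even-numbered words of the two sources,
; transd_shuf2 the odd-numbered ones (masks declared in pixel-a.asm above).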
+
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
packuswb %1, %1
movh %4, %1
%endmacro
+
+%macro SHUFFLE_MASK_W 8
+ %rep 8
+ db %1*2
+ db %1*2+1
+ %rotate 1
+ %endrep
+%endmacro
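; For reference (not part of the patch): each argument names one 16-bit word
; and is emitted as its two byte indices, e.g.
;     SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
; emits db 0,1, 16,17, 4,5, 20,21, 8,9, 24,25, 12,13, 28,29, i.e. a shuffle
; control for pshufb/vpperm (for vpperm, indices 16-31 address the second
; source register).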
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vpaddw xmm0, xmm0, xmm0" ; then
+ if ! as_check "vfmaddps xmm0, xmm0, xmm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-0.7.0"
+ echo "Minimum version is yasm-1.0.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_FMA4 ? "fma4" :
+ b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
x264_emms();
- for( int j = 0; j < 100; j++ )
+ for( int j = 0; j < 100 && ok; j++ )
+ {
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
+ if( !ok )
+ fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+ }
}
report( "mbtree propagate :" );
}
}
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
+ if( x264_cpu_detect() & X264_CPU_XOP )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
+ if( x264_cpu_detect() & X264_CPU_FMA4 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
****************************************************************************/
/* CPU flags
*/
-#define X264_CPU_CACHELINE_32 0x000001 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_ALTIVEC 0x000004
-#define X264_CPU_MMX 0x000008
-#define X264_CPU_MMX2 0x000010 /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT X264_CPU_MMX2
-#define X264_CPU_SSE 0x000020
-#define X264_CPU_SSE2 0x000040
-#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SSE3 0x000200
-#define X264_CPU_SSSE3 0x000400
-#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
-#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
-#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_ARMV6 0x020000
-#define X264_CPU_NEON 0x040000 /* ARM NEON */
-#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
-#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
-#define X264_CPU_AVX 0x400000 /* AVX support: requires OS support even if YMM registers
- * aren't used. */
+#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC 0x0000004
+#define X264_CPU_MMX 0x0000008
+#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT X264_CPU_MMX2
+#define X264_CPU_SSE 0x0000020
+#define X264_CPU_SSE2 0x0000040
+#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3 0x0000200
+#define X264_CPU_SSSE3 0x0000400
+#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
+#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */
+#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x0020000
+#define X264_CPU_NEON 0x0040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */
+#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers
+ * aren't used. */
+#define X264_CPU_XOP 0x0800000 /* AMD XOP */
+#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
/* Analyse flags
*/