b_error = 1;
}
free( buf );
- if( p->cpu & X264_CPU_SSSE3 )
+ if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
p->cpu |= X264_CPU_SSE2_IS_FAST;
- if( p->cpu & X264_CPU_SSE4 )
- p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
}
}
OPT("threads")
const x264_cpu_name_t x264_cpu_names[] =
{
- {"Altivec", X264_CPU_ALTIVEC},
-// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
- {"MMX2", X264_CPU_MMX|X264_CPU_MMX2},
- {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2},
- {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE},
-#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+#if HAVE_MMX
+// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
+// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+ {"MMX2", MMX2},
+ {"MMXEXT", MMX2},
+ {"SSE", MMX2|X264_CPU_SSE},
+#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
- {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"FMA3", AVX|X264_CPU_FMA3},
#undef AVX
#undef SSE2
+#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
- {"TBM", X264_CPU_TBM},
- {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
- {"ARMv6", X264_CPU_ARMV6},
- {"NEON", X264_CPU_NEON},
- {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
+ {"SlowPshufb", X264_CPU_SLOW_PSHUFB},
+ {"SlowPalignr", X264_CPU_SLOW_PALIGNR},
+ {"SlowShuffle", X264_CPU_SLOW_SHUFFLE},
+ {"UnalignedStack", X264_CPU_STACK_MOD4},
+#elif ARCH_PPC
+ {"Altivec", X264_CPU_ALTIVEC},
+#elif ARCH_ARM
+ {"ARMv6", X264_CPU_ARMV6},
+ {"NEON", X264_CPU_NEON},
+ {"FastNeonMRC", X264_CPU_FAST_NEON_MRC},
+#endif
{"", 0},
};
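The table above maps flag bits to printable names and is now split per architecture; encoder.c (further down in this patch) walks it to build the "using cpu capabilities:" log line. A minimal, self-contained sketch of that kind of walk follows; the buffer size, helper name and flag values are made up for illustration, and the real loop additionally suppresses subset names such as SSE once SSE2 is present:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { const char *name; uint32_t flags; } cpu_name_t;

    /* print every name whose bits are all present in the mask */
    static void print_cpu_caps( uint32_t cpu, const cpu_name_t *names )
    {
        char buf[512];
        char *p = buf + sprintf( buf, "using cpu capabilities:" );
        for( int i = 0; names[i].flags; i++ )
            if( (cpu & names[i].flags) == names[i].flags )
                p += sprintf( p, " %s", names[i].name );
        if( !cpu )
            sprintf( p, " none!" );
        puts( buf );
    }

    int main( void )
    {
        /* hypothetical flag values, chosen so each entry implies the previous one */
        static const cpu_name_t names[] = { {"MMX2", 0x3}, {"SSE", 0x7}, {"SSE2", 0xf}, {"", 0} };
        print_cpu_caps( 0xf, names );   /* prints: using cpu capabilities: MMX2 SSE SSE2 */
        return 0;
    }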
if( edx&0x00800000 )
cpu |= X264_CPU_MMX;
else
- return 0;
+ return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
+ if( edx&0x00008000 )
+ cpu |= X264_CPU_CMOV;
+ else
+ return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
- if( cpu & X264_CPU_SSE4 )
- cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
- if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
+ if( max_extended_cap >= 0x80000001 )
{
- cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
- if( edx&0x00400000 )
- cpu |= X264_CPU_MMX2;
- if( cpu & X264_CPU_SSE2 )
+
+ if( ecx&0x00000020 )
+ cpu |= X264_CPU_LZCNT; /* LZCNT/ABM: AMD since Phenom, Intel starting with Haswell */
+ if( ecx&0x00000040 ) /* SSE4a, AMD only */
{
- if( ecx&0x00000040 ) /* SSE4a */
+ int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+ cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
+ if( family == 0x14 )
{
- cpu |= X264_CPU_SSE2_IS_FAST;
- cpu |= X264_CPU_LZCNT;
- cpu |= X264_CPU_SHUFFLE_IS_FAST;
- cpu &= ~X264_CPU_SLOW_CTZ;
+ cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
+ cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */
+ cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */
}
- else
- cpu |= X264_CPU_SSE2_IS_SLOW;
+ }
- if( ecx&0x00000080 ) /* Misalign SSE */
- {
- cpu |= X264_CPU_SSE_MISALIGN;
- x264_cpu_mask_misalign_sse();
- }
+ if( ecx&0x00000080 ) /* Misalign SSE */
+ {
+ cpu |= X264_CPU_SSE_MISALIGN;
+ x264_cpu_mask_misalign_sse();
+ }
- if( cpu & X264_CPU_AVX )
- {
- if( ecx&0x00000800 ) /* XOP */
- cpu |= X264_CPU_XOP;
- if( ecx&0x00010000 ) /* FMA4 */
- cpu |= X264_CPU_FMA4;
- }
+ if( cpu & X264_CPU_AVX )
+ {
+ if( ecx&0x00000800 ) /* XOP */
+ cpu |= X264_CPU_XOP;
+ if( ecx&0x00010000 ) /* FMA4 */
+ cpu |= X264_CPU_FMA4;
+ }
- if( ecx&0x00200000 )
- cpu |= X264_CPU_TBM;
+ if( !strcmp((char*)vendor, "AuthenticAMD") )
+ {
+ if( edx&0x00400000 )
+ cpu |= X264_CPU_MMX2;
+ if( !(cpu&X264_CPU_LZCNT) )
+ cpu |= X264_CPU_SLOW_CTZ;
+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
+ cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
}
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
+ cpu |= X264_CPU_SLOW_PSHUFB;
}
- /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
- * detect them here. */
- else if( model >= 23 )
- cpu |= X264_CPU_SHUFFLE_IS_FAST;
+ /* Conroe has a slow shuffle unit. Check the model number to make sure not
+ * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
+ else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
+ cpu |= X264_CPU_SLOW_SHUFFLE;
}
}
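A hedged, self-contained illustration of the CPUID bits tested above, using GCC/Clang's <cpuid.h> rather than x264's own x264_cpu_cpuid wrapper: leaf 1 EDX bit 15 is CMOV, 23 MMX, 25 SSE and 26 SSE2; leaf 1 ECX bits 0/9/19/20 are SSE3/SSSE3/SSE4.1/SSE4.2; extended leaf 0x80000001 ECX bit 5 is LZCNT/ABM and bit 6 SSE4a, and its EAX carries the family fields that the Bobcat (family 0x14) check decodes.

    #include <cpuid.h>
    #include <stdio.h>

    int main( void )
    {
        unsigned eax, ebx, ecx, edx;
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 1;
        printf( "cmov:%d mmx:%d sse:%d sse2:%d sse3:%d ssse3:%d sse4.1:%d sse4.2:%d\n",
                !!(edx&(1u<<15)), !!(edx&(1u<<23)), !!(edx&(1u<<25)), !!(edx&(1u<<26)),
                !!(ecx&(1u<< 0)), !!(ecx&(1u<< 9)), !!(ecx&(1u<<19)), !!(ecx&(1u<<20)) );
        if( __get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
        {
            /* same family decode as the Bobcat check above: base family + extended family */
            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
            printf( "family:0x%x lzcnt:%d sse4a:%d\n", family, !!(ecx&(1u<<5)), !!(ecx&(1u<<6)) );
        }
        return 0;
    }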
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
- dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
- dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
- dctf->add8x8_idct = x264_add8x8_idct_sse2;
- dctf->add16x16_idct = x264_add16x16_idct_sse2;
- dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+ if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
+ {
+ dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
+ dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
+ dctf->add8x8_idct = x264_add8x8_idct_sse2;
+ dctf->add16x16_idct = x264_add16x16_idct_sse2;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
+ }
}
- if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
+ if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
- dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
- dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
- dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
- dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
- dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
- dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
- dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
+ dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
+ dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+ {
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+ }
+ }
}
if( cpu&X264_CPU_SSE4 )
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
if( cpu&X264_CPU_AVX )
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
if( cpu&X264_CPU_XOP )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
}
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
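The dct, zigzag, pixel and mc init hunks in this patch all follow the same dispatch pattern: start from the C fallback, overwrite table entries in increasing order of capability, and gate individual entries on the SLOW_* modifier flags so the best permitted version wins. A minimal, self-contained sketch of that pattern (flag values and function names are invented for the example, not x264's):

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical flag values for the sketch only */
    enum { CPU_SSE2 = 1, CPU_SSE2_IS_SLOW = 2, CPU_SSSE3 = 4, CPU_SLOW_ATOM = 8 };

    static void dct_c( void )     { puts( "c" ); }
    static void dct_sse2( void )  { puts( "sse2" ); }
    static void dct_ssse3( void ) { puts( "ssse3" ); }

    typedef struct { void (*sub8x8_dct)( void ); } dct_funcs_t;

    static void dct_init_sketch( uint32_t cpu, dct_funcs_t *dctf )
    {
        dctf->sub8x8_dct = dct_c;                             /* always-valid fallback */
        if( (cpu & CPU_SSE2) && !(cpu & CPU_SSE2_IS_SLOW) )
            dctf->sub8x8_dct = dct_sse2;                      /* skipped on slow-SSE2 cores */
        if( (cpu & CPU_SSSE3) && !(cpu & CPU_SLOW_ATOM) )
            dctf->sub8x8_dct = dct_ssse3;                     /* best permitted version wins */
    }

    int main( void )
    {
        dct_funcs_t f;
        dct_init_sketch( CPU_SSE2|CPU_SSSE3, &f );
        f.sub8x8_dct();   /* prints "ssse3" */
        return 0;
    }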
int i_stride, i_width, i_lines, luma_plane_count;
int i_padv = PADV << PARAM_INTERLACED;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
- int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
+#if ARCH_PPC
+ int disalign = 1<<9;
+#else
+ int disalign = 1<<10;
+#endif
CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
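The align/disalign values above feed the plane-buffer layout in frame.c. A small illustration of the alignment arithmetic only (ALIGN_UP and pick_align are illustrative names, not x264's helpers; the disalign staggering of the planes themselves is not shown):

    #include <stdio.h>

    /* round x up to a multiple of a; a must be a power of two */
    #define ALIGN_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

    static int pick_align( int cacheline64, int cacheline32 )
    {
        return cacheline64 ? 64 : cacheline32 ? 32 : 16;
    }

    int main( void )
    {
        printf( "%d %d\n", pick_align( 1, 0 ), ALIGN_UP( 100, 64 ) );   /* prints: 64 128 */
        return 0;
    }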
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
+SATD_X_DECL6( _ssse3_atom )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
INIT4( sad_x3, _cache32_mmx2 );
INIT4( sad_x4, _cache32_mmx2 );
}
- else if( cpu&X264_CPU_CACHELINE_64 )
+ else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
{
INIT5( sad, _cache64_mmx2 );
INIT4( sad_x3, _cache64_mmx2 );
INIT4( sad_x4, _cache64_mmx2 );
}
#else
- if( cpu&X264_CPU_CACHELINE_64 )
+ if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
{
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2;
#endif
}
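For context on the Cache32/Cache64 function variants selected here (the flag comments in x264.h below describe them as avoiding memory loads that span the border between two cachelines), a small sketch of when an unaligned 16-byte load crosses a cacheline boundary; the helper is illustrative, not part of x264:

    #include <stdint.h>
    #include <stdio.h>

    /* a 16-byte load starting at addr crosses a line of size linesize (a power
     * of two) whenever its offset within the line exceeds linesize-16 */
    static int load16_splits_cacheline( uintptr_t addr, unsigned linesize )
    {
        return (addr & (linesize - 1)) > linesize - 16;
    }

    int main( void )
    {
        printf( "%d %d\n", load16_splits_cacheline( 0x40, 64 ),     /* 0: fits in one 64-byte line */
                           load16_splits_cacheline( 0x7a, 64 ) );   /* 1: spans two lines */
        return 0;
    }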
INIT_ADS( _ssse3 );
- if( !(cpu&X264_CPU_SLOW_ATOM) )
+ if( cpu&X264_CPU_SLOW_ATOM )
+ {
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom;
+ INIT6( satd, _ssse3_atom );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom;
+ INIT6( satd_x3, _ssse3_atom );
+ INIT6( satd_x4, _ssse3_atom );
+ INIT4( hadamard_ac, _ssse3_atom );
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
+#endif
+ }
+ else
{
INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
INIT8( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
+#if ARCH_X86_64
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
+#endif
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
pixf->asd8 = x264_pixel_asd8_ssse3;
-#if ARCH_X86_64
- pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
-#endif
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+ if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
+MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
#endif // !HIGH_BIT_DEPTH
#define GET_REF(name)\
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
+GET_REF(cache64_ssse3_atom)
#endif // !HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
- if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
+ if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_AVX) )
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
- if( cpu&X264_CPU_SSE2_IS_SLOW )
- return;
-
- pf->weight = x264_mc_weight_wtab_sse2;
- if( !(cpu&X264_CPU_SLOW_ATOM) )
- {
- pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
- }
-
- pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
- pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
- pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
- pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
- pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
- pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
- pf->hpel_filter = x264_hpel_filter_sse2;
- if( cpu&X264_CPU_SSE_MISALIGN )
- pf->hpel_filter = x264_hpel_filter_sse2_misalign;
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- pf->mc_chroma = x264_mc_chroma_sse2;
-
- if( cpu&X264_CPU_SSE2_IS_FAST )
+ if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
- pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
- pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
- pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
- pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
- pf->mc_luma = mc_luma_sse2;
- pf->get_ref = get_ref_sse2;
- if( cpu&X264_CPU_CACHELINE_64 )
+ pf->weight = x264_mc_weight_wtab_sse2;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
{
- pf->mc_luma = mc_luma_cache64_sse2;
- pf->get_ref = get_ref_cache64_sse2;
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
}
+
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
+ pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE_MISALIGN )
+ pf->hpel_filter = x264_hpel_filter_sse2_misalign;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_sse2;
+
+ if( cpu&X264_CPU_SSE2_IS_FAST )
{
- pf->get_ref = get_ref_sse2_misalign;
- if( !(cpu&X264_CPU_STACK_MOD4) )
- pf->mc_chroma = x264_mc_chroma_sse2_misalign;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ if( cpu&X264_CPU_CACHELINE_64 )
+ {
+ pf->mc_luma = mc_luma_cache64_sse2;
+ pf->get_ref = get_ref_cache64_sse2;
+ }
+ if( cpu&X264_CPU_SSE_MISALIGN )
+ {
+ pf->get_ref = get_ref_sse2_misalign;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_sse2_misalign;
+ }
}
}
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
- pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
- pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+ {
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+ }
- pf->hpel_filter = x264_hpel_filter_ssse3;
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PALIGNR) )
+ {
+#if ARCH_X86_64
+ if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
+#endif
+ pf->hpel_filter = x264_hpel_filter_ssse3;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+ }
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
-
- /* ssse3 weight is slower on Nehalem, so only assign here. */
- pf->weight_cache = x264_weight_cache_ssse3;
- pf->weight = x264_mc_weight_wtab_ssse3;
+ if( cpu&X264_CPU_SLOW_ATOM )
+ {
+ pf->mc_luma = mc_luma_cache64_ssse3_atom;
+ pf->get_ref = get_ref_cache64_ssse3_atom;
+ }
}
- if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
+ pf->weight_cache = x264_weight_cache_ssse3;
+ pf->weight = x264_mc_weight_wtab_ssse3;
+
+ if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_SSE4) )
pf->integral_init8h = x264_integral_init8h_avx;
pf->hpel_filter = x264_hpel_filter_avx;
- /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
- pf->weight_cache = x264_weight_cache_ssse3;
- pf->weight = x264_mc_weight_wtab_ssse3;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
%if cpuflag(sse4)
; just use shufps on anything post conroe
shufps %1, %2, 0
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
punpcklqdq %1, %2
SWAP %%n, 4
%endmacro
+; in: %1 = 1 to take the vertical (HADAMARD4_2D) path, 0 for the horizontal one
%macro SATD_8x4_SSE 8-9
%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
pxor %2, %2
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
mova %2, [hmul_8p]
%endif
lea r4, [3*r1]
%endif
%endmacro
-%macro SATD_4x8_SSE 2
+%macro SATD_4x8_SSE 3
%if HIGH_BIT_DEPTH
movh m0, [r0+0*r1]
movh m4, [r2+0*r3]
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
mova m3, [hmul_4p]
DIFFOP 0, 4, 1, 5, 3
%else
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
-%if cpuflag(ssse3) && %1==1
+%if %1==0 && %2==1
mova m4, [hmul_4p]
DIFFOP 2, 6, 3, 5, 4
%else
DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
- SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2
+ SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
-%if cpuflag(ssse3)
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
+
+%if vertical==0 || HIGH_BIT_DEPTH
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
- mova m7, [hmul_4p]
+%if vertical==0
+ mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE 0, swap
- HADDW m7, m1
- movd eax, m7
+ SATD_4x8_SSE vertical, 0, swap
+ HADDW m7, m1
+ movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical==0
mova m7, [hmul_4p]
%endif
- SATD_4x8_SSE 0, swap
+ SATD_4x8_SSE vertical, 0, swap
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
- SATD_4x8_SSE 1, add
+ SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_8x8_internal
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
- SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
+ SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
- SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
+ SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
ret
%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
+ ; always use horizontal mode here
SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
ret
cglobal pixel_satd_16x8, 4,6,12
SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
mova m7, [pw_00ff]
%endif
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x16, 4,6,12
SATD_START_SSE2 m10, m7
-%if notcpuflag(ssse3)
+%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal
%endmacro
%macro SA8D 0
-%if HIGH_BIT_DEPTH
- %define vertical 1
-%else ; sse2 doesn't seem to like the horizontal way of doing things
- %define vertical (cpuflags == cpuflags_sse2)
-%endif
+; sse2 doesn't seem to like the horizontal way of doing things
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; SA8D_SATD
;=============================================================================
-; %1-%4: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
+; %1: vertical/horizontal mode
+; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
; m10: satd result
; m6, m11-15: tmp regs
-%macro SA8D_SATD_8x4 4
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
- LOAD_SUMSUB_8x4P_SSSE3 %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
- HADAMARD4_V %1, %2, %3, %4, 6
-
- pabsw m12, m%1 ; doing the abs first is a slight advantage
- pabsw m14, m%3
- pabsw m13, m%2
- pabsw m15, m%4
- HADAMARD 1, max, 12, 14, 6, 11
- paddw m10, m12
- HADAMARD 1, max, 13, 15, 6, 11
- paddw m10, m13
-%else
- LOAD_DIFF_8x4P %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
- HADAMARD 0, sumsub, %1, %2, 6
- HADAMARD 0, sumsub, %3, %4, 6
- SBUTTERFLY wd, %1, %2, 6
- SBUTTERFLY wd, %3, %4, 6
- HADAMARD2_2D %1, %3, %2, %4, 6, dq
-
- mova m12, m%1
- mova m13, m%2
- mova m14, m%3
- mova m15, m%4
- HADAMARD 0, sumsub, %1, %2, 6
- HADAMARD 0, sumsub, %3, %4, 6
+%macro SA8D_SATD_8x4 5
+%if %1
+ LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+ HADAMARD 0, sumsub, %2, %3, 6
+ HADAMARD 0, sumsub, %4, %5, 6
+ SBUTTERFLY wd, %2, %3, 6
+ SBUTTERFLY wd, %4, %5, 6
+ HADAMARD2_2D %2, %4, %3, %5, 6, dq
+
+ mova m12, m%2
+ mova m13, m%3
+ mova m14, m%4
+ mova m15, m%5
+ HADAMARD 0, sumsub, %2, %3, 6
+ HADAMARD 0, sumsub, %4, %5, 6
SBUTTERFLY qdq, 12, 13, 6
HADAMARD 0, amax, 12, 13, 6
SBUTTERFLY qdq, 14, 15, 6
paddw m10, m12
HADAMARD 0, amax, 14, 15, 6
paddw m10, m14
+%else
+ LOAD_SUMSUB_8x4P_SSSE3 %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
+ HADAMARD4_V %2, %3, %4, %5, 6
+
+ pabsw m12, m%2 ; doing the abs first is a slight advantage
+ pabsw m14, m%4
+ pabsw m13, m%3
+ pabsw m15, m%5
+ HADAMARD 1, max, 12, 14, 6, 11
+ paddw m10, m12
+ HADAMARD 1, max, 13, 15, 6, 11
+ paddw m10, m13
%endif
%endmacro ; SA8D_SATD_8x4
%endmacro
%macro SA8D_SATD 0
+%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
cglobal pixel_sa8d_satd_8x8_internal
- SA8D_SATD_8x4 0, 1, 2, 3
- SA8D_SATD_8x4 4, 5, 8, 9
+ SA8D_SATD_8x4 vertical, 0, 1, 2, 3
+ SA8D_SATD_8x4 vertical, 4, 5, 8, 9
- ; complete sa8d
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical ; sse2-style
+ HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
+ HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
+%else ; complete sa8d
SUMSUB_BADC w, 0, 4, 1, 5, 12
HADAMARD 2, sumsub, 0, 4, 12, 11
HADAMARD 2, sumsub, 1, 5, 12, 11
HADAMARD 1, amax, 1, 5, 12, 4
HADAMARD 1, amax, 2, 8, 12, 4
HADAMARD 1, amax, 3, 9, 12, 4
-%else ; sse2
- HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
- HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
%endif
; create sa8d sub results
%define temp0 [rsp+0*mmsize]
%define temp1 [rsp+1*mmsize]
FIX_STRIDES r1, r3
-%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+%if vertical==0
mova m7, [hmul_8p]
%endif
lea r4, [3*r1]
psubw m1, m9
psubw m2, m10
psubw m3, m11
- SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 13, 14, 0, swap
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
psubw m2, [fenc_buf+0x20]
.satd_8x4b:
psubw m3, [fenc_buf+0x30]
- SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 0, swap
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
%endif
%if HIGH_BIT_DEPTH
%define vertical 1
-%elif cpuflag(ssse3)
+%elif cpuflag(ssse3) && notcpuflag(atom)
%define vertical 0
;LOAD_INC loads sumsubs
mova m7, [hmul_8p]
INIT_XMM sse2
HADAMARD_AC_SSE2
+%if HIGH_BIT_DEPTH == 0
+INIT_XMM ssse3,atom
+SATDS_SSE2
+SA8D
+HADAMARD_AC_SSE2
+%if ARCH_X86_64
+SA8D_SATD
+%endif
+%endif
+
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_atom )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
+DECL_X1( sa8d, ssse3_atom )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride ))
int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
-uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
-uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
pf[I_PRED_16x16_P] = x264_predict_16x16_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PSHUFB) )
+ pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3;
#endif
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
- pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
- pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
+ if( !(cpu&X264_CPU_SLOW_PALIGNR) )
+ {
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
+ }
pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
if( !(cpu&X264_CPU_AVX) )
static int x264_validate_parameters( x264_t *h, int b_open )
{
#if HAVE_MMX
-#ifdef __SSE__
- if( b_open && !(x264_cpu_detect() & X264_CPU_SSE) )
+ if( b_open )
{
- x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
+ int cpuflags = x264_cpu_detect();
+ int fail = 0;
+#ifdef __SSE__
+ if( !(cpuflags & X264_CPU_SSE) )
+ {
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n");
+ fail = 1;
+ }
#else
- if( b_open && !(x264_cpu_detect() & X264_CPU_MMX2) )
- {
- x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+ if( !(cpuflags & X264_CPU_MMX2) )
+ {
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n");
+ fail = 1;
+ }
#endif
- x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
- return -1;
+ if( !fail && !(cpuflags & X264_CPU_CMOV) )
+ {
+ x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
+ fail = 1;
+ }
+ if( fail )
+ {
+ x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
+ return -1;
+ }
}
#endif
p = buf + sprintf( buf, "using cpu capabilities:" );
for( int i = 0; x264_cpu_names[i].flags; i++ )
{
+ if( !strcmp(x264_cpu_names[i].name, "SSE")
+ && h->param.cpu & (X264_CPU_SSE2) )
+ continue;
if( !strcmp(x264_cpu_names[i].name, "SSE2")
&& h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
continue;
{
x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" );
#if ARCH_X86 || ARCH_X86_64
- x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a-targeted build on a CPU that\n" );
+ x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" );
x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" );
#endif
goto fail;
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+#if HAVE_MMX
b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_SSE ? "sse" :
b->cpu&X264_CPU_MMX ? "mmx" :
+#elif ARCH_PPC
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+#elif ARCH_ARM
b->cpu&X264_CPU_NEON ? "neon" :
- b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
+ b->cpu&X264_CPU_ARMV6 ? "armv6" :
+#endif
+ "c",
+#if HAVE_MMX
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
+ b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
- b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
+ b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_BMI2 ? "_bmi2" :
- b->cpu&X264_CPU_TBM ? "_tbm" :
b->cpu&X264_CPU_BMI1 ? "_bmi1" :
- b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
- b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
+ b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
+#elif ARCH_ARM
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+#endif
+ "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
- cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
+ cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
- cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
- cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
+ cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
if( x264_cpu_detect() & X264_CPU_XOP )
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
- if( x264_cpu_detect() & X264_CPU_TBM )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
- cpu1 &= ~X264_CPU_TBM;
- }
if( x264_cpu_detect() & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
#include "x264_config.h"
-#define X264_BUILD 129
+#define X264_BUILD 130
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
/****************************************************************************
* Encoder parameters
****************************************************************************/
-/* CPU flags
- */
-#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_ALTIVEC 0x0000004
-#define X264_CPU_MMX 0x0000008
-#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */
+/* CPU flags */
+
+/* x86 */
+#define X264_CPU_CMOV 0x0000001
+#define X264_CPU_MMX 0x0000002
+#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
#define X264_CPU_MMXEXT X264_CPU_MMX2
-#define X264_CPU_SSE 0x0000020
-#define X264_CPU_SSE2 0x0000040
-#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SSE3 0x0000200
-#define X264_CPU_SSSE3 0x0000400
-#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
-#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */
-#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_ARMV6 0x0020000
-#define X264_CPU_NEON 0x0040000 /* ARM NEON */
-#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
-#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */
-#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers
- * aren't used. */
-#define X264_CPU_XOP 0x0800000 /* AMD XOP */
-#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
-#define X264_CPU_AVX2 0x2000000 /* AVX2 */
-#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */
-#define X264_CPU_BMI1 0x8000000 /* BMI1 */
-#define X264_CPU_BMI2 0x10000000 /* BMI2 */
-#define X264_CPU_TBM 0x20000000 /* AMD TBM */
-
-/* Analyse flags
- */
+#define X264_CPU_SSE 0x0000008
+#define X264_CPU_SSE2 0x0000010
+#define X264_CPU_SSE3 0x0000020
+#define X264_CPU_SSSE3 0x0000040
+#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
+#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN 0x0000200 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x0000400 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_AVX 0x0000800 /* AVX support: requires OS support even if YMM registers aren't used. */
+#define X264_CPU_XOP 0x0001000 /* AMD XOP */
+#define X264_CPU_FMA4 0x0002000 /* AMD FMA4 */
+#define X264_CPU_AVX2 0x0004000 /* AVX2 */
+#define X264_CPU_FMA3 0x0008000 /* Intel FMA3 */
+#define X264_CPU_BMI1 0x0010000 /* BMI1 */
+#define X264_CPU_BMI2 0x0020000 /* BMI2 */
+/* x86 modifiers */
+#define X264_CPU_CACHELINE_32 0x0040000 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x0080000 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW 0x0100000 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x0200000 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE 0x0400000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4 0x0800000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_CTZ 0x1000000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x2000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+ * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
+ * cacheline split penalties -- gather everything here that
+ * isn't shared by other CPUs to avoid making half a dozen
+ * new SLOW flags. */
+#define X264_CPU_SLOW_PSHUFB 0x4000000 /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR 0x8000000 /* such as on the AMD Bobcat */
+
+/* PowerPC */
+#define X264_CPU_ALTIVEC 0x0000001
+
+/* ARM */
+#define X264_CPU_ARMV6 0x0000001
+#define X264_CPU_NEON 0x0000002 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
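With this renumbering the low flag values are reused per architecture (X264_CPU_CMOV, X264_CPU_ALTIVEC and X264_CPU_ARMV6 are all 0x0000001), so a cpu word is only meaningful for the architecture the binary was built for, which is why the name table at the top of this patch is split with #if HAVE_MMX / ARCH_PPC / ARCH_ARM. A hedged fragment of the resulting guard pattern (has_simd is an illustrative helper, not an x264 API; HAVE_MMX, ARCH_PPC and ARCH_ARM come from x264's generated config.h):

    #include <stdint.h>
    #include "x264.h"   /* the flag definitions above */

    static int has_simd( uint32_t cpu )
    {
    #if HAVE_MMX
        return !!(cpu & X264_CPU_SSE2);
    #elif ARCH_PPC
        return !!(cpu & X264_CPU_ALTIVEC);
    #elif ARCH_ARM
        return !!(cpu & X264_CPU_NEON);
    #else
        return 0;
    #endif
    }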
+
+/* Analyse flags */
#define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */
#define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */
#define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */