I'm not going to actually optimize for this pile of garbage unless someone pays me.
But it can't hurt to at least enable the correct functions based on benchmarks.
Also save some cache on Intel CPUs, which don't need the decimate LUT since their bsr/bsf are fast.
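
For context, the scoring loop that fast-bsf CPUs end up with can be modeled in C roughly as follows. This is a minimal sketch with hypothetical names (decimate_score16_model, run_cost), not the shipped implementation, which is the assembly further down and runs only after the coefficient levels have been checked to be +/-1:

#include <stdint.h>
#include <stdio.h>

/* x264's decimate_table4: the cost of a run of N zeros before a coefficient;
 * runs of 6 or more zeros are free. The mask has at most 16 bits here, so
 * run <= 15 and the table cannot be overrun. */
static const uint8_t run_cost[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };

static int decimate_score16_model( uint32_t mask )
{
    int score = 0;
    while( mask )
    {
        int run = __builtin_ctz( mask ); /* compiles to the bsf mentioned above */
        score += run_cost[run];
        mask >>= run + 1;                /* consume the zero run and the coeff */
    }
    return score;
}

int main( void )
{
    /* nonzero coefficients at scan positions 0, 3 and 4 */
    printf( "%d\n", decimate_score16_model( 0x19 ) ); /* 3 + 2 + 3 = 8 */
    return 0;
}

On SLOW_CTZ CPUs the same walk goes through byte LUTs instead of bsf; those tables are exactly the cache footprint this change keeps off Intel chips.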
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
+ {"SlowCTZ", X264_CPU_SLOW_CTZ},
+ {"SlowAtom", X264_CPU_SLOW_ATOM},
{"", 0},
};
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
{
+ cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x00400000 )
cpu |= X264_CPU_MMXEXT;
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
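+ /* Chips new enough for these capabilities also have fast bsr/bsf,
+ * so undo the slow-ctz default set above. */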
+ cpu &= ~X264_CPU_SLOW_CTZ;
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
if( !strcmp((char*)vendor, "GenuineIntel") )
{
- int family, model, stepping;
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
- model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
- stepping = eax&0xf;
+ int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+ int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
* theoretically support sse2, but it's significantly slower than mmx for
* almost all of x264's functions, so let's just pretend they don't. */
if( family == 6 && (model == 9 || model == 13 || model == 14) )
{
cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
}
+ /* Detect Atom CPU (family 6, model 28: the first-generation Bonnell core) */
+ if( family == 6 && model == 28 )
+ {
+ cpu |= X264_CPU_SLOW_ATOM;
+ cpu |= X264_CPU_SLOW_CTZ;
+ }
}
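
The family/model arithmetic above merges CPUID's base and extended signature fields. A standalone sketch of the same decoding (decode_signature is a hypothetical helper): note that ((eax>>12)&0xf0) equals (((eax>>16)&0xf)<<4), the extended model shifted into the high nibble, and that strictly the extended family only counts when the base family is 0xf; adding it unconditionally is harmless here because it is zero for these parts.

#include <stdint.h>
#include <stdio.h>

/* Decode family/model from CPUID leaf 1's EAX, same math as above. */
static void decode_signature( uint32_t eax, int *family, int *model )
{
    *family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
    *model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
}

int main( void )
{
    int family, model;
    decode_signature( 0x106c2, &family, &model ); /* an Atom N270-class signature */
    printf( "family %d, model %d\n", family, model ); /* 6, 28 -> SLOW_ATOM */
    return 0;
}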
if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
- if( cpu&X264_CPU_SSSE3 )
+ if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
if( cpu&X264_CPU_SSSE3 )
{
- INIT7( ssd, _ssse3 );
- INIT7( satd, _ssse3 );
- INIT7( satd_x3, _ssse3 );
- INIT7( satd_x4, _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3 );
}
INIT_ADS( _ssse3 );
- pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
- pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
+ {
+ INIT7( ssd, _ssse3 );
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ INIT7( satd, _ssse3 );
+ INIT7( satd_x3, _ssse3 );
+ INIT7( satd_x4, _ssse3 );
+ }
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+ if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
pf->decimate_score15 = x264_decimate_score15_mmxext;
pf->decimate_score16 = x264_decimate_score16_mmxext;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+ }
pf->decimate_score64 = x264_decimate_score64_mmxext;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
+ }
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
+ }
pf->decimate_score64 = x264_decimate_score64_ssse3;
}
return;
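
The hunks above are plain function-pointer dispatch in which later, more specific assignments overwrite earlier ones, so the SLOW_CTZ override is applied last within each ISA tier. A minimal self-contained sketch of that shape, all names hypothetical:

#include <stdint.h>
#include <stdio.h>

#define CPU_SSE2     0x1
#define CPU_SLOW_CTZ 0x2

typedef int (*decimate_fn)( int16_t *dct );

static int score16_c( int16_t *dct )            { (void)dct; return 1; }
static int score16_sse2( int16_t *dct )         { (void)dct; return 2; }
static int score16_sse2_slowctz( int16_t *dct ) { (void)dct; return 3; }

typedef struct { decimate_fn decimate_score16; } quant_funcs;

/* Generic pointer first, then each refinement overwrites the slot. */
static void quant_init( uint32_t cpu, quant_funcs *pf )
{
    pf->decimate_score16 = score16_c;
    if( cpu & CPU_SSE2 )
    {
        pf->decimate_score16 = score16_sse2;
        if( cpu & CPU_SLOW_CTZ )
            pf->decimate_score16 = score16_sse2_slowctz;
    }
}

int main( void )
{
    quant_funcs pf;
    quant_init( CPU_SSE2|CPU_SLOW_CTZ, &pf );
    printf( "%d\n", pf.decimate_score16( (int16_t[16]){0} ) ); /* prints 3 */
    return 0;
}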
pf->weight = x264_mc_weight_wtab_sse2;
- pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
+ {
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+ }
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->weight = x264_mc_weight_wtab_ssse3;
}
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_SSE4) )
cextern decimate_table4
cextern decimate_table8
-%macro DECIMATE4x4 2
+%macro DECIMATE4x4 3
-;A LUT is faster than bsf on AMD processors, and no slower on Intel
+;A LUT is faster than bsf on AMD processors, so %3==1 selects the LUT path.
;This is not true for score64.
cglobal decimate_score%1_%2, 1,3
%ifdef PIC
%if %1==15
shr edx, 1
%endif
+%if %3==1
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
+%else
+.loop:
+ bsf ecx, edx
+ shr edx, cl
+ add al, byte [table + rcx]
+ shr edx, 1
+ jne .loop
+%endif
.ret:
- REP_RET
+ RET
.ret9:
mov eax, 9
RET
%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext
-DECIMATE4x4 16, mmxext
+DECIMATE4x4 15, mmxext, 0
+DECIMATE4x4 16, mmxext, 0
+DECIMATE4x4 15, mmxext_slowctz, 1
+DECIMATE4x4 16, mmxext_slowctz, 1
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2
-DECIMATE4x4 15, ssse3
-DECIMATE4x4 16, sse2
-DECIMATE4x4 16, ssse3
+DECIMATE4x4 15, sse2, 0
+DECIMATE4x4 16, sse2, 0
+DECIMATE4x4 15, sse2_slowctz, 1
+DECIMATE4x4 16, sse2_slowctz, 1
+DECIMATE4x4 15, ssse3, 0
+DECIMATE4x4 16, ssse3, 0
+DECIMATE4x4 15, ssse3_slowctz, 1
+DECIMATE4x4 16, ssse3_slowctz, 1
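
Passing the strategy as a macro parameter stamps out each variant as its own symbol at assembly time, so there is no per-call branch. A C sketch of the same instantiation pattern, under the same hypothetical names as the earlier sketch:

#include <stdint.h>
#include <stdio.h>

static const uint8_t run_cost[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };

/* LUT-based ctz for nonzero x, standing in for the byte tables the
 * slowctz asm uses; the nibble loop replaces bsf entirely. */
static int ctz_lut( uint32_t x )
{
    static const uint8_t t[16] = { 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 };
    int n = 0;
    while( !(x & 0xf) ) { x >>= 4; n += 4; }
    return n + t[x & 0xf];
}

/* One template, expanded once per ctz strategy, mirroring DECIMATE4x4's %3. */
#define DEF_DECIMATE( name, CTZ )          \
static int name( uint32_t mask )           \
{                                          \
    int score = 0;                         \
    while( mask )                          \
    {                                      \
        int run = CTZ( mask );             \
        score += run_cost[run];            \
        mask >>= run + 1;                  \
    }                                      \
    return score;                          \
}

DEF_DECIMATE( decimate_score16_fast,    __builtin_ctz )
DEF_DECIMATE( decimate_score16_slowctz, ctz_lut )

int main( void )
{
    printf( "%d %d\n", decimate_score16_fast( 0x19 ),
                       decimate_score16_slowctz( 0x19 ) ); /* both print 8 */
    return 0;
}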
%macro DECIMATE8x8 1
int x264_decimate_score16_mmxext( int16_t *dct );
int x264_decimate_score16_sse2 ( int16_t *dct );
int x264_decimate_score16_ssse3 ( int16_t *dct );
+int x264_decimate_score15_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score15_sse2_slowctz ( int16_t *dct );
+int x264_decimate_score15_ssse3_slowctz ( int16_t *dct );
+int x264_decimate_score16_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score16_sse2_slowctz ( int16_t *dct );
+int x264_decimate_score16_ssse3_slowctz ( int16_t *dct );
int x264_decimate_score64_mmxext( int16_t *dct );
int x264_decimate_score64_sse2 ( int16_t *dct );
int x264_decimate_score64_ssse3 ( int16_t *dct );
/* calculate dct coeffs */
for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
- /* We don't need to zero the DC coefficient before quantization because we already
- * checked that all the DCs were zero above at twice the precision that quant4x4
- * uses. This applies even though the DC here is being quantized before the 2x2
- * transform. */
+ dct4x4[i4x4][0] = 0;
if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
- b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+ b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
+ b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
+ cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+ cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
#define X264_CPU_ARMV6 0x020000
#define X264_CPU_NEON 0x040000 /* ARM NEON */
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
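
Since each flag is a single bit in one integer, tools like checkasm can report the active ones by walking a name table of the same shape as the one extended at the top of this patch. A minimal standalone sketch, reusing only the two flag values defined above:

#include <stdint.h>
#include <stdio.h>

#define X264_CPU_SLOW_CTZ  0x100000
#define X264_CPU_SLOW_ATOM 0x200000

typedef struct { const char *name; uint32_t flags; } cpu_name_t;

static const cpu_name_t cpu_names[] =
{
    { "SlowCTZ",  X264_CPU_SLOW_CTZ },
    { "SlowAtom", X264_CPU_SLOW_ATOM },
    { "", 0 }, /* sentinel, as in the table above */
};

int main( void )
{
    uint32_t cpu = X264_CPU_SLOW_CTZ | X264_CPU_SLOW_ATOM; /* pretend Atom */
    for( int i = 0; cpu_names[i].flags; i++ )
        if( (cpu & cpu_names[i].flags) == cpu_names[i].flags )
            printf( "%s ", cpu_names[i].name );
    printf( "\n" );
    return 0;
}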
/* Analyse flags
*/