#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 10 // number of different combinations of cpu flags
+#define MAX_CPUS 30 // upper bound on the number of different combinations of cpu flags; the name-printing code checks j<MAX_CPUS-1 before peeking at the next entry (b[1])
typedef struct
{
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
- b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
- b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
+ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+ b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
{
ALIGNED_16( uint16_t sums[72] );
ALIGNED_16( int dc[4] );
- int16_t mvs_a[32], mvs_c[32];
+ ALIGNED_16( int16_t mvs_a[32] );
+ ALIGNED_16( int16_t mvs_c[32] );
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
set_func_name( "esa_ads" );
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
+#if BROKEN_STACK_ALIGNMENT
+ *cpu_new |= X264_CPU_STACK_MOD4;
+#endif
if( *cpu_new & X264_CPU_SSE2_IS_FAST )
*cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
if( !quiet )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ }
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
- {
- cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
- }
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
if( x264_cpu_detect() & X264_CPU_XOP )