TBM and BMI1 are supported by AMD's Trinity/Piledriver CPUs.
The other extensions (along with BMI1) are expected to appear in Intel's upcoming Haswell.
Also update x86inc with AVX2 cpuflags and instruction support.
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
- {"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
- {"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
- {"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
+#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
+ {"AVX", AVX},
+ {"XOP", AVX|X264_CPU_XOP},
+ {"FMA4", AVX|X264_CPU_FMA4},
+ {"AVX2", AVX|X264_CPU_AVX2},
+ {"FMA3", AVX|X264_CPU_FMA3},
+#undef AVX
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
+ {"BMI1", X264_CPU_BMI1},
+ {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
+ {"TBM", X264_CPU_TBM},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
+ {
cpu |= X264_CPU_AVX;
+ if( ecx&0x00001000 )
+ cpu |= X264_CPU_FMA3;
+ }
+ }
+
+ x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
+ /* AVX2 requires OS support, but BMI1/2 don't. */
+ if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
+ cpu |= X264_CPU_AVX2;
+ if( ebx&0x00000008 )
+ {
+ cpu |= X264_CPU_BMI1;
+ if( ebx&0x00000100 )
+ cpu |= X264_CPU_BMI2;
}
if( cpu & X264_CPU_SSSE3 )
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
+
+ if( ecx&0x00200000 )
+ cpu |= X264_CPU_TBM;
}
}
push r2
push r1
mov eax, r0d
+ xor ecx, ecx
cpuid
pop rsi
mov [rsi], eax
%assign cpuflags_avx (1<<9) | cpuflags_sse42
%assign cpuflags_xop (1<<10)| cpuflags_avx
%assign cpuflags_fma4 (1<<11)| cpuflags_avx
+%assign cpuflags_avx2 (1<<12)| cpuflags_avx
+%assign cpuflags_fma3 (1<<13)| cpuflags_avx
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_bmi1 (1<<22)
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<24)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
- %ifid %5
- %define %%sizeofreg sizeof%5
- %elifid %6
+ %ifid %6
%define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
%else
%define %%sizeofreg mmsize
%endif
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_AVX2 ? "avx2" :
+ b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+ b->cpu&X264_CPU_BMI2 ? "_bmi2" :
+ b->cpu&X264_CPU_TBM ? "_tbm" :
+ b->cpu&X264_CPU_BMI1 ? "_bmi1" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
if( x264_cpu_detect() & X264_CPU_XOP )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
if( x264_cpu_detect() & X264_CPU_FMA4 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
+ cpu1 &= ~X264_CPU_FMA4;
+ }
+ if( x264_cpu_detect() & X264_CPU_FMA3 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+ cpu1 &= ~X264_CPU_FMA3;
+ }
+ if( x264_cpu_detect() & X264_CPU_BMI1 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
+ if( x264_cpu_detect() & X264_CPU_TBM )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_TBM, "TBM" );
+ cpu1 &= ~X264_CPU_TBM;
+ }
+ if( x264_cpu_detect() & X264_CPU_BMI2 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
+ cpu1 &= ~X264_CPU_BMI2;
+ }
+ cpu1 &= ~X264_CPU_BMI1;
+ }
+ if( x264_cpu_detect() & X264_CPU_AVX2 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
* aren't used. */
#define X264_CPU_XOP 0x0800000 /* AMD XOP */
#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
+#define X264_CPU_AVX2 0x2000000 /* AVX2 */
+#define X264_CPU_FMA3 0x4000000 /* Intel FMA3 */
+#define X264_CPU_BMI1 0x8000000 /* BMI1 */
+#define X264_CPU_BMI2 0x10000000 /* BMI2 */
+#define X264_CPU_TBM 0x20000000 /* AMD TBM */
/* Analyse flags
*/