X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fcpu.c;h=cfcdfd4f9972973e9c9e5ddecec5638e02df762b;hb=92b0bd9665860d7b48f313d6fd72a583ecb01ddf;hp=860cd9574b6b02a2982a323fff74e3302bb788cb;hpb=d25d50c9ffb02571c12e13c09356fa08fe97b0b4;p=x264 diff --git a/common/cpu.c b/common/cpu.c index 860cd957..cfcdfd4f 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -1,7 +1,7 @@ /***************************************************************************** - * cpu.c: h264 encoder library + * cpu.c: cpu detection ***************************************************************************** - * Copyright (C) 2003-2008 x264 project + * Copyright (C) 2003-2012 x264 project * * Authors: Loren Merritt * Laurent Aimar @@ -20,64 +20,105 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. *****************************************************************************/ -#if defined(HAVE_PTHREAD) && defined(SYS_LINUX) -#define _GNU_SOURCE +#define _GNU_SOURCE // for sched_getaffinity +#include "common.h" +#include "cpu.h" + +#if HAVE_POSIXTHREAD && SYS_LINUX #include #endif -#ifdef SYS_BEOS +#if SYS_BEOS #include #endif -#if defined(SYS_MACOSX) || defined(SYS_FREEBSD) +#if SYS_MACOSX || SYS_FREEBSD #include #include #endif -#ifdef SYS_OPENBSD +#if SYS_OPENBSD #include #include #include #endif -#include "common.h" -#include "cpu.h" - -const x264_cpu_name_t x264_cpu_names[] = { - {"Altivec", X264_CPU_ALTIVEC}, -// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore - {"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT}, - {"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT}, -// {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264 - {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW}, - {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2}, - {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST}, - {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3}, - {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, - {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST}, - {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, - {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, - {"Cache32", X264_CPU_CACHELINE_32}, - {"Cache64", X264_CPU_CACHELINE_64}, - {"SSEMisalign", X264_CPU_SSE_MISALIGN}, - {"LZCNT", X264_CPU_LZCNT}, +const x264_cpu_name_t x264_cpu_names[] = +{ + {"Altivec", X264_CPU_ALTIVEC}, +// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore + {"MMX2", X264_CPU_MMX|X264_CPU_MMX2}, + {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2}, +// {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264 +#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2 + {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, + {"SSE2", SSE2}, + {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, + {"SSE3", SSE2|X264_CPU_SSE3}, + {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, + {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST}, + {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, +#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX + {"AVX", AVX}, + {"XOP", AVX|X264_CPU_XOP}, + {"FMA4", AVX|X264_CPU_FMA4}, + {"AVX2", AVX|X264_CPU_AVX2}, + {"FMA3", AVX|X264_CPU_FMA3}, +#undef AVX +#undef SSE2 + {"Cache32", X264_CPU_CACHELINE_32}, + {"Cache64", X264_CPU_CACHELINE_64}, + {"SSEMisalign", X264_CPU_SSE_MISALIGN}, + {"LZCNT", X264_CPU_LZCNT}, + {"BMI1", X264_CPU_BMI1}, + {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, + {"TBM", X264_CPU_TBM}, {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, + {"ARMv6", X264_CPU_ARMV6}, + {"NEON", X264_CPU_NEON}, + {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC}, + {"SlowCTZ", X264_CPU_SLOW_CTZ}, + {"SlowAtom", X264_CPU_SLOW_ATOM}, {"", 0}, }; +#if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON) +#include +#include +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler( int sig ) +{ + if( !canjump ) + { + signal( sig, SIG_DFL ); + raise( sig ); + } -#ifdef HAVE_MMX -extern int x264_cpu_cpuid_test( void ); -extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); + canjump = 0; + siglongjmp( jmpbuf, 1 ); +} +#endif + +#if HAVE_MMX +int x264_cpu_cpuid_test( void ); +void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); +void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx ); uint32_t x264_cpu_detect( void ) { uint32_t cpu = 0; uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = {0}; - int max_extended_cap; + uint32_t max_extended_cap; int cache; -#ifndef ARCH_X86_64 +#if !ARCH_X86_64 if( !x264_cpu_cpuid_test() ) return 0; #endif @@ -92,7 +133,7 @@ uint32_t x264_cpu_detect( void ) else return 0; if( edx&0x02000000 ) - cpu |= X264_CPU_MMXEXT|X264_CPU_SSE; + cpu |= X264_CPU_MMX2|X264_CPU_SSE; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) @@ -103,6 +144,29 @@ uint32_t x264_cpu_detect( void ) cpu |= X264_CPU_SSE4; if( ecx&0x00100000 ) cpu |= X264_CPU_SSE42; + /* Check OXSAVE and AVX bits */ + if( (ecx&0x18000000) == 0x18000000 ) + { + /* Check for OS support */ + x264_cpu_xgetbv( 0, &eax, &edx ); + if( (eax&0x6) == 0x6 ) + { + cpu |= X264_CPU_AVX; + if( ecx&0x00001000 ) + cpu |= X264_CPU_FMA3; + } + } + + x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); + /* AVX2 requires OS support, but BMI1/2 don't. */ + if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) ) + cpu |= X264_CPU_AVX2; + if( ebx&0x00000008 ) + { + cpu |= X264_CPU_BMI1; + if( ebx&0x00000100 ) + cpu |= X264_CPU_BMI2; + } if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; @@ -114,38 +178,66 @@ uint32_t x264_cpu_detect( void ) if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 ) { + cpu |= X264_CPU_SLOW_CTZ; x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ); if( edx&0x00400000 ) - cpu |= X264_CPU_MMXEXT; + cpu |= X264_CPU_MMX2; if( cpu & X264_CPU_SSE2 ) { if( ecx&0x00000040 ) /* SSE4a */ { cpu |= X264_CPU_SSE2_IS_FAST; - cpu |= X264_CPU_SSE_MISALIGN; cpu |= X264_CPU_LZCNT; cpu |= X264_CPU_SHUFFLE_IS_FAST; - x264_cpu_mask_misalign_sse(); + cpu &= ~X264_CPU_SLOW_CTZ; } else cpu |= X264_CPU_SSE2_IS_SLOW; + + if( ecx&0x00000080 ) /* Misalign SSE */ + { + cpu |= X264_CPU_SSE_MISALIGN; + x264_cpu_mask_misalign_sse(); + } + + if( cpu & X264_CPU_AVX ) + { + if( ecx&0x00000800 ) /* XOP */ + cpu |= X264_CPU_XOP; + if( ecx&0x00010000 ) /* FMA4 */ + cpu |= X264_CPU_FMA4; + } + + if( ecx&0x00200000 ) + cpu |= X264_CPU_TBM; } } if( !strcmp((char*)vendor, "GenuineIntel") ) { - int family, model, stepping; x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); - family = ((eax>>8)&0xf) + ((eax>>20)&0xff); - model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); - stepping = eax&0xf; - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") - * theoretically support sse2, but it's significantly slower than mmx for - * almost all of x264's functions, so let's just pretend they don't. */ - if( family==6 && (model==9 || model==13 || model==14) ) + int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); + int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); + if( family == 6 ) { - cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3); - assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4))); + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") + * theoretically support sse2, but it's significantly slower than mmx for + * almost all of x264's functions, so let's just pretend they don't. */ + if( model == 9 || model == 13 || model == 14 ) + { + cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3); + assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4))); + } + /* Detect Atom CPU */ + else if( model == 28 ) + { + cpu |= X264_CPU_SLOW_ATOM; + cpu |= X264_CPU_SLOW_CTZ; + } + /* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so + * detect them here. */ + else if( model >= 23 ) + cpu |= X264_CPU_SHUFFLE_IS_FAST; } } @@ -163,14 +255,15 @@ uint32_t x264_cpu_detect( void ) { // Cache and TLB Information static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; - static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; + static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, + 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; uint32_t buf[4]; - int max, i=0, j; + int max, i = 0; do { x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 ); max = buf[0]&0xff; buf[0] &= ~0xff; - for(j=0; j<4; j++) + for( int j = 0; j < 4; j++ ) if( !(buf[j]>>31) ) while( buf[j] ) { @@ -188,25 +281,25 @@ uint32_t x264_cpu_detect( void ) else if( cache == 64 ) cpu |= X264_CPU_CACHELINE_64; else - fprintf( stderr, "x264 [warning]: unable to determine cacheline size\n" ); + x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" ); } -#ifdef BROKEN_STACK_ALIGNMENT +#if BROKEN_STACK_ALIGNMENT cpu |= X264_CPU_STACK_MOD4; #endif return cpu; } -#elif defined( ARCH_PPC ) +#elif ARCH_PPC -#if defined(SYS_MACOSX) || defined(SYS_OPENBSD) +#if SYS_MACOSX || SYS_OPENBSD #include uint32_t x264_cpu_detect( void ) { /* Thank you VLC */ uint32_t cpu = 0; -#ifdef SYS_OPENBSD +#if SYS_OPENBSD int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC }; #else int selectors[2] = { CTL_HW, HW_VECTORUNIT }; @@ -216,34 +309,16 @@ uint32_t x264_cpu_detect( void ) int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 ); if( error == 0 && has_altivec != 0 ) - { cpu |= X264_CPU_ALTIVEC; - } return cpu; } -#elif defined( SYS_LINUX ) -#include -#include -static sigjmp_buf jmpbuf; -static volatile sig_atomic_t canjump = 0; - -static void sigill_handler( int sig ) -{ - if( !canjump ) - { - signal( sig, SIG_DFL ); - raise( sig ); - } - - canjump = 0; - siglongjmp( jmpbuf, 1 ); -} +#elif SYS_LINUX uint32_t x264_cpu_detect( void ) { - static void (* oldsig)( int ); + static void (*oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); if( sigsetjmp( jmpbuf, 1 ) ) @@ -265,58 +340,100 @@ uint32_t x264_cpu_detect( void ) } #endif -#else +#elif ARCH_ARM + +void x264_cpu_neon_test( void ); +int x264_cpu_fast_neon_mrc_test( void ); uint32_t x264_cpu_detect( void ) { - return 0; -} + int flags = 0; +#if HAVE_ARMV6 + flags |= X264_CPU_ARMV6; + + // don't do this hack if compiled with -mfpu=neon +#if !HAVE_NEON + static void (* oldsig)( int ); + oldsig = signal( SIGILL, sigill_handler ); + if( sigsetjmp( jmpbuf, 1 ) ) + { + signal( SIGILL, oldsig ); + return flags; + } + canjump = 1; + x264_cpu_neon_test(); + canjump = 0; + signal( SIGILL, oldsig ); #endif -#ifndef HAVE_MMX -void x264_emms( void ) + flags |= X264_CPU_NEON; + + // fast neon -> arm (Cortex-A9) detection relies on user access to the + // cycle counter; this assumes ARMv7 performance counters. + // NEON requires at least ARMv7, ARMv8 may require changes here, but + // hopefully this hacky detection method will have been replaced by then. + // Note that there is potential for a race condition if another program or + // x264 instance disables or reinits the counters while x264 is using them, + // which may result in incorrect detection and the counters stuck enabled. + flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0; + // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc) +#endif + return flags; +} + +#else + +uint32_t x264_cpu_detect( void ) { + return 0; } -#endif +#endif int x264_cpu_num_processors( void ) { -#if !defined(HAVE_PTHREAD) +#if !HAVE_THREAD return 1; -#elif defined(_WIN32) - return pthread_num_processors_np(); +#elif SYS_WINDOWS + return x264_pthread_num_processors_np(); + +#elif SYS_CYGWIN + return sysconf( _SC_NPROCESSORS_ONLN ); -#elif defined(SYS_LINUX) - unsigned int bit; - int np; +#elif SYS_LINUX cpu_set_t p_aff; memset( &p_aff, 0, sizeof(p_aff) ); - sched_getaffinity( 0, sizeof(p_aff), &p_aff ); - for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ ) + if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) ) + return 1; +#if HAVE_CPU_COUNT + return CPU_COUNT(&p_aff); +#else + int np = 0; + for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ ) np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1; return np; +#endif -#elif defined(SYS_BEOS) +#elif SYS_BEOS system_info info; get_system_info( &info ); return info.cpu_count; -#elif defined(SYS_MACOSX) || defined(SYS_FREEBSD) || defined(SYS_OPENBSD) - int numberOfCPUs; - size_t length = sizeof( numberOfCPUs ); -#ifdef SYS_OPENBSD +#elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD + int ncpu; + size_t length = sizeof( ncpu ); +#if SYS_OPENBSD int mib[2] = { CTL_HW, HW_NCPU }; - if( sysctl(mib, 2, &numberOfCPUs, &length, NULL, 0) ) + if( sysctl(mib, 2, &ncpu, &length, NULL, 0) ) #else - if( sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0) ) + if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) ) #endif { - numberOfCPUs = 1; + ncpu = 1; } - return numberOfCPUs; + return ncpu; #else return 1;