X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fcpu.c;h=1c9894d70c9354826f24f432b573597235119479;hb=aa26e880bc2cd04cc81c776051d5e21d03fc975a;hp=8d4f63522d345f65d19f3b4a36e09cac0296d68b;hpb=ce7ee9d2eeed6a81ab9b2a7d8d9f4bc5674c18c7;p=x264 diff --git a/common/cpu.c b/common/cpu.c index 8d4f6352..1c9894d7 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -1,7 +1,7 @@ /***************************************************************************** * cpu.c: cpu detection ***************************************************************************** - * Copyright (C) 2003-2011 x264 project + * Copyright (C) 2003-2016 x264 project * * Authors: Loren Merritt * Laurent Aimar @@ -25,11 +25,10 @@ * For more information, contact us at licensing@x264.com. *****************************************************************************/ -#define _GNU_SOURCE // for sched_getaffinity #include "common.h" #include "cpu.h" -#if HAVE_PTHREAD && SYS_LINUX +#if HAVE_POSIXTHREAD && SYS_LINUX #include #endif #if SYS_BEOS @@ -45,31 +44,56 @@ #include #endif -const x264_cpu_name_t x264_cpu_names[] = { - {"Altivec", X264_CPU_ALTIVEC}, -// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore - {"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT}, - {"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT}, -// {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264 - {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW}, - {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2}, - {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST}, - {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3}, - {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, - {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST}, - {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, - {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, - {"AVX", X264_CPU_AVX}, - {"Cache32", X264_CPU_CACHELINE_32}, - {"Cache64", X264_CPU_CACHELINE_64}, - {"SSEMisalign", X264_CPU_SSE_MISALIGN}, - {"LZCNT", X264_CPU_LZCNT}, - {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, - {"ARMv6", X264_CPU_ARMV6}, - {"NEON", X264_CPU_NEON}, - {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC}, - {"SlowCTZ", X264_CPU_SLOW_CTZ}, - {"SlowAtom", X264_CPU_SLOW_ATOM}, +const x264_cpu_name_t x264_cpu_names[] = +{ +#if HAVE_MMX +// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore +// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it +#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV + {"MMX2", MMX2}, + {"MMXEXT", MMX2}, + {"SSE", MMX2|X264_CPU_SSE}, +#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2 + {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, + {"SSE2", SSE2}, + {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, + {"SSE3", SSE2|X264_CPU_SSE3}, + {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, + {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, +#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX + {"AVX", AVX}, + {"XOP", AVX|X264_CPU_XOP}, + {"FMA4", AVX|X264_CPU_FMA4}, + {"FMA3", AVX|X264_CPU_FMA3}, + {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, +#undef AVX +#undef SSE2 +#undef MMX2 + {"Cache32", X264_CPU_CACHELINE_32}, + {"Cache64", X264_CPU_CACHELINE_64}, + {"LZCNT", X264_CPU_LZCNT}, + {"BMI1", X264_CPU_BMI1}, + {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, + {"SlowCTZ", X264_CPU_SLOW_CTZ}, + {"SlowAtom", X264_CPU_SLOW_ATOM}, + {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, + {"SlowPalignr", X264_CPU_SLOW_PALIGNR}, + {"SlowShuffle", X264_CPU_SLOW_SHUFFLE}, + {"UnalignedStack", X264_CPU_STACK_MOD4}, +#elif ARCH_PPC + {"Altivec", X264_CPU_ALTIVEC}, +#elif ARCH_ARM + {"ARMv6", X264_CPU_ARMV6}, + {"NEON", X264_CPU_NEON}, + {"FastNeonMRC", X264_CPU_FAST_NEON_MRC}, +#elif ARCH_AARCH64 + {"ARMv8", X264_CPU_ARMV8}, + {"NEON", X264_CPU_NEON}, +#elif ARCH_MIPS + {"MSA", X264_CPU_MSA}, +#endif {"", 0}, }; @@ -95,14 +119,14 @@ static void sigill_handler( int sig ) #if HAVE_MMX int x264_cpu_cpuid_test( void ); void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); -void x264_cpu_xgetbv( uint32_t op, int *eax, int *edx ); +void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx ); uint32_t x264_cpu_detect( void ) { uint32_t cpu = 0; uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = {0}; - int max_extended_cap; + uint32_t max_extended_cap, max_basic_cap; int cache; #if !ARCH_X86_64 @@ -111,16 +135,21 @@ uint32_t x264_cpu_detect( void ) #endif x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 ); - if( eax == 0 ) + max_basic_cap = eax; + if( max_basic_cap == 0 ) return 0; x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); if( edx&0x00800000 ) cpu |= X264_CPU_MMX; else - return 0; + return cpu; if( edx&0x02000000 ) - cpu |= X264_CPU_MMXEXT|X264_CPU_SSE; + cpu |= X264_CPU_MMX2|X264_CPU_SSE; + if( edx&0x00008000 ) + cpu |= X264_CPU_CMOV; + else + return cpu; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) @@ -137,41 +166,74 @@ uint32_t x264_cpu_detect( void ) /* Check for OS support */ x264_cpu_xgetbv( 0, &eax, &edx ); if( (eax&0x6) == 0x6 ) + { cpu |= X264_CPU_AVX; + if( ecx&0x00001000 ) + cpu |= X264_CPU_FMA3; + } + } + + if( max_basic_cap >= 7 ) + { + x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); + /* AVX2 requires OS support, but BMI1/2 don't. */ + if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) ) + cpu |= X264_CPU_AVX2; + if( ebx&0x00000008 ) + { + cpu |= X264_CPU_BMI1; + if( ebx&0x00000100 ) + cpu |= X264_CPU_BMI2; + } } if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; - if( cpu & X264_CPU_SSE4 ) - cpu |= X264_CPU_SHUFFLE_IS_FAST; x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; - if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 ) + if( max_extended_cap >= 0x80000001 ) { - cpu |= X264_CPU_SLOW_CTZ; x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ); - if( edx&0x00400000 ) - cpu |= X264_CPU_MMXEXT; - if( cpu & X264_CPU_SSE2 ) + + if( ecx&0x00000020 ) + cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ + if( ecx&0x00000040 ) /* SSE4a, AMD only */ { - if( ecx&0x00000040 ) /* SSE4a */ + int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); + cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ + if( family == 0x14 ) { - cpu |= X264_CPU_SSE2_IS_FAST; - cpu |= X264_CPU_LZCNT; - cpu |= X264_CPU_SHUFFLE_IS_FAST; - cpu &= ~X264_CPU_SLOW_CTZ; + cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ + cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ + cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } - else - cpu |= X264_CPU_SSE2_IS_SLOW; - - if( ecx&0x00000080 ) /* Misalign SSE */ + if( family == 0x16 ) { - cpu |= X264_CPU_SSE_MISALIGN; - x264_cpu_mask_misalign_sse(); + cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough + * compared to alternate instruction sequences that this + * is equal or faster on almost all such functions. */ } } + + if( cpu & X264_CPU_AVX ) + { + if( ecx&0x00000800 ) /* XOP */ + cpu |= X264_CPU_XOP; + if( ecx&0x00010000 ) /* FMA4 */ + cpu |= X264_CPU_FMA4; + } + + if( !strcmp((char*)vendor, "AuthenticAMD") ) + { + if( edx&0x00400000 ) + cpu |= X264_CPU_MMX2; + if( !(cpu&X264_CPU_LZCNT) ) + cpu |= X264_CPU_SLOW_CTZ; + if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) + cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ + } } if( !strcmp((char*)vendor, "GenuineIntel") ) @@ -179,19 +241,27 @@ uint32_t x264_cpu_detect( void ) x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") - * theoretically support sse2, but it's significantly slower than mmx for - * almost all of x264's functions, so let's just pretend they don't. */ - if( family == 6 && (model == 9 || model == 13 || model == 14) ) - { - cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3); - assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4))); - } - /* Detect Atom CPU */ - if( family == 6 && model == 28 ) + if( family == 6 ) { - cpu |= X264_CPU_SLOW_ATOM; - cpu |= X264_CPU_SLOW_CTZ; + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") + * theoretically support sse2, but it's significantly slower than mmx for + * almost all of x264's functions, so let's just pretend they don't. */ + if( model == 9 || model == 13 || model == 14 ) + { + cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3); + assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4))); + } + /* Detect Atom CPU */ + else if( model == 28 ) + { + cpu |= X264_CPU_SLOW_ATOM; + cpu |= X264_CPU_SLOW_CTZ; + cpu |= X264_CPU_SLOW_PSHUFB; + } + /* Conroe has a slow shuffle unit. Check the model number to make sure not + * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ + else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 ) + cpu |= X264_CPU_SLOW_SHUFFLE; } } @@ -205,7 +275,7 @@ uint32_t x264_cpu_detect( void ) x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx ); cache = ecx&0xff; // cacheline size } - if( !cache ) + if( !cache && max_basic_cap >= 2 ) { // Cache and TLB Information static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; @@ -238,16 +308,16 @@ uint32_t x264_cpu_detect( void ) x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" ); } -#if BROKEN_STACK_ALIGNMENT +#if STACK_ALIGNMENT < 16 cpu |= X264_CPU_STACK_MOD4; #endif return cpu; } -#elif ARCH_PPC +#elif ARCH_PPC && HAVE_ALTIVEC -#if SYS_MACOSX || SYS_OPENBSD +#if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD #include uint32_t x264_cpu_detect( void ) { @@ -255,12 +325,16 @@ uint32_t x264_cpu_detect( void ) uint32_t cpu = 0; #if SYS_OPENBSD int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC }; -#else +#elif SYS_MACOSX int selectors[2] = { CTL_HW, HW_VECTORUNIT }; #endif int has_altivec = 0; size_t length = sizeof( has_altivec ); +#if SYS_MACOSX || SYS_OPENBSD int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 ); +#else + int error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 ); +#endif if( error == 0 && has_altivec != 0 ) cpu |= X264_CPU_ALTIVEC; @@ -272,6 +346,9 @@ uint32_t x264_cpu_detect( void ) uint32_t x264_cpu_detect( void ) { +#ifdef __NO_FPRS__ + return 0; +#else static void (*oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); @@ -291,6 +368,7 @@ uint32_t x264_cpu_detect( void ) signal( SIGILL, oldsig ); return X264_CPU_ALTIVEC; +#endif } #endif @@ -330,12 +408,33 @@ uint32_t x264_cpu_detect( void ) // Note that there is potential for a race condition if another program or // x264 instance disables or reinits the counters while x264 is using them, // which may result in incorrect detection and the counters stuck enabled. + // right now Apple does not seem to support performance counters for this test +#ifndef __MACH__ flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0; +#endif // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc) #endif return flags; } +#elif ARCH_AARCH64 + +uint32_t x264_cpu_detect( void ) +{ + return X264_CPU_ARMV8 | X264_CPU_NEON; +} + +#elif ARCH_MIPS + +uint32_t x264_cpu_detect( void ) +{ + uint32_t flags = 0; +#if HAVE_MSA + flags |= X264_CPU_MSA; +#endif + return flags; +} + #else uint32_t x264_cpu_detect( void ) @@ -350,18 +449,30 @@ int x264_cpu_num_processors( void ) #if !HAVE_THREAD return 1; -#elif defined(_WIN32) +#elif SYS_WINDOWS return x264_pthread_num_processors_np(); +#elif SYS_CYGWIN || SYS_SunOS + return sysconf( _SC_NPROCESSORS_ONLN ); + #elif SYS_LINUX - unsigned int bit; - int np; +#ifdef __ANDROID__ + // Android NDK does not expose sched_getaffinity + return sysconf( _SC_NPROCESSORS_CONF ); +#else cpu_set_t p_aff; memset( &p_aff, 0, sizeof(p_aff) ); - sched_getaffinity( 0, sizeof(p_aff), &p_aff ); - for( np = 0, bit = 0; bit < sizeof(p_aff); bit++ ) + if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) ) + return 1; +#if HAVE_CPU_COUNT + return CPU_COUNT(&p_aff); +#else + int np = 0; + for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ ) np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1; return np; +#endif +#endif #elif SYS_BEOS system_info info;