git.sesse.net Git - x264/blob - common/cpu.c

   1 /*****************************************************************************
   2  * cpu.c: cpu detection
   3  *****************************************************************************
   4  * Copyright (C) 2003-2016 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #include "common.h"
  29 #include "cpu.h"
  30
  31 #if HAVE_POSIXTHREAD && SYS_LINUX
  32 #include <sched.h>
  33 #endif
  34 #if SYS_BEOS
  35 #include <kernel/OS.h>
  36 #endif
  37 #if SYS_MACOSX || SYS_FREEBSD
  38 #include <sys/types.h>
  39 #include <sys/sysctl.h>
  40 #endif
  41 #if SYS_OPENBSD
  42 #include <sys/param.h>
  43 #include <sys/sysctl.h>
  44 #include <machine/cpu.h>
  45 #endif
  46
  47 const x264_cpu_name_t x264_cpu_names[] =
  48 {
  49 #if HAVE_MMX
  50 //  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
  51 //  {"CMOV",        X264_CPU_CMOV}, // we require this unconditionally, so don't print it
  52 #define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
  53     {"MMX2",        MMX2},
  54     {"MMXEXT",      MMX2},
  55     {"SSE",         MMX2|X264_CPU_SSE},
  56 #define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
  57     {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
  58     {"SSE2",        SSE2},
  59     {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
  60     {"SSE3",        SSE2|X264_CPU_SSE3},
  61     {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
  62     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
  63     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
  64     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
  65 #define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
  66     {"AVX",         AVX},
  67     {"XOP",         AVX|X264_CPU_XOP},
  68     {"FMA4",        AVX|X264_CPU_FMA4},
  69     {"FMA3",        AVX|X264_CPU_FMA3},
  70     {"AVX2",        AVX|X264_CPU_FMA3|X264_CPU_AVX2},
  71 #undef AVX
  72 #undef SSE2
  73 #undef MMX2
  74     {"Cache32",         X264_CPU_CACHELINE_32},
  75     {"Cache64",         X264_CPU_CACHELINE_64},
  76     {"LZCNT",           X264_CPU_LZCNT},
  77     {"BMI1",            X264_CPU_BMI1},
  78     {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
  79     {"SlowCTZ",         X264_CPU_SLOW_CTZ},
  80     {"SlowAtom",        X264_CPU_SLOW_ATOM},
  81     {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
  82     {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
  83     {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
  84     {"UnalignedStack",  X264_CPU_STACK_MOD4},
  85 #elif ARCH_PPC
  86     {"Altivec",         X264_CPU_ALTIVEC},
  87 #elif ARCH_ARM
  88     {"ARMv6",           X264_CPU_ARMV6},
  89     {"NEON",            X264_CPU_NEON},
  90     {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
  91 #elif ARCH_AARCH64
  92     {"ARMv8",           X264_CPU_ARMV8},
  93     {"NEON",            X264_CPU_NEON},
  94 #elif ARCH_MIPS
  95     {"MSA",             X264_CPU_MSA},
  96 #endif
  97     {"", 0},
  98 };
  99
 100 #if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
 101 #include <signal.h>
 102 #include <setjmp.h>
/* Jump context filled in by sigsetjmp() in the detection routines and used by
 * sigill_handler() to leave an instruction probe that raised SIGILL. */
static sigjmp_buf jmpbuf;
/* Set to 1 by the detection routines only while a probe is executing;
 * sigill_handler() checks it before jumping to jmpbuf. */
static volatile sig_atomic_t canjump = 0;
 105
 106 static void sigill_handler( int sig )
 107 {
 108     if( !canjump )
 109     {
 110         signal( sig, SIG_DFL );
 111         raise( sig );
 112     }
 113
 114     canjump = 0;
 115     siglongjmp( jmpbuf, 1 );
 116 }
 117 #endif
 118
 119 #if HAVE_MMX
/* Helpers used by the x86 x264_cpu_detect(); they are only declared here and
 * are implemented outside this file. */

/* Returns zero when CPUID cannot be used; only consulted on non-x86_64 builds. */
int x264_cpu_cpuid_test( void );
/* Queries leaf `op` and stores the four result registers through eax/ebx/ecx/edx. */
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
/* Queries extended control register `op` and stores the result through eax/edx. */
void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
 123
 124 uint32_t x264_cpu_detect( void )
 125 {
 126     uint32_t cpu = 0;
 127     uint32_t eax, ebx, ecx, edx;
 128     uint32_t vendor[4] = {0};
 129     uint32_t max_extended_cap, max_basic_cap;
 130     int cache;
 131
 132 #if !ARCH_X86_64
 133     if( !x264_cpu_cpuid_test() )
 134         return 0;
 135 #endif
 136
 137     x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
 138     max_basic_cap = eax;
 139     if( max_basic_cap == 0 )
 140         return 0;
 141
 142     x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 143     if( edx&0x00800000 )
 144         cpu |= X264_CPU_MMX;
 145     else
 146         return cpu;
 147     if( edx&0x02000000 )
 148         cpu |= X264_CPU_MMX2|X264_CPU_SSE;
 149     if( edx&0x00008000 )
 150         cpu |= X264_CPU_CMOV;
 151     else
 152         return cpu;
 153     if( edx&0x04000000 )
 154         cpu |= X264_CPU_SSE2;
 155     if( ecx&0x00000001 )
 156         cpu |= X264_CPU_SSE3;
 157     if( ecx&0x00000200 )
 158         cpu |= X264_CPU_SSSE3;
 159     if( ecx&0x00080000 )
 160         cpu |= X264_CPU_SSE4;
 161     if( ecx&0x00100000 )
 162         cpu |= X264_CPU_SSE42;
 163     /* Check OXSAVE and AVX bits */
 164     if( (ecx&0x18000000) == 0x18000000 )
 165     {
 166         /* Check for OS support */
 167         x264_cpu_xgetbv( 0, &eax, &edx );
 168         if( (eax&0x6) == 0x6 )
 169         {
 170             cpu |= X264_CPU_AVX;
 171             if( ecx&0x00001000 )
 172                 cpu |= X264_CPU_FMA3;
 173         }
 174     }
 175
 176     if( max_basic_cap >= 7 )
 177     {
 178         x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
 179         /* AVX2 requires OS support, but BMI1/2 don't. */
 180         if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
 181             cpu |= X264_CPU_AVX2;
 182         if( ebx&0x00000008 )
 183         {
 184             cpu |= X264_CPU_BMI1;
 185             if( ebx&0x00000100 )
 186                 cpu |= X264_CPU_BMI2;
 187         }
 188     }
 189
 190     if( cpu & X264_CPU_SSSE3 )
 191         cpu |= X264_CPU_SSE2_IS_FAST;
 192
 193     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
 194     max_extended_cap = eax;
 195
 196     if( max_extended_cap >= 0x80000001 )
 197     {
 198         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
 199
 200         if( ecx&0x00000020 )
 201             cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
 202         if( ecx&0x00000040 ) /* SSE4a, AMD only */
 203         {
 204             int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
 205             cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
 206             if( family == 0x14 )
 207             {
 208                 cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
 209                 cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
 210                 cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
 211             }
 212             if( family == 0x16 )
 213             {
 214                 cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
 215                                                 * compared to alternate instruction sequences that this
 216                                                 * is equal or faster on almost all such functions. */
 217             }
 218         }
 219
 220         if( cpu & X264_CPU_AVX )
 221         {
 222             if( ecx&0x00000800 ) /* XOP */
 223                 cpu |= X264_CPU_XOP;
 224             if( ecx&0x00010000 ) /* FMA4 */
 225                 cpu |= X264_CPU_FMA4;
 226         }
 227
 228         if( !strcmp((char*)vendor, "AuthenticAMD") )
 229         {
 230             if( edx&0x00400000 )
 231                 cpu |= X264_CPU_MMX2;
 232             if( !(cpu&X264_CPU_LZCNT) )
 233                 cpu |= X264_CPU_SLOW_CTZ;
 234             if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
 235                 cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
 236         }
 237     }
 238
 239     if( !strcmp((char*)vendor, "GenuineIntel") )
 240     {
 241         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 242         int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
 243         int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
 244         if( family == 6 )
 245         {
 246             /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
 247              * theoretically support sse2, but it's significantly slower than mmx for
 248              * almost all of x264's functions, so let's just pretend they don't. */
 249             if( model == 9 || model == 13 || model == 14 )
 250             {
 251                 cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
 252                 assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
 253             }
 254             /* Detect Atom CPU */
 255             else if( model == 28 )
 256             {
 257                 cpu |= X264_CPU_SLOW_ATOM;
 258                 cpu |= X264_CPU_SLOW_CTZ;
 259                 cpu |= X264_CPU_SLOW_PSHUFB;
 260             }
 261             /* Conroe has a slow shuffle unit. Check the model number to make sure not
 262              * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
 263             else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
 264                 cpu |= X264_CPU_SLOW_SHUFFLE;
 265         }
 266     }
 267
 268     if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
 269     {
 270         /* cacheline size is specified in 3 places, any of which may be missing */
 271         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 272         cache = (ebx&0xff00)>>5; // cflush size
 273         if( !cache && max_extended_cap >= 0x80000006 )
 274         {
 275             x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
 276             cache = ecx&0xff; // cacheline size
 277         }
 278         if( !cache && max_basic_cap >= 2 )
 279         {
 280             // Cache and TLB Information
 281             static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
 282             static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
 283                                                 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
 284             uint32_t buf[4];
 285             int max, i = 0;
 286             do {
 287                 x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
 288                 max = buf[0]&0xff;
 289                 buf[0] &= ~0xff;
 290                 for( int j = 0; j < 4; j++ )
 291                     if( !(buf[j]>>31) )
 292                         while( buf[j] )
 293                         {
 294                             if( strchr( cache32_ids, buf[j]&0xff ) )
 295                                 cache = 32;
 296                             if( strchr( cache64_ids, buf[j]&0xff ) )
 297                                 cache = 64;
 298                             buf[j] >>= 8;
 299                         }
 300             } while( ++i < max );
 301         }
 302
 303         if( cache == 32 )
 304             cpu |= X264_CPU_CACHELINE_32;
 305         else if( cache == 64 )
 306             cpu |= X264_CPU_CACHELINE_64;
 307         else
 308             x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
 309     }
 310
 311 #if STACK_ALIGNMENT < 16
 312     cpu |= X264_CPU_STACK_MOD4;
 313 #endif
 314
 315     return cpu;
 316 }
 317
 318 #elif ARCH_PPC && HAVE_ALTIVEC
 319
 320 #if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD
 321 #include <sys/sysctl.h>
 322 uint32_t x264_cpu_detect( void )
 323 {
 324     /* Thank you VLC */
 325     uint32_t cpu = 0;
 326 #if SYS_OPENBSD
 327     int      selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
 328 #elif SYS_MACOSX
 329     int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
 330 #endif
 331     int      has_altivec = 0;
 332     size_t   length = sizeof( has_altivec );
 333 #if SYS_MACOSX || SYS_OPENBSD
 334     int      error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
 335 #else
 336     int      error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
 337 #endif
 338
 339     if( error == 0 && has_altivec != 0 )
 340         cpu |= X264_CPU_ALTIVEC;
 341
 342     return cpu;
 343 }
 344
 345 #elif SYS_LINUX
 346
/* AltiVec detection for PowerPC Linux.
 * Returns 0 when built with __NO_FPRS__ defined or when the probe instructions
 * below raise SIGILL; returns X264_CPU_ALTIVEC when they execute normally. */
uint32_t x264_cpu_detect( void )
{
#ifdef __NO_FPRS__
    return 0;
#else
    /* Previous SIGILL disposition, restored on every exit path.
     * NOTE(review): presumably static so its value is well defined after the
     * siglongjmp() back into sigsetjmp() - confirm. */
    static void (*oldsig)( int );

    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        /* Reached through siglongjmp() from sigill_handler: the probe faulted. */
        signal( SIGILL, oldsig );
        return 0;
    }

    /* canjump brackets the probe so the handler only jumps for faults raised here. */
    canjump = 1;
    asm volatile( "mtspr 256, %0\n\t"
                  "vand 0, 0, 0\n\t"
                  :
                  : "r"(-1) );
    canjump = 0;

    signal( SIGILL, oldsig );

    return X264_CPU_ALTIVEC;
#endif
}
 373 #endif
 374
 375 #elif ARCH_ARM
 376
/* Probes used by the ARM x264_cpu_detect(); only declared here, implemented outside this file. */

/* Called with the SIGILL handler installed; detection treats a SIGILL raised
 * during this call as "no NEON". */
void x264_cpu_neon_test( void );
/* A non-zero return makes detection set X264_CPU_FAST_NEON_MRC. */
int x264_cpu_fast_neon_mrc_test( void );
 379
/* Capability detection for 32-bit ARM.
 * Returns 0 unless built with HAVE_ARMV6. With HAVE_ARMV6 the result always
 * contains X264_CPU_ARMV6, contains X264_CPU_NEON unless the NEON probe raised
 * SIGILL (the probe is skipped when built with HAVE_NEON), and may contain
 * X264_CPU_FAST_NEON_MRC depending on x264_cpu_fast_neon_mrc_test(). */
uint32_t x264_cpu_detect( void )
{
    int flags = 0;
#if HAVE_ARMV6
    flags |= X264_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    /* Previous SIGILL disposition, restored on every exit path.
     * NOTE(review): presumably static so its value is well defined after the
     * siglongjmp() back into sigsetjmp() - confirm. */
    static void (* oldsig)( int );
    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        /* Reached through siglongjmp() from sigill_handler: the NEON probe faulted. */
        signal( SIGILL, oldsig );
        return flags;
    }

    /* canjump brackets the probe so the handler only jumps for faults raised here. */
    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, oldsig );
#endif

    flags |= X264_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif
    return flags;
}
 419
 420 #elif ARCH_AARCH64
 421
 422 uint32_t x264_cpu_detect( void )
 423 {
 424     return X264_CPU_ARMV8 | X264_CPU_NEON;
 425 }
 426
 427 #elif ARCH_MIPS
 428
 429 uint32_t x264_cpu_detect( void )
 430 {
 431     uint32_t flags = 0;
 432 #if HAVE_MSA
 433     flags |= X264_CPU_MSA;
 434 #endif
 435     return flags;
 436 }
 437
 438 #else
 439
 440 uint32_t x264_cpu_detect( void )
 441 {
 442     return 0;
 443 }
 444
 445 #endif
 446
 447 int x264_cpu_num_processors( void )
 448 {
 449 #if !HAVE_THREAD
 450     return 1;
 451
 452 #elif SYS_WINDOWS
 453     return x264_pthread_num_processors_np();
 454
 455 #elif SYS_CYGWIN || SYS_SunOS
 456     return sysconf( _SC_NPROCESSORS_ONLN );
 457
 458 #elif SYS_LINUX
 459 #ifdef __ANDROID__
 460     // Android NDK does not expose sched_getaffinity
 461     return sysconf( _SC_NPROCESSORS_CONF );
 462 #else
 463     cpu_set_t p_aff;
 464     memset( &p_aff, 0, sizeof(p_aff) );
 465     if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
 466         return 1;
 467 #if HAVE_CPU_COUNT
 468     return CPU_COUNT(&p_aff);
 469 #else
 470     int np = 0;
 471     for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
 472         np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
 473     return np;
 474 #endif
 475 #endif
 476
 477 #elif SYS_BEOS
 478     system_info info;
 479     get_system_info( &info );
 480     return info.cpu_count;
 481
 482 #elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
 483     int ncpu;
 484     size_t length = sizeof( ncpu );
 485 #if SYS_OPENBSD
 486     int mib[2] = { CTL_HW, HW_NCPU };
 487     if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
 488 #else
 489     if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
 490 #endif
 491     {
 492         ncpu = 1;
 493     }
 494     return ncpu;
 495
 496 #else
 497     return 1;
 498 #endif
 499 }