git.sesse.net Git - x264/blob - common/cpu.c

   1 /*****************************************************************************
   2  * cpu.c: cpu detection
   3  *****************************************************************************
   4  * Copyright (C) 2003-2016 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #define _GNU_SOURCE // for sched_getaffinity
  29 #include "common.h"
  30 #include "cpu.h"
  31
  32 #if HAVE_POSIXTHREAD && SYS_LINUX
  33 #include <sched.h>
  34 #endif
  35 #if SYS_BEOS
  36 #include <kernel/OS.h>
  37 #endif
  38 #if SYS_MACOSX || SYS_FREEBSD
  39 #include <sys/types.h>
  40 #include <sys/sysctl.h>
  41 #endif
  42 #if SYS_OPENBSD
  43 #include <sys/param.h>
  44 #include <sys/sysctl.h>
  45 #include <machine/cpu.h>
  46 #endif
  47
  48 const x264_cpu_name_t x264_cpu_names[] =
  49 {
  50 #if HAVE_MMX
  51 //  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
  52 //  {"CMOV",        X264_CPU_CMOV}, // we require this unconditionally, so don't print it
  53 #define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
  54     {"MMX2",        MMX2},
  55     {"MMXEXT",      MMX2},
  56     {"SSE",         MMX2|X264_CPU_SSE},
  57 #define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
  58     {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
  59     {"SSE2",        SSE2},
  60     {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
  61     {"SSE3",        SSE2|X264_CPU_SSE3},
  62     {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
  63     {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
  64     {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
  65     {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
  66 #define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
  67     {"AVX",         AVX},
  68     {"XOP",         AVX|X264_CPU_XOP},
  69     {"FMA4",        AVX|X264_CPU_FMA4},
  70     {"FMA3",        AVX|X264_CPU_FMA3},
  71     {"AVX2",        AVX|X264_CPU_FMA3|X264_CPU_AVX2},
  72 #undef AVX
  73 #undef SSE2
  74 #undef MMX2
  75     {"Cache32",         X264_CPU_CACHELINE_32},
  76     {"Cache64",         X264_CPU_CACHELINE_64},
  77     {"LZCNT",           X264_CPU_LZCNT},
  78     {"BMI1",            X264_CPU_BMI1},
  79     {"BMI2",            X264_CPU_BMI1|X264_CPU_BMI2},
  80     {"SlowCTZ",         X264_CPU_SLOW_CTZ},
  81     {"SlowAtom",        X264_CPU_SLOW_ATOM},
  82     {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
  83     {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
  84     {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
  85     {"UnalignedStack",  X264_CPU_STACK_MOD4},
  86 #elif ARCH_PPC
  87     {"Altivec",         X264_CPU_ALTIVEC},
  88 #elif ARCH_ARM
  89     {"ARMv6",           X264_CPU_ARMV6},
  90     {"NEON",            X264_CPU_NEON},
  91     {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
  92 #elif ARCH_AARCH64
  93     {"ARMv8",           X264_CPU_ARMV8},
  94     {"NEON",            X264_CPU_NEON},
  95 #elif ARCH_MIPS
  96     {"MSA",             X264_CPU_MSA},
  97 #endif
  98     {"", 0},
  99 };
 100
 101 #if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
 102 #include <signal.h>
 103 #include <setjmp.h>
 104 static sigjmp_buf jmpbuf;
 105 static volatile sig_atomic_t canjump = 0;
 106
 107 static void sigill_handler( int sig )
 108 {
 109     if( !canjump )
 110     {
 111         signal( sig, SIG_DFL );
 112         raise( sig );
 113     }
 114
 115     canjump = 0;
 116     siglongjmp( jmpbuf, 1 );
 117 }
 118 #endif
 119
 120 #if HAVE_MMX
 121 int x264_cpu_cpuid_test( void );
 122 void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
 123 void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
 124
 125 uint32_t x264_cpu_detect( void )
 126 {
 127     uint32_t cpu = 0;
 128     uint32_t eax, ebx, ecx, edx;
 129     uint32_t vendor[4] = {0};
 130     uint32_t max_extended_cap, max_basic_cap;
 131     int cache;
 132
 133 #if !ARCH_X86_64
 134     if( !x264_cpu_cpuid_test() )
 135         return 0;
 136 #endif
 137
 138     x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
 139     max_basic_cap = eax;
 140     if( max_basic_cap == 0 )
 141         return 0;
 142
 143     x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 144     if( edx&0x00800000 )
 145         cpu |= X264_CPU_MMX;
 146     else
 147         return cpu;
 148     if( edx&0x02000000 )
 149         cpu |= X264_CPU_MMX2|X264_CPU_SSE;
 150     if( edx&0x00008000 )
 151         cpu |= X264_CPU_CMOV;
 152     else
 153         return cpu;
 154     if( edx&0x04000000 )
 155         cpu |= X264_CPU_SSE2;
 156     if( ecx&0x00000001 )
 157         cpu |= X264_CPU_SSE3;
 158     if( ecx&0x00000200 )
 159         cpu |= X264_CPU_SSSE3;
 160     if( ecx&0x00080000 )
 161         cpu |= X264_CPU_SSE4;
 162     if( ecx&0x00100000 )
 163         cpu |= X264_CPU_SSE42;
 164     /* Check OXSAVE and AVX bits */
 165     if( (ecx&0x18000000) == 0x18000000 )
 166     {
 167         /* Check for OS support */
 168         x264_cpu_xgetbv( 0, &eax, &edx );
 169         if( (eax&0x6) == 0x6 )
 170         {
 171             cpu |= X264_CPU_AVX;
 172             if( ecx&0x00001000 )
 173                 cpu |= X264_CPU_FMA3;
 174         }
 175     }
 176
 177     if( max_basic_cap >= 7 )
 178     {
 179         x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
 180         /* AVX2 requires OS support, but BMI1/2 don't. */
 181         if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
 182             cpu |= X264_CPU_AVX2;
 183         if( ebx&0x00000008 )
 184         {
 185             cpu |= X264_CPU_BMI1;
 186             if( ebx&0x00000100 )
 187                 cpu |= X264_CPU_BMI2;
 188         }
 189     }
 190
 191     if( cpu & X264_CPU_SSSE3 )
 192         cpu |= X264_CPU_SSE2_IS_FAST;
 193
 194     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
 195     max_extended_cap = eax;
 196
 197     if( max_extended_cap >= 0x80000001 )
 198     {
 199         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
 200
 201         if( ecx&0x00000020 )
 202             cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
 203         if( ecx&0x00000040 ) /* SSE4a, AMD only */
 204         {
 205             int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
 206             cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
 207             if( family == 0x14 )
 208             {
 209                 cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
 210                 cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
 211                 cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
 212             }
 213             if( family == 0x16 )
 214             {
 215                 cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
 216                                                 * compared to alternate instruction sequences that this
 217                                                 * is equal or faster on almost all such functions. */
 218             }
 219         }
 220
 221         if( cpu & X264_CPU_AVX )
 222         {
 223             if( ecx&0x00000800 ) /* XOP */
 224                 cpu |= X264_CPU_XOP;
 225             if( ecx&0x00010000 ) /* FMA4 */
 226                 cpu |= X264_CPU_FMA4;
 227         }
 228
 229         if( !strcmp((char*)vendor, "AuthenticAMD") )
 230         {
 231             if( edx&0x00400000 )
 232                 cpu |= X264_CPU_MMX2;
 233             if( !(cpu&X264_CPU_LZCNT) )
 234                 cpu |= X264_CPU_SLOW_CTZ;
 235             if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
 236                 cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
 237         }
 238     }
 239
 240     if( !strcmp((char*)vendor, "GenuineIntel") )
 241     {
 242         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 243         int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
 244         int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
 245         if( family == 6 )
 246         {
 247             /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
 248              * theoretically support sse2, but it's significantly slower than mmx for
 249              * almost all of x264's functions, so let's just pretend they don't. */
 250             if( model == 9 || model == 13 || model == 14 )
 251             {
 252                 cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
 253                 assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
 254             }
 255             /* Detect Atom CPU */
 256             else if( model == 28 )
 257             {
 258                 cpu |= X264_CPU_SLOW_ATOM;
 259                 cpu |= X264_CPU_SLOW_CTZ;
 260                 cpu |= X264_CPU_SLOW_PSHUFB;
 261             }
 262             /* Conroe has a slow shuffle unit. Check the model number to make sure not
 263              * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
 264             else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
 265                 cpu |= X264_CPU_SLOW_SHUFFLE;
 266         }
 267     }
 268
 269     if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
 270     {
 271         /* cacheline size is specified in 3 places, any of which may be missing */
 272         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
 273         cache = (ebx&0xff00)>>5; // cflush size
 274         if( !cache && max_extended_cap >= 0x80000006 )
 275         {
 276             x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
 277             cache = ecx&0xff; // cacheline size
 278         }
 279         if( !cache && max_basic_cap >= 2 )
 280         {
 281             // Cache and TLB Information
 282             static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
 283             static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
 284                                                 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
 285             uint32_t buf[4];
 286             int max, i = 0;
 287             do {
 288                 x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
 289                 max = buf[0]&0xff;
 290                 buf[0] &= ~0xff;
 291                 for( int j = 0; j < 4; j++ )
 292                     if( !(buf[j]>>31) )
 293                         while( buf[j] )
 294                         {
 295                             if( strchr( cache32_ids, buf[j]&0xff ) )
 296                                 cache = 32;
 297                             if( strchr( cache64_ids, buf[j]&0xff ) )
 298                                 cache = 64;
 299                             buf[j] >>= 8;
 300                         }
 301             } while( ++i < max );
 302         }
 303
 304         if( cache == 32 )
 305             cpu |= X264_CPU_CACHELINE_32;
 306         else if( cache == 64 )
 307             cpu |= X264_CPU_CACHELINE_64;
 308         else
 309             x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
 310     }
 311
 312 #if STACK_ALIGNMENT < 16
 313     cpu |= X264_CPU_STACK_MOD4;
 314 #endif
 315
 316     return cpu;
 317 }
 318
 319 #elif ARCH_PPC && HAVE_ALTIVEC
 320
 321 #if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD
 322 #include <sys/sysctl.h>
 323 uint32_t x264_cpu_detect( void )
 324 {
 325     /* Thank you VLC */
 326     uint32_t cpu = 0;
 327 #if SYS_OPENBSD
 328     int      selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
 329 #elif SYS_MACOSX
 330     int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
 331 #endif
 332     int      has_altivec = 0;
 333     size_t   length = sizeof( has_altivec );
 334 #if SYS_MACOSX || SYS_OPENBSD
 335     int      error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
 336 #else
 337     int      error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
 338 #endif
 339
 340     if( error == 0 && has_altivec != 0 )
 341         cpu |= X264_CPU_ALTIVEC;
 342
 343     return cpu;
 344 }
 345
 346 #elif SYS_LINUX
 347
 348 uint32_t x264_cpu_detect( void )
 349 {
 350 #ifdef __NO_FPRS__
 351     return 0;
 352 #else
 353     static void (*oldsig)( int );
 354
 355     oldsig = signal( SIGILL, sigill_handler );
 356     if( sigsetjmp( jmpbuf, 1 ) )
 357     {
 358         signal( SIGILL, oldsig );
 359         return 0;
 360     }
 361
 362     canjump = 1;
 363     asm volatile( "mtspr 256, %0\n\t"
 364                   "vand 0, 0, 0\n\t"
 365                   :
 366                   : "r"(-1) );
 367     canjump = 0;
 368
 369     signal( SIGILL, oldsig );
 370
 371     return X264_CPU_ALTIVEC;
 372 #endif
 373 }
 374 #endif
 375
 376 #elif ARCH_ARM
 377
 378 void x264_cpu_neon_test( void );
 379 int x264_cpu_fast_neon_mrc_test( void );
 380
 381 uint32_t x264_cpu_detect( void )
 382 {
 383     int flags = 0;
 384 #if HAVE_ARMV6
 385     flags |= X264_CPU_ARMV6;
 386
 387     // don't do this hack if compiled with -mfpu=neon
 388 #if !HAVE_NEON
 389     static void (* oldsig)( int );
 390     oldsig = signal( SIGILL, sigill_handler );
 391     if( sigsetjmp( jmpbuf, 1 ) )
 392     {
 393         signal( SIGILL, oldsig );
 394         return flags;
 395     }
 396
 397     canjump = 1;
 398     x264_cpu_neon_test();
 399     canjump = 0;
 400     signal( SIGILL, oldsig );
 401 #endif
 402
 403     flags |= X264_CPU_NEON;
 404
 405     // fast neon -> arm (Cortex-A9) detection relies on user access to the
 406     // cycle counter; this assumes ARMv7 performance counters.
 407     // NEON requires at least ARMv7, ARMv8 may require changes here, but
 408     // hopefully this hacky detection method will have been replaced by then.
 409     // Note that there is potential for a race condition if another program or
 410     // x264 instance disables or reinits the counters while x264 is using them,
 411     // which may result in incorrect detection and the counters stuck enabled.
 412     // right now Apple does not seem to support performance counters for this test
 413 #ifndef __MACH__
 414     flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
 415 #endif
 416     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
 417 #endif
 418     return flags;
 419 }
 420
 421 #elif ARCH_AARCH64
 422
 423 uint32_t x264_cpu_detect( void )
 424 {
 425     return X264_CPU_ARMV8 | X264_CPU_NEON;
 426 }
 427
 428 #elif ARCH_MIPS
 429
 430 uint32_t x264_cpu_detect( void )
 431 {
 432     uint32_t flags = 0;
 433 #if HAVE_MSA
 434     flags |= X264_CPU_MSA;
 435 #endif
 436     return flags;
 437 }
 438
 439 #else
 440
 441 uint32_t x264_cpu_detect( void )
 442 {
 443     return 0;
 444 }
 445
 446 #endif
 447
 448 int x264_cpu_num_processors( void )
 449 {
 450 #if !HAVE_THREAD
 451     return 1;
 452
 453 #elif SYS_WINDOWS
 454     return x264_pthread_num_processors_np();
 455
 456 #elif SYS_CYGWIN || SYS_SunOS
 457     return sysconf( _SC_NPROCESSORS_ONLN );
 458
 459 #elif SYS_LINUX
 460 #ifdef __ANDROID__
 461     // Android NDK does not expose sched_getaffinity
 462     return sysconf( _SC_NPROCESSORS_CONF );
 463 #else
 464     cpu_set_t p_aff;
 465     memset( &p_aff, 0, sizeof(p_aff) );
 466     if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
 467         return 1;
 468 #if HAVE_CPU_COUNT
 469     return CPU_COUNT(&p_aff);
 470 #else
 471     int np = 0;
 472     for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
 473         np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
 474     return np;
 475 #endif
 476 #endif
 477
 478 #elif SYS_BEOS
 479     system_info info;
 480     get_system_info( &info );
 481     return info.cpu_count;
 482
 483 #elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
 484     int ncpu;
 485     size_t length = sizeof( ncpu );
 486 #if SYS_OPENBSD
 487     int mib[2] = { CTL_HW, HW_NCPU };
 488     if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
 489 #else
 490     if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
 491 #endif
 492     {
 493         ncpu = 1;
 494     }
 495     return ncpu;
 496
 497 #else
 498     return 1;
 499 #endif
 500 }