]> git.sesse.net Git - x264/commitdiff
x86: Make AVX2 also imply FMA3
authorHenrik Gramner <henrik@gramner.com>
Sat, 2 Aug 2014 16:26:18 +0000 (18:26 +0200)
committerHenrik Gramner <henrik@gramner.com>
Fri, 12 Dec 2014 23:42:28 +0000 (00:42 +0100)
All CPUs with AVX2 supports FMA3 (but not the other way around).

common/cpu.c
common/x86/mc-a2.asm
common/x86/mc-c.c
common/x86/x86inc.asm
tools/checkasm.c
x264.h

index 4877a015f6dbd72e65a67d4d71445f9424798b87..cad5f2c2e9d0857e4d38f97ed7322612e582b785 100644 (file)
@@ -67,8 +67,8 @@ const x264_cpu_name_t x264_cpu_names[] =
     {"AVX",         AVX},
     {"XOP",         AVX|X264_CPU_XOP},
     {"FMA4",        AVX|X264_CPU_FMA4},
-    {"AVX2",        AVX|X264_CPU_AVX2},
     {"FMA3",        AVX|X264_CPU_FMA3},
+    {"AVX2",        AVX|X264_CPU_FMA3|X264_CPU_AVX2},
 #undef AVX
 #undef SSE2
 #undef MMX2
index d56ad4f6020ce7a2e95daea74802a0781d98c9cd..9e1746c36622855679620311d8dbeac56975d373 100644 (file)
@@ -2136,7 +2136,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
 
 INIT_YMM avx
 MBTREE_AVX 8
-INIT_YMM avx2,fma3
+INIT_YMM avx2
 MBTREE_AVX 7
 
 %macro MBTREE_PROPAGATE_LIST 0
index d231a8c9341ebba49c4675a7cc0f546b5fc38f7a..9101997f83f3e0a6854b7394898d3edea895c1fb 100644 (file)
@@ -167,8 +167,8 @@ void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                           uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -938,7 +938,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_AVX2) )
         return;
     pf->get_ref = get_ref_avx2;
-
-    if( cpu&X264_CPU_FMA3 )
-        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
 }
index 0f0dcdfc299243c2f5c991294c3a14b043998820..67dfff6a1bc7b83d6d742cf42a7d3cd9f0b87aec 100644 (file)
@@ -738,8 +738,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_avx      (1<<11)| cpuflags_sse42
 %assign cpuflags_xop      (1<<12)| cpuflags_avx
 %assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_avx2     (1<<14)| cpuflags_avx
-%assign cpuflags_fma3     (1<<15)| cpuflags_avx
+%assign cpuflags_fma3     (1<<14)| cpuflags_avx
+%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
 
 %assign cpuflags_cache32  (1<<16)
 %assign cpuflags_cache64  (1<<17)
index 7163976a9fbfd9ae9b9bb08e1cce70691f30f567..a348aa18c8df89f8ad76f5348b1f891f684864c5 100644 (file)
@@ -167,12 +167,12 @@ static void print_bench(void)
                 continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
 #if HAVE_MMX
-                    b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
                     b->cpu&X264_CPU_AVX2 ? "avx2" :
                     b->cpu&X264_CPU_FMA3 ? "fma3" :
                     b->cpu&X264_CPU_FMA4 ? "fma4" :
                     b->cpu&X264_CPU_XOP ? "xop" :
                     b->cpu&X264_CPU_AVX ? "avx" :
+                    b->cpu&X264_CPU_SSE42 ? "sse42" :
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
@@ -2651,7 +2651,7 @@ static int check_all_flags( void )
 #endif
         if( cpu_detect & X264_CPU_LZCNT )
         {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
@@ -2669,11 +2669,11 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
         cpu1 &= ~X264_CPU_SLOW_CTZ;
-    }
-    if( cpu_detect & X264_CPU_LZCNT )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
-        cpu1 &= ~X264_CPU_LZCNT;
+        if( cpu_detect & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
     }
     if( cpu_detect & X264_CPU_SSE3 )
     {
@@ -2693,9 +2693,16 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
         cpu1 &= ~X264_CPU_CACHELINE_64;
         cpu1 &= ~X264_CPU_SLOW_ATOM;
+        if( cpu_detect & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
     }
     if( cpu_detect & X264_CPU_SSE4 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+    if( cpu_detect & X264_CPU_SSE42 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" );
     if( cpu_detect & X264_CPU_AVX )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
     if( cpu_detect & X264_CPU_XOP )
@@ -2705,30 +2712,30 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
         cpu1 &= ~X264_CPU_FMA4;
     }
-    if( cpu_detect & X264_CPU_BMI1 )
+    if( cpu_detect & X264_CPU_FMA3 )
     {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
-        cpu1 &= ~X264_CPU_BMI1;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+        cpu1 &= ~X264_CPU_FMA3;
     }
     if( cpu_detect & X264_CPU_AVX2 )
     {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" );
         if( cpu_detect & X264_CPU_LZCNT )
         {
-            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" );
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
     }
+    if( cpu_detect & X264_CPU_BMI1 )
+    {
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
+        cpu1 &= ~X264_CPU_BMI1;
+    }
     if( cpu_detect & X264_CPU_BMI2 )
     {
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
         cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
     }
-    if( cpu_detect & X264_CPU_FMA3 )
-    {
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
-        cpu1 &= ~X264_CPU_FMA3;
-    }
 #elif ARCH_PPC
     if( cpu_detect & X264_CPU_ALTIVEC )
     {
diff --git a/x264.h b/x264.h
index b896f913d31ea7f7be02eafe09b6c2c790928416..e3b1d15b1c4121d46378211cc1088effaae97c88 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 142
+#define X264_BUILD 143
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -129,8 +129,8 @@ typedef struct
 #define X264_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
 #define X264_CPU_XOP             0x0000800  /* AMD XOP */
 #define X264_CPU_FMA4            0x0001000  /* AMD FMA4 */
-#define X264_CPU_AVX2            0x0002000  /* AVX2 */
-#define X264_CPU_FMA3            0x0004000  /* Intel FMA3 */
+#define X264_CPU_FMA3            0x0002000  /* FMA3 */
+#define X264_CPU_AVX2            0x0004000  /* AVX2 */
 #define X264_CPU_BMI1            0x0008000  /* BMI1 */
 #define X264_CPU_BMI2            0x0010000  /* BMI2 */
 /* x86 modifiers */