git.sesse.net Git - x264/commitdiff
Detect Atom CPU, enable appropriate asm functions
author: Fiona Glaser <fiona@x264.com>
Tue, 25 May 2010 23:13:59 +0000 (16:13 -0700)
committer: Fiona Glaser <fiona@x264.com>
Wed, 26 May 2010 09:06:43 +0000 (02:06 -0700)
I'm not going to actually optimize for this pile of garbage unless someone pays me.
But it can't hurt to at least enable the correct functions based on benchmarks.

Also save some cache on Intel CPUs that don't need the decimate LUT due to having fast bsr/bsf.

common/cpu.c
common/dct.c
common/pixel.c
common/quant.c
common/x86/mc-c.c
common/x86/quant-a.asm
common/x86/quant.h
encoder/macroblock.c
tools/checkasm.c
x264.h

index 6e0bfda31589d757a337eba1bead83a170175bf1..87e21c33547132022ef30dbeb499f64b91e51a2f 100644 (file)
@@ -64,6 +64,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"ARMv6", X264_CPU_ARMV6},
     {"NEON",  X264_CPU_NEON},
     {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
+    {"SlowCTZ", X264_CPU_SLOW_CTZ},
+    {"SlowAtom", X264_CPU_SLOW_ATOM},
     {"", 0},
 };
 
@@ -135,6 +137,7 @@ uint32_t x264_cpu_detect( void )
 
     if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
     {
+        cpu |= X264_CPU_SLOW_CTZ;
         x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
         if( edx&0x00400000 )
             cpu |= X264_CPU_MMXEXT;
@@ -145,6 +148,7 @@ uint32_t x264_cpu_detect( void )
                 cpu |= X264_CPU_SSE2_IS_FAST;
                 cpu |= X264_CPU_LZCNT;
                 cpu |= X264_CPU_SHUFFLE_IS_FAST;
+                cpu &= ~X264_CPU_SLOW_CTZ;
             }
             else
                 cpu |= X264_CPU_SSE2_IS_SLOW;
@@ -159,11 +163,9 @@ uint32_t x264_cpu_detect( void )
 
     if( !strcmp((char*)vendor, "GenuineIntel") )
     {
-        int family, model, stepping;
         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
-        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
-        stepping = eax&0xf;
+        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
         /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
          * theoretically support sse2, but it's significantly slower than mmx for
          * almost all of x264's functions, so let's just pretend they don't. */
@@ -172,6 +174,12 @@ uint32_t x264_cpu_detect( void )
             cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
             assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
         }
+        /* Detect Atom CPU */
+        if( family == 6 && model == 28 )
+        {
+            cpu |= X264_CPU_SLOW_ATOM;
+            cpu |= X264_CPU_SLOW_CTZ;
+        }
     }
 
     if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
index 3917510c9b3fbbcf43db6f67304e71a6763dc7bb..10fe2f77e4eb7d253662212ccd71629d0c7240c9 100644 (file)
@@ -457,7 +457,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
     }
 
-    if( cpu&X264_CPU_SSSE3 )
+    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
index 20c5170aebe132ec3aad7f84e736622b03bd9447..5759abf2bc9d61ac2ad964d2f17f163e57ac3ad8 100644 (file)
@@ -768,17 +768,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     if( cpu&X264_CPU_SSSE3 )
     {
-        INIT7( ssd, _ssse3 );
-        INIT7( satd, _ssse3 );
-        INIT7( satd_x3, _ssse3 );
-        INIT7( satd_x4, _ssse3 );
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
             INIT4( hadamard_ac, _ssse3 );
         }
         INIT_ADS( _ssse3 );
-        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
-        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
+        if( !(cpu&X264_CPU_SLOW_ATOM) )
+        {
+            INIT7( ssd, _ssse3 );
+            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
+            INIT7( satd, _ssse3 );
+            INIT7( satd_x3, _ssse3 );
+            INIT7( satd_x4, _ssse3 );
+        }
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
         pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
@@ -794,7 +797,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-        if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
         {
             INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
         }
index ce074e267798c434b5d46de90e2753face6aa9fe..8b1fc87f3d13980a819d46d9c8348be3e4f0a18a 100644 (file)
@@ -312,6 +312,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
         pf->decimate_score15 = x264_decimate_score15_mmxext;
         pf->decimate_score16 = x264_decimate_score16_mmxext;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+        }
         pf->decimate_score64 = x264_decimate_score64_mmxext;
         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
@@ -345,6 +350,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
         pf->decimate_score64 = x264_decimate_score64_sse2;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
+        }
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
@@ -369,6 +379,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->denoise_dct = x264_denoise_dct_ssse3;
         pf->decimate_score15 = x264_decimate_score15_ssse3;
         pf->decimate_score16 = x264_decimate_score16_ssse3;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
+        }
         pf->decimate_score64 = x264_decimate_score64_ssse3;
     }
 
index 0022b13b95a8d9132061c773572c5ae95ae78e5b..5906ddd41fdb329ced38f2f2aaf070232f598edd 100644 (file)
@@ -427,8 +427,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
 
     pf->weight = x264_mc_weight_wtab_sse2;
-    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
-    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+    if( !(cpu&X264_CPU_SLOW_ATOM) )
+    {
+        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+    }
 
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
@@ -481,7 +484,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->weight = x264_mc_weight_wtab_ssse3;
     }
 
-    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 
     if( !(cpu&X264_CPU_SSE4) )
index f9f78978530773dcd8ce9269828fa5516abb483e..d571cedd46281ff9b1c3396a4e5e963f306f6181 100644 (file)
@@ -583,9 +583,9 @@ DENOISE_DCT ssse3, 7
 cextern decimate_table4
 cextern decimate_table8
 
-%macro DECIMATE4x4 2
+%macro DECIMATE4x4 3
 
-;A LUT is faster than bsf on AMD processors, and no slower on Intel
+;A LUT is faster than bsf on AMD processors.
 ;This is not true for score64.
 cglobal decimate_score%1_%2, 1,3
 %ifdef PIC
@@ -605,6 +605,7 @@ cglobal decimate_score%1_%2, 1,3
 %if %1==15
     shr   edx, 1
 %endif
+%if %3==1
     movzx ecx, dl
     movzx eax, byte [mask_table + rcx]
     cmp   edx, ecx
@@ -617,8 +618,16 @@ cglobal decimate_score%1_%2, 1,3
     shr   edx, cl
     add    al, byte [table + rcx]
     add    al, byte [mask_table + rdx]
+%else
+.loop:
+    bsf   ecx, edx
+    shr   edx, cl
+    add    al, byte [table + rcx]
+    shr   edx, 1
+    jne  .loop
+%endif
 .ret:
-    REP_RET
+    RET
 .ret9:
     mov   eax, 9
     RET
@@ -627,14 +636,20 @@ cglobal decimate_score%1_%2, 1,3
 
 %ifndef ARCH_X86_64
 %define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext
-DECIMATE4x4 16, mmxext
+DECIMATE4x4 15, mmxext, 0
+DECIMATE4x4 16, mmxext, 0
+DECIMATE4x4 15, mmxext_slowctz, 1
+DECIMATE4x4 16, mmxext_slowctz, 1
 %endif
 %define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2
-DECIMATE4x4 15, ssse3
-DECIMATE4x4 16, sse2
-DECIMATE4x4 16, ssse3
+DECIMATE4x4 15, sse2, 0
+DECIMATE4x4 16, sse2, 0
+DECIMATE4x4 15, sse2_slowctz, 1
+DECIMATE4x4 16, sse2_slowctz, 1
+DECIMATE4x4 15, ssse3, 0
+DECIMATE4x4 16, ssse3, 0
+DECIMATE4x4 15, ssse3_slowctz, 1
+DECIMATE4x4 16, ssse3_slowctz, 1
 
 %macro DECIMATE8x8 1
 
index 4e42b8129ec25c926517dfd4c36c6ad2e96bd431..8d6a7625c187e01d1d99ff4e3e5f7fc9b7cbba7c 100644 (file)
@@ -57,6 +57,12 @@ int x264_decimate_score15_ssse3 ( int16_t *dct );
 int x264_decimate_score16_mmxext( int16_t *dct );
 int x264_decimate_score16_sse2  ( int16_t *dct );
 int x264_decimate_score16_ssse3 ( int16_t *dct );
+int x264_decimate_score15_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score15_sse2_slowctz  ( int16_t *dct );
+int x264_decimate_score15_ssse3_slowctz ( int16_t *dct );
+int x264_decimate_score16_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score16_sse2_slowctz  ( int16_t *dct );
+int x264_decimate_score16_ssse3_slowctz ( int16_t *dct );
 int x264_decimate_score64_mmxext( int16_t *dct );
 int x264_decimate_score64_sse2  ( int16_t *dct );
 int x264_decimate_score64_ssse3 ( int16_t *dct );
index d298c3e96e4374b9cd766934a01441bf1bdd10e4..3faf006d8666fb76f61f160270abfa6045be6762 100644 (file)
@@ -993,10 +993,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         /* calculate dct coeffs */
         for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
         {
-            /* We don't need to zero the DC coefficient before quantization because we already
-             * checked that all the DCs were zero above at twice the precision that quant4x4
-             * uses.  This applies even though the DC here is being quantized before the 2x2
-             * transform. */
+            dct4x4[i4x4][0] = 0;
             if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                 continue;
             h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
index 7aeb899fe25f0baf3059f52d99547f5b1d6b2fad..17ad22f6b8b16150b20929b695f08d510f060cdd 100644 (file)
@@ -173,7 +173,9 @@ static void print_bench(void)
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
                     b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
-                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+                    b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
+                    b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -1700,6 +1702,8 @@ static int check_all_flags( void )
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
             cpu1 &= ~X264_CPU_LZCNT;
         }
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
     }
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
@@ -1708,6 +1712,10 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
+        cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
@@ -1730,6 +1738,10 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
         cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
+        cpu1 &= ~X264_CPU_SLOW_CTZ;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+        cpu1 &= ~X264_CPU_SLOW_ATOM;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
     {
diff --git a/x264.h b/x264.h
index f714b726473282987f4eec42c05b131978d01bc5..6d7b70396f8616800792ba0026845b43aad46749 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -66,6 +66,8 @@ typedef struct x264_t x264_t;
 #define X264_CPU_ARMV6          0x020000
 #define X264_CPU_NEON           0x040000  /* ARM NEON */
 #define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ       0x100000  /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM      0x200000  /* The Atom just sucks */
 
 /* Analyse flags
  */