I'm not going to actually optimize for this pile of garbage unless someone pays me.
But it can't hurt to at least enable the correct functions based on benchmarks.
Also save some cache on Intel CPUs, which don't need the decimate LUT since their bsr/bsf are fast.
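
For context, the scoring loop that fast-bsf CPUs end up with can be modeled in C roughly as follows. This is a minimal sketch with hypothetical names (decimate_score16_model, run_cost), not the shipped implementation, which is the assembly further down and runs only after the coefficient levels have been checked to be +/-1:

#include <stdint.h>
#include <stdio.h>

/* x264's decimate_table4: the cost of a run of N zeros before a coefficient;
 * runs of 6 or more zeros are free. The mask has at most 16 bits here, so
 * run <= 15 and the table cannot be overrun. */
static const uint8_t run_cost[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };

static int decimate_score16_model( uint32_t mask )
{
    int score = 0;
    while( mask )
    {
        int run = __builtin_ctz( mask ); /* compiles to the bsf mentioned above */
        score += run_cost[run];
        mask >>= run + 1;                /* consume the zero run and the coeff */
    }
    return score;
}

int main( void )
{
    /* nonzero coefficients at scan positions 0, 3 and 4 */
    printf( "%d\n", decimate_score16_model( 0x19 ) ); /* 3 + 2 + 3 = 8 */
    return 0;
}

On SLOW_CTZ CPUs the same walk goes through byte LUTs instead of bsf; those tables are exactly the cache footprint this change keeps off Intel chips.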
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
+ {"SlowCTZ", X264_CPU_SLOW_CTZ},
+ {"SlowAtom", X264_CPU_SLOW_ATOM},
{"", 0},
};
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
{
+ cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x00400000 )
cpu |= X264_CPU_MMXEXT;
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
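+ /* Chips new enough for these capabilities also have fast bsr/bsf,
+ * so undo the slow-ctz default set above. */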
+ cpu &= ~X264_CPU_SLOW_CTZ;
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
if( !strcmp((char*)vendor, "GenuineIntel") )
{
- int family, model, stepping;
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
- model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
- stepping = eax&0xf;
+ int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+ int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
* theoretically support sse2, but it's significantly slower than mmx for
* almost all of x264's functions, so let's just pretend they don't. */
if( family == 6 && (model == 9 || model == 13 || model == 14) )
{
cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
}
+ /* Detect Atom CPU (family 6, model 28: the first-generation Bonnell core) */
+ if( family == 6 && model == 28 )
+ {
+ cpu |= X264_CPU_SLOW_ATOM;
+ cpu |= X264_CPU_SLOW_CTZ;
+ }
}
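
The family/model arithmetic above merges CPUID's base and extended signature fields. A standalone sketch of the same decoding (decode_signature is a hypothetical helper): note that ((eax>>12)&0xf0) equals (((eax>>16)&0xf)<<4), the extended model shifted into the high nibble, and that strictly the extended family only counts when the base family is 0xf; adding it unconditionally is harmless here because it is zero for these parts.

#include <stdint.h>
#include <stdio.h>

/* Decode family/model from CPUID leaf 1's EAX, same math as above. */
static void decode_signature( uint32_t eax, int *family, int *model )
{
    *family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
    *model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
}

int main( void )
{
    int family, model;
    decode_signature( 0x106c2, &family, &model ); /* an Atom N270-class signature */
    printf( "family %d, model %d\n", family, model ); /* 6, 28 -> SLOW_ATOM */
    return 0;
}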
if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
- if( cpu&X264_CPU_SSSE3 )
+ if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
if( cpu&X264_CPU_SSSE3 )
{
- INIT7( ssd, _ssse3 );
- INIT7( satd, _ssse3 );
- INIT7( satd_x3, _ssse3 );
- INIT7( satd_x4, _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3 );
}
INIT_ADS( _ssse3 );
- pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
- pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
+ {
+ INIT7( ssd, _ssse3 );
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ INIT7( satd, _ssse3 );
+ INIT7( satd_x3, _ssse3 );
+ INIT7( satd_x4, _ssse3 );
+ }
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
+ if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
pf->decimate_score15 = x264_decimate_score15_mmxext;
pf->decimate_score16 = x264_decimate_score16_mmxext;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+ }
pf->decimate_score64 = x264_decimate_score64_mmxext;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
+ }
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
+ if( cpu&X264_CPU_SLOW_CTZ )
+ {
+ pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
+ pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
+ }
pf->decimate_score64 = x264_decimate_score64_ssse3;
}
return;
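
The hunks above are plain function-pointer dispatch in which later, more specific assignments overwrite earlier ones, so the SLOW_CTZ override is applied last within each ISA tier. A minimal self-contained sketch of that shape, all names hypothetical:

#include <stdint.h>
#include <stdio.h>

#define CPU_SSE2     0x1
#define CPU_SLOW_CTZ 0x2

typedef int (*decimate_fn)( int16_t *dct );

static int score16_c( int16_t *dct )            { (void)dct; return 1; }
static int score16_sse2( int16_t *dct )         { (void)dct; return 2; }
static int score16_sse2_slowctz( int16_t *dct ) { (void)dct; return 3; }

typedef struct { decimate_fn decimate_score16; } quant_funcs;

/* Generic pointer first, then each refinement overwrites the slot. */
static void quant_init( uint32_t cpu, quant_funcs *pf )
{
    pf->decimate_score16 = score16_c;
    if( cpu & CPU_SSE2 )
    {
        pf->decimate_score16 = score16_sse2;
        if( cpu & CPU_SLOW_CTZ )
            pf->decimate_score16 = score16_sse2_slowctz;
    }
}

int main( void )
{
    quant_funcs pf;
    quant_init( CPU_SSE2|CPU_SLOW_CTZ, &pf );
    printf( "%d\n", pf.decimate_score16( (int16_t[16]){0} ) ); /* prints 3 */
    return 0;
}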
pf->weight = x264_mc_weight_wtab_sse2;
- pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
- pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+ if( !(cpu&X264_CPU_SLOW_ATOM) )
+ {
+ pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+ pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+ }
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->weight = x264_mc_weight_wtab_ssse3;
}
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_SSE4) )
cextern decimate_table4
cextern decimate_table8
-%macro DECIMATE4x4 2
+%macro DECIMATE4x4 3
-;A LUT is faster than bsf on AMD processors, and no slower on Intel
+;A LUT is faster than bsf on AMD processors, so %3==1 selects the LUT path.
;This is not true for score64.
cglobal decimate_score%1_%2, 1,3
%ifdef PIC
%if %1==15
shr edx, 1
%endif
+%if %3==1
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
+%else
+.loop:
+ bsf ecx, edx
+ shr edx, cl
+ add al, byte [table + rcx]
+ shr edx, 1
+ jne .loop
+%endif
.ret:
- REP_RET
+ RET
.ret9:
mov eax, 9
RET
%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext
-DECIMATE4x4 16, mmxext
+DECIMATE4x4 15, mmxext, 0
+DECIMATE4x4 16, mmxext, 0
+DECIMATE4x4 15, mmxext_slowctz, 1
+DECIMATE4x4 16, mmxext_slowctz, 1
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2
-DECIMATE4x4 15, ssse3
-DECIMATE4x4 16, sse2
-DECIMATE4x4 16, ssse3
+DECIMATE4x4 15, sse2, 0
+DECIMATE4x4 16, sse2, 0
+DECIMATE4x4 15, sse2_slowctz, 1
+DECIMATE4x4 16, sse2_slowctz, 1
+DECIMATE4x4 15, ssse3, 0
+DECIMATE4x4 16, ssse3, 0
+DECIMATE4x4 15, ssse3_slowctz, 1
+DECIMATE4x4 16, ssse3_slowctz, 1
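
Passing the strategy as a macro parameter stamps out each variant as its own symbol at assembly time, so there is no per-call branch. A C sketch of the same instantiation pattern, under the same hypothetical names as the earlier sketch:

#include <stdint.h>
#include <stdio.h>

static const uint8_t run_cost[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };

/* LUT-based ctz for nonzero x, standing in for the byte tables the
 * slowctz asm uses; the nibble loop replaces bsf entirely. */
static int ctz_lut( uint32_t x )
{
    static const uint8_t t[16] = { 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 };
    int n = 0;
    while( !(x & 0xf) ) { x >>= 4; n += 4; }
    return n + t[x & 0xf];
}

/* One template, expanded once per ctz strategy, mirroring DECIMATE4x4's %3. */
#define DEF_DECIMATE( name, CTZ )          \
static int name( uint32_t mask )           \
{                                          \
    int score = 0;                         \
    while( mask )                          \
    {                                      \
        int run = CTZ( mask );             \
        score += run_cost[run];            \
        mask >>= run + 1;                  \
    }                                      \
    return score;                          \
}

DEF_DECIMATE( decimate_score16_fast,    __builtin_ctz )
DEF_DECIMATE( decimate_score16_slowctz, ctz_lut )

int main( void )
{
    printf( "%d %d\n", decimate_score16_fast( 0x19 ),
                       decimate_score16_slowctz( 0x19 ) ); /* both print 8 */
    return 0;
}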
%macro DECIMATE8x8 1
int x264_decimate_score16_mmxext( int16_t *dct );
int x264_decimate_score16_sse2 ( int16_t *dct );
int x264_decimate_score16_ssse3 ( int16_t *dct );
+int x264_decimate_score15_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score15_sse2_slowctz ( int16_t *dct );
+int x264_decimate_score15_ssse3_slowctz ( int16_t *dct );
+int x264_decimate_score16_mmxext_slowctz( int16_t *dct );
+int x264_decimate_score16_sse2_slowctz ( int16_t *dct );
+int x264_decimate_score16_ssse3_slowctz ( int16_t *dct );
int x264_decimate_score64_mmxext( int16_t *dct );
int x264_decimate_score64_sse2 ( int16_t *dct );
int x264_decimate_score64_ssse3 ( int16_t *dct );
/* calculate dct coeffs */
for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
- /* We don't need to zero the DC coefficient before quantization because we already
- * checked that all the DCs were zero above at twice the precision that quant4x4
- * uses. This applies even though the DC here is being quantized before the 2x2
- * transform. */
+ dct4x4[i4x4][0] = 0;
if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
- b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
+ b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
+ b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
+ cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
+ cpu1 &= ~X264_CPU_SLOW_CTZ;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
+ cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
#define X264_CPU_ARMV6 0x020000
#define X264_CPU_NEON 0x040000 /* ARM NEON */
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
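
Since each flag is a single bit in one integer, tools like checkasm can report the active ones by walking a name table of the same shape as the one extended at the top of this patch. A minimal standalone sketch, reusing only the two flag values defined above:

#include <stdint.h>
#include <stdio.h>

#define X264_CPU_SLOW_CTZ  0x100000
#define X264_CPU_SLOW_ATOM 0x200000

typedef struct { const char *name; uint32_t flags; } cpu_name_t;

static const cpu_name_t cpu_names[] =
{
    { "SlowCTZ",  X264_CPU_SLOW_CTZ },
    { "SlowAtom", X264_CPU_SLOW_ATOM },
    { "", 0 }, /* sentinel, as in the table above */
};

int main( void )
{
    uint32_t cpu = X264_CPU_SLOW_CTZ | X264_CPU_SLOW_ATOM; /* pretend Atom */
    for( int i = 0; cpu_names[i].flags; i++ )
        if( (cpu & cpu_names[i].flags) == cpu_names[i].flags )
            printf( "%s ", cpu_names[i].name );
    printf( "\n" );
    return 0;
}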
/* Analyse flags
*/