Faster mbtree propagate and x264_log2, less memory usage

[x264] / common / quant.c
diff --git a/common/quant.c b/common/quant.c
index ac798a25a8d58f901245079623b29845bf42e4ed..263fb7c1529cbbec42a34937290b86e4eca79c06 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -29,6 +29,9 @@
  #ifdef ARCH_PPC
  #   include "ppc/quant.h"
  #endif
+#ifdef ARCH_ARM
+#   include "arm/quant.h"
+#endif
  
  #define QUANT_ONE( coef, mf, f ) \
  { \
@@ -36,35 +39,41 @@
          (coef) = (f + (coef)) * (mf) >> 16; \
      else \
          (coef) = - ((f - (coef)) * (mf) >> 16); \
+    nz |= (coef); \
  }
  
-static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
  {
-    int i;
+    int i, nz = 0;
      for( i = 0; i < 64; i++ )
          QUANT_ONE( dct[0][i], mf[i], bias[i] );
+    return !!nz;
  }
  
-static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
  {
-    int i;
+    int i, nz = 0;
      for( i = 0; i < 16; i++ )
          QUANT_ONE( dct[0][i], mf[i], bias[i] );
+    return !!nz;
  }
  
-static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
  {
-    int i;
+    int i, nz = 0;
      for( i = 0; i < 16; i++ )
          QUANT_ONE( dct[0][i], mf, bias );
+    return !!nz;
  }
  
-static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
  {
+    int nz = 0;
      QUANT_ONE( dct[0][0], mf, bias );
      QUANT_ONE( dct[0][1], mf, bias );
      QUANT_ONE( dct[0][2], mf, bias );
      QUANT_ONE( dct[0][3], mf, bias );
+    return !!nz;
  }
  
  #define DEQUANT_SHL( x ) \
@@ -402,6 +411,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->decimate_score16 = x264_decimate_score16_ssse3;
          pf->decimate_score64 = x264_decimate_score64_ssse3;
      }
+
+    if( cpu&X264_CPU_SSE4 )
+    {
+        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
+        pf->quant_4x4 = x264_quant_4x4_sse4;
+        pf->quant_8x8 = x264_quant_8x8_sse4;
+    }
  #endif // HAVE_MMX
  
  #ifdef ARCH_PPC
@@ -415,6 +431,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->dequant_8x8 = x264_dequant_8x8_altivec;
      }
  #endif
+
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_ARMV6 )
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
+        pf->quant_4x4      = x264_quant_4x4_neon;
+        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
+        pf->quant_8x8      = x264_quant_8x8_neon;
+        pf->dequant_4x4    = x264_dequant_4x4_neon;
+        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
+        pf->dequant_8x8    = x264_dequant_8x8_neon;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+    }
+#endif
      pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
      pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
      pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];