Add more inline asm and a runtime check for MMXEXT support

author Fiona Glaser <fiona@x264.com>

Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)

committer Fiona Glaser <fiona@x264.com>

Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)
author Fiona Glaser <fiona@x264.com>
Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)
committer Fiona Glaser <fiona@x264.com>
Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)
diff --git a/common/common.h b/common/common.h

index aaf584a2de333da925cba22ba0cc3b0ba86166d0..04f5243e9d55ef42ccfc5d22bb6c21af51aae20f 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -141,10 +141,6 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
      return sum;
  }
  
-#ifdef HAVE_MMX
-#include "x86/util.h"
-#endif
-
  /****************************************************************************
   *
   ****************************************************************************/
@@ -595,5 +591,9 @@ struct x264_t
  // included at the end because it needs x264_t
  #include "macroblock.h"
  
+#ifdef HAVE_MMX
+#include "x86/util.h"
+#endif
+
  #endif
  
diff --git a/common/macroblock.h b/common/macroblock.h

index 2766ddd0be66bafb001874276ff5c953c588c9d2..660978a9b15145f6199e8a054da43fc37d2a373b 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -356,6 +356,38 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
      int8_t *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y];
      cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
  }
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a)) /* sizeof(a) supplies a byte count, so a must be a real array, not a pointer */
+#define array_non_zero_int array_non_zero_int_c /* default mapping: the plain C implementation below */
+static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count ) /* returns 1 if any 64-bit word in the i_count bytes at v is non-zero, else 0 */
+{
+    uint64_t *x = v; /* the buffer is examined as 64-bit words; NOTE(review): presumably callers pass suitably aligned arrays -- confirm */
+    if(i_count == 8) /* 8, 16 and 32 bytes are special-cased as a single OR expression without a loop */
+        return !!x[0];
+    else if(i_count == 16)
+        return !!(x[0]|x[1]);
+    else if(i_count == 32)
+        return !!(x[0]|x[1]|x[2]|x[3]);
+    else
+    {
+        int i;
+        i_count /= sizeof(uint64_t); /* bytes -> whole 64-bit words; a trailing i_count % 8 bytes is not examined */
+        for( i = 0; i < i_count; i++ )
+            if( x[i] ) return 1; /* stop at the first non-zero word */
+        return 0;
+    }
+}
+/* Returns how many of the 16 int16_t entries at v are non-zero. This function and its MMX version only work on arrays of size 16 */
+static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
+{
+    int i;
+    int i_nz;
+
+    for( i = 0, i_nz = 0; i < 16; i++ ) /* the length is fixed at 16; there is no count parameter */
+        if( v[i] )
+            i_nz++;
+
+    return i_nz;
+}
  
  #endif
  
diff --git a/common/x86/util.h b/common/x86/util.h

index 59d174950dd37c915d35621cfd42867454c79eb8..e100a4e4d3e10e8b4d9565659566f8628a4c6e94 100644 (file)
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -65,11 +65,67 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
          "paddusw %%mm0, %%mm4 \n"
          "jg 1b                \n"
          "movq    %%mm4, %0    \n"
-        :"=m"(output), "+r"(i_mvc), "+r"(mvc)
+        :"=m"(output), "+r"(i_mvc)
+        :"r"(mvc)
      );
      sum += output[0] + output[1] + output[2] + output[3];
      return sum;
  }
+#define array_non_zero_count array_non_zero_count_mmx /* uses of array_non_zero_count after this point resolve to the MMX version */
+static inline int array_non_zero_count_mmx( int16_t *v ) /* returns how many of the 16 int16_t entries at v (32 bytes) are non-zero */
+{
+    static const uint64_t pw_2 = 0x0202020202020202ULL; /* the value 2 in each of the 8 byte lanes (a per-byte constant despite the pw_ name) */
+    int count;
+    asm(
+        "pxor     %%mm7,  %%mm7 \n" /* mm7 = 0 */
+        "movq     (%1),   %%mm0 \n" /* mm0 = v[0..3] */
+        "movq     16(%1), %%mm1 \n" /* mm1 = v[8..11] */
+        "packsswb 8(%1),  %%mm0 \n" /* mm0 = v[0..7] as signed-saturated bytes; a non-zero word stays a non-zero byte */
+        "packsswb 24(%1), %%mm1 \n" /* mm1 = v[8..15] as signed-saturated bytes */
+        "pcmpeqb  %%mm7,  %%mm0 \n" /* byte = 0xFF (-1) where the coefficient was zero, 0 otherwise */
+        "pcmpeqb  %%mm7,  %%mm1 \n"
+        "paddb    %%mm0,  %%mm1 \n" /* per lane: minus the number of zeros among v[k] and v[k+8] */
+        "paddb    %2,     %%mm1 \n" /* per lane: 2 - zeros = number of non-zeros (0..2) */
+        "psadbw   %%mm7,  %%mm1 \n" /* sum of absolute differences against 0 = total of the 8 lanes */
+        "movd     %%mm1,  %0    \n" /* low 32 bits of the sum -> count */
+        :"=r"(count)
+        :"r"(v), "m"(pw_2) /* NOTE(review): the loads through %1 are not declared as a memory input, and there is no "memory" clobber, volatile or mm-register clobber list -- confirm the compiler cannot move stores to v past this asm */
+    );
+    return count;
+}
+#undef array_non_zero_int /* remove the existing array_non_zero_int mapping before redefining it */
+#define array_non_zero_int array_non_zero_int_mmx /* array_non_zero() now expands to the MMX-aware version */
+static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count ) /* returns 1 if the i_count bytes at v contain a non-zero word, else 0; only the 128-byte case uses MMX */
+{
+    if(i_count == 128) /* 128 bytes = 16 quadwords, ORed together below */
+    {
+        int nonzero;
+        asm(
+            "movq     (%1),    %%mm0 \n" /* mm0 = first quadword */
+            "por      8(%1),   %%mm0 \n" /* OR in the remaining 15 quadwords */
+            "por      16(%1),  %%mm0 \n"
+            "por      24(%1),  %%mm0 \n"
+            "por      32(%1),  %%mm0 \n"
+            "por      40(%1),  %%mm0 \n"
+            "por      48(%1),  %%mm0 \n"
+            "por      56(%1),  %%mm0 \n"
+            "por      64(%1),  %%mm0 \n"
+            "por      72(%1),  %%mm0 \n"
+            "por      80(%1),  %%mm0 \n"
+            "por      88(%1),  %%mm0 \n"
+            "por      96(%1),  %%mm0 \n"
+            "por      104(%1), %%mm0 \n"
+            "por      112(%1), %%mm0 \n"
+            "por      120(%1), %%mm0 \n"
+            "packsswb %%mm0,   %%mm0 \n" /* fold 64 bits into 32: signed saturation keeps every non-zero word a non-zero byte */
+            "movd     %%mm0,   %0    \n" /* low 32 bits -> nonzero */
+            :"=r"(nonzero)
+            :"r"(v) /* NOTE(review): the 128 bytes read through %1 are not declared as a memory input, and there is no "memory" clobber or volatile -- confirm the compiler cannot move stores to v past this asm */
+        );
+        return !!nonzero; /* normalise to 0 or 1 */
+    }
+    else return array_non_zero_int_c( v, i_count ); /* every other size uses the C implementation */
+}
  #endif
  
  #endif
diff --git a/configure b/configure

index 2cb9bad1382a7a678bed731f9219f45fb5e61fd0..29e1b25bc047448128cf51b7a8a5b94ce4f15b32 100755 (executable)
--- a/configure
+++ b/configure
@@ -7,10 +7,11 @@ echo ""
  echo "available options:"
  echo ""
  echo "  --help                   print this message"
-echo "  --enable-avis-input      enables avisynth input (win32 only)"
-echo "  --enable-mp4-output      enables mp4 output (using gpac)"
+echo "  --disable-avis-input     disables avisynth input (win32 only)"
+echo "  --disable-mp4-output     disables mp4 output (using gpac)"
+echo "  --disable-pthread        disables multithreaded encoding"
+echo "  --disable-asm            disables assembly optimizations on x86"
  echo "  --enable-gtk             build GTK+ interface"
-echo "  --enable-pthread         enables multithreaded encoding"
  echo "  --enable-debug           adds -g, doesn't strip"
  echo "  --enable-gprof           adds -pg, doesn't strip"
  echo "  --enable-visualize       enables visualization (X11 only)"
@@ -53,6 +54,7 @@ DEVNULL='/dev/null'
  avis_input="auto"
  mp4_output="auto"
  pthread="auto"
+asm="yes"
  debug="no"
  gprof="no"
  pic="no"
@@ -102,6 +104,12 @@ for opt do
          --includedir=*)
              includedir="$optarg"
              ;;
+        --enable-asm)
+            asm="yes"
+            ;;
+        --disable-asm)
+            asm="no"
+            ;;
          --enable-avis-input)
              avis_input="yes"
              ;;
@@ -300,7 +308,7 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \)
      pic="yes"
  fi
  
-if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
+if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
      if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
           "`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
           echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
@@ -309,10 +317,12 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
      if as_check "pabsw xmm0, xmm0" ; then
          CFLAGS="$CFLAGS -DHAVE_MMX"
      else
-        echo "No suitable assembler found.  x264 will be several times slower."
-        echo "Please install 'yasm' to get MMX/SSE optimized code."
-        AS=""
+        echo "No suitable assembler found.  Install 'yasm' to get MMX/SSE optimized code."
+        echo "If you really want to compile without asm, configure with --disable-asm."
+        exit 1
      fi
+else
+    AS=""
  fi
  
  CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
@@ -482,6 +492,7 @@ EOF
  
  echo "Platform:   $ARCH"
  echo "System:     $SYS"
+echo "asm:        $asm"
  echo "avis input: $avis_input"
  echo "mp4 output: $mp4_output"
  echo "pthread:    $pthread"
diff --git a/encoder/cavlc.c b/encoder/cavlc.c

index 6213654751a9228e559ac26a6df20dbb6161b9f3..7d5bc84c6af4660a867c7e51c761ebe09b24cd5a 100644 (file)
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -301,7 +301,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
          if( h->mb.i_cbp_luma & (1 << i8) )
              for( i4 = 0; i4 < 4; i4++ )
              {
-                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
                  block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
              }
  }
@@ -657,7 +657,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          if( h->mb.i_cbp_luma != 0 )
              for( i = 0; i < 16; i++ )
              {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
              }
      }
@@ -674,7 +674,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
              for( i = 16; i < 24; i++ )
              {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
              }
      }
@@ -741,9 +741,9 @@ int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
      for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
      {
          x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
          block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
          block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
@@ -768,7 +768,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
          for( i = 0; i < 16; i++ )
              h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
          h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] =
-            array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+            array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
          block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
      }
      return h->out.bs.i_bits_encoded;
@@ -794,7 +794,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
              int i;
              for( i = 16; i < 24; i++ )
              {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
              }
          }
diff --git a/encoder/encoder.c b/encoder/encoder.c

index cffaeeb6aa9f0d2aedb6e82c1f9908f5b200cce6..533e8a830d65c3ad96ecb447081161981e1e9804 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -300,6 +300,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
  
  static int x264_validate_parameters( x264_t *h )
  {
+#ifdef HAVE_MMX
+    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    {
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+        return -1;
+    }
+#endif
      if( h->param.i_width <= 0 || h->param.i_height <= 0 )
      {
          x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
diff --git a/encoder/macroblock.h b/encoder/macroblock.h

index e80995d785ce446efa7171bc4a09cf23e3386f09..5ac58349ce7d4ac06310f86849e42c8dc90047d0 100644 (file)
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -54,29 +54,5 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
  void x264_noise_reduction_update( x264_t *h );
  void x264_denoise_dct( x264_t *h, int16_t *dct );
  
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-static inline int array_non_zero_int( void *v, int i_count )
-{
-    int i;
-    uint64_t *x = v;
-    i_count /= sizeof(uint64_t);
-    for( i = 0; i < i_count; i++ )
-        if( x[i] ) return 1;
-    return 0;
-}
-
-static inline int array_non_zero_count( int16_t *v, int i_count )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < i_count; i++ )
-        if( v[i] )
-            i_nz++;
-
-    return i_nz;
-}
-
-
  #endif
author	Fiona Glaser <fiona@x264.com>
	Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)
committer	Fiona Glaser <fiona@x264.com>
	Sun, 15 Jun 2008 17:59:25 +0000 (11:59 -0600)
common/common.h		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/x86/util.h		patch \| blob \| history
configure		patch \| blob \| history
encoder/cavlc.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/macroblock.h		patch \| blob \| history