GSOC merge part 1: Framework for ARM assembly optimizations

author David Conrad <lessen42@gmail.com>
Thu, 20 Aug 2009 00:03:02 +0000 (17:03 -0700)

committer Fiona Glaser <fiona@x264.com>
Thu, 20 Aug 2009 20:12:15 +0000 (13:12 -0700)

diff --git a/Makefile b/Makefile

index 563f185be1f6eb76267564c88a474d39aba197b3..725c919088e9dea29cc7697d554ea21b91a8386e 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -55,6 +55,14 @@ SRCS += $(ALTIVECSRC)
  $(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
  endif
  
+# NEON optims
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S
+OBJASM  = $(ASMSRC:%.S=%.o)
+endif
+endif
+
  # VIS optims
  ifeq ($(ARCH),UltraSparc)
  ASMSRC += common/sparc/pixel.asm
@@ -88,6 +96,10 @@ checkasm: tools/checkasm.o libx264.a
  
  %.o: %.asm
         $(AS) $(ASFLAGS) -o $@ $<
+
+%.o: %.S
+       $(AS) $(ASFLAGS) -o $@ $<
+
  # delete local/anonymous symbols, so they don't show up in oprofile
         -@ $(STRIP) -x $@
  
diff --git a/common/cpu.c b/common/cpu.c

index 860cd9574b6b02a2982a323fff74e3302bb788cb..757163d24b73c74c7233ae378f2340fa5cd49290 100644 (file)
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
      {"SSEMisalign", X264_CPU_SSE_MISALIGN},
      {"LZCNT", X264_CPU_LZCNT},
      {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+    {"ARMv6", X264_CPU_ARMV6},
+    {"NEON",  X264_CPU_NEON},
+    {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
      {"", 0},
  };
  
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig )
+{
+    if( !canjump ) // SIGILL arrived outside a probe: there is no valid jump target
+    {
+        signal( sig, SIG_DFL );
+        raise( sig );
+        return; // raise() may return while sig is still blocked; do not longjmp through an unset jmpbuf
+    }
+    canjump = 0;
+    siglongjmp( jmpbuf, 1 ); // resumes at the matching sigsetjmp(), which then returns 1
+}
+#endif
  
  #ifdef HAVE_MMX
  extern int  x264_cpu_cpuid_test( void );
@@ -224,22 +245,6 @@ uint32_t x264_cpu_detect( void )
  }
  
  #elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
-    if( !canjump )
-    {
-        signal( sig, SIG_DFL );
-        raise( sig );
-    }
-
-    canjump = 0;
-    siglongjmp( jmpbuf, 1 );
-}
  
  uint32_t x264_cpu_detect( void )
  {
@@ -265,6 +270,48 @@ uint32_t x264_cpu_detect( void )
  }
  #endif
  
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test( void );
+int x264_cpu_fast_neon_mrc_test( void );
+/* Returns the X264_CPU_* mask for ARM: ARMv6 (only if built with HAVE_ARMV6), then NEON and fast NEON->ARM mrc. */
+uint32_t x264_cpu_detect( void )
+{
+    uint32_t flags = 0;
+#ifdef HAVE_ARMV6
+    flags |= X264_CPU_ARMV6;
+
+    // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+    static void (* oldsig)( int ); // previous SIGILL handler, restored on both exits below
+    oldsig = signal( SIGILL, sigill_handler );
+    if( sigsetjmp( jmpbuf, 1 ) ) // nonzero: sigill_handler jumped back here, i.e. the NEON test trapped
+    {
+        signal( SIGILL, oldsig );
+        return flags; // ARMv6 only
+    }
+
+    canjump = 1;
+    x264_cpu_neon_test(); // expected to raise SIGILL when NEON is unavailable
+    canjump = 0;
+    signal( SIGILL, oldsig );
+#endif
+
+    flags |= X264_CPU_NEON;
+
+    // fast neon -> arm (Cortex-A9) detection relies on user access to the
+    // cycle counter; this assumes ARMv7 performance counters.
+    // NEON requires at least ARMv7, ARMv8 may require changes here, but
+    // hopefully this hacky detection method will have been replaced by then.
+    // Note that there is potential for a race condition if another program or
+    // x264 instance disables or reinits the counters while x264 is using them,
+    // which may result in incorrect detection and the counters stuck enabled.
+    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+    return flags;
+}
+
  #else
  
  uint32_t x264_cpu_detect( void )
diff --git a/common/osdep.h b/common/osdep.h

index 57642dc8599544d8fe331f32e3a2f0f6ce94618e..a691d06dcd293dcbcca77d73ca38ab0ffb3fff9f 100644 (file)
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -163,6 +163,13 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
      asm("bswap %0":"+r"(x));
      return x;
  }
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+{
+    asm("rev %0, %0":"+r"(x)); // REV reverses the byte order of the 32-bit register; "+r" makes x both input and output
+    return x; // byte-swapped value
+}
+#define endian_fix32 endian_fix
  #else
  static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
  {
diff --git a/configure b/configure

index 2e7360addcde444750c8236a532c4de60fdd77af..a4af82ad88822c50a56b04caf532c3abf17604f6 100755 (executable)
--- a/configure
+++ b/configure
@@ -10,7 +10,7 @@ echo "  --help                   print this message"
  echo "  --disable-avis-input     disables avisynth input (win32 only)"
  echo "  --disable-mp4-output     disables mp4 output (using gpac)"
  echo "  --disable-pthread        disables multithreaded encoding"
-echo "  --disable-asm            disables assembly optimizations on x86"
+echo "  --disable-asm            disables assembly optimizations on x86 and arm"
  echo "  --enable-debug           adds -g, doesn't strip"
  echo "  --enable-gprof           adds -pg, doesn't strip"
  echo "  --enable-visualize       enables visualization (X11 only)"
@@ -157,7 +157,6 @@ CC="${CC-${cross_prefix}gcc}"
  AR="${AR-${cross_prefix}ar}"
  RANLIB="${RANLIB-${cross_prefix}ranlib}"
  STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
  
  if [ "x$host" = x ]; then
      host=`./config.guess`
@@ -286,6 +285,7 @@ case $host_cpu in
      ;;
    arm*)
      ARCH="ARM"
+    AS="${AS-${cross_prefix}gcc}"
      ;;
    s390|s390x)
      ARCH="S390"
@@ -324,6 +324,17 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
      fi
      CFLAGS="$CFLAGS -DHAVE_MMX"
  fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then
+    if  cc_check '' '' 'asm("rev r0, r0");' ; then      CFLAGS="$CFLAGS -DHAVE_ARMV6"
+        cc_check '' '' 'asm("movt r0, #0");'         && CFLAGS="$CFLAGS -DHAVE_ARMV6T2"
+        cc_check '' '' 'asm("vadd.i16 q0, q0, q0");' && CFLAGS="$CFLAGS -DHAVE_NEON"
+        ASFLAGS="$ASFLAGS $CFLAGS -c"
+    else
+        asm="no"
+    fi
+fi
+
  [ $asm = no ] && AS=""
  [ "x$AS" = x ] && asm="no"
  
diff --git a/tools/checkasm.c b/tools/checkasm.c

index c2bd738a5dd8ce01f98f5bed7ba1d5066ab6b5a9..36fd7340f823dbd0e08e93ab955fb5d564de8e8b 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,12 @@
  #include "common/common.h"
  #include "common/cpu.h"
  
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef DECLARE_ALIGNED_16
+#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
  /* buf1, buf2: initialised to random data and shouldn't write into them */
  uint8_t * buf1, * buf2;
  /* buf3, buf4: used to store output */
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
  
  static inline uint32_t read_time(void)
  {
+    uint32_t a = 0;
  #if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
-    uint32_t a;
      asm volatile( "rdtsc" :"=a"(a) ::"edx" );
-    return a;
  #elif defined(ARCH_PPC)
-    uint32_t a;
      asm volatile( "mftb %0" : "=r" (a) );
-    return a;
-#else
-    return 0;
+#elif defined(ARCH_ARM)     // ARMv7 only
+    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
  #endif
+    return a;
  }
  
  static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
                      b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                      b->cpu&X264_CPU_SSE2 ? "sse2" :
                      b->cpu&X264_CPU_MMX ? "mmx" :
-                    b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+                    b->cpu&X264_CPU_NEON ? "neon" :
+                    b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                      b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                      b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                      b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
-                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
                      ((int64_t)10*b->cycles/b->den - nop_time)/4 );
          }
  }
@@ -1580,6 +1587,13 @@ static int check_all_flags( void )
          fprintf( stderr, "x264: ALTIVEC against C\n" );
          ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
      }
+#elif ARCH_ARM
+    if( x264_cpu_detect() & X264_CPU_ARMV6 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+    if( x264_cpu_detect() & X264_CPU_NEON )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
  #endif
      return ret;
  }
@@ -1591,7 +1605,7 @@ int main(int argc, char *argv[])
  
      if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
      {
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
          fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
          return 1;
  #endif
diff --git a/x264.h b/x264.h

index 37a643cb9ddfebbb9f650b09c0c9d3f04ebcca10..7fa508d1eec21675957039497d2232b6ff52d1d6 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
  #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
  #define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
  #define X264_CPU_LZCNT          0x010000  /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6          0x020000
+#define X264_CPU_NEON           0x040000  /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
  
  /* Analyse flags
   */
author	David Conrad <lessen42@gmail.com>
	Thu, 20 Aug 2009 00:03:02 +0000 (17:03 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Thu, 20 Aug 2009 20:12:15 +0000 (13:12 -0700)
Makefile		patch \| blob \| history
common/cpu.c		patch \| blob \| history
common/osdep.h		patch \| blob \| history
configure		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history
x264.h		patch \| blob \| history