x264 detects which ARM core it is being built for and builds the NEON asm only if the target is ARMv6 or above; NEON is then enabled at runtime by CPU detection.
$(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
endif
+# NEON optims: the ARM asm sources are only added when an assembler (AS) is configured
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S
+OBJASM = $(ASMSRC:%.S=%.o)
+endif
+endif
+
# VIS optims
ifeq ($(ARCH),UltraSparc)
ASMSRC += common/sparc/pixel.asm
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
+	-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
+
+%.o: %.S # ARM asm sources; the strip recipe line that follows this hunk now belongs to this rule
+	$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
-@ $(STRIP) -x $@
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+ {"ARMv6", X264_CPU_ARMV6},
+ {"NEON", X264_CPU_NEON},
+ {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"", 0},
};
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig ) // signal handler installed for SIGILL while a CPU feature probe runs
+{
+ if( !canjump ) // no probe is armed, so this signal was not caused by us
+ {
+ signal( sig, SIG_DFL ); // put the default action back ...
+ raise( sig ); // ... and deliver the signal again
+ }
+ // NOTE(review): if raise() returns, control falls through to siglongjmp() below -- confirm jmpbuf is valid on that path
+ canjump = 0; // disarm before leaving the handler
+ siglongjmp( jmpbuf, 1 ); // resume at the matching sigsetjmp(), which then returns 1
+}
+#endif
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
}
#elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
- if( !canjump )
- {
- signal( sig, SIG_DFL );
- raise( sig );
- }
-
- canjump = 0;
- siglongjmp( jmpbuf, 1 );
-}
uint32_t x264_cpu_detect( void )
{
}
#endif
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test();
+int x264_cpu_fast_neon_mrc_test();
+
+uint32_t x264_cpu_detect( void ) // ARM variant: returns the X264_CPU_* flags to use on this CPU
+{
+ int flags = 0;
+#ifdef HAVE_ARMV6
+ flags |= X264_CPU_ARMV6; // decided at build time: set whenever HAVE_ARMV6 is defined
+ // everything down to the matching #endif is skipped for builds without HAVE_ARMV6
+ // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+ static void (* oldsig)( int ); // previous SIGILL handler; presumably static so its value survives siglongjmp -- TODO confirm
+ oldsig = signal( SIGILL, sigill_handler );
+ if( sigsetjmp( jmpbuf, 1 ) ) // nonzero only when sigill_handler jumped back here
+ {
+ signal( SIGILL, oldsig ); // probe raised SIGILL: restore the old handler and report no NEON
+ return flags;
+ }
+ // arm the handler only for the duration of the probe call
+ canjump = 1;
+ x264_cpu_neon_test(); // external probe; presumably executes a NEON instruction -- TODO confirm
+ canjump = 0;
+ signal( SIGILL, oldsig );
+#endif
+ // reached when the probe returned normally, or unconditionally when built with HAVE_NEON
+ flags |= X264_CPU_NEON;
+
+ // fast neon -> arm (Cortex-A9) detection relies on user access to the
+ // cycle counter; this assumes ARMv7 performance counters.
+ // NEON requires at least ARMv7, ARMv8 may require changes here, but
+ // hopefully this hacky detection method will have been replaced by then.
+ // Note that there is potential for a race condition if another program or
+ // x264 instance disables or reinits the counters while x264 is using them,
+ // which may result in incorrect detection and the counters stuck enabled.
+ flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+ // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+ return flags;
+}
+
#else
uint32_t x264_cpu_detect( void )
asm("bswap %0":"+r"(x));
return x;
}
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x ) // byte-swaps x using the ARM "rev" instruction
+{
+ asm("rev %0, %0":"+r"(x)); // "+r": x is both input and output; NOTE(review): assumes intptr_t fits one 32-bit register -- confirm
+ return x;
+}
+#define endian_fix32 endian_fix
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
echo " --disable-avis-input disables avisynth input (win32 only)"
echo " --disable-mp4-output disables mp4 output (using gpac)"
echo " --disable-pthread disables multithreaded encoding"
-echo " --disable-asm disables assembly optimizations on x86"
+echo " --disable-asm disables assembly optimizations on x86 and arm"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
if [ "x$host" = x ]; then
host=`./config.guess`
;;
arm*)
ARCH="ARM"
+ AS="${AS-${cross_prefix}gcc}"
;;
s390|s390x)
ARCH="S390"
fi
CFLAGS="$CFLAGS -DHAVE_MMX"
fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then # only when asm is still enabled and the target is ARM
+ if cc_check '' '' 'asm("rev r0, r0");' ; then CFLAGS="$CFLAGS -DHAVE_ARMV6" # "rev" must be accepted, otherwise asm is turned off below
+ cc_check '' '' 'asm("movt r0, #0");' && CFLAGS="$CFLAGS -DHAVE_ARMV6T2" # optional: "movt" accepted
+ cc_check '' '' 'asm("vadd.i16 q0, q0, q0");' && CFLAGS="$CFLAGS -DHAVE_NEON" # optional: NEON accepted at build time
+ ASFLAGS="$ASFLAGS $CFLAGS -c" # AS defaults to the C compiler on ARM, so it receives CFLAGS plus -c
+ else
+ asm="no"
+ fi
+fi
+
[ $asm = no ] && AS=""
[ "x$AS" = x ] && asm="no"
#include "common/common.h"
#include "common/cpu.h"
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef DECLARE_ALIGNED_16
+#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
/* buf1, buf2: initialised to random data and shouldn't write into them */
uint8_t * buf1, * buf2;
/* buf3, buf4: used to store output */
static inline uint32_t read_time(void)
{
+ uint32_t a = 0; // stays 0 when none of the platform branches below applies
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
- uint32_t a;
asm volatile( "rdtsc" :"=a"(a) ::"edx" );
- return a;
#elif defined(ARCH_PPC)
- uint32_t a;
asm volatile( "mftb %0" : "=r" (a) );
- return a;
-#else
- return 0;
+#elif defined(ARCH_ARM) // ARMv7 only
+ asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) ); // reads a coprocessor p15 register; presumably the ARMv7 cycle counter -- TODO confirm
#endif
+ return a;
}
static bench_t* get_bench( const char *name, int cpu )
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
- b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+ b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+ b->cpu&X264_CPU_NEON ? "neon" :
+ b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
- b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+ b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+ b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
+#elif ARCH_ARM
+ if( x264_cpu_detect() & X264_CPU_ARMV6 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+ if( x264_cpu_detect() & X264_CPU_NEON )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+ if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#endif
return ret;
}
if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
{
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
return 1;
#endif
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x020000 /* ARMv6 (set by x264_cpu_detect when built with HAVE_ARMV6) */
+#define X264_CPU_NEON 0x040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
/* Analyse flags
*/