From 35d32d09e163bb0f2ce60a8e13f9f22125445346 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Martin=20Storsj=C3=B6?= Date: Fri, 28 Aug 2015 09:40:24 +0300 Subject: [PATCH] checkasm: arm: Check register clobbering Cast the function pointer to a different type signature, to be able to use uint64_t as return type (instead of intptr_t) for those calls that require it. Use two separate functions, depending on whether neon is available. --- Makefile | 1 + tools/checkasm-arm.S | 132 +++++++++++++++++++++++++++++++++++++++++++ tools/checkasm.c | 19 ++++++- 3 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 tools/checkasm-arm.S diff --git a/Makefile b/Makefile index 4feef339..d0b16338 100644 --- a/Makefile +++ b/Makefile @@ -122,6 +122,7 @@ ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \ common/arm/predict-a.S common/arm/bitstream-a.S SRCS += common/arm/mc-c.c common/arm/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) +OBJCHK += tools/checkasm-arm.o endif endif diff --git a/tools/checkasm-arm.S b/tools/checkasm-arm.S new file mode 100644 index 00000000..35de22c1 --- /dev/null +++ b/tools/checkasm-arm.S @@ -0,0 +1,132 @@ +/**************************************************************************** + * checkasm-arm.S: assembly check tool + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Martin Storsjo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "../common/arm/asm.S" + +.section .rodata +.align 4 +register_init: /* sentinel values: loaded into r4-r11 (and q4-q7 in the neon variant) before the call and compared with those registers afterwards */ +.quad 0x21f86d66c8ca00ce +.quad 0x75b6ba21077c48ad +.quad 0xed56bb2dcb3c7736 +.quad 0x8bda43d3fd1a7e06 +.quad 0xb64a9c9e5d318408 +.quad 0xdf9a54b303f1d3a3 +.quad 0x4a75479abd64e097 +.quad 0x249214109d5d1c88 + +error_message: +.asciz "failed to preserve register" + +.text + +@ max number of args used by any x264 asm function. +#define MAX_ARGS 15 + +#define ARG_STACK (((4*(MAX_ARGS - 2) + pushed + 7) & ~7) - pushed) /* bytes reserved for the copied stack arguments, rounded up so that ARG_STACK + pushed is a multiple of 8 and sp keeps its 8-byte alignment at the blx into the tested function */ + +.macro clobbercheck variant /* emits x264_checkasm_call_<variant>( func, ok, ... ): calls func with the variadic arguments, then checks that r4-r11 (and q4-q7 for neon) still hold the register_init values; on mismatch it stores 0 to *ok and prints error_message with puts; the r0/r1 pair left by func is returned unchanged */ +.equ pushed, 4*10 /* bytes pushed before the argument area: r4-r11 and lr (9 words) plus the saved r1 (1 word) */ +function x264_checkasm_call_\variant + push {r4-r11, lr} +.ifc \variant, neon + vpush {q4-q7} +.equ pushed, pushed + 16*4 +.endif + + movrel r12, register_init +.ifc \variant, neon + vldm r12, {q4-q7} +.endif + ldm r12, {r4-r11} + + push {r1} /* keep the ok pointer, r1 is overwritten with an argument below */ + + sub sp, sp, #ARG_STACK +.equ pos, 0 +.rept MAX_ARGS-2 /* copy the stack arguments that follow the two words loaded into r2/r3 below; NOTE(review): at most MAX_ARGS-4 words can be real stack arguments, so this appears to read two extra words of the caller frame - confirm whether that is intended */ + ldr r12, [sp, #ARG_STACK + pushed + 8 + pos] + str r12, [sp, #pos] +.equ pos, pos + 4 +.endr + + mov r12, r0 + mov r0, r2 + mov r1, r3 + ldrd r2, r3, [sp, #ARG_STACK + pushed] /* the first two stack words of the caller become register arguments 3 and 4 */ + blx r12 + add sp, sp, #ARG_STACK + pop {r2} /* r2 = ok pointer saved above */ + + push {r0, r1} /* preserve the return value while r0/r1 are used as scratch */ + movrel r12, register_init +.ifc \variant, neon + vldm r12, {q0-q3} /* reference values; xor with q4-q7 and or-reduce so that r3 is nonzero if any bit differs */ + veor q0, q0, q4 + veor q1, q1, q5 + veor q2, q2, q6 + veor q3, q3, q7 + vorr q0, q0, q1 + vorr q0, q0, q2 + vorr q0, q0, q3 + vorr d0, d0, d1 + vrev64.32 d1, d0 + vorr d0, d0, d1 + vmov.32 r3, d0[0] +.else + mov r3, #0 +.endif + +.macro check_reg reg1, reg2 + ldrd r0, r1, [r12], #8 + eor r0, r0, \reg1 + eor r1, r1, \reg2 + orr r3, r3, r0 + orr r3, r3, r1 +.endm + check_reg r4, r5 + check_reg r6, r7 + check_reg r8, r9 + 
check_reg r10, r11 +.purgem check_reg + + cmp r3, #0 + beq 0f + + mov r12, #0 + str r12, [r2] /* *ok = 0: at least one callee-saved register was clobbered */ + movrel r0, error_message + bl puts +0: + pop {r0, r1} /* restore the return value of the tested function */ +.ifc \variant, neon + vpop {q4-q7} +.endif + pop {r4-r11, pc} +endfunc +.endm + +clobbercheck neon +clobbercheck noneon diff --git a/tools/checkasm.c b/tools/checkasm.c index 01a97c93..721c3d7f 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -231,6 +231,12 @@ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); #endif +#if ARCH_ARM +intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... ); +intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... ); +intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon; /* defaults to the variant that does not touch NEON registers; check_all_flags() switches it to the neon variant when NEON is detected */ +#endif + #define call_c1(func,...) func(__VA_ARGS__) #if ARCH_X86_64 @@ -248,12 +254,18 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); }) -#elif ARCH_X86 || (ARCH_AARCH64 && !defined(__APPLE__)) +#elif ARCH_X86 || (ARCH_AARCH64 && !defined(__APPLE__)) || ARCH_ARM #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ ) #else #define call_a1 call_c1 #endif +#if ARCH_ARM +#define call_a1_64(func,...) ((uint64_t (*)(intptr_t(*)(), int*, ...))x264_checkasm_call)( (intptr_t(*)())func, &ok, __VA_ARGS__ ) /* same call made through a uint64_t-returning signature so that the full 64-bit result is returned instead of an intptr_t */ +#else +#define call_a1_64 call_a1 +#endif + #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ @@ -286,6 +298,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); #define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); }) #define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); }) #define call_c2(func,...) 
({ call_bench(func,0,__VA_ARGS__); }) +#define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); }) /* like call_a2 followed by call_a1, but uses call_a1_64 for functions returning uint64_t */ static int check_pixel( int cpu_ref, int cpu_new ) @@ -372,7 +385,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) { uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); - uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); + uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); uint32_t cost8_a = res_a; uint32_t cost4_a = res_a >> 32; if( cost8_a != cost8_c || cost4_a != cost4_c ) @@ -2786,6 +2799,8 @@ static int check_all_flags( void ) ret = check_all_funcs( 0, X264_CPU_ALTIVEC ); } #elif ARCH_ARM + if( cpu_detect & X264_CPU_NEON ) + x264_checkasm_call = x264_checkasm_call_neon; /* NEON detected: use the wrapper that also checks q4-q7 */ if( cpu_detect & X264_CPU_ARMV6 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" ); if( cpu_detect & X264_CPU_NEON ) -- 2.39.5