From ac8f2e8a4cf21b2026957509bea8865ff7879fb4 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Wed, 12 Mar 2014 14:35:31 +0100
Subject: [PATCH] arm: implement x264_pixel_var2_8x16_neon

checkasm --bench on a cortex-a9:
var2_8x16_c: 5677
var2_8x16_neon: 1421
---
 common/arm/pixel-a.S | 43 +++++++++++++++++++++++++++++++++++++++++++
 common/arm/pixel.h   |  3 ++-
 common/pixel.c       |  1 +
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 6bc904d8..0b996a85 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -635,6 +635,49 @@ function x264_pixel_var2_8x8_neon
     bx              lr
 .endfunc
 
+function x264_pixel_var2_8x16_neon                  // r0,r1 = block A ptr,stride; r2,r3 = block B ptr,stride; [sp] = int *out (assuming AAPCS and the pixel.h prototype)
+    vld1.64         {d16}, [r0,:64], r1             // A row 0: 8 bytes from a 64-bit aligned address, then r0 += r1
+    vld1.64         {d17}, [r2,:64], r3             // B row 0, then r2 += r3
+    vld1.64         {d18}, [r0,:64], r1             // A row 1
+    vld1.64         {d19}, [r2,:64], r3             // B row 1
+    vsubl.u8        q10, d16, d17                   // q10 = A0 - B0, widened to 16-bit lanes
+    vsubl.u8        q11, d18, d19                   // q11 = A1 - B1, widened to 16-bit lanes
+    SQR_ACC         q1,  d20, d21,  vmull.s16       // NOTE(review): SQR_ACC is defined outside this hunk; presumably q1 = squares of q10, with vmull.s16 selecting a non-accumulating first step - confirm
+    vld1.64         {d16}, [r0,:64], r1             // preload A row 2 for the first loop iteration
+    vadd.s16        q0,  q10, q11                   // q0 = per-lane running sum of the differences
+    vld1.64         {d17}, [r2,:64], r3             // preload B row 2
+    SQR_ACC         q2,  d22, d23,  vmull.s16       // presumably q2 = squares of q11 (second accumulator) - confirm
+    mov             ip,  #14                        // 14 rows remain, two are consumed per iteration
+1:  subs            ip,  ip,  #2                    // count down; the flags are consumed by the beq below
+    vld1.64         {d18}, [r0,:64], r1             // A odd row of this pair
+    vsubl.u8        q10, d16, d17                   // difference of the preloaded even row
+    vld1.64         {d19}, [r2,:64], r3             // B odd row of this pair
+    vadd.s16        q0,  q0,  q10                   // add even-row differences to the running sum
+    SQR_ACC         q1,  d20, d21                   // presumably accumulates squares of q10 into q1 - confirm
+    vsubl.u8        q11, d18, d19                   // difference of the odd row
+    beq             2f                              // counter reached zero: all 16 rows are loaded, skip the preload
+    vld1.64         {d16}, [r0,:64], r1             // preload A even row of the next pair
+    vadd.s16        q0,  q0,  q11                   // add odd-row differences to the running sum
+    vld1.64         {d17}, [r2,:64], r3             // preload B even row of the next pair
+    SQR_ACC         q2,  d22, d23                   // presumably accumulates squares of q11 into q2 - confirm
+    b               1b
+2:
+    vadd.s16        q0,  q0,  q11                   // fold in the final odd row
+    SQR_ACC         q2,  d22, d23                   // final odd row into the second accumulator
+
+    ldr             ip,  [sp]                       // fifth argument (int *) from the first stack slot
+    vadd.s16        d0,  d0,  d1                    // fold the 8 sum lanes down to 4
+    vadd.s32        q1,  q1,  q2                    // merge the two SQR_ACC accumulators
+    vpaddl.s16      d0,  d0                         // pairwise add: 4 x s16 -> 2 x s32
+    vadd.s32        d1,  d2,  d3                    // fold the 4 accumulator lanes down to 2
+    vpadd.s32       d0,  d0,  d1                    // d0[0] = total of differences, d0[1] = SQR_ACC total
+
+    vmov            r0,  r1,  d0                    // r0 = d0[0] (sum), r1 = d0[1] (SQR_ACC total)
+    vst1.32         {d0[1]}, [ip,:32]               // store the SQR_ACC total through the fifth argument
+    mul             r0,  r0,  r0                    // r0 = sum * sum
+    sub             r0,  r1,  r0,  lsr #7           // return r1 - (sum^2 >> 7); 8x16 = 128 = 2^7 pixels
+    bx              lr
+.endfunc
 
 .macro LOAD_DIFF_8x4 q0 q1 q2 q3
     vld1.32         {d1}, [r2], r3
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index acd517b9..c55ed9a7 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -58,7 +58,8 @@ int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
 
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index b760a117..f62e2b38 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1350,6 +1350,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
         pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
 
--
2.39.2