From: Martin Storsjö
Date: Tue, 25 Aug 2015 11:38:11 +0000 (+0300)
Subject: arm: Add neon versions of vsad, asd8 and ssd_nv12_core
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=8feb733ed1dcb1cc94df3b0e6c98009832ea85cc;p=x264

arm: Add neon versions of vsad, asd8 and ssd_nv12_core

These are straight translations of the aarch64 versions.

checkasm timing     Cortex-A7      A8      A9
vsad_c                  16234   10984    9850
vsad_neon                2132    1020     789
asd8_c                   5859    3561    3543
asd8_neon                1407    1279    1250
ssd_nv12_c             608096  591072  426285
ssd_nv12_neon           72752   33549   41347
---

diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 36858bc4..bbe082d7 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -388,6 +389,59 @@ SAD_X_FUNC 4, 8, 16
 SAD_X_FUNC 4, 16, 8
 SAD_X_FUNC 4, 16, 16
 
+function x264_pixel_vsad_neon            // pixel.h prototype: ( uint8_t *, intptr_t, int ) -> r0, r1, r2; sums |row[i] - row[i+1]| over 16-byte rows
+    subs        r2, r2, #2               // count the two rows loaded below; flags are read by the ble
+    vld1.8      {q0}, [r0], r1           // load 16 bytes, then advance r0 by r1
+    vld1.8      {q1}, [r0], r1           // next 16 bytes, same post-increment
+    vabdl.u8    q2, d0, d2               // q2 = |d0 - d2| widened to 16 bit (low 8 bytes)
+    vabdl.u8    q3, d1, d3               // q3 = |d1 - d3| widened to 16 bit (high 8 bytes)
+    ble         2f                       // remaining count <= 0: nothing left for the loop
+1:
+    subs        r2, r2, #2               // two more rows per iteration; flags read by blt/bgt below
+    vld1.8      {q0}, [r0], r1
+    vabal.u8    q2, d2, d0               // accumulate |previous row (q1) - new row (q0)|
+    vabal.u8    q3, d3, d1
+    vld1.8      {q1}, [r0], r1           // issued before the blt, so this row is fetched even when it is not accumulated
+    blt         2f                       // count went negative: skip the second accumulation
+    vabal.u8    q2, d0, d2               // accumulate |q0 - q1|
+    vabal.u8    q3, d1, d3
+    bgt         1b                       // still using the flags of the subs at the loop top
+2:
+    vadd.u16    q0, q2, q3               // merge the two 16-bit accumulators
+    HORIZ_ADD   d0, d0, d1               // macro defined outside this patch; presumably reduces the lanes to one sum in d0 - confirm
+    vmov.32     r0, d0[0]                // return value = low 32 bits of d0
+    bx          lr
+endfunc
+
+function x264_pixel_asd8_neon            // pixel.h prototype: ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) -> r0-r3 plus one stack argument
+    ldr         r12, [sp, #0]            // r12 = fifth (int) argument
+    sub         r12, r12, #2             // account for the two row pairs loaded before the loop
+    vld1.8      {d0}, [r0], r1           // 8 bytes from the first buffer, post-increment by r1
+    vld1.8      {d1}, [r2], r3           // 8 bytes from the second buffer, post-increment by r3
+    vld1.8      {d2}, [r0], r1
+    vld1.8      {d3}, [r2], r3
+    vsubl.u8    q8, d0, d1               // q8 = d0 - d1 widened to 16 bit: start of the running sum
+1:
+    subs        r12, r12, #2             // two row pairs per iteration; flags read by the bgt below
+    vld1.8      {d4}, [r0], r1
+    vld1.8      {d5}, [r2], r3
+    vsubl.u8    q9, d2, d3               // difference of the pair loaded earlier
+    vsubl.u8    q10, d4, d5              // difference of the pair just loaded
+    vadd.s16    q8, q9
+    vld1.8      {d2}, [r0], r1           // preload the pair used by the next iteration or by the tail
+    vld1.8      {d3}, [r2], r3
+    vadd.s16    q8, q10
+    bgt         1b                       // body always runs once, so at least four row pairs are summed
+    vsubl.u8    q9, d2, d3               // tail: last preloaded pair
+    vadd.s16    q8, q9
+    vpaddl.s16  q8, q8                   // pairwise add the 16-bit lanes into 32-bit lanes
+    vpadd.s32   d16, d16, d17            // two pairwise adds leave the total of all four lanes in d16[0]
+    vpadd.s32   d16, d16, d17
+    vabs.s32    d16, d16                 // absolute value of the signed total
+    vmov.32     r0, d16[0]               // return value
+    bx          lr
+endfunc
+
 
 .macro SSD_START_4
     vld1.32 {d16[]}, [r0,:32], r1
@@ -489,6 +543,79 @@ SSD_FUNC 8, 16
 SSD_FUNC 16, 8
 SSD_FUNC 16, 16
 
+function x264_pixel_ssd_nv12_core_neon   // pixel.h prototype: ( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * )
+    push        {r4-r5}                  // 8 bytes pushed: the stack arguments now start at sp+8
+    ldrd        r4, r5, [sp, #8]         // r4 = fifth argument, r5 = sixth argument (decremented once per row at label 4)
+    add         r12, r4, #8
+    bic         r12, r12, #15            // r12 = (r4 + 8) & ~15
+    vmov.u64    q8, #0                   // 64-bit totals for the even-byte lanes
+    vmov.u64    q9, #0                   // 64-bit totals for the odd-byte lanes
+    sub         r1, r1, r12, lsl #1      // r1 -= 2*r12: the bytes the post-incrementing loads consume per row when r4 is a multiple of 8
+    sub         r3, r3, r12, lsl #1      // same adjustment for the second buffer
+1:
+    subs        r12, r4, #16             // r12 = r4 - 16; flags are read by the blt/beq after the NEON block
+    vld2.8      {d0,d1}, [r0]!           // de-interleave 16 bytes: even bytes -> d0, odd bytes -> d1; r0 += 16
+    vld2.8      {d2,d3}, [r2]!
+    vld2.8      {d4,d5}, [r0]!
+    vld2.8      {d6,d7}, [r2]!
+
+    vsubl.u8    q10, d0, d2              // 16-bit differences of the even bytes
+    vsubl.u8    q11, d1, d3              // 16-bit differences of the odd bytes
+    vmull.s16   q14, d20, d20            // vmull (not vmlal) restarts the 32-bit per-row sums in q14/q15
+    vmull.s16   q15, d22, d22
+    vsubl.u8    q12, d4, d6
+    vsubl.u8    q13, d5, d7
+    vmlal.s16   q14, d21, d21            // accumulate squared differences
+    vmlal.s16   q15, d23, d23
+
+    blt         4f                       // r4 < 16: the second 16-byte group is loaded but not accumulated
+    beq         3f                       // r4 == 16: accumulate the second group and finish the row
+2:
+    vmlal.s16   q14, d24, d24            // accumulate the group whose differences are pending in q12/q13
+    vmlal.s16   q15, d26, d26
+    vld2.8      {d0,d1}, [r0]!
+    vld2.8      {d2,d3}, [r2]!
+    vmlal.s16   q14, d25, d25
+    vmlal.s16   q15, d27, d27
+
+    subs        r12, r12, #16            // flags are read by the blt and bgt below
+    vsubl.u8    q10, d0, d2
+    vsubl.u8    q11, d1, d3
+    vmlal.s16   q14, d20, d20
+    vmlal.s16   q15, d22, d22
+    vld2.8      {d4,d5}, [r0]!
+    vld2.8      {d6,d7}, [r2]!
+    vmlal.s16   q14, d21, d21
+    vmlal.s16   q15, d23, d23
+    blt         4f                       // went negative: the group just loaded is not accumulated
+
+    vsubl.u8    q12, d4, d6
+    vsubl.u8    q13, d5, d7
+    bgt         2b                       // falls through to label 3 when r12 reached exactly 0
+3:
+    vmlal.s16   q14, d24, d24            // accumulate the final pending group
+    vmlal.s16   q15, d26, d26
+    vmlal.s16   q14, d25, d25
+    vmlal.s16   q15, d27, d27
+4:
+    subs        r5, r5, #1               // one row done; flags are read by the bgt 1b below
+    vaddw.s32   q8, q8, d28              // widen the 32-bit row sums into the 64-bit totals
+    vaddw.s32   q9, q9, d30
+    add         r0, r0, r1               // step to the next row using the pre-adjusted r1
+    add         r2, r2, r3
+    vaddw.s32   q8, q8, d29
+    vaddw.s32   q9, q9, d31
+    bgt         1b
+
+    vadd.u64    d16, d16, d17            // fold the two 64-bit lanes of each total
+    vadd.u64    d18, d18, d19
+    ldrd        r4, r5, [sp, #16]        // r4 = seventh argument, r5 = eighth argument (the two uint64_t * outputs)
+    vst1.64     {d16}, [r4]              // store the even-byte total
+    vst1.64     {d18}, [r5]              // store the odd-byte total
+
+    pop         {r4-r5}
+    bx          lr
+endfunc
 
 .macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
     vmull.u8 \qsqr, \dsrc, \dsrc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index f361b9d5..81c21dc8 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -52,6 +52,10 @@ DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
 
+void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); // defined in common/arm/pixel-a.S by this patch
+
+int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); // defined in common/arm/pixel-a.S by this patch
+
 int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 
@@ -71,4 +75,6 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
                                       int sums[2][4] );
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 
+int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); // defined in common/arm/pixel-a.S by this patch
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index e0ad76cc..9904b176 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1380,6 +1380,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+        pixf->vsad = x264_pixel_vsad_neon; // install the NEON routine added in pixel-a.S
+        pixf->asd8 = x264_pixel_asd8_neon; // install the NEON routine added in pixel-a.S
 
         pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
         pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
@@ -1392,6 +1394,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
 
+        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; // install the NEON routine added in pixel-a.S
         pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
 