From 6efb57ada652fd015ec4cacffd09282632bb975b Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Thu, 13 Aug 2015 23:59:31 +0300
Subject: [PATCH] arm: Implement x264_sub8x16_dct_dc_neon

checkasm timing      Cortex-A7      A8     A9
sub8x16_dct_dc_c            6386    3901   4080
sub8x16_dct_dc_neon         1491    698    917
---
 common/arm/dct-a.S | 94 ++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/dct.h   |  1 +
 common/dct.c       |  2 -
 3 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index a8fee790..58af364b 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad
+ *          Martin Storsjo
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -658,6 +659,99 @@ function x264_sub8x8_dct_dc_neon
     bx              lr
 endfunc
 
+function x264_sub8x16_dct_dc_neon
+    mov             r3,  #FENC_STRIDE           @ r3 = post-increment applied to r1 after each row
+    mov             ip,  #FDEC_STRIDE           @ ip = post-increment applied to r2 after each row
+    vld1.64         {d16}, [r1,:64], r3         @ row 0: 8 bytes from r1 (pix1 in the dct.h prototype), 64-bit aligned
+    vld1.64         {d17}, [r2,:64], ip         @ row 0: 8 bytes from r2 (pix2 in the dct.h prototype)
+    vsubl.u8        q8,  d16, d17               @ q8 = row 0 difference, widened to 8 x 16-bit lanes
+    vld1.64         {d18}, [r1,:64], r3         @ row 1
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19               @ q9 = row 1 difference
+    vld1.64         {d20}, [r1,:64], r3         @ row 2
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21               @ q10 = row 2 difference
+    vld1.64         {d22}, [r1,:64], r3         @ row 3
+    vadd.s16        q0,  q8,  q9                @ q0 = rows 0-1, summed per column
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23               @ q11 = row 3 difference
+    vld1.64         {d24}, [r1,:64], r3         @ row 4
+    vadd.s16        q0,  q0,  q10               @ q0 = rows 0-2
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25               @ q12 = row 4 difference
+    vld1.64         {d26}, [r1,:64], r3         @ row 5
+    vadd.s16        q0,  q0,  q11               @ q0 = rows 0-3 (d0 = columns 0-3, d1 = columns 4-7)
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27               @ q13 = row 5 difference
+    vld1.64         {d28}, [r1,:64], r3         @ row 6
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29               @ q14 = row 6 difference
+    vld1.64         {d30}, [r1,:64], r3         @ row 7
+    vadd.s16        q1,  q12, q13               @ q1 = rows 4-5
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31               @ q15 = row 7 difference
+
+    vld1.64         {d16}, [r1,:64], r3         @ row 8 (q8 was already folded into q0, so d16/d17 are free)
+    vadd.s16        q1,  q1,  q14               @ q1 = rows 4-6
+    vld1.64         {d17}, [r2,:64], ip
+    vadd.s16        q1,  q1,  q15               @ q1 = rows 4-7 (d2 = columns 0-3, d3 = columns 4-7)
+    vld1.64         {d18}, [r1,:64], r3         @ row 9
+    vsubl.u8        q8,  d16, d17               @ q8 = row 8 difference
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19               @ q9 = row 9 difference
+    vld1.64         {d20}, [r1,:64], r3         @ row 10
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21               @ q10 = row 10 difference
+    vld1.64         {d22}, [r1,:64], r3         @ row 11
+    vadd.s16        q2,  q8,  q9                @ q2 = rows 8-9
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23               @ q11 = row 11 difference
+    vld1.64         {d24}, [r1,:64], r3         @ row 12
+    vadd.s16        q2,  q2,  q10               @ q2 = rows 8-10
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25               @ q12 = row 12 difference
+    vld1.64         {d26}, [r1,:64], r3         @ row 13
+    vadd.s16        q2,  q2,  q11               @ q2 = rows 8-11 (d4 = columns 0-3, d5 = columns 4-7)
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27               @ q13 = row 13 difference
+    vld1.64         {d28}, [r1,:64], r3         @ row 14
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29               @ q14 = row 14 difference
+    vld1.64         {d30}, [r1,:64], r3         @ row 15 (last of the 16 rows read)
+    vadd.s16        q3,  q12, q13               @ q3 = rows 12-13
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31               @ q15 = row 15 difference
+    vadd.s16        q3,  q3,  q14               @ q3 = rows 12-14
+
+    vadd.s16        d16, d0,  d1    @ b0
+    vadd.s16        q3,  q3,  q15               @ q3 = rows 12-15; must complete before d6/d7 are read for b3/b7
+    vsub.s16        d17, d0,  d1    @ b4
+    vadd.s16        d18, d2,  d3    @ b1
+    vsub.s16        d19, d2,  d3    @ b5
+    vadd.s16        d20, d4,  d5    @ b2
+    vsub.s16        d21, d4,  d5    @ b6
+    vadd.s16        d22, d6,  d7    @ b3
+    vsub.s16        d23, d6,  d7    @ b7
+    vadd.s16        q0,  q8,  q9    @ b0 + b1, b4 + b5; a0, a2
+    vsub.s16        q1,  q8,  q9    @ b0 - b1, b4 - b5; a4, a6
+    vadd.s16        q2,  q10, q11   @ b2 + b3, b6 + b7; a1, a3
+    vsub.s16        q3,  q10, q11   @ b2 - b3, b6 - b7; a5, a7
+
+    vadd.s16        q8,  q0,  q2    @ a0 + a1, a2 + a3
+    vsub.s16        q9,  q0,  q2    @ a0 - a1, a2 - a3
+    vsub.s16        q10, q1,  q3    @ a4 - a5, a6 - a7
+    vadd.s16        q11, q1,  q3    @ a4 + a5, a6 + a7
+
+    vpadd.s16       d0,  d16, d17               @ first pairwise-add pass: 4 lanes -> 2 partial sums per d register
+    vpadd.s16       d1,  d18, d19
+    vpadd.s16       d2,  d20, d21
+    vpadd.s16       d3,  d22, d23
+    vpadd.s16       d0,  d0,  d1                @ d0 = lane totals of d16, d17, d18, d19
+    vpadd.s16       d1,  d2,  d3                @ d1 = lane totals of d20, d21, d22, d23
+    vst1.64         {q0}, [r0,:64]              @ store the 8 16-bit results to r0 (dct[8] in the dct.h prototype)
+    bx              lr                          @ return
+endfunc
+
 function x264_zigzag_scan_4x4_frame_neon
     movrel      r2, scan4x4_frame
diff --git a/common/arm/dct.h b/common/arm/dct.h
index e252e7eb..77063d8c 100644
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -40,6 +40,7 @@ void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
 void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
 void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
 void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/dct.c b/common/dct.c
index e80d64bd..aafd9fb9 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -750,9 +750,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 
-#if ARCH_AARCH64
         dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
-#endif
     }
 #endif
 
-- 
2.39.2