From 3902ae02a0edede5d6c44cb3ee9e24e618c66e6a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Tue, 25 Aug 2015 23:36:44 +0300
Subject: [PATCH] arm: Implement chroma intra deblock

checkasm timing                      Cortex-A7  A8    A9
deblock_chroma_420_intra_mbaff_c     1469       1276  1181
deblock_chroma_420_intra_mbaff_neon  981        717   644
deblock_chroma_intra[1]_c            2954       2402  2321
deblock_chroma_intra[1]_neon         947        581   575
deblock_h_chroma_420_intra_c         2859       2509  2264
deblock_h_chroma_420_intra_neon      1480       1119  1028
deblock_h_chroma_422_intra_c         6211       5030  4792
deblock_h_chroma_422_intra_neon      2894       1990  2077
---
 common/arm/deblock-a.S | 116 +++++++++++++++++++++++++++++++++++++++++
 common/deblock.c       |   4 +-
 2 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 1ef708e4..a3def9f6 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -365,6 +365,122 @@ function x264_deblock_h_chroma_mbaff_neon
     bx              lr
 endfunc
 
+.macro h264_loop_filter_chroma_intra, width=16
+    vdup.8          q11, r2         @ alpha (r2) copied to every byte lane. Inputs: q9=p1 q8=p0 q0=q0 q1=q1
+    vabd.u8         q13, q8,  q0    @ abs(p0 - q0)
+    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
+    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
+    vclt.u8         q13, q13, q11   @ < alpha
+    vdup.8          q11, r3         @ beta (r3) copied to every byte lane
+    vclt.u8         q14, q14, q11   @ < beta
+    vclt.u8         q15, q15, q11   @ < beta
+    vand            q13, q13, q14   @ q13 = filter mask, set where ...
+    vand            q13, q13, q15   @ ... all three threshold tests passed
+
+    vshll.u8        q14, d18, #1    @ 2*p1, low half, widened to 16 bit
+    vshll.u8        q2,  d2,  #1    @ 2*q1, low half
+.ifc \width, 16
+    vshll.u8        q15, d19, #1    @ 2*p1, high half
+    vshll.u8        q3,  d3,  #1    @ 2*q1, high half
+    vaddl.u8        q12, d17, d3    @ p0 + q1, high half
+    vaddl.u8        q10, d1,  d19   @ q0 + p1, high half
+.endif
+    vaddl.u8        q11, d16, d2    @ p0 + q1, low half (last read of the original q1 in d2)
+    vaddl.u8        q1,  d18, d0    @ p1 + q0, low half. or vaddw q2, to not clobber q1
+    vadd.u16        q14, q14, q11   @ 2*p1 + p0 + q1
+    vadd.u16        q2,  q2,  q1    @ 2*q1 + q0 + p1
+.ifc \width, 16
+    vadd.u16        q15, q15, q12   @ same sums for the high half
+    vadd.u16        q3,  q3,  q10
+.endif
+    vqrshrn.u16     d28, q14, #2    @ filtered p0 = (2*p1 + p0 + q1 + 2) >> 2, narrowed back to 8 bit
+    vqrshrn.u16     d4,  q2,  #2    @ filtered q0 = (2*q1 + q0 + p1 + 2) >> 2
+.ifc \width, 16
+    vqrshrn.u16     d29, q15, #2    @ high half of filtered p0
+    vqrshrn.u16     d5,  q3,  #2    @ high half of filtered q0
+.endif
+    vbit            q8,  q14, q13   @ take filtered p0 only where the mask is set (with width=8 only d16 is meaningful)
+    vbit            q0,  q2,  q13   @ same for q0. Results are left in q8 (p0) and q0 (q0)
+.endm
+
+function x264_deblock_v_chroma_intra_neon
+    sub             r0,  r0,  r1, lsl #1        @ r0 = pix - 2*stride, the p1 row
+    vld2.8          {d18,d19}, [r0,:128], r1    @ p1 row, even bytes to d18 and odd bytes to d19 (presumably U and V)
+    vld2.8          {d16,d17}, [r0,:128], r1    @ p0 row
+    vld2.8          {d0, d1},  [r0,:128], r1    @ q0 row
+    vld2.8          {d2, d3},  [r0,:128]        @ q1 row, no post-increment so r0 stays at pix + stride
+
+    h264_loop_filter_chroma_intra
+
+    sub             r0,  r0,  r1, lsl #1        @ back to pix - stride, the p0 row
+    vst2.8          {d16,d17}, [r0,:128], r1    @ write filtered p0, bytes interleaved again
+    vst2.8          {d0, d1},  [r0,:128], r1    @ write filtered q0
+
+    bx              lr
+endfunc
+
+function x264_deblock_h_chroma_intra_neon
+    sub             r0,  r0,  #4    @ r0 = pix - 4, each row read below is 8 bytes around the edge
+    vld1.8          {d18}, [r0], r1 @ rows 0-3 go to the low halves d18, d16, d0, d2
+    vld1.8          {d16}, [r0], r1
+    vld1.8          {d0},  [r0], r1
+    vld1.8          {d2},  [r0], r1
+    vld1.8          {d19}, [r0], r1 @ rows 4-7 go to the high halves d19, d17, d1, d3
+    vld1.8          {d17}, [r0], r1
+    vld1.8          {d1},  [r0], r1
+    vld1.8          {d3},  [r0], r1 @ TRANSPOSE4x4_16 below is defined elsewhere, presumably regroups rows into p1/p0/q0/q1 vectors (confirm)
+
+    TRANSPOSE4x4_16 q9,  q8,  q0,  q1
+
+    h264_loop_filter_chroma_intra
+
+    vtrn.16         q8,  q0         @ regroup filtered p0 and q0 so that each 32-bit lane stored below is one row
+
+    sub             r0,  r0,  r1, lsl #3    @ rewind the 8 rows that were loaded
+    add             r0,  r0,  #2            @ r0 = pix - 2, only the middle 4 bytes (p0, q0) are written back
+    vst1.32         {d16[0]}, [r0], r1      @ row 0
+    vst1.32         {d0[0]},  [r0], r1      @ row 1
+    vst1.32         {d16[1]}, [r0], r1      @ row 2
+    vst1.32         {d0[1]},  [r0], r1      @ row 3
+    vst1.32         {d17[0]}, [r0], r1      @ row 4
+    vst1.32         {d1[0]},  [r0], r1      @ row 5
+    vst1.32         {d17[1]}, [r0], r1      @ row 6
+    vst1.32         {d1[1]},  [r0], r1      @ row 7, r0 ends at pix - 2 + 8*stride
+
+    bx              lr
+endfunc
+
+function x264_deblock_h_chroma_422_intra_neon
+    push            {lr}            @ keep the return address across the call
+    bl              X(x264_deblock_h_chroma_intra_neon)     @ filter the first 8 rows
+    add             r0,  r0,  #2    @ callee left r0 at pix - 2 + 8*stride, move it to pix + 8*stride
+    pop             {lr}
+    b               X(x264_deblock_h_chroma_intra_neon)     @ tail call filters the next 8 rows and returns to our caller
+endfunc
+
+function x264_deblock_h_chroma_intra_mbaff_neon
+    sub             r0,  r0,  #4    @ r0 = pix - 4, each row read below is 8 bytes around the edge
+    vld1.8          {d18}, [r0], r1 @ only 4 rows here, so only d registers are used
+    vld1.8          {d16}, [r0], r1
+    vld1.8          {d0},  [r0], r1
+    vld1.8          {d2},  [r0], r1
+
+    TRANSPOSE4x4_16 d18, d16, d0, d2
+
+    h264_loop_filter_chroma_intra width=8
+
+    vtrn.16         d16, d0         @ regroup filtered p0 and q0 so that each 32-bit lane stored below is one row
+
+    sub             r0,  r0,  r1, lsl #2    @ rewind the 4 rows that were loaded
+    add             r0,  r0,  #2            @ r0 = pix - 2, only the middle 4 bytes (p0, q0) are written back
+    vst1.32         {d16[0]}, [r0], r1      @ row 0
+    vst1.32         {d0[0]},  [r0], r1      @ row 1
+    vst1.32         {d16[1]}, [r0], r1      @ row 2
+    vst1.32         {d0[1]},  [r0]          @ row 3, last store needs no post-increment
+
+    bx              lr
+endfunc
+
 function x264_deblock_strength_neon
     ldr             ip,  [sp]
     vmov.i8         q8,  #0
diff --git a/common/deblock.c b/common/deblock.c
index 1d398add..46379ecc 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -741,11 +741,11 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
                                  int mvy_limit, int bframe );
 void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-#if ARCH_AARCH64
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#if ARCH_AARCH64 /* the luma intra prototypes below remain AArch64-only */
 void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #endif
@@ -875,11 +875,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
         pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
         pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
-#if ARCH_AARCH64
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+#if ARCH_AARCH64 /* the luma intra function pointers below remain AArch64-only */
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
 #endif
-- 
2.39.2