From 3c66591e859045ef79a7131b991a5f20c80ffbb4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Tue, 25 Aug 2015 14:38:15 +0300
Subject: [PATCH] arm: Implement x264_deblock_h_chroma_422_neon

checkasm timing              Cortex-A7     A8     A9
deblock_h_chroma_422_c            6953   6269   5145
deblock_h_chroma_422_neon         3905   2569   2551
---
 common/arm/deblock-a.S | 18 ++++++++++++++++++
 common/deblock.c       |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 446e6780..a300220f 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mans Rullgard
+ *          Martin Storsjo
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -261,6 +262,7 @@ function x264_deblock_h_chroma_neon
     h264_loop_filter_start
 
     sub r0, r0, #4
+deblock_h_chroma:
     vld1.8 {d18}, [r0], r1
     vld1.8 {d16}, [r0], r1
     vld1.8 {d0}, [r0], r1
@@ -290,6 +292,22 @@ function x264_deblock_h_chroma_neon
     bx lr
 endfunc
 
+function x264_deblock_h_chroma_422_neon
+    h264_loop_filter_start
+    push {lr}
+    sub r0, r0, #4
+    add r1, r1, r1
+    bl deblock_h_chroma
+    ldr ip, [sp, #4]
+    ldr ip, [ip]
+    vdup.32 d24, ip
+    sub r0, r0, r1, lsl #3
+    add r0, r0, r1, lsr #1
+    sub r0, r0, #2
+    pop {lr}
+    b deblock_h_chroma
+endfunc
+
 function x264_deblock_strength_neon
     ldr ip, [sp]
     vmov.i8 q8, #0
diff --git a/common/deblock.c b/common/deblock.c
index 374e2936..83bda620 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -739,8 +739,8 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
-#if ARCH_AARCH64
 void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#if ARCH_AARCH64
 void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -873,11 +873,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
 #if ARCH_AARCH64
         pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
-        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
-- 
2.39.2