From b08403b5593307b919bfe5bfbd743da825326a4c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 25 Aug 2015 14:38:13 +0300
Subject: [PATCH] arm: Implement x264_denoise_dct_neon

checkasm timing       Cortex-A7      A8     A9
denoise_dct_c                6604    5510   5858
denoise_dct_neon             1774    1139   1614
---
 common/arm/quant-a.S | 29 +++++++++++++++++++++++++++++
 common/arm/quant.h   |  2 ++
 common/quant.c       |  2 +-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index ad8d8f84..e63170e3 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -404,3 +405,31 @@ function x264_coeff_last64_neon
     movlt       r0,  #0
     bx          lr
 endfunc
+
+function x264_denoise_dct_neon          // args per the C prototype: r0 = dctcoef *, r1 = uint32_t *, r2 = udctcoef *, r3 = int count
+1:  subs        r3,  r3,  #16           // 16 coefficients per pass; body always runs once; flags are consumed by the bgt below
+    vld1.16     {q0,  q1},  [r0]        // q0-q1 = 16 signed 16-bit coefficients (r0 is only advanced by the final store)
+    vld1.32     {q12, q13}, [r1]!       // q12-q13 = 32-bit accumulators 0-7
+    vld1.32     {q14, q15}, [r1]        // q14-q15 = 32-bit accumulators 8-15
+    sub         r1,  #32                // rewind r1 to accumulator 0 so the stores below write the same 16 slots
+    vabs.s16    q8,  q0                 // q8-q9 = |coefficient|
+    vabs.s16    q9,  q1
+    vld1.16     {q2, q3}, [r2]!         // q2-q3 = 16 unsigned 16-bit values from r2 (subtracted below); r2 advances
+    vclt.s16    q10, q0,  #0            // q10-q11 = all-ones lanes where the coefficient is negative (sign mask)
+    vclt.s16    q11, q1,  #0
+    vaddw.u16   q12, q12, d16           // accumulators += |coefficient|, widened u16 -> u32 (d16-d19 are the halves of q8-q9)
+    vaddw.u16   q13, q13, d17
+    vqsub.u16   q0,  q8,  q2            // q0-q1 = |coefficient| - r2 value, unsigned saturating (clamps at 0)
+    vqsub.u16   q1,  q9,  q3
+    vaddw.u16   q14, q14, d18
+    vaddw.u16   q15, q15, d19
+    vneg.s16    q8,  q0                 // q8-q9 = negated reduced magnitudes
+    vneg.s16    q9,  q1
+    vbsl        q10, q8,  q0            // per lane: take the negated value where the sign mask is set, else the positive one
+    vbsl        q11, q9,  q1
+    vst1.32     {q12, q13}, [r1]!       // write back all 16 accumulators; r1 ends up on the next group of 16
+    vst1.32     {q14, q15}, [r1]!
+    vst1.16     {q10, q11}, [r0]!       // overwrite the coefficients in place and advance r0
+    bgt         1b                      // loop while the remaining count is > 0; presumably r3 is a multiple of 16 - confirm with callers
+    bx          lr
+endfunc
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 8ea179a1..78178e8d 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
 #endif
diff --git a/common/quant.c b/common/quant.c
index bc9e8d73..f8279a77 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 
-- 
2.39.2