git.sesse.net Git - x264/commitdiff
arm: Implement x264_denoise_dct_neon
author Martin Storsjö <martin@martin.st>
Tue, 25 Aug 2015 11:38:13 +0000 (14:38 +0300)
committer Henrik Gramner <henrik@gramner.com>
Sun, 11 Oct 2015 16:44:54 +0000 (18:44 +0200)
checkasm timing       Cortex-A7      A8     A9
denoise_dct_c                6604    5510   5858
denoise_dct_neon             1774    1139   1614

common/arm/quant-a.S
common/arm/quant.h
common/quant.c

index ad8d8f8421dd7dd482c7ad2215d504c3ce1c607a..e63170e30e0e95d50ea36d22a3ba9d21a40a579e 100644 (file)
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -404,3 +405,31 @@ function x264_coeff_last64_neon
     movlt       r0,  #0
     bx          lr
 endfunc
+
+function x264_denoise_dct_neon  // in: r0 = 16-bit coefs (rewritten in place), r1 = 32-bit running sums (updated), r2 = 16-bit offsets (read only), r3 = coef count
+1:  subs        r3,  r3,  #16  // 16 coefs per pass; flags feed the bgt below. Body runs before the test, so one group is always processed
+    vld1.16     {q0,  q1},  [r0]  // q0:q1 = 16 signed coefs; no writeback, r0 is advanced by the store at the end
+    vld1.32     {q12, q13}, [r1]!  // q12:q13 = sums for lanes 0-7, r1 += 32
+    vld1.32     {q14, q15}, [r1]  // q14:q15 = sums for lanes 8-15
+    sub         r1,  #32  // rewind r1 to the start of this group's sums for the stores below
+    vabs.s16    q8,  q0  // q8 = |coef| for lanes 0-7
+    vabs.s16    q9,  q1  // q9 = |coef| for lanes 8-15
+    vld1.16     {q2, q3}, [r2]!  // q2:q3 = 16 offsets, r2 += 32
+    vclt.s16    q10, q0,  #0  // q10 = all-ones mask in lanes where the original coef was negative
+    vclt.s16    q11, q1,  #0  // q11 = same sign mask for lanes 8-15
+    vaddw.u16   q12, q12, d16  // sums[0..3]   += |coef| (d16 = low half of q8, widened to 32 bit)
+    vaddw.u16   q13, q13, d17  // sums[4..7]   += |coef| (d17 = high half of q8)
+    vqsub.u16   q0,  q8,  q2  // q0 = max(|coef| - offset, 0) via unsigned saturating subtract
+    vqsub.u16   q1,  q9,  q3  // q1 = same for lanes 8-15
+    vaddw.u16   q14, q14, d18  // sums[8..11]  += |coef| (d18 = low half of q9)
+    vaddw.u16   q15, q15, d19  // sums[12..15] += |coef| (d19 = high half of q9)
+    vneg.s16    q8,  q0  // q8 = negated magnitude, candidate for negative inputs
+    vneg.s16    q9,  q1  // q9 = same for lanes 8-15
+    vbsl        q10, q8,  q0  // q10 = mask ? -magnitude : +magnitude, i.e. restore the original sign
+    vbsl        q11, q9,  q1  // q11 = same for lanes 8-15
+    vst1.32     {q12, q13}, [r1]!  // write back sums 0-7, r1 += 32
+    vst1.32     {q14, q15}, [r1]!  // write back sums 8-15, r1 now points at the next group
+    vst1.16     {q10, q11}, [r0]!  // write back the 16 denoised coefs, r0 += 32
+    bgt         1b  // loop while the count left by subs is still > 0
+    bx          lr  // return; no value is placed in r0 for the caller
+endfunc
index 8ea179a1c77c1de1c716ef32974d1e4a66892434..78178e8d5713c8fe6b310ff37e6598a8b6a550ef 100644 (file)
@@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
 #endif
index bc9e8d73de974e69ba9060b98269d20a04cb66b8..f8279a77fe0f104acf0ed590b75524fe8b073989 100644 (file)
@@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif