From 5c13589be828b524100c787057d6bef77898c657 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 25 Aug 2015 23:36:45 +0300
Subject: [PATCH] arm: Implement x264_decimate_score15/16/64_neon

checkasm timing       Cortex-A7      A8     A9
decimate_score15_c           764     736    535
decimate_score15_neon        487     494    453
decimate_score16_c           782     727    553
decimate_score16_neon        487     494    521
decimate_score64_c           2361    2597   2011
decimate_score64_neon        1017    802    785
---
 common/aarch64/quant-a.S |   1 +
 common/arm/quant-a.S     | 138 +++++++++++++++++++++++++++++++++++++++
 common/arm/quant.h       |   4 ++
 common/quant.c           |   6 +-
 4 files changed, 146 insertions(+), 3 deletions(-)

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 3e7e35e4..40909004 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *          Janne Grunau <janne-x264@jannau.net>
+ *          Martin Storsjo <martin@martin.st>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index e63170e3..7a2667f1 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -32,6 +32,14 @@ pmovmskb_byte:
 .byte 1,2,4,8,16,32,64,128
 .byte 1,2,4,8,16,32,64,128
 
+mask_2bit:                     // one 2-bit field per byte lane, repeating every 4 lanes; ANDed with the zero-compare result in decimate_score_1x
+.byte 3,12,48,192,3,12,48,192
+.byte 3,12,48,192,3,12,48,192
+
+mask_1bit:                     // one bit per byte lane, 128 down to 1, repeating every 8 lanes; ANDed with the zero-compare results in x264_decimate_score64_neon
+.byte 128,64,32,16,8,4,2,1
+.byte 128,64,32,16,8,4,2,1
+
 .text
 
 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
@@ -308,6 +316,136 @@ dequant_4x4_dc_rshift:
     bx          lr
 endfunc
 
+.macro decimate_score_1x size          // emits x264_decimate_scoreSIZE_neon: r0 = pointer to 16 int16 coefficients, score returned in r0
+function x264_decimate_score\size\()_neon
+    vld1.16     {q0, q1}, [r0, :128]   // load 16 int16 coefficients (128-bit alignment hint)
+    movrel      r3, mask_2bit          // r3 = address of the 2-bit field table
+    vmov.s8     q3,  #0x01             // q3 = 1 in every byte lane
+    vqmovn.s16  d0,  q0                // saturating narrow to s8, coefficients 0-7
+    vqmovn.s16  d1,  q1                // coefficients 8-15; q0 now holds all 16 as bytes
+    vqabs.s8    q2,  q0                // q2 = saturating absolute value per lane
+    vld1.8      {q8}, [r3, :128]       // q8 = mask_2bit
+    vceq.s8     q1,  q0,  #0           // q1 = 0xff in lanes whose coefficient is 0
+    vcgt.s8     q2,  q2,  q3           // q2 = 0xff in lanes whose absolute value exceeds 1
+    vand.u8     q1,  q1,  q8           // each zero lane keeps only its own 2-bit field
+    vshrn.u16   d4,  q2,  #4           // squeeze 16 flag bytes into 8; a set flag stays nonzero
+    vpadd.u8    d2,  d2,  d3           // merge fields of adjacent lanes: 16 bytes to 8
+    vpadd.u8    d4,  d4,  d4           // fold the exceeds-1 flags into the low 32 bits
+    vpadd.u8    d2,  d2,  d2           // low 32 bits = one 2-bit field per coefficient, coefficient 0 in bits 0-1
+    vmov.32     r2,  d4[0]             // r2 = exceeds-1 flags
+    vmov.32     r1,  d2[0]             // r1 = zero-coefficient mask
+    cmp         r2,  #0
+    beq         0f                     // no coefficient exceeds 1 in magnitude: go compute the score
+    mov         r0,  #9                // otherwise the result is the constant 9
+    bx          lr
+0:
+    mvns        r1,  r1                // invert: fields now mark nonzero coefficients; Z set if there are none
+    mov         r0,  #0                // score = 0 (mov without s suffix keeps the flags from mvns)
+    bxeq        lr                     // every coefficient is zero: return 0
+.ifc \size, 15
+    lsr         r1,  r1,  #2           // 15-coefficient variant only: discard the field of coefficient 0
+.endif
+    rbit        r1,  r1                // reverse bits so the lowest remaining coefficient sits at the top for clz
+    movrel      r3,  X(x264_decimate_table4)   // r3 = base of the byte table indexed by run length
+1:
+    clz         r2,  r1                // r2 = 2 * number of zero coefficients before the next nonzero one
+    lsl         r1,  r1,  r2           // shift that zero run out
+    lsr         r12, r2,  #1           // r12 = run length in coefficients
+    ldrb        r2,  [r3, r12]         // r2 = table entry for this run length
+    lsls        r1,  r1,  #2           // consume the nonzero coefficient; Z set once nothing is left
+    add         r0,  r0,  r2           // accumulate score (add leaves the flags from lsls intact)
+    bne         1b
+    bx          lr
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+function x264_decimate_score64_neon    // r0 = pointer to 64 int16 coefficients, score returned in r0
+    push        {lr}                   // lr is used as a scratch counter below
+    vld1.16     {q8,  q9},  [r0, :128]!        // load the 64 coefficients into q8-q15
+    vld1.16     {q10, q11}, [r0, :128]!
+    vld1.16     {q12, q13}, [r0, :128]!
+    vld1.16     {q14, q15}, [r0, :128]
+    movrel      r3, mask_1bit          // r3 = address of the 1-bit-per-lane table
+    vmov.s8     q3,  #0x01             // q3 = 1 in every byte lane
+    vqmovn.s16  d17, q8                // saturating narrow to s8; d halves are deliberately swapped: d17 = coefficients 0-7
+    vqmovn.s16  d16, q9                // d16 = coefficients 8-15
+    vqmovn.s16  d19, q10               // d19 = coefficients 16-23
+    vqmovn.s16  d18, q11               // d18 = coefficients 24-31
+    vqmovn.s16  d21, q12               // d21 = coefficients 32-39
+    vqmovn.s16  d20, q13               // d20 = coefficients 40-47
+    vqmovn.s16  d23, q14               // d23 = coefficients 48-55
+    vqmovn.s16  d22, q15               // d22 = coefficients 56-63
+    vqabs.s8    q12, q8                // q12-q15 = saturating absolute values of all 64 bytes
+    vqabs.s8    q13, q9
+    vqabs.s8    q14, q10
+    vqabs.s8    q15, q11
+    vld1.8      {q2}, [r3, :128]       // q2 = mask_1bit
+    vceq.s8     q8,  q8,  #0           // q8-q11 = 0xff in lanes whose coefficient is 0
+    vceq.s8     q9,  q9,  #0
+    vceq.s8     q10, q10, #0
+    vceq.s8     q11, q11, #0
+    vmax.s8     q12, q12, q13          // begin reducing to the lanewise largest absolute value
+    vmax.s8     q14, q14, q15
+    vand.u8     q8,  q8,  q2           // each zero lane keeps a single bit: 128 for the first lane of a d register, 1 for the last
+    vand.u8     q9,  q9,  q2
+    vand.u8     q10, q10, q2
+    vand.u8     q11, q11, q2
+    vmax.s8     q12, q12, q14          // q12 = lanewise max over all four absolute-value vectors
+    vpadd.u8    d18, d18, d19          // pairwise adds merge the per-lane bits; operand order puts higher coefficients in lower bytes
+    vpadd.u8    d19, d16, d17
+    vcgt.s8     q12, q12, q3           // q12 = 0xff in lanes where some absolute value exceeds 1
+    vpadd.u8    d22, d22, d23
+    vpadd.u8    d23, d20, d21
+    vshrn.u16   d24, q12, #4           // squeeze 16 flag bytes into 8; a set flag stays nonzero
+    vpadd.u8    d16, d22, d23
+    vpadd.u8    d17, d18, d19
+    vpadd.u8    d24, d24, d24          // fold the exceeds-1 flags into the low 32 bits
+    vpadd.u8    d16, d16, d17          // d16 = 64-bit mask with one bit per coefficient
+    vmov.32     r2,  d24[0]            // r2 = exceeds-1 flags
+    vmov        r12, r1,  d16          // r1 = mask for coefficients 0-31, r12 = mask for 32-63; bit 31 is the lowest index in each
+    cmp         r2,  #0
+    beq         0f                     // no coefficient exceeds 1 in magnitude: go compute the score
+    mov         r0,  #9                // otherwise the result is the constant 9
+    pop         {pc}
+0:
+    mvns        r1,  r1                // invert: bits now mark nonzero coefficients 0-31; Z set if there are none
+    mvn         r12, r12               // same for coefficients 32-63 (no s suffix, flags kept)
+    mov         r0,  #0                // score = 0
+    mov         lr,  #32               // lr = bits of the first word not yet consumed
+    movrel      r3,  X(x264_decimate_table8)   // r3 = base of the byte table indexed by run length
+    beq         2f                     // first word empty; relies on flags from mvns (assumes movrel preserves flags, macro not visible here)
+1:
+    clz         r2,  r1                // r2 = zero run before the next nonzero coefficient
+    lsl         r1,  r1,  r2           // shift that zero run out
+    sub         lr,  lr,  r2           // account for the skipped zeros
+    ldrb        r2,  [r3, r2]          // r2 = table entry for this run length
+    lsls        r1,  r1,  #1           // consume the nonzero coefficient; Z set once the word is exhausted
+    sub         lr,  lr,  #1           // account for the consumed coefficient (flags kept)
+    add         r0,  r0,  r2           // accumulate score
+    bne         1b
+2:
+    cmp         r12, #0
+    popeq       {pc}                   // nothing nonzero in coefficients 32-63: done
+
+    clz         r2,  r12               // zeros at the start of the second word
+    lsl         r1,  r12, r2           // r1 now walks the second word
+    add         r2,  r2,  lr           // run also includes the zeros left at the end of the first word
+    ldrb        r2,  [r3, r2]          // r2 = table entry for the combined run
+    lsls        r1,  r1,  #1           // consume the nonzero coefficient; Z set once nothing is left
+    add         r0,  r0,  r2
+    popeq       {pc}
+3:
+    clz         r2,  r1                // same loop as 1: but without the lr bookkeeping
+    lsl         r1,  r1,  r2
+    ldrb        r2,  [r3, r2]
+    lsls        r1,  r1,  #1
+    add         r0,  r0,  r2
+    bne         3b
+    pop         {pc}
+endfunc
 
 // int coeff_last( int16_t *l )
 function x264_coeff_last4_arm
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 78178e8d..2ec91ebe 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
 void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
+int x264_decimate_score15_neon( int16_t * );
+int x264_decimate_score16_neon( int16_t * );
+int x264_decimate_score64_neon( int16_t * );
+
 int x264_coeff_last4_arm( int16_t * );
 int x264_coeff_last8_arm( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index f8279a77..be000ec4 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -751,6 +751,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
         pf->denoise_dct = x264_denoise_dct_neon;
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -765,9 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_level_run8 = x264_coeff_level_run8_neon;
         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
-        pf->decimate_score15 = x264_decimate_score15_neon;
-        pf->decimate_score16 = x264_decimate_score16_neon;
-        pf->decimate_score64 = x264_decimate_score64_neon;
     }
 #endif
 
-- 
2.39.2