From aec81efd3fe43008551916aa6073eb0732a58210 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Mon, 17 Aug 2015 16:39:20 +0200
Subject: [PATCH] aarch64: Optimize various intra_predict asm functions

Make them at least as fast as the compiled C version (tested on
cortex-a53 vs. gcc 4.9.2).

                           C    NEON (before)   NEON (after)
intra_predict_4x4_dc:     260       335             260
intra_predict_4x4_dct:    210       265             200
intra_predict_8x8c_dc:    497       548             493
intra_predict_8x8c_v:     232       309             179 (arm64)
intra_predict_8x16c_dc:   795       830             790
---
 common/aarch64/predict-a.S | 131 +++++++++++++++++++++----------
 common/aarch64/predict-c.c |   7 +-
 common/aarch64/predict.h   |   3 +-
 3 files changed, 81 insertions(+), 60 deletions(-)
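
Notes for review: in the table above, "dct" is checkasm's shorthand for
the dc_top predictor. The least obvious change is the rewritten
x264_predict_8x8c_dc_neon: it builds all four quadrant DC values in two
vectors with addp/uzp1/uzp2 and applies the rounding right shifts while
narrowing (rshrn). As a reference, here is a minimal C sketch of the
four-quadrant chroma DC rule it implements. It follows the generic C
version in common/predict.c in spirit, but the helper below
(predict_8x8c_dc_ref) is illustrative only, not code from the tree;
FDEC_STRIDE is 32 in x264.

  #include <stdint.h>

  #define FDEC_STRIDE 32  /* x264's stride for the fdec reconstruction buffer */

  static void predict_8x8c_dc_ref( uint8_t *src )
  {
      /* s0/s1: sums over the left/right half of the top edge,
       * s2/s3: sums over the upper/lower half of the left edge. */
      int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
      for( int i = 0; i < 4; i++ )
      {
          s0 += src[i     - FDEC_STRIDE];
          s1 += src[i + 4 - FDEC_STRIDE];
          s2 += src[-1 +  i    * FDEC_STRIDE];
          s3 += src[-1 + (i+4) * FDEC_STRIDE];
      }
      /* Quadrants:  dc0 dc1   corner blocks average two edges,
       *             dc2 dc3   the other two use a single edge. */
      const int dc[4] = { ( s0 + s2 + 4 ) >> 3, ( s1 + 2 ) >> 2,
                          ( s3 + 2 ) >> 2,      ( s1 + s3 + 4 ) >> 3 };
      for( int y = 0; y < 8; y++ )
          for( int x = 0; x < 8; x++ )
              src[x + y*FDEC_STRIDE] = dc[(y >> 2)*2 + (x >> 2)];
  }

After the shuffles, v0 holds the dc0|dc1 row and v1 the dc2|dc3 row, so
the store tail writes v0 to rows 0-3 and v1 to rows 4-7 without further
permutes.
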
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
index a7dd2d1c..bcc3d7a8 100644
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -90,40 +90,36 @@ endfunc
 
 function x264_predict_4x4_dc_neon, export=1
     sub         x1,  x0,  #FDEC_STRIDE
-    sub         x2,  x0,  #1
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x1]
-    ld1r       {v1.8b},  [x2],  x7
-    ld1r       {v2.8b},  [x2],  x7
-    ld1r       {v3.8b},  [x2],  x7
-    ld1r       {v4.8b},  [x2],  x7
-    uaddlp      v0.4h,  v0.8b
-    uaddl       v1.8h,  v1.8b,  v2.8b
-    uaddl       v2.8h,  v3.8b,  v4.8b
-    addp        v0.4h,  v0.4h,  v0.4h
-    add         v1.4h,  v1.4h,  v2.4h
+    ldrb        w4,  [x0, #-1 + 0 * FDEC_STRIDE]
+    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
+    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
+    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
+    add         w4,  w4,  w5
+    ldr         s0, [x1]
+    add         w6,  w6,  w7
+    uaddlv      h0,  v0.8b
+    add         w4,  w4,  w6
     dup         v0.4h,  v0.h[0]
+    dup         v1.4h,  w4
     add         v0.4h,  v0.4h,  v1.4h
     rshrn       v0.8b,  v0.8h,  #3
-    str         s0, [x0], #FDEC_STRIDE
-    str         s0, [x0], #FDEC_STRIDE
-    str         s0, [x0], #FDEC_STRIDE
     str         s0, [x0]
+    str         s0, [x0, #1 * FDEC_STRIDE]
+    str         s0, [x0, #2 * FDEC_STRIDE]
+    str         s0, [x0, #3 * FDEC_STRIDE]
     ret
 endfunc
 
 function x264_predict_4x4_dc_top_neon, export=1
     sub         x1,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x1]
-    uaddlp      v0.4h,  v0.8b
-    addp        v0.4h,  v0.4h,  v0.4h
+    ldr         s0, [x1]
+    uaddlv      h0,  v0.8b
     dup         v0.4h,  v0.h[0]
     rshrn       v0.8b,  v0.8h,  #2
-    str         s0, [x0], #FDEC_STRIDE
-    str         s0, [x0], #FDEC_STRIDE
-    str         s0, [x0], #FDEC_STRIDE
     str         s0, [x0]
+    str         s0, [x0, #1 * FDEC_STRIDE]
+    str         s0, [x0, #2 * FDEC_STRIDE]
+    str         s0, [x0, #3 * FDEC_STRIDE]
     ret
 endfunc
 
@@ -456,30 +452,48 @@ function x264_predict_8x8c_dc_left_neon, export=1
 endfunc
 
 function x264_predict_8x8c_dc_neon, export=1
-    sub         x2,  x0,  #FDEC_STRIDE
-    sub         x3,  x0,  #1
     mov         x1,  #FDEC_STRIDE
-    ld1        {v2.8b},  [x2]
-    ldcol.8     v3,  x3,  x1
-    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
-    uaddlp      v0.4h,  v0.8b           // s0, s2
-    uaddlp      v1.4h,  v1.8b           // s1, s3
-    addp        v0.4h,  v0.4h,  v1.4h   // s0, s2, s1, s3
-    addp        v1.4h,  v0.4h,  v0.4h
-    rshrn       v2.8b,  v0.8h,  #2
+    sub         x2,  x0,  #FDEC_STRIDE
+    ldrb        w10, [x0, #0 * FDEC_STRIDE - 1]
+    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
+    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
+    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
+    add         w10, w10, w11
+    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
+    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
+    add         w12, w12, w13
+    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
+    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
+    add         w4,  w4,  w5
+    add         w6,  w6,  w7
+    add         w10, w10, w12, lsl #16
+    add         w4,  w4,  w6,  lsl #16
+    ld1        {v0.8b},  [x2]
+    add         x10, x10, x4,  lsl #32
+    uaddlp      v0.4h,  v0.8b           // s0, s1
+    mov         v1.d[0],  x10           // s2, s3
+    add         v3.4h,  v0.4h,  v1.4h
+    addp        v0.4h,  v0.4h,  v1.4h   // s0, s1, s2, s3
+    addp        v1.4h,  v3.4h,  v3.4h   // s0+s2, s1+s3, s0+s2, s1+s3
+    uzp2        v0.4h,  v0.4h,  v0.4h   // s1, s3, s1, s3
+    uzp1        v1.2d,  v1.2d,  v1.2d
+    uzp1        v0.2d,  v0.2d,  v0.2d
     rshrn       v3.8b,  v1.8h,  #3
-    dup         v5.8b,  v2.b[2]         // dc1
-    dup         v6.8b,  v3.b[1]         // dc2
-    dup         v4.8b,  v3.b[0]         // dc0
-    dup         v7.8b,  v2.b[3]         // dc3
-    trn1        v0.2s,  v4.2s,  v5.2s
-    trn1        v1.2s,  v7.2s,  v6.2s
+    rshrn       v2.8b,  v0.8h,  #2
+    uzp1        v0.8b,  v3.8b,  v2.8b
+    uzp2        v1.8b,  v2.8b,  v3.8b
 pred8x8c_dc_end:
-    add         x2,  x0,  x1,  lsl #2
-.rept 4
+    add         x2,  x0,  #2 * FDEC_STRIDE
+    add         x4,  x0,  #4 * FDEC_STRIDE
+    add         x5,  x0,  #6 * FDEC_STRIDE
     st1        {v0.8b},  [x0],  x1
-    st1        {v1.8b},  [x2],  x1
-.endr
+    st1        {v0.8b},  [x2],  x1
+    st1        {v0.8b},  [x0]
+    st1        {v0.8b},  [x2]
+    st1        {v1.8b},  [x4],  x1
+    st1        {v1.8b},  [x5],  x1
+    st1        {v1.8b},  [x4]
+    st1        {v1.8b},  [x5]
     ret
 endfunc
 
@@ -495,12 +509,10 @@ function x264_predict_8x8c_h_neon, export=1
     ret
 endfunc
 
-function x264_predict_8x8c_v_neon, export=1
-    sub         x0,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x0],  x7
-.rept 8
-    st1        {v0.8b},  [x0],  x7
+function x264_predict_8x8c_v_aarch64, export=1
+    ldr         x1,  [x0, #-FDEC_STRIDE]
+.irp c, 0,1,2,3,4,5,6,7
+    str         x1,  [x0, #\c * FDEC_STRIDE]
 .endr
     ret
 endfunc
 
@@ -661,20 +673,20 @@ function x264_predict_8x16c_p_neon, export=1
 endfunc
 
 function x264_predict_8x16c_dc_neon, export=1
-    sub         x3,  x0,  #FDEC_STRIDE
     mov         x1,  #FDEC_STRIDE
-    ld1        {v6.8b},  [x3]
+    sub         x10, x0,  #FDEC_STRIDE
     loadsum4    w2,  w3,  w4,  w5,  x0,  0
+    ld1        {v6.8b},  [x10]
+    loadsum4    w6,  w7,  w8,  w9,  x0,  4
     uaddlp      v6.4h,  v6.8b
     dup         v22.8h,  w2             // s2
-    loadsum4    w6,  w7,  w8,  w9,  x0,  4
-    addp        v6.4h,  v6.4h,  v6.4h   // s0, s1
     dup         v23.8h,  w6             // s3
     loadsum4    w2,  w3,  w4,  w5,  x0,  8
-    dup         v20.8h,  v6.h[0]        // s0
-    dup         v24.8h,  w2             // s4
+    addp        v6.4h,  v6.4h,  v6.4h   // s0, s1
     loadsum4    w6,  w7,  w8,  w9,  x0,  12
+    dup         v20.8h,  v6.h[0]        // s0
     dup         v21.8h,  v6.h[1]        // s1
+    dup         v24.8h,  w2             // s4
     dup         v25.8h,  w6             // s5
 
     ext         v16.16b, v20.16b, v21.16b, #8
@@ -692,10 +704,15 @@ function x264_predict_8x16c_dc_neon, export=1
     rshrn       v1.8b,  v1.8h,  #3
     rshrn       v2.8b,  v2.8h,  #3
     rshrn       v3.8b,  v3.8h,  #3
-.irp idx, 0, 1, 2, 3
+
+    add         x11, x0,  #4 * FDEC_STRIDE
+    add         x12, x0,  #8 * FDEC_STRIDE
+    add         x13, x0,  #12 * FDEC_STRIDE
 .rept 4
-    st1        {v\idx\().8b}, [x0],  x1
-.endr
+    st1        {v0.8b},  [x0],  x1
+    st1        {v1.8b},  [x11], x1
+    st1        {v2.8b},  [x12], x1
+    st1        {v3.8b},  [x13], x1
 .endr
     ret
 endfunc
diff --git a/common/aarch64/predict-c.c b/common/aarch64/predict-c.c
index 3556c3c9..1fbb3229 100644
--- a/common/aarch64/predict-c.c
+++ b/common/aarch64/predict-c.c
@@ -72,15 +72,18 @@ void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
 
 void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
 {
+#if !HIGH_BIT_DEPTH
+    if (cpu&X264_CPU_ARMV8) {
+        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;
+    }
+
     if (!(cpu&X264_CPU_NEON))
         return;
 
-#if !HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
     pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
     pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
 #endif // !HIGH_BIT_DEPTH
 }
diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h
index 4e0054c9..f1562346 100644
--- a/common/aarch64/predict.h
+++ b/common/aarch64/predict.h
@@ -29,10 +29,12 @@
 
 void x264_predict_4x4_h_aarch64( uint8_t *src );
 void x264_predict_4x4_v_aarch64( uint8_t *src );
+void x264_predict_8x8c_v_aarch64( uint8_t *src );
 
 // for the merged 4x4 intra sad/satd which expects unified suffix
 #define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
 #define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
 
 void x264_predict_4x4_dc_neon( uint8_t *src );
 void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
@@ -40,7 +42,6 @@ void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8c_dc_neon( uint8_t *src );
 void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
 void x264_predict_8x16c_v_neon( uint8_t *src );
 void x264_predict_8x16c_h_neon( uint8_t *src );
 void x264_predict_8x16c_dc_neon( uint8_t *src );
-- 
2.39.5
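
P.S.: x264_predict_8x8c_v is the one function here that turns into a plain
ARMv8 (_aarch64) routine instead of a NEON one, which is presumably why its
table row is marked "(arm64)". Vertical prediction only replicates the
8 bytes above the block, so a single 64-bit GPR load plus eight 64-bit
stores needs no SIMD at all; that is what the new X264_CPU_ARMV8 path in
x264_predict_8x8c_init_aarch64 dispatches to. A rough C equivalent, again
illustrative only and assuming x264's FDEC_STRIDE of 32:

  #include <stdint.h>
  #include <string.h>

  #define FDEC_STRIDE 32

  static void predict_8x8c_v_ref( uint8_t *src )
  {
      uint64_t top;                                /* the 8 neighbours above */
      memcpy( &top, src - FDEC_STRIDE, 8 );        /* one 64-bit load        */
      for( int y = 0; y < 8; y++ )
          memcpy( src + y*FDEC_STRIDE, &top, 8 );  /* one store per row      */
  }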