- ld1 {v2.8b}, [x2]
- ldcol.8 v3, x3, x1
- transpose v0.2s, v1.2s, v2.2s, v3.2s
- uaddlp v0.4h, v0.8b // s0, s2
- uaddlp v1.4h, v1.8b // s1, s3
- addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
- addp v1.4h, v0.4h, v0.4h
- rshrn v2.8b, v0.8h, #2
+ sub x2, x0, #FDEC_STRIDE // x2 -> row directly above x0 (top neighbours)
+ ldrb w10, [x0, #0 * FDEC_STRIDE - 1] // l0: byte left of row 0
+ ldrb w11, [x0, #1 * FDEC_STRIDE - 1] // l1: byte left of row 1
+ ldrb w12, [x0, #2 * FDEC_STRIDE - 1] // l2: byte left of row 2
+ ldrb w13, [x0, #3 * FDEC_STRIDE - 1] // l3: byte left of row 3
+ add w10, w10, w11 // w10 = l0+l1
+ ldrb w4, [x0, #4 * FDEC_STRIDE - 1] // l4: byte left of row 4
+ ldrb w5, [x0, #5 * FDEC_STRIDE - 1] // l5: byte left of row 5
+ add w12, w12, w13 // w12 = l2+l3
+ ldrb w6, [x0, #6 * FDEC_STRIDE - 1] // l6: byte left of row 6
+ ldrb w7, [x0, #7 * FDEC_STRIDE - 1] // l7: byte left of row 7
+ add w4, w4, w5 // w4 = l4+l5
+ add w6, w6, w7 // w6 = l6+l7
+ add w10, w10, w12, lsl #16 // w10 = (l2+l3)<<16 | (l0+l1)
+ add w4, w4, w6, lsl #16 // w4  = (l6+l7)<<16 | (l4+l5)
+ ld1 {v0.8b}, [x2] // v0 = 8 top-neighbour pixels t0..t7
+ add x10, x10, x4, lsl #32 // x10 = {l0+l1, l2+l3, l4+l5, l6+l7} as 4x u16
+ uaddlp v0.4h, v0.8b // s0, s1 (halves: t0+t1, t2+t3, t4+t5, t6+t7)
+ mov v1.d[0], x10 // s2, s3 (halves: the four left pair sums)
+ add v3.4h, v0.4h, v1.4h // elementwise top-pair + left-pair sums
+ addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3 (each a 4-pixel neighbour sum)
+ addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
+ uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3 (odd lanes of v0)
+ uzp1 v1.2d, v1.2d, v1.2d // duplicate d[0] into both halves of v1
+ uzp1 v0.2d, v0.2d, v0.2d // duplicate d[0] into both halves of v0; presumably averaged/rounded downstream — tail not visible here