aarch64: NEON asm for intra chroma deblocking

author Janne Grunau <janne-x264@jannau.net>

Fri, 10 Oct 2014 08:29:15 +0000 (10:29 +0200)

committer Anton Mitrofanov <BugMaster@narod.ru>

Tue, 16 Dec 2014 17:40:02 +0000 (20:40 +0300)
author Janne Grunau <janne-x264@jannau.net>
Fri, 10 Oct 2014 08:29:15 +0000 (10:29 +0200)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:02 +0000 (20:40 +0300)
diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S

index 00be8e70637cd47a7ddedc6a013e78db7dbd9d8c..9bcd6ade2b9d5e005c570d16d2793a9566faf84b 100644 (file)
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -275,6 +275,173 @@ function x264_deblock_h_chroma_neon, export=1
      ret
  endfunc
  
+// Intra deblock prologue: returns from the enclosing function if alpha (w2) and beta (w3) are both 0, else splats them to v30/v31.
+.macro h264_loop_filter_start_intra
+    orr             w4,  w2,  w3               // w4 = alpha | beta (w4 is clobbered)
+    cbnz            w4,  1f                    // at least one threshold non-zero: go on and filter
+    ret                                        // both zero: return from the function that expanded this macro
+1:
+    dup             v30.16b, w2                // alpha, low byte of w2 replicated to all 16 lanes
+    dup             v31.16b, w3                // beta, low byte of w3 replicated to all 16 lanes
+.endm
+
+.macro h264_loop_filter_chroma_intra, width=16      // chroma intra edge filter; in: v16=p0 v17=q0 v18=p1 v19=q1 v30=alpha v31=beta; out: v16/v17
+    uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
+    uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
+    uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
+    cmhi            v26.16b, v30.16b, v26.16b  // alpha > abs(p0 - q0)  (unsigned compare, all-ones on true)
+    cmhi            v27.16b, v31.16b, v27.16b  // beta  > abs(p1 - p0)
+    cmhi            v28.16b, v31.16b, v28.16b  // beta  > abs(q1 - q0)
+    and             v26.16b, v26.16b, v27.16b  // combine the alpha test with the p-side beta test
+    and             v26.16b, v26.16b, v28.16b  // v26 = per-byte mask, set only where all three tests pass
+
+    ushll           v4.8h,   v18.8b,  #1        // 2*p1, low 8 bytes widened to 16 bit
+    ushll           v6.8h,   v19.8b,  #1        // 2*q1, low 8 bytes widened to 16 bit
+.ifc \width, 16
+    ushll2          v5.8h,   v18.16b, #1        // 2*p1, high 8 bytes (only assembled when width is 16)
+    ushll2          v7.8h,   v19.16b, #1        // 2*q1, high 8 bytes
+    uaddl2          v21.8h,  v16.16b, v19.16b   // p0 + q1, high 8 bytes
+    uaddl2          v23.8h,  v17.16b, v18.16b   // q0 + p1, high 8 bytes
+.endif
+    uaddl           v20.8h,  v16.8b,  v19.8b    // p0 + q1, low 8 bytes
+    uaddl           v22.8h,  v17.8b,  v18.8b    // q0 + p1, low 8 bytes
+    add             v20.8h,  v20.8h,  v4.8h     // 2*p1 + p0 + q1  (original note: mlal?)
+    add             v22.8h,  v22.8h,  v6.8h     // 2*q1 + q0 + p1
+.ifc \width, 16
+    add             v21.8h,  v21.8h,  v5.8h     // 2*p1 + p0 + q1, high 8 bytes
+    add             v23.8h,  v23.8h,  v7.8h     // 2*q1 + q0 + p1, high 8 bytes
+.endif
+    uqrshrn         v24.8b,  v20.8h,  #2        // new p0 = (2*p1 + p0 + q1 + 2) >> 2, narrowed to bytes
+    uqrshrn         v25.8b,  v22.8h,  #2        // new q0 = (2*q1 + q0 + p1 + 2) >> 2, narrowed to bytes
+.ifc \width, 16
+    uqrshrn2        v24.16b, v21.8h,  #2        // new p0, high 8 bytes
+    uqrshrn2        v25.16b, v23.8h,  #2        // new q0, high 8 bytes
+.endif
+    bit             v16.16b, v24.16b, v26.16b  // take the new p0 only where the mask is set, keep p0 elsewhere
+    bit             v17.16b, v25.16b, v26.16b  // same for q0; scratch used above: v4-v7, v20-v28
+.endm
+
+function x264_deblock_v_chroma_intra_neon, export=1    // intra filter of the two 16-byte rows around pix; args per the C prototype: x0=pix x1=stride w2=alpha w3=beta
+    h264_loop_filter_start_intra               // returns early when alpha|beta == 0; otherwise v30/v31 = alpha/beta
+
+    sub             x0,  x0,  x1, lsl #1       // x0 = pix - 2*stride
+    ld1             {v18.16b}, [x0], x1        // p1 row (pix - 2*stride), x0 += stride
+    ld1             {v16.16b}, [x0], x1        // p0 row (pix - stride)
+    ld1             {v17.16b}, [x0], x1        // q0 row (pix)
+    ld1             {v19.16b}, [x0]            // q1 row (pix + stride), no post-increment
+
+    h264_loop_filter_chroma_intra              // default width=16: v16/v17 become the filtered p0/q0
+
+    sub             x0,  x0,  x1, lsl #1       // x0 = pix - stride, back at the p0 row
+    st1             {v16.16b}, [x0], x1        // write p0 row
+    st1             {v17.16b}, [x0], x1        // write q0 row at pix
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_intra_mbaff_neon, export=1    // intra filter of the edge at pix over 4 rows; args per the C prototype: x0=pix x1=stride w2=alpha w3=beta
+    h264_loop_filter_start_intra               // returns early when alpha|beta == 0; otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes before pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes before pix
+    ld1             {v18.8b}, [x4], x1         // row 0: the 8 bytes straddling pix, x4 += stride
+    ld1             {v16.8b}, [x4], x1         // row 1
+    ld1             {v17.8b}, [x4], x1         // row 2
+    ld1             {v19.8b}, [x4], x1         // row 3
+
+    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29    // macro defined outside this view; presumably a 4x4 transpose of 16-bit units leaving p1/p0/q0/q1 in v18/v16/v17/v19 (v26-v29 scratch)
+
+    h264_loop_filter_chroma_intra width=8      // 8-byte variant: only the low halves of v16/v17 are computed
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // row 0: 16-bit lane 0 of p0 then q0 (4 bytes), x0 += stride
+    st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+    st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_intra_neon, export=1    // intra filter of the edge at pix over 8 rows; args per the C prototype: x0=pix x1=stride w2=alpha w3=beta
+    h264_loop_filter_start_intra               // returns early when alpha|beta == 0; otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes before pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes before pix
+    ld1             {v18.d}[0], [x4], x1       // rows 0-3: 8 bytes each into the low 64 bits, x4 += stride
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1       // rows 4-7: 8 bytes each into the high 64 bits
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29    // macro defined outside this view; presumably transposes 16-bit units so v18/v16/v17/v19 hold p1/p0/q0/q1 (v26-v29 scratch)
+
+    h264_loop_filter_chroma_intra              // default width=16: v16/v17 become the filtered p0/q0
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // per row: 16-bit lane n of p0 then q0 (4 bytes) at x0, x0 += stride
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 7, last of the 8 rows
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_422_intra_neon, export=1    // intra filter of the edge at pix over 16 rows, done as two 8-row passes; args per the C prototype: x0=pix x1=stride w2=alpha w3=beta
+    h264_loop_filter_start_intra               // returns early when alpha|beta == 0; otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes before pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes before pix
+    ld1             {v18.d}[0], [x4], x1       // pass 1, rows 0-3: 8 bytes each into the low 64 bits, x4 += stride
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1       // pass 1, rows 4-7: 8 bytes each into the high 64 bits
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29    // macro defined outside this view; presumably transposes 16-bit units so v18/v16/v17/v19 hold p1/p0/q0/q1 (v26-v29 scratch)
+
+    h264_loop_filter_chroma_intra              // default width=16: v16/v17 become the filtered p0/q0
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // per row: 16-bit lane n of p0 then q0 (4 bytes) at x0, x0 += stride
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 7; x0 now points at row 8
+
+    ld1             {v18.d}[0], [x4], x1       // pass 2, rows 8-11: x4 simply continues from where pass 1 stopped
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1       // pass 2, rows 12-15
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29    // same rearrangement as in pass 1
+
+    h264_loop_filter_chroma_intra              // v30/v31 still hold alpha/beta from the prologue
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // rows 8-15, same store pattern as pass 1
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 15, last row
+
+    ret
+endfunc
  
  //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
  //                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
diff --git a/common/deblock.c b/common/deblock.c

index 51f7782b22f3fc211311ec6673d425fe213d32a7..101d0bbdeeb09f13c86c7bf21fc23884e6430d85 100644 (file)
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -737,6 +737,12 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
  void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                   int mvy_limit, int bframe );
+#if ARCH_AARCH64
+void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#endif
  #endif
  
  void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -845,6 +851,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
          pf->deblock_luma[0] = x264_deblock_h_luma_neon;
          pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
          pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+#if ARCH_AARCH64
+        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
+        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
+        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
+        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+#endif
          pf->deblock_strength     = x264_deblock_strength_neon;
      }
  #endif
author	Janne Grunau <janne-x264@jannau.net>
	Fri, 10 Oct 2014 08:29:15 +0000 (10:29 +0200)
committer	Anton Mitrofanov <BugMaster@narod.ru>
	Tue, 16 Dec 2014 17:40:02 +0000 (20:40 +0300)
common/aarch64/deblock-a.S		patch \| blob \| history
common/deblock.c		patch \| blob \| history