]> git.sesse.net Git - x264/commitdiff
aarch64: NEON asm for intra chroma deblocking
author Janne Grunau <janne-x264@jannau.net>
Fri, 10 Oct 2014 08:29:15 +0000 (10:29 +0200)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:02 +0000 (20:40 +0300)
deblock_h_chroma_420_intra, deblock_h_chroma_422_intra and
x264_deblock_h_chroma_intra_mbaff_neon are ~3 times faster than C.
deblock_chroma_intra[1] is ~4 times faster than C.

common/aarch64/deblock-a.S
common/deblock.c

index 00be8e70637cd47a7ddedc6a013e78db7dbd9d8c..9bcd6ade2b9d5e005c570d16d2793a9566faf84b 100644 (file)
@@ -275,6 +275,173 @@ function x264_deblock_h_chroma_neon, export=1
     ret
 endfunc
 
+.macro h264_loop_filter_start_intra                // in: w2 = alpha, w3 = beta; clobbers w4, flags, v30, v31
+    orr             w4,  w2,  w3               // w4 = alpha | beta
+    cmp             w4,  #0
+    b.ne            1f                         // at least one threshold is non-zero: go on to the setup
+    ret                                        // alpha == 0 and beta == 0: return from the function that expanded this macro
+1:
+    dup             v30.16b, w2                // alpha, low byte of w2 copied to all 16 byte lanes
+    dup             v31.16b, w3                // beta, low byte of w3 copied to all 16 byte lanes
+.endm
+
+.macro h264_loop_filter_chroma_intra, width=16     // in: v16=p0 v17=q0 v18=p1 v19=q1 v30=alpha v31=beta; out: v16, v17; clobbers v4-v7, v20-v28
+    uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
+    uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
+    uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
+    cmhi            v26.16b, v30.16b, v26.16b  // abs(p0 - q0) < alpha, unsigned compare, all-ones byte where true
+    cmhi            v27.16b, v31.16b, v27.16b  // abs(p1 - p0) < beta
+    cmhi            v28.16b, v31.16b, v28.16b  // abs(q1 - q0) < beta
+    and             v26.16b, v26.16b, v27.16b
+    and             v26.16b, v26.16b, v28.16b  // v26 = per-byte filter mask: all three conditions hold
+
+    ushll           v4.8h,   v18.8b,  #1        // 2 * p1, low 8 lanes widened to 16 bit
+    ushll           v6.8h,   v19.8b,  #1        // 2 * q1, low 8 lanes widened to 16 bit
+.ifc \width, 16
+    ushll2          v5.8h,   v18.16b, #1        // 2 * p1, high 8 lanes
+    ushll2          v7.8h,   v19.16b, #1        // 2 * q1, high 8 lanes
+    uaddl2          v21.8h,  v16.16b, v19.16b   // p0 + q1, high 8 lanes
+    uaddl2          v23.8h,  v17.16b, v18.16b   // q0 + p1, high 8 lanes
+.endif
+    uaddl           v20.8h,  v16.8b,  v19.8b    // p0 + q1, low 8 lanes
+    uaddl           v22.8h,  v17.8b,  v18.8b    // q0 + p1, low 8 lanes
+    add             v20.8h,  v20.8h,  v4.8h     // 2 * p1 + p0 + q1 (mlal?)
+    add             v22.8h,  v22.8h,  v6.8h     // 2 * q1 + q0 + p1
+.ifc \width, 16
+    add             v21.8h,  v21.8h,  v5.8h     // same sum for p0, high 8 lanes
+    add             v23.8h,  v23.8h,  v7.8h     // same sum for q0, high 8 lanes
+.endif
+    uqrshrn         v24.8b,  v20.8h,  #2        // new p0 = (2 * p1 + p0 + q1 + 2) >> 2, narrowed with unsigned saturation
+    uqrshrn         v25.8b,  v22.8h,  #2        // new q0 = (2 * q1 + q0 + p1 + 2) >> 2, narrowed with unsigned saturation
+.ifc \width, 16
+    uqrshrn2        v24.16b, v21.8h,  #2        // new p0, high 8 lanes
+    uqrshrn2        v25.16b, v23.8h,  #2        // new q0, high 8 lanes
+.endif
+    bit             v16.16b, v24.16b, v26.16b  // take the new p0 only in bytes where the mask is set
+    bit             v17.16b, v25.16b, v26.16b  // take the new q0 only in bytes where the mask is set
+.endm
+
+function x264_deblock_v_chroma_intra_neon, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta
+    h264_loop_filter_start_intra               // returns early if alpha | beta == 0, otherwise v30/v31 = alpha/beta
+
+    sub             x0,  x0,  x1, lsl #1       // x0 = pix - 2 * stride, the p1 row
+    ld1             {v18.16b}, [x0], x1        // p1 row, 16 bytes
+    ld1             {v16.16b}, [x0], x1        // p0 row
+    ld1             {v17.16b}, [x0], x1        // q0 row
+    ld1             {v19.16b}, [x0]            // q1 row, no post-increment so x0 stays on this row
+
+    h264_loop_filter_chroma_intra              // width defaults to 16: filters v16 (p0) and v17 (q0) in place
+
+    sub             x0,  x0,  x1, lsl #1       // step back two rows from the q1 row to the p0 row
+    st1             {v16.16b}, [x0], x1        // write back the filtered p0 row
+    st1             {v17.16b}, [x0], x1        // write back the filtered q0 row
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_intra_mbaff_neon, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta; filters 4 rows
+    h264_loop_filter_start_intra               // returns early if alpha | beta == 0, otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes left of pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes left of pix
+    ld1             {v18.8b}, [x4], x1         // row 0: 8 bytes, handled below as four 16-bit units p1 p0 q0 q1
+    ld1             {v16.8b}, [x4], x1         // row 1 (each unit is presumably an interleaved U/V pair, TODO confirm)
+    ld1             {v17.8b}, [x4], x1         // row 2
+    ld1             {v19.8b}, [x4], x1         // row 3
+
+    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29 // rows to columns: v18=p1 v16=p0 v17=q0 v19=q1; v26-v29 look like scratch
+
+    h264_loop_filter_chroma_intra width=8      // only the low 8 byte lanes are computed; no comma after the macro name, like the other invocations
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // row 0: store the p0, q0 pair (4 bytes)
+    st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+    st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_intra_neon, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta; filters 8 rows
+    h264_loop_filter_start_intra               // returns early if alpha | beta == 0, otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes left of pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes left of pix
+    ld1             {v18.d}[0], [x4], x1       // row 0: 8 bytes into the low half of v18
+    ld1             {v16.d}[0], [x4], x1       // row 1
+    ld1             {v17.d}[0], [x4], x1       // row 2
+    ld1             {v19.d}[0], [x4], x1       // row 3
+    ld1             {v18.d}[1], [x4], x1       // row 4: 8 bytes into the high half of v18
+    ld1             {v16.d}[1], [x4], x1       // row 5
+    ld1             {v17.d}[1], [x4], x1       // row 6
+    ld1             {v19.d}[1], [x4], x1       // row 7
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29 // rows to columns of 16-bit units: v18=p1 v16=p0 v17=q0 v19=q1; v26-v29 look like scratch
+
+    h264_loop_filter_chroma_intra              // width defaults to 16: filters v16 (p0) and v17 (q0) in place
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // row 0: store the p0, q0 pair (4 bytes)
+    st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+    st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+    st2             {v16.h,v17.h}[4], [x0], x1 // row 4
+    st2             {v16.h,v17.h}[5], [x0], x1 // row 5
+    st2             {v16.h,v17.h}[6], [x0], x1 // row 6
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 7
+
+    ret
+endfunc
+
+function x264_deblock_h_chroma_422_intra_neon, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta; filters 16 rows in two passes of 8
+    h264_loop_filter_start_intra               // returns early if alpha | beta == 0, otherwise v30/v31 = alpha/beta
+
+    sub             x4,  x0,  #4               // x4 = load pointer, 4 bytes left of pix
+    sub             x0,  x0,  #2               // x0 = store pointer, 2 bytes left of pix
+    ld1             {v18.d}[0], [x4], x1       // first pass, row 0: 8 bytes into the low half of v18
+    ld1             {v16.d}[0], [x4], x1       // row 1
+    ld1             {v17.d}[0], [x4], x1       // row 2
+    ld1             {v19.d}[0], [x4], x1       // row 3
+    ld1             {v18.d}[1], [x4], x1       // row 4: 8 bytes into the high half of v18
+    ld1             {v16.d}[1], [x4], x1       // row 5
+    ld1             {v17.d}[1], [x4], x1       // row 6
+    ld1             {v19.d}[1], [x4], x1       // row 7
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29 // rows to columns of 16-bit units: v18=p1 v16=p0 v17=q0 v19=q1; v26-v29 look like scratch
+
+    h264_loop_filter_chroma_intra              // width defaults to 16: filters v16 (p0) and v17 (q0) in place
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // row 0: store the p0, q0 pair (4 bytes)
+    st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+    st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+    st2             {v16.h,v17.h}[4], [x0], x1 // row 4
+    st2             {v16.h,v17.h}[5], [x0], x1 // row 5
+    st2             {v16.h,v17.h}[6], [x0], x1 // row 6
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 7, x0 now points at row 8
+
+    ld1             {v18.d}[0], [x4], x1       // second pass, row 8: x4 carried over from the first eight loads
+    ld1             {v16.d}[0], [x4], x1       // row 9
+    ld1             {v17.d}[0], [x4], x1       // row 10
+    ld1             {v19.d}[0], [x4], x1       // row 11
+    ld1             {v18.d}[1], [x4], x1       // row 12
+    ld1             {v16.d}[1], [x4], x1       // row 13
+    ld1             {v17.d}[1], [x4], x1       // row 14
+    ld1             {v19.d}[1], [x4], x1       // row 15
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29 // same rearrangement as in the first pass
+
+    h264_loop_filter_chroma_intra              // v30/v31 still hold alpha/beta from the start macro
+
+    st2             {v16.h,v17.h}[0], [x0], x1 // row 8: store the p0, q0 pair (4 bytes)
+    st2             {v16.h,v17.h}[1], [x0], x1 // row 9
+    st2             {v16.h,v17.h}[2], [x0], x1 // row 10
+    st2             {v16.h,v17.h}[3], [x0], x1 // row 11
+    st2             {v16.h,v17.h}[4], [x0], x1 // row 12
+    st2             {v16.h,v17.h}[5], [x0], x1 // row 13
+    st2             {v16.h,v17.h}[6], [x0], x1 // row 14
+    st2             {v16.h,v17.h}[7], [x0], x1 // row 15
+
+    ret
+endfunc
 
 //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
 //                                int8_t ref[2][X264_SCAN8_LUMA_SIZE],
index 51f7782b22f3fc211311ec6673d425fe213d32a7..101d0bbdeeb09f13c86c7bf21fc23884e6430d85 100644 (file)
@@ -737,6 +737,12 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+#if ARCH_AARCH64 // the NEON intra chroma deblocking routines below are declared for AArch64 builds only
+void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // asm in common/aarch64/deblock-a.S, filters 4 rows
+void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // asm in common/aarch64/deblock-a.S, filters 8 rows
+void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // asm in common/aarch64/deblock-a.S, filters 16 rows
+void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // asm in common/aarch64/deblock-a.S, filters one 16-byte-wide row boundary
+#endif // ARCH_AARCH64
 #endif
 
 void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -845,6 +851,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+#if ARCH_AARCH64
+        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
+        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
+        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
+        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+#endif
         pf->deblock_strength     = x264_deblock_strength_neon;
     }
 #endif