]> git.sesse.net Git - x264/commitdiff
aarch64: x264_deblock_h_chroma_mbaff_neon
author Janne Grunau <janne-x264@jannau.net>
Mon, 13 Oct 2014 10:43:50 +0000 (12:43 +0200)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:03 +0000 (20:40 +0300)
deblock_chroma_420_mbaff_neon  2 times faster

common/aarch64/deblock-a.S
common/deblock.c

index 9bcd6ade2b9d5e005c570d16d2793a9566faf84b..9618665df71767d697499b42d6726778f284b6ad 100644 (file)
@@ -275,6 +275,60 @@ function x264_deblock_h_chroma_neon, export=1
     ret
 endfunc
 
+.macro h264_loop_filter_chroma8                 // tc-clamped chroma edge filter on 8 bytes. In: v16=p0 v17=q0 v18=p1 v19=q1, w2=alpha, w3=beta, v24=clamp bound tc (set up outside this macro). Out: v16/v17 = filtered p0/q0. Clobbers v4, v22, v24-v26, v28, v30.
+    dup             v22.8b,  w2                 // v22 = alpha broadcast to all 8 bytes
+    uxtl            v24.8h,  v24.8b             // zero-extend the low 8 tc bytes to halfwords
+    uabd            v26.8b,  v16.8b,  v17.8b    // abs(p0 - q0)
+    uxtl            v4.8h,   v17.8b             // v4 = q0 widened to 16 bit
+    uabd            v28.8b,  v18.8b,  v16.8b    // abs(p1 - p0)
+    usubw           v4.8h,   v4.8h,   v16.8b    // v4 = q0 - p0
+    sli             v24.8h,  v24.8h,  #8        // copy each tc byte into the high byte of its halfword, so one tc value covers two adjacent bytes
+    shl             v4.8h,   v4.8h,   #2        // v4 = 4 * (q0 - p0)
+    uabd            v30.8b,  v19.8b,  v17.8b    // abs(q1 - q0)
+    uaddw           v4.8h,   v4.8h,   v18.8b    // v4 += p1
+    cmhi            v26.8b,  v22.8b,  v26.8b    // abs(p0 - q0) < alpha
+    usubw           v4.8h,   v4.8h,   v19.8b    // v4 = 4 * (q0 - p0) + p1 - q1
+    dup             v22.8b,  w3                 // v22 reused: beta broadcast to all 8 bytes
+    rshrn           v4.8b,   v4.8h,   #3        // delta = (v4 + 4) >> 3, narrowed back to bytes
+    cmhi            v28.8b,  v22.8b,  v28.8b    // abs(p1 - p0) < beta
+    cmhi            v30.8b,  v22.8b,  v30.8b    // abs(q1 - q0) < beta
+    smin            v4.8b,   v4.8b,   v24.8b    // delta = min(delta, tc)
+    neg             v25.8b,  v24.8b             // v25 = -tc
+    and             v26.8b,  v26.8b,  v28.8b    // mask = (< alpha) & (p side < beta)
+    smax            v4.8b,   v4.8b,   v25.8b    // delta = max(delta, -tc)
+    and             v26.8b,  v26.8b,  v30.8b    // mask &= (q side < beta)
+    uxtl            v22.8h,  v17.8b             // v22 reused: q0 widened to 16 bit
+    and             v4.8b,   v4.8b,   v26.8b    // delta = 0 wherever a threshold test failed
+    uxtl            v28.8h,  v16.8b             // v28 reused: p0 widened to 16 bit
+    saddw           v28.8h,  v28.8h,  v4.8b     // p0 + delta (signed delta)
+    ssubw           v22.8h,  v22.8h,  v4.8b     // q0 - delta
+    sqxtun          v16.8b,  v28.8h             // new p0, saturated to 0..255
+    sqxtun          v17.8b,  v22.8h             // new q0, saturated to 0..255
+.endm
+
+function x264_deblock_h_chroma_mbaff_neon, export=1    // C prototype: void f(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0); filters 4 rows across a vertical chroma edge at pix
+    h264_loop_filter_start                      // macro defined elsewhere; presumably loads tc0 into v24 and may return early - TODO confirm. x4 is overwritten below, so tc0 has to be consumed here.
+
+    sub             x4,  x0,  #4                // x4 = pix - 4: start of the 8 bytes loaded per row
+    sub             x0,  x0,  #2                // x0 = pix - 2: start of the 4 bytes stored per row
+
+    ld1             {v18.8b}, [x4], x1          // row 0 (8 bytes), then advance x4 by stride
+    ld1             {v16.8b}, [x4], x1          // row 1
+    ld1             {v17.8b},  [x4], x1         // row 2
+    ld1             {v19.8b},  [x4]             // row 3 (no post-increment needed)
+
+    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31  // macro defined elsewhere; presumably a 4x4 transpose of 16-bit elements leaving v18=p1 v16=p0 v17=q0 v19=q1 with v28-v31 as scratch - TODO confirm. Each 16-bit element is presumably one interleaved U/V pair.
+
+    h264_loop_filter_chroma8                    // filters v16 (p0) and v17 (q0) in place
+
+    st2             {v16.h,v17.h}[0], [x0], x1  // row 0: halfword 0 of v16 then of v17 (4 bytes), advance by stride
+    st2             {v16.h,v17.h}[1], [x0], x1  // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1  // row 2
+    st2             {v16.h,v17.h}[3], [x0]      // row 3
+
+    ret
+endfunc
+
 .macro h264_loop_filter_start_intra
     orr             w4,  w2,  w3
     cmp             w4,  #0
index 101d0bbdeeb09f13c86c7bf21fc23884e6430d85..b0b8d2b625426845e8686e9673d912a7f6271681 100644 (file)
@@ -738,6 +738,7 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
 #if ARCH_AARCH64
+void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -852,6 +853,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
 #if ARCH_AARCH64
+        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;