git.sesse.net Git - x264/commitdiff
aarch64: cabac_encode_{decision,bypass,terminal}_asm
author Janne Grunau <janne-x264@jannau.net>
Tue, 18 Nov 2014 23:33:55 +0000 (00:33 +0100)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:12 +0000 (20:40 +0300)
benchmarks on a Nexus 9 (nvidia denver):
101.3 cycles in x264_cabac_encode_decision_c,   67105369 runs, 3495 skips
 97.3 cycles in x264_cabac_encode_decision_asm, 67105493 runs, 3371 skips
132.8 cycles in x264_cabac_encode_terminal_c,    1046950 runs, 1626 skips
116.1 cycles in x264_cabac_encode_terminal_asm,  1048424 runs, 152 skips
 92.4 cycles in x264_cabac_encode_bypass_c,     16776192 runs, 1024 skips
 89.6 cycles in x264_cabac_encode_bypass_asm,   16776453 runs, 763 skips

Cycle counts are not as stable as one would like. The dynamic code
optimisation seems to produce different results for small changes in a
binary. Repeated runs with the same binary produce stable results
though (ignoring the first run).

Makefile
common/aarch64/asm-offsets.c [new file with mode: 0644]
common/aarch64/asm-offsets.h [new file with mode: 0644]
common/aarch64/cabac-a.S [new file with mode: 0644]
common/cabac.h
tools/checkasm.c

index f29354217aede58ca020a628d81dad5d276060d3..12c74e4441ff218acccb51e68af83d77f48fac3c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -129,13 +129,15 @@ endif
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
 ASMSRC += common/aarch64/bitstream-a.S \
+          common/aarch64/cabac-a.S     \
           common/aarch64/dct-a.S     \
           common/aarch64/deblock-a.S \
           common/aarch64/mc-a.S      \
           common/aarch64/pixel-a.S   \
           common/aarch64/predict-a.S \
           common/aarch64/quant-a.S
-SRCS   += common/aarch64/mc-c.c      \
+SRCS   += common/aarch64/asm-offsets.c \
+          common/aarch64/mc-c.c        \
           common/aarch64/predict-c.c
 OBJASM  = $(ASMSRC:%.S=%.o)
 endif
diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c
new file mode 100644 (file)
index 0000000..c0630d4
--- /dev/null
@@ -0,0 +1,42 @@
+/*****************************************************************************
+ * asm-offsets.c: check asm offsets for aarch64
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "asm-offsets.h"
+
+#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m /* compile-time check that offsetof(s, m) == o */ \
+{ \
+    int m_##m[2 * (offsetof(s, m) == o) - 1]; /* array size is 1 on a match and -1 (rejected by the compiler) on a mismatch */ \
+}
+/* One check per x264_cabac_t member whose offset is hard-coded in asm-offsets.h. */
+X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW);
+X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
+X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
+X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
+X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
+X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
+X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
+X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
+X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
diff --git a/common/aarch64/asm-offsets.h b/common/aarch64/asm-offsets.h
new file mode 100644 (file)
index 0000000..b35baae
--- /dev/null
@@ -0,0 +1,39 @@
+/*****************************************************************************
+ * asm-offsets.h: asm offsets for aarch64
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_ASM_OFFSETS_H
+#define X264_AARCH64_ASM_OFFSETS_H
+/* Byte offsets of x264_cabac_t members used by cabac-a.S; asm-offsets.c checks each one against offsetof(). */
+#define CABAC_I_LOW                 0x00    /* x264_cabac_t.i_low */
+#define CABAC_I_RANGE               0x04    /* x264_cabac_t.i_range */
+#define CABAC_I_QUEUE               0x08    /* x264_cabac_t.i_queue */
+#define CABAC_I_BYTES_OUTSTANDING   0x0c    /* x264_cabac_t.i_bytes_outstanding */
+#define CABAC_P_START               0x10    /* x264_cabac_t.p_start */
+#define CABAC_P                     0x18    /* x264_cabac_t.p */
+#define CABAC_P_END                 0x20    /* x264_cabac_t.p_end */
+#define CABAC_F8_BITS_ENCODED       0x30    /* x264_cabac_t.f8_bits_encoded */
+#define CABAC_STATE                 0x34    /* x264_cabac_t.state */
+
+#endif
diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
new file mode 100644 (file)
index 0000000..abd8b6a
--- /dev/null
@@ -0,0 +1,122 @@
+/*****************************************************************************
+ * cabac-a.S: aarch64 cabac
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * Authors: Janne Grunau <janne-x264@jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "asm-offsets.h"
+
+// w11 holds x264_cabac_t.i_low
+// w12 holds x264_cabac_t.i_range
+
+function x264_cabac_encode_decision_asm, export=1   // x0: x264_cabac_t pointer, w1: index of the state byte, w2: b
+    movrel      x8,  X(x264_cabac_range_lps)    // x8 = base address of x264_cabac_range_lps
+    movrel      x9,  X(x264_cabac_transition)   // x9 = base address of x264_cabac_transition
+    add         w10, w1, #CABAC_STATE           // w10 = byte offset of the selected state entry
+    ldrb        w3,  [x0,  x10]         // i_state
+    ldr         w12, [x0,  #CABAC_I_RANGE]      // w12 = i_range
+    and         x4,  x3,  #~1                   // i_state with bit 0 cleared
+    asr         w5,  w12, #6                    // i_range >> 6
+    add         x8,  x8,  x4, lsl #1            // x8 += (i_state & ~1) * 2
+    sub         w5,  w5,  #4                    // w5 = (i_range >> 6) - 4
+    eor         w6,  w2,  w3            // b ^ i_state
+    ldrb        w4,  [x8,  x5]          // i_range_lps
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    sub         w12, w12, w4                    // i_range -= i_range_lps
+    tbz         w6,  #0,  1f            // skip the next two instructions if ((b ^ i_state) & 1) == 0
+    add         w11, w11, w12                   // bit 0 differs: i_low += i_range
+    mov         w12,  w4                        // ... and i_range = i_range_lps
+1:
+    orr         w4,  w2,  w3, lsl #1            // w4 = (i_state << 1) | b
+    ldrb        w9,  [x9,  x4]                  // new state byte read from x264_cabac_transition
+    strb        w9,  [x0,  x10]    // i_state
+
+cabac_encode_renorm:                            // also entered from x264_cabac_encode_terminal_asm with w11 = i_low, w12 = i_range
+    clz         w5,  w12                        // leading zero count of i_range
+    ldr         w2,  [x0, #CABAC_I_QUEUE]       // w2 = i_queue
+    sub         w5,  w5,  #23                   // shift = clz(i_range) - 23
+    lsl         w12, w12, w5                    // i_range <<= shift
+    lsl         w11, w11, w5                    // i_low <<= shift
+2:                                              // NOTE(review): no branch in the visible code appears to target this label
+    adds        w2,  w2,  w5                    // i_queue += shift, setting the flags tested by b.lt
+    str         w12, [x0, #CABAC_I_RANGE]       // write back i_range
+    b.lt        0f                              // i_queue < 0: only store i_low and i_queue, then return
+cabac_putbyte:                                  // also entered from x264_cabac_encode_bypass_asm with w11 = i_low, w2 = i_queue
+    mov         w13, #0x400
+    add         w12, w2,  #10                   // w12 reused as scratch: i_queue + 10
+    lsl         w13, w13, w2                    // 0x400 << i_queue
+    asr         w4,  w11, w12           // out
+    sub         w2,  w2,  #8                    // i_queue -= 8
+    sub         w13, w13, #1                    // mask = (0x400 << previous i_queue) - 1
+    subs        w5,  w4,  #0xff                 // compare out with 0xff; flags are consumed by b.ne below
+    and         w11, w11, w13                   // i_low &= mask
+    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]   // w6 = i_bytes_outstanding
+    str         w2,  [x0, #CABAC_I_QUEUE]       // write back i_queue
+    b.ne        1f                              // out != 0xff: go write bytes to the output
+
+    add         w6,  w6,  #1                    // out == 0xff: only count one more outstanding byte
+    str         w11, [x0, #CABAC_I_LOW]         // write back i_low
+    str         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]   // write back i_bytes_outstanding
+    ret
+
+1:
+    ldr         x7,  [x0, #CABAC_P]             // x7 = p
+    asr         w5,  w4,  #8            // carry
+    ldrb        w8,  [x7, #-1]                  // load p[-1]
+    add         w8,  w8,  w5                    // add carry to it
+    sub         w5,  w5,  #1                    // w5 = carry - 1, the value stored for each outstanding byte
+    strb        w8,  [x7, #-1]                  // store p[-1]
+    cbz         w6,  3f                         // no outstanding bytes: skip the loop
+2:
+    subs        w6,  w6,  #1                    // one outstanding byte fewer, setting the flags tested by b.gt
+    strb        w5,  [x7],  #1                  // *p++ = carry - 1 (low byte)
+    b.gt        2b                              // repeat while the counter is still positive
+3:
+    strb        w4,  [x7],  #1                  // *p++ = low byte of out
+    str         wzr, [x0, #CABAC_I_BYTES_OUTSTANDING]   // i_bytes_outstanding = 0
+    str         x7,  [x0, #CABAC_P]             // write back p
+0:
+    str         w11, [x0, #CABAC_I_LOW]         // write back i_low
+    str         w2,  [x0, #CABAC_I_QUEUE]       // write back i_queue
+    ret
+endfunc
+
+function x264_cabac_encode_bypass_asm, export=1    // x0: x264_cabac_t pointer, w1: value that is ANDed with i_range
+    ldr         w12, [x0, #CABAC_I_RANGE]       // w12 = i_range
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    ldr         w2,  [x0, #CABAC_I_QUEUE]       // w2 = i_queue
+    and         w1,  w1,  w12                   // w1 &= i_range
+    add         w11, w1,  w11, lsl #1           // i_low = (i_low << 1) + w1
+    adds        w2,  w2,  #1                    // i_queue += 1, setting the flags tested by b.ge
+    b.ge        cabac_putbyte                   // i_queue >= 0: continue at cabac_putbyte in x264_cabac_encode_decision_asm
+    str         w11, [x0, #CABAC_I_LOW]         // otherwise write back i_low
+    str         w2,  [x0, #CABAC_I_QUEUE]       // ... and i_queue
+    ret
+endfunc
+
+function x264_cabac_encode_terminal_asm, export=1  // x0: x264_cabac_t pointer
+    ldr         w12, [x0, #CABAC_I_RANGE]       // w12 = i_range
+    ldr         w11, [x0, #CABAC_I_LOW]         // w11 = i_low
+    sub         w12, w12, #2                    // i_range -= 2
+    b           cabac_encode_renorm             // continue at cabac_encode_renorm in x264_cabac_encode_decision_asm, which returns to the caller
+endfunc
index dbe682061d08e3a8013a2e46f985157c3e756362..cc27761625fc4dc9ac29ae6a215ad71e4de8f83c 100644 (file)
@@ -72,6 +72,10 @@ void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
 #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
+#elif defined(ARCH_AARCH64)
+#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
+#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
+#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
 #else
 #define x264_cabac_encode_decision x264_cabac_encode_decision_c
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
index c90c0cfcd56640fdc9217b3740435c790c368053..2ac5f0c765dd3f400bd62dd6d0fb3b1dc86345c8 100644 (file)
@@ -2437,6 +2437,8 @@ static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
 DECL_CABAC(c)
 #if HAVE_MMX
 DECL_CABAC(asm)
+#elif defined(ARCH_AARCH64)
+DECL_CABAC(asm)
 #else
 #define run_cabac_decision_asm run_cabac_decision_c
 #define run_cabac_bypass_asm run_cabac_bypass_c