aarch64: NEON asm for decimate_score

author Janne Grunau <janne-x264@jannau.net>

Tue, 12 Aug 2014 15:26:10 +0000 (17:26 +0200)

committer Anton Mitrofanov <BugMaster@narod.ru>

Tue, 16 Dec 2014 17:39:58 +0000 (20:39 +0300)
author Janne Grunau <janne-x264@jannau.net>
Tue, 12 Aug 2014 15:26:10 +0000 (17:26 +0200)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:39:58 +0000 (20:39 +0300)
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S

index 02b71b2a21e919cdb55c3c55db6c6ec65ac35b6a..ed9b3ca8c4b7df16d590810b8323244bcdb10e9f 100644 (file)
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -4,6 +4,7 @@
   * Copyright (C) 2009-2014 x264 project
   *
   * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -300,6 +301,118 @@ dequant_4x4_dc_rshift:
      ret
  endfunc
  
+.macro decimate_score_1x size                          // emits x264_decimate_score<size>_neon; size is 15 or 16 (see instantiations below)
+function x264_decimate_score\size\()_neon, export=1    // in: x0 = int16_t coefficients; out: w0 = score (9 if any |coef| > 1)
+    ld1        {v0.8h,v1.8h}, [x0]                     // load 16 16-bit coefficients
+    movrel      x5,  X(x264_decimate_table4)           // x5 = byte table indexed by zero-run length (movrel presumably yields the symbol address)
+    movi        v3.16b, #0x01                          // every byte = 1, threshold for the |coef| > 1 test
+    sqxtn       v0.8b,  v0.8h                          // saturating narrow to 8 bit: zero stays zero, large values stay > 1
+    sqxtn2      v0.16b, v1.8h                          // v0 byte i = coefficient i
+    abs         v2.16b, v0.16b                         // v2 = |coef| per byte
+    cmeq        v1.16b, v0.16b, #0                     // v1 byte = 0xff where the coefficient is zero
+    cmhi        v2.16b, v2.16b, v3.16b                 // v2 byte = 0xff where |coef| > 1 (unsigned compare)
+    shrn        v1.8b,  v1.8h,  #4                     // squeeze each 0x00/0xff byte to one nibble: 16 bytes -> 64 bits
+    shrn        v2.8b,  v2.8h,  #4                     // same compression for the "> 1" mask
+    fmov        x2,  d2                                // x2 != 0 iff some |coef| > 1
+    fmov        x1,  d1                                // x1 nibble i = 0xf where coefficient i is zero
+    cbnz        x2,  9f                                // any |coef| > 1: return 9
+    mvn         x1,  x1                                // invert: nibble i = 0xf where coefficient i is non-zero
+    mov         w0,  #0                                // score accumulator
+    cbz         x1,  0f                                // all coefficients zero: return 0
+.ifc \size, 15
+    lsr         x1,  x1,  #1                           // one-bit shift makes clz>>2 below count runs from coefficient 1; NOTE(review): presumably coefficient 0 is always zero here - confirm with callers
+.endif
+    rbit        x1,  x1                                // reverse bits so the lowest-index coefficient is at the top for clz
+1:
+    clz         x3,  x1                                // zero bits ahead of the next non-zero coefficient
+    lsr         x6,  x3,  #2                           // 4 bits per coefficient: x6 = number of zero coefficients in the run
+    lsl         x1,  x1,  x3                           // bring that coefficient's nibble to the top
+    ldrb        w7,  [x5, x6]                          // w7 = table entry for this run length
+    cbz         x1,  2f                                // NOTE(review): x1 is non-zero here (shifted by its own clz), so this branch looks never taken
+    lsl         x1,  x1,  #4                           // drop the nibble of the coefficient just scored
+    add         w0,  w0,  w7                           // accumulate score
+    cbnz        x1,  1b                                // more non-zero coefficients remain
+    ret
+2:
+    add         w0,  w0,  w7                           // tail of the cbz path above: add the last table entry
+0:
+    ret
+9:
+    mov         w0,  #9                                // fixed result when some |coef| > 1
+    ret
+endfunc
+.endm
+
+decimate_score_1x 15                                   // defines x264_decimate_score15_neon
+decimate_score_1x 16                                   // defines x264_decimate_score16_neon
+
+const mask64, align=6                                  // per-lane bit selectors loaded by x264_decimate_score64_neon; align=6 presumably means 64-byte alignment (project macro)
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01   // lane k of the low half keeps bit 0x80 >> k
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01   // same pattern for the high half
+endconst
+
+function x264_decimate_score64_neon, export=1          // in: x0 = 64 int16_t coefficients; out: w0 = score (9 if any |coef| > 1)
+    ld1        {v0.8h,v1.8h}, [x0], #32                // coefficients 0..15, post-increment x0
+    ld1        {v2.8h,v3.8h}, [x0], #32                // coefficients 16..31
+    ld1        {v4.8h,v5.8h}, [x0], #32                // coefficients 32..47
+    ld1        {v6.8h,v7.8h}, [x0]                     // coefficients 48..63
+    movrel      x6,  mask64                            // x6 = address of the bit-selector constant (movrel is a project macro)
+    movi        v31.16b, #0x01                         // every byte = 1, threshold for the |coef| > 1 test
+    sqxtn       v16.8b,  v1.8h                         // saturating narrow to 8 bit; halves are deliberately swapped:
+    sqxtn2      v16.16b, v0.8h                         // v16 = [c8..c15 | c0..c7] so the addp reduction yields the bit order clz needs
+    sqxtn       v17.8b,  v3.8h
+    sqxtn2      v17.16b, v2.8h                         // v17 = [c24..c31 | c16..c23]
+    sqxtn       v18.8b,  v5.8h
+    sqxtn2      v18.16b, v4.8h                         // v18 = [c40..c47 | c32..c39]
+    sqxtn       v19.8b,  v7.8h
+    sqxtn2      v19.16b, v6.8h                         // v19 = [c56..c63 | c48..c55]
+    abs         v4.16b, v16.16b                        // |coef| per byte, input to the "> 1" test
+    abs         v5.16b, v17.16b
+    abs         v6.16b, v18.16b
+    abs         v7.16b, v19.16b
+    ld1        {v30.16b}, [x6]                         // v30 = mask64 bytes (0x80 >> (lane % 8))
+    cmeq        v0.16b, v16.16b, #0                    // 0xff where the coefficient is zero
+    cmeq        v1.16b, v17.16b, #0
+    cmeq        v2.16b, v18.16b, #0
+    cmeq        v3.16b, v19.16b, #0
+    umax        v4.16b, v4.16b, v5.16b                 // fold the four |coef| vectors into a per-lane maximum
+    umax        v6.16b, v6.16b, v7.16b
+    and         v0.16b, v0.16b, v30.16b                // keep a single distinct bit per zero coefficient
+    and         v1.16b, v1.16b, v30.16b
+    and         v2.16b, v2.16b, v30.16b
+    and         v3.16b, v3.16b, v30.16b
+    umax        v4.16b, v4.16b, v6.16b                 // v4 = per-lane max of |coef| over all four vectors
+    addp        v0.16b, v1.16b, v0.16b                 // pairwise add acts as OR here because the bits within a pair are disjoint
+    addp        v2.16b, v3.16b, v2.16b
+    cmhi        v4.16b, v4.16b, v31.16b                // 0xff in lanes where some |coef| > 1 (unsigned compare)
+    addp        v0.16b, v2.16b, v0.16b                 // second reduction round: 4 coefficients per byte
+    shrn        v4.8b,  v4.8h,  #4                     // compress the 0x00/0xff lanes into 64 bits
+    addp        v0.16b, v0.16b, v0.16b                 // final round: d0 bit (63 - i) is set where coefficient i is zero
+    fmov        x2,  d4                                // x2 != 0 iff some |coef| > 1
+    fmov        x1,  d0                                // x1 = zero-coefficient bitmask
+    cbnz        x2,  9f                                // any |coef| > 1: return 9
+    mvn         x1,  x1                                // invert: bit (63 - i) set where coefficient i is non-zero
+    mov         w0,  #0                                // score accumulator
+    cbz         x1,  0f                                // all coefficients zero: return 0
+    movrel      x5,  X(x264_decimate_table8)           // x5 = byte table indexed by zero-run length
+1:
+    clz         x3,  x1                                // x3 = zero coefficients ahead of the next non-zero one
+    lsl         x1,  x1,  x3                           // bring that coefficient's bit to the top
+    ldrb        w7,  [x5, x3]                          // w7 = table entry for this run length
+    cbz         x1,  2f                                // NOTE(review): x1 is non-zero here (shifted by its own clz), so this branch looks never taken
+    lsl         x1,  x1,  #1                           // drop the bit of the coefficient just scored
+    add         w0,  w0,  w7                           // accumulate score
+    cbnz        x1,  1b                                // more non-zero coefficients remain
+    ret
+2:
+    add         w0,  w0,  w7                           // tail of the cbz path above: add the last table entry
+0:
+    ret
+9:
+    mov         w0,  #9                                // fixed result when some |coef| > 1
+    ret
+endfunc
+
  // int coeff_last( int16_t *l )
  function x264_coeff_last4_aarch64, export=1
      ldr         x2,  [x0]
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h

index dfcac25538acc6912a542e5996d085b1aa7dba58..5a797c1ac389518fcf070e8f7e00caeef7b03c91 100644 (file)
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -4,6 +4,7 @@
   * Copyright (C) 2005-2014 x264 project
   *
   * Authors: David Conrad <lessen42@gmail.com>
+ *          Janne Grunau <janne-x264@jannau.net>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -38,6 +39,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
  void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
  void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
  
+int x264_decimate_score15_neon( int16_t * ); // NEON asm from common/aarch64/quant-a.S (decimate_score_1x 15)
+int x264_decimate_score16_neon( int16_t * ); // NEON asm from common/aarch64/quant-a.S (decimate_score_1x 16)
+int x264_decimate_score64_neon( int16_t * ); // NEON asm from common/aarch64/quant-a.S; reads 64 coefficients
+
  int x264_coeff_last4_aarch64( int16_t * );
  int x264_coeff_last8_aarch64( int16_t * );
  int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c

index 31d8901dcba6d378f6ff5c4102cd8751249a194f..d1b89c089b9d0e1f1d588c23f41c899b7c758c5b 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -714,7 +714,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  #endif // HAVE_MMX
  
  #if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC ) {
+    if( cpu&X264_CPU_ALTIVEC )
+    {
          pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
          pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
          pf->quant_4x4 = x264_quant_4x4_altivec;
@@ -754,6 +755,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->coeff_last4 = x264_coeff_last4_aarch64;
          pf->coeff_last8 = x264_coeff_last8_aarch64;
      }
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
+    }
  #endif
  #endif // HIGH_BIT_DEPTH
      pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
author	Janne Grunau <janne-x264@jannau.net>
	Tue, 12 Aug 2014 15:26:10 +0000 (17:26 +0200)
committer	Anton Mitrofanov <BugMaster@narod.ru>
	Tue, 16 Dec 2014 17:39:58 +0000 (20:39 +0300)
common/aarch64/quant-a.S		patch \| blob \| history
common/aarch64/quant.h		patch \| blob \| history
common/quant.c		patch \| blob \| history