diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index ed9b3ca8c4b7df16d590810b8323244bcdb10e9f..46b971ee252c60084941eb9606c124cf11f59113 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -1,10 +1,11 @@
 /****************************************************************************
  * quant.S: arm quantization and level-run
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2016 x264 project
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *          Janne Grunau <janne-x264@jannau.net>
+ *          Martin Storsjo <martin@martin.st>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -328,17 +329,13 @@ function x264_decimate_score\size\()_neon, export=1
     lsr         x6,  x3,  #2
     lsl         x1,  x1,  x3
     ldrb        w7,  [x5, x6]
-    cbz         x1,  2f
     lsl         x1,  x1,  #4
     add         w0,  w0,  w7
     cbnz        x1,  1b
     ret
-2:
-    add         w0,  w0,  w7
-0:
-    ret
 9:
     mov         w0,  #9
+0:
     ret
 endfunc
 .endm
@@ -399,17 +396,13 @@ function x264_decimate_score64_neon, export=1
     clz         x3,  x1
     lsl         x1,  x1,  x3
     ldrb        w7,  [x5, x3]
-    cbz         x1,  2f
     lsl         x1,  x1,  #1
     add         w0,  w0,  w7
     cbnz        x1,  1b
     ret
-2:
-    add         w0,  w0,  w7
-0:
-    ret
 9:
     mov         w0,  #9
+0:
     ret
 endfunc
 
@@ -497,3 +490,103 @@ function x264_coeff_last64_neon, export=1
     sub         w0,  w3,  w2
     ret
 endfunc
+
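+// Shared setup for the coeff_level_run functions (x0 = dct, x1 = runlevel).
+// The pointer arithmetic below assumes the x264_run_level_t layout: 32-bit
+// "last" and "mask" fields followed by a 16-byte-aligned level[] array.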
+.macro coeff_level_run_start size
+    add         x6,  x1,  #23            // 23 = 8 (skip last, mask) + 15
+    mov         w7,  #0                  // running total (return value)
+    mov         w8,  #0                  // coefficient bitmask
+    mov         w9,  #1
+    and         x6,  x6,  #~15           // x6 = &runlevel->level, 16-byte aligned
+    mov         w4,  #\size - 1          // start at the highest coefficient index
+.endm
+
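+// Core scan loop. x2 holds one lane of (1 << \shift) bits per coefficient,
+// with nonzero coefficients flagged; clz plus "lsr #\shift" converts the
+// position of the top flagged lane into a coefficient index, so the scan
+// walks from the last nonzero coefficient down to the first and skips
+// zero runs in a single shift.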
+.macro coeff_level_run shift
+    clz         x3,  x2                  // bits above the top flagged lane
+    subs        w4,  w4,  w3, lsr #\shift // index of the last nonzero coeff
+    str         w4,  [x1], #4            // runlevel->last; x1 -> &runlevel->mask
+1:
+    ldrh        w5,  [x0, x4, lsl #1]    // w5 = dct[w4]
+    strh        w5,  [x6], #2            // append to runlevel->level
+    add         w7,  w7,  #1             // total++
+    lsl         w10, w9, w4
+    orr         w8,  w8,  w10            // mask |= 1 << w4
+    b.le        2f                       // coefficient 0 emitted: done
+    add         w3,  w3,  #1 << \shift
+    sub         w4,  w4,  #1
+    and         x3,  x3,  #~((1 << \shift) - 1) // round down to a lane boundary
+    lsl         x2,  x2,  x3             // shift out the lane just consumed
+    clz         x3,  x2                  // zeros above the next flagged lane
+    subs        w4,  w4,  w3, lsr #\shift // skip over the zero coefficients
+    b.ge        1b
+2:
+    str         w8,  [x1]
+    mov         w0,  w7
+.endm
+
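+// 4-coefficient case: the whole block (4 x 16 bits) fits in one GPR, so
+// the dct itself serves as the lane bitmap (16 bits per lane) and no NEON
+// setup is needed; hence the _aarch64 rather than _neon suffix.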
+function x264_coeff_level_run4_aarch64, export=1
+    ldr         x2,  [x0]
+
+    coeff_level_run_start 4
+
+    coeff_level_run 4
+
+    ret
+endfunc
+
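+// NEON versions: build the lane bitmap by saturating-narrowing the 16-bit
+// levels to bytes and marking nonzero lanes with cmtst; for the 15- and
+// 16-coefficient blocks the 16-byte mask is packed to 4 bits per lane with
+// shrn so it still fits in x2. The lane shift passed to coeff_level_run is
+// (4 - (\size + 1) / 8): 8-bit lanes for size 8, 4-bit lanes for 15 and 16.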
+.macro X264_COEFF_LEVEL_RUN size
+function x264_coeff_level_run\size\()_neon, export=1
+.if \size == 15
+    sub         x0,  x0,  #2             // read one coeff early so 16 lanes cover the block
+.endif
+.if \size < 15
+    ld1         {v0.8h}, [x0]
+    uqxtn       v0.8b,  v0.8h            // narrow levels to bytes (saturating)
+    cmtst       v0.8b,  v0.8b,  v0.8b    // nonzero lanes -> 0xff
+.else
+    ld1         {v0.8h,v1.8h}, [x0]
+    uqxtn       v0.8b,  v0.8h
+    uqxtn2      v0.16b, v1.8h
+    cmtst       v0.16b, v0.16b, v0.16b   // nonzero lanes -> 0xff
+    shrn        v0.8b,  v0.8h,  #4       // pack the mask to 4 bits per lane
+.endif
+    fmov        x2,  d0                  // move the lane bitmap to a GPR
+.if \size == 15
+    add         x0,  x0,  #2             // restore the dct pointer
+.endif
+
+    coeff_level_run_start \size
+
+    coeff_level_run (4 - (\size + 1) / 8)
+
+    ret
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN 8
+X264_COEFF_LEVEL_RUN 15
+X264_COEFF_LEVEL_RUN 16
+
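+// denoise_dct: per coefficient, sum[i] += abs(dct[i]) and
+// dct[i] = max(abs(dct[i]) - offset[i], 0) with the original sign
+// restored, matching the scalar C reference.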
+function x264_denoise_dct_neon, export=1
+1:  subs        w3,  w3,  #16            // 16 coefficients per iteration
+    ld1         {v0.8h,v1.8h}, [x0]      // dct
+    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] // sum
+    abs         v16.8h,  v0.8h
+    abs         v17.8h,  v1.8h
+    ld1         {v2.8h,v3.8h}, [x2], #32 // offset
+    cmlt        v18.8h,  v0.8h,   #0     // sign masks
+    cmlt        v19.8h,  v1.8h,   #0
+    uaddw       v4.4s,   v4.4s,   v16.4h // sum += abs(dct)
+    uaddw2      v5.4s,   v5.4s,   v16.8h
+    uqsub       v20.8h,  v16.8h,  v2.8h  // abs(dct) - offset, clamped at 0
+    uqsub       v21.8h,  v17.8h,  v3.8h
+    uaddw       v6.4s,   v6.4s,   v17.4h
+    uaddw2      v7.4s,   v7.4s,   v17.8h
+    neg         v22.8h,  v20.8h
+    neg         v23.8h,  v21.8h
+    bsl         v18.16b, v22.16b, v20.16b // select negated value where dct < 0
+    bsl         v19.16b, v23.16b, v21.16b
+    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
+    st1         {v18.8h,v19.8h}, [x0], #32
+    b.gt        1b
+    ret
+endfunc