/****************************************************************************
* quant.S: arm quantization and level-run
*****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2016 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
+ * Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
lsr x6, x3, #2
lsl x1, x1, x3
ldrb w7, [x5, x6]
- cbz x1, 2f
lsl x1, x1, #4
add w0, w0, w7
cbnz x1, 1b
ret
-2:
- add w0, w0, w7
-0:
- ret
9:
mov w0, #9
+0:
ret
endfunc
.endm
clz x3, x1
lsl x1, x1, x3
ldrb w7, [x5, x3]
- cbz x1, 2f
lsl x1, x1, #1
add w0, w0, w7
cbnz x1, 1b
ret
-2:
- add w0, w0, w7
-0:
- ret
9:
mov w0, #9
+0:
ret
endfunc
sub w0, w3, w2
ret
endfunc
+
+.macro coeff_level_run_start size // register setup consumed by coeff_level_run, \size = number of coefficients
+ add x6, x1, #23 // x6 = x1 + 23, aligned down below. NOTE(review): this was labelled runlevel->mask, but coeff_level_run stores coefficient values through x6 and writes the mask through x1 - confirm against the struct layout
+ mov w7, #0 // w7 = running count of coefficients stored (returned in w0 by coeff_level_run)
+ mov w8, #0 // w8 = accumulated bitmask of stored coefficient indices
+ mov w9, #1 // w9 = constant 1, shifted left by the index to form each mask bit
+ and x6, x6, #~15 // together with the add above: x6 = (x1 + 8) rounded up to a 16-byte boundary
+ mov w4, #\size - 1 // w4 = highest coefficient index, the scan starts from there
+.endm
+
+.macro coeff_level_run shift // scan x2 from its top bit down, one field of (1 << \shift) bits per coefficient. In: x0 = 16-bit coefficients, x1 = output pointer, x2 = nonzero mask, w4/x6/w7/w8/w9 as set by coeff_level_run_start. Out: w0 = number of coefficients stored
+ clz x3, x2 // x3 = number of zero bits above the highest set bit of the mask
+ subs w4, w4, w3, lsr #\shift // w4 = index of the highest nonzero coefficient, the flags set here are tested by b.le below. NOTE(review): an all-zero x2 makes w4 negative yet one load is still done - presumably callers guarantee a nonzero coefficient, confirm
+ str w4, [x1], #4 // write that index to [x1], then advance x1 by 4
+1: // one iteration per nonzero coefficient, w4 = its index
+ ldrh w5, [x0, x4, lsl #1] // w5 = 16-bit coefficient at index w4
+ strh w5, [x6], #2 // append it to the output array at x6
+ add w7, w7, #1 // one more coefficient stored
+ lsl w10, w9, w4 // w10 = 1 << index
+ orr w8, w8, w10 // record the index in the bitmask
+ b.le 2f // uses the flags of the most recent subs (nothing in between sets flags): stop once the index is <= 0
+ add w3, w3, #1 << \shift // shift amount = leading zero bits plus one whole field
+ sub w4, w4, #1 // move to the index just below the coefficient handled
+ and x3, x3, #~((1 << \shift) - 1) // round the shift amount down to a whole number of fields
+ lsl x2, x2, x3 // discard the handled field and everything above it
+ clz x3, x2 // zero bits before the next nonzero field (64 when none is left)
+ subs w4, w4, w3, lsr #\shift // w4 = index of the next nonzero coefficient, a negative result ends the loop
+ b.ge 1b
+2:
+ str w8, [x1] // write the index bitmask to the word after the first index store
+ mov w0, w7 // result = number of coefficients stored
+.endm
+
+function x264_coeff_level_run4_aarch64, export=1 // 4-coefficient level/run scan without NEON. In: x0 = coefficients (16-bit), x1 = output pointer. Out: w0 = count of stored coefficients
+ ldr x2, [x0] // load four 16-bit coefficients as one 64-bit value, used directly as the nonzero mask (16 bits per field)
+
+ coeff_level_run_start 4 // w4 = 3, counters cleared, x6 = aligned output array
+
+ coeff_level_run 4 // shift 4 = 16-bit fields in x2
+
+ ret
+endfunc
+
+.macro X264_COEFF_LEVEL_RUN size // emits x264_coeff_level_run<size>_neon. In: x0 = coefficients (16-bit), x1 = output pointer. Out: w0 = count of stored coefficients
+function x264_coeff_level_run\size\()_neon, export=1
+.if \size == 15
+ sub x0, x0, #2 // start the 16-halfword load one element before x0 (undone after the mask is built). NOTE(review): presumably that element is always readable - confirm with callers
+.endif
+.if \size < 15
+ ld1 {v0.8h}, [x0] // load 8 coefficients
+ uqxtn v0.8b, v0.8h // saturating narrow to bytes
+ cmtst v0.8b, v0.8b, v0.8b // each byte becomes 0xff if nonzero, else 0: 8-bit fields in the low 64 bits
+.else
+ ld1 {v0.8h,v1.8h}, [x0] // load 16 coefficients
+ uqxtn v0.8b, v0.8h // saturating narrow of the first 8 into the low half
+ uqxtn2 v0.16b, v1.8h // saturating narrow of the last 8 into the high half
+ cmtst v0.16b, v0.16b, v0.16b // each byte becomes 0xff if nonzero, else 0
+ shrn v0.8b, v0.8h, #4 // squeeze the 16 byte flags into 16 nibbles so the mask fits in 64 bits
+.endif
+ fmov x2, d0 // x2 = 64-bit nonzero mask
+.if \size == 15
+ add x0, x0, #2 // restore the caller's coefficient pointer
+.endif
+
+ coeff_level_run_start \size // w4 = \size - 1, counters cleared, x6 = aligned output array
+
+ coeff_level_run (4 - (\size + 1) / 8) // shift = 3 (8-bit fields) for size 8, 2 (4-bit fields) for sizes 15 and 16
+
+ ret
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN 8 // defines x264_coeff_level_run8_neon
+X264_COEFF_LEVEL_RUN 15 // defines x264_coeff_level_run15_neon
+X264_COEFF_LEVEL_RUN 16 // defines x264_coeff_level_run16_neon
+
+function x264_denoise_dct_neon, export=1 // In: x0 = 16-bit coefficients (updated in place), x1 = 32-bit sums (updated in place), x2 = 16-bit offsets, w3 = coefficient count. 16 coefficients per iteration, at least one iteration always runs
+1: subs w3, w3, #16 // count down by 16, the flags are tested by b.gt at the bottom of the loop
+ ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] // their 16 32-bit sums
+ abs v16.8h, v0.8h // magnitudes, first 8
+ abs v17.8h, v1.8h // magnitudes, last 8
+ ld1 {v2.8h,v3.8h}, [x2], #32 // 16 offsets, advance x2
+ cmlt v18.8h, v0.8h, #0 // all-ones lanes where the coefficient was negative
+ cmlt v19.8h, v1.8h, #0
+ uaddw v4.4s, v4.4s, v16.4h // sums += magnitudes, widened to 32 bits
+ uaddw2 v5.4s, v5.4s, v16.8h
+ uqsub v20.8h, v16.8h, v2.8h // magnitude - offset, saturating at 0
+ uqsub v21.8h, v17.8h, v3.8h
+ uaddw v6.4s, v6.4s, v17.4h
+ uaddw2 v7.4s, v7.4s, v17.8h
+ neg v22.8h, v20.8h // negated reduced magnitudes
+ neg v23.8h, v21.8h
+ bsl v18.16b, v22.16b, v20.16b // pick the negated value where the input was negative, else the positive one
+ bsl v19.16b, v23.16b, v21.16b
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64 // write back the sums, advance x1
+ st1 {v18.8h,v19.8h}, [x0], #32 // write back the denoised coefficients, advance x0
+ b.gt 1b // loop while the remaining count is > 0 (none of the vector instructions above change the flags)
+ ret
+endfunc