-/*****************************************************************************
- * quant.S: h264 encoder
+/*****************************************************************************
+ * quant.S: arm quantization and level-run
*****************************************************************************
- * Copyright (C) 2009 x264 project
+ * Copyright (C) 2009-2016 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
+ * Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
-.fpu neon
-
.section .rodata
.align 4
pmovmskb_byte:
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
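+
+// weights for packing compare masks into scalar bitmasks: vand these with a
+// 0x00/0xFF per-lane compare result, then fold with vpadd; mask_2bit yields
+// 2 bits per coefficient, mask_1bit one bit per coefficient, MSB first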
+mask_2bit:
+.byte 3,12,48,192,3,12,48,192
+.byte 3,12,48,192,3,12,48,192
+
+mask_1bit:
+.byte 128,64,32,16,8,4,2,1
+.byte 128,64,32,16,8,4,2,1
+
.text
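+// QUANT_TWO quantizes 16 coefficients: q8/q9 hold abs(coef), the bias is
+// added, the multiply by mf keeps only the high 16 bits, and the sign masks
+// derived from the original coefficients (q14/q15) are reapplied with the
+// (x ^ m) - m conditional negate. \mask collects the quantized levels so
+// callers can test for nonzero. Per coefficient this is, roughly, in C:
+//     int v = (abs(dct[i]) + bias[i]) * mf[i] >> 16;
+//     dct[i] = dct[i] < 0 ? -v : v;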
-.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
vadd.u16 q8, q8, \bias0
vadd.u16 q9, q9, \bias1
.ifc \load_mf, yes
veor q9, q9, q15
vsub.s16 q8, q8, q14
vsub.s16 q9, q9, q15
- vorr \bias0, q8, q9
+ vorr \mask, q8, q9
vst1.64 {d16-d19}, [r0,:128]!
.endm
bx lr
.endm
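+// QUANT_END, used below, moves the folded nonzero flags of the given
+// d-register into r0 and returns 0 or 1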
-// quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
-function x264_quant_2x2_dc_neon, export=1
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function x264_quant_2x2_dc_neon
vld1.64 {d0}, [r0,:64]
vabs.s16 d3, d0
vdup.16 d2, r2
vsub.s16 d3, d3, d0
vst1.64 {d3}, [r0,:64]
QUANT_END d3
-.endfunc
+endfunc
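+// the _dc variants broadcast one mf/bias pair to every lane (vdup) instead
+// of loading per-coefficient tables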
-// quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
-function x264_quant_4x4_dc_neon, export=1
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function x264_quant_4x4_dc_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vdup.16 q0, r2
vdup.16 q2, r1
- QUANT_TWO q0, q0, d4, d5, d4, d5
+ QUANT_TWO q0, q0, d4, d5, d4, d5, q0
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
-// quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon, export=1
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]
vld1.64 {d4-d7}, [r1,:128]
- QUANT_TWO q0, q1, d4, d5, d6, d7
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q0
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
-// quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon, export=1
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon
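+// quantizes four consecutive 4x4 blocks against a single mf/bias table and
+// returns a 4-bit mask: bit b of r0 is set if block b has a nonzero level;
+// q4-q7 (hence the vpush of d8-d15) hold each block's nonzero flags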
+ vpush {d8-d15}
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ vld1.64 {d0-d3}, [r2,:128]
+ vld1.64 {d4-d7}, [r1,:128]
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q4
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q5
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q6
+ vld1.64 {d28-d31}, [r0,:128]
+ vabs.s16 q8, q14
+ vabs.s16 q9, q15
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q7
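+ // fold each block's flags to a scalar and set bits 0-3 of the return value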
+ vorr d8, d8, d9
+ vorr d10, d10, d11
+ vorr d12, d12, d13
+ vorr d14, d14, d15
+ vmov r0, r1, d8
+ vmov r2, r3, d10
+ orrs r0, r1
+ movne r0, #1
+ orrs r2, r3
+ orrne r0, #2
+ vmov r1, r2, d12
+ vmov r3, ip, d14
+ orrs r1, r2
+ orrne r0, #4
+ orrs r3, ip
+ orrne r0, #8
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon
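+// 64 coefficients in four passes of 16; the later passes reload bias into
+// q1-q2 here and mf into d4-d7 inside QUANT_TWO (load_mf=yes), reusing q2
+// once the bias add is done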
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]!
vld1.64 {d4-d7}, [r1,:128]!
- QUANT_TWO q0, q1, d4, d5, d6, d7
+ QUANT_TWO q0, q1, d4, d5, d6, d7, q0
.rept 3
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d2-d5}, [r2,:128]!
- QUANT_TWO q1, q2, d4, d5, d6, d7, yes
+ QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
vorr q0, q0, q1
.endr
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
.macro DEQUANT_START mf_size offset dc=no
mov r3, #0x2b
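+ // 0x2b == 43: (i_qp * 43) >> 8 == i_qp / 6 over the valid QP range, so r3
+ // becomes i_qbits without a division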
subs r3, r3, #\offset // i_qbits - \offset (4 for 4x4, 6 for 8x8); negative selects the right-shift path
.endm
-// dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
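+// i_qbits = i_qp/6 - \bits may be negative, in which case a rounded right
+// shift replaces the left shift (the _rshift paths). Roughly, in C
+// (reference sketch, names illustrative):
+//     int qbits = qp/6 - bits;
+//     if( qbits >= 0 )
+//         dct[i] = (dct[i] * dequant_mf[qp%6][i]) << qbits;
+//     else
+//         dct[i] = (dct[i] * dequant_mf[qp%6][i] + (1 << (-qbits-1))) >> -qbits;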
.macro DEQUANT size bits
-function x264_dequant_\size\()_neon, export=1
+function x264_dequant_\size\()_neon
DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
mov r2, #4
bgt dequant_\size\()_rshift_loop
.endif
bx lr
-.endfunc
+endfunc
.endm
DEQUANT 4x4, 4
DEQUANT 8x8, 6
-// dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-function x264_dequant_4x4_dc_neon, export=1
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+function x264_dequant_4x4_dc_neon
DEQUANT_START 6, 6, yes
blt dequant_4x4_dc_rshift
vmovn.s32 d3, q13
vst1.16 {d0-d3}, [r0,:128]
bx lr
-.endfunc
+endfunc
+
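+// decimate_score: returns 9 as soon as any |level| > 1; otherwise sums
+// x264_decimate_table4[run] over the nonzero levels, where run is the
+// number of zeros directly below each level. Roughly, in C (reference
+// sketch of the behaviour, not x264's C source):
+//     int score = 0, i = n - 1;
+//     while( i >= 0 && dct[i] == 0 ) i--;
+//     while( i >= 0 ) {
+//         if( abs(dct[i]) > 1 ) return 9;
+//         int run = 0;
+//         while( --i >= 0 && dct[i] == 0 ) run++;
+//         score += x264_decimate_table4[run];
+//     }
+//     return score;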
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon
+ vld1.16 {q0, q1}, [r0, :128]
+ movrel r3, mask_2bit
+ vmov.s8 q3, #0x01
+ vqmovn.s16 d0, q0
+ vqmovn.s16 d1, q1
+ vqabs.s8 q2, q0
+ vld1.8 {q8}, [r3, :128]
+ vceq.s8 q1, q0, #0
+ vcgt.s8 q2, q2, q3
+ vand.u8 q1, q1, q8
+ vshrn.u16 d4, q2, #4
+ vpadd.u8 d2, d2, d3
+ vpadd.u8 d4, d4, d4
+ vpadd.u8 d2, d2, d2
+ vmov.32 r2, d4[0]
+ vmov.32 r1, d2[0]
+ cmp r2, #0
+ beq 0f
+ mov r0, #9
+ bx lr
+0:
+ mvns r1, r1
+ mov r0, #0
+ bxeq lr
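+ // the 15-coefficient variant ignores dct[0]: shift its two mask bits out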
+.ifc \size, 15
+ lsr r1, r1, #2
+.endif
+ rbit r1, r1
+ movrelx r3, X(x264_decimate_table4), r2
+1:
+ clz r2, r1
+ lsl r1, r1, r2
+ lsr r12, r2, #1
+ ldrb r2, [r3, r12]
+ lsls r1, r1, #2
+ add r0, r0, r2
+ bne 1b
+ bx lr
+endfunc
+.endm
+decimate_score_1x 15
+decimate_score_1x 16
+
+function x264_decimate_score64_neon
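+// same idea with 64 coefficients and x264_decimate_table8: one mask bit per
+// coefficient (mask_1bit, vqmovn halves swapped so clz can walk the runs),
+// scanned one 32-bit word at a time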
+ push {lr}
+ vld1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r0, :128]
+ movrel r3, mask_1bit
+ vmov.s8 q3, #0x01
+ vqmovn.s16 d17, q8
+ vqmovn.s16 d16, q9
+ vqmovn.s16 d19, q10
+ vqmovn.s16 d18, q11
+ vqmovn.s16 d21, q12
+ vqmovn.s16 d20, q13
+ vqmovn.s16 d23, q14
+ vqmovn.s16 d22, q15
+ vqabs.s8 q12, q8
+ vqabs.s8 q13, q9
+ vqabs.s8 q14, q10
+ vqabs.s8 q15, q11
+ vld1.8 {q2}, [r3, :128]
+ vceq.s8 q8, q8, #0
+ vceq.s8 q9, q9, #0
+ vceq.s8 q10, q10, #0
+ vceq.s8 q11, q11, #0
+ vmax.s8 q12, q12, q13
+ vmax.s8 q14, q14, q15
+ vand.u8 q8, q8, q2
+ vand.u8 q9, q9, q2
+ vand.u8 q10, q10, q2
+ vand.u8 q11, q11, q2
+ vmax.s8 q12, q12, q14
+ vpadd.u8 d18, d18, d19
+ vpadd.u8 d19, d16, d17
+ vcgt.s8 q12, q12, q3
+ vpadd.u8 d22, d22, d23
+ vpadd.u8 d23, d20, d21
+ vshrn.u16 d24, q12, #4
+ vpadd.u8 d16, d22, d23
+ vpadd.u8 d17, d18, d19
+ vpadd.u8 d24, d24, d24
+ vpadd.u8 d16, d16, d17
+ vmov.32 r2, d24[0]
+ vmov r12, r1, d16
+ cmp r2, #0
+ beq 0f
+ mov r0, #9
+ pop {pc}
+0:
+ mvns r1, r1
+ mvn r12, r12
+ mov r0, #0
+ mov lr, #32
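+ // lr tracks the unconsumed bits of the first word so a zero run can carry
+ // over into the second word (label 2 adds it to the next clz)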
+ movrelx r3, X(x264_decimate_table8), r2
+ beq 2f
+1:
+ clz r2, r1
+ lsl r1, r1, r2
+ sub lr, lr, r2
+ ldrb r2, [r3, r2]
+ lsls r1, r1, #1
+ sub lr, lr, #1
+ add r0, r0, r2
+ bne 1b
+2:
+ cmp r12, #0
+ popeq {pc}
+
+ clz r2, r12
+ lsl r1, r12, r2
+ add r2, r2, lr
+ ldrb r2, [r3, r2]
+ lsls r1, r1, #1
+ add r0, r0, r2
+ popeq {pc}
+3:
+ clz r2, r1
+ lsl r1, r1, r2
+ ldrb r2, [r3, r2]
+ lsls r1, r1, #1
+ add r0, r0, r2
+ bne 3b
+ pop {pc}
+endfunc
// int coeff_last( int16_t *l )
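+// returns the index of the last nonzero coefficient (0 if all are zero);
+// roughly, in C:
+//     int i = n - 1;
+//     while( i > 0 && l[i] == 0 ) i--;
+//     return i;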
-function x264_coeff_last4_arm, export=1
- ldrd r2, [r0]
+function x264_coeff_last4_arm
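+ // four coefficients: r2 = coefficients 0-1, r3 = coefficients 2-3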
+ ldrd r2, r3, [r0]
subs r0, r3, #0
movne r0, #2
movne r2, r3
lsrs r2, r2, #16
addne r0, r0, #1
bx lr
-.endfunc
+endfunc
+
+function x264_coeff_last8_arm
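+ // check coefficients 4-7 first; fall back to 0-3 only if those are all zero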
+ ldrd r2, r3, [r0, #8]
+ orrs ip, r2, r3
+ movne r0, #4
+ ldrdeq r2, r3, [r0]
+ moveq r0, #0
+ tst r3, r3
+ addne r0, #2
+ movne r2, r3
+ lsrs r2, r2, #16
+ addne r0, r0, #1
+ bx lr
+endfunc
.macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon, export=1
+function x264_coeff_last\size\()_neon
.if \size == 15
sub r0, r0, #2
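+ // the 15-coefficient variant is called on dct+1; back up to the aligned
+ // base so one :128 load serves both sizes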
- vld1.64 {d0-d3}, [r0]
-.else
- vld1.64 {d0-d3}, [r0,:128]
.endif
+ vld1.64 {d0-d3}, [r0,:128]
vtst.16 q0, q0
vtst.16 q1, q1
vshrn.u16 d0, q0, #8
subs r1, ip, r1, lsr #2
addge r0, r1, #\size - 8
- sublts r0, r3, r0, lsr #2
+ subslt r0, r3, r0, lsr #2
movlt r0, #0
bx lr
-.endfunc
+endfunc
.endm
COEFF_LAST_1x 15
COEFF_LAST_1x 16
-function x264_coeff_last64_neon, export=1
+function x264_coeff_last64_neon
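+ // pack the nonzero tests down to one byte per coefficient; the last set
+ // position is then found with the same count-leading-zeros math as
+ // coeff_last16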
vld1.64 {d16-d19}, [r0,:128]!
vqmovn.u16 d16, q8
vqmovn.u16 d17, q9
subs r1, ip, r1
addge r0, r1, #32
- sublts r0, ip, r0
+ subslt r0, ip, r0
movlt r0, #0
bx lr
-.endfunc
+endfunc
+
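+// denoise_dct( dct, sum, offset, size ): accumulate |dct[i]| into sum[i],
+// then pull each coefficient toward zero by offset[i], clamping at zero.
+// Roughly, in C:
+//     for( int i = 0; i < size; i++ ) {
+//         int level = abs(dct[i]);
+//         sum[i] += level;
+//         level -= offset[i];
+//         if( level < 0 ) level = 0;
+//         dct[i] = dct[i] < 0 ? -level : level;
+//     }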
+function x264_denoise_dct_neon
+1: subs r3, r3, #16
+ vld1.16 {q0, q1}, [r0]
+ vld1.32 {q12, q13}, [r1]!
+ vld1.32 {q14, q15}, [r1]
+ sub r1, #32
+ vabs.s16 q8, q0
+ vabs.s16 q9, q1
+ vld1.16 {q2, q3}, [r2]!
+ vclt.s16 q10, q0, #0
+ vclt.s16 q11, q1, #0
+ vaddw.u16 q12, q12, d16
+ vaddw.u16 q13, q13, d17
+ vqsub.u16 q0, q8, q2
+ vqsub.u16 q1, q9, q3
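+ // saturating subtract clamps abs(coef) - offset at zero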
+ vaddw.u16 q14, q14, d18
+ vaddw.u16 q15, q15, d19
+ vneg.s16 q8, q0
+ vneg.s16 q9, q1
+ vbsl q10, q8, q0
+ vbsl q11, q9, q1
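+ // reinstate the sign: select the negated value where coef was negative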
+ vst1.32 {q12, q13}, [r1]!
+ vst1.32 {q14, q15}, [r1]!
+ vst1.16 {q10, q11}, [r0]!
+ bgt 1b
+ bx lr
+endfunc