/****************************************************************************
* quant.S: arm quantization and level-run
*****************************************************************************
- * Copyright (C) 2009-2013 x264 project
+ * Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
+ * Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#include "asm.S"
-.fpu neon
-
.section .rodata
.align 4
pmovmskb_byte:
vsub.s16 d3, d3, d0
vst1.64 {d3}, [r0,:64]
QUANT_END d3
-.endfunc
+endfunc
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4x4_neon
orrne r0, #8
vpop {d8-d15}
bx lr
-.endfunc
+endfunc
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon
.endr
vorr d0, d0, d1
QUANT_END d0
-.endfunc
+endfunc
.macro DEQUANT_START mf_size offset dc=no
mov r3, #0x2b
bgt dequant_\size\()_rshift_loop
.endif
bx lr
-.endfunc
+endfunc
.endm
DEQUANT 4x4, 4
vmovn.s32 d3, q13
vst1.16 {d0-d3}, [r0,:128]
bx lr
-.endfunc
+endfunc
// int coeff_last( int16_t *l )
lsrs r2, r2, #16
addne r0, r0, #1
bx lr
-.endfunc
+endfunc
+
+function x264_coeff_last8_arm // r0 = pointer to 16 bytes read as 8 halfwords; returns in r0 the index of the last nonzero halfword, or 0 if all are zero (index order assumes little-endian -- TODO confirm)
+ ldrd r2, r3, [r0, #8] // r2 = word at r0+8, r3 = word at r0+12 (halfwords 4..7)
+ orrs ip, r2, r3 // Z set when that upper half is entirely zero; ip is only a scratch destination
+ movne r0, #4 // upper half has a nonzero: base index 4 (the pointer in r0 is no longer needed)
+ ldrdeq r2, r3, [r0] // otherwise reload from the lower half; r0 still holds the pointer on this path
+ moveq r0, #0 // ... and start from base index 0
+ tst r3, r3 // is the higher-addressed word of the selected pair nonzero?
+ addne r0, #2 // yes: the answer lies in that word's two halfwords
+ movne r2, r3 // r2 = the word that holds the last nonzero halfword (or zero)
+ lsrs r2, r2, #16 // Z clear when the upper 16 bits of that word are nonzero
+ addne r0, r0, #1 // upper halfword nonzero: it is the last one
+ bx lr // return with the index in r0
+endfunc
.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon
.if \size == 15
sub r0, r0, #2
- vld1.64 {d0-d3}, [r0]
-.else
- vld1.64 {d0-d3}, [r0,:128]
.endif
+ vld1.64 {d0-d3}, [r0,:128]
vtst.16 q0, q0
vtst.16 q1, q1
vshrn.u16 d0, q0, #8
subslt r0, r3, r0, lsr #2
movlt r0, #0
bx lr
-.endfunc
+endfunc
.endm
COEFF_LAST_1x 15
subslt r0, ip, r0
movlt r0, #0
bx lr
-.endfunc
+endfunc
+
+function x264_denoise_dct_neon // per 16 coefficients: accumulate |coef| into 32-bit sums at r1, then shrink |coef| by the u16 value at r2 (saturating at 0) and write it back signed to r0; r3 = coefficient count. NOTE(review): register roles inferred from usage -- confirm against the C prototype
+1: subs r3, r3, #16 // 16 coefficients per iteration; these flags feed the bgt at the bottom (nothing in between sets flags). Body always runs at least once
+ vld1.16 {q0, q1}, [r0] // q0,q1 = 16 signed 16-bit coefficients; no writeback, r0 is reused for the store
+ vld1.32 {q12, q13}, [r1]! // first 8 of the 32-bit sums
+ vld1.32 {q14, q15}, [r1] // next 8 of the 32-bit sums
+ sub r1, #32 // rewind r1 to the start of this group of 16 sums for the stores below
+ vabs.s16 q8, q0 // q8 = |coef| for lanes 0..7
+ vabs.s16 q9, q1 // q9 = |coef| for lanes 8..15
+ vld1.16 {q2, q3}, [r2]! // 16 unsigned 16-bit amounts to subtract; r2 advances
+ vclt.s16 q10, q0, #0 // q10 = all-ones mask in lanes where the original coefficient was negative
+ vclt.s16 q11, q1, #0 // same mask for lanes 8..15
+ vaddw.u16 q12, q12, d16 // sums[0..3] += |coef[0..3]| (d16 = low half of q8)
+ vaddw.u16 q13, q13, d17 // sums[4..7] += |coef[4..7]| (d17 = high half of q8)
+ vqsub.u16 q0, q8, q2 // |coef| - amount, unsigned saturating so it clamps at 0
+ vqsub.u16 q1, q9, q3 // same for lanes 8..15
+ vaddw.u16 q14, q14, d18 // sums[8..11] += |coef[8..11]| (d18 = low half of q9)
+ vaddw.u16 q15, q15, d19 // sums[12..15] += |coef[12..15]| (d19 = high half of q9)
+ vneg.s16 q8, q0 // negated copy of the reduced magnitudes
+ vneg.s16 q9, q1 // same for lanes 8..15
+ vbsl q10, q8, q0 // q10 = negated value where the mask is set, else the positive value: sign restored
+ vbsl q11, q9, q1 // same for lanes 8..15
+ vst1.32 {q12, q13}, [r1]! // write back updated sums 0..7
+ vst1.32 {q14, q15}, [r1]! // write back updated sums 8..15; r1 now points at the next group
+ vst1.16 {q10, q11}, [r0]! // store the 16 reduced coefficients in place and advance r0
+ bgt 1b // loop while the remaining count (from the subs above) is still > 0
+ bx lr // return; no value is placed in r0 by this routine
+endfunc