1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Per-byte power-of-two masks (bit 0..7), repeated for 16 lanes.
// NOTE(review): presumably the pmovmskb_byte table loaded with movrel in
// x264_coeff_last64_neon below — its label line is not visible in this
// excerpt; confirm against the full file.
32 .byte 1,2,4,8,16,32,64,128
33 .byte 1,2,4,8,16,32,64,128
// QUANT_TWO: quantize the 16 coefficients held in q8/q9.
// Visible steps compute out = ((coef + bias) * mf) >> 16 and store the
// result to [r0]. q14/q15 are arithmetic-shifted by 15 to form per-lane
// sign masks — assumes the caller left the original signed coefficients
// there (TODO confirm). The sign-restore and \mask bookkeeping
// instructions, the .if guard around the \load_mf reload, and the .endm
// fall in gaps of this excerpt — confirm against the full file.
37 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
// add the rounding bias to the coefficients
38 vadd.u16 q8, q8, \bias0
39 vadd.u16 q9, q9, \bias1
// reload multiply factors from [r1] (used when \load_mf=yes)
41 vld1.64 {\mf0-\mf3}, [r1,:128]!
// widening 16x16 -> 32-bit multiply by the quant scale factors
43 vmull.u16 q10, d16, \mf0
44 vmull.u16 q11, d17, \mf1
45 vmull.u16 q12, d18, \mf2
46 vmull.u16 q13, d19, \mf3
// build all-ones / all-zeros sign masks from the sign bits
47 vshr.s16 q14, q14, #15
48 vshr.s16 q15, q15, #15
// shift right by 16 while narrowing back to 16 bits:
// completes (coef + bias) * mf >> 16
49 vshrn.u32 d16, q10, #16
50 vshrn.u32 d17, q11, #16
51 vshrn.u32 d18, q12, #16
52 vshrn.u32 d19, q13, #16
// store 16 quantized coefficients (sign restore lines not visible here)
58 vst1.64 {d16-d19}, [r0,:128]!
68 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
69 function x264_quant_2x2_dc_neon
// load the four DC coefficients (r0 = dct)
70 vld1.64 {d0}, [r0,:64]
// NOTE(review): the quantization body (original lines 71..79) is not
// visible in this excerpt
// store the quantized result back in place
80 vst1.64 {d3}, [r0,:64]
84 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
85 function x264_quant_4x4_dc_neon
// load all 16 coefficients into q14/q15 (r0 = dct)
86 vld1.64 {d28-d31}, [r0,:128]
// NOTE(review): the bias/mf broadcast setup (original lines 87..90) is
// not visible in this excerpt. A single bias (q0) and mf pair (d4/d5)
// is applied to both coefficient halves.
91 QUANT_TWO q0, q0, d4, d5, d4, d5, q0
96 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
97 function x264_quant_4x4_neon
// load coefficients (q14/q15), per-coefficient bias (q0/q1 from r2)
// and multiply factors (q2/q3 from r1)
98 vld1.64 {d28-d31}, [r0,:128]
101 vld1.64 {d0-d3}, [r2,:128]
102 vld1.64 {d4-d7}, [r1,:128]
// NOTE(review): lines between the loads (original 99..100) are missing
// from this excerpt — presumably the abs/sign setup feeding QUANT_TWO
103 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
108 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
109 function x264_quant_4x4x4_neon
// NOTE(review): original line 110 (presumably a register save, since
// q4-q7 are callee-saved and used as \mask below) is not visible —
// confirm against the full file.
// block 0: shared bias (q0/q1) and mf (d4-d7); \mask destination q4
111 vld1.64 {d28-d31}, [r0,:128]
114 vld1.64 {d0-d3}, [r2,:128]
115 vld1.64 {d4-d7}, [r1,:128]
116 QUANT_TWO q0, q1, d4, d5, d6, d7, q4
// blocks 1-3 reuse the same bias/mf; \mask destinations q5/q6/q7.
// The lines between each load and QUANT_TWO are missing from this
// excerpt.
117 vld1.64 {d28-d31}, [r0,:128]
120 QUANT_TWO q0, q1, d4, d5, d6, d7, q5
121 vld1.64 {d28-d31}, [r0,:128]
124 QUANT_TWO q0, q1, d4, d5, d6, d7, q6
125 vld1.64 {d28-d31}, [r0,:128]
128 QUANT_TWO q0, q1, d4, d5, d6, d7, q7
149 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
150 function x264_quant_8x8_neon
// first 16 coefficients: bias in q0/q1 (from r2), mf in d4-d7 (from r1);
// both bias and mf pointers post-increment for the following groups
151 vld1.64 {d28-d31}, [r0,:128]
154 vld1.64 {d0-d3}, [r2,:128]!
155 vld1.64 {d4-d7}, [r1,:128]!
156 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
// next 16: reload bias into q1/q2 and pass load_mf=yes so QUANT_TWO
// reloads the multiply factors itself
158 vld1.64 {d28-d31}, [r0,:128]
161 vld1.64 {d2-d5}, [r2,:128]!
162 QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
// NOTE(review): handling of the remaining 32 coefficients (original
// lines 163+) is not visible in this excerpt
// DEQUANT_START: split i_qp (r2, entering via r3 after a scaled copy)
// into i_qbits = i_qp/6 and i_mf = i_qp%6, then point r1 at the right
// dequant_mf table (or, for dc, load its single scale into r1).
// NOTE(review): the reciprocal-multiply feeding the lsr (original line
// ~171) and the .if \dc guard selecting between the add and the ldr are
// not visible in this excerpt — confirm against the full file.
169 .macro DEQUANT_START mf_size offset dc=no
172 lsr r3, r3, #8 // i_qbits = i_qp / 6
173 add ip, r3, r3, lsl #1 // ip = i_qbits * 3
174 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
176 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
178 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
// set flags: a negative result selects the right-shift path in callers
180 subs r3, r3, #\offset // 6 for 8x8
183 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT: emits x264_dequant_<size>_neon with a left-shift path
// (i_qbits >= \bits) and a multiply-accumulate + right-shift path
// (i_qbits < \bits). Many interior lines are missing from this excerpt;
// comments on the missing pieces are marked NOTE(review).
184 .macro DEQUANT size bits
185 function x264_dequant_\size\()_neon
186 DEQUANT_START \bits+2, \bits
// negative i_qbits - \bits -> take the rounding right-shift path
190 blt dequant_\size\()_rshift
// left-shift path: per-loop, load four rows of 32-bit mf values and a
// batch of 16 coefficients, then store the rescaled coefficients back.
// NOTE(review): loop-count setup and the multiply/shift/narrow
// instructions fall in gaps of this excerpt.
193 dequant_\size\()_lshift_loop:
197 vld1.32 {d16-d17}, [r1,:128]!
198 vld1.32 {d18-d19}, [r1,:128]!
200 vld1.32 {d20-d21}, [r1,:128]!
202 vld1.32 {d22-d23}, [r1,:128]!
204 vld1.16 {d0-d3}, [r0,:128]
210 vst1.16 {d0-d3}, [r0,:128]!
212 bgt dequant_\size\()_lshift_loop
216 dequant_\size\()_rshift:
// NOTE(review): rounding-constant and negative-shift (q15) setup —
// original lines 217..223 — not visible in this excerpt
224 dequant_\size\()_rshift_loop:
228 vld1.32 {d16-d17}, [r1,:128]!
230 vld1.32 {d18-d19}, [r1,:128]!
232 vld1.32 {d16-d17}, [r1,:128]!
234 vld1.32 {d18-d19}, [r1,:128]!
236 vld1.16 {d0-d3}, [r0,:128]
// multiply-accumulate coef * mf into q10-q13; d4-d7 presumably hold the
// narrowed mf values and q10-q13 the preloaded rounding term — the
// producing instructions are not visible here (TODO confirm)
241 vmlal.s16 q10, d0, d4
242 vmlal.s16 q11, d1, d5
243 vmlal.s16 q12, d2, d6
244 vmlal.s16 q13, d3, d7
// arithmetic shift by the per-lane (negative) amount held in q15
245 vshl.s32 q10, q10, q15
246 vshl.s32 q11, q11, q15
247 vshl.s32 q12, q12, q15
248 vshl.s32 q13, q13, q15
// NOTE(review): narrowing back to int16 (into q0/q1) is not visible
254 vst1.16 {d0-d3}, [r0,:128]!
256 bgt dequant_\size\()_rshift_loop
265 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
266 function x264_dequant_4x4_dc_neon
// dc=yes: DEQUANT_START leaves the single scale dequant_mf[i_mf][0][0]
// in r1; flags select left- vs right-shift path
267 DEQUANT_START 6, 6, yes
268 blt dequant_4x4_dc_rshift
// left-shift path: load, rescale, store all 16 coefficients.
// NOTE(review): the scale-broadcast and multiply/shift instructions
// (original lines 269..276) are not visible in this excerpt.
272 vld1.16 {d0-d3}, [r0,:128]
277 vst1.16 {d0-d3}, [r0,:128]
280 dequant_4x4_dc_rshift:
// right-shift path: accumulate coef * scale into a rounding term, then
// arithmetic-shift by the negative amount in q15.
// NOTE(review): the setup lines (281..289) are missing; d4 presumably
// holds the broadcast dc scale and q10-q13 the rounding term — confirm.
290 vld1.16 {d0-d3}, [r0,:128]
294 vmlal.s16 q10, d0, d4
295 vmlal.s16 q11, d1, d4
296 vmlal.s16 q12, d2, d4
297 vmlal.s16 q13, d3, d4
298 vshl.s32 q10, q10, q15
299 vshl.s32 q11, q11, q15
300 vshl.s32 q12, q12, q15
301 vshl.s32 q13, q13, q15
// NOTE(review): narrowing back to int16 (lines 302..306) not visible
307 vst1.16 {d0-d3}, [r0,:128]
312 // int coeff_last( int16_t *l )
// Return the index of the last nonzero coefficient in a 4-entry block.
// NOTE(review): the entire function body (original lines 314..322) is
// not visible in this excerpt — documentation limited to the signature.
313 function x264_coeff_last4_arm
// coeff_last for an 8-entry block (same int16_t* argument in r0).
323 function x264_coeff_last8_arm
// load coefficients 4..7 (bytes 8..15) as a register pair.
// NOTE(review): the load of coefficients 0..3 and the search logic
// (original lines 325+) are not visible in this excerpt.
324 ldrd r2, r3, [r0, #8]
// COEFF_LAST_1x: emits x264_coeff_last<size>_neon, returning the index
// of the last nonzero int16_t coefficient in an 8*size-bit block.
337 .macro COEFF_LAST_1x size
338 function x264_coeff_last\size\()_neon
// load 16 coefficients (q0/q1)
342 vld1.64 {d0-d3}, [r0,:128]
// NOTE(review): the compare/pack/clz sequence producing ip, r1 and r3
// (original lines 343..352) is not visible in this excerpt; the lines
// below presumably select between the high half (index \size-8 + n)
// and the low half — confirm against the full file.
353 subs r1, ip, r1, lsr #2
354 addge r0, r1, #\size - 8
355 subslt r0, r3, r0, lsr #2
// coeff_last for a 64-entry block (int16_t *l in r0).
364 function x264_coeff_last64_neon
// load all 64 coefficients into q8..q15, advancing r0
365 vld1.64 {d16-d19}, [r0,:128]!
368 vld1.64 {d20-d23}, [r0,:128]!
371 vld1.64 {d24-d27}, [r0,:128]!
374 vld1.64 {d28-d31}, [r0,:128]!
// load the per-byte power-of-two mask table (pmovmskb emulation)
378 movrel r1, pmovmskb_byte
379 vld1.64 {d0-d1}, [r1,:128]
// NOTE(review): the compare-to-zero and mask-AND instructions (original
// lines 380..390) are not visible in this excerpt
// pairwise-add to fold per-byte masks toward a nonzero-coefficient
// bitmask
391 vpadd.u8 d0, d16, d17
392 vpadd.u8 d1, d18, d19
393 vpadd.u8 d2, d20, d21
394 vpadd.u8 d3, d22, d23
// denoise_dct: r0 = dct coefficients (int16), r1 = running 32-bit sums,
// r2 = a third uint16 array (presumably per-coefficient offsets — TODO
// confirm against the C prototype). Visible work: widen-accumulate
// d16-d19 into the sums and write back sums and modified coefficients.
409 function x264_denoise_dct_neon
// load 16 coefficients and the corresponding running sums
411 vld1.16 {q0, q1}, [r0]
412 vld1.32 {q12, q13}, [r1]!
413 vld1.32 {q14, q15}, [r1]
// load 16 entries from the third array, advancing r2
417 vld1.16 {q2, q3}, [r2]!
// accumulate d16-d19 (produced by instructions missing from this
// excerpt — presumably |coef|) into the 32-bit sums
420 vaddw.u16 q12, q12, d16
421 vaddw.u16 q13, q13, d17
424 vaddw.u16 q14, q14, d18
425 vaddw.u16 q15, q15, d19
// write back updated sums and the modified coefficients (q10/q11 are
// produced by instructions not visible here)
430 vst1.32 {q12, q13}, [r1]!
431 vst1.32 {q14, q15}, [r1]!
432 vst1.16 {q10, q11}, [r0]!