1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
// Per-byte bit masks {1<<0 .. 1<<7}, duplicated so a full 16-byte q-register
// can be loaded at once. Presumably this is the `pmovmskb_byte` table that
// the movrel in coeff_last64 below loads — its label line is elided from
// this chunk; confirm against the full file.
33 .byte 1,2,4,8,16,32,64,128
34 .byte 1,2,4,8,16,32,64,128
// QUANT_TWO: quantize 16 coefficients held in q8/q9:
//   out = sign(coef) * (((|coef| + bias) * mf) >> 16)
// NOTE(review): several original lines are elided from this chunk (the
// guard around the mf load, the sign-restore sequence before the store,
// and the .endm), so the comments below cover only the visible code.
38 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
39 vadd.u16 q8, q8, \bias0 // |coef| + bias (q8/q9 are prepared by the caller in elided lines)
40 vadd.u16 q9, q9, \bias1
42 vld1.64 {\mf0-\mf3}, [r1,:128]! // load mf[] — presumably guarded by \load_mf via an elided .if; confirm
44 vmull.u16 q10, d16, \mf0 // widening multiply: (|coef|+bias) * mf -> 32-bit products
45 vmull.u16 q11, d17, \mf1
46 vmull.u16 q12, d18, \mf2
47 vmull.u16 q13, d19, \mf3
48 vshr.s16 q14, q14, #15 // sign masks from the original coeffs (q14/q15 loaded in elided lines)
49 vshr.s16 q15, q15, #15
50 vshrn.u32 d16, q10, #16 // narrow: product >> 16 back to 16-bit lanes
51 vshrn.u32 d17, q11, #16
52 vshrn.u32 d18, q12, #16
53 vshrn.u32 d19, q13, #16
59 vst1.64 {d16-d19}, [r0,:128]! // store quantized coeffs (sign restore happens in elided lines 54-58)
69 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
// In: r0 = dct (4 coeffs), r1 = single mf, r2 = single bias.
// NOTE(review): lines 72-80 (abs/bias/multiply/sign-restore and the
// function return) are elided from this chunk.
70 function x264_quant_2x2_dc_neon
71 vld1.64 {d0}, [r0,:64] // load the 4 dc coefficients
81 vst1.64 {d3}, [r0,:64] // store the quantized result in place
85 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Single mf/bias pair broadcast over all 16 coefficients.
// NOTE(review): lines 88-91 (abs value, mf/bias broadcast from r1/r2)
// and the return after the macro call are elided from this chunk.
86 function x264_quant_4x4_dc_neon
87 vld1.64 {d28-d31}, [r0,:128] // 16 coeffs -> q14/q15 (sign source for QUANT_TWO)
92 QUANT_TWO q0, q0, d4, d5, d4, d5, q0 // same dc bias/mf replicated for both halves
97 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Per-coefficient mf/bias tables.
// NOTE(review): lines 100-101 (abs value into q8/q9) and the return after
// the macro call are elided from this chunk.
98 function x264_quant_4x4_neon
99 vld1.64 {d28-d31}, [r0,:128] // coeffs -> q14/q15
102 vld1.64 {d0-d3}, [r2,:128] // bias[16] -> q0/q1
103 vld1.64 {d4-d7}, [r1,:128] // mf[16] -> q2/q3
104 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
109 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes four consecutive 4x4 blocks with the same mf/bias tables.
// The distinct last macro arguments (q4..q7) presumably collect a per-block
// nonzero mask for the packed return value — the mask-combining and return
// code after the fourth call is elided from this chunk; confirm.
// NOTE(review): the abs-value lines between each load and each QUANT_TWO
// call are also elided.
110 function x264_quant_4x4x4_neon
112 vld1.64 {d28-d31}, [r0,:128] // block 0 coeffs -> q14/q15
115 vld1.64 {d0-d3}, [r2,:128] // bias[16] -> q0/q1 (shared by all 4 blocks)
116 vld1.64 {d4-d7}, [r1,:128] // mf[16] -> q2/q3 (shared by all 4 blocks)
117 QUANT_TWO q0, q1, d4, d5, d6, d7, q4
118 vld1.64 {d28-d31}, [r0,:128] // block 1 (r0 advanced by QUANT_TWO's store)
121 QUANT_TWO q0, q1, d4, d5, d6, d7, q5
122 vld1.64 {d28-d31}, [r0,:128] // block 2
125 QUANT_TWO q0, q1, d4, d5, d6, d7, q6
126 vld1.64 {d28-d31}, [r0,:128] // block 3
129 QUANT_TWO q0, q1, d4, d5, d6, d7, q7
150 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// 64 coefficients: first QUANT_TWO uses preloaded mf/bias; the later call
// passes load_mf=yes so the macro streams further mf entries from r1.
// NOTE(review): the abs-value lines, the loop/repeat structure covering all
// 64 coefficients, and the return are elided from this chunk — only the
// first and one subsequent iteration are visible.
151 function x264_quant_8x8_neon
152 vld1.64 {d28-d31}, [r0,:128] // first 16 coeffs -> q14/q15
155 vld1.64 {d0-d3}, [r2,:128]! // bias -> q0/q1 (post-increment: streaming)
156 vld1.64 {d4-d7}, [r1,:128]! // mf -> q2/q3 (post-increment: streaming)
157 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
159 vld1.64 {d28-d31}, [r0,:128] // next 16 coeffs
162 vld1.64 {d2-d5}, [r2,:128]! // next bias chunk
163 QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes // load_mf=yes: macro fetches mf itself
// DEQUANT_START: common dequant prologue — split i_qp (r2) into
// i_qbits = i_qp/6 (r3) and i_mf = i_qp%6, then point r1 at the right
// dequant_mf table entry. Sets flags for the caller's blt/bgt on i_qbits.
// NOTE(review): the multiply that scales i_qp before the lsr (division by 6
// via reciprocal) and the .if \dc / .else / .endif guards selecting between
// the add and the ldr below are elided from this chunk — only one of those
// two address computations executes per expansion; confirm in the full file.
170 .macro DEQUANT_START mf_size offset dc=no
173 lsr r3, r3, #8 // i_qbits = i_qp / 6
174 add ip, r3, r3, lsl #1 // ip = 3 * (i_qp/6)
175 sub r2, r2, ip, lsl #1 // i_mf = i_qp - 6*(i_qp/6) = i_qp % 6
177 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf] (table pointer; non-dc path)
179 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0] (single scalar; dc path)
181 subs r3, r3, #\offset // 6 for 8x8
184 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT: generates dequant_\size (4x4 and 8x8 expansions elsewhere).
// Two paths on i_qbits: left-shift when i_qbits >= 0, otherwise a
// multiply-accumulate with rounding and a right shift.
// NOTE(review): this macro is heavily elided in this chunk — the loop
// counters, the vmovn/rounding setup between loads, the lshift-path
// multiply/shift body, and the .endm are all missing. Comments below
// describe only the visible instructions; verify against the full file.
185 .macro DEQUANT size bits
186 function x264_dequant_\size\()_neon
187 DEQUANT_START \bits+2, \bits
191 blt dequant_\size\()_rshift // i_qbits < 0 -> right-shift path
194 dequant_\size\()_lshift_loop:
198 vld1.32 {d16-d17}, [r1,:128]! // stream dequant_mf entries (32-bit)
199 vld1.32 {d18-d19}, [r1,:128]!
201 vld1.32 {d20-d21}, [r1,:128]!
203 vld1.32 {d22-d23}, [r1,:128]!
205 vld1.16 {d0-d3}, [r0,:128] // 16 coeffs in place
211 vst1.16 {d0-d3}, [r0,:128]! // store dct[i] (scaled in elided lines 206-210)
213 bgt dequant_\size\()_lshift_loop
217 dequant_\size\()_rshift:
225 dequant_\size\()_rshift_loop:
229 vld1.32 {d16-d17}, [r1,:128]! // mf entries; presumably narrowed to d4-d7 in the elided lines between loads — confirm
231 vld1.32 {d18-d19}, [r1,:128]!
233 vld1.32 {d16-d17}, [r1,:128]!
235 vld1.32 {d18-d19}, [r1,:128]!
237 vld1.16 {d0-d3}, [r0,:128] // 16 coeffs
242 vmlal.s16 q10, d0, d4 // round + coef*mf (q10-q13 preloaded with rounding offset in elided lines)
243 vmlal.s16 q11, d1, d5
244 vmlal.s16 q12, d2, d6
245 vmlal.s16 q13, d3, d7
246 vshl.s32 q10, q10, q15 // q15 holds -i_qbits: arithmetic right shift via vshl with negative count
247 vshl.s32 q11, q11, q15
248 vshl.s32 q12, q12, q15
249 vshl.s32 q13, q13, q15
255 vst1.16 {d0-d3}, [r0,:128]! // store (narrowed back to 16-bit in elided lines 250-254)
257 bgt dequant_\size\()_rshift_loop
266 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DC variant: a single scalar mf (DEQUANT_START dc=yes loads
// dequant_mf[i_mf][0][0] into r1) applied to all 16 coefficients.
// NOTE(review): elided in this chunk — the lshift-path scale/multiply body,
// the scalar broadcast into d4, the rounding setup of q10-q13/q15, the
// narrowing back to 16-bit before each store, and both returns.
267 function x264_dequant_4x4_dc_neon
268 DEQUANT_START 6, 6, yes
269 blt dequant_4x4_dc_rshift // i_qbits < 0 -> right-shift path
273 vld1.16 {d0-d3}, [r0,:128] // 16 coeffs
278 vst1.16 {d0-d3}, [r0,:128] // store (scaled in elided lines 274-277)
281 dequant_4x4_dc_rshift:
291 vld1.16 {d0-d3}, [r0,:128] // 16 coeffs
295 vmlal.s16 q10, d0, d4 // round + coef*mf; d4 presumably holds the broadcast dc mf — confirm (setup elided)
296 vmlal.s16 q11, d1, d4
297 vmlal.s16 q12, d2, d4
298 vmlal.s16 q13, d3, d4
299 vshl.s32 q10, q10, q15 // q15 = -i_qbits: right shift via negative vshl count
300 vshl.s32 q11, q11, q15
301 vshl.s32 q12, q12, q15
302 vshl.s32 q13, q13, q15
308 vst1.16 {d0-d3}, [r0,:128] // store (narrowed in elided lines 303-307)
313 // int coeff_last( int16_t *l )
// Returns the index of the last nonzero coefficient.
// NOTE(review): the entire body of coeff_last4 (lines 315-323) is elided
// from this chunk.
314 function x264_coeff_last4_arm
// coeff_last8: scalar ARM variant for an 8-coefficient block.
// NOTE(review): all but the first load (the clz/scan logic and return,
// lines 326-337) are elided from this chunk.
324 function x264_coeff_last8_arm
325 ldrd r2, r3, [r0, #8] // load coeffs 4-7 (upper half) as a 64-bit pair
// COEFF_LAST_1x: generates coeff_last\size (NEON) for one 16-byte-aligned
// (or unaligned, for the first load variant) run of up to \size coeffs.
// NOTE(review): heavily elided — the .if selecting between the two loads,
// the nonzero-test/narrow/clz sequence (lines 340-355), and the return are
// missing from this chunk; the arithmetic below converts a leading-zero
// count into a coefficient index, presumably via lsr #2 because each
// coefficient maps to 4 bits of the scanned mask — confirm.
338 .macro COEFF_LAST_1x size
339 function x264_coeff_last\size\()_neon
342 vld1.64 {d0-d3}, [r0] // unaligned-load variant
344 vld1.64 {d0-d3}, [r0,:128] // aligned-load variant (one of the two is selected by an elided .if)
356 subs r1, ip, r1, lsr #2 // index within upper half from clz, sets flags
357 addge r0, r1, #\size - 8 // last nonzero is in the upper half
358 subslt r0, r3, r0, lsr #2 // otherwise derive index from the lower-half clz
367 function x264_coeff_last64_neon
368 vld1.64 {d16-d19}, [r0,:128]!
371 vld1.64 {d20-d23}, [r0,:128]!
374 vld1.64 {d24-d27}, [r0,:128]!
377 vld1.64 {d28-d31}, [r0,:128]!
381 movrel r1, pmovmskb_byte
382 vld1.64 {d0-d1}, [r1,:128]
394 vpadd.u8 d0, d16, d17
395 vpadd.u8 d1, d18, d19
396 vpadd.u8 d2, d20, d21
397 vpadd.u8 d3, d22, d23