1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2013 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
// Per-byte bit weights {1,2,4,...,128} repeated to fill a 16-byte vector.
// Multiplying a 0x00/0xFF comparison-mask vector by this table and then
// pairwise-adding collapses it into a bitmask (SSE pmovmskb emulation).
// NOTE(review): the table's label sits in an extraction gap here;
// coeff_last64 below does `movrel r1, pmovmskb_byte`, presumably loading
// this table — confirm against the full file.
33 .byte 1,2,4,8,16,32,64,128
34 .byte 1,2,4,8,16,32,64,128
// QUANT_TWO: quantize 16 int16 coefficients (two q-registers per call).
//   q8/q9   = |dct| values (bias is added below)
//   q14/q15 = original signed dct values; shifted to all-ones/all-zeros
//             sign masks so the sign can be restored after quantization
//   \bias0/\bias1 = bias vectors; \mf0-\mf3 = multiplier d-registers
//   \load_mf=yes  reloads the multipliers from [r1] (second half of 8x8)
// Result: (|dct|+bias)*mf >> 16, sign restored, stored through r0.
// NOTE(review): the original lines that apply the sign masks (eor/sub)
// and the closing .endm fall in extraction gaps and are not visible here.
38 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
// |dct| + bias
39 vadd.u16 q8, q8, \bias0
40 vadd.u16 q9, q9, \bias1
// optional reload of the next 4 multiplier d-regs (guard .if is in a gap)
42 vld1.64 {\mf0-\mf3}, [r1,:128]!
// widening multiply: (|dct|+bias) * mf -> 32-bit products in q10-q13
44 vmull.u16 q10, d16, \mf0
45 vmull.u16 q11, d17, \mf1
46 vmull.u16 q12, d18, \mf2
47 vmull.u16 q13, d19, \mf3
// arithmetic shift by 15 turns each original dct value into a sign mask
// (0x0000 for >=0, 0xFFFF for <0), used later to restore the sign
48 vshr.s16 q14, q14, #15
49 vshr.s16 q15, q15, #15
// narrow the 32-bit products back to 16 bits: result = product >> 16
50 vshrn.u32 d16, q10, #16
51 vshrn.u32 d17, q11, #16
52 vshrn.u32 d18, q12, #16
53 vshrn.u32 d19, q13, #16
// store 16 quantized coefficients; r0 advances past them
59 vst1.64 {d16-d19}, [r0,:128]!
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Quantize the four chroma DC coefficients with a single shared mf/bias
// (r1 = mf, r2 = bias, broadcast to vectors in the missing middle).
70 function x264_quant_2x2_dc_neon
// load the 4 coefficients (64 bits) from dct
71 vld1.64 {d0}, [r0,:64]
// NOTE(review): everything between the load and the store (abs, +bias,
// *mf, >>16, sign restore, and the nonzero-flag return in r0) is missing
// from this view — see the full file.
// store the 4 quantized coefficients back in place
81 vst1.64 {d3}, [r0,:64]
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Quantize the 16 luma DC coefficients with a single shared mf/bias
// (scalars in r1/r2; their vector broadcast lines fall in a gap here).
86 function x264_quant_4x4_dc_neon
// load all 16 coefficients into q14/q15 (also the sign source for QUANT_TWO)
87 vld1.64 {d28-d31}, [r0,:128]
// same bias (q0) and same mf pair (d4,d5) for both halves, since DC
// quantization uses one constant for every coefficient
92 QUANT_TWO q0, q0, d4, d5, d4, d5
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Per-coefficient quantization of a 4x4 block.
98 function x264_quant_4x4_neon
// load the 16 coefficients into q14/q15 (sign source for QUANT_TWO);
// the abs step into q8/q9 sits in an extraction gap
99 vld1.64 {d28-d31}, [r0,:128]
// q0/q1 = 16 bias values from r2
102 vld1.64 {d0-d3}, [r2,:128]
// d4-d7 = 16 multiplier values from r1
103 vld1.64 {d4-d7}, [r1,:128]
// quantize all 16 coefficients in one macro expansion
104 QUANT_TWO q0, q1, d4, d5, d6, d7
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// Per-coefficient quantization of an 8x8 block, 16 coefficients per
// QUANT_TWO call; the loop/repeat structure around the second call
// falls in extraction gaps here.
110 function x264_quant_8x8_neon
// first 16 coefficients -> q14/q15
111 vld1.64 {d28-d31}, [r0,:128]
// first 16 bias values (r2) and multipliers (r1); both pointers advance
114 vld1.64 {d0-d3}, [r2,:128]!
115 vld1.64 {d4-d7}, [r1,:128]!
116 QUANT_TWO q0, q1, d4, d5, d6, d7
// next 16 coefficients
118 vld1.64 {d28-d31}, [r0,:128]
// next 16 bias values into q1/q2 (d2-d5)
121 vld1.64 {d2-d5}, [r2,:128]!
// load_mf=yes: QUANT_TWO reloads d4-d7 from [r1] itself this time
122 QUANT_TWO q1, q2, d4, d5, d6, d7, yes
// DEQUANT_START: shared prologue for the dequant functions.
// Splits r3 = i_qp into i_qbits = i_qp/6 (r3) and i_mf = i_qp%6, then
// points r1 at dequant_mf[i_mf] (or loads its first entry for \dc=yes)
// and sets flags on i_qbits - \offset to pick the lshift/rshift path.
// NOTE(review): the multiply that precedes the lsr (presumably
// r3 = i_qp * 0x2B so that >>8 divides by 6) and the .if \dc / .endm
// lines are in extraction gaps — confirm against the full file.
129 .macro DEQUANT_START mf_size offset dc=no
132 lsr r3, r3, #8 // i_qbits = i_qp / 6
// ip = 3*i_qbits; then r2 -= 6*i_qbits, i.e. i_qp - 6*(i_qp/6) = i_qp%6
133 add ip, r3, r3, lsl #1
134 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
// table row: r1 += i_mf * sizeof(row); row size is 2^\mf_size bytes
136 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
// dc variant instead loads the single shared multiplier
138 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
// sets N flag when i_qbits < \offset -> caller branches to rshift path
140 subs r3, r3, #\offset // 6 for 8x8
143 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT: emits dequant_4x4 or dequant_8x8.
//   dct[i] = dct[i] * dequant_mf[i_qp%6][i] << (i_qp/6 - \bits)
// with a left-shift fast path when the net shift is >= 0 and a
// rounded right-shift path otherwise.
// NOTE(review): loop counters, the shift-vector setup, the vmul/vqmovn
// lines of the lshift loop, the rounding-constant setup and vmovn
// narrowing of the rshift loop, and the .endm all fall in extraction
// gaps — only the skeleton below is visible.
144 .macro DEQUANT size bits
145 function x264_dequant_\size\()_neon
146 DEQUANT_START \bits+2, \bits
// negative i_qbits - \bits => must right-shift with rounding
150 blt dequant_\size\()_rshift
153 dequant_\size\()_lshift_loop:
// q8-q11 = 16 int32 multipliers from dequant_mf
157 vld1.32 {d16-d17}, [r1,:128]!
158 vld1.32 {d18-d19}, [r1,:128]!
160 vld1.32 {d20-d21}, [r1,:128]!
162 vld1.32 {d22-d23}, [r1,:128]!
// q0/q1 = 16 int16 coefficients (loaded in place, stored back below)
164 vld1.16 {d0-d3}, [r0,:128]
170 vst1.16 {d0-d3}, [r0,:128]!
172 bgt dequant_\size\()_lshift_loop
176 dequant_\size\()_rshift:
184 dequant_\size\()_rshift_loop:
// multipliers are loaded 8 at a time and (in lines missing here)
// narrowed with vmovn into d4-d7 between the loads, which is why
// d16-d19 can be safely reused by the second pair of loads
188 vld1.32 {d16-d17}, [r1,:128]!
190 vld1.32 {d18-d19}, [r1,:128]!
192 vld1.32 {d16-d17}, [r1,:128]!
194 vld1.32 {d18-d19}, [r1,:128]!
196 vld1.16 {d0-d3}, [r0,:128]
// q10-q13 pre-seeded with the rounding constant (setup in a gap);
// accumulate dct * mf on top of it
201 vmlal.s16 q10, d0, d4
202 vmlal.s16 q11, d1, d5
203 vmlal.s16 q12, d2, d6
204 vmlal.s16 q13, d3, d7
// vshl by a negative vector (q15) == arithmetic right shift
205 vshl.s32 q10, q10, q15
206 vshl.s32 q11, q11, q15
207 vshl.s32 q12, q12, q15
208 vshl.s32 q13, q13, q15
214 vst1.16 {d0-d3}, [r0,:128]!
216 bgt dequant_\size\()_rshift_loop
225 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DC variant: one shared multiplier dequant_mf[i_qp%6][0][0] (loaded by
// DEQUANT_START's dc=yes path into r1, broadcast to d4 in lines that
// fall in extraction gaps here) applied to all 16 coefficients.
226 function x264_dequant_4x4_dc_neon
227 DEQUANT_START 6, 6, yes
// negative i_qbits-6 => rounded right-shift path
228 blt dequant_4x4_dc_rshift
// lshift path: load, multiply/shift (missing lines), store in place
232 vld1.16 {d0-d3}, [r0,:128]
237 vst1.16 {d0-d3}, [r0,:128]
240 dequant_4x4_dc_rshift:
250 vld1.16 {d0-d3}, [r0,:128]
// q10-q13 pre-seeded with the rounding constant (setup in a gap);
// every lane uses the same multiplier d4
254 vmlal.s16 q10, d0, d4
255 vmlal.s16 q11, d1, d4
256 vmlal.s16 q12, d2, d4
257 vmlal.s16 q13, d3, d4
// vshl by negative vector q15 == arithmetic right shift by i_qbits
258 vshl.s32 q10, q10, q15
259 vshl.s32 q11, q11, q15
260 vshl.s32 q12, q12, q15
261 vshl.s32 q13, q13, q15
// narrow back to int16 (vqmovn lines missing) and store in place
267 vst1.16 {d0-d3}, [r0,:128]
272 // int coeff_last( int16_t *l )
// Returns the index of the last nonzero coefficient among the 4 in *l.
// NOTE(review): the entire body (plain-ARM bit scan, no NEON) is missing
// from this view — see the full file.
273 function x264_coeff_last4_arm
// COEFF_LAST_1x: emits coeff_last\size (15 or 16 coefficients).
// Builds a nonzero mask with NEON, collapses it to a scalar, then uses
// clz-style arithmetic to find the index of the last nonzero coeff.
// NOTE(review): the .if that picks between the two loads, the compare/
// narrow sequence, the clz/rbit lines, and the .endm are in extraction
// gaps — only fragments are visible below.
283 .macro COEFF_LAST_1x size
284 function x264_coeff_last\size\()_neon
// size==15 variant: unaligned (dct+1) load of the 15 AC coefficients
287 vld1.64 {d0-d3}, [r0]
// size==16 variant: aligned load of all 16 coefficients
289 vld1.64 {d0-d3}, [r0,:128]
// r1 holds a packed mask/count at this point (setup lines missing);
// subs computes the provisional distance and sets flags for the
// conditional fixups below
301 subs r1, ip, r1, lsr #2
// >= 0: index is in the upper half -> bias by \size - 8
302 addge r0, r1, #\size - 8
// < 0: index is in the lower half (pre-UAL "sublts" = subs + lt)
303 sublts r0, r3, r0, lsr #2
312 function x264_coeff_last64_neon
313 vld1.64 {d16-d19}, [r0,:128]!
316 vld1.64 {d20-d23}, [r0,:128]!
319 vld1.64 {d24-d27}, [r0,:128]!
322 vld1.64 {d28-d31}, [r0,:128]!
326 movrel r1, pmovmskb_byte
327 vld1.64 {d0-d1}, [r1,:128]
339 vpadd.u8 d0, d16, d17
340 vpadd.u8 d1, d18, d19
341 vpadd.u8 d2, d20, d21
342 vpadd.u8 d3, d22, d23