1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
// Per-lane bit-weight table (1,2,4,...,128 repeated for 16 bytes).
// Presumably the data behind the `pmovmskb_byte` label used by
// coeff_last64 below (vand + pairwise adds emulate x86 PMOVMSKB);
// the label line itself is not visible in this chunk — TODO confirm.
31 .byte 1,2,4,8,16,32,64,128
32 .byte 1,2,4,8,16,32,64,128
// Quantize 16 coefficients (two rows of 8):
//   out = ((|coef| + bias) * mf) >> 16, sign restored from the
//   original signed coefficients held in q14/q15.
// q8/q9 hold the coefficient magnitudes on entry. NOTE(review): the
// macro body is only partially visible here — the .ifc load_mf guard
// and the sign-application / \mask-update instructions are elided.
36 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
37 vadd.u16 q8, q8, \bias0
38 vadd.u16 q9, q9, \bias1
// Streams the multiplier factors from [r1] (used when load_mf=yes;
// the guarding directive is not visible in this view)
40 vld1.64 {\mf0-\mf3}, [r1,:128]!
// Widening 16x16->32 multiplies: (|coef| + bias) * mf
42 vmull.u16 q10, d16, \mf0
43 vmull.u16 q11, d17, \mf1
44 vmull.u16 q12, d18, \mf2
45 vmull.u16 q13, d19, \mf3
// Arithmetic shift by 15 turns the original coefficients into
// all-ones (negative) / all-zeros (non-negative) sign masks
46 vshr.s16 q14, q14, #15
47 vshr.s16 q15, q15, #15
// Narrow back to 16 bits keeping the high halves (the >> 16)
48 vshrn.u32 d16, q10, #16
49 vshrn.u32 d17, q11, #16
50 vshrn.u32 d18, q12, #16
51 vshrn.u32 d19, q13, #16
// (elided: sign restoration via the q14/q15 masks — not visible here)
// Store 16 quantized coefficients; the post-increment advances r0 to
// the next 4x4 block for multi-block callers
57 vst1.64 {d16-d19}, [r0,:128]!
67 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
68 function x264_quant_2x2_dc_neon
// Load the four chroma-dc coefficients. NOTE(review): the actual
// quantization arithmetic between load and store, and the return,
// are elided from this view — only entry/exit lines are visible.
69 vld1.64 {d0}, [r0,:64]
70 vst1.64 {d3}, [r0,:64]
83 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
84 function x264_quant_4x4_dc_neon
// q14/q15 = signed input coefficients (QUANT_TWO derives its sign
// masks from these). The magnitude/bias/mf broadcast setup between
// this load and the macro call is elided from this view.
85 vld1.64 {d28-d31}, [r0,:128]
// Single scalar bias (q0 twice) and mf (d4/d5 repeated) applied to
// all 16 coefficients
90 QUANT_TWO q0, q0, d4, d5, d4, d5, q0
95 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
96 function x264_quant_4x4_neon
// q14/q15 = signed input coefficients (sign source for QUANT_TWO);
// the absolute-value computation into q8/q9 is elided from this view.
97 vld1.64 {d28-d31}, [r0,:128]
// q0/q1 = per-coefficient bias table, q2/q3 = per-coefficient mf table
100 vld1.64 {d0-d3}, [r2,:128]
101 vld1.64 {d4-d7}, [r1,:128]
102 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
107 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
108 function x264_quant_4x4x4_neon
// Quantizes four consecutive 4x4 blocks with one shared bias/mf
// table. A distinct \mask register (q4..q7) is passed per block —
// presumably collecting per-block nonzero flags inside the macro
// (the mask-update instructions are elided from this view). r0 is
// advanced block-to-block by QUANT_TWO's post-incremented store.
// Block 0:
110 vld1.64 {d28-d31}, [r0,:128]
113 vld1.64 {d0-d3}, [r2,:128]
114 vld1.64 {d4-d7}, [r1,:128]
115 QUANT_TWO q0, q1, d4, d5, d6, d7, q4
// Block 1 (bias/mf already resident in q0-q3):
116 vld1.64 {d28-d31}, [r0,:128]
119 QUANT_TWO q0, q1, d4, d5, d6, d7, q5
// Block 2:
120 vld1.64 {d28-d31}, [r0,:128]
123 QUANT_TWO q0, q1, d4, d5, d6, d7, q6
// Block 3:
124 vld1.64 {d28-d31}, [r0,:128]
127 QUANT_TWO q0, q1, d4, d5, d6, d7, q7
148 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
149 function x264_quant_8x8_neon
// 64 coefficients processed in 16-coefficient passes; only the first
// two passes are visible in this chunk (the remaining iterations and
// the return are elided).
// Pass 0: q14/q15 = signed coefficients, q0/q1 = bias, q2/q3 = mf
150 vld1.64 {d28-d31}, [r0,:128]
153 vld1.64 {d0-d3}, [r2,:128]!
154 vld1.64 {d4-d7}, [r1,:128]!
155 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
// Pass 1: reload signed coefficients and the next 16 bias entries
// (into q1/q2); load_mf=yes makes the macro stream mf from [r1]
157 vld1.64 {d28-d31}, [r0,:128]
160 vld1.64 {d2-d5}, [r2,:128]!
161 QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
// Shared dequant prologue: split the quantizer into a table index
// (i_mf = i_qp % 6) and a shift amount (i_qbits = i_qp / 6), point r1
// at the right dequant table entry, and leave i_qbits - \offset in r3
// with flags set so the caller can branch to its right-shift path.
// NOTE(review): interior lines are elided — r3 presumably holds i_qp
// pre-scaled by a reciprocal multiply before the lsr, and the
// add/ldr pair below is presumably selected by an elided .if dc
// guard. TODO confirm against the full file.
168 .macro DEQUANT_START mf_size offset dc=no
171 lsr r3, r3, #8 // i_qbits = i_qp / 6
// ip = i_qbits * 3; then r2 -= i_qbits * 6 leaves i_qp mod 6
172 add ip, r3, r3, lsl #1
173 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
// Matrix variant: r1 = &dequant_mf[i_mf]
175 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
// DC variant: load the single multiplier instead
177 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
// Flags from this subtraction drive the caller's blt
179 subs r3, r3, #\offset // 6 for 8x8
182 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// Generates dequant_4x4 / dequant_8x8 style functions:
//   dct[i] = (dct[i] * dequant_mf[i_mf][i]) << (i_qbits - \bits)
// with a rounded right shift when i_qbits - \bits is negative.
// NOTE(review): several interior lines of each path (loop-counter
// setup, mf narrowing, rounding-constant setup, result narrowing,
// returns) are elided from this view.
183 .macro DEQUANT size bits
184 function x264_dequant_\size\()_neon
185 DEQUANT_START \bits+2, \bits
// Negative shift count => rounded right-shift path
189 blt dequant_\size\()_rshift
192 dequant_\size\()_lshift_loop:
// Stream 16 32-bit dequant factors and 16 16-bit coefficients
196 vld1.32 {d16-d17}, [r1,:128]!
197 vld1.32 {d18-d19}, [r1,:128]!
199 vld1.32 {d20-d21}, [r1,:128]!
201 vld1.32 {d22-d23}, [r1,:128]!
203 vld1.16 {d0-d3}, [r0,:128]
// (elided: multiply and left-shift by i_qbits, results back in q0/q1)
209 vst1.16 {d0-d3}, [r0,:128]!
211 bgt dequant_\size\()_lshift_loop
215 dequant_\size\()_rshift:
// (elided: presumably vdup of the negated shift into q15 and the
//  rounding constant into q10-q13 — TODO confirm, setup not visible)
223 dequant_\size\()_rshift_loop:
// Load factors; elided lines between these loads presumably narrow
// each 32-bit pair into d4-d7 before the registers are reused
227 vld1.32 {d16-d17}, [r1,:128]!
229 vld1.32 {d18-d19}, [r1,:128]!
231 vld1.32 {d16-d17}, [r1,:128]!
233 vld1.32 {d18-d19}, [r1,:128]!
235 vld1.16 {d0-d3}, [r0,:128]
// Multiply-accumulate coefficients into the pre-seeded q10-q13
240 vmlal.s16 q10, d0, d4
241 vmlal.s16 q11, d1, d5
242 vmlal.s16 q12, d2, d6
243 vmlal.s16 q13, d3, d7
// vshl by a negative per-lane count in q15 is a right shift
244 vshl.s32 q10, q10, q15
245 vshl.s32 q11, q11, q15
246 vshl.s32 q12, q12, q15
247 vshl.s32 q13, q13, q15
// (elided: narrow q10-q13 back to 16 bits into q0/q1)
253 vst1.16 {d0-d3}, [r0,:128]!
255 bgt dequant_\size\()_rshift_loop
264 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
265 function x264_dequant_4x4_dc_neon
// dc=yes: DEQUANT_START loads the single dc multiplier and leaves
// i_qbits - 6 in the flags for the branch below
266 DEQUANT_START 6, 6, yes
267 blt dequant_4x4_dc_rshift
// Left-shift path (multiplier broadcast and shift setup elided)
271 vld1.16 {d0-d3}, [r0,:128]
// (elided: multiply by the dc factor, shift left, narrow to q0/q1)
276 vst1.16 {d0-d3}, [r0,:128]
279 dequant_4x4_dc_rshift:
// Rounded right-shift path: one multiplier (d4) for all 16 lanes.
// (elided: d4 broadcast, rounding constant in q10-q13, negative
//  shift count in q15 — setup not visible in this chunk)
289 vld1.16 {d0-d3}, [r0,:128]
293 vmlal.s16 q10, d0, d4
294 vmlal.s16 q11, d1, d4
295 vmlal.s16 q12, d2, d4
296 vmlal.s16 q13, d3, d4
// Negative per-lane counts make vshl act as an arithmetic right shift
297 vshl.s32 q10, q10, q15
298 vshl.s32 q11, q11, q15
299 vshl.s32 q12, q12, q15
300 vshl.s32 q13, q13, q15
// (elided: narrow back to 16-bit results in q0/q1)
306 vst1.16 {d0-d3}, [r0,:128]
311 // int coeff_last( int16_t *l )
312 function x264_coeff_last4_arm
// NOTE(review): the entire body is elided from this view — presumably
// returns the index of the last nonzero coefficient of the 4; confirm
// against the full file.
322 function x264_coeff_last8_arm
// Load coefficients 4-7 (16 bits each) into r2/r3; the handling of
// the low half and the rest of the body are elided from this view.
323 ldrd r2, r3, [r0, #8]
// Generates coeff_last\size (last nonzero coefficient index) for one
// row of \size coefficients. NOTE(review): the .if guards selecting
// between the two load variants, and the mask-building / clz lines,
// are elided from this view.
336 .macro COEFF_LAST_1x size
337 function x264_coeff_last\size\()_neon
// Unaligned load variant (presumably the odd-sized case) vs the
// 128-bit-aligned variant — only one is assembled per \size
340 vld1.64 {d0-d3}, [r0]
342 vld1.64 {d0-d3}, [r0,:128]
// Convert the (elided) leading-zero counts in r1/r3 into the index of
// the last nonzero coefficient, selecting high/low half by sign
354 subs r1, ip, r1, lsr #2
355 addge r0, r1, #\size - 8
356 subslt r0, r3, r0, lsr #2
365 function x264_coeff_last64_neon
// Load all 64 coefficients, 32 bytes per load (q8-q15 in total);
// elided lines between the loads presumably reduce each batch to
// per-byte nonzero flags — TODO confirm against the full file.
366 vld1.64 {d16-d19}, [r0,:128]!
369 vld1.64 {d20-d23}, [r0,:128]!
372 vld1.64 {d24-d27}, [r0,:128]!
375 vld1.64 {d28-d31}, [r0,:128]!
// Per-lane bit-weight table (defined near the top of the file) for
// PMOVMSKB-style bitmask extraction
379 movrel r1, pmovmskb_byte
380 vld1.64 {d0-d1}, [r1,:128]
// (elided: vand of the nonzero flags with the weight table)
// Pairwise-add reduction toward a packed bitmask; this function
// continues past the end of the visible chunk.
392 vpadd.u8 d0, d16, d17
393 vpadd.u8 d1, d18, d19
394 vpadd.u8 d2, d20, d21
395 vpadd.u8 d3, d22, d23