1 /*****************************************************************************
2 * quant.S: h264 encoder
3 *****************************************************************************
4 * Copyright (C) 2009 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *****************************************************************************/
// 16-byte table of per-lane bit weights (1<<0 .. 1<<7, twice).
// NOTE(review): the label is not visible in this chunk — from the later
// `movrel r1, pmovmskb_byte` this is presumably the pmovmskb_byte constant
// used to emulate x86 PMOVMSKB in coeff_last64; confirm against full file.
30 .byte 1,2,4,8,16,32,64,128
31 .byte 1,2,4,8,16,32,64,128
// QUANT_TWO: quantize 16 int16 coefficients (two q-registers) in one pass.
// Caller preloads q8/q9 with abs(dct) and q14/q15 with values whose signs
// select negation below; \bias0/\bias1 = rounding bias, \mf0-\mf3 = multiply
// factors (refetched from [r1] when \load_mf == yes). Result is
// ((abs(dct)+bias)*mf) >> 16, stored through r0 with post-increment.
// NOTE(review): lines are missing from this chunk (the conditional around
// the mf load, the sign-restore between the shifts and the store, and the
// closing .endm) — visible code kept byte-identical.
35 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
36 vadd.u16 q8, q8, \bias0 // abs(coef) + bias
37 vadd.u16 q9, q9, \bias1
39 vld1.64 {\mf0-\mf3}, [r1,:128]! // load_mf path: fetch next mf block
41 vmull.u16 q10, d16, \mf0 // widening multiply: (abs+bias)*mf -> u32
42 vmull.u16 q11, d17, \mf1
43 vmull.u16 q12, d18, \mf2
44 vmull.u16 q13, d19, \mf3
45 vshr.s16 q14, q14, #15 // per-lane sign masks: 0 or -1
46 vshr.s16 q15, q15, #15
47 vshrn.u32 d16, q10, #16 // narrow back to 16 bit, dropping low 16
48 vshrn.u32 d17, q11, #16
49 vshrn.u32 d18, q12, #16
50 vshrn.u32 d19, q13, #16
56 vst1.64 {d16-d19}, [r0,:128]! // store quantized coefficients
66 // quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
// Quantize the four chroma DC coefficients in place (r0 = dct,
// r1 = scalar mf, r2 = scalar bias per the C prototype above).
// NOTE(review): interior lines 69-77 (presumably abs, bias add, multiply,
// sign restore) and the return are missing from this chunk; only the
// initial load and final store of one d-register (4 x int16) are visible.
67 function x264_quant_2x2_dc_neon, export=1
68 vld1.64 {d0}, [r0,:64] // d0 = 4 DC coefficients
78 vst1.64 {d3}, [r0,:64] // store quantized result back in place
82 // quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
// Quantize 16 DC coefficients with a single scalar mf/bias — note
// QUANT_TWO is passed q0 twice and d4,d5 twice, so the same duplicated
// bias/mf vectors cover all 16 lanes.
// NOTE(review): lines 85-88 (presumably abs into q8/q9 and vdup of
// r1/r2 into d4/d5 and q0) and the function end are missing from this
// chunk; visible code kept byte-identical.
83 function x264_quant_4x4_dc_neon, export=1
84 vld1.64 {d28-d31}, [r0,:128] // q14/q15 = dct (sign source for QUANT_TWO)
89 QUANT_TWO q0, q0, d4, d5, d4, d5
94 // quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
// Per-coefficient quantization of one 4x4 block: full 16-entry mf and
// bias tables, one QUANT_TWO pass.
// NOTE(review): lines 97-98 (presumably vabs of q14/q15 into q8/q9) and
// the function end are missing from this chunk.
95 function x264_quant_4x4_neon, export=1
96 vld1.64 {d28-d31}, [r0,:128] // q14/q15 = dct
99 vld1.64 {d0-d3}, [r2,:128] // q0/q1 = bias[0..15]
100 vld1.64 {d4-d7}, [r1,:128] // q2/q3 = mf[0..15]
101 QUANT_TWO q0, q1, d4, d5, d6, d7
106 // quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
// Per-coefficient quantization of an 8x8 block, 16 coefficients per
// QUANT_TWO pass; mf/bias pointers (r1/r2) post-increment through the
// 64-entry tables. The second visible pass uses load_mf=yes so QUANT_TWO
// fetches its own mf block from [r1].
// NOTE(review): several lines are missing from this chunk (abs steps,
// loop control for the remaining passes, and the function end) —
// visible code kept byte-identical.
107 function x264_quant_8x8_neon, export=1
108 vld1.64 {d28-d31}, [r0,:128] // first 16 coefficients
111 vld1.64 {d0-d3}, [r2,:128]! // bias block, advance pointer
112 vld1.64 {d4-d7}, [r1,:128]! // mf block, advance pointer
113 QUANT_TWO q0, q1, d4, d5, d6, d7
115 vld1.64 {d28-d31}, [r0,:128] // next 16 coefficients
118 vld1.64 {d2-d5}, [r2,:128]! // next bias block
119 QUANT_TWO q1, q2, d4, d5, d6, d7, yes // mf loaded inside the macro
// DEQUANT_START: common entry for the dequant functions. Splits i_qp
// (arriving via r2/r3) into i_qbits = i_qp/6 (r3) and i_mf = i_qp%6 (r2),
// points r1 at dequant_mf[i_mf] (or, in the dc case, loads its first
// entry), then sets flags from i_qbits - \offset so the caller can branch
// to the left-shift or right-shift path.
// NOTE(review): the multiply feeding the /6 approximation (qp * 0x2b
// before the lsr #8) and the .ifc \dc / .else / .endm structure around
// lines 132-136 are missing from this chunk — confirm against full file.
126 .macro DEQUANT_START mf_size offset dc=no
129 lsr r3, r3, #8 // i_qbits = i_qp / 6
130 add ip, r3, r3, lsl #1 // ip = i_qbits * 3
131 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6  (r2 -= i_qbits * 6)
133 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
135 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
137 subs r3, r3, #\offset // 6 for 8x8
140 // dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
// DEQUANT: emits x264_dequant_4x4_neon / x264_dequant_8x8_neon. After
// DEQUANT_START, two paths: a left-shift loop when i_qbits >= 0 and a
// right-shift loop (widening vmlal accumulate, then vshl by a negative
// per-lane count in q15 == arithmetic right shift with rounding bias).
// NOTE(review): this chunk is heavily gutted — loop setup, the multiply/
// narrow work between the loads and stores in the lshift path, what sits
// between the duplicate d16-d19 loads in the rshift path, the q15/bias
// setup, loop counters, .endm, and the function end are all missing.
// Visible code kept byte-identical; comments hedged accordingly.
141 .macro DEQUANT size bits
142 function x264_dequant_\size\()_neon, export=1
143 DEQUANT_START \bits+2, \bits
147 blt dequant_\size\()_rshift // i_qbits < 0: rounded right shift path
150 dequant_\size\()_lshift_loop:
154 vld1.32 {d16-d17}, [r1,:128]! // dequant_mf rows (int32)
155 vld1.32 {d18-d19}, [r1,:128]!
157 vld1.32 {d20-d21}, [r1,:128]!
159 vld1.32 {d22-d23}, [r1,:128]!
161 vld1.16 {d0-d3}, [r0,:128] // dct coefficients
167 vst1.16 {d0-d3}, [r0,:128]! // store dequantized, advance dct ptr
169 bgt dequant_\size\()_lshift_loop
173 dequant_\size\()_rshift:
181 dequant_\size\()_rshift_loop:
185 vld1.32 {d16-d17}, [r1,:128]!
187 vld1.32 {d18-d19}, [r1,:128]!
189 vld1.32 {d16-d17}, [r1,:128]! // reload — intervening use missing from view
191 vld1.32 {d18-d19}, [r1,:128]!
193 vld1.16 {d0-d3}, [r0,:128]
198 vmlal.s16 q10, d0, d4 // acc = bias + coef * mf (widening to s32)
199 vmlal.s16 q11, d1, d5
200 vmlal.s16 q12, d2, d6
201 vmlal.s16 q13, d3, d7
202 vshl.s32 q10, q10, q15 // q15 presumably holds -i_qbits: right shift
203 vshl.s32 q11, q11, q15
204 vshl.s32 q12, q12, q15
205 vshl.s32 q13, q13, q15
211 vst1.16 {d0-d3}, [r0,:128]!
213 bgt dequant_\size\()_rshift_loop
222 // dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
// Dequantize 16 DC coefficients with one scalar mf (DEQUANT_START dc=yes
// leaves dequant_mf[i_mf][0][0] in r1); same lshift/rshift split as the
// DEQUANT macro. In the rshift path d4 is the duplicated scalar mf, used
// for all four vmlal lanes.
// NOTE(review): missing from this chunk: lshift-path arithmetic (226-233),
// the vdup of r1 into d4 and the bias/q15 shift setup in the rshift path,
// and both returns — visible code kept byte-identical.
223 function x264_dequant_4x4_dc_neon, export=1
224 DEQUANT_START 6, 6, yes
225 blt dequant_4x4_dc_rshift // i_qbits < 0: rounded right shift path
229 vld1.16 {d0-d3}, [r0,:128] // dct coefficients
234 vst1.16 {d0-d3}, [r0,:128]
237 dequant_4x4_dc_rshift:
247 vld1.16 {d0-d3}, [r0,:128]
251 vmlal.s16 q10, d0, d4 // acc = bias + coef * mf (d4 = scalar mf dup)
252 vmlal.s16 q11, d1, d4
253 vmlal.s16 q12, d2, d4
254 vmlal.s16 q13, d3, d4
255 vshl.s32 q10, q10, q15 // q15 presumably -i_qbits: right shift
256 vshl.s32 q11, q11, q15
257 vshl.s32 q12, q12, q15
258 vshl.s32 q13, q13, q15
264 vst1.16 {d0-d3}, [r0,:128]
269 // int coeff_last( int16_t *l )
// Scalar ARM implementation for the 4-coefficient case: returns the
// index of the last nonzero coefficient.
// NOTE(review): the entire body (lines 271-279) is missing from this
// chunk — only the entry point is visible.
270 function x264_coeff_last4_arm, export=1
// COEFF_LAST_1x: emits coeff_last15 / coeff_last16 — index of the last
// nonzero int16 in a block of \size coefficients.
// NOTE(review): heavily gutted in this chunk. The two loads below are
// presumably the arms of a missing .if \size==15 / .else (unaligned vs
// 128-bit-aligned source); the nonzero-test/narrow/clz work between load
// and the final index math, plus the return, are not visible. Code kept
// byte-identical, including the unusual `sublts` mnemonic on line 300 —
// verify that line against the full file.
280 .macro COEFF_LAST_1x size
281 function x264_coeff_last\size\()_neon, export=1
284 vld1.64 {d0-d3}, [r0] // presumably the size==15 (unaligned) arm
286 vld1.64 {d0-d3}, [r0,:128] // presumably the size==16 (aligned) arm
298 subs r1, ip, r1, lsr #2
299 addge r0, r1, #\size - 8
300 sublts r0, r3, r0, lsr #2
309 function x264_coeff_last64_neon, export=1
310 vld1.64 {d16-d19}, [r0,:128]!
313 vld1.64 {d20-d23}, [r0,:128]!
316 vld1.64 {d24-d27}, [r0,:128]!
319 vld1.64 {d28-d31}, [r0,:128]!
323 movrel r1, pmovmskb_byte
324 vld1.64 {d0-d1}, [r1,:128]
336 vpadd.u8 d0, d16, d17
337 vpadd.u8 d1, d18, d19
338 vpadd.u8 d2, d20, d21
339 vpadd.u8 d3, d22, d23