1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Constant byte tables used by the coeff_last / decimate kernels below.
// NOTE(review): the labels naming these tables fall in lines missing from
// this view; based on the movrel at coeff_last64, the first table is
// presumably pmovmskb_byte — confirm against the full file.
// Per-byte bit weights 1<<0 .. 1<<7 (repeated for both D halves): AND-ing
// a per-byte 0/0xff compare result with this and horizontally adding
// yields an SSE pmovmskb-style bitmask.
32 .byte 1,2,4,8,16,32,64,128
33 .byte 1,2,4,8,16,32,64,128
// 2-bit group masks (0b11 shifted by 0,2,4,6), repeated — selects pairs
// of flag bits per byte.
36 .byte 3,12,48,192,3,12,48,192
37 .byte 3,12,48,192,3,12,48,192
// Bit-reversed weights 1<<7 .. 1<<0, repeated — same trick with the bit
// order mirrored.
40 .byte 128,64,32,16,8,4,2,1
41 .byte 128,64,32,16,8,4,2,1
// QUANT_TWO: quantize 16 coefficients (two q-registers' worth):
//   out = sign(coeff) * ((|coeff| + bias) * mf >> 16)
// Expects |coeff| already in q8/q9 and the raw (signed) coeffs in q14/q15
// for sign extraction; \mask accumulates a nonzero flag for the caller.
// NOTE(review): lines 48-50 (.ifc \load_mf guard around the vld1) and
// 61-65 (sign restore via veor/vsub and the \mask vorr) are outside this
// view — the comments on register roles below should be confirmed there.
45 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
// |coeff| + bias (unsigned 16-bit)
46 vadd.u16 q8, q8, \bias0
47 vadd.u16 q9, q9, \bias1
// load multiply factors from *r1, post-increment (only when load_mf=yes —
// the guarding directive is in the missing lines)
49 vld1.64 {\mf0-\mf3}, [r1,:128]!
// widening 16x16 -> 32-bit multiply by mf
51 vmull.u16 q10, d16, \mf0
52 vmull.u16 q11, d17, \mf1
53 vmull.u16 q12, d18, \mf2
54 vmull.u16 q13, d19, \mf3
// arithmetic >>15 smears the sign bit: each lane becomes 0 or -1,
// giving the masks used for the two's-complement sign restore
55 vshr.s16 q14, q14, #15
56 vshr.s16 q15, q15, #15
// narrow 32 -> 16 bits while performing the >>16
57 vshrn.u32 d16, q10, #16
58 vshrn.u32 d17, q11, #16
59 vshrn.u32 d18, q12, #16
60 vshrn.u32 d19, q13, #16
// store 16 quantized coeffs back to *r0, post-increment
66 vst1.64 {d16-d19}, [r0,:128]!
// Quantize the four chroma DC coefficients of a 2x2 block in place.
// r0 = dct, r1 = mf (scalar), r2 = bias (scalar).
76 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
77 function x264_quant_2x2_dc_neon
// load all 4 coeffs (64 bits) from dct
78 vld1.64 {d0}, [r0,:64]
// NOTE(review): lines 79-87 (the abs/bias/multiply/sign-restore work that
// produces d3, and the return-value setup) are missing from this view.
// store the quantized result back over dct
88 vst1.64 {d3}, [r0,:64]
// Quantize a 4x4 block of DC coefficients in place, with a single scalar
// mf/bias pair broadcast across all 16 coeffs (note QUANT_TWO is passed
// the same q0 bias and d4/d5 mf registers twice).
// r0 = dct, r1 = mf (scalar), r2 = bias (scalar).
92 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
93 function x264_quant_4x4_dc_neon
// load all 16 coeffs into q14/q15
94 vld1.64 {d28-d31}, [r0,:128]
// NOTE(review): lines 95-98 (abs into q8/q9 and the vdup of mf/bias into
// q0/d4-d5) are missing from this view — confirm register setup there.
99 QUANT_TWO q0, q0, d4, d5, d4, d5, q0
// Quantize a 4x4 block in place with per-coefficient mf/bias tables.
// r0 = dct, r1 = mf[16], r2 = bias[16].
104 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
105 function x264_quant_4x4_neon
// load the 16 coeffs into q14/q15
106 vld1.64 {d28-d31}, [r0,:128]
// NOTE(review): lines 107-108 (abs of the coeffs into q8/q9) are missing
// from this view.
// bias table -> q0/q1, mf table -> q2/q3 (d4-d7)
109 vld1.64 {d0-d3}, [r2,:128]
110 vld1.64 {d4-d7}, [r1,:128]
111 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
// Quantize four consecutive 4x4 blocks in place, sharing one mf/bias
// table (loaded once into q0-q3).  Each block's nonzero mask is kept in a
// distinct register (q4..q7), presumably combined into the return value
// in the missing tail — confirm against the full file.
// r0 = dct[4][16], r1 = mf[16], r2 = bias[16].
116 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
117 function x264_quant_4x4x4_neon
// block 0: load 16 coeffs
119 vld1.64 {d28-d31}, [r0,:128]
// bias -> q0/q1, mf -> q2/q3 (shared by all four blocks)
122 vld1.64 {d0-d3}, [r2,:128]
123 vld1.64 {d4-d7}, [r1,:128]
124 QUANT_TWO q0, q1, d4, d5, d6, d7, q4
// block 1 (the abs/setup lines between each load and QUANT_TWO are
// missing from this view)
125 vld1.64 {d28-d31}, [r0,:128]
128 QUANT_TWO q0, q1, d4, d5, d6, d7, q5
// block 2
129 vld1.64 {d28-d31}, [r0,:128]
132 QUANT_TWO q0, q1, d4, d5, d6, d7, q6
// block 3
133 vld1.64 {d28-d31}, [r0,:128]
136 QUANT_TWO q0, q1, d4, d5, d6, d7, q7
// Quantize an 8x8 block (64 coeffs) in place.  The first 16 coeffs are
// handled straight-line; the remaining 48 use QUANT_TWO with load_mf=yes
// so the mf table is streamed from r1 inside the macro (note the
// post-increment `!` on the r1/r2 loads here, unlike quant_4x4).
// r0 = dct, r1 = mf[64], r2 = bias[64].
157 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
158 function x264_quant_8x8_neon
// first 16 coeffs
159 vld1.64 {d28-d31}, [r0,:128]
// bias -> q0/q1, mf -> q2/q3, both advancing their pointers
162 vld1.64 {d0-d3}, [r2,:128]
163 vld1.64 {d4-d7}, [r1,:128]
164 QUANT_TWO q0, q1, d4, d5, d6, d7, q0
// next 16 coeffs; NOTE(review): the loop counter setup and the
// branch back for the remaining iterations (lines 165, 167-168, 171+)
// are missing from this view.
166 vld1.64 {d28-d31}, [r0,:128]
// next bias pair -> q1/q2; mf is loaded inside QUANT_TWO (load_mf=yes)
169 vld1.64 {d2-d5}, [r2,:128]
170 QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
// DEQUANT_START: shared prologue for the dequant functions.
// In:  r1 = dequant_mf, r2 = i_qp.
// Out: r3 = i_qbits - \offset (flags set for the caller's blt),
//      r2 = i_mf = i_qp % 6, r1 advanced/loaded per the dc flag.
// NOTE(review): the lines computing r3 before the lsr (presumably
// r3 = i_qp * 43, so that >>8 gives /6) and the .ifc \dc guards that
// select between lines 184 and 186 are missing from this view — confirm.
177 .macro DEQUANT_START mf_size offset dc=no
180 lsr r3, r3, #8 // i_qbits = i_qp / 6
// ip = 3*i_qbits; r2 -= 6*i_qbits  =>  r2 = i_qp % 6
181 add ip, r3, r3, lsl #1
182 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
// non-dc path: advance r1 to the i_mf-th table
184 add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
// dc path: load only the single shared scale factor
186 ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
// sets N flag for the caller's "blt ..._rshift" (negative => right shift)
188 subs r3, r3, #\offset // 6 for 8x8
// DEQUANT: generates dequant_4x4 / dequant_8x8.  Two paths chosen by the
// sign of i_qbits - \bits computed in DEQUANT_START:
//   lshift: dct[i] = (dct[i] * mf[i]) << i_qbits
//   rshift: dct[i] = (dct[i] * mf[i] + round) >> (-i_qbits), done via
//           widening multiply-accumulate and a negative vshl count.
// NOTE(review): many interior lines (loop counters, the vmovn narrowing
// between loads, rounding-constant setup in q10-q13, and the function
// epilogues) are missing from this view — per-line comments below cover
// only what is visible.
191 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
192 .macro DEQUANT size bits
193 function x264_dequant_\size\()_neon
194 DEQUANT_START \bits+2, \bits
// negative i_qbits => take the right-shift path
198 blt dequant_\size\()_rshift
201 dequant_\size\()_lshift_loop:
// stream 16 x 32-bit mf entries (q8-q11)
205 vld1.32 {d16-d17}, [r1,:128]!
206 vld1.32 {d18-d19}, [r1,:128]!
208 vld1.32 {d20-d21}, [r1,:128]!
210 vld1.32 {d22-d23}, [r1,:128]!
// 16 coeffs from dct
212 vld1.16 {d0-d3}, [r0,:128]
// store the scaled-and-shifted result (multiply/shift lines are in the
// gaps above)
218 vst1.16 {d0-d3}, [r0,:128]!
220 bgt dequant_\size\()_lshift_loop
224 dequant_\size\()_rshift:
232 dequant_\size\()_rshift_loop:
// first 8 mf entries -> q8/q9 (consumed in the missing lines 237/239),
// then reloaded with the next 8
236 vld1.32 {d16-d17}, [r1,:128]!
238 vld1.32 {d18-d19}, [r1,:128]!
240 vld1.32 {d16-d17}, [r1,:128]!
242 vld1.32 {d18-d19}, [r1,:128]!
// 16 coeffs from dct
244 vld1.16 {d0-d3}, [r0,:128]
// accumulate coeff*mf into the 32-bit rounding accumulators q10-q13
249 vmlal.s16 q10, d0, d4
250 vmlal.s16 q11, d1, d5
251 vmlal.s16 q12, d2, d6
252 vmlal.s16 q13, d3, d7
// q15 holds the (negative) shift count, so vshl performs the arithmetic
// right shift
253 vshl.s32 q10, q10, q15
254 vshl.s32 q11, q11, q15
255 vshl.s32 q12, q12, q15
256 vshl.s32 q13, q13, q15
// store narrowed result (narrowing happens in the missing lines 258-261)
262 vst1.16 {d0-d3}, [r0,:128]!
264 bgt dequant_\size\()_rshift_loop
// Dequantize a 4x4 DC block in place.  Same two-path structure as the
// DEQUANT macro, but DEQUANT_START is invoked with dc=yes so r1 holds the
// single scalar dequant_mf[i_mf][0][0], broadcast (in missing lines) into
// d4 and applied to all 16 coeffs — note every vmlal below reuses d4.
// NOTE(review): loop/setup/epilogue lines between the visible ones are
// missing from this view.
273 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
274 function x264_dequant_4x4_dc_neon
275 DEQUANT_START 6, 6, yes
// negative i_qbits => rounding right-shift path
276 blt dequant_4x4_dc_rshift
// lshift path: load, scale/shift (missing lines 281-284), store
280 vld1.16 {d0-d3}, [r0,:128]
285 vst1.16 {d0-d3}, [r0,:128]
288 dequant_4x4_dc_rshift:
// rshift path: accumulate coeff*mf into pre-rounded q10-q13, then
// arithmetic-shift by the negative count in q15
298 vld1.16 {d0-d3}, [r0,:128]
302 vmlal.s16 q10, d0, d4
303 vmlal.s16 q11, d1, d4
304 vmlal.s16 q12, d2, d4
305 vmlal.s16 q13, d3, d4
306 vshl.s32 q10, q10, q15
307 vshl.s32 q11, q11, q15
308 vshl.s32 q12, q12, q15
309 vshl.s32 q13, q13, q15
// narrowed result (narrowing in missing lines 310-313) written back
315 vst1.16 {d0-d3}, [r0,:128]
// decimate_score_1x: generates decimate_score15/16 for a single 4x4
// block; computes the run-length-weighted cost from x264_decimate_table4.
// NOTE(review): almost the entire macro body (lines 322-348) is missing
// from this view — only the initial loads and the table address setup
// are visible; do not infer the scoring loop from here.
319 .macro decimate_score_1x size
320 function x264_decimate_score\size\()_neon
// 16 coeffs -> q0/q1
321 vld1.16 {q0, q1}, [r0, :128]
// lookup/mask table from *r3
327 vld1.8 {q8}, [r3, :128]
// r3 = &x264_decimate_table4 (movrelx: PIC-safe address load, r2 scratch)
349 movrelx r3, X(x264_decimate_table4), r2
// Decimate score for an 8x8 block (64 coeffs), using
// x264_decimate_table8.  The visible portion loads all 64 coeffs,
// reduces per-lane flags with vmax, and folds them into compact bitmasks
// with vpadd/vshrn; the scoring loop itself is in lines missing from
// this view.
365 function x264_decimate_score64_neon
// 64 coeffs -> q8..q15, advancing r0
367 vld1.16 {q8, q9}, [r0, :128]!
368 vld1.16 {q10, q11}, [r0, :128]!
369 vld1.16 {q12, q13}, [r0, :128]!
370 vld1.16 {q14, q15}, [r0, :128]
// per-byte bit-weight table (see the .byte tables at the top of the file)
385 vld1.8 {q2}, [r3, :128]
// tree-reduce the per-coefficient flags: q12 = max over q12..q15
390 vmax.s8 q12, q12, q13
391 vmax.s8 q14, q14, q15
396 vmax.s8 q12, q12, q14
// horizontal pairwise adds collapse the weighted bytes toward a
// pmovmskb-style mask (note each pair writes low-half first)
397 vpadd.u8 d18, d18, d19
398 vpadd.u8 d19, d16, d17
400 vpadd.u8 d22, d22, d23
401 vpadd.u8 d23, d20, d21
// narrow the reduced max flags
402 vshrn.u16 d24, q12, #4
403 vpadd.u8 d16, d22, d23
404 vpadd.u8 d17, d18, d19
405 vpadd.u8 d24, d24, d24
406 vpadd.u8 d16, d16, d17
// r3 = &x264_decimate_table8 (PIC-safe, r2 scratch)
418 movrelx r3, X(x264_decimate_table8), r2
// coeff_last4: index of the last nonzero coefficient among 4.
// NOTE(review): the entire body (lines 452-460) is missing from this
// view — only the entry label is visible.
450 // int coeff_last( int16_t *l )
451 function x264_coeff_last4_arm
// coeff_last8: index of the last nonzero coefficient among 8.
// Loads the upper four coeffs (bytes 8..15) into r2/r3 first — scanning
// from the high end; the rest of the body is missing from this view.
461 function x264_coeff_last8_arm
462 ldrd r2, r3, [r0, #8]
// COEFF_LAST_1x: generates coeff_last15/16.  Loads 16 coeffs, builds a
// nonzero bitmask (in lines missing from this view), then converts the
// leading-zero counts into the last-nonzero index.
475 .macro COEFF_LAST_1x size
476 function x264_coeff_last\size\()_neon
// 16 coeffs -> q0/q1
480 vld1.64 {d0-d3}, [r0,:128]
// r1 presumably holds a clz-style count scaled by 4 (hence lsr #2) and
// ip a bias — the setup lines 481-490 are missing; confirm before
// relying on the exact arithmetic below.
491 subs r1, ip, r1, lsr #2
// high half nonzero: index = r1 + (\size - 8)
492 addge r0, r1, #\size - 8
// otherwise derive the index from the low-half count in r3/r0
493 subslt r0, r3, r0, lsr #2
// coeff_last64: index of the last nonzero coefficient among 64.
// Loads all 64 coeffs, then (partly in lines missing from this view)
// compresses per-coefficient nonzero flags into a bitmask using the
// pmovmskb_byte weight table and pairwise adds.
502 function x264_coeff_last64_neon
// 64 coeffs -> q8..q15
503 vld1.64 {d16-d19}, [r0,:128]!
506 vld1.64 {d20-d23}, [r0,:128]!
509 vld1.64 {d24-d27}, [r0,:128]!
512 vld1.64 {d28-d31}, [r0,:128]!
// q0 = {1,2,4,...,128} per-byte weights for the movemask trick
516 movrel r1, pmovmskb_byte
517 vld1.64 {d0-d1}, [r1,:128]
// pairwise-add the weighted flag bytes toward one mask byte per 8 coeffs
529 vpadd.u8 d0, d16, d17
530 vpadd.u8 d1, d18, d19
531 vpadd.u8 d2, d20, d21
532 vpadd.u8 d3, d22, d23
// denoise_dct: for each coefficient, accumulate |dct[i]| into sum[i]
// (widened to 32 bits) and subtract the offset (clamping handled in
// lines missing from this view), preserving the original sign.
// Registers: r0 = dct, r1 = sum, r2 = offset; loop control and the
// abs/sign/saturate steps between the visible lines are not shown here.
547 function x264_denoise_dct_neon
// 16 coeffs -> q0/q1
549 vld1.16 {q0, q1}, [r0]
// running 32-bit sums -> q12..q15 (r1 advanced only past the first pair;
// the second load leaves r1 pointing at q14's slot for the store below)
550 vld1.32 {q12, q13}, [r1]!
551 vld1.32 {q14, q15}, [r1]
// offsets -> q2/q3, advancing r2
555 vld1.16 {q2, q3}, [r2]!
// widen-accumulate |coeff| (q8/q9, computed in missing lines) into sums
558 vaddw.u16 q12, q12, d16
559 vaddw.u16 q13, q13, d17
562 vaddw.u16 q14, q14, d18
563 vaddw.u16 q15, q15, d19
// write back updated sums and the denoised coeffs
568 vst1.32 {q12, q13}, [r1]!
569 vst1.32 {q14, q15}, [r1]!
570 vst1.16 {q10, q11}, [r0]!