1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// QUANT_TWO: quantize two vectors of eight 16-bit lanes and store the result.
//   bias0, bias1 - complete vector operands (e.g. v0.8h) added to v18 and v19
//   mf0_1, mf2_3 - bare register names (e.g. v2); the .4h/.8h suffix is appended here
//   mask         - complete destination operand (e.g. v4.16b) that receives the
//                  bitwise OR of both result vectors
// Reads v16-v19, overwrites v16-v23 and the mask operand, stores 32 bytes at
// [x0] and advances x0 by 32.
// NOTE(review): v18/v19 are consumed but never initialised inside this macro;
// presumably the caller fills them with the absolute values of v16/v17 - confirm.
29 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
// Add the bias to both input vectors.
30 add v18.8h, v18.8h, \bias0
31 add v19.8h, v19.8h, \bias1
// Widening unsigned multiply by the mf registers: 16 bit x 16 bit -> 32 bit.
32 umull v20.4s, v18.4h, \mf0_1\().4h
33 umull2 v21.4s, v18.8h, \mf0_1\().8h
34 umull v22.4s, v19.4h, \mf2_3\().4h
35 umull2 v23.4s, v19.8h, \mf2_3\().8h
// Arithmetic shift by 15 turns each lane of v16/v17 into 0 or -1 (sign mask).
36 sshr v16.8h, v16.8h, #15
37 sshr v17.8h, v17.8h, #15
// Keep only the upper 16 bits of every 32-bit product.
38 shrn v18.4h, v20.4s, #16
39 shrn2 v18.8h, v21.4s, #16
40 shrn v19.4h, v22.4s, #16
41 shrn2 v19.8h, v23.4s, #16
// (x ^ m) - m negates the lanes whose sign mask m is -1 and leaves the rest as is.
42 eor v18.16b, v18.16b, v16.16b
43 eor v19.16b, v19.16b, v17.16b
44 sub v18.8h, v18.8h, v16.8h
45 sub v19.8h, v19.8h, v17.8h
// OR of both results: the mask operand is non-zero iff any result lane is non-zero.
46 orr \mask, v18.16b, v19.16b
// Store both result vectors and post-increment the pointer by 32 bytes.
47 st1 {v18.8h,v19.8h}, [x0], #32
58 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Four-lane variant of the bias / multiply / restore-sign sequence.
// NOTE(review): the instructions that fill v0-v3 are not among the lines
// visible here. Judging by the arithmetic below, v3 is the magnitude input,
// v2 the bias, v1 the multiplier and v0 the signed source - confirm.
59 function x264_quant_2x2_dc_neon, export=1
// v3 += bias
64 add v3.4h, v3.4h, v2.4h
// Widening unsigned multiply: 16 bit x 16 bit -> 32 bit.
65 umull v3.4s, v3.4h, v1.4h
// Per-lane sign mask of v0: 0 or -1.
66 sshr v0.4h, v0.4h, #15
// Keep the upper 16 bits of each product.
67 shrn v3.4h, v3.4s, #16
// (x ^ m) - m negates the lanes whose sign mask m is -1.
68 eor v3.8b, v3.8b, v0.8b
69 sub v3.4h, v3.4h, v0.4h
74 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Loads all 16 coefficients and runs a single QUANT_TWO pass in which one bias
// register (v0) and one multiplier register (v2) are shared by both halves.
// NOTE(review): the setup of v0, v2, v18 and v19 is not among the lines
// visible here - confirm against the full function.
75 function x264_quant_4x4_dc_neon, export=1
// v16/v17 = 16 coefficients from [x0]; the load does not advance x0.
76 ld1 {v16.8h,v17.8h}, [x0]
// The mask output is written to v0, which is also the bias input.
81 QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
86 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Per the prototype above and AAPCS64 argument order: x0 = dct, x1 = mf, x2 = bias.
// NOTE(review): the setup of v18/v19 consumed by QUANT_TWO is not among the
// lines visible here - confirm against the full function.
87 function x264_quant_4x4_neon, export=1
// v16/v17 = 16 coefficients; none of these loads advances its pointer.
88 ld1 {v16.8h,v17.8h}, [x0]
// v0/v1 = 16 bias values.
91 ld1 {v0.8h,v1.8h}, [x2]
// v2/v3 = 16 multipliers.
92 ld1 {v2.8h,v3.8h}, [x1]
// The mask output is written to v0, which is also the first bias input.
93 QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b
98 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Runs QUANT_TWO four times. The bias (v0/v1) and multiplier (v2/v3) vectors
// are loaded once and reused; each pass leaves its mask in its own register
// (v4, v5, v6, v7). The store inside QUANT_TWO advances x0 by 32 bytes, so
// every reload of v16/v17 below reads the next group of 16 coefficients.
// NOTE(review): the setup of v18/v19 before each pass and the code that
// combines v4-v7 are not among the lines visible here - confirm.
99 function x264_quant_4x4x4_neon, export=1
100 ld1 {v16.8h,v17.8h}, [x0]
// v0/v1 = bias, v2/v3 = mf (x1 = mf, x2 = bias per the prototype above).
103 ld1 {v0.8h,v1.8h}, [x2]
104 ld1 {v2.8h,v3.8h}, [x1]
105 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
106 ld1 {v16.8h,v17.8h}, [x0]
109 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
110 ld1 {v16.8h,v17.8h}, [x0]
113 QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
114 ld1 {v16.8h,v17.8h}, [x0]
117 QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
141 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// Processes the block 16 coefficients at a time. Unlike the 4x4 variants, the
// mf (x1) and bias (x2) pointers are post-incremented by 32 bytes per step, so
// each step uses fresh table entries. x0 is advanced by the store in QUANT_TWO.
// NOTE(review): only two 16-coefficient steps are visible here; the remaining
// steps are presumably produced by lines not shown (e.g. a repeat directive),
// as is the setup of v18/v19 - confirm against the full function.
142 function x264_quant_8x8_neon, export=1
143 ld1 {v16.8h,v17.8h}, [x0]
146 ld1 {v0.8h,v1.8h}, [x2], #32
147 ld1 {v2.8h,v3.8h}, [x1], #32
// First step: mask goes to v4.
148 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
150 ld1 {v16.8h,v17.8h}, [x0]
153 ld1 {v0.8h,v1.8h}, [x2], #32
154 ld1 {v2.8h,v3.8h}, [x1], #32
// Later step: mask goes to v5 and is folded into the running mask in v4.
155 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
156 orr v4.16b, v4.16b, v5.16b
// DEQUANT_START: split i_qp (w2) into i_qbits and i_mf, address the dequant
// table through x1, and set the flags from i_qbits - offset.
//   mf_size - shift applied to i_mf to turn it into a byte offset
//   offset  - value subtracted from i_qbits by the final subs
//   dc      - defaults to no; not referenced by any line visible here
// Overwrites w2, w3, w5 and x1, and leaves the condition flags set by the subs.
// NOTE(review): the instructions that compute w3 ahead of the lsr are not among
// the lines visible here, and both the add and the ldr form of the table access
// appear below without the conditional directives that presumably select one
// of them based on dc - confirm against the full macro.
162 .macro DEQUANT_START mf_size offset dc=no
165 lsr w3, w3, #8 // i_qbits = i_qp / 6
// w5 = 3 * i_qbits; subtracting it shifted left by one removes 6 * i_qbits.
166 add w5, w3, w3, lsl #1
167 sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
// Scale i_mf into a byte offset.
168 lsl w2, w2, #\mf_size
170 add x1, x1, w2, sxtw // dequant_mf[i_mf]
172 ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
174 subs w3, w3, #\offset // 6 for 8x8
177 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT: emits x264_dequant_<size>_neon.
//   size - suffix used in the function name and in the local labels
//   bits - passed to DEQUANT_START as the offset, and as bits+2 for mf_size
// Two paths are visible: a left-shift path that multiplies in 16-bit lanes and
// then shifts by v31, and a right-shift path that accumulates in 32-bit lanes
// before shifting by v31.
// NOTE(review): several instructions are not among the lines visible here,
// including whatever fills v2/v3 and v31, the counter updates that feed the
// b.gt branches, the initial accumulator values used by the smlal group, and
// the narrowing of v16-v19 back into v0/v1 before the second store - confirm.
178 .macro DEQUANT size bits
179 function x264_dequant_\size\()_neon, export=1
180 DEQUANT_START \bits+2, \bits
// Signed less-than on the flags, presumably still those of the subs that ends
// DEQUANT_START: selects the right-shift path.
184 b.lt dequant_\size\()_rshift
187 dequant_\size\()_lshift_loop:
// Four vectors of 32-bit table entries; x1 advances by 16 bytes per load.
191 ld1 {v16.4s}, [x1], #16
192 ld1 {v17.4s}, [x1], #16
194 ld1 {v18.4s}, [x1], #16
196 ld1 {v19.4s}, [x1], #16
// v0/v1 = 16 coefficients; this load does not advance x0.
198 ld1 {v0.8h,v1.8h}, [x0]
// 16-bit multiply by v2/v3, then per-lane shift by the counts in v31.
200 mul v0.8h, v0.8h, v2.8h
201 mul v1.8h, v1.8h, v3.8h
202 sshl v0.8h, v0.8h, v31.8h
203 sshl v1.8h, v1.8h, v31.8h
// Write the 16 results back and advance x0 by 32 bytes.
204 st1 {v0.8h,v1.8h}, [x0], #32
206 b.gt dequant_\size\()_lshift_loop
210 dequant_\size\()_rshift:
218 dequant_\size\()_rshift_loop:
// Same table and coefficient loads as in the left-shift loop.
221 ld1 {v16.4s}, [x1], #16
222 ld1 {v17.4s}, [x1], #16
224 ld1 {v18.4s}, [x1], #16
227 ld1 {v19.4s}, [x1], #16
230 ld1 {v0.8h,v1.8h}, [x0]
// Widening signed multiply-accumulate into the 32-bit lanes of v16-v19.
235 smlal v16.4s, v0.4h, v2.4h
236 smlal2 v17.4s, v0.8h, v2.8h
237 smlal v18.4s, v1.4h, v3.4h
238 smlal2 v19.4s, v1.8h, v3.8h
// SSHL shifts right when the per-lane count in v31 is negative.
239 sshl v16.4s, v16.4s, v31.4s
240 sshl v17.4s, v17.4s, v31.4s
241 sshl v18.4s, v18.4s, v31.4s
242 sshl v19.4s, v19.4s, v31.4s
// Write v0/v1 back and advance x0 by 32 bytes.
248 st1 {v0.8h,v1.8h}, [x0], #32
250 b.gt dequant_\size\()_rshift_loop
259 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DC variant: every coefficient is scaled by the same factor (v2 on the
// left-shift path, v4 on the right-shift path). Both paths read and write the
// 16 coefficients in place; x0 is not advanced.
// NOTE(review): the instructions that fill v2, v3, v4 and the initial values of
// v16-v19, the narrowing back into v0/v1, and the return of the first path are
// not among the lines visible here - confirm against the full function.
260 function x264_dequant_4x4_dc_neon, export=1
// mf_size = 6, offset = 6, dc = yes.
261 DEQUANT_START 6, 6, yes
// Signed less-than on the flags, presumably still those of the subs that ends
// DEQUANT_START: selects the right-shift path.
262 b.lt dequant_4x4_dc_rshift
266 ld1 {v0.8h,v1.8h}, [x0]
// Both halves are multiplied by the same vector v2.
268 mul v0.8h, v0.8h, v2.8h
269 mul v1.8h, v1.8h, v2.8h
270 st1 {v0.8h,v1.8h}, [x0]
273 dequant_4x4_dc_rshift:
283 ld1 {v0.8h,v1.8h}, [x0]
// Widening signed multiply-accumulate by v4 into the 32-bit lanes of v16-v19.
287 smlal v16.4s, v0.4h, v4.4h
288 smlal2 v17.4s, v0.8h, v4.8h
289 smlal v18.4s, v1.4h, v4.4h
290 smlal2 v19.4s, v1.8h, v4.8h
// SSHL shifts right when the per-lane count in v3 is negative.
291 sshl v16.4s, v16.4s, v3.4s
292 sshl v17.4s, v17.4s, v3.4s
293 sshl v18.4s, v18.4s, v3.4s
294 sshl v19.4s, v19.4s, v3.4s
300 st1 {v0.8h,v1.8h}, [x0]
// decimate_score_1x: emits x264_decimate_score<size>_neon.
//   size - suffix used in the function name
// The visible part loads 16 coefficients, takes the address of
// x264_decimate_table4, and builds two per-byte masks that are each packed to
// 4 bits per byte.
// NOTE(review): the instructions that narrow the loaded 16-bit lanes to bytes
// and that fill v2/v3 before the compares are not among the lines visible
// here, nor is the code that consumes the packed masks and x5 - confirm.
304 .macro decimate_score_1x size
305 function x264_decimate_score\size\()_neon, export=1
306 ld1 {v0.8h,v1.8h}, [x0]
307 movrel x5, X(x264_decimate_table4)
// v1 = 0xff in every byte where v0 is zero.
312 cmeq v1.16b, v0.16b, #0
// v2 = 0xff in every byte where v2 is greater than v3 (unsigned compare).
313 cmhi v2.16b, v2.16b, v3.16b
// Narrowing shift by 4 keeps 4 bits of each byte mask: 16 bytes -> 64 bits.
314 shrn v1.8b, v1.8h, #4
315 shrn v2.8b, v2.8h, #4
// mask64: 16 constant bytes, two identical runs of single-bit values from 0x80
// down to 0x01, so each byte position within an 8-byte half owns one bit.
// NOTE(review): the const macro is defined outside the lines visible here;
// align=6 presumably requests 2^6 = 64-byte alignment - confirm.
349 const mask64, align=6
350 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
351 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
// x264_decimate_score64_neon: the visible part loads 64 coefficients into
// v0-v7, narrows them to bytes in v16-v19, builds a zero mask per byte, reduces
// that mask with v30 and pairwise adds, and separately checks the maximum of
// v4-v7 against v31.
// NOTE(review): the instructions that fill the lower halves of v16-v19, that
// recompute v4-v7 before the umax group, and that load v30 and v31 are not
// among the lines visible here; v30 is presumably the mask64 constant and the
// cmhi presumably detects coefficients with magnitude above 1 - confirm.
354 function x264_decimate_score64_neon, export=1
// 4 x 32 bytes = 64 coefficients; x0 advances on the first three loads only.
355 ld1 {v0.8h,v1.8h}, [x0], #32
356 ld1 {v2.8h,v3.8h}, [x0], #32
357 ld1 {v4.8h,v5.8h}, [x0], #32
358 ld1 {v6.8h,v7.8h}, [x0]
// Signed saturating narrow to bytes, written to the upper half of each register.
362 sqxtn2 v16.16b, v0.8h
364 sqxtn2 v17.16b, v2.8h
366 sqxtn2 v18.16b, v4.8h
368 sqxtn2 v19.16b, v6.8h
// 0xff in every byte position that holds a zero coefficient.
374 cmeq v0.16b, v16.16b, #0
375 cmeq v1.16b, v17.16b, #0
376 cmeq v2.16b, v18.16b, #0
377 cmeq v3.16b, v19.16b, #0
// Unsigned per-byte maximum across v4-v7 (finished by the umax further down).
378 umax v4.16b, v4.16b, v5.16b
379 umax v6.16b, v6.16b, v7.16b
// Keep only the bits selected by v30 in each zero mask.
380 and v0.16b, v0.16b, v30.16b
381 and v1.16b, v1.16b, v30.16b
382 and v2.16b, v2.16b, v30.16b
383 and v3.16b, v3.16b, v30.16b
384 umax v4.16b, v4.16b, v6.16b
// Pairwise byte additions fold the four masked vectors into v0.
385 addp v0.16b, v1.16b, v0.16b
386 addp v2.16b, v3.16b, v2.16b
// 0xff in every byte where the maximum is greater than v31 (unsigned compare).
387 cmhi v4.16b, v4.16b, v31.16b
388 addp v0.16b, v2.16b, v0.16b
// Narrowing shift by 4 keeps 4 bits of each byte of the compare result.
389 shrn v4.8b, v4.8h, #4
390 addp v0.16b, v0.16b, v0.16b
397 movrel x5, X(x264_decimate_table8)
416 // int coeff_last( int16_t *l )
// NOTE(review): the instructions that set w4 and w0 before the final sub are
// not among the lines visible here; w0 is presumably a leading-zero bit count
// of the four packed coefficients and w4 the highest index - confirm.
417 function x264_coeff_last4_aarch64, export=1
// Result = w4 - (w0 / 16); the shift by 4 converts bits to 16-bit elements.
421 sub w0, w4, w0, lsr #4
// x264_coeff_last8_aarch64: only the final index computation is visible here.
// NOTE(review): the instructions that set w4 and w2 are not among the lines
// visible here; w2 is presumably a leading-zero bit count and w4 the highest
// index of the half being examined - confirm against the full function.
425 function x264_coeff_last8_aarch64, export=1
// Result = w4 - (w2 / 16); the shift by 4 converts bits to 16-bit elements.
435 sub w0, w4, w2, lsr #4
// COEFF_LAST_1x: emits x264_coeff_last<size>_neon.
//   size - suffix used in the function name
// The visible part loads 16 coefficients, builds a non-zero mask per byte,
// packs it to 4 bits per byte, and derives the result from w3 and w2.
// NOTE(review): the instructions that narrow the loaded 16-bit lanes to bytes,
// move the packed mask to a general register, and set w3 and w2 are not among
// the lines visible here; w2 is presumably a leading-zero bit count - confirm.
439 .macro COEFF_LAST_1x size
440 function x264_coeff_last\size\()_neon, export=1
444 ld1 {v0.8h,v1.8h}, [x0]
// Testing a byte against itself gives 0xff for every non-zero byte.
447 cmtst v0.16b, v0.16b, v0.16b
// Narrowing shift by 4 keeps 4 bits of each byte mask: 16 bytes -> 64 bits.
448 shrn v0.8b, v0.8h, #4
// Result = w3 - (w2 / 4); the shift by 2 matches the 4 bits kept per element.
452 sub w0, w3, w2, lsr #2
// x264_coeff_last64_neon: the visible part loads 64 coefficients into v0-v7,
// builds non-zero masks per byte in v0-v3, packs them to 4 bits per byte, and
// then reduces the packed masks further with narrowing shifts and a per-lane
// variable shift.
// NOTE(review): the instructions that narrow v0-v7 to bytes ahead of the cmtst
// group, that fill v30 and v31, and that run between the two shrn groups are
// not among the lines visible here, and the function may continue past the
// last visible line - confirm against the full function.
460 function x264_coeff_last64_neon, export=1
// Two loads of 64 bytes each; x0 advances by 64 after each one.
461 ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
466 ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
// Testing a byte against itself gives 0xff for every non-zero byte.
474 cmtst v0.16b, v0.16b, v0.16b
475 cmtst v1.16b, v1.16b, v1.16b
476 cmtst v2.16b, v2.16b, v2.16b
477 cmtst v3.16b, v3.16b, v3.16b
// Narrowing shift by 4 keeps 4 bits of each byte mask; v0-v3 pack into v0/v1.
479 shrn v0.8b, v0.8h, #4
480 shrn2 v0.16b, v1.8h, #4
481 shrn v1.8b, v2.8h, #4
482 shrn2 v1.16b, v3.8h, #4
// Narrow the 32-bit lanes of v0/v1 to 16 bits after a right shift by 2.
487 shrn v0.4h, v0.4s, #2
488 shrn2 v0.8h, v1.4s, #2
// v0 = v31 - v0, then v30 is shifted per lane by that amount.
490 sub v0.8h, v31.8h, v0.8h
491 sshl v0.8h, v30.8h, v0.8h
// Narrow the 16-bit lanes to bytes after a right shift by 1.
492 shrn v0.8b, v0.8h, #1