1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// QUANT_TWO: quantizes two vectors of eight 16-bit lanes and stores them.
//   bias0, bias1 : vector operands added to v18.8h and v19.8h
//   mf0_1, mf2_3 : register names (the arrangement suffix is appended here)
//                  holding the 16-bit multipliers
//   mask         : destination operand that receives the OR of both results
// Reads v16-v19 and x0; overwrites v16-v23 and the mask operand, and
// advances x0 by 32 bytes through the post-indexed store.
// NOTE(review): v18/v19 are not written before use inside this macro. The
// sign handling below only works if v16/v17 hold the signed inputs and
// v18/v19 their magnitudes -- confirm against the call sites.
29 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
30 add v18.8h, v18.8h, \bias0
31 add v19.8h, v19.8h, \bias1
// Widening unsigned multiply: 16-bit lanes times multiplier into 32-bit lanes.
32 umull v20.4s, v18.4h, \mf0_1\().4h
33 umull2 v21.4s, v18.8h, \mf0_1\().8h
34 umull v22.4s, v19.4h, \mf2_3\().4h
35 umull2 v23.4s, v19.8h, \mf2_3\().8h
// Arithmetic shift by 15 leaves 0 or -1 in each lane (a per-lane sign mask).
36 sshr v16.8h, v16.8h, #15
37 sshr v17.8h, v17.8h, #15
// Narrow back to 16 bits, keeping bits 31:16 of each 32-bit product.
38 shrn v18.4h, v20.4s, #16
39 shrn2 v18.8h, v21.4s, #16
40 shrn v19.4h, v22.4s, #16
41 shrn2 v19.8h, v23.4s, #16
// (x ^ s) - s negates exactly the lanes whose sign mask s is -1.
42 eor v18.16b, v18.16b, v16.16b
43 eor v19.16b, v19.16b, v17.16b
44 sub v18.8h, v18.8h, v16.8h
45 sub v19.8h, v19.8h, v17.8h
// The OR of both output vectors goes to the mask operand.
46 orr \mask, v18.16b, v19.16b
// Store both vectors and advance x0 by 32 bytes.
47 st1 {v18.8h,v19.8h}, [x0], #32
58 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Four-lane version of the quantization sequence: add bias, multiply,
// keep the high half of the product, then reapply the sign.
// NOTE(review): the instructions that fill v0-v3 are not visible in this
// view; presumably v0 = dct, v1 = mf, v2 = bias and v3 = |dct| -- confirm.
59 function x264_quant_2x2_dc_neon, export=1
64 add v3.4h, v3.4h, v2.4h
// Widening unsigned multiply into 32-bit lanes.
65 umull v3.4s, v3.4h, v1.4h
// Arithmetic shift by 15 leaves 0 or -1 in each lane of v0 (sign mask).
66 sshr v0.4h, v0.4h, #15
// Keep bits 31:16 of each product.
67 shrn v3.4h, v3.4s, #16
// (x ^ s) - s negates the lanes whose sign mask is -1.
68 eor v3.8b, v3.8b, v0.8b
69 sub v3.4h, v3.4h, v0.4h
74 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Quantizes 16 coefficients using the same multiplier and bias register
// for both halves.
75 function x264_quant_4x4_dc_neon, export=1
// Load all 16 coefficients (x0 is not advanced by this load).
76 ld1 {v16.8h,v17.8h}, [x0]
// v0 is passed both as bias and as the mask destination. This is safe
// because QUANT_TWO consumes the bias in its first two adds and writes the
// mask only in its final orr.
// NOTE(review): the setup of v0, v2, v18 and v19 is not visible in this view.
81 QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
86 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes 16 coefficients with per-coefficient multiplier and bias.
87 function x264_quant_4x4_neon, export=1
// Load all 16 coefficients (x0 is not advanced by this load).
88 ld1 {v16.8h,v17.8h}, [x0]
// Third argument (x2) supplies the 16 bias values, second argument (x1)
// the 16 multipliers.
91 ld1 {v0.8h,v1.8h}, [x2]
92 ld1 {v2.8h,v3.8h}, [x1]
// The mask lands in v0, which is free again once the bias add inside the
// macro has executed.
// NOTE(review): the setup of v18/v19 is not visible in this view.
93 QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b
98 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes four consecutive groups of 16 coefficients with one shared set
// of multipliers and biases. Each QUANT_TWO stores 32 bytes and advances x0
// by 32, so every following load reads the next group.
// The per-group masks are kept separately in v4, v5, v6 and v7.
// NOTE(review): the setup of v18/v19 ahead of each macro call and the final
// reduction of v4-v7 are not visible in this view.
99 function x264_quant_4x4x4_neon, export=1
100 ld1 {v16.8h,v17.8h}, [x0]
// bias from x2 and mf from x1 are loaded once and reused for all four groups.
103 ld1 {v0.8h,v1.8h}, [x2]
104 ld1 {v2.8h,v3.8h}, [x1]
105 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
106 ld1 {v16.8h,v17.8h}, [x0]
109 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
110 ld1 {v16.8h,v17.8h}, [x0]
113 QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
114 ld1 {v16.8h,v17.8h}, [x0]
117 QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
141 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// Quantizes coefficients in groups of 16. Unlike the 4x4 variants, the bias
// (x2) and mf (x1) pointers are post-incremented by 32 bytes per group, so
// each group uses its own table entries.
// NOTE(review): only two groups (32 coefficients) are visible in this view,
// and the setup of v18/v19 ahead of each macro call is not shown.
142 function x264_quant_8x8_neon, export=1
143 ld1 {v16.8h,v17.8h}, [x0]
146 ld1 {v0.8h,v1.8h}, [x2], #32
147 ld1 {v2.8h,v3.8h}, [x1], #32
148 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
// x0 was advanced by the store inside QUANT_TWO; this reads the next group.
150 ld1 {v16.8h,v17.8h}, [x0]
153 ld1 {v0.8h,v1.8h}, [x2], #32
154 ld1 {v2.8h,v3.8h}, [x1], #32
155 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
// Accumulate the mask of this group into v4.
156 orr v4.16b, v4.16b, v5.16b
// DEQUANT_START: shared prologue that splits i_qp into i_qp / 6 and
// i_qp % 6 and applies the result to the dequant table pointer in x1.
//   mf_size : left shift applied to i_mf to form the offset added to x1
//   offset  : value subtracted from i_qbits; the flags set by that subs are
//             left for the code that follows the macro invocation
//   dc      : defaults to "no"
// NOTE(review): the instruction that places the scaled i_qp in w3 ahead of
// the lsr is not visible in this view.
// NOTE(review): the add and the ldr on x1 appear back to back here;
// presumably they are alternatives chosen on the dc argument by conditional
// assembly that is not visible -- confirm.
162 .macro DEQUANT_START mf_size offset dc=no
165 lsr w3, w3, #8 // i_qbits = i_qp / 6
// w5 = 3 * i_qbits; subtracting it shifted left by one removes 6 * i_qbits.
166 add w5, w3, w3, lsl #1
167 sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
168 lsl w2, w2, #\mf_size
170 add x1, x1, w2, sxtw // dequant_mf[i_mf]
172 ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
174 subs w3, w3, #\offset // 6 for 8x8
177 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT: emits the function x264_dequant_<size>_neon.
//   size : suffix used in the function name and in the local labels
//   bits : forwarded to DEQUANT_START as mf_size = bits+2 and offset = bits
// Two code paths are emitted: a left-shift path working on 16-bit lanes and
// a right-shift path that widens to 32-bit lanes first.
178 .macro DEQUANT size bits
179 function x264_dequant_\size\()_neon, export=1
180 DEQUANT_START \bits+2, \bits
// Uses the flags from the subs that ends DEQUANT_START: a negative adjusted
// i_qbits selects the right-shift path.
184 b.lt dequant_\size\()_rshift
187 dequant_\size\()_lshift_loop:
// Fetch sixteen 32-bit dequant_mf entries; x1 advances 16 bytes per load.
191 ld1 {v16.4s}, [x1], #16
192 ld1 {v17.4s}, [x1], #16
194 ld1 {v18.4s}, [x1], #16
196 ld1 {v19.4s}, [x1], #16
198 ld1 {v0.8h,v1.8h}, [x0]
// NOTE(review): v2/v3 and v31 are not written in the visible lines;
// presumably v2/v3 are the entries above narrowed to 16 bits and v31 holds
// the shift count -- confirm.
200 mul v0.8h, v0.8h, v2.8h
201 mul v1.8h, v1.8h, v3.8h
202 sshl v0.8h, v0.8h, v31.8h
203 sshl v1.8h, v1.8h, v31.8h
// Write the 16 results back in place and step to the next 16 coefficients.
204 st1 {v0.8h,v1.8h}, [x0], #32
// NOTE(review): the instruction that sets the flags for this loop branch is
// not visible in this view.
206 b.gt dequant_\size\()_lshift_loop
210 dequant_\size\()_rshift:
218 dequant_\size\()_rshift_loop:
221 ld1 {v16.4s}, [x1], #16
222 ld1 {v17.4s}, [x1], #16
224 ld1 {v18.4s}, [x1], #16
227 ld1 {v19.4s}, [x1], #16
230 ld1 {v0.8h,v1.8h}, [x0]
// Widening multiply-accumulate of dct * mf into the 32-bit lanes of v16-v19.
// NOTE(review): as shown, the accumulators still hold the values loaded from
// x1; presumably they are re-initialised (for example with a rounding term)
// by lines not visible in this view -- confirm.
235 smlal v16.4s, v0.4h, v2.4h
236 smlal2 v17.4s, v0.8h, v2.8h
237 smlal v18.4s, v1.4h, v3.4h
238 smlal2 v19.4s, v1.8h, v3.8h
// SSHL shifts right when the per-lane count in v31 is negative.
239 sshl v16.4s, v16.4s, v31.4s
240 sshl v17.4s, v17.4s, v31.4s
241 sshl v18.4s, v18.4s, v31.4s
242 sshl v19.4s, v19.4s, v31.4s
// NOTE(review): the narrowing of v16-v19 back into v0/v1 is not visible here.
248 st1 {v0.8h,v1.8h}, [x0], #32
250 b.gt dequant_\size\()_rshift_loop
259 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DC variant: one multiplier is applied to all 16 coefficients, in place.
260 function x264_dequant_4x4_dc_neon, export=1
// Invoked with dc = yes.
261 DEQUANT_START 6, 6, yes
// Uses the flags from the subs that ends DEQUANT_START.
262 b.lt dequant_4x4_dc_rshift
// Left-shift path: 16-bit multiply of both halves by v2.
// NOTE(review): the setup of v2 (and any shift applied to it) is not
// visible in this view.
266 ld1 {v0.8h,v1.8h}, [x0]
268 mul v0.8h, v0.8h, v2.8h
269 mul v1.8h, v1.8h, v2.8h
270 st1 {v0.8h,v1.8h}, [x0]
273 dequant_4x4_dc_rshift:
283 ld1 {v0.8h,v1.8h}, [x0]
// Widening multiply-accumulate by v4 into 32-bit lanes, then a per-lane
// shift by v3 (SSHL shifts right when the count is negative).
// NOTE(review): the initial contents of v16-v19, the setup of v3/v4 and the
// narrowing back into v0/v1 are not visible in this view.
287 smlal v16.4s, v0.4h, v4.4h
288 smlal2 v17.4s, v0.8h, v4.8h
289 smlal v18.4s, v1.4h, v4.4h
290 smlal2 v19.4s, v1.8h, v4.8h
291 sshl v16.4s, v16.4s, v3.4s
292 sshl v17.4s, v17.4s, v3.4s
293 sshl v18.4s, v18.4s, v3.4s
294 sshl v19.4s, v19.4s, v3.4s
300 st1 {v0.8h,v1.8h}, [x0]
// decimate_score_1x: emits the function x264_decimate_score<size>_neon.
//   size : suffix appended to the function name
// Visible work: load 16 coefficients, take the address of
// x264_decimate_table4, and build two byte masks that are each narrowed to
// 4 bits per source byte.
// NOTE(review): the narrowing of v0/v1 to bytes and the setup of v2/v3 are
// not visible in this view.
304 .macro decimate_score_1x size
305 function x264_decimate_score\size\()_neon, export=1
306 ld1 {v0.8h,v1.8h}, [x0]
307 movrel x5, X(x264_decimate_table4)
// v1: all-ones in every byte of v0 that equals zero.
312 cmeq v1.16b, v0.16b, #0
// v2: all-ones in every byte where v2 is (unsigned) greater than v3.
313 cmhi v2.16b, v2.16b, v3.16b
// Each 16-bit lane holds two byte masks; taking bits 11:4 keeps one nibble
// from each, halving the mask width.
314 shrn v1.8b, v1.8h, #4
315 shrn v2.8b, v2.8h, #4
// mask64: two identical rows of eight single-bit byte masks, running from
// 0x80 down to 0x01.
// NOTE(review): presumably loaded into a vector register by
// x264_decimate_score64_neon to turn byte compare results into bit
// positions -- confirm; the load is not visible in this view.
345 const mask64, align=6
346 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
347 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
// x264_decimate_score64_neon: 64-coefficient decimate score.
// Visible work: load 64 coefficients, narrow them to bytes, build a
// zero-coefficient bit mask through an AND plus pairwise-add reduction, and
// compute a separate mask from the maximum of v4-v7 compared with v31.
// NOTE(review): the low-half narrowing (the visible sqxtn2 writes only the
// upper halves), the setup of v30/v31 and the recomputation of v4-v7 before
// the umax tree are not visible in this view.
350 function x264_decimate_score64_neon, export=1
// 4 x 32 bytes = 64 coefficients; x0 advances on the first three loads.
351 ld1 {v0.8h,v1.8h}, [x0], #32
352 ld1 {v2.8h,v3.8h}, [x0], #32
353 ld1 {v4.8h,v5.8h}, [x0], #32
354 ld1 {v6.8h,v7.8h}, [x0]
// Signed saturating narrow to bytes into the upper half of v16-v19.
358 sqxtn2 v16.16b, v0.8h
360 sqxtn2 v17.16b, v2.8h
362 sqxtn2 v18.16b, v4.8h
364 sqxtn2 v19.16b, v6.8h
// All-ones in every byte lane whose narrowed coefficient is zero.
370 cmeq v0.16b, v16.16b, #0
371 cmeq v1.16b, v17.16b, #0
372 cmeq v2.16b, v18.16b, #0
373 cmeq v3.16b, v19.16b, #0
// Unsigned byte-wise maximum across v4-v7, interleaved with the mask work.
374 umax v4.16b, v4.16b, v5.16b
375 umax v6.16b, v6.16b, v7.16b
// Keep only the bits selected by v30 in each compare result.
376 and v0.16b, v0.16b, v30.16b
377 and v1.16b, v1.16b, v30.16b
378 and v2.16b, v2.16b, v30.16b
379 and v3.16b, v3.16b, v30.16b
380 umax v4.16b, v4.16b, v6.16b
// Pairwise adds fold adjacent bytes together; note the swapped source order.
381 addp v0.16b, v1.16b, v0.16b
382 addp v2.16b, v3.16b, v2.16b
// All-ones in every byte where the maximum is (unsigned) greater than v31.
383 cmhi v4.16b, v4.16b, v31.16b
384 addp v0.16b, v2.16b, v0.16b
// Take bits 11:4 of each 16-bit lane: one nibble per source byte.
385 shrn v4.8b, v4.8h, #4
386 addp v0.16b, v0.16b, v0.16b
393 movrel x5, X(x264_decimate_table8)
408 // int coeff_last( int16_t *l )
// Scalar variant for 4 coefficients.
409 function x264_coeff_last4_aarch64, export=1
// Result = w4 - (w0 >> 4).
// NOTE(review): the setup of w0 and w4 is not visible in this view;
// presumably w0 is a leading-zero count over the four 16-bit coefficients
// (hence the divide by 16) and w4 the highest index -- confirm.
413 sub w0, w4, w0, lsr #4
// x264_coeff_last8_aarch64: scalar variant for 8 coefficients.
417 function x264_coeff_last8_aarch64, export=1
// Result = w4 - (w2 >> 4).
// NOTE(review): the setup of w2 and w4 is not visible in this view;
// presumably w2 is a leading-zero count with 16 bits per coefficient and w4
// the matching highest index -- confirm.
427 sub w0, w4, w2, lsr #4
// COEFF_LAST_1x: emits the function x264_coeff_last<size>_neon.
//   size : suffix appended to the function name
// NOTE(review): the narrowing of the loaded coefficients to bytes and the
// setup of w2/w3 are not visible in this view.
431 .macro COEFF_LAST_1x size
432 function x264_coeff_last\size\()_neon, export=1
436 ld1 {v0.8h,v1.8h}, [x0]
// Testing a register against itself sets all-ones in every nonzero byte.
439 cmtst v0.16b, v0.16b, v0.16b
// Take bits 11:4 of each 16-bit lane: one nibble per source byte.
440 shrn v0.8b, v0.8h, #4
// Result = w3 - (w2 >> 2); presumably w2 is a bit count over the nibble
// mask, so dividing by 4 yields a coefficient count -- confirm.
444 sub w0, w3, w2, lsr #2
// x264_coeff_last64_neon: 64-coefficient variant of coeff_last.
// Visible work: load 2 x 64 bytes, build nonzero masks and narrow them in
// several steps.
// NOTE(review): the narrowing of v0-v7 to bytes before the cmtst group, the
// setup of v30/v31 and the final scalar reduction are not visible in this view.
452 function x264_coeff_last64_neon, export=1
// Each load brings in 32 coefficients and advances x0 by 64 bytes.
453 ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
458 ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
// Testing a register against itself sets all-ones in every nonzero byte.
466 cmtst v0.16b, v0.16b, v0.16b
467 cmtst v1.16b, v1.16b, v1.16b
468 cmtst v2.16b, v2.16b, v2.16b
469 cmtst v3.16b, v3.16b, v3.16b
// Take bits 11:4 of each 16-bit lane (one nibble per source byte), packing
// v0-v3 into v0/v1.
471 shrn v0.8b, v0.8h, #4
472 shrn2 v0.16b, v1.8h, #4
473 shrn v1.8b, v2.8h, #4
474 shrn2 v1.16b, v3.8h, #4
// Narrow 32-bit lanes to 16 bits after shifting right by 2, packing v0/v1
// into v0.
479 shrn v0.4h, v0.4s, #2
480 shrn2 v0.8h, v1.4s, #2
// v0 = v31 - v0 becomes the per-lane shift count applied to v30.
482 sub v0.8h, v31.8h, v0.8h
483 sshl v0.8h, v30.8h, v0.8h
484 shrn v0.8b, v0.8h, #1
// coeff_level_run_start: prologue shared by the coeff_level_run functions.
//   size : not referenced in the line visible here
// Leaves x6 = x1 + 23, which the inline comment identifies as the address
// of runlevel->mask.
// NOTE(review): presumably x1 is the runlevel pointer argument -- confirm.
493 .macro coeff_level_run_start size
494 add x6, x1, #23 // runlevel->mask
// coeff_level_run: body shared by the coeff_level_run functions.
//   shift : log2 of the number of mask bits that stand for one coefficient
// NOTE(review): the loop labels, branches and the instructions that
// recompute w3 are not visible in this view; presumably w3 is a bit count
// over a nonzero mask and w4 the current coefficient index -- confirm.
502 .macro coeff_level_run shift
// Step the index down by w3 >> shift; flags are set for a branch that is
// not visible here.
504 subs w4, w4, w3, lsr #\shift
// Load the 16-bit coefficient at index x4.
507 ldrh w5, [x0, x4, lsl #1]
// Add the width of one coefficient slot to w3 ...
513 add w3, w3, #1 << \shift
// ... and clear the low shift bits of x3 (round down to a slot boundary).
515 and x3, x3, #~((1 << \shift) - 1)
518 subs w4, w4, w3, lsr #\shift
// x264_coeff_level_run4_aarch64: scalar coeff_level_run for 4 coefficients.
// NOTE(review): only the shared prologue invocation is visible in this view.
525 function x264_coeff_level_run4_aarch64, export=1
// Sets x6 = x1 + 23 (runlevel->mask).
528 coeff_level_run_start 4
// X264_COEFF_LEVEL_RUN: emits the function x264_coeff_level_run<size>_neon.
//   size : coefficient count; used in the function name, forwarded to
//          coeff_level_run_start and used to derive the shift below
// NOTE(review): a 64-bit (v0.8b) and a 128-bit (v0.16b) mask sequence both
// appear here; presumably they are alternatives selected on size by
// conditional assembly that is not visible in this view -- confirm.
535 .macro X264_COEFF_LEVEL_RUN size
536 function x264_coeff_level_run\size\()_neon, export=1
// Testing a register against itself sets all-ones in every nonzero byte.
543 cmtst v0.8b, v0.8b, v0.8b
545 ld1 {v0.8h,v1.8h}, [x0]
548 cmtst v0.16b, v0.16b, v0.16b
// Take bits 11:4 of each 16-bit lane: one nibble per source byte.
549 shrn v0.8b, v0.8h, #4
556 coeff_level_run_start \size
// The shift evaluates to 3 for size 8 and to 2 for sizes 15 and 16.
558 coeff_level_run (4 - (\size + 1) / 8)
// Instantiate x264_coeff_level_run8_neon, x264_coeff_level_run15_neon and
// x264_coeff_level_run16_neon.
564 X264_COEFF_LEVEL_RUN 8
565 X264_COEFF_LEVEL_RUN 15
566 X264_COEFF_LEVEL_RUN 16
// x264_denoise_dct_neon: processes 16 coefficients per pass.
// Visible data flow: x0 -> 16 x 16-bit coefficients (rewritten in place),
// x1 -> 16 x 32-bit accumulators (updated in place), x2 -> 16 x 16-bit
// values that are subtracted with unsigned saturation.
// NOTE(review): the setup of v16/v17 and v22/v23, the loop control and the
// return are not visible in this view; presumably v16/v17 are the absolute
// values of v0/v1 and v22/v23 the negated v20/v21 -- confirm.
568 function x264_denoise_dct_neon, export=1
570 ld1 {v0.8h,v1.8h}, [x0]
571 ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
574 ld1 {v2.8h,v3.8h}, [x2], #32
// All-ones in every lane whose coefficient is negative.
575 cmlt v18.8h, v0.8h, #0
576 cmlt v19.8h, v1.8h, #0
// Widen v16 and add it to the 32-bit accumulators v4/v5.
577 uaddw v4.4s, v4.4s, v16.4h
578 uaddw2 v5.4s, v5.4s, v16.8h
// Unsigned saturating subtract: the result is clamped at zero.
579 uqsub v20.8h, v16.8h, v2.8h
580 uqsub v21.8h, v17.8h, v3.8h
// Widen v17 and add it to the 32-bit accumulators v6/v7.
581 uaddw v6.4s, v6.4s, v17.4h
582 uaddw2 v7.4s, v7.4s, v17.8h
// Per bit: take v22/v23 where the sign mask is set, else v20/v21.
585 bsl v18.16b, v22.16b, v20.16b
586 bsl v19.16b, v23.16b, v21.16b
// Write back the accumulators and the results, advancing both pointers.
587 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
588 st1 {v18.8h,v19.8h}, [x0], #32