1 /****************************************************************************
2 * quant.S: arm quantization and level-run
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
8 * Martin Storsjo <martin@martin.st>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// QUANT_TWO bias0, bias1, mf0_1, mf2_3, mask
// Quantizes 16 coefficients (two vectors of 8 x 16-bit) and stores them at
// [x0], advancing x0 by 32 bytes.
//   bias0, bias1 : complete vector operands (e.g. v0.8h) added to v18 / v19
//   mf0_1, mf2_3 : bare register names; the .4h/.8h arrangement is appended here
//   mask         : destination operand receiving the OR of both result vectors
// Reads v16-v19 as prepared by the caller; overwrites v16-v23 and the mask
// operand.
// NOTE(review): presumably v16/v17 hold the signed input coefficients and
// v18/v19 their absolute values on entry - confirm against the call sites.
30 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
31 add v18.8h, v18.8h, \bias0 // v18 += bias0
32 add v19.8h, v19.8h, \bias1 // v19 += bias1
33 umull v20.4s, v18.4h, \mf0_1\().4h // unsigned widening multiply, low half of v18
34 umull2 v21.4s, v18.8h, \mf0_1\().8h // ... high half of v18
35 umull v22.4s, v19.4h, \mf2_3\().4h // same for v19
36 umull2 v23.4s, v19.8h, \mf2_3\().8h
37 sshr v16.8h, v16.8h, #15 // arithmetic shift: each lane becomes 0 or -1 (sign mask)
38 sshr v17.8h, v17.8h, #15
39 shrn v18.4h, v20.4s, #16 // keep the high 16 bits of every 32-bit product
40 shrn2 v18.8h, v21.4s, #16
41 shrn v19.4h, v22.4s, #16
42 shrn2 v19.8h, v23.4s, #16
43 eor v18.16b, v18.16b, v16.16b // (x ^ m) - m: negates the lanes whose sign mask is -1
44 eor v19.16b, v19.16b, v17.16b
45 sub v18.8h, v18.8h, v16.8h
46 sub v19.8h, v19.8h, v17.8h
47 orr \mask, v18.16b, v19.16b // mask is non-zero iff any of the 16 results is non-zero
48 st1 {v18.8h,v19.8h}, [x0], #32 // write results back; x0 += 32
59 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Visible core of the 2x2 DC quantizer, working on four 16-bit lanes.
// NOTE(review): the load of dct, the set-up of v1/v2/v3, the store and the
// return are not among the lines shown here; presumably v0 = dct, v1 = mf,
// v2 = bias and v3 = |dct| at the first instruction below - confirm.
60 function x264_quant_2x2_dc_neon, export=1
65 add v3.4h, v3.4h, v2.4h // v3 += v2
66 umull v3.4s, v3.4h, v1.4h // unsigned widening multiply: v3 * v1 -> 4 x 32-bit
67 sshr v0.4h, v0.4h, #15 // v0 -> per-lane sign mask (0 or -1)
68 shrn v3.4h, v3.4s, #16 // keep the high 16 bits of each product
69 eor v3.8b, v3.8b, v0.8b // (x ^ m) - m: negate the lanes whose mask is -1
70 sub v3.4h, v3.4h, v0.4h
75 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
// 4x4 DC quantizer: a single QUANT_TWO pass over 16 coefficients using one
// bias vector and one multiplier vector for both halves.
// NOTE(review): the instructions that fill v18/v19, v0 (bias) and v2 (mf),
// and the code that turns the mask into the return value, are not visible
// here.
76 function x264_quant_4x4_dc_neon, export=1
77 ld1 {v16.8h,v17.8h}, [x0] // 16 coefficients from [x0]
// The mask operand reuses v0: QUANT_TWO reads its bias operands in its first
// two instructions and writes the mask only in its final orr.
82 QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
87 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// 4x4 quantizer with per-coefficient multiplier and bias tables.
// Per the signature comment and AAPCS64 argument order: x0 = dct, x1 = mf,
// x2 = bias.
// NOTE(review): the set-up of v18/v19 and the return sequence are not
// visible here.
88 function x264_quant_4x4_neon, export=1
89 ld1 {v16.8h,v17.8h}, [x0] // 16 coefficients
92 ld1 {v0.8h,v1.8h}, [x2] // 16 bias values
93 ld1 {v2.8h,v3.8h}, [x1] // 16 multipliers
94 QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b // mask overwrites v0 after bias has been consumed
99 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes four consecutive 4x4 blocks with one shared mf/bias table pair.
// The tables are loaded once; each QUANT_TWO leaves its non-zero mask in a
// separate register (v4, v5, v6, v7).
// The ld1 from [x0] has no writeback: x0 is advanced by the post-incrementing
// store inside QUANT_TWO, so every reload reads the next block.
// NOTE(review): the set-up of v18/v19 before each pass and the code that
// combines v4-v7 into the return value are not visible here.
100 function x264_quant_4x4x4_neon, export=1
101 ld1 {v16.8h,v17.8h}, [x0] // block 0
104 ld1 {v0.8h,v1.8h}, [x2] // bias[16]
105 ld1 {v2.8h,v3.8h}, [x1] // mf[16]
106 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
107 ld1 {v16.8h,v17.8h}, [x0] // block 1
110 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
111 ld1 {v16.8h,v17.8h}, [x0] // block 2
114 QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
115 ld1 {v16.8h,v17.8h}, [x0] // block 3
118 QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
142 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// 8x8 quantizer: processes the block in passes of 16 coefficients. Unlike the
// 4x4 variants, the mf (x1) and bias (x2) pointers are post-incremented by
// 32 bytes per pass so every pass uses fresh table entries.
// NOTE(review): only two passes are visible in this view; the signature
// comment gives 64 coefficients, so further passes, the v18/v19 set-up and the
// return sequence are presumably in lines not shown - confirm.
143 function x264_quant_8x8_neon, export=1
144 ld1 {v16.8h,v17.8h}, [x0] // coefficients 0..15
147 ld1 {v0.8h,v1.8h}, [x2], #32 // next 16 bias values
148 ld1 {v2.8h,v3.8h}, [x1], #32 // next 16 multipliers
149 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
151 ld1 {v16.8h,v17.8h}, [x0] // next 16 coefficients (x0 advanced by QUANT_TWO)
154 ld1 {v0.8h,v1.8h}, [x2], #32
155 ld1 {v2.8h,v3.8h}, [x1], #32
156 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
157 orr v4.16b, v4.16b, v5.16b // accumulate the non-zero mask in v4
// DEQUANT_START mf_size, offset, dc=no
// Common prologue of the dequant functions: splits i_qp (w2) into
// i_qbits = i_qp / 6 (w3) and i_mf = i_qp % 6 (w2), selects the dequant_mf
// entry through x1 and leaves the flags set from (i_qbits - offset).
//   mf_size : log2 of the byte size of one dequant_mf[i_mf] row
//   offset  : value subtracted from i_qbits
//   dc      : not referenced by any line visible here
// Clobbers w2, w3, w5, x1 and the condition flags.
// NOTE(review): the instructions that place the scaled i_qp in w3 before the
// lsr are not visible. Both the `add` and the `ldr` forms of the table access
// appear below; presumably conditional assembly on `dc` (not shown) selects
// exactly one of them - confirm.
163 .macro DEQUANT_START mf_size offset dc=no
166 lsr w3, w3, #8 // i_qbits = i_qp / 6
167 add w5, w3, w3, lsl #1 // w5 = 3 * i_qbits
168 sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
169 lsl w2, w2, #\mf_size // byte offset of row i_mf
171 add x1, x1, w2, sxtw // dequant_mf[i_mf]
173 ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
175 subs w3, w3, #\offset // 6 for 8x8
178 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT size, bits
// Emits x264_dequant_<size>_neon. DEQUANT_START is invoked with
// mf_size = bits + 2 and offset = bits; the function then takes either a
// left-shift path or a rounding right-shift path. Each loop iteration handles
// 16 coefficients: it reads 16 32-bit factors through x1 (post-incremented)
// and rewrites 16 coefficients at x0 (post-incremented by the store).
// NOTE(review): several instructions are missing from this view - the set-up
// of the shift vector v31, the narrowing of the loaded factors into v2/v3,
// the initialisation of the v16-v19 accumulators in the right-shift path, the
// narrowing of v16-v19 back into v0/v1, the loop counter update and the
// returns. Comments below that rely on them are hedged.
179 .macro DEQUANT size bits
180 function x264_dequant_\size\()_neon, export=1
181 DEQUANT_START \bits+2, \bits
// Presumably still testing the flags from the subs in DEQUANT_START, i.e.
// taken when i_qbits - bits < 0 - confirm no intervening flag-setting line.
185 b.lt dequant_\size\()_rshift
188 dequant_\size\()_lshift_loop:
192 ld1 {v16.4s}, [x1], #16 // 4 x 32-bit factors per load, x1 += 16
193 ld1 {v17.4s}, [x1], #16
195 ld1 {v18.4s}, [x1], #16
197 ld1 {v19.4s}, [x1], #16
199 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
201 mul v0.8h, v0.8h, v2.8h // coefficient * factor in 16-bit lanes
202 mul v1.8h, v1.8h, v3.8h
203 sshl v0.8h, v0.8h, v31.8h // per-lane shift by the counts held in v31
204 sshl v1.8h, v1.8h, v31.8h
205 st1 {v0.8h,v1.8h}, [x0], #32 // store back, x0 += 32
207 b.gt dequant_\size\()_lshift_loop
211 dequant_\size\()_rshift:
219 dequant_\size\()_rshift_loop:
222 ld1 {v16.4s}, [x1], #16
223 ld1 {v17.4s}, [x1], #16
225 ld1 {v18.4s}, [x1], #16
228 ld1 {v19.4s}, [x1], #16
231 ld1 {v0.8h,v1.8h}, [x0]
// Widening multiply-accumulate into 32-bit lanes. The accumulators keep
// whatever v16-v19 hold at this point; presumably a rounding constant placed
// there by lines not shown - confirm.
236 smlal v16.4s, v0.4h, v2.4h
237 smlal2 v17.4s, v0.8h, v2.8h
238 smlal v18.4s, v1.4h, v3.4h
239 smlal2 v19.4s, v1.8h, v3.8h
// SSHL shifts right when the per-lane count is negative; presumably v31
// holds a negative count on this path - confirm.
240 sshl v16.4s, v16.4s, v31.4s
241 sshl v17.4s, v17.4s, v31.4s
242 sshl v18.4s, v18.4s, v31.4s
243 sshl v19.4s, v19.4s, v31.4s
249 st1 {v0.8h,v1.8h}, [x0], #32
251 b.gt dequant_\size\()_rshift_loop
260 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DC dequantizer for 16 coefficients: every coefficient is scaled by the same
// factor, so there is no loop and no per-coefficient table walk.
// DEQUANT_START is used with mf_size = 6 (64-byte rows), offset = 6 and
// dc = yes.
// NOTE(review): the broadcast of the factor into v2 / v4, the shift that
// presumably follows the multiply in the left-shift path, the set-up of the
// v16-v19 accumulators and of the shift vector v3, the narrowing back into
// v0/v1 and the returns are not visible in this view.
261 function x264_dequant_4x4_dc_neon, export=1
262 DEQUANT_START 6, 6, yes
263 b.lt dequant_4x4_dc_rshift // presumably taken when i_qbits - 6 < 0
267 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
269 mul v0.8h, v0.8h, v2.8h // scale both halves by the same vector v2
270 mul v1.8h, v1.8h, v2.8h
271 st1 {v0.8h,v1.8h}, [x0] // store in place (no writeback)
274 dequant_4x4_dc_rshift:
284 ld1 {v0.8h,v1.8h}, [x0]
288 smlal v16.4s, v0.4h, v4.4h // 32-bit accumulate: v16..v19 += coefficient * v4
289 smlal2 v17.4s, v0.8h, v4.8h
290 smlal v18.4s, v1.4h, v4.4h
291 smlal2 v19.4s, v1.8h, v4.8h
292 sshl v16.4s, v16.4s, v3.4s // per-lane shift by v3 (right shift if the count is negative)
293 sshl v17.4s, v17.4s, v3.4s
294 sshl v18.4s, v18.4s, v3.4s
295 sshl v19.4s, v19.4s, v3.4s
301 st1 {v0.8h,v1.8h}, [x0]
// decimate_score_1x size
// Emits x264_decimate_score<size>_neon. The visible part loads 16
// coefficients, takes the address of x264_decimate_table4 and builds two
// compact masks: v1 from an "equal to zero" compare and v2 from an unsigned
// "greater than v3" compare.
// NOTE(review): the narrowing of the coefficients to bytes, the set-up of
// v2/v3 (presumably |coef| and a threshold), the table-driven scoring and the
// return are not visible here - confirm.
305 .macro decimate_score_1x size
306 function x264_decimate_score\size\()_neon, export=1
307 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
308 movrel x5, X(x264_decimate_table4) // x5 = address of the score table
313 cmeq v1.16b, v0.16b, #0 // 0xff in every byte lane of v0 that is zero
314 cmhi v2.16b, v2.16b, v3.16b // 0xff where v2 > v3 (unsigned)
// shrn by 4 on 16-bit lanes keeps 4 bits of each 0x00/0xff byte, halving the
// mask to one nibble per compared byte.
315 shrn v1.8b, v1.8h, #4
316 shrn v2.8b, v2.8h, #4
// 16-byte constant: the single-bit weights 0x80 .. 0x01, repeated for both
// 8-byte halves of a vector.
// NOTE(review): presumably loaded into v30 by x264_decimate_score64_neon so
// that and + pairwise adds can pack compare results into a bitmask, and
// align=6 presumably requests 2^6-byte alignment from the project's `const`
// macro - neither is established by the lines visible here; confirm.
346 const mask64, align=6
347 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
348 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
// Decimate score over 64 coefficients. Visible steps: load all 64
// coefficients into v0-v7, saturate-narrow them to bytes in v16-v19, build a
// "coefficient is zero" byte mask, reduce it with and + pairwise adds, and
// separately test a running unsigned maximum against v31.
// NOTE(review): the first-half narrowing (sqxtn) of v16-v19, the values of
// v4-v7 at the umax (presumably absolute values), the loads of v30
// (presumably mask64) and v31 (presumably a threshold), the scoring loop that
// uses x5 and the return are not visible in this view - confirm.
351 function x264_decimate_score64_neon, export=1
352 ld1 {v0.8h,v1.8h}, [x0], #32 // coefficients 0..15
353 ld1 {v2.8h,v3.8h}, [x0], #32 // 16..31
354 ld1 {v4.8h,v5.8h}, [x0], #32 // 32..47
355 ld1 {v6.8h,v7.8h}, [x0] // 48..63
359 sqxtn2 v16.16b, v0.8h // saturating narrow to signed bytes, upper half of v16
361 sqxtn2 v17.16b, v2.8h
363 sqxtn2 v18.16b, v4.8h
365 sqxtn2 v19.16b, v6.8h
371 cmeq v0.16b, v16.16b, #0 // 0xff for every zero byte
372 cmeq v1.16b, v17.16b, #0
373 cmeq v2.16b, v18.16b, #0
374 cmeq v3.16b, v19.16b, #0
375 umax v4.16b, v4.16b, v5.16b // running unsigned maximum of v4..v7
376 umax v6.16b, v6.16b, v7.16b
377 and v0.16b, v0.16b, v30.16b // keep only the bits selected by v30 in each mask byte
378 and v1.16b, v1.16b, v30.16b
379 and v2.16b, v2.16b, v30.16b
380 and v3.16b, v3.16b, v30.16b
381 umax v4.16b, v4.16b, v6.16b
382 addp v0.16b, v1.16b, v0.16b // pairwise byte adds fold the four masks together
383 addp v2.16b, v3.16b, v2.16b
384 cmhi v4.16b, v4.16b, v31.16b // 0xff where the maximum exceeds v31 (unsigned)
385 addp v0.16b, v2.16b, v0.16b
386 shrn v4.8b, v4.8h, #4 // compact the compare result to a nibble per byte
387 addp v0.16b, v0.16b, v0.16b
394 movrel x5, X(x264_decimate_table8) // x5 = address of the score table
409 // int coeff_last( int16_t *l )
// Scalar coeff_last for 4 coefficients; per the signature comment the result
// in w0 is an int.
// NOTE(review): only the final subtraction is visible. Presumably w0 holds a
// leading-zero bit count of the four packed 16-bit coefficients and w4 the
// highest index, so that (count >> 4) converts bits to coefficients - confirm.
410 function x264_coeff_last4_aarch64, export=1
414 sub w0, w4, w0, lsr #4 // w0 = w4 - (w0 / 16)
// Scalar coeff_last for 8 coefficients.
// NOTE(review): only the final subtraction is visible; how w2 and w4 are
// produced (presumably a leading-zero bit count and a base index) is in lines
// not shown here - confirm.
418 function x264_coeff_last8_aarch64, export=1
428 sub w0, w4, w2, lsr #4 // w0 = w4 - (w2 / 16)
// COEFF_LAST_1x size
// Emits x264_coeff_last<size>_neon. Visible steps: load 16 coefficients, mark
// the non-zero byte lanes, compact the marks to a nibble per byte and derive
// the result in w0 from w3 and w2.
// NOTE(review): the narrowing of the coefficients to bytes before cmtst, the
// transfer of the mask to a general register, the production of w2/w3
// (presumably a leading-zero count and size - 1) and the return are not
// visible here - confirm.
432 .macro COEFF_LAST_1x size
433 function x264_coeff_last\size\()_neon, export=1
437 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
440 cmtst v0.16b, v0.16b, v0.16b // 0xff for every non-zero byte lane
441 shrn v0.8b, v0.8h, #4 // 4 bits per source byte
445 sub w0, w3, w2, lsr #2 // w0 = w3 - (w2 / 4)
// coeff_last over 64 coefficients. Visible steps: load all 64 coefficients
// with two post-incrementing 4-register loads, mark non-zero byte lanes of
// v0-v3, and repeatedly narrow the marks into a compact form in v0.
// NOTE(review): the narrowing of the 16-bit coefficients into v0-v3 before
// the cmtst, the constants in v30/v31, the final bit scan and the return are
// not visible in this view - confirm.
453 function x264_coeff_last64_neon, export=1
454 ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64 // coefficients 0..31, x0 += 64
459 ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64 // coefficients 32..63
467 cmtst v0.16b, v0.16b, v0.16b // 0xff for every non-zero byte lane
468 cmtst v1.16b, v1.16b, v1.16b
469 cmtst v2.16b, v2.16b, v2.16b
470 cmtst v3.16b, v3.16b, v3.16b
472 shrn v0.8b, v0.8h, #4 // first narrowing: 4 bits per byte lane, v0..v3 -> v0,v1
473 shrn2 v0.16b, v1.8h, #4
474 shrn v1.8b, v2.8h, #4
475 shrn2 v1.16b, v3.8h, #4
480 shrn v0.4h, v0.4s, #2 // second narrowing: v0,v1 -> v0
481 shrn2 v0.8h, v1.4s, #2
483 sub v0.8h, v31.8h, v0.8h // v0 = v31 - v0 per 16-bit lane
484 sshl v0.8h, v30.8h, v0.8h // v0 = v30 shifted per lane by the counts in v0
485 shrn v0.8b, v0.8h, #1
// coeff_level_run_start size
// Prologue shared by the coeff_level_run functions.
// NOTE(review): only the first instruction is visible; `size` is not
// referenced by it. Presumably x1 is the runlevel pointer (second argument)
// and the remaining set-up follows in lines not shown - confirm.
494 .macro coeff_level_run_start size
495 add x6, x1, #23 // runlevel->mask
// coeff_level_run shift
// Body of the level/run scan. `shift` scales between the counter kept in w3
// and a coefficient index: w3 >> shift is subtracted from the index in w4.
// NOTE(review): labels, branches, the stores into the runlevel structure and
// the instructions that produce w3 are not visible here; presumably w3 is a
// leading-zero bit count over a mask with (1 << shift) bits per coefficient -
// confirm.
503 .macro coeff_level_run shift
505 subs w4, w4, w3, lsr #\shift // index -= w3 >> shift; flags updated
508 ldrh w5, [x0, x4, lsl #1] // w5 = 16-bit coefficient at index x4
514 add w3, w3, #1 << \shift // step w3 by one unit of (1 << shift)
516 and x3, x3, #~((1 << \shift) - 1) // clear the low `shift` bits of x3
519 subs w4, w4, w3, lsr #\shift
// Scalar coeff_level_run for 4 coefficients.
// NOTE(review): only the entry and the shared prologue invocation are visible;
// the mask computation, the scan and the return are in lines not shown here.
526 function x264_coeff_level_run4_aarch64, export=1
529 coeff_level_run_start 4
// X264_COEFF_LEVEL_RUN size
// Emits x264_coeff_level_run<size>_neon: builds a non-zero mask with NEON,
// then runs the shared prologue and scan macros.
// The shift handed to coeff_level_run is 4 - (size + 1) / 8 with integer
// division: 3 for size 8, 2 for sizes 15 and 16.
// NOTE(review): two mask variants are visible (8-byte cmtst, and a 16-
// coefficient load with 16-byte cmtst + shrn); presumably conditional assembly
// on `size`, not shown here, selects one of them - confirm. The narrowing
// before cmtst and the move of the mask to a general register are also not
// visible.
536 .macro X264_COEFF_LEVEL_RUN size
537 function x264_coeff_level_run\size\()_neon, export=1
544 cmtst v0.8b, v0.8b, v0.8b // 0xff for every non-zero byte lane (8 lanes)
546 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
549 cmtst v0.16b, v0.16b, v0.16b // 0xff for every non-zero byte lane (16 lanes)
550 shrn v0.8b, v0.8h, #4 // 4 bits per source byte
557 coeff_level_run_start \size
559 coeff_level_run (4 - (\size + 1) / 8)
// Instantiate x264_coeff_level_run8_neon, x264_coeff_level_run15_neon and
// x264_coeff_level_run16_neon.
565 X264_COEFF_LEVEL_RUN 8
566 X264_COEFF_LEVEL_RUN 15
567 X264_COEFF_LEVEL_RUN 16
// DCT denoise step over 16 coefficients per visible pass:
//   [x1] : 16 x 32-bit accumulators, each increased by v16/v17 lanes
//   [x2] : 16 x 16-bit values subtracted (saturating at 0) from v16/v17
//   [x0] : 16 x 16-bit coefficients, rewritten with the selected result
// All three pointers advance (x2 on load, x0/x1 on store).
// NOTE(review): no signature is visible for this function. The set-up of
// v16/v17 (presumably |coef|) and of v22/v23 (presumably the negated v20/v21),
// the loop control and the return are in lines not shown here - confirm.
569 function x264_denoise_dct_neon, export=1
571 ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
572 ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] // 16 x 32-bit accumulators
575 ld1 {v2.8h,v3.8h}, [x2], #32 // 16 x 16-bit subtrahends, x2 += 32
576 cmlt v18.8h, v0.8h, #0 // all-ones lanes where the coefficient is negative
577 cmlt v19.8h, v1.8h, #0
578 uaddw v4.4s, v4.4s, v16.4h // accumulators += v16/v17, widened to 32 bits
579 uaddw2 v5.4s, v5.4s, v16.8h
580 uqsub v20.8h, v16.8h, v2.8h // unsigned saturating subtract: clamps at 0
581 uqsub v21.8h, v17.8h, v3.8h
582 uaddw v6.4s, v6.4s, v17.4h
583 uaddw2 v7.4s, v7.4s, v17.8h
586 bsl v18.16b, v22.16b, v20.16b // negative lanes take v22, the rest take v20
587 bsl v19.16b, v23.16b, v21.16b
588 st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64 // write accumulators back, x1 += 64
589 st1 {v18.8h,v19.8h}, [x0], #32 // write coefficients back, x0 += 32