/* common/aarch64/quant-a.S — x264 AArch64 quantization and level-run */
1 /****************************************************************************
2  * quant.S: arm quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2009-2015 x264 project
5  *
6  * Authors: David Conrad <lessen42@gmail.com>
7  *          Janne Grunau <janne-x264@jannau.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26
27 #include "asm.S"
28
29 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
30     add         v18.8h, v18.8h, \bias0
31     add         v19.8h, v19.8h, \bias1
32     umull       v20.4s, v18.4h, \mf0_1\().4h
33     umull2      v21.4s, v18.8h, \mf0_1\().8h
34     umull       v22.4s, v19.4h, \mf2_3\().4h
35     umull2      v23.4s, v19.8h, \mf2_3\().8h
36     sshr        v16.8h, v16.8h, #15
37     sshr        v17.8h, v17.8h, #15
38     shrn        v18.4h, v20.4s, #16
39     shrn2       v18.8h, v21.4s, #16
40     shrn        v19.4h, v22.4s, #16
41     shrn2       v19.8h, v23.4s, #16
42     eor         v18.16b, v18.16b, v16.16b
43     eor         v19.16b, v19.16b, v17.16b
44     sub         v18.8h, v18.8h, v16.8h
45     sub         v19.8h, v19.8h, v17.8h
46     orr         \mask,  v18.16b, v19.16b
47     st1        {v18.8h,v19.8h}, [x0], #32
48 .endm
49
50 .macro QUANT_END d
51     fmov        x2,  \d
52     mov         w0,  #0
53     tst         x2,  x2
54     cinc        w0,  w0,  ne
55     ret
56 .endm
57
58 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
59 function x264_quant_2x2_dc_neon, export=1
60     ld1        {v0.4h}, [x0]
61     dup         v2.4h,  w2
62     dup         v1.4h,  w1
63     abs         v3.4h,  v0.4h
64     add         v3.4h,  v3.4h,  v2.4h
65     umull       v3.4s,  v3.4h,  v1.4h
66     sshr        v0.4h,  v0.4h,  #15
67     shrn        v3.4h,  v3.4s,  #16
68     eor         v3.8b,  v3.8b,  v0.8b
69     sub         v3.4h,  v3.4h,  v0.4h
70     st1        {v3.4h}, [x0]
71     QUANT_END   d3
72 endfunc
73
74 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
75 function x264_quant_4x4_dc_neon, export=1
76     ld1        {v16.8h,v17.8h}, [x0]
77     abs         v18.8h,  v16.8h
78     abs         v19.8h,  v17.8h
79     dup         v0.8h,  w2
80     dup         v2.8h,  w1
81     QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
82     uqxtn       v0.8b,  v0.8h
83     QUANT_END   d0
84 endfunc
85
86 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
87 function x264_quant_4x4_neon, export=1
88     ld1        {v16.8h,v17.8h}, [x0]
89     abs         v18.8h,  v16.8h
90     abs         v19.8h,  v17.8h
91     ld1        {v0.8h,v1.8h}, [x2]
92     ld1        {v2.8h,v3.8h}, [x1]
93     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
94     uqxtn       v0.8b,  v0.8h
95     QUANT_END   d0
96 endfunc
97
98 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
99 function x264_quant_4x4x4_neon, export=1
100     ld1        {v16.8h,v17.8h}, [x0]
101     abs         v18.8h, v16.8h
102     abs         v19.8h, v17.8h
103     ld1        {v0.8h,v1.8h}, [x2]
104     ld1        {v2.8h,v3.8h}, [x1]
105     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
106     ld1        {v16.8h,v17.8h}, [x0]
107     abs         v18.8h, v16.8h
108     abs         v19.8h, v17.8h
109     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
110     ld1        {v16.8h,v17.8h}, [x0]
111     abs         v18.8h, v16.8h
112     abs         v19.8h, v17.8h
113     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
114     ld1        {v16.8h,v17.8h}, [x0]
115     abs         v18.8h, v16.8h
116     abs         v19.8h, v17.8h
117     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
118     uqxtn       v4.8b,  v4.8h
119     uqxtn       v7.8b,  v7.8h
120     uqxtn       v6.8b,  v6.8h
121     uqxtn       v5.8b,  v5.8h
122     fmov        x7,  d7
123     fmov        x6,  d6
124     fmov        x5,  d5
125     fmov        x4,  d4
126     mov         w0,  #0
127     tst         x7,  x7
128     cinc        w0,  w0,  ne
129     lsl         w0,  w0,  #1
130     tst         x6,  x6
131     cinc        w0,  w0,  ne
132     lsl         w0,  w0,  #1
133     tst         x5,  x5
134     cinc        w0,  w0,  ne
135     lsl         w0,  w0,  #1
136     tst         x4,  x4
137     cinc        w0,  w0,  ne
138     ret
139 endfunc
140
141 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
142 function x264_quant_8x8_neon, export=1
143     ld1        {v16.8h,v17.8h}, [x0]
144     abs         v18.8h, v16.8h
145     abs         v19.8h, v17.8h
146     ld1        {v0.8h,v1.8h}, [x2], #32
147     ld1        {v2.8h,v3.8h}, [x1], #32
148     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
149 .rept 3
150     ld1        {v16.8h,v17.8h}, [x0]
151     abs         v18.8h, v16.8h
152     abs         v19.8h, v17.8h
153     ld1        {v0.8h,v1.8h}, [x2], #32
154     ld1        {v2.8h,v3.8h}, [x1], #32
155     QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
156     orr         v4.16b, v4.16b, v5.16b
157 .endr
158     uqxtn       v0.8b,  v4.8h
159     QUANT_END   d0
160 endfunc
161
162 .macro DEQUANT_START mf_size offset dc=no
163     mov         w3,  #0x2b
164     mul         w3,  w3,  w2
165     lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
166     add         w5,  w3,  w3,  lsl #1
167     sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
168     lsl         w2,  w2,  #\mf_size
169 .ifc \dc,no
170     add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
171 .else
172     ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
173 .endif
174     subs        w3,  w3,  #\offset      // 6 for 8x8
175 .endm
176
177 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
178 .macro DEQUANT size bits
179 function x264_dequant_\size\()_neon, export=1
180     DEQUANT_START \bits+2, \bits
181 .ifc \size, 8x8
182     mov         w2,  #4
183 .endif
184     b.lt        dequant_\size\()_rshift
185
186     dup         v31.8h, w3
187 dequant_\size\()_lshift_loop:
188 .ifc \size, 8x8
189     subs        w2,  w2,  #1
190 .endif
191     ld1        {v16.4s}, [x1], #16
192     ld1        {v17.4s}, [x1], #16
193     sqxtn       v2.4h,  v16.4s
194     ld1        {v18.4s}, [x1], #16
195     sqxtn2      v2.8h,  v17.4s
196     ld1        {v19.4s}, [x1], #16
197     sqxtn       v3.4h,  v18.4s
198     ld1        {v0.8h,v1.8h}, [x0]
199     sqxtn2      v3.8h,  v19.4s
200     mul         v0.8h,  v0.8h,  v2.8h
201     mul         v1.8h,  v1.8h,  v3.8h
202     sshl        v0.8h,  v0.8h,  v31.8h
203     sshl        v1.8h,  v1.8h,  v31.8h
204     st1        {v0.8h,v1.8h}, [x0], #32
205 .ifc \size, 8x8
206     b.gt        dequant_\size\()_lshift_loop
207 .endif
208     ret
209
210 dequant_\size\()_rshift:
211     dup         v31.4s, w3
212     neg         w3,  w3
213     mov         w5,  #1
214     sub         w3,  w3,  #1
215     lsl         w5,  w5,  w3
216
217 .ifc \size, 8x8
218 dequant_\size\()_rshift_loop:
219     subs        w2,  w2,  #1
220 .endif
221     ld1        {v16.4s}, [x1], #16
222     ld1        {v17.4s}, [x1], #16
223     sqxtn       v2.4h,  v16.4s
224     ld1        {v18.4s}, [x1], #16
225     dup         v16.4s, w5
226     sqxtn2      v2.8h,  v17.4s
227     ld1        {v19.4s}, [x1], #16
228     dup         v17.4s, w5
229     sqxtn       v3.4h,  v18.4s
230     ld1        {v0.8h,v1.8h}, [x0]
231     dup         v18.4s, w5
232     sqxtn2      v3.8h,  v19.4s
233     dup         v19.4s, w5
234
235     smlal       v16.4s, v0.4h,  v2.4h
236     smlal2      v17.4s, v0.8h,  v2.8h
237     smlal       v18.4s, v1.4h,  v3.4h
238     smlal2      v19.4s, v1.8h,  v3.8h
239     sshl        v16.4s, v16.4s, v31.4s
240     sshl        v17.4s, v17.4s, v31.4s
241     sshl        v18.4s, v18.4s, v31.4s
242     sshl        v19.4s, v19.4s, v31.4s
243
244     sqxtn       v0.4h,  v16.4s
245     sqxtn2      v0.8h,  v17.4s
246     sqxtn       v1.4h,  v18.4s
247     sqxtn2      v1.8h,  v19.4s
248     st1        {v0.8h,v1.8h}, [x0], #32
249 .ifc \size, 8x8
250     b.gt        dequant_\size\()_rshift_loop
251 .endif
252     ret
253 endfunc
254 .endm
255
256 DEQUANT 4x4, 4
257 DEQUANT 8x8, 6
258
259 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
260 function x264_dequant_4x4_dc_neon, export=1
261     DEQUANT_START 6, 6, yes
262     b.lt        dequant_4x4_dc_rshift
263
264     lsl         w1,  w1,  w3
265     dup         v2.8h,  w1
266     ld1        {v0.8h,v1.8h},   [x0]
267
268     mul         v0.8h,  v0.8h,  v2.8h
269     mul         v1.8h,  v1.8h,  v2.8h
270     st1        {v0.8h,v1.8h},   [x0]
271     ret
272
273 dequant_4x4_dc_rshift:
274     dup         v4.8h,  w1
275     dup         v3.4s, w3
276     neg         w3,  w3
277     mov         w5,  #1
278     sub         w3,  w3,  #1
279     lsl         w5,  w5,  w3
280
281     dup         v16.4s, w5
282     dup         v17.4s, w5
283     ld1        {v0.8h,v1.8h}, [x0]
284     dup         v18.4s, w5
285     dup         v19.4s, w5
286
287     smlal       v16.4s, v0.4h,  v4.4h
288     smlal2      v17.4s, v0.8h,  v4.8h
289     smlal       v18.4s, v1.4h,  v4.4h
290     smlal2      v19.4s, v1.8h,  v4.8h
291     sshl        v16.4s, v16.4s, v3.4s
292     sshl        v17.4s, v17.4s, v3.4s
293     sshl        v18.4s, v18.4s, v3.4s
294     sshl        v19.4s, v19.4s, v3.4s
295
296     sqxtn       v0.4h,  v16.4s
297     sqxtn2      v0.8h,  v17.4s
298     sqxtn       v1.4h,  v18.4s
299     sqxtn2      v1.8h,  v19.4s
300     st1        {v0.8h,v1.8h}, [x0]
301     ret
302 endfunc
303
304 .macro decimate_score_1x size
305 function x264_decimate_score\size\()_neon, export=1
306     ld1        {v0.8h,v1.8h}, [x0]
307     movrel      x5,  X(x264_decimate_table4)
308     movi        v3.16b, #0x01
309     sqxtn       v0.8b,  v0.8h
310     sqxtn2      v0.16b, v1.8h
311     abs         v2.16b, v0.16b
312     cmeq        v1.16b, v0.16b, #0
313     cmhi        v2.16b, v2.16b, v3.16b
314     shrn        v1.8b,  v1.8h,  #4
315     shrn        v2.8b,  v2.8h,  #4
316     fmov        x2,  d2
317     fmov        x1,  d1
318     cbnz        x2,  9f
319     mvn         x1,  x1
320     mov         w0,  #0
321     cbz         x1,  0f
322 .ifc \size, 15
323     lsr         x1,  x1,  #1
324 .endif
325     rbit        x1,  x1
326 1:
327     clz         x3,  x1
328     lsr         x6,  x3,  #2
329     lsl         x1,  x1,  x3
330     ldrb        w7,  [x5, x6]
331     lsl         x1,  x1,  #4
332     add         w0,  w0,  w7
333     cbnz        x1,  1b
334     ret
335 9:
336     mov         w0,  #9
337 0:
338     ret
339 endfunc
340 .endm
341
342 decimate_score_1x 15
343 decimate_score_1x 16
344
345 const mask64, align=6
346     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
347     .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
348 endconst
349
350 function x264_decimate_score64_neon, export=1
351     ld1        {v0.8h,v1.8h}, [x0], #32
352     ld1        {v2.8h,v3.8h}, [x0], #32
353     ld1        {v4.8h,v5.8h}, [x0], #32
354     ld1        {v6.8h,v7.8h}, [x0]
355     movrel      x6,  mask64
356     movi        v31.16b, #0x01
357     sqxtn       v16.8b,  v1.8h
358     sqxtn2      v16.16b, v0.8h
359     sqxtn       v17.8b,  v3.8h
360     sqxtn2      v17.16b, v2.8h
361     sqxtn       v18.8b,  v5.8h
362     sqxtn2      v18.16b, v4.8h
363     sqxtn       v19.8b,  v7.8h
364     sqxtn2      v19.16b, v6.8h
365     abs         v4.16b, v16.16b
366     abs         v5.16b, v17.16b
367     abs         v6.16b, v18.16b
368     abs         v7.16b, v19.16b
369     ld1        {v30.16b}, [x6]
370     cmeq        v0.16b, v16.16b, #0
371     cmeq        v1.16b, v17.16b, #0
372     cmeq        v2.16b, v18.16b, #0
373     cmeq        v3.16b, v19.16b, #0
374     umax        v4.16b, v4.16b, v5.16b
375     umax        v6.16b, v6.16b, v7.16b
376     and         v0.16b, v0.16b, v30.16b
377     and         v1.16b, v1.16b, v30.16b
378     and         v2.16b, v2.16b, v30.16b
379     and         v3.16b, v3.16b, v30.16b
380     umax        v4.16b, v4.16b, v6.16b
381     addp        v0.16b, v1.16b, v0.16b
382     addp        v2.16b, v3.16b, v2.16b
383     cmhi        v4.16b, v4.16b, v31.16b
384     addp        v0.16b, v2.16b, v0.16b
385     shrn        v4.8b,  v4.8h,  #4
386     addp        v0.16b, v0.16b, v0.16b
387     fmov        x2,  d4
388     fmov        x1,  d0
389     cbnz        x2,  9f
390     mvn         x1,  x1
391     mov         w0,  #0
392     cbz         x1,  0f
393     movrel      x5,  X(x264_decimate_table8)
394 1:
395     clz         x3,  x1
396     lsl         x1,  x1,  x3
397     ldrb        w7,  [x5, x3]
398     lsl         x1,  x1,  #1
399     add         w0,  w0,  w7
400     cbnz        x1,  1b
401     ret
402 9:
403     mov         w0,  #9
404 0:
405     ret
406 endfunc
407
408 // int coeff_last( int16_t *l )
409 function x264_coeff_last4_aarch64, export=1
410     ldr         x2,  [x0]
411     mov         w4,  #3
412     clz         x0,  x2
413     sub         w0,  w4,  w0, lsr #4
414     ret
415 endfunc
416
417 function x264_coeff_last8_aarch64, export=1
418     ldr         x3,  [x0, #8]
419     mov         w4,  #7
420     clz         x2,  x3
421     cmp         w2,  #64
422     b.ne        1f
423     ldr         x3,  [x0]
424     sub         w4,  w4,  #4
425     clz         x2,  x3
426 1:
427     sub         w0,  w4,  w2, lsr #4
428     ret
429 endfunc
430
431 .macro COEFF_LAST_1x size
432 function x264_coeff_last\size\()_neon, export=1
433 .if \size == 15
434     sub         x0,  x0,  #2
435 .endif
436     ld1        {v0.8h,v1.8h}, [x0]
437     uqxtn       v0.8b,  v0.8h
438     uqxtn2      v0.16b, v1.8h
439     cmtst       v0.16b, v0.16b, v0.16b
440     shrn        v0.8b,  v0.8h,  #4
441     fmov        x1,  d0
442     mov         w3,  #\size - 1
443     clz         x2,  x1
444     sub         w0,  w3,  w2, lsr #2
445     ret
446 endfunc
447 .endm
448
449 COEFF_LAST_1x 15
450 COEFF_LAST_1x 16
451
452 function x264_coeff_last64_neon, export=1
453     ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
454     movi        v31.8h,  #8
455     movi        v30.8h,  #1
456     uqxtn       v0.8b,  v0.8h
457     uqxtn2      v0.16b, v1.8h
458     ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
459     uqxtn       v1.8b,  v2.8h
460     uqxtn2      v1.16b, v3.8h
461     uqxtn       v2.8b,  v4.8h
462     uqxtn2      v2.16b, v5.8h
463     uqxtn       v3.8b,  v6.8h
464     uqxtn2      v3.16b, v7.8h
465
466     cmtst       v0.16b, v0.16b, v0.16b
467     cmtst       v1.16b, v1.16b, v1.16b
468     cmtst       v2.16b, v2.16b, v2.16b
469     cmtst       v3.16b, v3.16b, v3.16b
470
471     shrn        v0.8b,  v0.8h,  #4
472     shrn2       v0.16b, v1.8h,  #4
473     shrn        v1.8b,  v2.8h,  #4
474     shrn2       v1.16b, v3.8h,  #4
475
476     clz         v0.4s,  v0.4s
477     clz         v1.4s,  v1.4s
478
479     shrn        v0.4h,  v0.4s,  #2
480     shrn2       v0.8h,  v1.4s,  #2
481
482     sub         v0.8h,  v31.8h,  v0.8h
483     sshl        v0.8h,  v30.8h,  v0.8h
484     shrn        v0.8b,  v0.8h,  #1
485
486     fmov        x2,  d0
487     mov         w3,  #63
488     clz         x2,  x2
489     sub         w0,  w3,  w2
490     ret
491 endfunc
492
493 .macro coeff_level_run_start size
494     add         x6,  x1,  #23            // runlevel->mask
495     mov         w7,  #0
496     mov         w8,  #0
497     mov         w9,  #1
498     and         x6,  x6,  #~15
499     mov         w4,  #\size - 1
500 .endm
501
502 .macro coeff_level_run shift
503     clz         x3,  x2
504     subs        w4,  w4,  w3, lsr #\shift
505     str         w4,  [x1], #4
506 1:
507     ldrh        w5,  [x0, x4, lsl #1]
508     strh        w5,  [x6], #2
509     add         w7,  w7,  #1
510     lsl         w10, w9, w4
511     orr         w8,  w8,  w10
512     b.le        2f
513     add         w3,  w3,  #1 << \shift
514     sub         w4,  w4,  #1
515     and         x3,  x3,  #~((1 << \shift) - 1)
516     lsl         x2,  x2,  x3
517     clz         x3,  x2
518     subs        w4,  w4,  w3, lsr #\shift
519     b.ge        1b
520 2:
521     str         w8,  [x1]
522     mov         w0,  w7
523 .endm
524
525 function x264_coeff_level_run4_aarch64, export=1
526     ldr         x2,  [x0]
527
528     coeff_level_run_start 4
529
530     coeff_level_run 4
531
532     ret
533 endfunc
534
535 .macro X264_COEFF_LEVEL_RUN size
536 function x264_coeff_level_run\size\()_neon, export=1
537 .if \size == 15
538     sub         x0,  x0,  #2
539 .endif
540 .if         \size < 15
541     ld1         {v0.8h}, [x0]
542     uqxtn       v0.8b,  v0.8h
543     cmtst       v0.8b,  v0.8b,  v0.8b
544 .else
545     ld1         {v0.8h,v1.8h}, [x0]
546     uqxtn       v0.8b,  v0.8h
547     uqxtn2      v0.16b, v1.8h
548     cmtst       v0.16b, v0.16b, v0.16b
549     shrn        v0.8b,  v0.8h,  #4
550 .endif
551     fmov        x2,  d0
552 .if \size == 15
553     add         x0,  x0,  #2
554 .endif
555
556     coeff_level_run_start \size
557
558     coeff_level_run (4 - (\size + 1) / 8)
559
560     ret
561 endfunc
562 .endm
563
564 X264_COEFF_LEVEL_RUN 8
565 X264_COEFF_LEVEL_RUN 15
566 X264_COEFF_LEVEL_RUN 16
567
568 function x264_denoise_dct_neon, export=1
569 1:  subs        w3,  w3,  #16
570     ld1         {v0.8h,v1.8h}, [x0]
571     ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
572     abs         v16.8h,  v0.8h
573     abs         v17.8h,  v1.8h
574     ld1         {v2.8h,v3.8h}, [x2], #32
575     cmlt        v18.8h,  v0.8h,   #0
576     cmlt        v19.8h,  v1.8h,   #0
577     uaddw       v4.4s,   v4.4s,   v16.4h
578     uaddw2      v5.4s,   v5.4s,   v16.8h
579     uqsub       v20.8h,  v16.8h,  v2.8h
580     uqsub       v21.8h,  v17.8h,  v3.8h
581     uaddw       v6.4s,   v6.4s,   v17.4h
582     uaddw2      v7.4s,   v7.4s,   v17.8h
583     neg         v22.8h,  v20.8h
584     neg         v23.8h,  v21.8h
585     bsl         v18.16b, v22.16b, v20.16b
586     bsl         v19.16b, v23.16b, v21.16b
587     st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
588     st1         {v18.8h,v19.8h}, [x0], #32
589     b.gt        1b
590     ret
591 endfunc