/****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2015 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

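// QUANT_TWO quantizes 16 coefficients at a time; v16/v17 must hold the
// original coefficients and v18/v19 their absolute values. A rough C sketch
// of what each lane computes (a hedged reading of the code below, not a
// definitive spec):
//
//     int q = ((abs(dct[i]) + bias[i]) * mf[i]) >> 16;
//     dct[i] = dct[i] < 0 ? -q : q;
//
// The results are ORed into \mask (used later for the nonzero check) and
// stored back through x0, which is advanced by 32 bytes.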
.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
    add         v18.8h, v18.8h, \bias0
    add         v19.8h, v19.8h, \bias1
    umull       v20.4s, v18.4h, \mf0_1\().4h
    umull2      v21.4s, v18.8h, \mf0_1\().8h
    umull       v22.4s, v19.4h, \mf2_3\().4h
    umull2      v23.4s, v19.8h, \mf2_3\().8h
    sshr        v16.8h, v16.8h, #15
    sshr        v17.8h, v17.8h, #15
    shrn        v18.4h, v20.4s, #16
    shrn2       v18.8h, v21.4s, #16
    shrn        v19.4h, v22.4s, #16
    shrn2       v19.8h, v23.4s, #16
    eor         v18.16b, v18.16b, v16.16b
    eor         v19.16b, v19.16b, v17.16b
    sub         v18.8h, v18.8h, v16.8h
    sub         v19.8h, v19.8h, v17.8h
    orr         \mask,  v18.16b, v19.16b
    st1        {v18.8h,v19.8h}, [x0], #32
.endm

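// QUANT_END turns the accumulated nonzero summary (passed as a D register)
// into the return value: w0 = 1 if any quantized coefficient is nonzero,
// else 0.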
.macro QUANT_END d
    fmov        x2,  \d
    mov         w0,  #0
    tst         x2,  x2
    cinc        w0,  w0,  ne
    ret
.endm

// quant_2x2_dc( int16_t dct[4], int mf, int bias )
function x264_quant_2x2_dc_neon, export=1
    ld1        {v0.4h}, [x0]
    dup         v2.4h,  w2
    dup         v1.4h,  w1
    abs         v3.4h,  v0.4h
    add         v3.4h,  v3.4h,  v2.4h
    umull       v3.4s,  v3.4h,  v1.4h
    sshr        v0.4h,  v0.4h,  #15
    shrn        v3.4h,  v3.4s,  #16
    eor         v3.8b,  v3.8b,  v0.8b
    sub         v3.4h,  v3.4h,  v0.4h
    st1        {v3.4h}, [x0]
    QUANT_END   d3
endfunc

// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    dup         v0.8h,  w2
    dup         v2.8h,  w1
    QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
    uqxtn       v0.8b,  v0.8h
    QUANT_END   d0
endfunc

// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
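// Unlike the single-block variants, this returns a 4-bit mask in w0: bit i is
// set if sub-block i contains a nonzero quantized coefficient (assembled by
// the cinc/lsl sequence at the end of the function).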
function x264_quant_4x4x4_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2]
    ld1        {v2.8h,v3.8h}, [x1]
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v6.16b
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v7.16b
    uqxtn       v4.8b,  v4.8h
    uqxtn       v7.8b,  v7.8h
    uqxtn       v6.8b,  v6.8h
    uqxtn       v5.8b,  v5.8h
    fmov        x7,  d7
    fmov        x6,  d6
    fmov        x5,  d5
    fmov        x4,  d4
    mov         w0,  #0
    tst         x7,  x7
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x6,  x6
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x5,  x5
    cinc        w0,  w0,  ne
    lsl         w0,  w0,  #1
    tst         x4,  x4
    cinc        w0,  w0,  ne
    ret
endfunc

// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon, export=1
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
.rept 3
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v2.8h,v3.8h}, [x1], #32
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    orr         v4.16b, v4.16b, v5.16b
.endr
    uqxtn       v0.8b,  v4.8h
    QUANT_END   d0
endfunc

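// DEQUANT_START splits the QP: i_qbits = i_qp/6 (computed as (i_qp*0x2b)>>8,
// which matches integer division by 6 for the QP range used here) and
// i_mf = i_qp%6. It then points x1 at dequant_mf[i_mf] (or loads the first
// entry directly for the DC case) and sets the flags from i_qbits - offset,
// so callers can branch to the left- or right-shift path.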
.macro DEQUANT_START mf_size offset dc=no
    mov         w3,  #0x2b
    mul         w3,  w3,  w2
    lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
    add         w5,  w3,  w3,  lsl #1
    sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
    lsl         w2,  w2,  #\mf_size
.ifc \dc,no
    add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
.else
    ldr         x1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
.endif
    subs        w3,  w3,  #\offset      // 6 for 8x8
.endm

// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
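// A rough C sketch of the two paths below (a hedged reading of x264's C
// reference, not a definitive spec), with i_qbits = i_qp/6 minus 4 for 4x4
// or 6 for 8x8:
//
//     if( i_qbits >= 0 )
//         dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
//     else
//         dct[i] = (dct[i] * dequant_mf[i_mf][i] + (1 << (-i_qbits-1)))
//                  >> (-i_qbits);
//
// The left-shift path multiplies in 16 bits; the right-shift path widens to
// 32 bits so the rounding add and arithmetic shift are done without overflow.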
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         w2,  #4
.endif
    b.lt        dequant_\size\()_rshift

    dup         v31.8h, w3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    sqxtn2      v3.8h,  v19.4s
    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v3.8h
    sshl        v0.8h,  v0.8h,  v31.8h
    sshl        v1.8h,  v1.8h,  v31.8h
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_lshift_loop
.endif
    ret

dequant_\size\()_rshift:
    dup         v31.4s, w3
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s}, [x1], #16
    ld1        {v17.4s}, [x1], #16
    sqxtn       v2.4h,  v16.4s
    ld1        {v18.4s}, [x1], #16
    dup         v16.4s, w5
    sqxtn2      v2.8h,  v17.4s
    ld1        {v19.4s}, [x1], #16
    dup         v17.4s, w5
    sqxtn       v3.4h,  v18.4s
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    sqxtn2      v3.8h,  v19.4s
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v2.4h
    smlal2      v17.4s, v0.8h,  v2.8h
    smlal       v18.4s, v1.4h,  v3.4h
    smlal2      v19.4s, v1.8h,  v3.8h
    sshl        v16.4s, v16.4s, v31.4s
    sshl        v17.4s, v17.4s, v31.4s
    sshl        v18.4s, v18.4s, v31.4s
    sshl        v19.4s, v19.4s, v31.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_rshift_loop
.endif
    ret
endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6

// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
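// DC variant: every coefficient is scaled by the single DC entry
// dequant_mf[i_mf][0] (loaded by DEQUANT_START with dc=yes) instead of a
// per-position table entry; the shift paths mirror the macro above.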
function x264_dequant_4x4_dc_neon, export=1
    DEQUANT_START 6, 6, yes
    b.lt        dequant_4x4_dc_rshift

    lsl         w1,  w1,  w3
    dup         v2.8h,  w1
    ld1        {v0.8h,v1.8h},   [x0]

    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v2.8h
    st1        {v0.8h,v1.8h},   [x0]
    ret

dequant_4x4_dc_rshift:
    dup         v4.8h,  w1
    dup         v3.4s, w3
    neg         w3,  w3
    mov         w5,  #1
    sub         w3,  w3,  #1
    lsl         w5,  w5,  w3

    dup         v16.4s, w5
    dup         v17.4s, w5
    ld1        {v0.8h,v1.8h}, [x0]
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v4.4h
    smlal2      v17.4s, v0.8h,  v4.8h
    smlal       v18.4s, v1.4h,  v4.4h
    smlal2      v19.4s, v1.8h,  v4.8h
    sshl        v16.4s, v16.4s, v3.4s
    sshl        v17.4s, v17.4s, v3.4s
    sshl        v18.4s, v18.4s, v3.4s
    sshl        v19.4s, v19.4s, v3.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0]
    ret
endfunc

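// decimate_score15/16: scores a block for decimation. A hedged sketch of the
// intended behaviour, matching the branches below:
//
//     if any |dct[i]| > 1, return 9 immediately;
//     otherwise sum x264_decimate_table4[run] over the run of zeros that
//     precedes each nonzero coefficient.
//
// After the fmov, the scalar loop walks those runs using 4 bits per
// coefficient in x1.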
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0]
    movrel      x5,  X(x264_decimate_table4)
    movi        v3.16b, #0x01
    sqxtn       v0.8b,  v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0
    cmhi        v2.16b, v2.16b, v3.16b
    shrn        v1.8b,  v1.8h,  #4
    shrn        v2.8b,  v2.8h,  #4
    fmov        x2,  d2
    fmov        x1,  d1
    cbnz        x2,  9f
    mvn         x1,  x1
    mov         w0,  #0
    cbz         x1,  0f
.ifc \size, 15
    lsr         x1,  x1,  #1
.endif
    rbit        x1,  x1
1:
    clz         x3,  x1
    lsr         x6,  x3,  #2
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x6]
    lsl         x1,  x1,  #4
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
.endm

decimate_score_1x 15
decimate_score_1x 16

const mask64, align=6
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
endconst

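// The 64-coefficient variant below uses x264_decimate_table8 and the mask64
// bytes above: ANDing the per-coefficient comparison flags with mask64 gives
// each coefficient a distinct bit, and the addp passes collapse those flags
// into a single 64-bit map with one bit per coefficient.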
function x264_decimate_score64_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0], #32
    ld1        {v2.8h,v3.8h}, [x0], #32
    ld1        {v4.8h,v5.8h}, [x0], #32
    ld1        {v6.8h,v7.8h}, [x0]
    movrel      x6,  mask64
    movi        v31.16b, #0x01
    sqxtn       v16.8b,  v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b,  v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b,  v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b,  v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b, v16.16b
    abs         v5.16b, v17.16b
    abs         v6.16b, v18.16b
    abs         v7.16b, v19.16b
    ld1        {v30.16b}, [x6]
    cmeq        v0.16b, v16.16b, #0
    cmeq        v1.16b, v17.16b, #0
    cmeq        v2.16b, v18.16b, #0
    cmeq        v3.16b, v19.16b, #0
    umax        v4.16b, v4.16b, v5.16b
    umax        v6.16b, v6.16b, v7.16b
    and         v0.16b, v0.16b, v30.16b
    and         v1.16b, v1.16b, v30.16b
    and         v2.16b, v2.16b, v30.16b
    and         v3.16b, v3.16b, v30.16b
    umax        v4.16b, v4.16b, v6.16b
    addp        v0.16b, v1.16b, v0.16b
    addp        v2.16b, v3.16b, v2.16b
    cmhi        v4.16b, v4.16b, v31.16b
    addp        v0.16b, v2.16b, v0.16b
    shrn        v4.8b,  v4.8h,  #4
    addp        v0.16b, v0.16b, v0.16b
    fmov        x2,  d4
    fmov        x1,  d0
    cbnz        x2,  9f
    mvn         x1,  x1
    mov         w0,  #0
    cbz         x1,  0f
    movrel      x5,  X(x264_decimate_table8)
1:
    clz         x3,  x1
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x3]
    lsl         x1,  x1,  #1
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc

// int coeff_last( int16_t *l )
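// coeff_last returns the index of the last nonzero coefficient. For the
// 4-coefficient case this is a single 64-bit load: each coefficient occupies
// 16 bits, so the result is 3 - (clz(packed) >> 4).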
function x264_coeff_last4_aarch64, export=1
    ldr         x2,  [x0]
    mov         w4,  #3
    clz         x0,  x2
    sub         w0,  w4,  w0, lsr #4
    ret
endfunc

function x264_coeff_last8_aarch64, export=1
    ldr         x3,  [x0, #8]
    mov         w4,  #7
    clz         x2,  x3
    cmp         w2,  #64
    b.ne        1f
    ldr         x3,  [x0]
    sub         w4,  w4,  #4
    clz         x2,  x3
1:
    sub         w0,  w4,  w2, lsr #4
    ret
endfunc

.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
    ld1        {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b,  v0.8h,  #4
    fmov        x1,  d0
    mov         w3,  #\size - 1
    clz         x2,  x1
    sub         w0,  w3,  w2, lsr #2
    ret
endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16

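// coeff_last64 reduces each group of eight coefficients to one byte whose set
// bit marks the last nonzero coefficient within that group; a scalar clz on
// the packed 64-bit result then yields the overall index as 63 - clz.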
function x264_coeff_last64_neon, export=1
    ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
    movi        v31.8h,  #8
    movi        v30.8h,  #1
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64
    uqxtn       v1.8b,  v2.8h
    uqxtn2      v1.16b, v3.8h
    uqxtn       v2.8b,  v4.8h
    uqxtn2      v2.16b, v5.8h
    uqxtn       v3.8b,  v6.8h
    uqxtn2      v3.16b, v7.8h

    cmtst       v0.16b, v0.16b, v0.16b
    cmtst       v1.16b, v1.16b, v1.16b
    cmtst       v2.16b, v2.16b, v2.16b
    cmtst       v3.16b, v3.16b, v3.16b

    shrn        v0.8b,  v0.8h,  #4
    shrn2       v0.16b, v1.8h,  #4
    shrn        v1.8b,  v2.8h,  #4
    shrn2       v1.16b, v3.8h,  #4

    clz         v0.4s,  v0.4s
    clz         v1.4s,  v1.4s

    shrn        v0.4h,  v0.4s,  #2
    shrn2       v0.8h,  v1.4s,  #2

    sub         v0.8h,  v31.8h,  v0.8h
    sshl        v0.8h,  v30.8h,  v0.8h
    shrn        v0.8b,  v0.8h,  #1

    fmov        x2,  d0
    mov         w3,  #63
    clz         x2,  x2
    sub         w0,  w3,  w2
    ret
endfunc

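// coeff_level_run fills an x264_run_level_t: the index of the last nonzero
// coefficient, the nonzero coefficients themselves (written from that index
// downwards), and a bitmask of their positions; the return value in w0 is the
// number of nonzero coefficients. The #23/&~15 arithmetic locates the
// 16-byte-aligned level array inside the struct.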
.macro coeff_level_run_start size
    add         x6,  x1,  #23            // runlevel->level (16-byte aligned)
    mov         w7,  #0
    mov         w8,  #0
    mov         w9,  #1
    and         x6,  x6,  #~15
    mov         w4,  #\size - 1
.endm

.macro coeff_level_run shift
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    str         w4,  [x1], #4
1:
    ldrh        w5,  [x0, x4, lsl #1]
    strh        w5,  [x6], #2
    add         w7,  w7,  #1
    lsl         w10, w9, w4
    orr         w8,  w8,  w10
    b.le        2f
    add         w3,  w3,  #1 << \shift
    sub         w4,  w4,  #1
    and         x3,  x3,  #~((1 << \shift) - 1)
    lsl         x2,  x2,  x3
    clz         x3,  x2
    subs        w4,  w4,  w3, lsr #\shift
    b.ge        1b
2:
    str         w8,  [x1]
    mov         w0,  w7
.endm

function x264_coeff_level_run4_aarch64, export=1
    ldr         x2,  [x0]

    coeff_level_run_start 4

    coeff_level_run 4

    ret
endfunc

.macro X264_COEFF_LEVEL_RUN size
function x264_coeff_level_run\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
.if         \size < 15
    ld1         {v0.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    cmtst       v0.8b,  v0.8b,  v0.8b
.else
    ld1         {v0.8h,v1.8h}, [x0]
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b
    shrn        v0.8b,  v0.8h,  #4
.endif
    fmov        x2,  d0
.if \size == 15
    add         x0,  x0,  #2
.endif

    coeff_level_run_start \size

    coeff_level_run (4 - (\size + 1) / 8)

    ret
endfunc
.endm

X264_COEFF_LEVEL_RUN 8
X264_COEFF_LEVEL_RUN 15
X264_COEFF_LEVEL_RUN 16

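// denoise_dct( dct, sum, offset, size ): a rough C sketch of each iteration,
// as implemented below (a hedged reading following x264's C reference):
//
//     sum[i] += abs(dct[i]);
//     dct[i]  = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0);
//
// The uqsub provides the saturating max(.,0); cmlt/neg/bsl restore the sign.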
function x264_denoise_dct_neon, export=1
1:  subs        w3,  w3,  #16
    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]
    abs         v16.8h,  v0.8h
    abs         v17.8h,  v1.8h
    ld1         {v2.8h,v3.8h}, [x2], #32
    cmlt        v18.8h,  v0.8h,   #0
    cmlt        v19.8h,  v1.8h,   #0
    uaddw       v4.4s,   v4.4s,   v16.4h
    uaddw2      v5.4s,   v5.4s,   v16.8h
    uqsub       v20.8h,  v16.8h,  v2.8h
    uqsub       v21.8h,  v17.8h,  v3.8h
    uaddw       v6.4s,   v6.4s,   v17.4h
    uaddw2      v7.4s,   v7.4s,   v17.8h
    neg         v22.8h,  v20.8h
    neg         v23.8h,  v21.8h
    bsl         v18.16b, v22.16b, v20.16b
    bsl         v19.16b, v23.16b, v21.16b
    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64
    st1         {v18.8h,v19.8h}, [x0], #32
    b.gt        1b
    ret
endfunc