]> git.sesse.net Git - x264/blob - common/aarch64/quant-a.S
aarch64: NEON asm for decimate_score
[x264] / common / aarch64 / quant-a.S
1 /****************************************************************************
2  * quant.S: arm quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2009-2014 x264 project
5  *
6  * Authors: David Conrad <lessen42@gmail.com>
7  *          Janne Grunau <janne-x264@jannau.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26
27 #include "asm.S"
28
// QUANT_TWO: quantize 16 coefficients held in two vectors and store them.
//
// In:   v16, v17        signed input coefficients (only their sign is used)
//       v18, v19        absolute values of v16, v17
//       \bias0, \bias1  bias operands, given with arrangement (e.g. v0.8h)
//       \mf0_1, \mf2_3  multiplier registers, given bare (e.g. v2)
//       x0              store address, post-incremented by 32 bytes
// Out:  \mask           OR of both result vectors: non-zero iff any of the
//                       16 quantized coefficients is non-zero
// Clobbers: v16-v23
.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
    add         v18.8h, v18.8h, \bias0
    add         v19.8h, v19.8h, \bias1
    // (|coef| + bias) * mf as 32-bit products
    umull       v20.4s, v18.4h, \mf0_1\().4h
    umull2      v21.4s, v18.8h, \mf0_1\().8h
    umull       v22.4s, v19.4h, \mf2_3\().4h
    umull2      v23.4s, v19.8h, \mf2_3\().8h
    // all-ones lanes where the input coefficient was negative
    cmlt        v16.8h, v16.8h, #0
    cmlt        v17.8h, v17.8h, #0
    // first vector: keep the high 16 bits of each product, then restore the sign
    shrn        v18.4h, v20.4s, #16
    shrn2       v18.8h, v21.4s, #16
    eor         v18.16b, v18.16b, v16.16b
    sub         v18.8h, v18.8h, v16.8h
    // second vector: same steps
    shrn        v19.4h, v22.4s, #16
    shrn2       v19.8h, v23.4s, #16
    eor         v19.16b, v19.16b, v17.16b
    sub         v19.8h, v19.8h, v17.8h
    orr         \mask,  v18.16b, v19.16b
    st1        {v18.8h,v19.8h}, [x0], #32
.endm
49
// QUANT_END: return 1 in w0 if the 64-bit SIMD scalar \d is non-zero, else 0.
// Ends the function (contains the ret). Clobbers x2 and the flags.
.macro QUANT_END d
    fmov        x2,  \d
    tst         x2,  x2
    cset        w0,  ne
    ret
.endm
57
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
// In:  x0 = dct (quantized in place), w1 = mf, w2 = bias
// Out: w0 = 1 if any quantized coefficient is non-zero, else 0
function x264_quant_2x2_dc_neon, export=1
    dup         v1.4h,  w1              // mf in every lane
    dup         v2.4h,  w2              // bias in every lane
    ld1        {v0.4h}, [x0]
    abs         v3.4h,  v0.4h
    cmlt        v0.4h,  v0.4h,  #0      // sign mask of the inputs
    add         v3.4h,  v3.4h,  v2.4h
    umull       v3.4s,  v3.4h,  v1.4h
    shrn        v3.4h,  v3.4s,  #16     // ((|coef| + bias) * mf) >> 16
    eor         v3.8b,  v3.8b,  v0.8b   // conditional negate: (x ^ s) - s
    sub         v3.4h,  v3.4h,  v0.4h
    st1        {v3.4h}, [x0]
    QUANT_END   d3
endfunc
73
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
// In:  x0 = dct (quantized in place), w1 = mf, w2 = bias
// Out: w0 = 1 if any quantized coefficient is non-zero, else 0
function x264_quant_4x4_dc_neon, export=1
    dup         v2.8h,  w1              // single mf for all coefficients
    dup         v0.8h,  w2              // single bias for all coefficients
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    // v0 is consumed as bias and then reused for the non-zero mask
    QUANT_TWO   v0.8h,  v0.8h,  v2,  v2,  v0.16b
    uqxtn       v0.8b,  v0.8h           // squeeze the mask into 64 bits
    QUANT_END   d0
endfunc
85
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// In:  x0 = dct (quantized in place), x1 = mf, x2 = bias
// Out: w0 = 1 if any quantized coefficient is non-zero, else 0
function x264_quant_4x4_neon, export=1
    ld1        {v2.8h,v3.8h}, [x1]      // per-coefficient mf
    ld1        {v0.8h,v1.8h}, [x2]      // per-coefficient bias
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h,  v16.8h
    abs         v19.8h,  v17.8h
    // v0 is consumed as bias and then reused for the non-zero mask
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v0.16b
    uqxtn       v0.8b,  v0.8h           // squeeze the mask into 64 bits
    QUANT_END   d0
endfunc
97
// Quantize the 4x4 block at x0 with bias in v0/v1 and mf in v2/v3.
// QUANT_TWO advances x0 to the next block; \mask (bare register name)
// receives the OR of the block's quantized coefficients.
.macro QUANT_4x4x4_BLOCK mask
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  \mask\().16b
.endm

// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// In:  x0 = dct (quantized in place), x1 = mf, x2 = bias
// Out: w0 = 4-bit mask, bit i set iff block i has a non-zero coefficient
function x264_quant_4x4x4_neon, export=1
    ld1        {v2.8h,v3.8h}, [x1]
    ld1        {v0.8h,v1.8h}, [x2]
    QUANT_4x4x4_BLOCK v4
    QUANT_4x4x4_BLOCK v5
    QUANT_4x4x4_BLOCK v6
    QUANT_4x4x4_BLOCK v7
    // narrow each 128-bit mask to 64 bits so it fits a general register
    uqxtn       v4.8b,  v4.8h
    uqxtn       v5.8b,  v5.8h
    uqxtn       v6.8b,  v6.8h
    uqxtn       v7.8b,  v7.8h
    fmov        x4,  d4
    fmov        x5,  d5
    fmov        x6,  d6
    fmov        x7,  d7
    // assemble the result from the last block (bit 3) down to the first (bit 0)
    tst         x7,  x7
    cset        w0,  ne
    tst         x6,  x6
    cset        w6,  ne
    orr         w0,  w6,  w0,  lsl #1
    tst         x5,  x5
    cset        w5,  ne
    orr         w0,  w5,  w0,  lsl #1
    tst         x4,  x4
    cset        w4,  ne
    orr         w0,  w4,  w0,  lsl #1
    ret
endfunc
140
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// In:  x0 = dct (quantized in place), x1 = mf, x2 = bias
// Out: w0 = 1 if any quantized coefficient is non-zero, else 0
// The 64 coefficients are handled as four groups of 16; x0 is advanced by
// QUANT_TWO, x1 and x2 by the post-indexed loads.
function x264_quant_8x8_neon, export=1
    ld1        {v2.8h,v3.8h}, [x1], #32
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v4.16b
.rept 3
    ld1        {v2.8h,v3.8h}, [x1], #32
    ld1        {v0.8h,v1.8h}, [x2], #32
    ld1        {v16.8h,v17.8h}, [x0]
    abs         v18.8h, v16.8h
    abs         v19.8h, v17.8h
    QUANT_TWO   v0.8h,  v1.8h,  v2,  v3,  v5.16b
    orr         v4.16b, v5.16b, v4.16b  // accumulate the non-zero mask in v4
.endr
    uqxtn       v0.8b,  v4.8h
    QUANT_END   d0
endfunc
161
// DEQUANT_START: common prologue of the dequant functions.
//
// In:   x1 = dequant_mf table, w2 = i_qp
//       \mf_size  log2 of the byte size of one dequant_mf[i_mf] row
//       \offset   value subtracted from i_qbits to get the final shift
//       \dc       "no": x1 is advanced to the row dequant_mf[i_mf]
//                 anything else: w1 is loaded with the row's first element
// Out:  w3 = i_qp / 6 - \offset, with the flags set by that subtraction
//            (callers branch on b.lt to pick the right-shift path)
//       w2 = byte offset of the selected row, w5 = scratch
.macro DEQUANT_START mf_size offset dc=no
    // 0x2b/256 approximates 1/6; the result is exact for i_qp < 128
    mov         w3,  #0x2b
    mul         w3,  w3,  w2
    lsr         w3,  w3,  #8            // i_qbits = i_qp / 6
    add         w5,  w3,  w3,  lsl #1   // 3 * i_qbits
    sub         w2,  w2,  w5,  lsl #1   // i_mf = i_qp % 6
    lsl         w2,  w2,  #\mf_size
.ifc \dc,no
    add         x1,  x1,  w2, sxtw      // dequant_mf[i_mf]
.else
    // The table holds 32-bit ints, so load exactly one element. A 64-bit
    // load would also fetch the neighbouring element and, on a big-endian
    // target, leave the wrong one in w1.
    ldr         w1, [x1,  w2, sxtw]     // dequant_mf[i_mf][0][0]
.endif
    subs        w3,  w3,  #\offset      // 6 for 8x8
.endm
176
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
//
// \size selects the variant (4x4 or 8x8); \bits is the shift offset handed
// to DEQUANT_START. Coefficients are processed 16 at a time; the 8x8 variant
// loops four times using w2 as the counter.
// In:  x0 = dct (dequantized in place), x1 = dequant_mf, w2 = i_qp
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
    DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
    mov         w2,  #4                 // loop count; mov keeps the flags from DEQUANT_START
.endif
    b.lt        dequant_\size\()_rshift

    // shift >= 0: dct = (dct * mf) << shift, computed in 16 bits
    dup         v31.8h, w3
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
    ld1        {v0.8h,v1.8h}, [x0]
    sqxtn       v2.4h,  v16.4s          // narrow the 32-bit mf values to 16 bits
    sqxtn2      v2.8h,  v17.4s
    sqxtn       v3.4h,  v18.4s
    sqxtn2      v3.8h,  v19.4s
    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v3.8h
    sshl        v0.8h,  v0.8h,  v31.8h
    sshl        v1.8h,  v1.8h,  v31.8h
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_lshift_loop
.endif
    ret

    // shift < 0: dct = (dct * mf + round) >> -shift, computed in 32 bits
dequant_\size\()_rshift:
    dup         v31.4s, w3              // negative count: sshl shifts right
    mvn         w3,  w3                 // ~shift == -shift - 1
    mov         w5,  #1
    lsl         w5,  w5,  w3            // round = 1 << (-shift - 1)

.ifc \size, 8x8
dequant_\size\()_rshift_loop:
    subs        w2,  w2,  #1
.endif
    ld1        {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
    ld1        {v0.8h,v1.8h}, [x0]
    sqxtn       v2.4h,  v16.4s
    sqxtn2      v2.8h,  v17.4s
    sqxtn       v3.4h,  v18.4s
    sqxtn2      v3.8h,  v19.4s
    // v16-v19 are free again: seed the accumulators with the rounding term
    dup         v16.4s, w5
    dup         v17.4s, w5
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v2.4h
    smlal2      v17.4s, v0.8h,  v2.8h
    smlal       v18.4s, v1.4h,  v3.4h
    smlal2      v19.4s, v1.8h,  v3.8h
    sshl        v16.4s, v16.4s, v31.4s
    sshl        v17.4s, v17.4s, v31.4s
    sshl        v18.4s, v18.4s, v31.4s
    sshl        v19.4s, v19.4s, v31.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0], #32
.ifc \size, 8x8
    b.gt        dequant_\size\()_rshift_loop
.endif
    ret
endfunc
.endm

DEQUANT 4x4, 4
DEQUANT 8x8, 6
258
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// In:  x0 = dct (dequantized in place), x1 = dequant_mf, w2 = i_qp
// All 16 coefficients are scaled by the single value dequant_mf[i_mf][0][0],
// which DEQUANT_START leaves in w1 together with shift = i_qp / 6 - 6 in w3.
function x264_dequant_4x4_dc_neon, export=1
    DEQUANT_START 6, 6, yes
    b.lt        dequant_4x4_dc_rshift

    // shift >= 0: fold the shift into the multiplier, then multiply in 16 bits
    ld1        {v0.8h,v1.8h},   [x0]
    lsl         w1,  w1,  w3
    dup         v2.8h,  w1
    mul         v0.8h,  v0.8h,  v2.8h
    mul         v1.8h,  v1.8h,  v2.8h
    st1        {v0.8h,v1.8h},   [x0]
    ret

    // shift < 0: dct = (dct * mf + round) >> -shift, computed in 32 bits
dequant_4x4_dc_rshift:
    dup         v3.4s, w3               // negative count: sshl shifts right
    dup         v4.8h,  w1
    mvn         w3,  w3                 // ~shift == -shift - 1
    mov         w5,  #1
    lsl         w5,  w5,  w3            // round = 1 << (-shift - 1)

    ld1        {v0.8h,v1.8h}, [x0]
    dup         v16.4s, w5
    dup         v17.4s, w5
    dup         v18.4s, w5
    dup         v19.4s, w5

    smlal       v16.4s, v0.4h,  v4.4h
    smlal2      v17.4s, v0.8h,  v4.8h
    smlal       v18.4s, v1.4h,  v4.4h
    smlal2      v19.4s, v1.8h,  v4.8h
    sshl        v16.4s, v16.4s, v3.4s
    sshl        v17.4s, v17.4s, v3.4s
    sshl        v18.4s, v18.4s, v3.4s
    sshl        v19.4s, v19.4s, v3.4s

    sqxtn       v0.4h,  v16.4s
    sqxtn2      v0.8h,  v17.4s
    sqxtn       v1.4h,  v18.4s
    sqxtn2      v1.8h,  v19.4s
    st1        {v0.8h,v1.8h}, [x0]
    ret
endfunc
303
// decimate_score15 / decimate_score16
// In:  x0 = int16_t coefficients; 16 values are loaded from x0 in both variants
// Out: w0 = 9 if any coefficient has an absolute value above 1,
//           0 if all coefficients are zero,
//           otherwise the sum of x264_decimate_table4[run] over the non-zero
//           coefficients, where run is the number of zeros directly before
//           each one.
// In the 15 variant the first loaded coefficient does not count towards the
// first run. NOTE(review): it still takes part in the "> 1" and non-zero
// tests, so callers presumably pass it as zero - confirm against the C caller.
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0]
    movrel      x5,  X(x264_decimate_table4)
    movi        v3.16b, #0x01
    sqxtn       v0.8b,  v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0      // 0xff where the coefficient is zero
    cmhi        v2.16b, v2.16b, v3.16b  // 0xff where |coefficient| > 1
    shrn        v1.8b,  v1.8h,  #4      // compress both masks to 4 bits per coefficient
    shrn        v2.8b,  v2.8h,  #4
    fmov        x2,  d2
    fmov        x1,  d1
    cbnz        x2,  9f
    mvn         x1,  x1                 // nibble = 0xf where the coefficient is non-zero
    mov         w0,  #0
    cbz         x1,  0f
.ifc \size, 15
    // After the rbit below this makes the first clz come out as 4*i - 1,
    // so clz >> 2 yields i - 1: the first coefficient is left out of the run.
    lsr         x1,  x1,  #1
.endif
    rbit        x1,  x1                 // first coefficient now in the top bits
1:
    // x1 is non-zero here, so after shifting by clz its top bit is set
    clz         x3,  x1
    lsr         x6,  x3,  #2            // run length = leading zero nibbles
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x6]
    lsl         x1,  x1,  #4            // drop the coefficient just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
.endm

decimate_score_1x 15
decimate_score_1x 16
348
// One bit per byte lane, highest bit first; repeated for both register halves.
const mask64, align=6
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
endconst

// decimate_score64
// In:  x0 = 64 int16_t coefficients
// Out: w0 = 9 if any coefficient has an absolute value above 1,
//           0 if all coefficients are zero,
//           otherwise the sum of x264_decimate_table8[run] over the non-zero
//           coefficients, where run is the number of zeros directly before
//           each one.
function x264_decimate_score64_neon, export=1
    ld1        {v0.8h,v1.8h}, [x0], #32
    ld1        {v2.8h,v3.8h}, [x0], #32
    ld1        {v4.8h,v5.8h}, [x0], #32
    ld1        {v6.8h,v7.8h}, [x0]
    movrel      x6,  mask64
    movi        v31.16b, #0x01
    // Narrow to bytes with the two halves of each pair swapped, so that the
    // addp reduction below leaves the first coefficient in the top bit of x1.
    sqxtn       v16.8b,  v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b,  v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b,  v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b,  v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b, v16.16b
    abs         v5.16b, v17.16b
    abs         v6.16b, v18.16b
    abs         v7.16b, v19.16b
    ld1        {v30.16b}, [x6]
    cmeq        v0.16b, v16.16b, #0     // 0xff where the coefficient is zero
    cmeq        v1.16b, v17.16b, #0
    cmeq        v2.16b, v18.16b, #0
    cmeq        v3.16b, v19.16b, #0
    umax        v4.16b, v4.16b, v5.16b  // lane-wise max of |coefficient|
    umax        v6.16b, v6.16b, v7.16b
    and         v0.16b, v0.16b, v30.16b // keep one distinct bit per lane
    and         v1.16b, v1.16b, v30.16b
    and         v2.16b, v2.16b, v30.16b
    and         v3.16b, v3.16b, v30.16b
    umax        v4.16b, v4.16b, v6.16b
    addp        v0.16b, v1.16b, v0.16b  // sum the bits of 8 lanes into one byte
    addp        v2.16b, v3.16b, v2.16b
    cmhi        v4.16b, v4.16b, v31.16b // 0xff where some |coefficient| > 1
    addp        v0.16b, v2.16b, v0.16b
    shrn        v4.8b,  v4.8h,  #4
    addp        v0.16b, v0.16b, v0.16b
    fmov        x2,  d4
    fmov        x1,  d0                 // one bit per coefficient, set where zero
    cbnz        x2,  9f
    mvn         x1,  x1                 // set where non-zero
    mov         w0,  #0
    cbz         x1,  0f
    movrel      x5,  X(x264_decimate_table8)
1:
    // x1 is non-zero here, so after shifting by clz its top bit is set
    clz         x3,  x1                 // run length = leading zero bits
    lsl         x1,  x1,  x3
    ldrb        w7,  [x5, x3]
    lsl         x1,  x1,  #1            // drop the coefficient just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
415
// int coeff_last4( int16_t *l )
// In:  x0 = l
// Out: w0 = 3 - (leading zero bits of the 64-bit word at l) / 16,
//           which is -1 when the word is zero.
// With a little-endian load l[3] sits in the top 16 bits, so this is the
// index of the last non-zero coefficient.
function x264_coeff_last4_aarch64, export=1
    ldr         x2,  [x0]               // all four coefficients in one register
    clz         x2,  x2
    mov         w0,  #3
    sub         w0,  w0,  w2, lsr #4    // 16 bits per coefficient
    ret
endfunc
424
// int coeff_last8( int16_t *l )
// In:  x0 = l
// Out: w0 = index of the last non-zero coefficient (for a little-endian
//           load), or -1 if all eight are zero.
// The upper four coefficients are examined first; the lower four are only
// loaded when the upper ones are all zero.
function x264_coeff_last8_aarch64, export=1
    ldr         x3,  [x0, #8]           // l[4..7]
    mov         w4,  #7
    cbnz        x3,  1f
    ldr         x3,  [x0]               // l[0..3]
    mov         w4,  #3
1:
    clz         x2,  x3
    sub         w0,  w4,  w2, lsr #4    // 16 bits per coefficient
    ret
endfunc
438
// int coeff_last15( int16_t *l ) / int coeff_last16( int16_t *l )
// In:  x0 = l
// Out: w0 = index of the last non-zero coefficient, -1 if there is none
// The 15 variant loads 16 coefficients starting one element (2 bytes)
// before l and compensates through the smaller base index.
.macro COEFF_LAST_1x size
function x264_coeff_last\size\()_neon, export=1
.if \size == 15
    sub         x0,  x0,  #2
.endif
    ld1        {v0.8h,v1.8h}, [x0]
    mov         w3,  #\size - 1
    uqxtn       v0.8b,  v0.8h           // non-zero stays non-zero after narrowing
    uqxtn2      v0.16b, v1.8h
    cmtst       v0.16b, v0.16b, v0.16b  // 0xff where the coefficient is non-zero
    shrn        v0.8b,  v0.8h,  #4      // 4 mask bits per coefficient
    fmov        x1,  d0
    clz         x2,  x1
    sub         w0,  w3,  w2, lsr #2
    ret
endfunc
.endm

COEFF_LAST_1x 15
COEFF_LAST_1x 16
459
// int coeff_last64( int16_t *l )
// In:  x0 = l (64 coefficients)
// Out: w0 = index of the last non-zero coefficient, -1 if all are zero
function x264_coeff_last64_neon, export=1
    ld1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64
    ld1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0]
    // narrow to one byte per coefficient; non-zero stays non-zero
    uqxtn       v0.8b,  v0.8h
    uqxtn2      v0.16b, v1.8h
    uqxtn       v1.8b,  v2.8h
    uqxtn2      v1.16b, v3.8h
    uqxtn       v2.8b,  v4.8h
    uqxtn2      v2.16b, v5.8h
    uqxtn       v3.8b,  v6.8h
    uqxtn2      v3.16b, v7.8h

    // 0xff where the coefficient is non-zero
    cmtst       v0.16b, v0.16b, v0.16b
    cmtst       v1.16b, v1.16b, v1.16b
    cmtst       v2.16b, v2.16b, v2.16b
    cmtst       v3.16b, v3.16b, v3.16b

    // 4 mask bits per coefficient: each 32-bit lane now covers 8 coefficients
    shrn        v0.8b,  v0.8h,  #4
    shrn2       v0.16b, v1.8h,  #4
    shrn        v1.8b,  v2.8h,  #4
    shrn2       v1.16b, v3.8h,  #4

    // per group of 8: number of trailing zero coefficients (0..8)
    clz         v0.4s,  v0.4s
    clz         v1.4s,  v1.4s
    shrn        v0.4h,  v0.4s,  #2
    shrn2       v0.8h,  v1.4s,  #2

    // per group: one byte with a single bit at the position of the last
    // non-zero coefficient, or 0 if the group is all zero
    movi        v31.8h,  #8
    movi        v30.8h,  #1
    sub         v0.8h,  v31.8h,  v0.8h
    sshl        v0.8h,  v30.8h,  v0.8h
    shrn        v0.8b,  v0.8h,  #1

    fmov        x2,  d0
    clz         x2,  x2
    mov         w3,  #63
    sub         w0,  w3,  w2
    ret
endfunc