1 /*****************************************************************************
2 * predict.S: aarch64 intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
8 * Janne Grunau <janne-x264@jannau.net>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// p8weight: table of eight 16-bit values, 1, 2, 3, 4 repeated twice.
// Declared through the const helper (defined outside this view) with align=4.
// NOTE(review): presumably the multiplier table for the 8x8 chroma plane
// predictor; the instruction that loads it is not visible in this excerpt.
30 const p8weight, align=4
31 .short 1, 2, 3, 4, 1, 2, 3, 4
// p16weight: table of eight 16-bit values, 1 through 8.
// Declared through the const helper (defined outside this view) with align=4.
// NOTE(review): presumably the multiplier table for the 16x16 plane
// predictor; the instruction that loads it is not visible in this excerpt.
33 const p16weight, align=4
34 .short 1, 2, 3, 4, 5, 6, 7, 8
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gathers single bytes from memory into consecutive byte lanes of vd.
// Each load reads one byte at [xn] and then post-increments xn by xm, so the
// address register xn is modified by the macro.
//   n == 8          : both lane groups below are selected (lanes 0-7)
//   n != 8, hi == 0 : only the first group (lanes 0-3)
//   n != 8, hi == 1 : only the second group (lanes 4-7)
// NOTE(review): the .endif/.endm lines closing the conditionals and the macro
// are not visible in this excerpt.
37 .macro ldcol.8 vd, xn, xm, n=8, hi=0
38 .if \n == 8 || \hi == 0
// lanes 0-3
39 ld1 {\vd\().b}[0], [\xn], \xm
40 ld1 {\vd\().b}[1], [\xn], \xm
41 ld1 {\vd\().b}[2], [\xn], \xm
42 ld1 {\vd\().b}[3], [\xn], \xm
44 .if \n == 8 || \hi == 1
// lanes 4-7
45 ld1 {\vd\().b}[4], [\xn], \xm
46 ld1 {\vd\().b}[5], [\xn], \xm
47 ld1 {\vd\().b}[6], [\xn], \xm
48 ld1 {\vd\().b}[7], [\xn], \xm
// ldcol.16 vd, xn, xm
// Gathers single bytes from memory into byte lanes of vd; each load reads one
// byte at [xn] and post-increments xn by xm.
// The visible loads fill lanes 8-15.
// NOTE(review): lanes 0-7 are presumably filled by a line that is not visible
// in this excerpt (likely an ldcol.8 invocation); the closing .endm is not
// visible either.
52 .macro ldcol.16 vd, xn, xm
54 ld1 {\vd\().b}[ 8], [\xn], \xm
55 ld1 {\vd\().b}[ 9], [\xn], \xm
56 ld1 {\vd\().b}[10], [\xn], \xm
57 ld1 {\vd\().b}[11], [\xn], \xm
58 ld1 {\vd\().b}[12], [\xn], \xm
59 ld1 {\vd\().b}[13], [\xn], \xm
60 ld1 {\vd\().b}[14], [\xn], \xm
61 ld1 {\vd\().b}[15], [\xn], \xm
// x264_predict_4x4_h_aarch64
// For each of four rows k = 0..3: reads the byte at [x0 + k*FDEC_STRIDE - 1]
// and writes it, replicated into all four bytes of a 32-bit word, to
// [x0 + k*FDEC_STRIDE].
// In:     x0 = base address of the 4x4 block (FDEC_STRIDE is defined elsewhere)
// Writes: w1-w4 (general registers only, no vector registers)
// NOTE(review): no ret/endfunc is visible in this excerpt.
65 function x264_predict_4x4_h_aarch64, export=1
// One zero-extended byte per row, taken from the column just before the block.
66 ldrb w1, [x0, #0*FDEC_STRIDE-1]
67 ldrb w2, [x0, #1*FDEC_STRIDE-1]
68 ldrb w3, [x0, #2*FDEC_STRIDE-1]
69 ldrb w4, [x0, #3*FDEC_STRIDE-1]
// b + (b << 8): the byte now occupies bits 0-7 and 8-15.
70 add w1, w1, w1, lsl #8
71 add w2, w2, w2, lsl #8
72 add w3, w3, w3, lsl #8
73 add w4, w4, w4, lsl #8
// h + (h << 16): the 16-bit pair is copied to bits 16-31, then stored as a row.
74 add w1, w1, w1, lsl #16
75 str w1, [x0, #0*FDEC_STRIDE]
76 add w2, w2, w2, lsl #16
77 str w2, [x0, #1*FDEC_STRIDE]
78 add w3, w3, w3, lsl #16
79 str w3, [x0, #2*FDEC_STRIDE]
80 add w4, w4, w4, lsl #16
81 str w4, [x0, #3*FDEC_STRIDE]
// x264_predict_4x4_v_aarch64
// Loads the 32-bit word located one FDEC_STRIDE before x0 and stores that same
// word to rows 0..3 of the block at x0.
// In:     x0 = base address of the 4x4 block (FDEC_STRIDE is defined elsewhere)
// Writes: w1
// NOTE(review): no ret/endfunc is visible in this excerpt.
85 function x264_predict_4x4_v_aarch64, export=1
// Four bytes of the row directly above the block.
86 ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
87 str w1, [x0, #0 + 0 * FDEC_STRIDE]
88 str w1, [x0, #0 + 1 * FDEC_STRIDE]
89 str w1, [x0, #0 + 2 * FDEC_STRIDE]
90 str w1, [x0, #0 + 3 * FDEC_STRIDE]
// x264_predict_4x4_dc_neon
// Visible steps: four bytes are broadcast-loaded from [x2] (post-incremented
// by x7) into v1-v4, summed with widening, added to v0, and the total is
// rounding-shifted right by 3 and narrowed to bytes before being stored as
// 32-bit rows at x0 (x0 advances by FDEC_STRIDE per store).
// NOTE(review): the setup of x2 and x7, the load that fills v0, the use of
// x1, the final row store and ret/endfunc are not visible in this excerpt.
// Presumably v0 carries the row above the block and x2 walks the column left
// of it - confirm against the full file.
94 function x264_predict_4x4_dc_neon, export=1
// x1 = address one FDEC_STRIDE before the block.
95 sub x1, x0, #FDEC_STRIDE
// Each ld1r replicates one byte across the register and steps x2 by x7.
99 ld1r {v1.8b}, [x2], x7
100 ld1r {v2.8b}, [x2], x7
101 ld1r {v3.8b}, [x2], x7
102 ld1r {v4.8b}, [x2], x7
// Widen to 16 bits while summing the four broadcast bytes pairwise.
104 uaddl v1.8h, v1.8b, v2.8b
105 uaddl v2.8h, v3.8b, v4.8b
// Pairwise add inside v0 (contents of v0 come from lines not visible here).
106 addp v0.4h, v0.4h, v0.4h
107 add v1.4h, v1.4h, v2.4h
109 add v0.4h, v0.4h, v1.4h
// Rounding narrow: (sum + 4) >> 3 per lane.
110 rshrn v0.8b, v0.8h, #3
111 str s0, [x0], #FDEC_STRIDE
112 str s0, [x0], #FDEC_STRIDE
113 str s0, [x0], #FDEC_STRIDE
// x264_predict_4x4_dc_top_neon
// Visible steps: pairwise-add the 16-bit lanes of v0, rounding-shift right by
// 2 with narrowing to bytes, then store the low 32 bits of v0 as rows at x0
// (x0 advances by FDEC_STRIDE per store).
// NOTE(review): the load that fills v0, any use of x1, the final row store and
// ret/endfunc are not visible in this excerpt.
118 function x264_predict_4x4_dc_top_neon, export=1
// x1 = address one FDEC_STRIDE before the block.
119 sub x1, x0, #FDEC_STRIDE
123 addp v0.4h, v0.4h, v0.4h
// Rounding narrow: (sum + 2) >> 2 per lane.
125 rshrn v0.8b, v0.8h, #2
126 str s0, [x0], #FDEC_STRIDE
127 str s0, [x0], #FDEC_STRIDE
128 str s0, [x0], #FDEC_STRIDE
// x264_predict_4x4_ddr_neon
// Builds three byte vectors a, b, c that are successive one-byte shifts of
// the same neighbour sequence, filters them as (a + 2*b + c + 2) >> 2, and
// stores 4-byte windows of the filtered vector as rows at x0.
// x1 starts at x0 - FDEC_STRIDE - 1 and is post-incremented by x7 on every
// load; the existing per-line comments give the intended source offsets.
// NOTE(review): the setup of x7, the final row store and ret/endfunc are not
// visible in this excerpt.
133 function x264_predict_4x4_ddr_neon, export=1
134 sub x1, x0, #FDEC_STRIDE+1
136 ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
137 ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
138 ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
// ext ..., #7 places one byte of the broadcast register in lane 0 and shifts
// the previous vector up by one lane.
139 ext v0.8b, v1.8b, v0.8b, #7
140 ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
141 ext v0.8b, v2.8b, v0.8b, #7 // a
142 ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
143 ext v1.8b, v3.8b, v0.8b, #7 // b
144 ext v2.8b, v4.8b, v1.8b, #7 // c
// (a + b) + (b + c) in 16-bit lanes, then rounding narrow by 2.
145 uaddl v0.8h, v0.8b, v1.8b
146 uaddl v1.8h, v1.8b, v2.8b
147 add v0.8h, v0.8h, v1.8h
148 rshrn v0.8b, v0.8h, #2
// Windows starting at byte 3, 2 and 1 of the filtered vector.
150 ext v3.8b, v0.8b, v0.8b, #3
151 ext v2.8b, v0.8b, v0.8b, #2
152 ext v1.8b, v0.8b, v0.8b, #1
154 str s3, [x0], #FDEC_STRIDE
155 str s2, [x0], #FDEC_STRIDE
156 str s1, [x0], #FDEC_STRIDE
// x264_predict_4x4_ddl_neon
// Loads 8 bytes from one FDEC_STRIDE before the block, filters each byte with
// its two following neighbours using the halving pair uhadd + urhadd (equal to
// (p0 + 2*p1 + p2 + 2) >> 2), and stores 4-byte windows of the result as rows.
// x0 is moved to the row above, then advanced by x7 by the load and by
// FDEC_STRIDE by each store.
// NOTE(review): the setup of x7 and of v3 (second source of the #2 ext), the
// final row store and ret/endfunc are not visible in this excerpt.
161 function x264_predict_4x4_ddl_neon, export=1
162 sub x0, x0, #FDEC_STRIDE
164 ld1 {v0.8b}, [x0], x7
// v1 = sequence shifted by one byte, v2 = shifted by two (tail from v3).
166 ext v1.8b, v0.8b, v0.8b, #1
167 ext v2.8b, v0.8b, v3.8b, #2
// Truncating average of the outer taps, then rounding average with the centre.
168 uhadd v0.8b, v0.8b, v2.8b
169 urhadd v0.8b, v0.8b, v1.8b
170 str s0, [x0], #FDEC_STRIDE
171 ext v1.8b, v0.8b, v0.8b, #1
172 ext v2.8b, v0.8b, v0.8b, #2
173 str s1, [x0], #FDEC_STRIDE
174 ext v3.8b, v0.8b, v0.8b, #3
175 str s2, [x0], #FDEC_STRIDE
// x264_predict_8x8_dc_neon
// Visible steps: load 16 bytes from [x1] (x1 advances by 16), rotate them by 7
// bytes, add the 16-bit lanes of v1 into v0, rounding-shift right by 4 with
// narrowing to bytes, and store 8 bytes at [x0] (x0 advances by x7).
// NOTE(review): x1 is presumably the pointer to the prepared edge buffer -
// confirm. The setup of x7, the load of v1, the reductions between the ext and
// the add, the broadcast before the shift, the remaining row stores and
// ret/endfunc are not visible in this excerpt.
180 function x264_predict_8x8_dc_neon, export=1
182 ld1 {v0.16b}, [x1], #16
184 ext v0.16b, v0.16b, v0.16b, #7
187 add v0.8h, v0.8h, v1.8h
// Rounding narrow: (sum + 8) >> 4 per lane.
189 rshrn v0.8b, v0.8h, #4
191 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_h_neon
// Visible steps: stores the low 8 bytes of v0..v7 as eight consecutive rows at
// x0, advancing x0 by x7 after each store.
// NOTE(review): the instructions that set x7 and fill v0-v7 are interleaved
// with these stores in lines not visible in this excerpt, as is ret/endfunc.
196 function x264_predict_8x8_h_neon, export=1
201 st1 {v0.8b}, [x0], x7
203 st1 {v1.8b}, [x0], x7
205 st1 {v2.8b}, [x0], x7
207 st1 {v3.8b}, [x0], x7
209 st1 {v4.8b}, [x0], x7
211 st1 {v5.8b}, [x0], x7
213 st1 {v6.8b}, [x0], x7
214 st1 {v7.8b}, [x0], x7
// x264_predict_8x8_v_neon
// Visible step: stores the low 8 bytes of v0 at [x0] and advances x0 by x7.
// NOTE(review): the load of v0, the setup of x7, any repetition of the store
// for the remaining rows and ret/endfunc are not visible in this excerpt.
218 function x264_predict_8x8_v_neon, export=1
223 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddl_neon
// Filters the 16-byte sequence in v0 with its previous/next neighbours using
// the halving pair uhadd + urhadd (equal to (prev + 2*cur + next + 2) >> 2),
// then stores eight 8-byte windows of the result, starting at byte offsets
// 1..8, as successive rows at x0 (x0 advances by x7 per store).
// NOTE(review): the loads of v0, the initial contents of v2/v3 (edge bytes fed
// into the ext instructions), the setup of x7 and ret/endfunc are not visible
// in this excerpt.
228 function x264_predict_8x8_ddl_neon, export=1
// v4 = v0 shifted up one lane (top byte of v3 enters lane 0): previous sample.
234 ext v4.16b, v3.16b, v0.16b, #15
// v2 = v0 shifted down one lane (byte 0 of v2 enters lane 15): next sample.
235 ext v2.16b, v0.16b, v2.16b, #1
236 uhadd v4.16b, v4.16b, v2.16b
237 urhadd v0.16b, v0.16b, v4.16b
// Row k takes filtered bytes k+1 .. k+8; ext and store are interleaved.
238 ext v1.16b, v0.16b, v0.16b, #1
239 ext v2.16b, v0.16b, v0.16b, #2
240 st1 {v1.8b}, [x0], x7
241 ext v3.16b, v0.16b, v0.16b, #3
242 st1 {v2.8b}, [x0], x7
243 ext v4.16b, v0.16b, v0.16b, #4
244 st1 {v3.8b}, [x0], x7
245 ext v5.16b, v0.16b, v0.16b, #5
246 st1 {v4.8b}, [x0], x7
247 ext v6.16b, v0.16b, v0.16b, #6
248 st1 {v5.8b}, [x0], x7
249 ext v7.16b, v0.16b, v0.16b, #7
250 st1 {v6.8b}, [x0], x7
251 ext v0.16b, v0.16b, v0.16b, #8
252 st1 {v7.8b}, [x0], x7
253 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddr_neon
// Loads 32 bytes from [x1], takes the 16-byte windows starting at byte 7, 8
// and 9, and filters them as (w7 + 2*w8 + w9 + 2) >> 2 via uhadd + urhadd.
// Rows are written bottom-up: x0 is moved to row 7 and x7 is set to
// -FDEC_STRIDE, so row 7 receives filtered bytes 0..7, row 6 bytes 1..8, and
// so on up to row 0 with bytes 7..14.
// In:     x0 = base address of the 8x8 block, x1 = address of the 32 source
//         bytes (NOTE(review): presumably the prepared edge buffer - confirm)
// Writes: x0, x7, v0-v7
// NOTE(review): no ret/endfunc is visible in this excerpt.
257 function x264_predict_8x8_ddr_neon, export=1
258 ld1 {v0.16b,v1.16b}, [x1]
// Windows over the v0:v1 byte pair: previous (7), next (9), centre (8).
259 ext v2.16b, v0.16b, v1.16b, #7
260 ext v4.16b, v0.16b, v1.16b, #9
261 ext v3.16b, v0.16b, v1.16b, #8
263 uhadd v2.16b, v2.16b, v4.16b
264 urhadd v7.16b, v3.16b, v2.16b
// Start at the last row and step backwards through the block.
266 add x0, x0, #7*FDEC_STRIDE
267 mov x7, #-1*FDEC_STRIDE
269 ext v6.16b, v7.16b, v7.16b, #1
270 st1 {v7.8b}, [x0], x7
271 ext v5.16b, v7.16b, v7.16b, #2
272 st1 {v6.8b}, [x0], x7
273 ext v4.16b, v7.16b, v7.16b, #3
274 st1 {v5.8b}, [x0], x7
275 ext v3.16b, v7.16b, v7.16b, #4
276 st1 {v4.8b}, [x0], x7
277 ext v2.16b, v7.16b, v7.16b, #5
278 st1 {v3.8b}, [x0], x7
279 ext v1.16b, v7.16b, v7.16b, #6
280 st1 {v2.8b}, [x0], x7
281 ext v0.16b, v7.16b, v7.16b, #7
282 st1 {v1.8b}, [x0], x7
283 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_vl_neon
// From the 16-byte sequence in v0 this computes two filtered vectors:
//   v3 = rounding average of each byte with its next neighbour (2-tap)
//   v0 = (prev + 2*cur + next + 2) >> 2 via uhadd + urhadd (3-tap)
// Rows alternate between the two: even rows take 8-byte windows of v3 at
// offsets 0,1,2,3 and odd rows take windows of the 3-tap vector at offsets
// 1,2,3,4. x0 advances by x7 after each store.
// NOTE(review): the loads of v0, the initial contents of v1/v2 fed into the
// ext instructions, the setup of x7 and ret/endfunc are not visible in this
// excerpt.
287 function x264_predict_8x8_vl_neon, export=1
// v1 = previous sample per lane, v2 = next sample per lane.
292 ext v1.16b, v1.16b, v0.16b, #15
293 ext v2.16b, v0.16b, v2.16b, #1
295 uhadd v1.16b, v1.16b, v2.16b
296 urhadd v3.16b, v0.16b, v2.16b
298 urhadd v0.16b, v0.16b, v1.16b
300 ext v4.16b, v0.16b, v0.16b, #1
301 st1 {v3.8b}, [x0], x7
302 ext v5.16b, v3.16b, v3.16b, #1
303 st1 {v4.8b}, [x0], x7
304 ext v6.16b, v0.16b, v0.16b, #2
305 st1 {v5.8b}, [x0], x7
306 ext v7.16b, v3.16b, v3.16b, #2
307 st1 {v6.8b}, [x0], x7
308 ext v4.16b, v0.16b, v0.16b, #3
309 st1 {v7.8b}, [x0], x7
310 ext v5.16b, v3.16b, v3.16b, #3
311 st1 {v4.8b}, [x0], x7
312 ext v6.16b, v0.16b, v0.16b, #4
313 st1 {v5.8b}, [x0], x7
314 st1 {v6.8b}, [x0], x7
// x264_predict_8x8_vr_neon
// From the 16-byte sequence in v2 this computes a 2-tap rounding average
// (kept in v2) and a 3-tap (a + 2*b + c + 2) >> 2 result (kept in v0) using
// byte rotations of v2 by 14 and 15. The upper halves of both results give
// rows 0 and 1; the low half of the 3-tap result is split into even and odd
// bytes (uzp1/uzp2) and shifted in, one byte per row pair, for rows 2..7.
// x0 advances by x7 after each store.
// NOTE(review): the load of v2, the setup of x7 and ret/endfunc are not
// visible in this excerpt.
318 function x264_predict_8x8_vr_neon, export=1
323 ext v1.16b, v2.16b, v2.16b, #14
324 ext v0.16b, v2.16b, v2.16b, #15
326 uhadd v3.16b, v2.16b, v1.16b
327 urhadd v2.16b, v2.16b, v0.16b
328 urhadd v0.16b, v0.16b, v3.16b
// Bring the upper 8 bytes of each result into the low half; split the low
// half of the 3-tap result into even (v2) and odd (v3) bytes.
330 ext v1.16b, v2.16b, v2.16b, #8
331 uzp1 v2.8b, v0.8b, v0.8b
332 uzp2 v3.8b, v0.8b, v0.8b
333 ext v0.16b, v0.16b, v0.16b, #8
335 st1 {v1.8b}, [x0], x7
336 st1 {v0.8b}, [x0], x7
// Each following row pair shifts in one more byte from v3 / v2 at the front.
337 ext v4.8b, v3.8b, v1.8b, #7
338 ext v5.8b, v2.8b, v0.8b, #7
339 st1 {v4.8b}, [x0], x7
340 st1 {v5.8b}, [x0], x7
341 ext v6.8b, v3.8b, v1.8b, #6
342 ext v7.8b, v2.8b, v0.8b, #6
343 st1 {v6.8b}, [x0], x7
344 st1 {v7.8b}, [x0], x7
345 ext v1.8b, v3.8b, v1.8b, #5
346 ext v0.8b, v2.8b, v0.8b, #5
347 st1 {v1.8b}, [x0], x7
348 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_hd_neon
// From the 16-byte sequence in v1 this computes a 2-tap rounding average (v4)
// and a 3-tap (a + 2*b + c + 2) >> 2 result (v0), interleaves the low 8 bytes
// of both into v16/v17 (zip1/zip2), and keeps the upper 8 bytes of the 3-tap
// result in v7. Rows are 8-byte windows sliding by two bytes per row over the
// byte sequence v16 : v17 : v7, written from the far end (row 0) back to v16
// itself (row 7). x0 advances by x7 after each store.
// NOTE(review): the load of v1, the setup of x7 and ret/endfunc are not
// visible in this excerpt.
352 function x264_predict_8x8_hd_neon, export=1
357 ext v3.16b, v1.16b, v1.16b, #1
358 ext v2.16b, v1.16b, v1.16b, #2
360 urhadd v4.16b, v1.16b, v3.16b
362 uhadd v1.16b, v1.16b, v2.16b
363 urhadd v0.16b, v1.16b, v3.16b
// Interleave 2-tap and 3-tap bytes: v16 = pairs 0-3, v17 = pairs 4-7.
365 zip1 v16.8b, v4.8b, v0.8b
366 zip2 v17.8b, v4.8b, v0.8b
367 ext v7.16b, v0.16b, v0.16b, #8
369 ext v0.8b, v17.8b, v7.8b, #6
370 ext v1.8b, v17.8b, v7.8b, #4
371 st1 {v0.8b}, [x0], x7
372 ext v2.8b, v17.8b, v7.8b, #2
373 st1 {v1.8b}, [x0], x7
374 st1 {v2.8b}, [x0], x7
375 ext v3.8b, v16.8b, v17.8b, #6
376 st1 {v17.8b}, [x0], x7
377 ext v4.8b, v16.8b, v17.8b, #4
378 st1 {v3.8b}, [x0], x7
379 ext v5.8b, v16.8b, v17.8b, #2
380 st1 {v4.8b}, [x0], x7
381 st1 {v5.8b}, [x0], x7
382 st1 {v16.8b}, [x0], x7
// x264_predict_8x8_hu_neon
// From the 8-byte sequence in v7 (continued by v6) this computes a 2-tap
// rounding average (v0) and a 3-tap (a + 2*b + c + 2) >> 2 result (v1), then
// interleaves them into v16/v17. Rows 0..3 are windows of v16:v17 at byte
// offsets 0, 2, 4, 6; the following rows are windows of v17:v18 at the same
// offsets. x0 advances by x7 after each store.
// NOTE(review): the loads/setup of v6, v7 and v18, the setup of x7, the store
// of v6 for the last row and ret/endfunc are not visible in this excerpt.
387 function x264_predict_8x8_hu_neon, export=1
// v2 = sequence shifted by one byte, v4 = shifted by two.
394 ext v4.8b, v7.8b, v6.8b, #2
395 ext v2.8b, v7.8b, v6.8b, #1
397 uhadd v5.8b, v7.8b, v4.8b
398 urhadd v0.8b, v2.8b, v7.8b
399 urhadd v1.8b, v5.8b, v2.8b
401 zip1 v16.8b, v0.8b, v1.8b
402 zip2 v17.8b, v0.8b, v1.8b
406 ext v0.8b, v16.8b, v17.8b, #2
407 ext v1.8b, v16.8b, v17.8b, #4
408 ext v2.8b, v16.8b, v17.8b, #6
409 st1 {v16.8b}, [x0], x7
410 st1 {v0.8b}, [x0], x7
411 st1 {v1.8b}, [x0], x7
412 st1 {v2.8b}, [x0], x7
414 ext v4.8b, v17.8b, v18.8b, #2
415 ext v5.8b, v17.8b, v18.8b, #4
416 ext v6.8b, v17.8b, v18.8b, #6
417 st1 {v17.8b}, [x0], x7
418 st1 {v4.8b}, [x0], x7
419 st1 {v5.8b}, [x0], x7
// x264_predict_8x8c_dc_top_neon
// Visible steps: x2 is set to one FDEC_STRIDE before x0, the 16-bit lanes of
// v0 are pairwise added, the result is rounding-shifted right by 2 with
// narrowing to bytes, and the transpose helper macro (defined outside this
// view) is applied to v0-v3 with 32-bit lanes.
// NOTE(review): the load from x2, the widening before addp, the setup of
// v1-v3, the row stores and ret/endfunc are not visible in this excerpt.
425 function x264_predict_8x8c_dc_top_neon, export=1
426 sub x2, x0, #FDEC_STRIDE
430 addp v0.4h, v0.4h, v0.4h
// Rounding narrow: (sum + 2) >> 2 per lane.
431 rshrn v0.8b, v0.8h, #2
434 transpose v0.2s, v1.2s, v2.2s, v3.2s
// x264_predict_8x8c_dc_left_neon
// Visible steps: reads the byte at [x0 + k*FDEC_STRIDE - 1] for rows k = 0..7
// into w2..w9 (rows 0-3 first, then rows 4-7), and rounding-shifts the 16-bit
// lanes of v0 and v1 right by 2 with narrowing to bytes.
// NOTE(review): the additions that combine w2-w9, the moves into v0/v1, the
// row stores and ret/endfunc are not visible in this excerpt. Presumably v0
// and v1 hold the sums of the upper and lower four bytes - confirm.
438 function x264_predict_8x8c_dc_left_neon, export=1
439 ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
440 ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
441 ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
442 ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
446 ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
447 ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
448 ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
449 ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
// Rounding narrow: (sum + 2) >> 2 per lane.
456 rshrn v0.8b, v0.8h, #2
457 rshrn v1.8b, v1.8h, #2
// x264_predict_8x8c_dc_neon
// Visible steps: after the transpose helper macro (defined outside this view)
// the bytes of v0/v1 are pairwise widened and summed into the partial sums
// labelled s0..s3 by the existing comments. v2 holds those sums rounded and
// shifted by 2, v3 holds pairwise combinations rounded and shifted by 3. Four
// of these bytes are broadcast as dc0..dc3 and packed into two 8-byte rows:
//   v0 = low word of v4 (dc0), then low word of v5 (dc1)
//   v1 = low word of v7 (dc3), then low word of v6 (dc2)
// v0 is stored starting at x0 and v1 starting at x0 + 4*x1, both pointers
// advancing by x1 per store.
// NOTE(review): the loads that fill v0-v3, the setup of x1, the repetition of
// the stores for the remaining rows and ret/endfunc are not visible in this
// excerpt.
461 function x264_predict_8x8c_dc_neon, export=1
462 sub x2, x0, #FDEC_STRIDE
467 transpose v0.2s, v1.2s, v2.2s, v3.2s
468 uaddlp v0.4h, v0.8b // s0, s2
469 uaddlp v1.4h, v1.8b // s1, s3
470 addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
471 addp v1.4h, v0.4h, v0.4h
// Rounding narrow: single sums by 2, combined sums by 3.
472 rshrn v2.8b, v0.8h, #2
473 rshrn v3.8b, v1.8h, #3
474 dup v5.8b, v2.b[2] // dc1
475 dup v6.8b, v3.b[1] // dc2
476 dup v4.8b, v3.b[0] // dc0
477 dup v7.8b, v2.b[3] // dc3
478 trn1 v0.2s, v4.2s, v5.2s
479 trn1 v1.2s, v7.2s, v6.2s
// x2 = address of row 4 (x0 + 4 * x1).
481 add x2, x0, x1, lsl #2
483 st1 {v0.8b}, [x0], x1
484 st1 {v1.8b}, [x2], x1
// x264_predict_8x8c_h_neon
// Visible steps: two bytes are read from [x1] (x1 advances by x7 per load),
// each replicated across 8 lanes, and stored as two 8-byte rows at x0 (x0
// advances by x7 per store).
// NOTE(review): the setup of x1 and x7, the repetition of this load/store
// group for the remaining rows and ret/endfunc are not visible in this
// excerpt. Presumably x1 walks the column left of the block - confirm.
489 function x264_predict_8x8c_h_neon, export=1
493 ld1r {v0.8b}, [x1], x7
494 ld1r {v1.8b}, [x1], x7
495 st1 {v0.8b}, [x0], x7
496 st1 {v1.8b}, [x0], x7
// x264_predict_8x8c_v_neon
// Visible steps: x0 is moved back by FDEC_STRIDE, 8 bytes are loaded from
// there (x0 advances by x7), and the same 8 bytes are stored at the new x0
// (x0 advances by x7 again).
// NOTE(review): the setup of x7, the repetition of the store for the
// remaining rows and ret/endfunc are not visible in this excerpt.
501 function x264_predict_8x8c_v_neon, export=1
502 sub x0, x0, #FDEC_STRIDE
504 ld1 {v0.8b}, [x0], x7
506 st1 {v0.8b}, [x0], x7
// x264_predict_8x8c_p_neon
// Plane-style predictor. Visible steps: neighbour bytes are gathered (one
// 4-byte load from x2, two 4-byte column gathers through ldcol.8 from x3),
// differences are widened and multiplied by the weights in v7, reduced and
// rounding-shifted by 5 into the gradients labelled b and c by the existing
// comments. A start value (pix) is then derived, a first row pix + x*b is
// built in v1, and each row is saturating-shifted right by 5, narrowed to
// unsigned bytes and stored while c is added to v1 for the next row.
// NOTE(review): many setup lines are not visible in this excerpt: the values
// of x1 and x2, the load of v7, several loads and reductions between the
// visible lines, the row loop and ret/endfunc. The statements above describe
// only the visible instructions.
511 function x264_predict_8x8c_p_neon, export=1
512 sub x3, x0, #FDEC_STRIDE
517 ld1 {v2.s}[0], [x2], x1
// n=4 with hi=1 fills lanes 4-7 of v0; n=4 with default hi=0 fills lanes 0-3
// of v3. Both invocations advance x3 by x1 per byte.
518 ldcol.8 v0, x3, x1, 4, hi=1
520 ldcol.8 v3, x3, x1, 4
523 uaddl v4.8h, v2.8b, v3.8b
525 trn1 v2.2s, v2.2s, v3.2s
// Widened differences, weighted by v7.
527 usubl v2.8h, v2.8b, v0.8b
528 mul v2.8h, v2.8h, v7.8h
531 addp v2.4s, v2.4s, v2.4s
533 add v2.2s, v2.2s, v3.2s
534 rshrn v5.4h, v2.4s, #5 // b, c, x, x
535 addp v2.4h, v5.4h, v5.4h
537 sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
539 add v4.4h, v4.4h, v0.4h
540 shl v2.4h, v4.4h, #4 // a
541 sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
542 ext v0.16b, v0.16b, v0.16b, #14
543 sub v6.4h, v5.4h, v3.4h
545 mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
546 dup v1.8h, v2.h[0] // pix
547 dup v2.8h, v5.h[1] // c
548 add v1.8h, v1.8h, v0.8h // pix + x*b
// Per row: signed saturating shift right by 5 narrowed to unsigned bytes,
// then step the accumulator by c.
552 sqshrun v0.8b, v1.8h, #5
553 add v1.8h, v1.8h, v2.8h
554 st1 {v0.8b}, [x0], x1
// x264_predict_16x16_dc_top_neon
// Visible steps: x2 is set to one FDEC_STRIDE before x0, and the 16-bit lanes
// of v0 are rounding-shifted right by 4 with narrowing to bytes.
// NOTE(review): the load from x2, the reduction into v0, the row stores (or
// the branch to shared store code) and ret/endfunc are not visible in this
// excerpt.
560 function x264_predict_16x16_dc_top_neon, export=1
561 sub x2, x0, #FDEC_STRIDE
// Rounding narrow: (sum + 8) >> 4 per lane.
565 rshrn v0.8b, v0.8h, #4
// x264_predict_16x16_dc_left_neon
// Visible step: the 16-bit lanes of v0 are rounding-shifted right by 4 with
// narrowing to bytes.
// NOTE(review): the column loads, the reduction into v0, the row stores (or
// the branch to shared store code) and ret/endfunc are not visible in this
// excerpt.
570 function x264_predict_16x16_dc_left_neon, export=1
// Rounding narrow: (sum + 8) >> 4 per lane.
575 rshrn v0.8b, v0.8h, #4
// x264_predict_16x16_dc_neon
// Visible steps: x3 is set to one FDEC_STRIDE before x0, the 16-bit lanes of
// v0 and v1 are added, the sum is rounding-shifted right by 5 with narrowing
// to bytes, and 16 bytes of v0 are stored at [x0] (x0 advances by x1).
// NOTE(review): the loads and reductions that fill v0/v1, the broadcast of
// the result across all 16 lanes, the setup of x1, the row loop and
// ret/endfunc are not visible in this excerpt.
580 function x264_predict_16x16_dc_neon, export=1
581 sub x3, x0, #FDEC_STRIDE
588 add v0.4h, v0.4h, v1.4h
// Rounding narrow: (sum + 16) >> 5 per lane.
589 rshrn v0.8b, v0.8h, #5
593 st1 {v0.16b}, [x0], x1
// x264_predict_16x16_h_neon
// Visible steps: two bytes are read from [x1] (x1 advances by x7 per load),
// each replicated across 16 lanes, and stored as two 16-byte rows at x0 (x0
// advances by x7 per store).
// NOTE(review): the setup of x1 and x7, the repetition of this load/store
// group for the remaining rows and ret/endfunc are not visible in this
// excerpt. Presumably x1 walks the column left of the block - confirm.
598 function x264_predict_16x16_h_neon, export=1
602 ld1r {v0.16b}, [x1], x7
603 ld1r {v1.16b}, [x1], x7
604 st1 {v0.16b}, [x0], x7
605 st1 {v1.16b}, [x0], x7
// x264_predict_16x16_v_neon
// Visible steps: x0 is moved back by FDEC_STRIDE, 16 bytes are loaded from
// there (x0 advances by x7), and the same 16 bytes are stored at the new x0
// (x0 advances by x7 again).
// NOTE(review): the setup of x7, the repetition of the store for the
// remaining rows and ret/endfunc are not visible in this excerpt.
610 function x264_predict_16x16_v_neon, export=1
611 sub x0, x0, #FDEC_STRIDE
613 ld1 {v0.16b}, [x0], x7
615 st1 {v0.16b}, [x0], x7
// x264_predict_16x16_p_neon
// Plane-style predictor for 16-byte rows. Visible steps: neighbour bytes in
// v0-v3 are widened into differences, multiplied by the weights in v7,
// reduced and rounding-shifted by 6 into the gradients labelled b and c by
// the existing comments. A start value (pix) is derived, accumulators for
// columns 0-7 (v1) and 8-15 (v3) are built, and each row is produced by a
// signed saturating shift right by 5 narrowed to unsigned bytes, with c added
// to both accumulators for the next row.
// NOTE(review): many setup lines are not visible in this excerpt: the values
// of x1 and x2, the loads of v0, v1, v3 and v7, several reductions between
// the visible lines, the initial value of v3 before the add that forms the
// columns 8-15 accumulator, the row loop and ret/endfunc. The statements
// above describe only the visible instructions.
620 function x264_predict_16x16_p_neon, export=1
621 sub x3, x0, #FDEC_STRIDE
626 ld1 {v2.8b}, [x2], x1
633 uaddl v4.8h, v2.8b, v3.8b
// Widened differences, weighted by v7.
635 usubl v2.8h, v2.8b, v0.8b
636 usubl v3.8h, v3.8b, v1.8b
637 mul v2.8h, v2.8h, v7.8h
638 mul v3.8h, v3.8h, v7.8h
641 addp v2.4s, v2.4s, v3.4s
642 addp v2.4s, v2.4s, v2.4s
644 add v2.2s, v2.2s, v3.2s
645 rshrn v5.4h, v2.4s, #6 // b, c, x, x
646 addp v2.4h, v5.4h, v5.4h
648 sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
649 ext v4.16b, v4.16b, v4.16b, #14
650 add v4.4h, v4.4h, v7.4h
651 shl v2.4h, v4.4h, #4 // a
652 sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
// Rotate the weight vector by one 16-bit lane before scaling it by b.
653 ext v7.16b, v7.16b, v7.16b, #14
656 mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
657 dup v1.8h, v2.h[0] // pix
658 dup v2.8h, v5.h[1] // c
660 add v1.8h, v1.8h, v0.8h // pix + x*b
661 add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
// Low 8 output bytes from v1, high 8 from v3; both accumulators step by c.
665 sqshrun v0.8b, v1.8h, #5
666 add v1.8h, v1.8h, v2.8h
667 sqshrun2 v0.16b, v3.8h, #5
668 add v3.8h, v3.8h, v2.8h
669 st1 {v0.16b}, [x0], x1