1 /*****************************************************************************
2 * predict.S: aarch64 intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
8 * Janne Grunau <janne-x264@jannau.net>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// Two read-only tables of eight 16-bit weights each.
// NOTE(review): the loads that consume these tables are not visible in this
// view; presumably the plane (_p) predictors use them -- confirm.
30 const p8weight, align=4
31 .short 1, 2, 3, 4, 1, 2, 3, 4
33 const p16weight, align=4
34 .short 1, 2, 3, 4, 5, 6, 7, 8
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gathers single bytes from [xn] into byte lanes of vd. xn is post-incremented
// by xm after every load, so xn walks through memory in steps of xm.
//   n == 8 or hi == 0 : lanes 0-3 are filled
//   n == 8 or hi == 1 : lanes 4-7 are filled
// Lanes that are not selected are left untouched.
// NOTE(review): the closing .endif / .endm directives are not visible in this view.
37 .macro ldcol.8 vd, xn, xm, n=8, hi=0
38 .if \n == 8 || \hi == 0
39 ld1 {\vd\().b}[0], [\xn], \xm
40 ld1 {\vd\().b}[1], [\xn], \xm
41 ld1 {\vd\().b}[2], [\xn], \xm
42 ld1 {\vd\().b}[3], [\xn], \xm
44 .if \n == 8 || \hi == 1
45 ld1 {\vd\().b}[4], [\xn], \xm
46 ld1 {\vd\().b}[5], [\xn], \xm
47 ld1 {\vd\().b}[6], [\xn], \xm
48 ld1 {\vd\().b}[7], [\xn], \xm
// ldcol.16 vd, xn, xm
// Visible part: gathers single bytes from [xn] into byte lanes 8-15 of vd,
// post-incrementing xn by xm after every load.
// NOTE(review): embedded line 53 and the closing .endm are not visible here;
// presumably line 53 fills lanes 0-7 first -- confirm against the full file.
52 .macro ldcol.16 vd, xn, xm
54 ld1 {\vd\().b}[ 8], [\xn], \xm
55 ld1 {\vd\().b}[ 9], [\xn], \xm
56 ld1 {\vd\().b}[10], [\xn], \xm
57 ld1 {\vd\().b}[11], [\xn], \xm
58 ld1 {\vd\().b}[12], [\xn], \xm
59 ld1 {\vd\().b}[13], [\xn], \xm
60 ld1 {\vd\().b}[14], [\xn], \xm
61 ld1 {\vd\().b}[15], [\xn], \xm
// 4x4 horizontal prediction. x0 = block base pointer; rows are FDEC_STRIDE apart.
// For each of rows 0-3 the byte at column -1 is loaded, then a 32-bit word is
// stored at the start of the same row.
// NOTE(review): the embedded numbering skips lines 67, 70, 72, 74 and 76 and
// the function tail. As visible, each stored word holds only the zero-extended
// byte; presumably the elided lines replicate it across all four bytes --
// confirm against the full file.
65 function x264_predict_4x4_h_aarch64, export=1
// Left-neighbour byte of each row.
66 ldrb w1, [x0, #0*FDEC_STRIDE-1]
68 ldrb w2, [x0, #1*FDEC_STRIDE-1]
69 ldrb w3, [x0, #2*FDEC_STRIDE-1]
71 ldrb w4, [x0, #3*FDEC_STRIDE-1]
// One 4-byte store per row.
73 str w1, [x0, #0*FDEC_STRIDE]
75 str w2, [x0, #1*FDEC_STRIDE]
77 str w3, [x0, #2*FDEC_STRIDE]
78 str w4, [x0, #3*FDEC_STRIDE]
// 4x4 vertical prediction. x0 = block base pointer.
// Reads the 4 bytes directly above the block (x0 - FDEC_STRIDE) as one 32-bit
// word and writes that word to the start of rows 0-3.
// NOTE(review): the function tail (return / endfunc) is not visible in this view.
82 function x264_predict_4x4_v_aarch64, export=1
83 ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
84 str w1, [x0, #0 + 0 * FDEC_STRIDE]
85 str w1, [x0, #0 + 1 * FDEC_STRIDE]
86 str w1, [x0, #0 + 2 * FDEC_STRIDE]
87 str w1, [x0, #0 + 3 * FDEC_STRIDE]
// 4x4 DC prediction. x0 = block base pointer, x1 = row above the block.
// NOTE(review): embedded lines 93-95, 100, 105 and the tail are not visible, so
// the setup of x2 and x7, the load into v0 and the fourth row store cannot be
// seen here. Presumably x2 walks the left column with x7 = FDEC_STRIDE -- confirm.
91 function x264_predict_4x4_dc_neon, export=1
92 sub x1, x0, #FDEC_STRIDE
// Four single-byte loads, each broadcast to all lanes; x2 advances by x7 each time.
96 ld1r {v1.8b}, [x2], x7
97 ld1r {v2.8b}, [x2], x7
98 ld1r {v3.8b}, [x2], x7
99 ld1r {v4.8b}, [x2], x7
// Widen to 16 bit and add the four broadcast bytes pairwise.
101 uaddl v1.8h, v1.8b, v2.8b
102 uaddl v2.8h, v3.8b, v4.8b
103 addp v0.4h, v0.4h, v0.4h
104 add v1.4h, v1.4h, v2.4h
106 add v0.4h, v0.4h, v1.4h
// Rounding shift right by 3 (divide by 8 with rounding) and narrow to bytes.
107 rshrn v0.8b, v0.8h, #3
// Each store writes 4 bytes and then advances x0 by one row.
108 str s0, [x0], #FDEC_STRIDE
109 str s0, [x0], #FDEC_STRIDE
110 str s0, [x0], #FDEC_STRIDE
// 4x4 DC prediction, top-neighbour variant. x0 = block base pointer,
// x1 = row above the block.
// NOTE(review): embedded lines 117-119, 121 and the tail are not visible; the
// load from x1 and the first summation step are presumably there -- confirm.
115 function x264_predict_4x4_dc_top_neon, export=1
116 sub x1, x0, #FDEC_STRIDE
120 addp v0.4h, v0.4h, v0.4h
// Rounding shift right by 2 (divide by 4 with rounding) and narrow to bytes.
122 rshrn v0.8b, v0.8h, #2
// Each store writes 4 bytes and then advances x0 by one row.
123 str s0, [x0], #FDEC_STRIDE
124 str s0, [x0], #FDEC_STRIDE
125 str s0, [x0], #FDEC_STRIDE
// 4x4 diagonal-down-right prediction. x0 = block base pointer.
// x1 starts at the top-left neighbour (x0 - FDEC_STRIDE - 1) and steps by x7.
// Each "ext ..., #7" shifts the bytes gathered so far up by one lane and puts
// the newly broadcast left-column byte into lane 0.
// The filter is (a + 2*b + c + 2) >> 2: two widening adds, one add, then a
// rounding narrow by 2.
// NOTE(review): embedded line 132 (presumably the x7 setup) and the tail,
// including the store of the unshifted row in s0, are not visible in this view.
130 function x264_predict_4x4_ddr_neon, export=1
131 sub x1, x0, #FDEC_STRIDE+1
133 ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
134 ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
135 ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
136 ext v0.8b, v1.8b, v0.8b, #7
137 ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
138 ext v0.8b, v2.8b, v0.8b, #7 // a
139 ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
140 ext v1.8b, v3.8b, v0.8b, #7 // b
141 ext v2.8b, v4.8b, v1.8b, #7 // c
142 uaddl v0.8h, v0.8b, v1.8b
143 uaddl v1.8h, v1.8b, v2.8b
144 add v0.8h, v0.8h, v1.8h
145 rshrn v0.8b, v0.8h, #2
// Rows are 4-byte windows of the filtered vector at byte offsets 3, 2 and 1.
147 ext v3.8b, v0.8b, v0.8b, #3
148 ext v2.8b, v0.8b, v0.8b, #2
149 ext v1.8b, v0.8b, v0.8b, #1
151 str s3, [x0], #FDEC_STRIDE
152 str s2, [x0], #FDEC_STRIDE
153 str s1, [x0], #FDEC_STRIDE
// 4x4 diagonal-down-left prediction. x0 = block base pointer.
// x0 is moved up one row, 8 bytes are loaded from there, and the post-increment
// by x7 moves x0 on again.
// uhadd followed by urhadd gives ((a + c) >> 1 + b + 1) >> 1, which equals the
// 3-tap filter (a + 2*b + c + 2) >> 2.
// NOTE(review): embedded lines 160, 162 and the tail are not visible. That
// covers the x7 setup, the initialisation of v3 used by the second ext, and the
// store of s3 -- confirm against the full file.
158 function x264_predict_4x4_ddl_neon, export=1
159 sub x0, x0, #FDEC_STRIDE
161 ld1 {v0.8b}, [x0], x7
// v1 = input shifted by 1 byte, v2 = input shifted by 2 bytes (tail from v3).
163 ext v1.8b, v0.8b, v0.8b, #1
164 ext v2.8b, v0.8b, v3.8b, #2
165 uhadd v0.8b, v0.8b, v2.8b
166 urhadd v0.8b, v0.8b, v1.8b
// Each following row is the filtered vector shifted by one more byte.
167 str s0, [x0], #FDEC_STRIDE
168 ext v1.8b, v0.8b, v0.8b, #1
169 ext v2.8b, v0.8b, v0.8b, #2
170 str s1, [x0], #FDEC_STRIDE
171 ext v3.8b, v0.8b, v0.8b, #3
172 str s2, [x0], #FDEC_STRIDE
// 8x8 DC prediction. x0 = destination block; x1 is read as a byte buffer
// (presumably the prepared edge array -- confirm with the caller).
// NOTE(review): only fragments are visible (embedded lines 178, 180, 182-183,
// 185, 187 and everything after 188 are missing), so the summation and the
// row-store loop cannot be fully described from this view.
177 function x264_predict_8x8_dc_neon, export=1
// Load 16 bytes and advance x1 past them.
179 ld1 {v0.16b}, [x1], #16
// Rotate the 16-byte vector by 7 bytes.
181 ext v0.16b, v0.16b, v0.16b, #7
184 add v0.8h, v0.8h, v1.8h
// Rounding shift right by 4 (divide by 16 with rounding) and narrow to bytes.
186 rshrn v0.8b, v0.8h, #4
// Store one 8-byte row, then step x0 by x7.
188 st1 {v0.8b}, [x0], x7
// 8x8 horizontal prediction. x0 = destination block.
// Visible part: eight 8-byte row stores from v0..v7, stepping x0 by x7 after
// each one.
// NOTE(review): the instructions that fill v0..v7 and set x7 are interleaved
// on embedded lines that are not visible in this view (194-197, 199, 201, ...).
193 function x264_predict_8x8_h_neon, export=1
198 st1 {v0.8b}, [x0], x7
200 st1 {v1.8b}, [x0], x7
202 st1 {v2.8b}, [x0], x7
204 st1 {v3.8b}, [x0], x7
206 st1 {v4.8b}, [x0], x7
208 st1 {v5.8b}, [x0], x7
210 st1 {v6.8b}, [x0], x7
211 st1 {v7.8b}, [x0], x7
// 8x8 vertical prediction. x0 = destination block.
// Visible part: one 8-byte row store of v0, stepping x0 by x7.
// NOTE(review): the load of v0, the x7 setup and whatever repeats this store
// for all rows are on embedded lines not visible in this view.
215 function x264_predict_8x8_v_neon, export=1
220 st1 {v0.8b}, [x0], x7
// 8x8 diagonal-down-left prediction. x0 = destination block.
// v4 = source shifted down one byte (low byte taken from v3), v2 = source
// shifted up one byte (high byte taken from the previous v2).
// uhadd + urhadd then form the 3-tap filter (l + 2*c + r + 2) >> 2.
// Row k (k = 0..7) is the 8-byte window of the filtered vector at offset k+1.
// NOTE(review): embedded lines 226-230 (source load, x7 setup, initial v2/v3)
// and the tail are not visible in this view.
225 function x264_predict_8x8_ddl_neon, export=1
231 ext v4.16b, v3.16b, v0.16b, #15
232 ext v2.16b, v0.16b, v2.16b, #1
233 uhadd v4.16b, v4.16b, v2.16b
234 urhadd v0.16b, v0.16b, v4.16b
// Window extraction is interleaved with the row stores.
235 ext v1.16b, v0.16b, v0.16b, #1
236 ext v2.16b, v0.16b, v0.16b, #2
237 st1 {v1.8b}, [x0], x7
238 ext v3.16b, v0.16b, v0.16b, #3
239 st1 {v2.8b}, [x0], x7
240 ext v4.16b, v0.16b, v0.16b, #4
241 st1 {v3.8b}, [x0], x7
242 ext v5.16b, v0.16b, v0.16b, #5
243 st1 {v4.8b}, [x0], x7
244 ext v6.16b, v0.16b, v0.16b, #6
245 st1 {v5.8b}, [x0], x7
246 ext v7.16b, v0.16b, v0.16b, #7
247 st1 {v6.8b}, [x0], x7
248 ext v0.16b, v0.16b, v0.16b, #8
249 st1 {v7.8b}, [x0], x7
250 st1 {v0.8b}, [x0], x7
// 8x8 diagonal-down-right prediction. x0 = destination block; x1 = 32-byte
// source buffer (presumably the prepared edge array -- confirm with the caller).
// Three 16-byte windows at byte offsets 7, 8 and 9 of the buffer act as
// left / centre / right taps. uhadd + urhadd give (l + 2*c + r + 2) >> 2.
// Rows are written bottom-up: x0 starts at row 7 and x7 is -FDEC_STRIDE.
// NOTE(review): the function tail (return / endfunc) is not visible in this view.
254 function x264_predict_8x8_ddr_neon, export=1
255 ld1 {v0.16b,v1.16b}, [x1]
256 ext v2.16b, v0.16b, v1.16b, #7
257 ext v4.16b, v0.16b, v1.16b, #9
258 ext v3.16b, v0.16b, v1.16b, #8
260 uhadd v2.16b, v2.16b, v4.16b
261 urhadd v7.16b, v3.16b, v2.16b
// Start at the last row and walk upwards.
263 add x0, x0, #7*FDEC_STRIDE
264 mov x7, #-1*FDEC_STRIDE
// Row 7 uses window offset 0, row 6 offset 1, ... row 0 offset 7.
266 ext v6.16b, v7.16b, v7.16b, #1
267 st1 {v7.8b}, [x0], x7
268 ext v5.16b, v7.16b, v7.16b, #2
269 st1 {v6.8b}, [x0], x7
270 ext v4.16b, v7.16b, v7.16b, #3
271 st1 {v5.8b}, [x0], x7
272 ext v3.16b, v7.16b, v7.16b, #4
273 st1 {v4.8b}, [x0], x7
274 ext v2.16b, v7.16b, v7.16b, #5
275 st1 {v3.8b}, [x0], x7
276 ext v1.16b, v7.16b, v7.16b, #6
277 st1 {v2.8b}, [x0], x7
278 ext v0.16b, v7.16b, v7.16b, #7
279 st1 {v1.8b}, [x0], x7
280 st1 {v0.8b}, [x0], x7
// 8x8 vertical-left prediction. x0 = destination block.
// v1 = source shifted down one byte, v2 = source shifted up one byte.
//   v3 = rounded average of source and v2                  (2-tap)
//   v0 = (l + 2*c + r + 2) >> 2 via uhadd followed by urhadd (3-tap)
// Even rows take windows of v3 and odd rows take windows of v0; the window
// offset grows by one byte every two rows.
// NOTE(review): embedded lines 285-288 (source load, x7 setup, initial v1/v2)
// and the tail are not visible in this view.
284 function x264_predict_8x8_vl_neon, export=1
289 ext v1.16b, v1.16b, v0.16b, #15
290 ext v2.16b, v0.16b, v2.16b, #1
292 uhadd v1.16b, v1.16b, v2.16b
293 urhadd v3.16b, v0.16b, v2.16b
295 urhadd v0.16b, v0.16b, v1.16b
// Window extraction is interleaved with the row stores.
297 ext v4.16b, v0.16b, v0.16b, #1
298 st1 {v3.8b}, [x0], x7
299 ext v5.16b, v3.16b, v3.16b, #1
300 st1 {v4.8b}, [x0], x7
301 ext v6.16b, v0.16b, v0.16b, #2
302 st1 {v5.8b}, [x0], x7
303 ext v7.16b, v3.16b, v3.16b, #2
304 st1 {v6.8b}, [x0], x7
305 ext v4.16b, v0.16b, v0.16b, #3
306 st1 {v7.8b}, [x0], x7
307 ext v5.16b, v3.16b, v3.16b, #3
308 st1 {v4.8b}, [x0], x7
309 ext v6.16b, v0.16b, v0.16b, #4
310 st1 {v5.8b}, [x0], x7
311 st1 {v6.8b}, [x0], x7
// 8x8 vertical-right prediction. x0 = destination block.
// v1 / v0 = source (v2) rotated by 14 / 15 bytes. From these:
//   v2 = rounded average of source and v0                  (2-tap)
//   v0 = 3-tap filter via uhadd followed by urhadd
// The high halves of both results give rows 0 and 1. uzp1/uzp2 split the low
// half of the 3-tap result into even and odd bytes, which are shifted in from
// the left for the remaining rows.
// NOTE(review): embedded lines 316-319 (source load into v2, x7 setup) and the
// tail are not visible in this view.
315 function x264_predict_8x8_vr_neon, export=1
320 ext v1.16b, v2.16b, v2.16b, #14
321 ext v0.16b, v2.16b, v2.16b, #15
323 uhadd v3.16b, v2.16b, v1.16b
324 urhadd v2.16b, v2.16b, v0.16b
325 urhadd v0.16b, v0.16b, v3.16b
// v1 = high half of the 2-tap result; v0 becomes the high half of the 3-tap result.
327 ext v1.16b, v2.16b, v2.16b, #8
328 uzp1 v2.8b, v0.8b, v0.8b
329 uzp2 v3.8b, v0.8b, v0.8b
330 ext v0.16b, v0.16b, v0.16b, #8
332 st1 {v1.8b}, [x0], x7
333 st1 {v0.8b}, [x0], x7
// Each later row pair pulls one more byte from v3 / v2 into the low lanes.
334 ext v4.8b, v3.8b, v1.8b, #7
335 ext v5.8b, v2.8b, v0.8b, #7
336 st1 {v4.8b}, [x0], x7
337 st1 {v5.8b}, [x0], x7
338 ext v6.8b, v3.8b, v1.8b, #6
339 ext v7.8b, v2.8b, v0.8b, #6
340 st1 {v6.8b}, [x0], x7
341 st1 {v7.8b}, [x0], x7
342 ext v1.8b, v3.8b, v1.8b, #5
343 ext v0.8b, v2.8b, v0.8b, #5
344 st1 {v1.8b}, [x0], x7
345 st1 {v0.8b}, [x0], x7
// 8x8 horizontal-down prediction. x0 = destination block.
// v3 / v2 = source (v1) rotated by 1 / 2 bytes. From these:
//   v4 = rounded average of source and v3                  (2-tap)
//   v0 = 3-tap filter via uhadd followed by urhadd
// zip1/zip2 interleave the low halves of the two results into v16 / v17.
// Rows are 8-byte windows sliding 2 bytes per row across v7:v17:v16.
// NOTE(review): embedded lines 350-353 (source load into v1, x7 setup) and the
// tail are not visible in this view.
349 function x264_predict_8x8_hd_neon, export=1
354 ext v3.16b, v1.16b, v1.16b, #1
355 ext v2.16b, v1.16b, v1.16b, #2
357 urhadd v4.16b, v1.16b, v3.16b
359 uhadd v1.16b, v1.16b, v2.16b
360 urhadd v0.16b, v1.16b, v3.16b
362 zip1 v16.8b, v4.8b, v0.8b
363 zip2 v17.8b, v4.8b, v0.8b
// v7 = high half of the 3-tap result.
364 ext v7.16b, v0.16b, v0.16b, #8
// First three rows: windows over v17:v7 at offsets 6, 4 and 2; then v17 itself.
366 ext v0.8b, v17.8b, v7.8b, #6
367 ext v1.8b, v17.8b, v7.8b, #4
368 st1 {v0.8b}, [x0], x7
369 ext v2.8b, v17.8b, v7.8b, #2
370 st1 {v1.8b}, [x0], x7
371 st1 {v2.8b}, [x0], x7
// Next three rows: windows over v16:v17 at offsets 6, 4 and 2; then v16 itself.
372 ext v3.8b, v16.8b, v17.8b, #6
373 st1 {v17.8b}, [x0], x7
374 ext v4.8b, v16.8b, v17.8b, #4
375 st1 {v3.8b}, [x0], x7
376 ext v5.8b, v16.8b, v17.8b, #2
377 st1 {v4.8b}, [x0], x7
378 st1 {v5.8b}, [x0], x7
379 st1 {v16.8b}, [x0], x7
// 8x8 horizontal-up prediction. x0 = destination block.
// v2 / v4 = v7 shifted by 1 / 2 bytes, with the tail taken from v6. From these:
//   v0 = rounded average of v7 and v2                      (2-tap)
//   v1 = 3-tap filter via uhadd followed by urhadd
// zip1/zip2 interleave the two results into v16 / v17.
// Rows are 8-byte windows sliding 2 bytes per row across v16:v17:v18.
// NOTE(review): embedded lines 385-390, 400-402, 410, 417 and the tail are not
// visible. That covers the loads of v6/v7, the setup of v18 and x7, and the
// final row store -- confirm against the full file.
384 function x264_predict_8x8_hu_neon, export=1
391 ext v4.8b, v7.8b, v6.8b, #2
392 ext v2.8b, v7.8b, v6.8b, #1
394 uhadd v5.8b, v7.8b, v4.8b
395 urhadd v0.8b, v2.8b, v7.8b
396 urhadd v1.8b, v5.8b, v2.8b
398 zip1 v16.8b, v0.8b, v1.8b
399 zip2 v17.8b, v0.8b, v1.8b
// Rows 0-3: v16, then windows over v16:v17 at offsets 2, 4 and 6.
403 ext v0.8b, v16.8b, v17.8b, #2
404 ext v1.8b, v16.8b, v17.8b, #4
405 ext v2.8b, v16.8b, v17.8b, #6
406 st1 {v16.8b}, [x0], x7
407 st1 {v0.8b}, [x0], x7
408 st1 {v1.8b}, [x0], x7
409 st1 {v2.8b}, [x0], x7
// Rows 4 onwards: v17, then windows over v17:v18.
411 ext v4.8b, v17.8b, v18.8b, #2
412 ext v5.8b, v17.8b, v18.8b, #4
413 ext v6.8b, v17.8b, v18.8b, #6
414 st1 {v17.8b}, [x0], x7
415 st1 {v4.8b}, [x0], x7
416 st1 {v5.8b}, [x0], x7
// 8x8 chroma DC prediction, top-neighbour variant. x0 = destination block,
// x2 = row above the block.
// NOTE(review): embedded lines 424-426, 429-430 and everything after 431 are
// not visible. The load, part of the summation, the setup of v1-v3 for the
// transpose and the row stores cannot be described from this view.
422 function x264_predict_8x8c_dc_top_neon, export=1
423 sub x2, x0, #FDEC_STRIDE
427 addp v0.4h, v0.4h, v0.4h
// Rounding shift right by 2 (divide by 4 with rounding) and narrow to bytes.
428 rshrn v0.8b, v0.8h, #2
// "transpose" is a helper macro defined outside this view.
431 transpose v0.2s, v1.2s, v2.2s, v3.2s
// 8x8 chroma DC prediction, left-neighbour variant. x0 = destination block.
// The byte at column -1 of rows 0-3 goes into w2..w5 and of rows 4-7 into
// w6..w9. Two results (v0, v1) are then rounded with a shift by 2.
// NOTE(review): the scalar additions, the moves into v0/v1 and the row stores
// are on embedded lines not visible here (440-442, 447-452, 455 onwards).
// Presumably v0 covers rows 0-3 and v1 rows 4-7 -- confirm.
435 function x264_predict_8x8c_dc_left_neon, export=1
436 ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
437 ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
438 ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
439 ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
443 ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
444 ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
445 ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
446 ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
// Rounding shift right by 2 (divide by 4 with rounding) and narrow to bytes.
453 rshrn v0.8b, v0.8h, #2
454 rshrn v1.8b, v1.8h, #2
// 8x8 chroma DC prediction using both neighbours. x0 = destination block,
// x2 = row above the block.
// After the transpose, pairwise adds produce four partial sums s0..s3 (see the
// existing per-line comments). Per-quadrant DC values are then:
//   v2 = single sums, rounded >> 2   (source of dc1 and dc3)
//   v3 = pair sums, rounded >> 3     (source of dc0 and dc2)
// v0 = dc0|dc1 serves the upper four rows and v1 = dc3|dc2 the lower four
// (x2 = x0 + 4 * x1).
// NOTE(review): embedded lines 460-463 (neighbour loads, x1 setup), 477, 479
// and the tail are not visible. The two visible stores are presumably repeated
// by an elided directive -- confirm.
458 function x264_predict_8x8c_dc_neon, export=1
459 sub x2, x0, #FDEC_STRIDE
464 transpose v0.2s, v1.2s, v2.2s, v3.2s
465 uaddlp v0.4h, v0.8b // s0, s2
466 uaddlp v1.4h, v1.8b // s1, s3
467 addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
468 addp v1.4h, v0.4h, v0.4h
469 rshrn v2.8b, v0.8h, #2
470 rshrn v3.8b, v1.8h, #3
471 dup v5.8b, v2.b[2] // dc1
472 dup v6.8b, v3.b[1] // dc2
473 dup v4.8b, v3.b[0] // dc0
474 dup v7.8b, v2.b[3] // dc3
// Combine left/right quadrant values into one 8-byte row pattern each.
475 trn1 v0.2s, v4.2s, v5.2s
476 trn1 v1.2s, v7.2s, v6.2s
// x2 = address of row 4.
478 add x2, x0, x1, lsl #2
480 st1 {v0.8b}, [x0], x1
481 st1 {v1.8b}, [x2], x1
// 8x8 chroma horizontal prediction. x0 = destination block.
// Visible part: two single bytes are loaded from [x1] with broadcast (x1 steps
// by x7 after each load), and each broadcast vector is stored as one 8-byte row.
// NOTE(review): embedded lines 487-489 and 494 onwards are not visible.
// Presumably x1 = x0 - 1, x7 = FDEC_STRIDE and the four visible lines are
// repeated to cover all rows -- confirm.
486 function x264_predict_8x8c_h_neon, export=1
490 ld1r {v0.8b}, [x1], x7
491 ld1r {v1.8b}, [x1], x7
492 st1 {v0.8b}, [x0], x7
493 st1 {v1.8b}, [x0], x7
// 8x8 chroma vertical prediction. x0 = destination block.
// x0 is moved up one row, the 8 bytes there are loaded, and the post-increment
// by x7 moves x0 on again before the row store.
// NOTE(review): embedded lines 500, 502 and 504 onwards are not visible.
// Presumably x7 = FDEC_STRIDE and the store is repeated for all rows -- confirm.
498 function x264_predict_8x8c_v_neon, export=1
499 sub x0, x0, #FDEC_STRIDE
501 ld1 {v0.8b}, [x0], x7
503 st1 {v0.8b}, [x0], x7
// 8x8 chroma plane prediction. x0 = destination block, x3 = row above the block.
// The existing per-line comments track the derivation: b and c come from
// weighted neighbour differences (rounded >> 5), a from the neighbour sum << 4,
// and the start value is a - 3 * (b + c) + 16. Each output row is the running
// sum in v1 narrowed with unsigned saturation (>> 5); v1 then advances by c.
// NOTE(review): embedded lines 510-513, 516, 518-519, 521, 523, 526-527, 529,
// 533, 535, 541, 546-548 and the tail are not visible. That covers pointer and
// weight-table setup, several arithmetic steps and the row loop control, so the
// description above rests on the existing comments for the elided steps.
508 function x264_predict_8x8c_p_neon, export=1
509 sub x3, x0, #FDEC_STRIDE
514 ld1 {v2.s}[0], [x2], x1
// n=4, hi=1: fills byte lanes 4-7 of v0 from [x3], stepping by x1.
515 ldcol.8 v0, x3, x1, 4, hi=1
// n=4, hi=0 (default): fills byte lanes 0-3 of v3.
517 ldcol.8 v3, x3, x1, 4
520 uaddl v4.8h, v2.8b, v3.8b
522 trn1 v2.2s, v2.2s, v3.2s
// Neighbour differences multiplied by the per-lane weights in v7.
524 usubl v2.8h, v2.8b, v0.8b
525 mul v2.8h, v2.8h, v7.8h
528 addp v2.4s, v2.4s, v2.4s
530 add v2.2s, v2.2s, v3.2s
531 rshrn v5.4h, v2.4s, #5 // b, c, x, x
532 addp v2.4h, v5.4h, v5.4h
534 sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
536 add v4.4h, v4.4h, v0.4h
537 shl v2.4h, v4.4h, #4 // a
538 sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
539 ext v0.16b, v0.16b, v0.16b, #14
540 sub v6.4h, v5.4h, v3.4h
542 mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
543 dup v1.8h, v2.h[0] // pix
544 dup v2.8h, v5.h[1] // c
545 add v1.8h, v1.8h, v0.8h // pix + x*b
// Per-row step: saturating narrow, advance by c, store the row.
549 sqshrun v0.8b, v1.8h, #5
550 add v1.8h, v1.8h, v2.8h
551 st1 {v0.8b}, [x0], x1
// loadsum4 wd, t1, t2, t3, x, idx
// Visible part: loads the byte at column -1 of rows idx .. idx+3 (row pitch
// FDEC_STRIDE, base register x) into wd, t1, t2 and t3 respectively.
// NOTE(review): embedded lines 562-566, including the closing .endm, are not
// visible. Going by the macro name, the elided lines presumably add the four
// values into wd -- confirm against the full file.
557 .macro loadsum4 wd, t1, t2, t3, x, idx
558 ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
559 ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
560 ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
561 ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
// 8x16 chroma horizontal prediction. x0 = destination block.
// Two source pointers and two destination pointers each step by two rows
// (x7 = 2 * FDEC_STRIDE), so even and odd rows are handled in parallel:
//   x3 = left neighbour of row 1, x1 = start of row 1 (odd rows)
//   x2 / x0 handle the even rows
// Each loaded byte is broadcast to 8 lanes and stored as one row.
// NOTE(review): embedded lines 568, 572 and 581 onwards are not visible.
// Presumably x2 = x0 - 1 and the eight visible load/store lines are repeated
// to cover all 16 rows -- confirm.
567 function x264_predict_8x16c_h_neon, export=1
569 add x3, x0, #FDEC_STRIDE - 1
570 mov x7, #2 * FDEC_STRIDE
571 add x1, x0, #FDEC_STRIDE
573 ld1r {v0.8b}, [x2], x7
574 ld1r {v1.8b}, [x3], x7
575 ld1r {v2.8b}, [x2], x7
576 ld1r {v3.8b}, [x3], x7
577 st1 {v0.8b}, [x0], x7
578 st1 {v1.8b}, [x1], x7
579 st1 {v2.8b}, [x0], x7
580 st1 {v3.8b}, [x1], x7
// 8x16 chroma vertical prediction. x0 = destination block.
// x1 starts at the row above the block and x2 = 2 * FDEC_STRIDE. The load's
// post-increment leaves x1 at row 1, so the two stores write the same 8 bytes
// to an even row (via x0) and an odd row (via x1), each stepping two rows.
// NOTE(review): embedded line 589 and the tail are not visible; presumably a
// repeat directive there replays the store pair for all 16 rows -- confirm.
585 function x264_predict_8x16c_v_neon, export=1
586 sub x1, x0, #FDEC_STRIDE
587 mov x2, #2 * FDEC_STRIDE
588 ld1 {v0.8b}, [x1], x2
590 st1 {v0.8b}, [x0], x2
591 st1 {v0.8b}, [x1], x2
// 8x16 chroma plane prediction. x0 = destination block, x3 = row above the block.
// The existing per-line comments track the derivation:
//   H, V = weighted neighbour differences (weights in v17)
//   b = (17 * H) rounded >> 5, c = (5 * V) rounded >> 6
//   a = neighbour sum << 4, start value i00 = a - 3b - 7c
// Two rows are emitted per step: the running sums in v1 are narrowed with
// rounding and unsigned saturation (>> 5), and v1 advances by c after each row.
// NOTE(review): many embedded lines are not visible (597-598, 600-604, 606-608,
// 611-613, 615, 618, 620, 623, 626, 628, 636-638, 645, 650-652 and the tail).
// That covers pointer and weight setup, several reduction steps and the row
// loop control.
596 function x264_predict_8x16c_p_neon, export=1
599 sub x3, x0, #FDEC_STRIDE
605 ld1 {v2.8b}, [x2], x1
609 ext v4.8b, v2.8b, v2.8b, #3
610 ext v5.8b, v3.8b, v3.8b, #7
614 uaddl v4.8h, v5.8b, v4.8b // a * 1/16
616 usubl v2.8h, v2.8b, v0.8b
617 mul v2.8h, v2.8h, v17.8h
619 addp v2.4s, v2.4s, v2.4s // H
621 usubl v3.8h, v3.8b, v1.8b
622 mul v3.8h, v3.8h, v17.8h
624 addp v3.4s, v3.4s, v3.4s
625 addp v3.4s, v3.4s, v3.4s // V
// Rotate the weight vector by one halfword (14 bytes).
627 ext v17.16b, v17.16b, v17.16b, #14
629 shl v4.4h, v4.4h, #4 // a
630 shl v6.2s, v2.2s, #4 // 16 * H
631 shl v7.2s, v3.2s, #2 // 4 * V
632 add v2.2s, v2.2s, v6.2s // 17 * H
633 add v3.2s, v3.2s, v7.2s // 5 * V
634 rshrn v2.4h, v2.4s, #5 // b
635 rshrn v3.4h, v3.4s, #6 // c
639 sub v4.4h, v4.4h, v2.4h // a - b
640 shl v6.4h, v2.4h, #1 // 2 * b
641 add v4.4h, v4.4h, v3.4h // a - b + c
642 shl v7.4h, v3.4h, #3 // 8 * c
643 sub v4.4h, v4.4h, v6.4h // a - 3b + c
644 sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
646 mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
647 dup v1.8h, v4.h[0] // i00
648 dup v2.8h, v3.h[0] // c
649 add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
// Two rows per step; v1 advances by c between rows.
653 sqrshrun v4.8b, v1.8h, #5
654 add v1.8h, v1.8h, v2.8h
655 sqrshrun v5.8b, v1.8h, #5
656 st1 {v4.8b}, [x0], x1
657 add v1.8h, v1.8h, v2.8h
658 st1 {v5.8b}, [x0], x1
// 8x16 chroma DC prediction using both neighbours. x0 = destination block,
// x3 = row above the block.
// loadsum4 reads the left-column bytes of rows 0-3, 4-7, 8-11 and 12-15.
// v20 / v21 broadcast the two top partial sums s0 / s1 (see the existing
// comments). Four 8-halfword sums v0..v3 are then rounded >> 3 into bytes.
// NOTE(review): embedded lines 665-666, 668-669, 672, 675, 678-679, 685,
// 695-696 and the tail are not visible. That covers the top-row load, the
// setup of v22..v25 and x1, and the directive that defines \idx for the final
// store. The mapping of v0..v3 to row groups cannot be confirmed from here.
663 function x264_predict_8x16c_dc_neon, export=1
664 sub x3, x0, #FDEC_STRIDE
667 loadsum4 w2, w3, w4, w5, x0, 0
670 loadsum4 w6, w7, w8, w9, x0, 4
671 addp v6.4h, v6.4h, v6.4h // s0, s1
673 loadsum4 w2, w3, w4, w5, x0, 8
674 dup v20.8h, v6.h[0] // s0
676 loadsum4 w6, w7, w8, w9, x0, 12
677 dup v21.8h, v6.h[1] // s1
// Each ext takes the high half of its first operand and the low half of v21.
680 ext v16.16b, v20.16b, v21.16b, #8
681 ext v17.16b, v22.16b, v21.16b, #8
682 ext v1.16b, v23.16b, v21.16b, #8
683 ext v2.16b, v24.16b, v21.16b, #8
684 ext v3.16b, v25.16b, v21.16b, #8
686 add v0.8h, v16.8h, v17.8h
687 add v1.8h, v1.8h, v23.8h
688 add v2.8h, v2.8h, v24.8h
689 add v3.8h, v3.8h, v25.8h
// Rounding shift right by 3 (divide by 8 with rounding) and narrow to bytes.
691 rshrn v0.8b, v0.8h, #3
692 rshrn v1.8b, v1.8h, #3
693 rshrn v2.8b, v2.8h, #3
694 rshrn v3.8b, v3.8h, #3
// Store body of an iteration directive that is not visible; \idx selects the vector.
697 st1 {v\idx\().8b}, [x0], x1
// 8x16 chroma DC prediction, left-neighbour variant. x0 = destination block.
// The left-column byte (column -1) of each of the 16 rows is loaded in four
// groups of four rows. Four results v0..v3 are rounded >> 2 into bytes. The
// stores of v0 for the first four rows are interleaved with the later loads.
// NOTE(review): the scalar additions, the moves into the vector registers, the
// x1 setup and the directive that defines \idx for the last store are on
// embedded lines not visible in this view.
703 function x264_predict_8x16c_dc_left_neon, export=1
// Rows 0-3.
705 ldrb w2, [x0, # 0 * FDEC_STRIDE - 1]
706 ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
707 ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
708 ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
// Rows 4-7.
711 ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
713 ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
715 ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
716 ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
719 rshrn v0.8b, v0.8h, #2
// Rows 8-11.
722 ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
723 ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
725 ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
726 ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
729 rshrn v1.8b, v1.8h, #2
// Rows 12-15 (w2..w5 are reused).
732 ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
733 ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
735 ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
736 ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
739 rshrn v2.8b, v2.8h, #2
// First four rows are written from v0; x0 advances by x1 after each store.
741 st1 {v0.8b}, [x0], x1
742 st1 {v0.8b}, [x0], x1
744 st1 {v0.8b}, [x0], x1
746 st1 {v0.8b}, [x0], x1
747 rshrn v3.8b, v3.8h, #2
// Store body of an iteration directive that is not visible; \idx selects the vector.
751 st1 {v\idx\().8b}, [x0], x1
// 8x16 chroma DC prediction, top-neighbour variant. x0 = destination block,
// x2 = row above the block.
// Visible part: a pairwise add, a rounding narrow (>> 2) into v4, a merge of
// v0 and v1 at a 4-byte offset, and one 8-byte row store stepping by x1.
// NOTE(review): embedded lines 759-761, 764-765, 767 and 769 onwards are not
// visible. That covers the top-row load, how v0/v1 are derived from v4, the x1
// setup and the repetition of the store for all 16 rows.
757 function x264_predict_8x16c_dc_top_neon, export=1
758 sub x2, x0, #FDEC_STRIDE
762 addp v0.4h, v0.4h, v0.4h
763 rshrn v4.8b, v0.8h, #2
// Result = bytes 4-7 of v0 followed by bytes 0-3 of v1.
766 ext v0.8b, v0.8b, v1.8b, #4
768 st1 {v0.8b}, [x0], x1
// 16x16 DC prediction, top-neighbour variant. x0 = destination block,
// x2 = row above the block.
// Visible part: the accumulated sum in v0 is rounded >> 4 (divide by 16 with
// rounding) and narrowed to bytes.
// NOTE(review): the load, the summation and the row stores are on embedded
// lines not visible in this view (776-778, 780 onwards).
774 function x264_predict_16x16_dc_top_neon, export=1
775 sub x2, x0, #FDEC_STRIDE
779 rshrn v0.8b, v0.8h, #4
// 16x16 DC prediction, left-neighbour variant.
// Visible part: the accumulated sum in v0 is rounded >> 4 (divide by 16 with
// rounding) and narrowed to bytes.
// NOTE(review): the left-column loads, the summation and the row stores are on
// embedded lines not visible in this view (785-788, 790 onwards).
784 function x264_predict_16x16_dc_left_neon, export=1
789 rshrn v0.8b, v0.8h, #4
// 16x16 DC prediction using both neighbours. x0 = destination block,
// x3 = row above the block.
// Visible part: two partial sums (v0, v1) are added, the total is rounded >> 5
// (divide by 32 with rounding) and narrowed to bytes, and one 16-byte row is
// stored, stepping x0 by x1.
// NOTE(review): embedded lines 796-801, 804-806 and 808 onwards are not
// visible. That covers the neighbour loads and sums, the broadcast of the DC
// byte to all 16 lanes, the x1 setup and the repetition of the store.
794 function x264_predict_16x16_dc_neon, export=1
795 sub x3, x0, #FDEC_STRIDE
802 add v0.4h, v0.4h, v1.4h
803 rshrn v0.8b, v0.8h, #5
807 st1 {v0.16b}, [x0], x1
// 16x16 horizontal prediction. x0 = destination block.
// Visible part: two single bytes are loaded from [x1] with broadcast to 16
// lanes (x1 steps by x7 after each load), and each is stored as one 16-byte row.
// NOTE(review): embedded lines 813-815 and 820 onwards are not visible.
// Presumably x1 = x0 - 1, x7 = FDEC_STRIDE and the four visible lines are
// repeated to cover all 16 rows -- confirm.
812 function x264_predict_16x16_h_neon, export=1
816 ld1r {v0.16b}, [x1], x7
817 ld1r {v1.16b}, [x1], x7
818 st1 {v0.16b}, [x0], x7
819 st1 {v1.16b}, [x0], x7
// 16x16 vertical prediction. x0 = destination block.
// x0 is moved up one row, the 16 bytes there are loaded, and the post-increment
// by x7 moves x0 on again before the row store.
// NOTE(review): embedded lines 826, 828 and 830 onwards are not visible.
// Presumably x7 = FDEC_STRIDE and the store is repeated for all 16 rows -- confirm.
824 function x264_predict_16x16_v_neon, export=1
825 sub x0, x0, #FDEC_STRIDE
827 ld1 {v0.16b}, [x0], x7
829 st1 {v0.16b}, [x0], x7
// 16x16 plane prediction. x0 = destination block, x3 = row above the block.
// The existing per-line comments track the derivation: b and c come from
// weighted neighbour differences (weights in v7, rounded >> 6), a from the
// neighbour sum << 4, and the start value is a - 7 * (b + c) + 16.
// v1 holds the running sums for columns 0-7 and v3 for columns 8-15. Each row
// narrows both with unsigned saturation (>> 5) into one 16-byte vector, then
// both advance by c.
// NOTE(review): many embedded lines are not visible (836-839, 841-846, 848,
// 853-854, 857, 861, 868-869, 873, 876-878) and the view ends at the row store.
// Pointer and weight setup, several reduction steps, the initial value of v3
// and the row loop control cannot be seen here.
834 function x264_predict_16x16_p_neon, export=1
835 sub x3, x0, #FDEC_STRIDE
840 ld1 {v2.8b}, [x2], x1
847 uaddl v4.8h, v2.8b, v3.8b
// Neighbour differences multiplied by the per-lane weights in v7.
849 usubl v2.8h, v2.8b, v0.8b
850 usubl v3.8h, v3.8b, v1.8b
851 mul v2.8h, v2.8h, v7.8h
852 mul v3.8h, v3.8h, v7.8h
855 addp v2.4s, v2.4s, v3.4s
856 addp v2.4s, v2.4s, v2.4s
858 add v2.2s, v2.2s, v3.2s
859 rshrn v5.4h, v2.4s, #6 // b, c, x, x
860 addp v2.4h, v5.4h, v5.4h
862 sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
863 ext v4.16b, v4.16b, v4.16b, #14
864 add v4.4h, v4.4h, v7.4h
865 shl v2.4h, v4.4h, #4 // a
866 sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
// Rotate the weight vector by one halfword (14 bytes).
867 ext v7.16b, v7.16b, v7.16b, #14
870 mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
871 dup v1.8h, v2.h[0] // pix
872 dup v2.8h, v5.h[1] // c
874 add v1.8h, v1.8h, v0.8h // pix + x*b
875 add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
// Per-row step: narrow both halves with saturation, advance by c, store the row.
879 sqshrun v0.8b, v1.8h, #5
880 add v1.8h, v1.8h, v2.8h
881 sqshrun2 v0.16b, v3.8h, #5
882 add v3.8h, v3.8h, v2.8h
883 st1 {v0.16b}, [x0], x1