1 /*****************************************************************************
2 * predict.S: aarch64 intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
8 * Janne Grunau <janne-x264@jannau.net>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// Two constant tables of eight 16-bit (.short) entries each.
// `const` is a macro defined outside this view (TODO confirm section/alignment
// semantics of `align=4`).
// NOTE(review): presumably these are the per-column multipliers of the plane
// (_p) predictors; the instructions that load them are not visible in this view.
30 const p8weight, align=4
// 1..4, repeated twice
31 .short 1, 2, 3, 4, 1, 2, 3, 4
33 const p16weight, align=4
// 1..8
34 .short 1, 2, 3, 4, 5, 6, 7, 8
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gathers single bytes into byte lanes of \vd, one ld1 per lane; every load
// post-increments the address register \xn by the register \xm.
//   n == 8 or hi == 0 : fills lanes 0-3
//   n == 8 or hi == 1 : fills lanes 4-7
// With the defaults (n=8) all eight lanes 0-7 are filled.
// NOTE(review): the closing .endif/.endm lines are not visible in this view.
37 .macro ldcol.8 vd, xn, xm, n=8, hi=0
38 .if \n == 8 || \hi == 0
39 ld1 {\vd\().b}[0], [\xn], \xm
40 ld1 {\vd\().b}[1], [\xn], \xm
41 ld1 {\vd\().b}[2], [\xn], \xm
42 ld1 {\vd\().b}[3], [\xn], \xm
44 .if \n == 8 || \hi == 1
45 ld1 {\vd\().b}[4], [\xn], \xm
46 ld1 {\vd\().b}[5], [\xn], \xm
47 ld1 {\vd\().b}[6], [\xn], \xm
48 ld1 {\vd\().b}[7], [\xn], \xm
// ldcol.16 vd, xn, xm
// The visible lines gather single bytes into lanes 8-15 of \vd, post-incrementing
// \xn by \xm after each load.
// NOTE(review): the embedded numbering skips line 53, which presumably fills
// lanes 0-7 first (e.g. via ldcol.8); the closing .endm is also not visible.
52 .macro ldcol.16 vd, xn, xm
54 ld1 {\vd\().b}[ 8], [\xn], \xm
55 ld1 {\vd\().b}[ 9], [\xn], \xm
56 ld1 {\vd\().b}[10], [\xn], \xm
57 ld1 {\vd\().b}[11], [\xn], \xm
58 ld1 {\vd\().b}[12], [\xn], \xm
59 ld1 {\vd\().b}[13], [\xn], \xm
60 ld1 {\vd\().b}[14], [\xn], \xm
61 ld1 {\vd\().b}[15], [\xn], \xm
// x264_predict_4x4_h_aarch64
// Reads the byte at column -1 of rows 0..3 (base x0, row pitch FDEC_STRIDE) and
// writes one 32-bit word to column 0 of the same four rows.
// `function` is a macro defined outside this view.
// NOTE(review): the embedded line numbers have gaps between the loads and the
// stores and after the last store, so the step that presumably replicates each
// byte across its word, and the ret/endfunc terminator, are not visible here --
// confirm against the complete file.
65 function x264_predict_4x4_h_aarch64, export=1
// left-neighbour byte of each row, zero-extended into w1..w4
66 ldrb w1, [x0, #0*FDEC_STRIDE-1]
68 ldrb w2, [x0, #1*FDEC_STRIDE-1]
69 ldrb w3, [x0, #2*FDEC_STRIDE-1]
71 ldrb w4, [x0, #3*FDEC_STRIDE-1]
// one 4-byte store per row
73 str w1, [x0, #0*FDEC_STRIDE]
75 str w2, [x0, #1*FDEC_STRIDE]
77 str w3, [x0, #2*FDEC_STRIDE]
78 str w4, [x0, #3*FDEC_STRIDE]
// x264_predict_4x4_v_aarch64
// Copies the 32-bit word located one FDEC_STRIDE before x0 (the four bytes
// directly above the block) into rows 0..3 starting at x0.
// Clobbers w1.
// NOTE(review): the ret/endfunc terminator is not visible in this view.
82 function x264_predict_4x4_v_aarch64, export=1
// w1 = the 4 bytes of the row above
83 ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
84 str w1, [x0, #0 + 0 * FDEC_STRIDE]
85 str w1, [x0, #0 + 1 * FDEC_STRIDE]
86 str w1, [x0, #0 + 2 * FDEC_STRIDE]
87 str w1, [x0, #0 + 3 * FDEC_STRIDE]
// x264_predict_4x4_dc_neon
// Visible steps: form a pointer to the row above (x1), read the four
// left-neighbour bytes, add two 16-bit partial vectors, divide by 8 with
// rounding (rshrn #3) and store the 4-byte result to rows 1..3.
// NOTE(review): the embedded line numbers have gaps (97-103, 106, 110+), so the
// top-row load, the actual summation/broadcast, the row-0 store and the
// ret/endfunc terminator are not visible -- comments cover visible lines only.
91 function x264_predict_4x4_dc_neon, export=1
// x1 = address of the row above the block
92 sub x1, x0, #FDEC_STRIDE
// left-neighbour byte of rows 0..3
93 ldrb w4, [x0, #-1 + 0 * FDEC_STRIDE]
94 ldrb w5, [x0, #-1 + 1 * FDEC_STRIDE]
95 ldrb w6, [x0, #-1 + 2 * FDEC_STRIDE]
96 ldrb w7, [x0, #-1 + 3 * FDEC_STRIDE]
// presumably top sum + left sum -- the producers of v0/v1 are not visible
104 add v0.4h, v0.4h, v1.4h
// (x + 4) >> 3, narrowed to bytes
105 rshrn v0.8b, v0.8h, #3
107 str s0, [x0, #1 * FDEC_STRIDE]
108 str s0, [x0, #2 * FDEC_STRIDE]
109 str s0, [x0, #3 * FDEC_STRIDE]
// x264_predict_4x4_dc_top_neon
// Visible steps: x1 = row above the block, rounded divide by 4 (rshrn #2) of a
// 16-bit vector, and stores of the 4-byte result to rows 1..3.
// NOTE(review): the load/summation of the top row, the row-0 store and the
// ret/endfunc terminator fall in gaps of the embedded numbering and are not
// visible here.
113 function x264_predict_4x4_dc_top_neon, export=1
114 sub x1, x0, #FDEC_STRIDE
// (x + 2) >> 2, narrowed to bytes
118 rshrn v0.8b, v0.8h, #2
120 str s0, [x0, #1 * FDEC_STRIDE]
121 str s0, [x0, #2 * FDEC_STRIDE]
122 str s0, [x0, #3 * FDEC_STRIDE]
// x264_predict_4x4_ddr_neon
// Loads 8 bytes starting one pixel to the left of the row above x0, then
// broadcast-loads four more bytes while stepping x1 by x7, shifting each into
// the front of the vector with ext. A 3-tap filter (a + 2*b + c + 2) >> 2 is
// applied and 4-byte windows of the result are stored to successive rows.
// x7 is the post-increment of the edge loads; its initialisation is not visible
// in this view (the existing per-line comments imply FDEC_STRIDE -- TODO confirm).
// NOTE(review): the final row store and the ret/endfunc terminator are not
// visible here (gaps in the embedded numbering).
127 function x264_predict_4x4_ddr_neon, export=1
128 sub x1, x0, #FDEC_STRIDE+1
130 ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
131 ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
132 ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
133 ext v0.8b, v1.8b, v0.8b, #7
134 ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
135 ext v0.8b, v2.8b, v0.8b, #7 // a
136 ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
137 ext v1.8b, v3.8b, v0.8b, #7 // b
138 ext v2.8b, v4.8b, v1.8b, #7 // c
// widen: v0 = a + b, v1 = b + c; their sum is a + 2*b + c
139 uaddl v0.8h, v0.8b, v1.8b
140 uaddl v1.8h, v1.8b, v2.8b
141 add v0.8h, v0.8h, v1.8h
// (a + 2*b + c + 2) >> 2, narrowed to bytes
142 rshrn v0.8b, v0.8h, #2
// byte-rotated copies: each successive row uses a window one byte earlier
144 ext v3.8b, v0.8b, v0.8b, #3
145 ext v2.8b, v0.8b, v0.8b, #2
146 ext v1.8b, v0.8b, v0.8b, #1
// post-indexed stores advance x0 by one row each
148 str s3, [x0], #FDEC_STRIDE
149 str s2, [x0], #FDEC_STRIDE
150 str s1, [x0], #FDEC_STRIDE
// x264_predict_4x4_ddl_neon
// Moves x0 back one row, loads 8 bytes from the row above (post-incrementing x0
// by x7), applies a 3-tap filter using byte-shifted copies, and stores 4-byte
// windows of the result to successive rows.
// uhadd followed by urhadd evaluates (a + 2*b + c + 2) >> 2 without widening.
// NOTE(review): the initialisation of x7 and of v3 (used as the shift-in source
// at line 161), the last row store and the ret/endfunc terminator fall in gaps
// of the embedded numbering and are not visible here.
155 function x264_predict_4x4_ddl_neon, export=1
156 sub x0, x0, #FDEC_STRIDE
158 ld1 {v0.8b}, [x0], x7
// v1 = row shifted by 1 byte, v2 = row shifted by 2 bytes (tail taken from v3)
160 ext v1.8b, v0.8b, v0.8b, #1
161 ext v2.8b, v0.8b, v3.8b, #2
// v0 = (v0 + v2) >> 1, then v0 = (v0 + v1 + 1) >> 1
162 uhadd v0.8b, v0.8b, v2.8b
163 urhadd v0.8b, v0.8b, v1.8b
// each following row uses the window one byte further along
164 str s0, [x0], #FDEC_STRIDE
165 ext v1.8b, v0.8b, v0.8b, #1
166 ext v2.8b, v0.8b, v0.8b, #2
167 str s1, [x0], #FDEC_STRIDE
168 ext v3.8b, v0.8b, v0.8b, #3
169 str s2, [x0], #FDEC_STRIDE
// x264_predict_8x8_dc_neon
// Visible steps: load 16 bytes from x1 (post-increment 16), rotate the vector by
// 7 bytes, add two 16-bit vectors, divide by 16 with rounding (rshrn #4) and
// store 8 bytes to x0, advancing x0 by x7.
// NOTE(review): most of this function falls in gaps of the embedded numbering
// (x1 adjustment, second load, horizontal sum, broadcast, x7 setup, the
// remaining row stores, ret/endfunc) -- presumably x1 is the edge buffer and
// x7 the row stride; confirm against the complete file.
174 function x264_predict_8x8_dc_neon, export=1
176 ld1 {v0.16b}, [x1], #16
178 ext v0.16b, v0.16b, v0.16b, #7
181 add v0.8h, v0.8h, v1.8h
// (x + 8) >> 4, narrowed to bytes
183 rshrn v0.8b, v0.8h, #4
185 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_h_neon
// The visible lines store the low 8 bytes of v0..v7 to eight successive rows,
// advancing x0 by x7 after each store.
// NOTE(review): the instructions that set x7 and fill v0..v7 (presumably one
// broadcast left-neighbour pixel per row) and the ret/endfunc terminator fall
// in gaps of the embedded numbering and are not visible here.
190 function x264_predict_8x8_h_neon, export=1
195 st1 {v0.8b}, [x0], x7
197 st1 {v1.8b}, [x0], x7
199 st1 {v2.8b}, [x0], x7
201 st1 {v3.8b}, [x0], x7
203 st1 {v4.8b}, [x0], x7
205 st1 {v5.8b}, [x0], x7
207 st1 {v6.8b}, [x0], x7
208 st1 {v7.8b}, [x0], x7
// x264_predict_8x8_v_neon
// Only one instruction is visible: an 8-byte store of v0 to x0 with x0 advanced
// by x7.
// NOTE(review): the load of v0, the setup of x7, any repetition of the store
// (presumably once per row) and the ret/endfunc terminator are not visible in
// this view.
212 function x264_predict_8x8_v_neon, export=1
217 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddl_neon
// Applies a 3-tap filter to the 16-byte vector in v0 using its one-byte-earlier
// and one-byte-later neighbours, then stores eight 8-byte windows of the result
// (starting at byte offsets 1..8) to successive rows, advancing x0 by x7.
// uhadd + urhadd evaluates (prev + 2*cur + next + 2) >> 2 without widening.
// NOTE(review): the loads that fill v0, v2 and v3, the setup of x7 and the
// ret/endfunc terminator fall in gaps of the embedded numbering.
222 function x264_predict_8x8_ddl_neon, export=1
// v4 = v0 shifted one byte later, with v3[15] entering at lane 0
228 ext v4.16b, v3.16b, v0.16b, #15
// v2 = v0 shifted one byte earlier, with v2[0] entering at lane 15
229 ext v2.16b, v0.16b, v2.16b, #1
230 uhadd v4.16b, v4.16b, v2.16b
231 urhadd v0.16b, v0.16b, v4.16b
// row r receives filtered bytes r+1 .. r+8; exts are interleaved with stores
232 ext v1.16b, v0.16b, v0.16b, #1
233 ext v2.16b, v0.16b, v0.16b, #2
234 st1 {v1.8b}, [x0], x7
235 ext v3.16b, v0.16b, v0.16b, #3
236 st1 {v2.8b}, [x0], x7
237 ext v4.16b, v0.16b, v0.16b, #4
238 st1 {v3.8b}, [x0], x7
239 ext v5.16b, v0.16b, v0.16b, #5
240 st1 {v4.8b}, [x0], x7
241 ext v6.16b, v0.16b, v0.16b, #6
242 st1 {v5.8b}, [x0], x7
243 ext v7.16b, v0.16b, v0.16b, #7
244 st1 {v6.8b}, [x0], x7
245 ext v0.16b, v0.16b, v0.16b, #8
246 st1 {v7.8b}, [x0], x7
247 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddr_neon
// Loads 32 bytes from x1, takes the 16-byte windows at byte offsets 7, 8 and 9,
// and applies the 3-tap filter (w7 + 2*w8 + w9 + 2) >> 2 via uhadd + urhadd.
// The rows are written bottom-up: x0 is moved to row 7 and x7 is set to
// -FDEC_STRIDE, so row 7 receives filtered bytes 0..7, row 6 bytes 1..8, ...,
// row 0 bytes 7..14.
// Clobbers x0, x7, v0-v7.
// NOTE(review): x1 is presumably the edge buffer argument (TODO confirm); the
// ret/endfunc terminator is not visible in this view.
251 function x264_predict_8x8_ddr_neon, export=1
252 ld1 {v0.16b,v1.16b}, [x1]
253 ext v2.16b, v0.16b, v1.16b, #7
254 ext v4.16b, v0.16b, v1.16b, #9
255 ext v3.16b, v0.16b, v1.16b, #8
// v2 = (w7 + w9) >> 1, v7 = (w8 + v2 + 1) >> 1
257 uhadd v2.16b, v2.16b, v4.16b
258 urhadd v7.16b, v3.16b, v2.16b
// start at the last row and walk upwards
260 add x0, x0, #7*FDEC_STRIDE
261 mov x7, #-1*FDEC_STRIDE
263 ext v6.16b, v7.16b, v7.16b, #1
264 st1 {v7.8b}, [x0], x7
265 ext v5.16b, v7.16b, v7.16b, #2
266 st1 {v6.8b}, [x0], x7
267 ext v4.16b, v7.16b, v7.16b, #3
268 st1 {v5.8b}, [x0], x7
269 ext v3.16b, v7.16b, v7.16b, #4
270 st1 {v4.8b}, [x0], x7
271 ext v2.16b, v7.16b, v7.16b, #5
272 st1 {v3.8b}, [x0], x7
273 ext v1.16b, v7.16b, v7.16b, #6
274 st1 {v2.8b}, [x0], x7
275 ext v0.16b, v7.16b, v7.16b, #7
276 st1 {v1.8b}, [x0], x7
277 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_vl_neon
// From the 16-byte vector in v0 and its one-byte-shifted neighbours, computes
//   v3 = 2-tap rounded average (cur + next + 1) >> 1
//   v0 = 3-tap filter (prev + 2*cur + next + 2) >> 2 (uhadd + urhadd)
// and stores rows alternately from v3 (even rows) and v0 (odd rows), the window
// moving one byte along every two rows.
// NOTE(review): the loads that fill v0, v1 and v2, the setup of x7 and the
// ret/endfunc terminator fall in gaps of the embedded numbering.
281 function x264_predict_8x8_vl_neon, export=1
// v1 = previous-byte neighbour, v2 = next-byte neighbour
286 ext v1.16b, v1.16b, v0.16b, #15
287 ext v2.16b, v0.16b, v2.16b, #1
289 uhadd v1.16b, v1.16b, v2.16b
290 urhadd v3.16b, v0.16b, v2.16b
292 urhadd v0.16b, v0.16b, v1.16b
// row 0 = v3[0..7], row 1 = v0[1..8], row 2 = v3[1..8], row 3 = v0[2..9], ...
294 ext v4.16b, v0.16b, v0.16b, #1
295 st1 {v3.8b}, [x0], x7
296 ext v5.16b, v3.16b, v3.16b, #1
297 st1 {v4.8b}, [x0], x7
298 ext v6.16b, v0.16b, v0.16b, #2
299 st1 {v5.8b}, [x0], x7
300 ext v7.16b, v3.16b, v3.16b, #2
301 st1 {v6.8b}, [x0], x7
302 ext v4.16b, v0.16b, v0.16b, #3
303 st1 {v7.8b}, [x0], x7
304 ext v5.16b, v3.16b, v3.16b, #3
305 st1 {v4.8b}, [x0], x7
306 ext v6.16b, v0.16b, v0.16b, #4
307 st1 {v5.8b}, [x0], x7
308 st1 {v6.8b}, [x0], x7
// x264_predict_8x8_vr_neon
// From the 16-byte vector in v2 and copies of it rotated by 14 and 15 bytes,
// computes a 2-tap rounded average (new v2) and a 3-tap filter (new v0, via
// uhadd + urhadd). Rows 0 and 1 are the upper halves of those two vectors; later
// row pairs are built by shifting in de-interleaved (odd / even) bytes of the
// low half of the 3-tap result, one more byte per pair.
// NOTE(review): the load that fills v2, the setup of x7 and the ret/endfunc
// terminator fall in gaps of the embedded numbering.
312 function x264_predict_8x8_vr_neon, export=1
317 ext v1.16b, v2.16b, v2.16b, #14
318 ext v0.16b, v2.16b, v2.16b, #15
// v3 = (v2 + v1) >> 1; v2 = 2-tap average; v0 = 3-tap filter
320 uhadd v3.16b, v2.16b, v1.16b
321 urhadd v2.16b, v2.16b, v0.16b
322 urhadd v0.16b, v0.16b, v3.16b
// v1 = upper half of the 2-tap vector; v2/v3 = even/odd bytes of v0's low half;
// v0 = upper half of the 3-tap vector
324 ext v1.16b, v2.16b, v2.16b, #8
325 uzp1 v2.8b, v0.8b, v0.8b
326 uzp2 v3.8b, v0.8b, v0.8b
327 ext v0.16b, v0.16b, v0.16b, #8
329 st1 {v1.8b}, [x0], x7
330 st1 {v0.8b}, [x0], x7
// shift in one more byte from v3 / v2 for each following pair of rows
331 ext v4.8b, v3.8b, v1.8b, #7
332 ext v5.8b, v2.8b, v0.8b, #7
333 st1 {v4.8b}, [x0], x7
334 st1 {v5.8b}, [x0], x7
335 ext v6.8b, v3.8b, v1.8b, #6
336 ext v7.8b, v2.8b, v0.8b, #6
337 st1 {v6.8b}, [x0], x7
338 st1 {v7.8b}, [x0], x7
339 ext v1.8b, v3.8b, v1.8b, #5
340 ext v0.8b, v2.8b, v0.8b, #5
341 st1 {v1.8b}, [x0], x7
342 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_hd_neon
// From the 16-byte vector in v1 and copies rotated by 1 and 2 bytes, computes a
// 2-tap rounded average (v4) and a 3-tap filter (v0, via uhadd + urhadd). The
// low halves of both are byte-interleaved into v16/v17, and each row is an
// 8-byte window taken 2 bytes further back than the previous one.
// NOTE(review): the load that fills v1, the setup of x7 and the ret/endfunc
// terminator fall in gaps of the embedded numbering.
346 function x264_predict_8x8_hd_neon, export=1
351 ext v3.16b, v1.16b, v1.16b, #1
352 ext v2.16b, v1.16b, v1.16b, #2
// v4 = (v1 + v3 + 1) >> 1
354 urhadd v4.16b, v1.16b, v3.16b
// v0 = ((v1 + v2) >> 1 + v3 + 1) >> 1
356 uhadd v1.16b, v1.16b, v2.16b
357 urhadd v0.16b, v1.16b, v3.16b
// interleave 2-tap and 3-tap bytes; v7 = upper half of the 3-tap vector
359 zip1 v16.8b, v4.8b, v0.8b
360 zip2 v17.8b, v4.8b, v0.8b
361 ext v7.16b, v0.16b, v0.16b, #8
// rows 0-2: windows spanning v17:v7; row 3: v17 itself
363 ext v0.8b, v17.8b, v7.8b, #6
364 ext v1.8b, v17.8b, v7.8b, #4
365 st1 {v0.8b}, [x0], x7
366 ext v2.8b, v17.8b, v7.8b, #2
367 st1 {v1.8b}, [x0], x7
368 st1 {v2.8b}, [x0], x7
// rows 4-6: windows spanning v16:v17; row 7: v16 itself
369 ext v3.8b, v16.8b, v17.8b, #6
370 st1 {v17.8b}, [x0], x7
371 ext v4.8b, v16.8b, v17.8b, #4
372 st1 {v3.8b}, [x0], x7
373 ext v5.8b, v16.8b, v17.8b, #2
374 st1 {v4.8b}, [x0], x7
375 st1 {v5.8b}, [x0], x7
376 st1 {v16.8b}, [x0], x7
// x264_predict_8x8_hu_neon
// From the 8-byte vector in v7 and windows shifted by 1 and 2 bytes (tail taken
// from v6), computes a 2-tap rounded average (v0) and a 3-tap filter (v1, via
// uhadd + urhadd), byte-interleaves them into v16/v17, and stores rows as
// 8-byte windows advancing 2 bytes per row (v16:v17 for rows 0-3, v17:v18 for
// the following rows).
// NOTE(review): the loads/reversal that fill v6, v7 and v18, the setup of x7,
// the last row store and the ret/endfunc terminator fall in gaps of the
// embedded numbering.
381 function x264_predict_8x8_hu_neon, export=1
388 ext v4.8b, v7.8b, v6.8b, #2
389 ext v2.8b, v7.8b, v6.8b, #1
// v5 = (v7 + v4) >> 1; v0 = 2-tap average; v1 = 3-tap filter
391 uhadd v5.8b, v7.8b, v4.8b
392 urhadd v0.8b, v2.8b, v7.8b
393 urhadd v1.8b, v5.8b, v2.8b
395 zip1 v16.8b, v0.8b, v1.8b
396 zip2 v17.8b, v0.8b, v1.8b
400 ext v0.8b, v16.8b, v17.8b, #2
401 ext v1.8b, v16.8b, v17.8b, #4
402 ext v2.8b, v16.8b, v17.8b, #6
403 st1 {v16.8b}, [x0], x7
404 st1 {v0.8b}, [x0], x7
405 st1 {v1.8b}, [x0], x7
406 st1 {v2.8b}, [x0], x7
408 ext v4.8b, v17.8b, v18.8b, #2
409 ext v5.8b, v17.8b, v18.8b, #4
410 ext v6.8b, v17.8b, v18.8b, #6
411 st1 {v17.8b}, [x0], x7
412 st1 {v4.8b}, [x0], x7
413 st1 {v5.8b}, [x0], x7
// x264_predict_8x8c_dc_top_neon
// Visible steps: x2 = row above the block, a pairwise add of 16-bit lanes, a
// rounded divide by 4 (rshrn #2), and a `transpose` of 32-bit lanes.
// `transpose` is a macro defined outside this view.
// NOTE(review): the top-row load, the widening sum, the broadcast, the row
// stores (or branch to a shared store tail) and the terminator fall in gaps of
// the embedded numbering.
419 function x264_predict_8x8c_dc_top_neon, export=1
420 sub x2, x0, #FDEC_STRIDE
424 addp v0.4h, v0.4h, v0.4h
// (x + 2) >> 2, narrowed to bytes
425 rshrn v0.8b, v0.8h, #2
428 transpose v0.2s, v1.2s, v2.2s, v3.2s
// x264_predict_8x8c_dc_left_neon
// Reads the left-neighbour byte (column -1) of rows 0..7 into w2..w9, then
// divides two 16-bit vectors by 4 with rounding (rshrn #2).
// NOTE(review): the additions that sum rows 0-3 and rows 4-7, the broadcasts
// into v0/v1, the row stores (or branch to a shared store tail) and the
// terminator fall in gaps of the embedded numbering.
432 function x264_predict_8x8c_dc_left_neon, export=1
// rows 0..3
433 ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
434 ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
435 ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
436 ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
// rows 4..7
440 ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
441 ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
442 ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
443 ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
// (x + 2) >> 2 for each half
450 rshrn v0.8b, v0.8h, #2
451 rshrn v1.8b, v1.8h, #2
// x264_predict_8x8c_dc_neon
// Reads the eight left-neighbour bytes, packs partial sums into x10 as 16-bit
// fields, combines them with pairwise sums of the top row (s0, s1 per the
// existing comments) and derives rounded averages: /8 where both a top and a
// left sum contribute, /4 where only one does. Four row pointers (rows 0, 2, 4
// and 6) are then used for the stores, each advanced by x1.
// NOTE(review): the top-row load, the scalar adds between the byte loads and
// the packing, the setup of x1, half of the row stores and the ret/endfunc
// terminator fall in gaps of the embedded numbering.
455 function x264_predict_8x8c_dc_neon, export=1
457 sub x2, x0, #FDEC_STRIDE
// left column, rows 0..3
458 ldrb w10, [x0, #0 * FDEC_STRIDE - 1]
459 ldrb w11, [x0, #1 * FDEC_STRIDE - 1]
460 ldrb w12, [x0, #2 * FDEC_STRIDE - 1]
461 ldrb w13, [x0, #3 * FDEC_STRIDE - 1]
// left column, rows 4..7
463 ldrb w4, [x0, #4 * FDEC_STRIDE - 1]
464 ldrb w5, [x0, #5 * FDEC_STRIDE - 1]
466 ldrb w6, [x0, #6 * FDEC_STRIDE - 1]
467 ldrb w7, [x0, #7 * FDEC_STRIDE - 1]
// combine w12 (shifted to bits 16-31) with w10, likewise w6 with w4, then put
// the second pair in the upper 32 bits of x10
470 add w10, w10, w12, lsl #16
471 add w4, w4, w6, lsl #16
473 add x10, x10, x4, lsl #32
474 uaddlp v0.4h, v0.8b // s0, s1
475 mov v1.d[0], x10 // s2, s3
476 add v3.4h, v0.4h, v1.4h
477 addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3
478 addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
479 uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3
// duplicate the low 64 bits into both halves
480 uzp1 v1.2d, v1.2d, v1.2d
481 uzp1 v0.2d, v0.2d, v0.2d
// (x + 4) >> 3 for two-sum averages, (x + 2) >> 2 for single-sum averages
482 rshrn v3.8b, v1.8h, #3
483 rshrn v2.8b, v0.8h, #2
484 uzp1 v0.8b, v3.8b, v2.8b
485 uzp2 v1.8b, v2.8b, v3.8b
// x0/x2 write v0 (rows 0.. and 2..), x4/x5 write v1 (rows 4.. and 6..)
487 add x2, x0, #2 * FDEC_STRIDE
488 add x4, x0, #4 * FDEC_STRIDE
489 add x5, x0, #6 * FDEC_STRIDE
490 st1 {v0.8b}, [x0], x1
491 st1 {v0.8b}, [x2], x1
494 st1 {v1.8b}, [x4], x1
495 st1 {v1.8b}, [x5], x1
// x264_predict_8x8c_h_neon
// Visible steps: broadcast one byte from [x1] into all 8 lanes (ld1r) for two
// consecutive rows, stepping x1 by x7, and store each 8-byte vector to the
// matching row at x0, stepping x0 by x7.
// NOTE(review): the setup of x1 (presumably x0 - 1) and x7 (presumably
// FDEC_STRIDE), the repetition for the remaining rows and the ret/endfunc
// terminator fall in gaps of the embedded numbering.
501 function x264_predict_8x8c_h_neon, export=1
505 ld1r {v0.8b}, [x1], x7
506 ld1r {v1.8b}, [x1], x7
507 st1 {v0.8b}, [x0], x7
508 st1 {v1.8b}, [x0], x7
// x264_predict_8x8c_v_aarch64
// Loads the 8 bytes located one FDEC_STRIDE before x0 (the row above) and
// stores them to rows 0..7 via an .irp expansion. Clobbers x1.
// NOTE(review): the closing .endr and the ret/endfunc terminator are not
// visible in this view.
513 function x264_predict_8x8c_v_aarch64, export=1
514 ldr x1, [x0, #-FDEC_STRIDE]
// expands to eight stores, one per row index \c
515 .irp c, 0,1,2,3,4,5,6,7
516 str x1, [x0, #\c * FDEC_STRIDE]
// x264_predict_8x8c_p_neon
// Plane-style prediction: edge-pixel differences are multiplied by the weights
// in v7, reduced to the gradients b and c (see the existing comments), and each
// output row is pix + x*b, advanced by c per row, shifted right by 5 and
// saturated to an unsigned byte (sqshrun).
// NOTE(review): many instructions fall in gaps of the embedded numbering -- the
// setup of x1/x2, the weight load into v7 (presumably p8weight), parts of the
// reduction, the row loop control and the ret/endfunc terminator. Comments
// cover only the visible lines; confirm against the complete file.
521 function x264_predict_8x8c_p_neon, export=1
// x3 = row above the block
522 sub x3, x0, #FDEC_STRIDE
527 ld1 {v2.s}[0], [x2], x1
// gather 4 column bytes into lanes 4-7 of v0, then 4 into lanes 0-3 of v3
528 ldcol.8 v0, x3, x1, 4, hi=1
530 ldcol.8 v3, x3, x1, 4
533 uaddl v4.8h, v2.8b, v3.8b
535 trn1 v2.2s, v2.2s, v3.2s
// widened differences times per-lane weights
537 usubl v2.8h, v2.8b, v0.8b
538 mul v2.8h, v2.8h, v7.8h
541 addp v2.4s, v2.4s, v2.4s
543 add v2.2s, v2.2s, v3.2s
544 rshrn v5.4h, v2.4s, #5 // b, c, x, x
545 addp v2.4h, v5.4h, v5.4h
547 sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
549 add v4.4h, v4.4h, v0.4h
550 shl v2.4h, v4.4h, #4 // a
551 sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
552 ext v0.16b, v0.16b, v0.16b, #14
553 sub v6.4h, v5.4h, v3.4h
555 mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
556 dup v1.8h, v2.h[0] // pix
557 dup v2.8h, v5.h[1] // c
558 add v1.8h, v1.8h, v0.8h // pix + x*b
// per row: narrow with >> 5 and unsigned saturation, then step by c
562 sqshrun v0.8b, v1.8h, #5
563 add v1.8h, v1.8h, v2.8h
564 st1 {v0.8b}, [x0], x1
// loadsum4 wd, t1, t2, t3, x, idx
// Loads the left-neighbour byte (column -1) of rows idx .. idx+3, relative to
// base register \x with row pitch FDEC_STRIDE, into \wd, \t1, \t2 and \t3.
// NOTE(review): the macro name suggests the four values are then summed into
// \wd, but those adds and the closing .endm are not visible in this view.
570 .macro loadsum4 wd, t1, t2, t3, x, idx
571 ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
572 ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
573 ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
574 ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
// x264_predict_8x16c_h_neon
// Uses two interleaved pointer pairs with a step of 2*FDEC_STRIDE: x2/x3 read
// one byte per row (broadcast to 8 lanes by ld1r), x0/x1 write the matching
// 8-byte rows. x3 = left neighbour of row 1, x1 = row 1; x0 serves even rows.
// NOTE(review): the setup of x2 (presumably x0 - 1), the repetition of the
// visible four-row group for all 16 rows and the ret/endfunc terminator fall in
// gaps of the embedded numbering.
580 function x264_predict_8x16c_h_neon, export=1
582 add x3, x0, #FDEC_STRIDE - 1
583 mov x7, #2 * FDEC_STRIDE
584 add x1, x0, #FDEC_STRIDE
586 ld1r {v0.8b}, [x2], x7
587 ld1r {v1.8b}, [x3], x7
588 ld1r {v2.8b}, [x2], x7
589 ld1r {v3.8b}, [x3], x7
590 st1 {v0.8b}, [x0], x7
591 st1 {v1.8b}, [x1], x7
592 st1 {v2.8b}, [x0], x7
593 st1 {v3.8b}, [x1], x7
// x264_predict_8x16c_v_neon
// Loads the 8 bytes of the row above x0 into v0; the post-increment of
// 2*FDEC_STRIDE leaves x1 pointing at row 1. x0 (even rows) and x1 (odd rows)
// are then used as store pointers, each stepping by 2*FDEC_STRIDE.
// NOTE(review): the repetition of the store pair for all 16 rows (embedded line
// 602 is missing, presumably an .irp/.rept) and the ret/endfunc terminator are
// not visible in this view.
598 function x264_predict_8x16c_v_neon, export=1
599 sub x1, x0, #FDEC_STRIDE
600 mov x2, #2 * FDEC_STRIDE
601 ld1 {v0.8b}, [x1], x2
603 st1 {v0.8b}, [x0], x2
604 st1 {v0.8b}, [x1], x2
// x264_predict_8x16c_p_neon
// Plane-style prediction for a block 8 pixels wide: weighted edge differences
// are reduced to H and V, scaled to b = (17*H + 16) >> 5 and c = (5*V + 32) >> 6
// (shl/add then rshrn, per the existing comments), the start value
// a - 3b - 7c is formed, and each row is pix + {0..7}*b, advanced by c per row,
// narrowed with a rounding >> 5 and unsigned saturation (sqrshrun).
// NOTE(review): many instructions fall in gaps of the embedded numbering -- the
// setup of x1/x2, the edge/column loads into v0, v1, v3, the weight load into
// v17 (presumably p16weight), part of the H reduction, the row loop control and
// the ret/endfunc terminator. Comments cover only the visible lines.
609 function x264_predict_8x16c_p_neon, export=1
// x3 = row above the block
612 sub x3, x0, #FDEC_STRIDE
618 ld1 {v2.8b}, [x2], x1
622 ext v4.8b, v2.8b, v2.8b, #3
623 ext v5.8b, v3.8b, v3.8b, #7
627 uaddl v4.8h, v5.8b, v4.8b // a * 1/16
629 usubl v2.8h, v2.8b, v0.8b
630 mul v2.8h, v2.8h, v17.8h
632 addp v2.4s, v2.4s, v2.4s // H
634 usubl v3.8h, v3.8b, v1.8b
635 mul v3.8h, v3.8h, v17.8h
637 addp v3.4s, v3.4s, v3.4s
638 addp v3.4s, v3.4s, v3.4s // V
// rotate the weight vector by one halfword (result is used as the x multipliers)
640 ext v17.16b, v17.16b, v17.16b, #14
642 shl v4.4h, v4.4h, #4 // a
643 shl v6.2s, v2.2s, #4 // 16 * H
644 shl v7.2s, v3.2s, #2 // 4 * V
645 add v2.2s, v2.2s, v6.2s // 17 * H
646 add v3.2s, v3.2s, v7.2s // 5 * V
647 rshrn v2.4h, v2.4s, #5 // b
648 rshrn v3.4h, v3.4s, #6 // c
652 sub v4.4h, v4.4h, v2.4h // a - b
653 shl v6.4h, v2.4h, #1 // 2 * b
654 add v4.4h, v4.4h, v3.4h // a - b + c
655 shl v7.4h, v3.4h, #3 // 8 * c
656 sub v4.4h, v4.4h, v6.4h // a - 3b + c
657 sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
659 mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
660 dup v1.8h, v4.h[0] // i00
661 dup v2.8h, v3.h[0] // c
662 add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
// two rows per visible group: narrow/saturate, step by c, store
666 sqrshrun v4.8b, v1.8h, #5
667 add v1.8h, v1.8h, v2.8h
668 sqrshrun v5.8b, v1.8h, #5
669 st1 {v4.8b}, [x0], x1
670 add v1.8h, v1.8h, v2.8h
671 st1 {v5.8b}, [x0], x1
// x264_predict_8x16c_dc_neon
// Gathers the sixteen left-neighbour bytes in four groups of four rows
// (loadsum4), broadcasts the top-row sums s0/s1 (existing comments), combines
// the sums per 4x4 quadrant, divides by 8 with rounding (rshrn #3) and writes
// the four 4-row groups through the pointers x0, x11, x12 and x13, each
// advanced by x1.
// NOTE(review): the top-row load, the moves of the left sums into v22-v25, the
// setup of x1, the repetition of the store group and the ret/endfunc terminator
// fall in gaps of the embedded numbering; the roles of v22-v25 are inferred
// from their use only.
676 function x264_predict_8x16c_dc_neon, export=1
// x10 = row above the block
678 sub x10, x0, #FDEC_STRIDE
679 loadsum4 w2, w3, w4, w5, x0, 0
681 loadsum4 w6, w7, w8, w9, x0, 4
685 loadsum4 w2, w3, w4, w5, x0, 8
686 addp v6.4h, v6.4h, v6.4h // s0, s1
687 loadsum4 w6, w7, w8, w9, x0, 12
688 dup v20.8h, v6.h[0] // s0
689 dup v21.8h, v6.h[1] // s1
// each ext takes the upper half of its first operand and the lower half of v21
693 ext v16.16b, v20.16b, v21.16b, #8
694 ext v17.16b, v22.16b, v21.16b, #8
695 ext v1.16b, v23.16b, v21.16b, #8
696 ext v2.16b, v24.16b, v21.16b, #8
697 ext v3.16b, v25.16b, v21.16b, #8
699 add v0.8h, v16.8h, v17.8h
700 add v1.8h, v1.8h, v23.8h
701 add v2.8h, v2.8h, v24.8h
702 add v3.8h, v3.8h, v25.8h
// (x + 4) >> 3, narrowed to bytes
704 rshrn v0.8b, v0.8h, #3
705 rshrn v1.8b, v1.8h, #3
706 rshrn v2.8b, v2.8h, #3
707 rshrn v3.8b, v3.8h, #3
// one write pointer per group of four rows
709 add x11, x0, #4 * FDEC_STRIDE
710 add x12, x0, #8 * FDEC_STRIDE
711 add x13, x0, #12 * FDEC_STRIDE
713 st1 {v0.8b}, [x0], x1
714 st1 {v1.8b}, [x11], x1
715 st1 {v2.8b}, [x12], x1
716 st1 {v3.8b}, [x13], x1
// x264_predict_8x16c_dc_left_neon
// Reads the left-neighbour byte (column -1) of rows 0..15 in groups of four,
// divides each group's 16-bit vector by 4 with rounding (rshrn #2) into
// v0..v3, and stores 8-byte rows at x0 (advanced by x1), interleaving the
// stores of the first group with the arithmetic for the later groups.
// NOTE(review): the scalar adds that sum each group, the broadcasts into the
// vector registers, the setup of x1, and the .irp/.rept that wraps the final
// `st1 {v\idx...}` store, as well as the ret/endfunc terminator, fall in gaps
// of the embedded numbering.
721 function x264_predict_8x16c_dc_left_neon, export=1
// rows 0..3
723 ldrb w2, [x0, # 0 * FDEC_STRIDE - 1]
724 ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
725 ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
726 ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
// rows 4..7
729 ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
731 ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
733 ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
734 ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
737 rshrn v0.8b, v0.8h, #2
// rows 8..11
740 ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
741 ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
743 ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
744 ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
747 rshrn v1.8b, v1.8h, #2
// rows 12..15 (w2..w5 are reused)
750 ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
751 ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
753 ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
754 ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
757 rshrn v2.8b, v2.8h, #2
// first four rows are written from v0
759 st1 {v0.8b}, [x0], x1
760 st1 {v0.8b}, [x0], x1
762 st1 {v0.8b}, [x0], x1
764 st1 {v0.8b}, [x0], x1
765 rshrn v3.8b, v3.8h, #2
// \idx is a macro/.irp variable whose enclosing directive is not visible here
769 st1 {v\idx\().8b}, [x0], x1
// x264_predict_8x16c_dc_top_neon
// Visible steps: x2 = row above the block, a pairwise add of 16-bit lanes, a
// rounded divide by 4 (rshrn #2) into v4, an ext that joins the upper 4 bytes
// of v0 with the lower 4 bytes of v1, and an 8-byte row store at x0 advanced
// by x1.
// NOTE(review): the top-row load, the widening sum, the broadcasts that fill
// v0/v1 from v4, the setup of x1, the repetition of the store for all rows and
// the ret/endfunc terminator fall in gaps of the embedded numbering.
775 function x264_predict_8x16c_dc_top_neon, export=1
776 sub x2, x0, #FDEC_STRIDE
780 addp v0.4h, v0.4h, v0.4h
781 rshrn v4.8b, v0.8h, #2
784 ext v0.8b, v0.8b, v1.8b, #4
786 st1 {v0.8b}, [x0], x1
// x264_predict_16x16_dc_top_neon
// Visible steps: x2 = row above the block and a rounded divide by 16
// (rshrn #4) of a 16-bit vector.
// NOTE(review): the top-row load, the horizontal sum, the broadcast and the
// row stores (or branch to a shared store tail) fall in gaps of the embedded
// numbering and are not visible here.
792 function x264_predict_16x16_dc_top_neon, export=1
793 sub x2, x0, #FDEC_STRIDE
// (x + 8) >> 4, narrowed to bytes
797 rshrn v0.8b, v0.8h, #4
// x264_predict_16x16_dc_left_neon
// Only one instruction is visible: a rounded divide by 16 (rshrn #4) of a
// 16-bit vector.
// NOTE(review): the left-column gather, the horizontal sum, the broadcast and
// the row stores (or branch to a shared store tail) fall in gaps of the
// embedded numbering and are not visible here.
802 function x264_predict_16x16_dc_left_neon, export=1
// (x + 8) >> 4, narrowed to bytes
807 rshrn v0.8b, v0.8h, #4
// x264_predict_16x16_dc_neon
// Visible steps: x3 = row above the block, add two 16-bit vectors, divide by 32
// with rounding (rshrn #5) and store a 16-byte row at x0, advanced by x1.
// NOTE(review): the top-row load, the left-column gather, the horizontal sums
// (presumably what v0 and v1 hold before the add), the broadcast, the setup of
// x1, the repetition of the store for all rows and the ret/endfunc terminator
// fall in gaps of the embedded numbering.
812 function x264_predict_16x16_dc_neon, export=1
813 sub x3, x0, #FDEC_STRIDE
820 add v0.4h, v0.4h, v1.4h
// (x + 16) >> 5, narrowed to bytes
821 rshrn v0.8b, v0.8h, #5
825 st1 {v0.16b}, [x0], x1
// x264_predict_16x16_h_neon
// Visible steps: broadcast one byte from [x1] into all 16 lanes (ld1r) for two
// consecutive rows, stepping x1 by x7, and store each 16-byte vector to the
// matching row at x0, stepping x0 by x7.
// NOTE(review): the setup of x1 (presumably x0 - 1) and x7 (presumably
// FDEC_STRIDE), the repetition for the remaining rows and the ret/endfunc
// terminator fall in gaps of the embedded numbering.
830 function x264_predict_16x16_h_neon, export=1
834 ld1r {v0.16b}, [x1], x7
835 ld1r {v1.16b}, [x1], x7
836 st1 {v0.16b}, [x0], x7
837 st1 {v1.16b}, [x0], x7
// x264_predict_16x16_v_neon
// Moves x0 back one row, loads the 16 bytes of the row above with a
// post-increment of x7, and stores them at x0, again stepping by x7.
// NOTE(review): the setup of x7 (presumably FDEC_STRIDE, which would return x0
// to row 0 after the load), the repetition of the store for all rows and the
// ret/endfunc terminator fall in gaps of the embedded numbering.
842 function x264_predict_16x16_v_neon, export=1
843 sub x0, x0, #FDEC_STRIDE
845 ld1 {v0.16b}, [x0], x7
847 st1 {v0.16b}, [x0], x7
// x264_predict_16x16_p_neon
// Plane-style prediction for a block 16 pixels wide: weighted edge differences
// (weights in v7) are reduced to the gradients b and c with a rounding >> 6,
// the start value a - 7*(b + c) + 16 is formed (existing comments), and each
// row is computed as two 8-lane halves (v1 for x = 0..7, v3 for x = 8..15),
// advanced by c per row, narrowed with >> 5 and unsigned saturation
// (sqshrun/sqshrun2) into one 16-byte store.
// NOTE(review): many instructions fall in gaps of the embedded numbering -- the
// setup of x1/x2, the edge/column loads into v0, v1, v3, the weight load into
// v7 (presumably p16weight), parts of the reduction, the setup of v3 before
// line 893, the row loop control and the ret/endfunc terminator. Comments cover
// only the visible lines.
852 function x264_predict_16x16_p_neon, export=1
// x3 = row above the block
853 sub x3, x0, #FDEC_STRIDE
858 ld1 {v2.8b}, [x2], x1
865 uaddl v4.8h, v2.8b, v3.8b
// widened differences times per-lane weights
867 usubl v2.8h, v2.8b, v0.8b
868 usubl v3.8h, v3.8b, v1.8b
869 mul v2.8h, v2.8h, v7.8h
870 mul v3.8h, v3.8h, v7.8h
873 addp v2.4s, v2.4s, v3.4s
874 addp v2.4s, v2.4s, v2.4s
876 add v2.2s, v2.2s, v3.2s
877 rshrn v5.4h, v2.4s, #6 // b, c, x, x
878 addp v2.4h, v5.4h, v5.4h
880 sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
881 ext v4.16b, v4.16b, v4.16b, #14
882 add v4.4h, v4.4h, v7.4h
883 shl v2.4h, v4.4h, #4 // a
884 sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
// rotate the weight vector by one halfword (result is used as the x multipliers)
885 ext v7.16b, v7.16b, v7.16b, #14
888 mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
889 dup v1.8h, v2.h[0] // pix
890 dup v2.8h, v5.h[1] // c
892 add v1.8h, v1.8h, v0.8h // pix + x*b
893 add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
// per row: narrow both halves with >> 5 and saturation, step each by c
897 sqshrun v0.8b, v1.8h, #5
898 add v1.8h, v1.8h, v2.8h
899 sqshrun2 v0.16b, v3.8h, #5
900 add v3.8h, v3.8h, v2.8h
901 st1 {v0.16b}, [x0], x1