1 /*****************************************************************************
2 * predict.S: aarch64 intra prediction
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Mans Rullgard <mans@mansr.com>
8 * Janne Grunau <janne-x264@jannau.net>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// p8weight: table of eight 16-bit values, 1..4 repeated twice.
// NOTE(review): the code that loads this table is not visible in this listing;
// presumably the tap weights of an 8-wide plane predictor - confirm.
30 const p8weight, align=4
31 .short 1, 2, 3, 4, 1, 2, 3, 4
// p16weight: table of eight 16-bit values 1..8.
// NOTE(review): the code that loads this table is not visible in this listing;
// presumably the tap weights of the 16-wide plane predictors - confirm.
33 const p16weight, align=4
34 .short 1, 2, 3, 4, 5, 6, 7, 8
// ldcol.8 vd, xn, xm, n=8, hi=0
// Gathers single bytes into lanes of \vd: each ld1 reads one byte at [\xn]
// and then post-increments \xn by \xm, so \xn is advanced as a side effect.
//   n == 8 or hi == 0 : byte lanes 0-3 are loaded
//   n == 8 or hi == 1 : byte lanes 4-7 are loaded
// NOTE(review): the closing .endif/.endm lines are not visible in this listing.
37 .macro ldcol.8 vd, xn, xm, n=8, hi=0
38 .if \n == 8 || \hi == 0
39 ld1 {\vd\().b}[0], [\xn], \xm
40 ld1 {\vd\().b}[1], [\xn], \xm
41 ld1 {\vd\().b}[2], [\xn], \xm
42 ld1 {\vd\().b}[3], [\xn], \xm
44 .if \n == 8 || \hi == 1
45 ld1 {\vd\().b}[4], [\xn], \xm
46 ld1 {\vd\().b}[5], [\xn], \xm
47 ld1 {\vd\().b}[6], [\xn], \xm
48 ld1 {\vd\().b}[7], [\xn], \xm
// ldcol.16 vd, xn, xm
// The visible part loads byte lanes 8-15 of \vd, one byte per ld1, reading
// at [\xn] and post-incrementing \xn by \xm after every load.
// NOTE(review): whatever fills lanes 0-7, and the closing .endm, are not
// visible in this listing.
52 .macro ldcol.16 vd, xn, xm
54 ld1 {\vd\().b}[ 8], [\xn], \xm
55 ld1 {\vd\().b}[ 9], [\xn], \xm
56 ld1 {\vd\().b}[10], [\xn], \xm
57 ld1 {\vd\().b}[11], [\xn], \xm
58 ld1 {\vd\().b}[12], [\xn], \xm
59 ld1 {\vd\().b}[13], [\xn], \xm
60 ld1 {\vd\().b}[14], [\xn], \xm
61 ld1 {\vd\().b}[15], [\xn], \xm
// x264_predict_4x4_h_aarch64
// For each of four rows, reads the byte at x0 + row*FDEC_STRIDE - 1 and
// writes it, replicated into all four bytes of a word, at
// x0 + row*FDEC_STRIDE.
// In:   x0 = base address of the 4x4 block
// Uses: w1-w4 as scratch (general-purpose registers only, no NEON)
// NOTE(review): no ret/endfunc is visible in this listing.
65 function x264_predict_4x4_h_aarch64, export=1
66 ldrb w1, [x0, #0*FDEC_STRIDE-1]
67 ldrb w2, [x0, #1*FDEC_STRIDE-1]
68 ldrb w3, [x0, #2*FDEC_STRIDE-1]
69 ldrb w4, [x0, #3*FDEC_STRIDE-1]
// ldrb zero-extends, so value + (value << 8) duplicates the byte into the
// low halfword without any carry between the two copies.
70 add w1, w1, w1, lsl #8
71 add w2, w2, w2, lsl #8
72 add w3, w3, w3, lsl #8
73 add w4, w4, w4, lsl #8
// Duplicate the halfword into the upper half of the word, then store the row.
74 add w1, w1, w1, lsl #16
75 str w1, [x0, #0*FDEC_STRIDE]
76 add w2, w2, w2, lsl #16
77 str w2, [x0, #1*FDEC_STRIDE]
78 add w3, w3, w3, lsl #16
79 str w3, [x0, #2*FDEC_STRIDE]
80 add w4, w4, w4, lsl #16
81 str w4, [x0, #3*FDEC_STRIDE]
// x264_predict_4x4_v_aarch64
// Copies the 32-bit word located one FDEC_STRIDE before x0 into rows 0..3
// of the block (offsets 0, 1, 2 and 3 times FDEC_STRIDE from x0).
// In:   x0 = base address of the 4x4 block
// Uses: w1 as scratch
// NOTE(review): no ret/endfunc is visible in this listing.
85 function x264_predict_4x4_v_aarch64, export=1
86 ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
87 str w1, [x0, #0 + 0 * FDEC_STRIDE]
88 str w1, [x0, #0 + 1 * FDEC_STRIDE]
89 str w1, [x0, #0 + 2 * FDEC_STRIDE]
90 str w1, [x0, #0 + 3 * FDEC_STRIDE]
// x264_predict_4x4_dc_neon
// Visible flow: four bytes are broadcast-loaded through x2 (stride x7),
// summed with widening adds, combined with a pairwise sum held in v0,
// rounded-shifted right by 3 and stored to consecutive rows at x0.
// NOTE(review): this listing has gaps - the setup of x2/x7, the load that
// fills v0, any replication of the result across the row, the final row
// store and ret/endfunc are not visible here.
94 function x264_predict_4x4_dc_neon, export=1
95 sub x1, x0, #FDEC_STRIDE // x1 = address one stride before the block
99 ld1r {v1.8b}, [x2], x7
100 ld1r {v2.8b}, [x2], x7
101 ld1r {v3.8b}, [x2], x7
102 ld1r {v4.8b}, [x2], x7
// Widen to 16 bit while adding so the sums cannot overflow a byte.
104 uaddl v1.8h, v1.8b, v2.8b
105 uaddl v2.8h, v3.8b, v4.8b
106 addp v0.4h, v0.4h, v0.4h
107 add v1.4h, v1.4h, v2.4h
109 add v0.4h, v0.4h, v1.4h
110 rshrn v0.8b, v0.8h, #3 // (sum + 4) >> 3, narrowed to bytes
// Each store writes the low 32 bits of v0 and advances x0 by FDEC_STRIDE.
111 str s0, [x0], #FDEC_STRIDE
112 str s0, [x0], #FDEC_STRIDE
113 str s0, [x0], #FDEC_STRIDE
// x264_predict_4x4_dc_top_neon
// Visible flow: pairwise-add the halfwords of v0, rounded shift right by 2,
// then store the low 32 bits of v0 to consecutive rows at x0.
// NOTE(review): this listing has gaps - the load that fills v0 (presumably
// through x1), any replication of the result, the final row store and
// ret/endfunc are not visible here.
118 function x264_predict_4x4_dc_top_neon, export=1
119 sub x1, x0, #FDEC_STRIDE // x1 = address one stride before the block
123 addp v0.4h, v0.4h, v0.4h
125 rshrn v0.8b, v0.8h, #2 // (sum + 2) >> 2, narrowed to bytes
126 str s0, [x0], #FDEC_STRIDE
127 str s0, [x0], #FDEC_STRIDE
128 str s0, [x0], #FDEC_STRIDE
// x264_predict_4x4_ddr_neon
// Builds a vector from the eight bytes starting at x0 - FDEC_STRIDE - 1,
// shifts the four bytes at column -1 of rows 0..3 in at lane 0 one at a
// time, applies the 3-tap filter (a + 2*b + c + 2) >> 2 and stores shifted
// views of the result as rows.
// In: x0 = base address of the 4x4 block
// NOTE(review): this listing has gaps - the setup of x7 (used as the
// post-increment), the final row store and ret/endfunc are not visible.
133 function x264_predict_4x4_ddr_neon, export=1
134 sub x1, x0, #FDEC_STRIDE+1
136 ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
137 ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
138 ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
// ext ..., #7 yields {vn[7], vm[0..6]}: it inserts one broadcast byte at
// lane 0 and shifts the previous contents up by one lane.
139 ext v0.8b, v1.8b, v0.8b, #7
140 ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
141 ext v0.8b, v2.8b, v0.8b, #7 // a
142 ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
143 ext v1.8b, v3.8b, v0.8b, #7 // b
144 ext v2.8b, v4.8b, v1.8b, #7 // c
// (a + b) + (b + c) in 16 bit, then rounded shift: (a + 2b + c + 2) >> 2.
145 uaddl v0.8h, v0.8b, v1.8b
146 uaddl v1.8h, v1.8b, v2.8b
147 add v0.8h, v0.8h, v1.8h
148 rshrn v0.8b, v0.8h, #2
// Byte-rotated copies of the filtered vector; the low 4 bytes of each are
// stored as one row.
150 ext v3.8b, v0.8b, v0.8b, #3
151 ext v2.8b, v0.8b, v0.8b, #2
152 ext v1.8b, v0.8b, v0.8b, #1
154 str s3, [x0], #FDEC_STRIDE
155 str s2, [x0], #FDEC_STRIDE
156 str s1, [x0], #FDEC_STRIDE
// x264_predict_4x4_ddl_neon
// Loads eight bytes from one FDEC_STRIDE before x0, filters them with
// ((left + right) >> 1 + centre + 1) >> 1 using neighbours taken one and two
// lanes further on, and stores byte-shifted views of the result as rows.
// In: x0 = base address of the 4x4 block (modified)
// NOTE(review): this listing has gaps - the setup of x7 and of v3 (used as
// the padding source in the second ext), the final row store and ret/endfunc
// are not visible here.
161 function x264_predict_4x4_ddl_neon, export=1
162 sub x0, x0, #FDEC_STRIDE
164 ld1 {v0.8b}, [x0], x7
166 ext v1.8b, v0.8b, v0.8b, #1 // v0 advanced by one byte
167 ext v2.8b, v0.8b, v3.8b, #2 // v0 advanced by two bytes, padded from v3
168 uhadd v0.8b, v0.8b, v2.8b // truncating average of the outer taps
169 urhadd v0.8b, v0.8b, v1.8b // rounding average with the centre tap
170 str s0, [x0], #FDEC_STRIDE
171 ext v1.8b, v0.8b, v0.8b, #1
172 ext v2.8b, v0.8b, v0.8b, #2
173 str s1, [x0], #FDEC_STRIDE
174 ext v3.8b, v0.8b, v0.8b, #3
175 str s2, [x0], #FDEC_STRIDE
// x264_predict_8x8_dc_neon
// Visible flow: load 16 bytes from [x1] (x1 advances by 16), rotate them by
// 7 bytes, add two halfword vectors, rounded shift right by 4 and store
// 8 bytes at [x0] with post-increment x7.
// NOTE(review): this listing has gaps - the widening/summing steps that
// produce v0.8h and v1.8h, the setup of x7, the replication of the result,
// the remaining row stores and ret/endfunc are not visible here.
180 function x264_predict_8x8_dc_neon, export=1
182 ld1 {v0.16b}, [x1], #16
184 ext v0.16b, v0.16b, v0.16b, #7
187 add v0.8h, v0.8h, v1.8h
189 rshrn v0.8b, v0.8h, #4 // (sum + 8) >> 4, narrowed to bytes
191 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_h_neon
// Visible flow: stores the low 8 bytes of v0..v7 to eight successive
// addresses, advancing x0 by x7 after each store.
// NOTE(review): this listing has gaps - the instructions that fill v0-v7,
// the setup of x7 and ret/endfunc are not visible here.
196 function x264_predict_8x8_h_neon, export=1
201 st1 {v0.8b}, [x0], x7
203 st1 {v1.8b}, [x0], x7
205 st1 {v2.8b}, [x0], x7
207 st1 {v3.8b}, [x0], x7
209 st1 {v4.8b}, [x0], x7
211 st1 {v5.8b}, [x0], x7
213 st1 {v6.8b}, [x0], x7
214 st1 {v7.8b}, [x0], x7
// x264_predict_8x8_v_neon
// Visible flow: stores the low 8 bytes of v0 at [x0], advancing x0 by x7.
// NOTE(review): this listing has gaps - the load of v0, the setup of x7,
// the remaining row stores and ret/endfunc are not visible here.
218 function x264_predict_8x8_v_neon, export=1
223 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddl_neon
// Filters the 16 bytes in v0 with the 3-tap kernel
// (left + 2*centre + right + 2) >> 2 and stores eight 8-byte rows, each
// starting one byte further into the filtered vector (offsets 1..8).
// NOTE(review): this listing has gaps - the loads/setup of v0, v2, v3 and
// x7, and ret/endfunc are not visible here.
228 function x264_predict_8x8_ddl_neon, export=1
234 ext v4.16b, v3.16b, v0.16b, #15 // left neighbours: {v3[15], v0[0..14]}
235 ext v2.16b, v0.16b, v2.16b, #1 // right neighbours: {v0[1..15], v2[0]}
// uhadd followed by urhadd gives exactly (l + 2c + r + 2) >> 2 without
// leaving 8-bit lanes.
236 uhadd v4.16b, v4.16b, v2.16b
237 urhadd v0.16b, v0.16b, v4.16b
// Interleave the byte rotations with the stores of the previous row.
238 ext v1.16b, v0.16b, v0.16b, #1
239 ext v2.16b, v0.16b, v0.16b, #2
240 st1 {v1.8b}, [x0], x7
241 ext v3.16b, v0.16b, v0.16b, #3
242 st1 {v2.8b}, [x0], x7
243 ext v4.16b, v0.16b, v0.16b, #4
244 st1 {v3.8b}, [x0], x7
245 ext v5.16b, v0.16b, v0.16b, #5
246 st1 {v4.8b}, [x0], x7
247 ext v6.16b, v0.16b, v0.16b, #6
248 st1 {v5.8b}, [x0], x7
249 ext v7.16b, v0.16b, v0.16b, #7
250 st1 {v6.8b}, [x0], x7
251 ext v0.16b, v0.16b, v0.16b, #8
252 st1 {v7.8b}, [x0], x7
253 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_ddr_neon
// Loads 32 bytes from [x1], takes three 16-byte windows at byte offsets
// 7, 8 and 9, filters them with (left + 2*centre + right + 2) >> 2 and
// writes eight 8-byte rows from the bottom row upwards.
// In: x0 = base address of the 8x8 block (modified)
//     x1 = address of the 32-byte source buffer
// Uses: x7 (set to -FDEC_STRIDE), v0-v7
// NOTE(review): no ret/endfunc is visible in this listing.
257 function x264_predict_8x8_ddr_neon, export=1
258 ld1 {v0.16b,v1.16b}, [x1]
259 ext v2.16b, v0.16b, v1.16b, #7 // window at byte offset 7
260 ext v4.16b, v0.16b, v1.16b, #9 // window at byte offset 9
261 ext v3.16b, v0.16b, v1.16b, #8 // window at byte offset 8 (centre tap)
263 uhadd v2.16b, v2.16b, v4.16b
264 urhadd v7.16b, v3.16b, v2.16b
// Start at row 7 and walk backwards with a negative stride.
266 add x0, x0, #7*FDEC_STRIDE
267 mov x7, #-1*FDEC_STRIDE
269 ext v6.16b, v7.16b, v7.16b, #1
270 st1 {v7.8b}, [x0], x7
271 ext v5.16b, v7.16b, v7.16b, #2
272 st1 {v6.8b}, [x0], x7
273 ext v4.16b, v7.16b, v7.16b, #3
274 st1 {v5.8b}, [x0], x7
275 ext v3.16b, v7.16b, v7.16b, #4
276 st1 {v4.8b}, [x0], x7
277 ext v2.16b, v7.16b, v7.16b, #5
278 st1 {v3.8b}, [x0], x7
279 ext v1.16b, v7.16b, v7.16b, #6
280 st1 {v2.8b}, [x0], x7
281 ext v0.16b, v7.16b, v7.16b, #7
282 st1 {v1.8b}, [x0], x7
283 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_vl_neon
// Computes two filtered vectors from v0: v3 = rounding average of each byte
// with its right neighbour, and v0 = 3-tap (l + 2c + r + 2) >> 2. Rows are
// then stored alternately from v3 and v0, the byte offset into each growing
// by one every two rows.
// NOTE(review): this listing has gaps - the loads/setup of v0, v1, v2 and
// x7, and ret/endfunc are not visible here.
287 function x264_predict_8x8_vl_neon, export=1
292 ext v1.16b, v1.16b, v0.16b, #15 // left neighbours: {v1[15], v0[0..14]}
293 ext v2.16b, v0.16b, v2.16b, #1 // right neighbours: {v0[1..15], v2[0]}
295 uhadd v1.16b, v1.16b, v2.16b
296 urhadd v3.16b, v0.16b, v2.16b // 2-tap: (c + r + 1) >> 1
298 urhadd v0.16b, v0.16b, v1.16b // 3-tap: (l + 2c + r + 2) >> 2
300 ext v4.16b, v0.16b, v0.16b, #1
301 st1 {v3.8b}, [x0], x7
302 ext v5.16b, v3.16b, v3.16b, #1
303 st1 {v4.8b}, [x0], x7
304 ext v6.16b, v0.16b, v0.16b, #2
305 st1 {v5.8b}, [x0], x7
306 ext v7.16b, v3.16b, v3.16b, #2
307 st1 {v6.8b}, [x0], x7
308 ext v4.16b, v0.16b, v0.16b, #3
309 st1 {v7.8b}, [x0], x7
310 ext v5.16b, v3.16b, v3.16b, #3
311 st1 {v4.8b}, [x0], x7
312 ext v6.16b, v0.16b, v0.16b, #4
313 st1 {v5.8b}, [x0], x7
314 st1 {v6.8b}, [x0], x7
// x264_predict_8x8_vr_neon
// Visible flow: from v2 derive a 2-tap rounding average (kept in v2) and a
// 3-tap (l + 2c + r + 2) >> 2 result (kept in v0); split the low half of
// the 3-tap result into even and odd bytes; then store eight rows, each
// pair formed by shifting more of the even/odd bytes in front of the
// high halves.
// NOTE(review): this listing has gaps - the load of v2, the setup of x7
// and ret/endfunc are not visible here.
318 function x264_predict_8x8_vr_neon, export=1
323 ext v1.16b, v2.16b, v2.16b, #14 // v2 rotated by 14 bytes
324 ext v0.16b, v2.16b, v2.16b, #15 // v2 rotated by 15 bytes
326 uhadd v3.16b, v2.16b, v1.16b
327 urhadd v2.16b, v2.16b, v0.16b // 2-tap rounding average
328 urhadd v0.16b, v0.16b, v3.16b // 3-tap filter
330 ext v1.16b, v2.16b, v2.16b, #8 // bring the high half of v2 into the low half
331 uzp1 v2.8b, v0.8b, v0.8b // even bytes of the low half of v0
332 uzp2 v3.8b, v0.8b, v0.8b // odd bytes of the low half of v0
333 ext v0.16b, v0.16b, v0.16b, #8 // bring the high half of v0 into the low half
335 st1 {v1.8b}, [x0], x7
336 st1 {v0.8b}, [x0], x7
337 ext v4.8b, v3.8b, v1.8b, #7
338 ext v5.8b, v2.8b, v0.8b, #7
339 st1 {v4.8b}, [x0], x7
340 st1 {v5.8b}, [x0], x7
341 ext v6.8b, v3.8b, v1.8b, #6
342 ext v7.8b, v2.8b, v0.8b, #6
343 st1 {v6.8b}, [x0], x7
344 st1 {v7.8b}, [x0], x7
345 ext v1.8b, v3.8b, v1.8b, #5
346 ext v0.8b, v2.8b, v0.8b, #5
347 st1 {v1.8b}, [x0], x7
348 st1 {v0.8b}, [x0], x7
// x264_predict_8x8_hd_neon
// Visible flow: from v1 compute a 2-tap rounding average (v4) and a 3-tap
// filtered vector (v0), interleave their low halves byte-wise into v16/v17,
// and store eight rows built from 2-byte-stepped windows over
// v16:v17:v7 (v7 = high half of the 3-tap result).
// NOTE(review): this listing has gaps - the load of v1, the setup of x7
// and ret/endfunc are not visible here.
352 function x264_predict_8x8_hd_neon, export=1
357 ext v3.16b, v1.16b, v1.16b, #1
358 ext v2.16b, v1.16b, v1.16b, #2
360 urhadd v4.16b, v1.16b, v3.16b // 2-tap rounding average
362 uhadd v1.16b, v1.16b, v2.16b
363 urhadd v0.16b, v1.16b, v3.16b // 3-tap filter
365 zip1 v16.8b, v4.8b, v0.8b // interleave bytes 0-3 of v4 and v0
366 zip2 v17.8b, v4.8b, v0.8b // interleave bytes 4-7 of v4 and v0
367 ext v7.16b, v0.16b, v0.16b, #8
369 ext v0.8b, v17.8b, v7.8b, #6
370 ext v1.8b, v17.8b, v7.8b, #4
371 st1 {v0.8b}, [x0], x7
372 ext v2.8b, v17.8b, v7.8b, #2
373 st1 {v1.8b}, [x0], x7
374 st1 {v2.8b}, [x0], x7
375 ext v3.8b, v16.8b, v17.8b, #6
376 st1 {v17.8b}, [x0], x7
377 ext v4.8b, v16.8b, v17.8b, #4
378 st1 {v3.8b}, [x0], x7
379 ext v5.8b, v16.8b, v17.8b, #2
380 st1 {v4.8b}, [x0], x7
381 st1 {v5.8b}, [x0], x7
382 st1 {v16.8b}, [x0], x7
// x264_predict_8x8_hu_neon
// Visible flow: from v7 (with v6 supplying the bytes shifted in) compute a
// 2-tap rounding average (v0) and a 3-tap filtered vector (v1), interleave
// them byte-wise into v16/v17, and store rows taken as 2-byte-stepped
// windows over v16:v17:v18.
// NOTE(review): this listing has gaps - the setup of v6, v7, v18 and x7,
// the final row store and ret/endfunc are not visible here.
387 function x264_predict_8x8_hu_neon, export=1
394 ext v4.8b, v7.8b, v6.8b, #2 // v7 advanced by two bytes
395 ext v2.8b, v7.8b, v6.8b, #1 // v7 advanced by one byte
397 uhadd v5.8b, v7.8b, v4.8b
398 urhadd v0.8b, v2.8b, v7.8b // 2-tap rounding average
399 urhadd v1.8b, v5.8b, v2.8b // 3-tap filter
401 zip1 v16.8b, v0.8b, v1.8b
402 zip2 v17.8b, v0.8b, v1.8b
406 ext v0.8b, v16.8b, v17.8b, #2
407 ext v1.8b, v16.8b, v17.8b, #4
408 ext v2.8b, v16.8b, v17.8b, #6
409 st1 {v16.8b}, [x0], x7
410 st1 {v0.8b}, [x0], x7
411 st1 {v1.8b}, [x0], x7
412 st1 {v2.8b}, [x0], x7
414 ext v4.8b, v17.8b, v18.8b, #2
415 ext v5.8b, v17.8b, v18.8b, #4
416 ext v6.8b, v17.8b, v18.8b, #6
417 st1 {v17.8b}, [x0], x7
418 st1 {v4.8b}, [x0], x7
419 st1 {v5.8b}, [x0], x7
// x264_predict_8x8c_dc_top_neon
// Visible flow: x2 points one FDEC_STRIDE before x0; halfwords of v0 are
// pairwise-added, rounded-shifted right by 2 and narrowed, and the
// `transpose` macro (defined outside this listing) is applied to v0/v1
// with v2/v3 as its other operands.
// NOTE(review): this listing has gaps - the load and widening of the source
// bytes, the replication of the results, the row stores and ret/endfunc
// are not visible here.
425 function x264_predict_8x8c_dc_top_neon, export=1
426 sub x2, x0, #FDEC_STRIDE
430 addp v0.4h, v0.4h, v0.4h
431 rshrn v0.8b, v0.8h, #2 // (sum + 2) >> 2, narrowed to bytes
434 transpose v0.2s, v1.2s, v2.2s, v3.2s
// x264_predict_8x8c_dc_left_neon
// Visible flow: loads the byte at column -1 of rows 0..7 into w2..w9, then
// applies a rounded shift right by 2 to the halfwords of v0 and v1.
// NOTE(review): this listing has gaps - the summing of the loaded bytes,
// their transfer into v0/v1, the row stores and ret/endfunc are not
// visible here.
438 function x264_predict_8x8c_dc_left_neon, export=1
439 ldrb w2, [x0, #0 * FDEC_STRIDE - 1]
440 ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
441 ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
442 ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
446 ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
447 ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
448 ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
449 ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
456 rshrn v0.8b, v0.8h, #2 // (sum + 2) >> 2, narrowed to bytes
457 rshrn v1.8b, v1.8h, #2
// x264_predict_8x8c_dc_neon
// Visible flow: after the `transpose` macro (defined outside this listing)
// the bytes of v0/v1 are pairwise-summed into four partial sums s0..s3.
// Sums of 4 samples are rounded-shifted by 2 and sums of 8 samples by 3;
// four DC bytes are broadcast and paired into v0 (dc0 | dc1) and
// v1 (dc2 | dc3), which are stored through x0 and x2 = x0 + 4*x1.
// NOTE(review): this listing has gaps - the loads feeding v0/v1, the setup
// of x1 (used as the store stride), the remaining row stores and
// ret/endfunc are not visible here.
461 function x264_predict_8x8c_dc_neon, export=1
462 sub x2, x0, #FDEC_STRIDE
467 transpose v0.2s, v1.2s, v2.2s, v3.2s
468 uaddlp v0.4h, v0.8b // s0, s2
469 uaddlp v1.4h, v1.8b // s1, s3
470 addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
471 addp v1.4h, v0.4h, v0.4h
472 rshrn v2.8b, v0.8h, #2
473 rshrn v3.8b, v1.8h, #3
474 dup v5.8b, v2.b[2] // dc1
475 dup v6.8b, v3.b[1] // dc2
476 dup v4.8b, v3.b[0] // dc0
477 dup v7.8b, v2.b[3] // dc3
// trn1 on .2s takes the low word of each source: v0 = {dc0 x4, dc1 x4}.
478 trn1 v0.2s, v4.2s, v5.2s
479 trn1 v1.2s, v7.2s, v6.2s
481 add x2, x0, x1, lsl #2 // second store pointer, four x1 steps after x0
483 st1 {v0.8b}, [x0], x1
484 st1 {v1.8b}, [x2], x1
// x264_predict_8x8c_h_neon
// Visible flow: broadcast-loads one byte per row through x1 into all eight
// lanes of v0/v1 and stores each as an 8-byte row at x0; both pointers
// advance by x7.
// NOTE(review): this listing has gaps - the setup of x1 and x7, any loop
// around these two rows and ret/endfunc are not visible here.
489 function x264_predict_8x8c_h_neon, export=1
493 ld1r {v0.8b}, [x1], x7
494 ld1r {v1.8b}, [x1], x7
495 st1 {v0.8b}, [x0], x7
496 st1 {v1.8b}, [x0], x7
// x264_predict_8x8c_v_neon
// Visible flow: steps x0 back by FDEC_STRIDE, loads 8 bytes from there
// (x0 then advances by x7) and stores them as a row at x0.
// NOTE(review): this listing has gaps - the setup of x7, the remaining row
// stores and ret/endfunc are not visible here.
501 function x264_predict_8x8c_v_neon, export=1
502 sub x0, x0, #FDEC_STRIDE
504 ld1 {v0.8b}, [x0], x7
506 st1 {v0.8b}, [x0], x7
// x264_predict_8x8c_p_neon
// Visible flow: gathers bytes via ldcol.8 and ld1, forms weighted
// differences (usubl + mul by v7), reduces them to the gradients b and c
// (rounded shift by 5), derives the starting value `pix` and then emits
// rows as saturate(pix >> 5), adding c to the accumulator after each row.
// The existing inline comments name the intermediate quantities.
// NOTE(review): this listing has gaps - the setup of x1, x2, v7 and parts
// of the a/b/c arithmetic, the row loop and ret/endfunc are not visible
// here, so the exact formula cannot be confirmed from this listing.
511 function x264_predict_8x8c_p_neon, export=1
512 sub x3, x0, #FDEC_STRIDE
517 ld1 {v2.s}[0], [x2], x1
518 ldcol.8 v0, x3, x1, 4, hi=1
520 ldcol.8 v3, x3, x1, 4
523 uaddl v4.8h, v2.8b, v3.8b
525 trn1 v2.2s, v2.2s, v3.2s
527 usubl v2.8h, v2.8b, v0.8b
528 mul v2.8h, v2.8h, v7.8h
531 addp v2.4s, v2.4s, v2.4s
533 add v2.2s, v2.2s, v3.2s
534 rshrn v5.4h, v2.4s, #5 // b, c, x, x
535 addp v2.4h, v5.4h, v5.4h
537 sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
539 add v4.4h, v4.4h, v0.4h
540 shl v2.4h, v4.4h, #4 // a
541 sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
542 ext v0.16b, v0.16b, v0.16b, #14
543 sub v6.4h, v5.4h, v3.4h
545 mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
546 dup v1.8h, v2.h[0] // pix
547 dup v2.8h, v5.h[1] // c
548 add v1.8h, v1.8h, v0.8h // pix + x*b
// sqshrun: arithmetic shift right by 5 with saturation to unsigned bytes.
552 sqshrun v0.8b, v1.8h, #5
553 add v1.8h, v1.8h, v2.8h
554 st1 {v0.8b}, [x0], x1
// loadsum4 wd, t1, t2, t3, x, idx
// Visible part: loads the byte at column -1 of rows idx .. idx+3 (relative
// to base \x, rows FDEC_STRIDE apart) into \wd, \t1, \t2 and \t3.
// NOTE(review): the macro name suggests the four bytes are then summed into
// \wd, but those instructions and the closing .endm are not visible in this
// listing - confirm.
560 .macro loadsum4 wd, t1, t2, t3, x, idx
561 ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
562 ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
563 ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
564 ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
// x264_predict_8x16c_h_neon
// Processes rows through two interleaved pointer pairs, each stepping by
// 2*FDEC_STRIDE: x2 -> x0 for one set of rows and x3 -> x1 for the rows one
// FDEC_STRIDE further on. Each source byte is broadcast to eight lanes and
// stored as an 8-byte row.
// In: x0 = base address of the block (modified)
// NOTE(review): this listing has gaps - the setup of x2 (presumably x0 - 1,
// by symmetry with x3 - confirm), any loop around these four rows and
// ret/endfunc are not visible here.
570 function x264_predict_8x16c_h_neon, export=1
572 add x3, x0, #FDEC_STRIDE - 1 // source: column -1 of row 1
573 mov x7, #2 * FDEC_STRIDE // both streams advance two rows at a time
574 add x1, x0, #FDEC_STRIDE // destination: row 1
576 ld1r {v0.8b}, [x2], x7
577 ld1r {v1.8b}, [x3], x7
578 ld1r {v2.8b}, [x2], x7
579 ld1r {v3.8b}, [x3], x7
580 st1 {v0.8b}, [x0], x7
581 st1 {v1.8b}, [x1], x7
582 st1 {v2.8b}, [x0], x7
583 st1 {v3.8b}, [x1], x7
// x264_predict_8x16c_v_neon
// Loads the 8 bytes located one FDEC_STRIDE before x0 and stores them to
// rows through two pointers that each step by 2*FDEC_STRIDE.
// In: x0 = base address of the block (modified)
// Uses: x1, x2, v0
// NOTE(review): this listing has gaps - any repetition of the store pair
// and ret/endfunc are not visible here.
588 function x264_predict_8x16c_v_neon, export=1
589 sub x1, x0, #FDEC_STRIDE
590 mov x2, #2 * FDEC_STRIDE
// The post-increment leaves x1 = x0 + FDEC_STRIDE, i.e. pointing at row 1.
591 ld1 {v0.8b}, [x1], x2
593 st1 {v0.8b}, [x0], x2
594 st1 {v0.8b}, [x1], x2
// x264_predict_8x16c_p_neon
// Visible flow: forms weighted differences H and V (usubl + mul by v17 +
// pairwise adds), scales them to b = (17*H + 16) >> 5 and
// c = (5*V + 32) >> 6, derives the starting value i00 = a - 3b - 7c, and
// emits rows as rounded, saturated (acc >> 5), adding c to the accumulator
// after each row. The existing inline comments name the intermediates.
// NOTE(review): this listing has gaps - the loads of v0, v1, v3, v17, the
// setup of x1/x2, parts of the reductions, the row loop and ret/endfunc
// are not visible here, so the exact formula cannot be fully confirmed.
599 function x264_predict_8x16c_p_neon, export=1
602 sub x3, x0, #FDEC_STRIDE
608 ld1 {v2.8b}, [x2], x1
612 ext v4.8b, v2.8b, v2.8b, #3
613 ext v5.8b, v3.8b, v3.8b, #7
617 uaddl v4.8h, v5.8b, v4.8b // a * 1/16
619 usubl v2.8h, v2.8b, v0.8b
620 mul v2.8h, v2.8h, v17.8h
622 addp v2.4s, v2.4s, v2.4s // H
624 usubl v3.8h, v3.8b, v1.8b
625 mul v3.8h, v3.8h, v17.8h
627 addp v3.4s, v3.4s, v3.4s
628 addp v3.4s, v3.4s, v3.4s // V
630 ext v17.16b, v17.16b, v17.16b, #14
632 shl v4.4h, v4.4h, #4 // a
633 shl v6.2s, v2.2s, #4 // 16 * H
634 shl v7.2s, v3.2s, #2 // 4 * V
635 add v2.2s, v2.2s, v6.2s // 17 * H
636 add v3.2s, v3.2s, v7.2s // 5 * V
637 rshrn v2.4h, v2.4s, #5 // b
638 rshrn v3.4h, v3.4s, #6 // c
642 sub v4.4h, v4.4h, v2.4h // a - b
643 shl v6.4h, v2.4h, #1 // 2 * b
644 add v4.4h, v4.4h, v3.4h // a - b + c
645 shl v7.4h, v3.4h, #3 // 8 * c
646 sub v4.4h, v4.4h, v6.4h // a - 3b + c
647 sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
649 mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
650 dup v1.8h, v4.h[0] // i00
651 dup v2.8h, v3.h[0] // c
652 add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
// sqrshrun: rounding shift right by 5 with saturation to unsigned bytes.
656 sqrshrun v4.8b, v1.8h, #5
657 add v1.8h, v1.8h, v2.8h
658 sqrshrun v5.8b, v1.8h, #5
659 st1 {v4.8b}, [x0], x1
660 add v1.8h, v1.8h, v2.8h
661 st1 {v5.8b}, [x0], x1
// x264_predict_8x16c_dc_neon
// Visible flow: the loadsum4 macro is invoked for the column -1 bytes of
// rows 0-3, 4-7, 8-11 and 12-15; v20/v21 are broadcasts of the two halfword
// lanes of v6 (s0, s1); ext #8 pairs the high half of one sum vector with
// the low half of v21; the paired sums are added and rounded-shifted right
// by 3 into v0-v3.
// NOTE(review): this listing has gaps - the load of the row above, the
// creation of v22-v25, the setup of x1, the directive that defines \idx for
// the final store (presumably an .irp over register numbers - confirm) and
// ret/endfunc are not visible here.
// NOTE(review): x3 is set on the first line and then used as a scratch
// operand of loadsum4; whether its first value is consumed in between is
// not visible here.
666 function x264_predict_8x16c_dc_neon, export=1
667 sub x3, x0, #FDEC_STRIDE
670 loadsum4 w2, w3, w4, w5, x0, 0
673 loadsum4 w6, w7, w8, w9, x0, 4
674 addp v6.4h, v6.4h, v6.4h // s0, s1
676 loadsum4 w2, w3, w4, w5, x0, 8
677 dup v20.8h, v6.h[0] // s0
679 loadsum4 w6, w7, w8, w9, x0, 12
680 dup v21.8h, v6.h[1] // s1
683 ext v16.16b, v20.16b, v21.16b, #8
684 ext v17.16b, v22.16b, v21.16b, #8
685 ext v1.16b, v23.16b, v21.16b, #8
686 ext v2.16b, v24.16b, v21.16b, #8
687 ext v3.16b, v25.16b, v21.16b, #8
689 add v0.8h, v16.8h, v17.8h
690 add v1.8h, v1.8h, v23.8h
691 add v2.8h, v2.8h, v24.8h
692 add v3.8h, v3.8h, v25.8h
694 rshrn v0.8b, v0.8h, #3 // (sum + 4) >> 3, narrowed to bytes
695 rshrn v1.8b, v1.8h, #3
696 rshrn v2.8b, v2.8h, #3
697 rshrn v3.8b, v3.8h, #3
700 st1 {v\idx\().8b}, [x0], x1
// x264_predict_8x16c_dc_left_neon
// Visible flow: loads the byte at column -1 of rows 0..15 in four groups
// (into w2-w9, w10-w13, then w2-w5 again); each group's sum, held in
// v0..v3, is rounded-shifted right by 2; v0 is stored to four rows and
// further rows are stored through a \idx-parameterised st1.
// NOTE(review): this listing has gaps - the summing of the loaded bytes and
// their transfer into v0-v3, the setup of x1, the directive that defines
// \idx (presumably an .irp - confirm) and ret/endfunc are not visible here.
706 function x264_predict_8x16c_dc_left_neon, export=1
708 ldrb w2, [x0, # 0 * FDEC_STRIDE - 1]
709 ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
710 ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
711 ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
714 ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
716 ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
718 ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
719 ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
722 rshrn v0.8b, v0.8h, #2 // (sum + 2) >> 2, narrowed to bytes
725 ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
726 ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
728 ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
729 ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
732 rshrn v1.8b, v1.8h, #2
735 ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
736 ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
738 ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
739 ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
742 rshrn v2.8b, v2.8h, #2
// All column -1 loads above are issued before the first store advances x0.
744 st1 {v0.8b}, [x0], x1
745 st1 {v0.8b}, [x0], x1
747 st1 {v0.8b}, [x0], x1
749 st1 {v0.8b}, [x0], x1
750 rshrn v3.8b, v3.8h, #2
754 st1 {v\idx\().8b}, [x0], x1
// x264_predict_8x16c_dc_top_neon
// Visible flow: x2 points one FDEC_STRIDE before x0; halfwords of v0 are
// pairwise-added and rounded-shifted right by 2 into v4; an ext #4 joins
// the high word of v0 with the low word of v1; the result is stored as an
// 8-byte row at x0 with post-increment x1.
// NOTE(review): this listing has gaps - the load and widening of the source
// bytes, the broadcasts between the shift and the ext, the setup of x1, the
// remaining row stores and ret/endfunc are not visible here.
760 function x264_predict_8x16c_dc_top_neon, export=1
761 sub x2, x0, #FDEC_STRIDE
765 addp v0.4h, v0.4h, v0.4h
766 rshrn v4.8b, v0.8h, #2 // (sum + 2) >> 2, narrowed to bytes
769 ext v0.8b, v0.8b, v1.8b, #4
771 st1 {v0.8b}, [x0], x1
// x264_predict_16x16_dc_top_neon
// Visible flow: x2 points one FDEC_STRIDE before x0; the halfwords of v0
// are rounded-shifted right by 4 and narrowed to bytes.
// NOTE(review): this listing has gaps - the load and summing that produce
// v0, the replication of the result, the row stores (or branch to a shared
// tail) and ret/endfunc are not visible here.
777 function x264_predict_16x16_dc_top_neon, export=1
778 sub x2, x0, #FDEC_STRIDE
782 rshrn v0.8b, v0.8h, #4 // (sum + 8) >> 4, narrowed to bytes
// x264_predict_16x16_dc_left_neon
// Visible flow: the halfwords of v0 are rounded-shifted right by 4 and
// narrowed to bytes.
// NOTE(review): this listing has gaps - the loads and summing that produce
// v0, the replication of the result, the row stores (or branch to a shared
// tail) and ret/endfunc are not visible here.
787 function x264_predict_16x16_dc_left_neon, export=1
792 rshrn v0.8b, v0.8h, #4 // (sum + 8) >> 4, narrowed to bytes
// x264_predict_16x16_dc_neon
// Visible flow: x3 points one FDEC_STRIDE before x0; two halfword partial
// sums (v0, v1) are added, rounded-shifted right by 5 and narrowed; a
// 16-byte row is stored at x0 with post-increment x1.
// NOTE(review): this listing has gaps - the loads and reductions producing
// v0/v1, the replication of the result across 16 lanes, the setup of x1,
// the row loop and ret/endfunc are not visible here.
797 function x264_predict_16x16_dc_neon, export=1
798 sub x3, x0, #FDEC_STRIDE
805 add v0.4h, v0.4h, v1.4h
806 rshrn v0.8b, v0.8h, #5 // (sum + 16) >> 5, narrowed to bytes
810 st1 {v0.16b}, [x0], x1
// x264_predict_16x16_h_neon
// Visible flow: broadcast-loads one byte per row through x1 into all
// sixteen lanes of v0/v1 and stores each as a 16-byte row at x0; both
// pointers advance by x7.
// NOTE(review): this listing has gaps - the setup of x1 and x7, any loop
// around these two rows and ret/endfunc are not visible here.
815 function x264_predict_16x16_h_neon, export=1
819 ld1r {v0.16b}, [x1], x7
820 ld1r {v1.16b}, [x1], x7
821 st1 {v0.16b}, [x0], x7
822 st1 {v1.16b}, [x0], x7
// x264_predict_16x16_v_neon
// Visible flow: steps x0 back by FDEC_STRIDE, loads 16 bytes from there
// (x0 then advances by x7) and stores them as a row at x0.
// NOTE(review): this listing has gaps - the setup of x7, the remaining row
// stores and ret/endfunc are not visible here.
827 function x264_predict_16x16_v_neon, export=1
828 sub x0, x0, #FDEC_STRIDE
830 ld1 {v0.16b}, [x0], x7
832 st1 {v0.16b}, [x0], x7
// x264_predict_16x16_p_neon
// Visible flow: forms weighted differences (usubl + mul by v7), reduces
// them to the gradients b and c (rounded shift by 6), derives the starting
// value `pix`, keeps two accumulators (v1 for columns 0-7, v3 for columns
// 8-15 per the existing comments) and emits 16-byte rows as
// saturate(acc >> 5), adding c to both accumulators after each row.
// NOTE(review): this listing has gaps - the loads of v0, v1, v3, v7, the
// setup of x1/x2, parts of the a/b/c arithmetic, the row loop and
// ret/endfunc are not visible here (the block may also continue past the
// end of this listing), so the exact formula cannot be fully confirmed.
837 function x264_predict_16x16_p_neon, export=1
838 sub x3, x0, #FDEC_STRIDE
843 ld1 {v2.8b}, [x2], x1
850 uaddl v4.8h, v2.8b, v3.8b
852 usubl v2.8h, v2.8b, v0.8b
853 usubl v3.8h, v3.8b, v1.8b
854 mul v2.8h, v2.8h, v7.8h
855 mul v3.8h, v3.8h, v7.8h
858 addp v2.4s, v2.4s, v3.4s
859 addp v2.4s, v2.4s, v2.4s
861 add v2.2s, v2.2s, v3.2s
862 rshrn v5.4h, v2.4s, #6 // b, c, x, x
863 addp v2.4h, v5.4h, v5.4h
865 sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
866 ext v4.16b, v4.16b, v4.16b, #14
867 add v4.4h, v4.4h, v7.4h
868 shl v2.4h, v4.4h, #4 // a
869 sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
870 ext v7.16b, v7.16b, v7.16b, #14
873 mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
874 dup v1.8h, v2.h[0] // pix
875 dup v2.8h, v5.h[1] // c
877 add v1.8h, v1.8h, v0.8h // pix + x*b
878 add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
// sqshrun fills the low 8 bytes of v0 and sqshrun2 the high 8 bytes, both
// with an arithmetic shift right by 5 and saturation to unsigned bytes.
882 sqshrun v0.8b, v1.8h, #5
883 add v1.8h, v1.8h, v2.8h
884 sqshrun2 v0.16b, v3.8h, #5
885 add v3.8h, v3.8h, v2.8h
886 st1 {v0.16b}, [x0], x1