/*****************************************************************************
 * mc.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2015 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Mans Rullgard <mans@mansr.com>
 *          Stefan Groenroos <stefan.gronroos@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
// note: prefetch stuff assumes 64-byte cacheline

// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
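//
// A hedged C sketch of the intent (the exact pointer setup lives in elided
// code; the +64 and 8*stride offsets are assumptions read off the asm, with
// `parity` selecting which field's rows get prefetched):
//
//   static void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
//   {
//       pix += 64;                      // stay ahead of the current block
//       if( parity != 1 )
//           pix += 8 * stride;          // other field (assumption)
//       for( int i = 0; i < 8; i++ )    // one 64-byte cacheline per row
//           __builtin_prefetch( pix + i * stride );
//   }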
function x264_prefetch_ref_aarch64, export=1
    add         x0, x0, x2, lsl #3

    add         x3, x1, x1, lsl #1
    add         x4, x0, x1, lsl #2

    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x0, x2]
    prfm        pldl1strm, [x0, x3]

    prfm        pldl1strm, [x4, x1]
    prfm        pldl1strm, [x4, x2]
    prfm        pldl1strm, [x4, x3]
// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
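//
// Hedged C sketch: prefetch a few luma and chroma line pairs around the
// current macroblock column. The (mb_x & 3) scaling and +64 offsets are
// assumptions taken from the elided setup:
//
//   static void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
//                              uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
//   {
//       uint8_t *y  = pix_y  + 64 + (mb_x & 3) * 4 * stride_y;
//       uint8_t *uv = pix_uv + 64 + (mb_x & 3) * 2 * stride_uv;
//       __builtin_prefetch( y );
//       __builtin_prefetch( y + stride_y );
//       __builtin_prefetch( uv );
//       __builtin_prefetch( uv + stride_uv );
//   }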
.macro x264_prefetch_fenc sub
function x264_prefetch_fenc_\sub\()_aarch64, export=1

    add         x0, x0, x6, lsl #2
    add         x6, x0, x1, lsl #1

    prfm        pldl1strm, [x0, x1]
    prfm        pldl1strm, [x6, x1]

    add         x2, x2, x7, lsl #1
    prfm        pldl1strm, [x2, x3]
    add         x7, x2, x3, lsl #1
    prfm        pldl1strm, [x7, x3]

x264_prefetch_fenc 420
x264_prefetch_fenc 422
// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
//                 uint8_t *src1, intptr_t src1_stride,
//                 uint8_t *src2, intptr_t src2_stride, int weight );
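//
// Hedged C reference for the dispatch below: weight == 32 is the plain
// rounded average, anything else is a (weight, 64-weight) blend with a
// rounding shift by 6. clip_uint8() is a hypothetical clamp to [0,255],
// w/h are baked into each generated function, and the order of the two
// weights follows the elided load_weights code:
//
//   static void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
//                          uint8_t *src1, intptr_t src1_stride,
//                          uint8_t *src2, intptr_t src2_stride,
//                          int weight, int w, int h )
//   {
//       for( int y = 0; y < h; y++, dst += dst_stride,
//            src1 += src1_stride, src2 += src2_stride )
//           for( int x = 0; x < w; x++ )
//               dst[x] = weight == 32
//                      ? ( src1[x] + src2[x] + 1 ) >> 1
//                      : clip_uint8( ( src1[x]*weight
//                                    + src2[x]*(64-weight) + 32 ) >> 6 );
//   }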
function x264_pixel_avg_\w\()x\h\()_neon, export=1
    b.eq        pixel_avg_w\w\()_neon
    b.lt        pixel_avg_weight_w\w\()_add_sub_neon    // weight > 64
    b.ge        pixel_avg_weight_w\w\()_add_add_neon
    b           pixel_avg_weight_w\w\()_sub_add_neon    // weight < 0
.macro load_weights_add_add

.macro weight_add_add dst, s1, s2, h=
    umull2      \dst, \s1, v30.16b
    umlal2      \dst, \s2, v31.16b

    umull       \dst, \s1, v30.8b
    umlal       \dst, \s2, v31.8b

.macro load_weights_add_sub

.macro weight_add_sub dst, s1, s2, h=
    umull2      \dst, \s1, v30.16b
    umlsl2      \dst, \s2, v31.16b

    umull       \dst, \s1, v30.8b
    umlsl       \dst, \s2, v31.8b

.macro load_weights_sub_add

.macro weight_sub_add dst, s1, s2, h=
    umull2      \dst, \s2, v31.16b
    umlsl2      \dst, \s1, v30.16b

    umull       \dst, \s2, v31.8b
    umlsl       \dst, \s1, v30.8b
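
// The three load_weights/weight_* pairs above split the blend by the sign of
// the two weights so the multiplies can stay unsigned: add_add handles
// 0 <= weight <= 64 (umlal), add_sub handles weight > 64 where 64-weight is
// negative (umlsl), and sub_add handles weight < 0. In every case the 16-bit
// accumulator is narrowed back to pixels with sqrshrun #6, i.e. a saturating
// (sum + 32) >> 6.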
.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon

    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x4], x5
    weight_\ext v4.8h, v0.8b, v1.8b
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    sqrshrun    v0.8b, v4.8h, #6
    weight_\ext v5.8h, v2.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    sqrshrun    v1.8b, v5.8h, #6
    st1         {v1.s}[0], [x0], x1

function pixel_avg_weight_w8_\ext\()_neon

    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    weight_\ext v16.8h, v0.8b, v1.8b
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x5
    weight_\ext v17.8h, v2.8b, v3.8b
    ld1         {v4.8b}, [x2], x3
    ld1         {v5.8b}, [x4], x5
    weight_\ext v18.8h, v4.8b, v5.8b
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    weight_\ext v19.8h, v6.8b, v7.8b
    sqrshrun    v0.8b, v16.8h, #6
    sqrshrun    v1.8b, v17.8h, #6
    sqrshrun    v2.8b, v18.8h, #6
    sqrshrun    v3.8b, v19.8h, #6
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1

function pixel_avg_weight_w16_\ext\()_neon

    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    weight_\ext v16.8h, v0.8b, v1.8b
    weight_\ext v17.8h, v0.16b, v1.16b, 2
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x5
    weight_\ext v18.8h, v2.8b, v3.8b
    weight_\ext v19.8h, v2.16b, v3.16b, 2
    sqrshrun    v0.8b, v16.8h, #6
    sqrshrun    v1.8b, v18.8h, #6
    sqrshrun2   v0.16b, v17.8h, #6
    sqrshrun2   v1.16b, v19.8h, #6
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
function pixel_avg_w4_neon
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x5
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1

function pixel_avg_w8_neon
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x4], x5
    ld1         {v2.8b}, [x2], x3
    urhadd      v0.8b, v0.8b, v1.8b
    ld1         {v3.8b}, [x4], x5
    st1         {v0.8b}, [x0], x1
    ld1         {v4.8b}, [x2], x3
    urhadd      v1.8b, v2.8b, v3.8b
    ld1         {v5.8b}, [x4], x5
    st1         {v1.8b}, [x0], x1
    ld1         {v6.8b}, [x2], x3
    ld1         {v7.8b}, [x4], x5
    urhadd      v2.8b, v4.8b, v5.8b
    urhadd      v3.8b, v6.8b, v7.8b
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1

function pixel_avg_w16_neon
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x4], x5
    ld1         {v2.16b}, [x2], x3
    urhadd      v0.16b, v0.16b, v1.16b
    ld1         {v3.16b}, [x4], x5
    st1         {v0.16b}, [x0], x1
    ld1         {v4.16b}, [x2], x3
    urhadd      v1.16b, v2.16b, v3.16b
    ld1         {v5.16b}, [x4], x5
    st1         {v1.16b}, [x0], x1
    ld1         {v6.16b}, [x2], x3
    ld1         {v7.16b}, [x4], x5
    urhadd      v2.16b, v4.16b, v5.16b
    urhadd      v3.16b, v6.16b, v7.16b
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
function x264_pixel_avg2_w4_neon, export=1
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x3
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x3
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1

function x264_pixel_avg2_w8_neon, export=1
    ld1         {v0.8b}, [x2], x3
    ld1         {v2.8b}, [x4], x3
    urhadd      v0.8b, v0.8b, v2.8b
    ld1         {v1.8b}, [x2], x3
    ld1         {v3.8b}, [x4], x3
    urhadd      v1.8b, v1.8b, v3.8b
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1

function x264_pixel_avg2_w16_neon, export=1
    ld1         {v0.16b}, [x2], x3
    ld1         {v2.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    ld1         {v1.16b}, [x2], x3
    ld1         {v3.16b}, [x4], x3
    urhadd      v1.16b, v1.16b, v3.16b
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1

function x264_pixel_avg2_w20_neon, export=1
    ld1         {v0.16b,v1.16b}, [x2], x3
    ld1         {v2.16b,v3.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    urhadd      v1.8b, v1.8b, v3.8b
    ld1         {v4.16b,v5.16b}, [x2], x3
    ld1         {v6.16b,v7.16b}, [x4], x3
    urhadd      v4.16b, v4.16b, v6.16b
    urhadd      v5.8b, v5.8b, v7.8b
    st1         {v0.16b}, [x0], #16
    st1         {v1.s}[0], [x0], x1
    st1         {v4.16b}, [x0], #16
    st1         {v5.s}[0], [x0], x1
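
// pixel_avg2 is the halfpel-averaging helper: both sources share one stride
// and the result is a plain rounded average (urhadd). Hedged C sketch, with
// the width baked in per function as above and the signature assumed:
//
//   static void pixel_avg2_w8( uint8_t *dst,  intptr_t dst_stride,
//                              uint8_t *src1, intptr_t src_stride,
//                              uint8_t *src2, int height )
//   {
//       for( int y = 0; y < height; y++, dst += dst_stride,
//            src1 += src_stride, src2 += src_stride )
//           for( int x = 0; x < 8; x++ )
//               dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
//   }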
.macro weight_prologue type
    ldr         w12, [x4, #32]          // denom
    ldp         w4, w5, [x4, #32+4]     // scale, offset
// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
//                 intptr_t dst_stride, const x264_weight_t *weight, int h )
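//
// Hedged C reference for the full (denominated) path below: multiply by
// scale, rounding-shift right by denom (srshl with a negative shift count),
// add offset, then saturate to 8 bits (sqxtun). clip_uint8() and the
// x264_weight_t field names are assumptions:
//
//   static void mc_weight( uint8_t *src, intptr_t src_stride,
//                          uint8_t *dst, intptr_t dst_stride,
//                          const x264_weight_t *w, int h, int width )
//   {
//       for( int y = 0; y < h; y++, src += src_stride, dst += dst_stride )
//           for( int x = 0; x < width; x++ )
//               dst[x] = clip_uint8( ( ( src[x] * w->i_scale
//                          + (1 << (w->i_denom - 1)) ) >> w->i_denom )
//                          + w->i_offset );
//   }
//
// The *_nodenom variants further down drop the shift for i_denom == 0,
// where the rounding term above would be ill-defined.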
function x264_mc_weight_w20_neon, export=1

    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3
    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3
    umull       v22.8h, v16.8b, v0.8b
    umull       v23.8h, v17.8b, v0.8b
    zip1        v18.2s, v18.2s, v21.2s
    umull       v25.8h, v19.8b, v0.8b
    umull       v26.8h, v20.8b, v0.8b
    umull       v24.8h, v18.8b, v0.8b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    srshl       v26.8h, v26.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    add         v26.8h, v26.8h, v1.8h

    sqxtun2     v4.16b, v23.8h

    sqxtun2     v5.16b, v26.8h
    st1         {v4.16b}, [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b}, [x0], #16
    st1         {v6.s}[1], [x0], x1

function x264_mc_weight_w16_neon, export=1

    ld1         {v4.16b}, [x2], x3
    ld1         {v5.16b}, [x2], x3
    umull       v22.8h, v4.8b, v0.8b
    umull2      v23.8h, v4.16b, v0.16b
    umull       v24.8h, v5.8b, v0.8b
    umull2      v25.8h, v5.16b, v0.16b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h

    sqxtun2     v4.16b, v23.8h

    sqxtun2     v5.16b, v25.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1

function x264_mc_weight_w8_neon, export=1

    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    umull       v4.8h, v16.8b, v0.8b
    umull       v5.8h, v17.8b, v0.8b
    srshl       v4.8h, v4.8h, v2.8h
    srshl       v5.8h, v5.8h, v2.8h
    add         v4.8h, v4.8h, v1.8h
    add         v5.8h, v5.8h, v1.8h

    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1

function x264_mc_weight_w4_neon, export=1

    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    umull       v4.8h, v16.8b, v0.8b
    srshl       v4.8h, v4.8h, v2.8h
    add         v4.8h, v4.8h, v1.8h

    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1

function x264_mc_weight_w20_nodenom_neon, export=1
    weight_prologue nodenom

    ld1         {v16.8b,v17.8b,v18.8b}, [x2], x3

    ld1         {v19.8b,v20.8b,v21.8b}, [x2], x3

    zip1        v18.2s, v18.2s, v21.2s
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v28.8h, v17.8b, v0.8b
    umlal       v31.8h, v18.8b, v0.8b
    umlal       v29.8h, v19.8b, v0.8b
    umlal       v30.8h, v20.8b, v0.8b

    sqxtun2     v4.16b, v28.8h

    sqxtun2     v5.16b, v30.8h

    st1         {v4.16b}, [x0], #16
    st1         {v6.s}[0], [x0], x1
    st1         {v5.16b}, [x0], #16
    st1         {v6.s}[1], [x0], x1

function x264_mc_weight_w16_nodenom_neon, export=1
    weight_prologue nodenom

    ld1         {v6.16b}, [x2], x3

    ld1         {v7.16b}, [x2], x3

    umlal       v27.8h, v6.8b, v0.8b
    umlal2      v28.8h, v6.16b, v0.16b
    umlal       v29.8h, v7.8b, v0.8b
    umlal2      v30.8h, v7.16b, v0.16b

    sqxtun2     v4.16b, v28.8h

    sqxtun2     v5.16b, v30.8h
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1

function x264_mc_weight_w8_nodenom_neon, export=1
    weight_prologue nodenom

    ld1         {v16.8b}, [x2], x3

    ld1         {v17.8b}, [x2], x3

    umlal       v27.8h, v16.8b, v0.8b
    umlal       v29.8h, v17.8b, v0.8b

    st1         {v4.8b}, [x0], x1
    st1         {v5.8b}, [x0], x1

function x264_mc_weight_w4_nodenom_neon, export=1
    weight_prologue nodenom

    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3

    umlal       v27.8h, v16.8b, v0.8b

    st1         {v4.s}[0], [x0], x1
    st1         {v4.s}[1], [x0], x1
.macro weight_simple_prologue
    ldr         w6, [x4]                // offset

.macro weight_simple name op
function x264_mc_weight_w20_\name\()_neon, export=1
    weight_simple_prologue

    ld1         {v16.16b}, [x2], x3

    ld1         {v17.16b}, [x2], x3
    \op         v18.8b, v18.8b, v1.8b
    \op         v16.16b, v16.16b, v1.16b
    \op         v19.8b, v19.8b, v1.8b
    \op         v17.16b, v17.16b, v1.16b

    st1         {v16.16b}, [x0], x1

    st1         {v17.16b}, [x0], x1

function x264_mc_weight_w16_\name\()_neon, export=1
    weight_simple_prologue

    ld1         {v16.16b}, [x2], x3
    ld1         {v17.16b}, [x2], x3
    \op         v16.16b, v16.16b, v1.16b
    \op         v17.16b, v17.16b, v1.16b
    st1         {v16.16b}, [x0], x1
    st1         {v17.16b}, [x0], x1

function x264_mc_weight_w8_\name\()_neon, export=1
    weight_simple_prologue

    ld1         {v16.8b}, [x2], x3
    ld1         {v17.8b}, [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    \op         v17.8b, v17.8b, v1.8b
    st1         {v16.8b}, [x0], x1
    st1         {v17.8b}, [x0], x1

function x264_mc_weight_w4_\name\()_neon, export=1
    weight_simple_prologue

    ld1         {v16.s}[0], [x2], x3
    ld1         {v16.s}[1], [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    st1         {v16.s}[0], [x0], x1
    st1         {v16.s}[1], [x0], x1

weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub
// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src,
//               intptr_t src_stride, int height )
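//
// Straightforward block copy; each function below unrolls four rows per
// iteration. Hedged C equivalent (width baked in per function):
//
//   static void mc_copy_w16( uint8_t *dst, intptr_t dst_stride,
//                            uint8_t *src, intptr_t src_stride, int height )
//   {
//       for( int y = 0; y < height; y++ )
//           memcpy( dst + y * dst_stride, src + y * src_stride, 16 );
//   }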
function x264_mc_copy_w4_neon, export=1

    ld1         {v0.s}[0], [x2], x3
    ld1         {v1.s}[0], [x2], x3
    ld1         {v2.s}[0], [x2], x3
    ld1         {v3.s}[0], [x2], x3
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    st1         {v2.s}[0], [x0], x1
    st1         {v3.s}[0], [x0], x1

function x264_mc_copy_w8_neon, export=1
    ld1         {v0.8b}, [x2], x3
    ld1         {v1.8b}, [x2], x3
    ld1         {v2.8b}, [x2], x3
    ld1         {v3.8b}, [x2], x3
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x0], x1
    st1         {v2.8b}, [x0], x1
    st1         {v3.8b}, [x0], x1

function x264_mc_copy_w16_neon, export=1
    ld1         {v0.16b}, [x2], x3
    ld1         {v1.16b}, [x2], x3
    ld1         {v2.16b}, [x2], x3
    ld1         {v3.16b}, [x2], x3
    st1         {v0.16b}, [x0], x1
    st1         {v1.16b}, [x0], x1
    st1         {v2.16b}, [x0], x1
    st1         {v3.16b}, [x0], x1
// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
//                           intptr_t i_dst_stride,
//                           uint8_t *src, intptr_t i_src_stride,
//                           int dx, int dy, int i_width, int i_height );
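//
// Chroma MC is a 2-D bilinear filter over interleaved (NV12-style) UV.
// With d8x = dx & 7 and d8y = dy & 7 the taps are
//   cA = (8-d8x)*(8-d8y)   cB = d8x*(8-d8y)
//   cC = (8-d8x)*d8y       cD = d8x*d8y
// (see CHROMA_MC_START below; the taps sum to 64, hence the rounding shift
// by 6). Hedged per-pixel C sketch for the U plane, indexing the raw
// interleaved source where the horizontal neighbour sits 2 bytes away:
//
//   dst_u[x] = ( cA * src[2*x]          + cB * src[2*x+2]
//              + cC * src[2*x + stride] + cD * src[2*x+2 + stride]
//              + 32 ) >> 6;
//
// The pure-vertical (dx == 0) and pure-horizontal (dy == 0) cases collapse
// to two taps each; those are the loops labelled 3: and 5: below.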
function x264_mc_chroma_neon, export=1
    ldr         w15, [sp]               // height
    sbfx        x12, x6, #3, #29        // asr(3) and sign extend
    sbfx        x11, x5, #3, #29        // asr(3) and sign extend

    add         x3, x3, x11, lsl #1

    b.gt        mc_chroma_w8_neon
    b.eq        mc_chroma_w4_neon

.macro CHROMA_MC_START r00, r01, r10, r11
    mul         w12, w5, w6             // cD = d8x*d8y

    sub         w10, w13, w12           // cB = d8x*(8-d8y);
    sub         w11, w14, w12           // cC = (8-d8x)*d8y
    sub         w9, w9, w14             // cA = (8-d8x)*(8-d8y);

.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
// since the element size varies, there's a different index for the 2nd store

    ld2         {v28.8b,v29.8b}, [x3], x4

    ext         v6.8b, v28.8b, v6.8b, #1
    ext         v7.8b, v29.8b, v7.8b, #1

    ld2         {v30.8b,v31.8b}, [x3], x4

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1

    trn1        v0.2s, v0.2s, v1.2s
    trn1        v2.2s, v2.2s, v3.2s

    trn1        v4.2s, v28.2s, v6.2s
    trn1        v5.2s, v29.2s, v7.2s
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s
1:  // height loop, interpolate xy

    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v20.8b, v2.8b
    umull       v17.8h, v5.8b, v0.8b
    umlal       v17.8h, v21.8b, v2.8b

    ld2         {v28.8b,v29.8b}, [x3], x4
    transpose   v24.2d, v25.2d, v16.2d, v17.2d

    ext         v6.8b, v28.8b, v6.8b, #1
    ext         v7.8b, v29.8b, v7.8b, #1

    trn1        v4.2s, v28.2s, v6.2s
    trn1        v5.2s, v29.2s, v7.2s

    add         v16.8h, v24.8h, v25.8h

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v4.8b, v2.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v5.8b, v2.8b

    ld2         {v30.8b,v31.8b}, [x3], x4
    transpose   v26.2d, v27.2d, v18.2d, v19.2d

    ext         v22.8b, v30.8b, v22.8b, #1
    ext         v23.8b, v31.8b, v23.8b, #1
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s

    add         v17.8h, v26.8h, v27.8h

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x1], x2
    st1         {v17.\vsize}[0], [x0], x2
    st1         {v17.\vsize}[st2], [x1], x2

    ld1         {v4.8b}, [x3], x4
    ld1         {v6.8b}, [x3], x4
3:  // vertical interpolation loop

    umull       v16.8h, v4.8b, v0.8b
    ld1         {v4.8b}, [x3], x4
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v6.8b, v0.8b
    ld1         {v6.8b}, [x3], x4
    umlal       v17.8h, v4.8b, v1.8b

    rshrn       v20.8b, v16.8h, #6      // uvuvuvuv
    rshrn       v21.8b, v17.8h, #6      // uvuvuvuv

    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x0], x2
    st1         {v17.\vsize}[0], [x1], x2
    st1         {v17.\vsize}[st2], [x1], x2

    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4

    ext         v5.8b, v4.8b, v5.8b, #2
    ext         v7.8b, v6.8b, v7.8b, #2
5:  // horizontal interpolation loop

    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v5.8b, v1.8b
    umull       v17.8h, v6.8b, v0.8b
    umlal       v17.8h, v7.8b, v1.8b

    ld1         {v4.8b,v5.8b}, [x3], x4
    ld1         {v6.8b,v7.8b}, [x3], x4
    rshrn       v20.8b, v16.8h, #6
    rshrn       v21.8b, v17.8h, #6
    ext         v5.8b, v4.8b, v5.8b, #2
    ext         v7.8b, v6.8b, v7.8b, #2
    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    st1         {v16.\vsize}[0], [x0], x2
    st1         {v16.\vsize}[st2], [x0], x2
    st1         {v17.\vsize}[0], [x1], x2
    st1         {v17.\vsize}[st2], [x1], x2
function mc_chroma_w8_neon

    ld2         {v4.16b,v5.16b}, [x3], x4
    ld2         {v20.16b,v21.16b}, [x3], x4

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

1:  // height loop, interpolate xy

    umull       v16.8h, v4.8b, v0.8b
    umlal       v16.8h, v6.8b, v1.8b
    umlal       v16.8h, v20.8b, v2.8b
    umlal       v16.8h, v22.8b, v3.8b

    umull       v17.8h, v5.8b, v0.8b
    umlal       v17.8h, v7.8b, v1.8b
    umlal       v17.8h, v21.8b, v2.8b
    umlal       v17.8h, v23.8b, v3.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umlal       v18.8h, v4.8b, v2.8b
    umlal       v18.8h, v6.8b, v3.8b

    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b
    umlal       v19.8h, v5.8b, v2.8b
    umlal       v19.8h, v7.8b, v3.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2

    ld2         {v4.8b,v5.8b}, [x3], x4
    ld2         {v6.8b,v7.8b}, [x3], x4
3:  // vertical interpolation loop

    umull       v16.8h, v4.8b, v0.8b    // U
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v5.8b, v0.8b    // V
    umlal       v17.8h, v7.8b, v1.8b

    ld2         {v4.8b,v5.8b}, [x3], x4

    umull       v18.8h, v6.8b, v0.8b
    umlal       v18.8h, v4.8b, v1.8b
    umull       v19.8h, v7.8b, v0.8b
    umlal       v19.8h, v5.8b, v1.8b

    ld2         {v6.8b,v7.8b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2

    ld2         {v4.16b,v5.16b}, [x3], x4
    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ld2         {v20.16b,v21.16b}, [x3], x4
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1
5:  // horizontal interpolation loop

    umull       v16.8h, v4.8b, v0.8b    // U
    umlal       v16.8h, v6.8b, v1.8b
    umull       v17.8h, v5.8b, v0.8b    // V
    umlal       v17.8h, v7.8b, v1.8b

    ld2         {v4.16b,v5.16b}, [x3], x4

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b

    ld2         {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    st1         {v16.8b}, [x0], x2
    st1         {v17.8b}, [x1], x2
    st1         {v18.8b}, [x0], x2
    st1         {v19.8b}, [x1], x2
// void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
//                   intptr_t stride, int width, int height, int16_t *buf )
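//
// The halfpel filter is H.264's 6-tap kernel (1,-5,20,20,-5,1). dsth and
// dstv get the horizontally / vertically filtered plane rounded by >>5;
// dstc is filtered in both directions and rounded by >>10. Hedged 1-D C
// sketch (hpel_tap6 and clip_uint8 are hypothetical helpers):
//
//   static int hpel_tap6( const uint8_t *p )
//   {
//       return p[-2] - 5*p[-1] + 20*p[0] + 20*p[1] - 5*p[2] + p[3];
//   }
//   // dsth[x] = clip_uint8( (hpel_tap6( src + x ) + 16) >> 5 );  dstv is
//   // the same filter applied down a column.
//
// For dstc the row filter runs on the 16-bit column-filter output; to keep
// the arithmetic in 16 bits the loop below uses, with a = t[-2]+t[3],
// b = t[-1]+t[2], c = t[0]+t[1], the identity
//   ((a-b)/4 - b + c)/4 + c == (a - 5*b + 20*c)/16
// (up to shift truncation; see the inline comments), and sqrshrun #6 then
// supplies the remaining rounding down to (… + 512) >> 10.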
function x264_hpel_filter_neon, export=1

    sub         x13, x3, x9             // align src

    add         x7, x3, #16             // src pointer next 16b for horiz filter
    mov         x5, x15                 // restore width
    sub         x3, x3, x4, lsl #1      // src - 2*stride
    ld1         {v28.16b}, [x7], #16    // src[16:31]

    add         x9, x3, x5              // holds src - 2*stride + width

    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v7.16b, v18.16b, #14
    uaddl       v1.8h, v16.8b, v21.8b
    ext         v26.16b, v18.16b, v28.16b, #3
    umlsl       v1.8h, v17.8b, v30.8b
    ext         v23.16b, v7.16b, v18.16b, #15
    umlal       v1.8h, v18.8b, v31.8b
    ext         v24.16b, v18.16b, v28.16b, #1
    umlal       v1.8h, v19.8b, v31.8b
    ext         v25.16b, v18.16b, v28.16b, #2
    umlsl       v1.8h, v20.8b, v30.8b
2:  // next 16 pixel of line

    sub         x3, x9, x5              // src - 2*stride += 16

    uaddl       v4.8h, v22.8b, v26.8b
    uaddl2      v5.8h, v22.16b, v26.16b
    sqrshrun    v6.8b, v1.8h, #5
    umlsl       v4.8h, v23.8b, v30.8b
    umlsl2      v5.8h, v23.16b, v30.16b
    umlal       v4.8h, v18.8b, v31.8b
    umlal2      v5.8h, v18.16b, v31.16b
    umlal       v4.8h, v24.8b, v31.8b
    umlal2      v5.8h, v24.16b, v31.16b
    umlsl       v4.8h, v25.8b, v30.8b
    umlsl2      v5.8h, v25.16b, v30.16b

    uaddl2      v2.8h, v16.16b, v21.16b
    sqrshrun    v4.8b, v4.8h, #5

    sqrshrun2   v4.16b, v5.8h, #5

    umlsl2      v2.8h, v17.16b, v30.16b
    ld1         {v16.16b}, [x3], x4     // src-2*stride[0:15]
    umlal2      v2.8h, v18.16b, v31.16b
    ld1         {v17.16b}, [x3], x4     // src-1*stride[0:15]
    umlal2      v2.8h, v19.16b, v31.16b
    ld1         {v18.16b}, [x3], x4     // src+0*stride[0:15]
    umlsl2      v2.8h, v20.16b, v30.16b
    ld1         {v19.16b}, [x3], x4     // src+1*stride[0:15]
    st1         {v4.16b}, [x0], #16
    sqrshrun2   v6.16b, v2.8h, #5
    ld1         {v20.16b}, [x3], x4     // src+2*stride[0:15]
    ld1         {v21.16b}, [x3], x4     // src+3*stride[0:15]

    ext         v22.16b, v0.16b, v1.16b, #12
    ext         v26.16b, v1.16b, v2.16b, #6
    ext         v23.16b, v0.16b, v1.16b, #14
    st1         {v6.16b}, [x1], #16
    uaddl       v3.8h, v16.8b, v21.8b
    ext         v25.16b, v1.16b, v2.16b, #4
    umlsl       v3.8h, v17.8b, v30.8b
    ext         v24.16b, v1.16b, v2.16b, #2

    umlal       v3.8h, v18.8b, v31.8b
    add         v4.8h, v22.8h, v26.8h
    umlal       v3.8h, v19.8b, v31.8b
    add         v5.8h, v23.8h, v25.8h
    umlsl       v3.8h, v20.8b, v30.8b
    add         v6.8h, v24.8h, v1.8h

    ext         v22.16b, v1.16b, v2.16b, #12
    ext         v26.16b, v2.16b, v3.16b, #6
    ext         v23.16b, v1.16b, v2.16b, #14
    ext         v25.16b, v2.16b, v3.16b, #4
    ext         v24.16b, v2.16b, v3.16b, #2

    add         v22.8h, v22.8h, v26.8h
    add         v23.8h, v23.8h, v25.8h
    add         v24.8h, v24.8h, v2.8h

    sub         v4.8h, v4.8h, v5.8h     // a-b
    sub         v5.8h, v5.8h, v6.8h     // b-c

    sub         v22.8h, v22.8h, v23.8h  // a-b
    sub         v23.8h, v23.8h, v24.8h  // b-c

    sshr        v4.8h, v4.8h, #2        // (a-b)/4
    sshr        v22.8h, v22.8h, #2      // (a-b)/4
    sub         v4.8h, v4.8h, v5.8h     // (a-b)/4-b+c
    sub         v22.8h, v22.8h, v23.8h  // (a-b)/4-b+c
    sshr        v4.8h, v4.8h, #2        // ((a-b)/4-b+c)/4
    sshr        v22.8h, v22.8h, #2      // ((a-b)/4-b+c)/4
    add         v4.8h, v4.8h, v6.8h     // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    add         v22.8h, v22.8h, v24.8h  // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    sqrshrun    v4.8b, v4.8h, #6
    ld1         {v28.16b}, [x7], #16    // src[16:31]

    ext         v23.16b, v7.16b, v18.16b, #15
    sqrshrun2   v4.16b, v22.8h, #6

    ext         v22.16b, v7.16b, v18.16b, #14
    ext         v24.16b, v18.16b, v28.16b, #1
    ext         v25.16b, v18.16b, v28.16b, #2
    ext         v26.16b, v18.16b, v28.16b, #3

    st1         {v4.16b}, [x2], #16
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
//                         uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
//                         intptr_t dst_stride, int width, int height )
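//
// Lowres init builds four half-resolution planes, one per half-pel phase
// (dst0 = integer, dsth = +half x, dstv = +half y, dstc = both), from
// rounded 2x2 averages of adjacent source rows. Hedged C for one output
// pixel, matching the urhadd cascade below:
//
//   #define RAVG(a,b) ( ((a) + (b) + 1) >> 1 )   // urhadd
//   dst0[x] = RAVG( RAVG( src0[2*x],   src1[2*x]   ),
//                   RAVG( src0[2*x+1], src1[2*x+1] ) );
//   dsth[x] = RAVG( RAVG( src0[2*x+1], src1[2*x+1] ),
//                   RAVG( src0[2*x+2], src1[2*x+2] ) );
//   // dstv/dstc: the same with rows (src1, src2) instead of (src0, src1)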
function x264_frame_init_lowres_core_neon, export=1

    sub         x10, x6, w7, uxtw       // dst_stride - width

    add         x12, x0, x5             // src1 = src0 + src_stride
    add         x13, x0, x5, lsl #1     // src2 = src1 + src_stride

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32

    urhadd      v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]

    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v20.16b, v30.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v22.16b, v31.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v20.16b, v21.16b
    urhadd      v18.16b, v22.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16

    urhadd      v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]

    ld2         {v0.16b,v1.16b}, [x11], #32
    ld2         {v2.16b,v3.16b}, [x12], #32
    ld2         {v4.16b,v5.16b}, [x13], #32
    urhadd      v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
    urhadd      v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
    ext         v24.16b, v30.16b, v20.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v31.16b, v22.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v30.16b, v21.16b
    urhadd      v18.16b, v31.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1         {v16.16b}, [x1], #16
    st1         {v18.16b}, [x3], #16
    st1         {v17.16b}, [x2], #16
    st1         {v19.16b}, [x4], #16

    add         x0, x0, x5, lsl #1
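
// load_deinterleave_chroma splits interleaved (UVUVUV...) chroma rows into
// the planar fenc/fdec layout: U in the first half of each row, V in the
// second. Hedged C sketch for the FENC_STRIDE variant:
//
//   static void load_deinterleave_chroma_fenc( uint8_t *dst, uint8_t *src,
//                                              intptr_t i_src, int height )
//   {
//       for( int y = 0; y < height; y++, dst += FENC_STRIDE, src += i_src )
//           for( int x = 0; x < FENC_STRIDE/2; x++ )
//           {
//               dst[x]                 = src[2*x];    // U half-row
//               dst[x + FENC_STRIDE/2] = src[2*x+1];  // V half-row
//           }
//   }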
function x264_load_deinterleave_chroma_fenc_neon, export=1
    mov         x4, #FENC_STRIDE/2
    b           load_deinterleave_chroma

function x264_load_deinterleave_chroma_fdec_neon, export=1
    mov         x4, #FDEC_STRIDE/2
load_deinterleave_chroma:
    ld2         {v0.8b,v1.8b}, [x1], x2
    ld2         {v2.8b,v3.8b}, [x1], x2

    st1         {v0.8b}, [x0], x4
    st1         {v1.8b}, [x0], x4
    st1         {v2.8b}, [x0], x4
    st1         {v3.8b}, [x0], x4
    b.gt        load_deinterleave_chroma
function x264_plane_copy_neon, export=1

    ldp         q0, q1, [x2], #32
    stp         q0, q1, [x0], #32

function x264_plane_copy_deinterleave_neon, export=1

    and         w9, w9, #0xfffffff0

    sub         x5, x5, x9, lsl #1

    ld2         {v0.16b,v1.16b}, [x4], #32

    st1         {v0.16b}, [x0], #16
    st1         {v1.16b}, [x2], #16

.macro deinterleave_rgb

    st1         {v0.8b}, [x0], #8
    st1         {v1.8b}, [x2], #8
    st1         {v2.8b}, [x4], #8

function x264_plane_copy_deinterleave_rgb_neon, export=1

    ldp         w9, w10, [sp, #4]

    ldp         x9, x10, [sp, #8]

    sub         x7, x7, x11, lsl #1

    ld3         {v0.8b,v1.8b,v2.8b}, [x6], #24

    sub         x7, x7, x11, lsl #2

    ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32

function x264_plane_copy_interleave_neon, export=1

    and         w9, w9, #0xfffffff0
    sub         x1, x1, x9, lsl #1

    ld1         {v0.16b}, [x2], #16
    ld1         {v1.16b}, [x4], #16

    st2         {v0.16b,v1.16b}, [x0], #32

function x264_store_interleave_chroma_neon, export=1
    mov         x5, #FDEC_STRIDE

    ld1         {v0.8b}, [x2], x5
    ld1         {v1.8b}, [x3], x5
    ld1         {v2.8b}, [x2], x5
    ld1         {v3.8b}, [x3], x5

    zip1        v4.16b, v0.16b, v1.16b
    zip1        v5.16b, v2.16b, v3.16b
    st1         {v4.16b}, [x0], x1
    st1         {v5.16b}, [x0], x1
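
// The integral_* functions build a summed-area table one row at a time:
// each output entry is a 4- or 8-pixel horizontal sum plus the entry one
// row above, so box sums of any position fall out of a few subtractions.
// Hedged C for the 4h case, matching the ext/uaddl evaluation below (the
// x3 = x0 - 2*stride setup is the byte address of the previous uint16 row):
//
//   static void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
//   {
//       for( int x = 0; x < stride - 4; x++ )
//           sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3]
//                  + sum[x - stride];   // entry one row above
//   }
//
// The *4v/*8v passes then difference rows of these horizontal sums to turn
// them into 4x4 / 8x8 box sums.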
.macro integral4h p1, p2
    ext         v1.8b, \p1\().8b, \p2\().8b, #1
    ext         v2.8b, \p1\().8b, \p2\().8b, #2
    ext         v3.8b, \p1\().8b, \p2\().8b, #3
    uaddl       v0.8h, \p1\().8b, v1.8b
    uaddl       v4.8h, v2.8b, v3.8b
    add         v0.8h, v0.8h, v4.8h
    add         v0.8h, v0.8h, v5.8h

function integral_init4h_neon, export=1
    sub         x3, x0, x2, lsl #1
    ld1         {v6.8b,v7.8b}, [x1], #16

    ld1         {v5.8h}, [x3], #16

    ld1         {v6.8b}, [x1], #8
    ld1         {v5.8h}, [x3], #16
    st1         {v0.8h}, [x0], #16

    ld1         {v7.8b}, [x1], #8
    st1         {v0.8h}, [x0], #16

.macro integral8h p1, p2, s
    ext         v1.8b, \p1\().8b, \p2\().8b, #1
    ext         v2.8b, \p1\().8b, \p2\().8b, #2
    ext         v3.8b, \p1\().8b, \p2\().8b, #3
    ext         v4.8b, \p1\().8b, \p2\().8b, #4
    ext         v5.8b, \p1\().8b, \p2\().8b, #5
    ext         v6.8b, \p1\().8b, \p2\().8b, #6
    ext         v7.8b, \p1\().8b, \p2\().8b, #7
    uaddl       v0.8h, \p1\().8b, v1.8b
    uaddl       v2.8h, v2.8b, v3.8b
    uaddl       v4.8h, v4.8b, v5.8b
    uaddl       v6.8h, v6.8b, v7.8b
    add         v0.8h, v0.8h, v2.8h
    add         v4.8h, v4.8h, v6.8h
    add         v0.8h, v0.8h, v4.8h
    add         v0.8h, v0.8h, \s\().8h

function integral_init8h_neon, export=1
    sub         x3, x0, x2, lsl #1
    ld1         {v16.8b,v17.8b}, [x1], #16

    ld1         {v18.8h}, [x3], #16
    integral8h  v16, v17, v18
    ld1         {v16.8b}, [x1], #8
    ld1         {v18.8h}, [x3], #16
    st1         {v0.8h}, [x0], #16
    integral8h  v17, v16, v18
    ld1         {v17.8b}, [x1], #8
    st1         {v0.8h}, [x0], #16

function integral_init4v_neon, export=1

    add         x4, x0, x2, lsl #3
    add         x8, x0, x2, lsl #4

    ld1         {v20.8h,v21.8h,v22.8h}, [x3], #48
    ld1         {v16.8h,v17.8h,v18.8h}, [x8], #48

    ld1         {v24.8h,v25.8h}, [x4], #32
    ext         v0.16b, v20.16b, v21.16b, #8
    ext         v1.16b, v21.16b, v22.16b, #8
    ext         v2.16b, v16.16b, v17.16b, #8
    ext         v3.16b, v17.16b, v18.16b, #8
    sub         v24.8h, v24.8h, v20.8h
    sub         v25.8h, v25.8h, v21.8h
    add         v0.8h, v0.8h, v20.8h
    add         v1.8h, v1.8h, v21.8h
    add         v2.8h, v2.8h, v16.8h
    add         v3.8h, v3.8h, v17.8h
    st1         {v24.8h}, [x1], #16
    st1         {v25.8h}, [x1], #16
    mov         v20.16b, v22.16b
    mov         v16.16b, v18.16b
    sub         v0.8h, v2.8h, v0.8h
    sub         v1.8h, v3.8h, v1.8h
    ld1         {v21.8h,v22.8h}, [x3], #32
    ld1         {v17.8h,v18.8h}, [x8], #32
    st1         {v0.8h}, [x0], #16
    st1         {v1.8h}, [x0], #16
function integral_init8v_neon, export=1
    add         x2, x0, x1, lsl #4

    ands        x3, x1, #16 - 1

    ld1         {v2.8h}, [x2], #16
    sub         v4.8h, v2.8h, v0.8h
    st1         {v4.8h}, [x0], #16

    ld1         {v0.8h,v1.8h}, [x0]
    ld1         {v2.8h,v3.8h}, [x2], #32
    sub         v4.8h, v2.8h, v0.8h
    sub         v5.8h, v3.8h, v1.8h
    st1         {v4.8h}, [x0], #16
    st1         {v5.8h}, [x0], #16
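
// mbtree_propagate_cost redistributes future cost onto each macroblock in
// proportion to how much cheaper inter coding is than intra. Hedged C
// reference (argument names are assumptions; the *fps_factor load into v5
// and the propagate_in widening into v24/v25 happen in elided setup, and
// 0x3fff masks off the lowres list bits before the min):
//
//   for( int i = 0; i < len; i++ )
//   {
//       int intra = intra_costs[i];
//       int inter = X264_MIN( intra, inter_costs[i] & 0x3fff );
//       float amount = propagate_in[i]
//                    + intra * inv_qscales[i] * fps_factor;
//       // round to nearest (fcvtns) and saturate to int16 (sqxtn)
//       dst[i] = (int16_t)( amount * (intra - inter) / intra + 0.5f );
//   }
//
// The division is done with frecpe plus one frecps Newton-Raphson step
// rather than fdiv.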
function x264_mbtree_propagate_cost_neon, export=1

    ld1         {v1.8h}, [x1], #16
    ld1         {v2.8h}, [x2], #16
    ld1         {v3.8h}, [x3], #16
    ld1         {v4.8h}, [x4], #16
    bic         v3.8h, #0xc0, lsl #8
    umin        v3.8h, v2.8h, v3.8h
    umull       v20.4s, v2.4h, v4.4h    // propagate_intra
    umull2      v21.4s, v2.8h, v4.8h    // propagate_intra
    usubl       v22.4s, v2.4h, v3.4h    // propagate_num
    usubl2      v23.4s, v2.8h, v3.8h    // propagate_num
    uxtl        v26.4s, v2.4h           // propagate_denom
    uxtl2       v27.4s, v2.8h           // propagate_denom

    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v26.4s, v26.4s
    ucvtf       v27.4s, v27.4s
    ucvtf       v22.4s, v22.4s
    ucvtf       v23.4s, v23.4s
    frecpe      v28.4s, v26.4s
    frecpe      v29.4s, v27.4s
    ucvtf       v24.4s, v24.4s
    ucvtf       v25.4s, v25.4s
    frecps      v30.4s, v28.4s, v26.4s
    frecps      v31.4s, v29.4s, v27.4s
    fmla        v24.4s, v20.4s, v5.4s   // propagate_amount
    fmla        v25.4s, v21.4s, v5.4s   // propagate_amount
    fmul        v28.4s, v28.4s, v30.4s
    fmul        v29.4s, v29.4s, v31.4s
    fmul        v16.4s, v24.4s, v22.4s
    fmul        v17.4s, v25.4s, v23.4s
    fmul        v18.4s, v16.4s, v28.4s
    fmul        v19.4s, v17.4s, v29.4s
    fcvtns      v20.4s, v18.4s
    fcvtns      v21.4s, v19.4s

    sqxtn2      v0.8h, v21.4s
    st1         {v0.8h}, [x0], #16
const pw_0to15, align=5
    .short      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
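
// mbtree_propagate_list splats each macroblock's propagate_amount onto the
// four lowres blocks its motion vector points between. The vector's
// fractional part (x & 31, y & 31, in fullpel<<5 units) yields bilinear
// weights (32-x)*(32-y), x*(32-y), (32-x)*y and x*y (idx0..idx3 below, each
// rounded by >>10 since the four weights sum to 1024); bipred blocks first
// scale the amount by bipred_weight/64, selected by the cmeq/bsl pair.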
function x264_mbtree_propagate_list_internal_neon, export=1
    movrel      x11, pw_0to15
    dup         v31.8h, w4              // bipred_weight
    movi        v30.8h, #0xc0, lsl #8
    ld1         {v29.8h}, [x11]         // h->mb.i_mb_x, h->mb.i_mb_y

    dup         v24.8h, w5              // mb_y
    zip1        v29.8h, v29.8h, v24.8h

    ld1         {v1.8h}, [x1], #16      // propagate_amount
    ld1         {v2.8h}, [x2], #16      // lowres_cost
    and         v2.16b, v2.16b, v30.16b
    cmeq        v25.8h, v2.8h, v30.8h
    umull       v16.4s, v1.4h, v31.4h
    umull2      v17.4s, v1.8h, v31.8h
    rshrn       v16.4h, v16.4s, #6
    rshrn2      v16.8h, v17.4s, #6
    bsl         v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
    // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    ld1         {v4.8h,v5.8h}, [x0], #32
    sshr        v6.8h, v4.8h, #5
    sshr        v7.8h, v5.8h, #5
    add         v6.8h, v6.8h, v29.8h
    add         v29.8h, v29.8h, v28.8h
    add         v7.8h, v7.8h, v29.8h
    add         v29.8h, v29.8h, v28.8h
    st1         {v6.8h,v7.8h}, [x3], #32
    and         v4.16b, v4.16b, v27.16b
    and         v5.16b, v5.16b, v27.16b
    uzp1        v6.8h, v4.8h, v5.8h     // x & 31
    uzp2        v7.8h, v4.8h, v5.8h     // y & 31
    sub         v4.8h, v26.8h, v6.8h    // 32 - (x & 31)
    sub         v5.8h, v26.8h, v7.8h    // 32 - (y & 31)
    mul         v19.8h, v6.8h, v7.8h    // idx3weight = y*x;
    mul         v18.8h, v4.8h, v7.8h    // idx2weight = y*(32-x);
    mul         v17.8h, v6.8h, v5.8h    // idx1weight = (32-y)*x;
    mul         v16.8h, v4.8h, v5.8h    // idx0weight = (32-y)*(32-x);
    umull       v6.4s, v19.4h, v25.4h
    umull2      v7.4s, v19.8h, v25.8h
    umull       v4.4s, v18.4h, v25.4h
    umull2      v5.4s, v18.8h, v25.8h
    umull       v2.4s, v17.4h, v25.4h
    umull2      v3.4s, v17.8h, v25.8h
    umull       v0.4s, v16.4h, v25.4h
    umull2      v1.4s, v16.8h, v25.8h
    rshrn       v19.4h, v6.4s, #10
    rshrn2      v19.8h, v7.4s, #10
    rshrn       v18.4h, v4.4s, #10
    rshrn2      v18.8h, v5.4s, #10
    rshrn       v17.4h, v2.4s, #10
    rshrn2      v17.8h, v3.4s, #10
    rshrn       v16.4h, v0.4s, #10
    rshrn2      v16.8h, v1.4s, #10
    zip1        v0.8h, v16.8h, v17.8h
    zip2        v1.8h, v16.8h, v17.8h
    zip1        v2.8h, v18.8h, v19.8h
    zip2        v3.8h, v18.8h, v19.8h
    st1         {v0.8h,v1.8h}, [x3], #32
    st1         {v2.8h,v3.8h}, [x3], #32
function x264_memcpy_aligned_neon, export=1

    ldp         q0, q1, [x1], #32
    stp         q0, q1, [x0], #32

    ldp         q0, q1, [x1, #32]
    ldp         q2, q3, [x1], #64
    stp         q0, q1, [x0, #32]
    stp         q2, q3, [x0], #64

function x264_memzero_aligned_neon, export=1

    stp         q0, q1, [x0, #96]
    stp         q0, q1, [x0, #64]
    stp         q0, q1, [x0, #32]
    stp         q0, q1, [x0], #128