1 /*****************************************************************************
2 * pixel.S: aarch64 pixel metrics
3 *****************************************************************************
4 * Copyright (C) 2009-2015 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Two rows of eight 16-bit lanes, each either 0 or -1 (all bits set), i.e. a
// shape usable as a per-lane AND mask. The first row zeroes lanes 0 and 4,
// the second row zeroes lane 0 only.
// NOTE(review): the label for this data is not visible in this excerpt;
// presumably it is the mask_ac_4_8 table that the hadamard_ac code loads
// into v30/v31 -- confirm against the full file.
39 .short 0, -1, -1, -1, 0, -1, -1, -1
40 .short 0, -1, -1, -1, -1, -1, -1, -1
// 4-byte-wide SAD row pairs.
// NOTE(review): the enclosing macro headers/footers are not visible in this
// excerpt (the embedded numbering skips lines), so their names are unknown.
//
// First pair: load one 4-byte row into lane s[0] and the next into lane s[1]
// of v0 (address x0, post-incremented by x1) and v1 (address x2,
// post-incremented by x3), then initialise the 16-bit accumulator v16 with
// the widened absolute differences of the eight bytes.
44 ld1 {v1.s}[0], [x2], x3
45 ld1 {v0.s}[0], [x0], x1
46 ld1 {v1.s}[1], [x2], x3
47 ld1 {v0.s}[1], [x0], x1
48 uabdl v16.8h, v0.8b, v1.8b
// Following pair: identical loads, but uabal adds the absolute differences
// onto the running totals already held in v16.
52 ld1 {v1.s}[0], [x2], x3
53 ld1 {v0.s}[0], [x0], x1
54 ld1 {v1.s}[1], [x2], x3
55 ld1 {v0.s}[1], [x0], x1
56 uabal v16.8h, v0.8b, v1.8b
// 8-byte-wide SAD arithmetic.
// NOTE(review): only the difference instructions are visible; the lines that
// fill v0..v3 and the macro boundaries are elided in this excerpt.
//
// Initialise the two 16-bit accumulators: v16 = |v0 - v1|, v17 = |v2 - v3|.
64 uabdl v16.8h, v0.8b, v1.8b
65 uabdl v17.8h, v2.8b, v3.8b
// Accumulate a further two rows of absolute differences into v16 / v17.
73 uabal v16.8h, v0.8b, v1.8b
74 uabal v17.8h, v2.8b, v3.8b
// 16-byte-wide SAD row pairs.
// NOTE(review): the macro headers/footers for these two groups are elided in
// this excerpt.
//
// First pair: load two 16-byte rows from x2 (post-increment x3) into v1/v3
// and two from x0 (post-increment x1) into v0/v2. v16 collects absolute
// differences of the low 8 bytes of each row, v17 those of the high 8 bytes
// (the "2" instruction forms). The first row initialises the accumulators
// (uabdl), the second row is added on top (uabal).
78 ld1 {v1.16b}, [x2], x3
79 ld1 {v0.16b}, [x0], x1
80 ld1 {v3.16b}, [x2], x3
81 ld1 {v2.16b}, [x0], x1
82 uabdl v16.8h, v0.8b, v1.8b
83 uabdl2 v17.8h, v0.16b, v1.16b
84 uabal v16.8h, v2.8b, v3.8b
85 uabal2 v17.8h, v2.16b, v3.16b
// Following pair: same loads, every row accumulated into v16 / v17.
89 ld1 {v1.16b}, [x2], x3
90 ld1 {v0.16b}, [x0], x1
91 ld1 {v3.16b}, [x2], x3
92 ld1 {v2.16b}, [x0], x1
93 uabal v16.8h, v0.8b, v1.8b
94 uabal2 v17.8h, v0.16b, v1.16b
95 uabal v16.8h, v2.8b, v3.8b
96 uabal2 v17.8h, v2.16b, v3.16b
// SAD_FUNC w, h, name: emits one exported function whose symbol is built
// from the three arguments as x264_pixel_sad<name>_<w>x<h>_neon.
// NOTE(review): most of the macro body (embedded lines 101-106 and everything
// after 107, including .endm) is elided in this excerpt.
99 .macro SAD_FUNC w, h, name
100 function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
// Fold the second partial accumulator v17 into v16, lane by lane.
107 add v16.8h, v16.8h, v17.8h
// SAD_X_4 x, first=uabal: one two-row step of a multi-candidate 4-byte-wide
// SAD. "first" names the instruction applied to each accumulator (uabal by
// default; a caller can pass another mnemonic for the opening step).
//   v0        two 4-byte rows read from x0 (post-increment x7)
//   v1..v4    two 4-byte rows read from x1..x4 (post-increment x5)
//   v16..v19  16-bit accumulators, one per candidate, each updated against v0
// NOTE(review): the x argument is not referenced by any visible line, and
// embedded line 136 (just before the v4/v19 section) is elided -- presumably
// x guards the fourth candidate. Confirm against the full file.
124 .macro SAD_X_4 x, first=uabal
125 ld1 {v0.s}[0], [x0], x7
126 ld1 {v1.s}[0], [x1], x5
127 ld1 {v0.s}[1], [x0], x7
128 ld1 {v1.s}[1], [x1], x5
129 \first v16.8h, v1.8b, v0.8b
130 ld1 {v2.s}[0], [x2], x5
131 ld1 {v2.s}[1], [x2], x5
132 \first v17.8h, v2.8b, v0.8b
133 ld1 {v3.s}[0], [x3], x5
134 ld1 {v3.s}[1], [x3], x5
135 \first v18.8h, v3.8b, v0.8b
137 ld1 {v4.s}[0], [x4], x5
138 ld1 {v4.s}[1], [x4], x5
139 \first v19.8h, v4.8b, v0.8b
// SAD_X_8 x, first=uabal: one two-row step of a multi-candidate 8-byte-wide
// SAD.
//   v0, v5    first and second 8-byte row read from x0 (post-increment x7)
//   v1..v4    candidate rows read from x1..x4 (post-increment x5); each
//             register is reloaded for the second row
//   v16..v19  16-bit accumulators, one per candidate
// The first row of each candidate goes through "first" (so the caller picks
// initialise vs. accumulate); the second row always accumulates with uabal.
// Loads for the next candidate are interleaved between the arithmetic.
// NOTE(review): x is unused in the visible lines and embedded line 158
// (before the v4/v19 section) is elided -- presumably a guard on x.
143 .macro SAD_X_8 x, first=uabal
144 ld1 {v0.8b}, [x0], x7
145 ld1 {v1.8b}, [x1], x5
146 \first v16.8h, v1.8b, v0.8b
147 ld1 {v2.8b}, [x2], x5
148 ld1 {v5.8b}, [x0], x7
149 \first v17.8h, v2.8b, v0.8b
150 ld1 {v3.8b}, [x3], x5
151 ld1 {v1.8b}, [x1], x5
152 \first v18.8h, v3.8b, v0.8b
153 uabal v16.8h, v1.8b, v5.8b
154 ld1 {v2.8b}, [x2], x5
155 ld1 {v3.8b}, [x3], x5
156 uabal v17.8h, v2.8b, v5.8b
157 uabal v18.8h, v3.8b, v5.8b
159 ld1 {v4.8b}, [x4], x5
160 \first v19.8h, v4.8b, v0.8b
161 ld1 {v4.8b}, [x4], x5
162 uabal v19.8h, v4.8b, v5.8b
// SAD_X_16 x, first=uabal: one two-row step of a multi-candidate
// 16-byte-wide SAD.
//   v0, v5    first and second 16-byte row read from x0 (post-increment x7)
//   v1..v4    candidate rows read from x1..x4 (post-increment x5)
//   v16..v19  accumulators for the low 8 bytes of each candidate
//   v20..v23  accumulators for the high 8 bytes (the "2" instruction forms)
// The first row uses "first" and, for the high half, "first" with a 2 suffix
// appended; the second row always accumulates with uabal / uabal2.
// NOTE(review): x is unused in the visible lines and embedded line 187
// (before the v4 section) is elided -- presumably a guard on x.
166 .macro SAD_X_16 x, first=uabal
167 ld1 {v0.16b}, [x0], x7
168 ld1 {v1.16b}, [x1], x5
169 \first v16.8h, v1.8b, v0.8b
170 \first\()2 v20.8h, v1.16b, v0.16b
171 ld1 {v2.16b}, [x2], x5
172 ld1 {v5.16b}, [x0], x7
173 \first v17.8h, v2.8b, v0.8b
174 \first\()2 v21.8h, v2.16b, v0.16b
175 ld1 {v3.16b}, [x3], x5
176 ld1 {v1.16b}, [x1], x5
177 \first v18.8h, v3.8b, v0.8b
178 \first\()2 v22.8h, v3.16b, v0.16b
179 uabal v16.8h, v1.8b, v5.8b
180 uabal2 v20.8h, v1.16b, v5.16b
181 ld1 {v2.16b}, [x2], x5
182 ld1 {v3.16b}, [x3], x5
183 uabal v17.8h, v2.8b, v5.8b
184 uabal2 v21.8h, v2.16b, v5.16b
185 uabal v18.8h, v3.8b, v5.8b
186 uabal2 v22.8h, v3.16b, v5.16b
188 ld1 {v4.16b}, [x4], x5
189 \first v19.8h, v4.8b, v0.8b
190 \first\()2 v23.8h, v4.16b, v0.16b
191 ld1 {v4.16b}, [x4], x5
192 uabal v19.8h, v4.8b, v5.8b
193 uabal2 v23.8h, v4.16b, v5.16b
// SAD_X_FUNC x, w, h: emits one exported function named
// x264_pixel_sad_x<x>_<w>x<h>_neon.
// NOTE(review): only the header and part of the final reduction are visible;
// embedded lines 199-211, 215 and everything after 216 are elided here.
197 .macro SAD_X_FUNC x, w, h
198 function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
// Fold the accumulators v20..v22 into v16..v18, one pair per candidate.
212 add v16.8h, v16.8h, v20.8h
213 add v17.8h, v17.8h, v21.8h
214 add v18.8h, v18.8h, v22.8h
// Same fold for the v19/v23 pair (the line preceding it is elided).
216 add v19.8h, v19.8h, v23.8h
// x264_pixel_vsad_neon: sums absolute differences between consecutive
// 16-byte rows read from x0 (post-incremented by x1 after every row).
//   v0, v1  alternate as "previous row" and "current row"
//   v6      16-bit accumulator for the low 8 bytes
//   v7      16-bit accumulator for the high 8 bytes
// NOTE(review): the row counter handling, branch labels and the return
// sequence are elided in this excerpt.
252 function x264_pixel_vsad_neon, export=1
254 ld1 {v0.16b}, [x0], x1
255 ld1 {v1.16b}, [x0], x1
// Initialise the accumulators with the difference of the first two rows.
256 uabdl v6.8h, v0.8b, v1.8b
257 uabdl2 v7.8h, v0.16b, v1.16b
// New row into v0, compared with the previous row still held in v1.
261 ld1 {v0.16b}, [x0], x1
262 uabal v6.8h, v1.8b, v0.8b
263 uabal2 v7.8h, v1.16b, v0.16b
// New row into v1, compared with the row just loaded into v0.
264 ld1 {v1.16b}, [x0], x1
266 uabal v6.8h, v0.8b, v1.8b
267 uabal2 v7.8h, v0.16b, v1.16b
// Combine the low-half and high-half accumulators into v5.
270 add v5.8h, v6.8h, v7.8h
// x264_pixel_asd8_neon: accumulates the per-lane differences between 8-byte
// rows read from x0 (post-increment x1) and from x2 (post-increment x3) into
// the 16-bit lanes of v16. Unlike the SAD code these are plain subtractions
// (usubl), not absolute differences, so positive and negative lanes cancel.
// NOTE(review): the counter setup, loop label/branch and the final
// horizontal reduction are elided in this excerpt.
276 function x264_pixel_asd8_neon, export=1
278 ld1 {v0.8b}, [x0], x1
279 ld1 {v1.8b}, [x2], x3
280 ld1 {v2.8b}, [x0], x1
281 ld1 {v3.8b}, [x2], x3
// First row initialises the accumulator; the second row (v2/v3) stays
// pending and is consumed by the usubl into v17 below.
282 usubl v16.8h, v0.8b, v1.8b
285 ld1 {v4.8b}, [x0], x1
286 ld1 {v5.8b}, [x2], x3
287 usubl v17.8h, v2.8b, v3.8b
288 usubl v18.8h, v4.8b, v5.8b
289 add v16.8h, v16.8h, v17.8h
// Reload v2/v3 with the next pending row while the adds complete.
290 ld1 {v2.8b}, [x0], x1
291 ld1 {v3.8b}, [x2], x3
292 add v16.8h, v16.8h, v18.8h
// Consume the last pending row.
294 usubl v17.8h, v2.8b, v3.8b
295 add v16.8h, v16.8h, v17.8h
// 4-byte-wide SSD (sum of squared differences) row steps.
// NOTE(review): the macro headers/footers separating the three groups below
// are elided in this excerpt.
// Each group widens the byte difference of the current row into v2 and
// squares its low four 16-bit lanes into the 32-bit accumulator v0; the
// first two groups also pre-load the next row into v16/v17 lane s[0].
//
// Opening step: smull initialises v0.
303 ld1 {v16.s}[0], [x0], x1
304 ld1 {v17.s}[0], [x2], x3
305 usubl v2.8h, v16.8b, v17.8b
306 ld1 {v16.s}[0], [x0], x1
307 ld1 {v17.s}[0], [x2], x3
308 smull v0.4s, v2.4h, v2.4h
// Middle step: smlal accumulates into v0.
312 usubl v2.8h, v16.8b, v17.8b
313 ld1 {v16.s}[0], [x0], x1
314 ld1 {v17.s}[0], [x2], x3
315 smlal v0.4s, v2.4h, v2.4h
// Closing step: no further load, just the last accumulate.
319 usubl v2.8h, v16.8b, v17.8b
320 smlal v0.4s, v2.4h, v2.4h
// 8-byte-wide SSD row steps.
// NOTE(review): the macro headers/footers separating the three groups below
// are elided in this excerpt.
// v2 holds the widened row difference; its low four lanes are squared with
// smull/smlal and its high four with smlal2, all into the 32-bit
// accumulator v0. The next row is loaded into v16/v17 between the multiplies.
//
// Opening step: smull initialises v0, smlal2 adds the high lanes.
324 ld1 {v16.8b}, [x0], x1
325 ld1 {v17.8b}, [x2], x3
326 usubl v2.8h, v16.8b, v17.8b
327 ld1 {v16.8b}, [x0], x1
328 smull v0.4s, v2.4h, v2.4h
329 ld1 {v17.8b}, [x2], x3
330 smlal2 v0.4s, v2.8h, v2.8h
// Middle step: accumulate and pre-load the next row.
334 usubl v2.8h, v16.8b, v17.8b
335 ld1 {v16.8b}, [x0], x1
336 smlal v0.4s, v2.4h, v2.4h
337 ld1 {v17.8b}, [x2], x3
338 smlal2 v0.4s, v2.8h, v2.8h
// Closing step: accumulate the final pre-loaded row.
342 usubl v2.8h, v16.8b, v17.8b
343 smlal v0.4s, v2.4h, v2.4h
344 smlal2 v0.4s, v2.8h, v2.8h
// 16-byte-wide SSD row steps.
// NOTE(review): the macro headers/footers separating the three groups below
// are elided in this excerpt.
//   v2 / v3  widened differences of the low / high 8 bytes of the row
//   v0       32-bit accumulator fed by the low four lanes of v2 and v3
//   v1       32-bit accumulator fed by the high four lanes of v2 and v3
//
// Opening step: smull / smull2 initialise v0 / v1.
348 ld1 {v16.16b}, [x0], x1
349 ld1 {v17.16b}, [x2], x3
350 usubl v2.8h, v16.8b, v17.8b
351 usubl2 v3.8h, v16.16b, v17.16b
352 ld1 {v16.16b}, [x0], x1
353 smull v0.4s, v2.4h, v2.4h
354 smull2 v1.4s, v2.8h, v2.8h
355 ld1 {v17.16b}, [x2], x3
356 smlal v0.4s, v3.4h, v3.4h
357 smlal2 v1.4s, v3.8h, v3.8h
// Middle step: accumulate and pre-load the next row.
361 usubl v2.8h, v16.8b, v17.8b
362 usubl2 v3.8h, v16.16b, v17.16b
363 ld1 {v16.16b}, [x0], x1
364 smlal v0.4s, v2.4h, v2.4h
365 smlal2 v1.4s, v2.8h, v2.8h
366 ld1 {v17.16b}, [x2], x3
367 smlal v0.4s, v3.4h, v3.4h
368 smlal2 v1.4s, v3.8h, v3.8h
// Closing step: accumulate the final row, then merge the two accumulators.
372 usubl v2.8h, v16.8b, v17.8b
373 usubl2 v3.8h, v16.16b, v17.16b
374 smlal v0.4s, v2.4h, v2.4h
375 smlal2 v1.4s, v2.8h, v2.8h
376 smlal v0.4s, v3.4h, v3.4h
377 smlal2 v1.4s, v3.8h, v3.8h
378 add v0.4s, v0.4s, v1.4s
// Exported SSD entry point; the symbol name is assembled from the w and h
// macro arguments.
// NOTE(review): the enclosing .macro line and the whole function body are
// elided in this excerpt.
382 function x264_pixel_ssd_\w\()x\h\()_neon, export=1
// x264_pixel_ssd_nv12_core_neon: squared-difference sums over two
// byte-interleaved streams at x0 and x2.
// ld2 de-interleaves each 16-byte read, so even-offset bytes and odd-offset
// bytes land in separate registers and are summed independently:
//   v20  32-bit sums of squares for the even-offset bytes
//   v21  32-bit sums of squares for the odd-offset bytes
//   v6 / v7  64-bit totals that v20 / v21 are widened into
// NOTE(review): the computation of x8, the width/height counters, all branch
// labels and the final stores are elided in this excerpt.
405 function x264_pixel_ssd_nv12_core_neon, export=1
// Reduce both strides by 2*x8 (x8 is set on a line not visible here).
411 sub x1, x1, x8, lsl #1
412 sub x3, x3, x8, lsl #1
// Read two 16-byte groups from each stream (post-increment #16).
415 ld2 {v0.8b,v1.8b}, [x0], #16
416 ld2 {v2.8b,v3.8b}, [x2], #16
417 ld2 {v24.8b,v25.8b}, [x0], #16
418 ld2 {v26.8b,v27.8b}, [x2], #16
// First group: smull initialises v20/v21, smlal2 adds the high lanes.
420 usubl v16.8h, v0.8b, v2.8b
421 usubl v17.8h, v1.8b, v3.8b
422 smull v20.4s, v16.4h, v16.4h
423 smull v21.4s, v17.4h, v17.4h
424 usubl v18.8h, v24.8b, v26.8b
425 usubl v19.8h, v25.8b, v27.8b
426 smlal2 v20.4s, v16.8h, v16.8h
427 smlal2 v21.4s, v17.8h, v17.8h
// Accumulate the second group (v18/v19) while loading the next one.
432 smlal v20.4s, v18.4h, v18.4h
433 smlal v21.4s, v19.4h, v19.4h
434 ld2 {v0.8b,v1.8b}, [x0], #16
435 ld2 {v2.8b,v3.8b}, [x2], #16
436 smlal2 v20.4s, v18.8h, v18.8h
437 smlal2 v21.4s, v19.8h, v19.8h
440 usubl v16.8h, v0.8b, v2.8b
441 usubl v17.8h, v1.8b, v3.8b
442 smlal v20.4s, v16.4h, v16.4h
443 smlal v21.4s, v17.4h, v17.4h
444 ld2 {v24.8b,v25.8b}, [x0], #16
445 ld2 {v26.8b,v27.8b}, [x2], #16
446 smlal2 v20.4s, v16.8h, v16.8h
447 smlal2 v21.4s, v17.8h, v17.8h
450 usubl v18.8h, v24.8b, v26.8b
451 usubl v19.8h, v25.8b, v27.8b
// Accumulate a pending v18/v19 group without issuing further loads.
454 smlal v20.4s, v18.4h, v18.4h
455 smlal v21.4s, v19.4h, v19.4h
456 smlal2 v20.4s, v18.8h, v18.8h
457 smlal2 v21.4s, v19.8h, v19.8h
// Widen the 32-bit partial sums into the 64-bit totals (low lanes, then the
// high lanes with the "2" form).
460 uaddw v6.2d, v6.2d, v20.2s
461 uaddw v7.2d, v7.2d, v21.2s
464 uaddw2 v6.2d, v6.2d, v20.4s
465 uaddw2 v7.2d, v7.2d, v21.4s
// Pairwise add: v6.d[0] = total of v6, v6.d[1] = total of v7.
468 addp v6.2d, v6.2d, v7.2d
// x264_pixel_var_8x<h>_neon: walks 8-byte rows from x0 (post-increment x1),
// keeping a running 16-bit per-lane pixel sum in v0 (uaddw) and producing
// the per-lane squares of each row with umull.
// NOTE(review): the enclosing .macro line, the initialisation of v0, the
// loop counter/branches and the instructions that fold the umull results
// (v1, v2, v24..v29) into a sum of squares are elided in this excerpt.
476 function x264_pixel_var_8x\h\()_neon, export=1
477 ld1 {v16.8b}, [x0], x1
478 ld1 {v17.8b}, [x0], x1
480 umull v1.8h, v16.8b, v16.8b
482 umull v2.8h, v17.8b, v17.8b
483 uaddw v0.8h, v0.8h, v17.8b
484 ld1 {v18.8b}, [x0], x1
487 ld1 {v19.8b}, [x0], x1
// Four rows per group: rows already in v18/v19 are consumed while v20/v21
// and then the next v18/v19 are loaded.
490 uaddw v0.8h, v0.8h, v18.8b
491 umull v24.8h, v18.8b, v18.8b
492 ld1 {v20.8b}, [x0], x1
493 uaddw v0.8h, v0.8h, v19.8b
494 umull v25.8h, v19.8b, v19.8b
496 ld1 {v21.8b}, [x0], x1
497 uaddw v0.8h, v0.8h, v20.8b
498 umull v26.8h, v20.8b, v20.8b
500 ld1 {v18.8b}, [x0], x1
501 uaddw v0.8h, v0.8h, v21.8b
502 umull v27.8h, v21.8b, v21.8b
504 ld1 {v19.8b}, [x0], x1
// Consume the last two rows left in v18/v19; no further loads follow.
508 uaddw v0.8h, v0.8h, v18.8b
509 umull v28.8h, v18.8b, v18.8b
510 uaddw v0.8h, v0.8h, v19.8b
511 umull v29.8h, v19.8b, v19.8b
// x264_pixel_var_16x16_neon: walks 16-byte rows from x0 (post-increment x1).
// For each row the low and high 8 bytes are added into the 16-bit running
// sum v0 (uaddw / uaddw2) and squared with umull / umull2.
// NOTE(review): the initialisation of v0, the loop counter/branches and the
// instructions that accumulate the squares (v1..v6) are elided in this
// excerpt.
522 function x264_pixel_var_16x16_neon, export=1
523 ld1 {v16.16b}, [x0], x1
524 ld1 {v17.16b}, [x0], x1
526 umull v1.8h, v16.8b, v16.8b
527 umull2 v2.8h, v16.16b, v16.16b
531 uaddw2 v0.8h, v0.8h, v16.16b
// Row in v17: sum and square both halves while v18 loads.
534 ld1 {v18.16b}, [x0], x1
535 uaddw v0.8h, v0.8h, v17.8b
536 umull v3.8h, v17.8b, v17.8b
537 uaddw2 v0.8h, v0.8h, v17.16b
538 umull2 v4.8h, v17.16b, v17.16b
// Row in v18: same, while the next v17 loads.
542 ld1 {v17.16b}, [x0], x1
543 uaddw v0.8h, v0.8h, v18.8b
544 umull v5.8h, v18.8b, v18.8b
545 uaddw2 v0.8h, v0.8h, v18.16b
546 umull2 v6.8h, v18.16b, v18.16b
// Final row left in v17.
551 uaddw v0.8h, v0.8h, v17.8b
552 umull v3.8h, v17.8b, v17.8b
553 uaddw2 v0.8h, v0.8h, v17.16b
554 umull2 v4.8h, v17.16b, v17.16b
// x264_var_end: local (non-exported) tail that merges two 32-bit
// accumulators and packs two scalar results into x0.
// NOTE(review): the horizontal reductions and register moves between the
// two visible instructions (embedded lines 561-564) and the return are
// elided in this excerpt.
559 function x264_var_end
// Merge the two 32-bit accumulators v1 and v2.
560 add v1.4s, v1.4s, v2.4s
// Result packing: x1 goes into bits 63:32 of x0, on top of the value
// already in x0.
565 orr x0, x0, x1, lsl #32
// pixel_var2_8 h: emits x264_pixel_var2_8x<h>_neon. The function walks 8-byte
// rows from x0 (post-increment x1) and x2 (post-increment x3) and keeps:
//   v0       16-bit per-lane running sum of the row differences
//   v2 / v3  32-bit sums of squared differences for the low / high four lanes
// Rows alternate between the v16/v18 -> v6 and v17/v19 -> v7 register sets so
// loads for the next row overlap the multiplies for the current one.
// NOTE(review): the loop counter/branches, the horizontal reductions, the
// result store and .endm are elided in this excerpt.
570 .macro pixel_var2_8 h
571 function x264_pixel_var2_8x\h\()_neon, export=1
572 ld1 {v16.8b}, [x0], x1
573 ld1 {v18.8b}, [x2], x3
574 ld1 {v17.8b}, [x0], x1
575 ld1 {v19.8b}, [x2], x3
// First two rows: smull/smull2 initialise v2/v3, v0 starts as their sum.
577 usubl v6.8h, v16.8b, v18.8b
578 usubl v7.8h, v17.8b, v19.8b
579 ld1 {v16.8b}, [x0], x1
580 ld1 {v18.8b}, [x2], x3
581 smull v2.4s, v6.4h, v6.4h
582 smull2 v3.4s, v6.8h, v6.8h
583 add v0.8h, v6.8h, v7.8h
584 smlal v2.4s, v7.4h, v7.4h
585 smlal2 v3.4s, v7.8h, v7.8h
587 usubl v6.8h, v16.8b, v18.8b
// Steady state: accumulate the pending row while the following one loads.
590 ld1 {v17.8b}, [x0], x1
591 ld1 {v19.8b}, [x2], x3
592 smlal v2.4s, v6.4h, v6.4h
593 smlal2 v3.4s, v6.8h, v6.8h
594 usubl v7.8h, v17.8b, v19.8b
595 add v0.8h, v0.8h, v6.8h
596 ld1 {v16.8b}, [x0], x1
597 ld1 {v18.8b}, [x2], x3
598 smlal v2.4s, v7.4h, v7.4h
599 smlal2 v3.4s, v7.8h, v7.8h
600 usubl v6.8h, v16.8b, v18.8b
601 add v0.8h, v0.8h, v7.8h
// Final two rows: one last load pair, then drain v6 and v7.
604 ld1 {v17.8b}, [x0], x1
605 ld1 {v19.8b}, [x2], x3
606 smlal v2.4s, v6.4h, v6.4h
607 smlal2 v3.4s, v6.8h, v6.8h
608 usubl v7.8h, v17.8b, v19.8b
609 add v0.8h, v0.8h, v6.8h
610 smlal v2.4s, v7.4h, v7.4h
611 add v0.8h, v0.8h, v7.8h
612 smlal2 v3.4s, v7.8h, v7.8h
// Merge the two squared-difference accumulators.
615 add v2.4s, v2.4s, v3.4s
// x0 = x1 - (x0 >> (6 + (h >> 4))): the shift is 6 for h = 8, 7 for h = 16.
// NOTE(review): the lines that place values in x0 and x1 are elided.
622 sub x0, x1, x0, lsr # 6 + (\h >> 4)
// x264_pixel_satd_4x4_neon: loads four 4-byte rows from x0 (post-increment
// x1) and from x2 (post-increment x3), widens their difference to 16 bits
// and runs three SUMSUB_AB stages with lane shuffles (zip/trn) in between.
// SUMSUB_AB is a macro defined outside this excerpt -- presumably
// "first = third + fourth, second = third - fourth"; confirm in its source.
// NOTE(review): the instructions between the last trn2 and umax (embedded
// lines 656-657) and the final reduction/return are elided in this excerpt.
632 function x264_pixel_satd_4x4_neon, export=1
// Rows 0 and 2 share v0/v1 (lanes s[0], s[1]); rows 1 and 3 share v2/v3.
633 ld1 {v1.s}[0], [x2], x3
634 ld1 {v0.s}[0], [x0], x1
635 ld1 {v3.s}[0], [x2], x3
636 ld1 {v2.s}[0], [x0], x1
637 ld1 {v1.s}[1], [x2], x3
638 ld1 {v0.s}[1], [x0], x1
639 ld1 {v3.s}[1], [x2], x3
640 ld1 {v2.s}[1], [x0], x1
642 usubl v0.8h, v0.8b, v1.8b
643 usubl v1.8h, v2.8b, v3.8b
644 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
// Regroup 64-bit halves so the next stage pairs different rows.
646 zip1 v0.2d, v2.2d, v3.2d
647 zip2 v1.2d, v2.2d, v3.2d
648 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
// Interleave adjacent 16-bit lanes, then adjacent 32-bit lanes.
650 trn1 v0.8h, v2.8h, v3.8h
651 trn2 v1.8h, v2.8h, v3.8h
652 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
654 trn1 v0.4s, v2.4s, v3.4s
655 trn2 v1.4s, v2.4s, v3.4s
// Per-lane unsigned maximum of the two vectors.
658 umax v0.8h, v0.8h, v1.8h
// x264_pixel_satd_4x8_neon: loads eight 4-byte rows from x0 (post-increment
// x1) and from x2 (post-increment x3), then branches to the shared tail
// x264_satd_4x8_8x4_end_neon.
// Register layout on exit: rows 0..3 occupy lane s[0] and rows 4..7 lane
// s[1] of v0/v2/v4/v6 (x0 data) and v1/v3/v5/v7 (x2 data), i.e. each
// register carries eight bytes just like an 8-byte-wide row would.
665 function x264_pixel_satd_4x8_neon, export=1
666 ld1 {v1.s}[0], [x2], x3
667 ld1 {v0.s}[0], [x0], x1
668 ld1 {v3.s}[0], [x2], x3
669 ld1 {v2.s}[0], [x0], x1
670 ld1 {v5.s}[0], [x2], x3
671 ld1 {v4.s}[0], [x0], x1
672 ld1 {v7.s}[0], [x2], x3
673 ld1 {v6.s}[0], [x0], x1
674 ld1 {v1.s}[1], [x2], x3
675 ld1 {v0.s}[1], [x0], x1
676 ld1 {v3.s}[1], [x2], x3
677 ld1 {v2.s}[1], [x0], x1
678 ld1 {v5.s}[1], [x2], x3
679 ld1 {v4.s}[1], [x0], x1
680 ld1 {v7.s}[1], [x2], x3
681 ld1 {v6.s}[1], [x0], x1
// Plain branch (not bl): the tail returns on this function's behalf.
682 b x264_satd_4x8_8x4_end_neon
// x264_pixel_satd_8x4_neon: loads four 8-byte rows from x0 (post-increment
// x1) into v0/v2/v4/v6 and from x2 (post-increment x3) into v1/v3/v5/v7.
// NOTE(review): no branch or return is visible after the loads; presumably
// execution continues into the shared x264_satd_4x8_8x4_end_neon tail that
// follows in the file -- confirm against the full source.
685 function x264_pixel_satd_8x4_neon, export=1
686 ld1 {v1.8b}, [x2], x3
687 ld1 {v0.8b}, [x0], x1
688 ld1 {v3.8b}, [x2], x3
689 ld1 {v2.8b}, [x0], x1
690 ld1 {v5.8b}, [x2], x3
691 ld1 {v4.8b}, [x0], x1
692 ld1 {v7.8b}, [x2], x3
693 ld1 {v6.8b}, [x0], x1
// x264_satd_4x8_8x4_end_neon: shared tail reached with four byte-row pairs
// in v0/v1, v2/v3, v4/v5 and v6/v7. It widens the four differences, applies
// SUMSUB_AB stages separated by trn1/trn2 lane interleaves, and combines the
// results with umax and add.
// SUMSUB_AB is a macro defined outside this excerpt (presumably sum and
// difference of its last two operands -- confirm in its source).
// NOTE(review): the instructions between the last trn2 and the first umax
// (embedded lines 720-723) and the final reduction/return are elided here.
696 function x264_satd_4x8_8x4_end_neon
697 usubl v0.8h, v0.8b, v1.8b
698 usubl v1.8h, v2.8b, v3.8b
699 usubl v2.8h, v4.8b, v5.8b
700 usubl v3.8h, v6.8b, v7.8b
// Two stages across the four row vectors.
702 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
703 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
705 SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
706 SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
// Interleave adjacent 16-bit lanes so the next stage works within rows.
708 trn1 v0.8h, v4.8h, v5.8h
709 trn2 v1.8h, v4.8h, v5.8h
710 trn1 v2.8h, v6.8h, v7.8h
711 trn2 v3.8h, v6.8h, v7.8h
713 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
714 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
// Interleave adjacent 32-bit lanes.
716 trn1 v0.4s, v16.4s, v18.4s
717 trn2 v1.4s, v16.4s, v18.4s
718 trn1 v2.4s, v17.4s, v19.4s
719 trn2 v3.4s, v17.4s, v19.4s
// Per-lane maxima of each pair, then one vector of partial sums in v0.
724 umax v0.8h, v0.8h, v1.8h
725 umax v1.8h, v2.8h, v3.8h
726 add v0.8h, v0.8h, v1.8h
// x264_pixel_satd_8x8_neon: exported wrapper that calls the local helper
// x264_satd_8x8_neon and adds the four vectors it leaves in v0..v3 into v0.
// NOTE(review): the link-register save/restore, the final horizontal
// reduction and the return are elided in this excerpt.
732 function x264_pixel_satd_8x8_neon, export=1
735 bl x264_satd_8x8_neon
736 add v0.8h, v0.8h, v1.8h
737 add v1.8h, v2.8h, v3.8h
738 add v0.8h, v0.8h, v1.8h
// x264_pixel_satd_8x16_neon: exported wrapper that calls x264_satd_8x8_neon
// twice. The v0..v3 results of the first call are summed into v30, those of
// the second into v31, and the two are finally added into v0.
// NOTE(review): the link-register save/restore, the final horizontal
// reduction and the return are elided in this excerpt.
744 function x264_pixel_satd_8x16_neon, export=1
747 bl x264_satd_8x8_neon
748 add v0.8h, v0.8h, v1.8h
749 add v1.8h, v2.8h, v3.8h
750 add v30.8h, v0.8h, v1.8h
752 bl x264_satd_8x8_neon
753 add v0.8h, v0.8h, v1.8h
754 add v1.8h, v2.8h, v3.8h
755 add v31.8h, v0.8h, v1.8h
756 add v0.8h, v30.8h, v31.8h
// SUMSUBL_AB sum, sub, a, b: macro taking two destination and two source
// operands.
// NOTE(review): the body and .endm are elided in this excerpt; from the name
// and its use on 8-byte inputs with 16-bit outputs elsewhere in this file it
// presumably produces the widened sum and difference of a and b -- confirm.
762 .macro SUMSUBL_AB sum, sub, a, b
// load_diff_fly_8x8: loads eight 8-byte rows from x0 (post-increment x1) and
// from x2 (post-increment x3) and leaves the widened row differences of rows
// 2..7 in v18..v23. The first SUMSUB_AB pair is issued before the last two
// subtractions, so on exit v0..v3 already hold the SUMSUB_AB outputs for
// v16/v17 and v18/v19.
// Loads reuse v0..v7 in two passes of four rows, interleaved with the
// subtractions that consume them.
// SUMSUB_AB is a macro defined outside this excerpt.
// NOTE(review): .endm is elided in this excerpt.
767 .macro load_diff_fly_8x8
768 ld1 {v1.8b}, [x2], x3
769 ld1 {v0.8b}, [x0], x1
770 ld1 {v3.8b}, [x2], x3
771 ld1 {v2.8b}, [x0], x1
772 usubl v16.8h, v0.8b, v1.8b
773 ld1 {v5.8b}, [x2], x3
774 ld1 {v4.8b}, [x0], x1
775 usubl v17.8h, v2.8b, v3.8b
776 ld1 {v7.8b}, [x2], x3
777 ld1 {v6.8b}, [x0], x1
778 usubl v18.8h, v4.8b, v5.8b
779 ld1 {v1.8b}, [x2], x3
780 ld1 {v0.8b}, [x0], x1
781 usubl v19.8h, v6.8b, v7.8b
782 ld1 {v3.8b}, [x2], x3
783 ld1 {v2.8b}, [x0], x1
784 usubl v20.8h, v0.8b, v1.8b
785 ld1 {v5.8b}, [x2], x3
786 ld1 {v4.8b}, [x0], x1
787 usubl v21.8h, v2.8b, v3.8b
788 ld1 {v7.8b}, [x2], x3
789 ld1 {v6.8b}, [x0], x1
// v0..v3 are free again here, so they receive the first SUMSUB_AB results.
791 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
792 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
794 usubl v22.8h, v4.8b, v5.8b
795 usubl v23.8h, v6.8b, v7.8b
// SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d: two independent SUMSUB_AB
// invocations in a row -- (s1, d1) from (a, b) and (s2, d2) from (c, d).
// SUMSUB_AB itself is defined outside this excerpt.
798 .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
799 SUMSUB_AB \s1, \d1, \a, \b
800 SUMSUB_AB \s2, \d2, \c, \d
// HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4: two SUMSUB_ABCD stages over
// four vectors, using t1..t4 as scratch; the results end up back in r1..r4.
//   stage 1: (t1, t2) from (r1, r2) and (t3, t4) from (r3, r4)
//   stage 2: (r1, r3) from (t1, t3) and (r2, r4) from (t2, t4)
803 .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
804 SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
805 SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
// x264_satd_8x8_neon: local (non-exported) helper called with bl by the
// exported 8x8 and 8x16 SATD wrappers, which then sum v0..v3.
// NOTE(review): the body is elided in this excerpt.
808 function x264_satd_8x8_neon
812 // one vertical hadamard pass and two horizontal
// Entry state as used by the callers visible in this file: v0..v3 hold the
// first SUMSUB_AB outputs for v16/v17 and v18/v19, and v20..v23 hold four
// further difference vectors. Results are left in v0..v3 for the caller to
// add up.
// SUMSUB_AB and transpose are macros defined outside this excerpt.
// NOTE(review): the instructions between the last transpose and the first
// umax (embedded lines 833-842) and the return are elided in this excerpt.
813 function x264_satd_8x4v_8x8h_neon
// Finish the vertical pass for the first four rows...
814 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
815 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
// ...and run the full four-row vertical pass on v20..v23.
817 HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
// Horizontal stage on 16-bit lane pairs.
819 transpose v0.8h, v1.8h, v16.8h, v17.8h
820 transpose v2.8h, v3.8h, v18.8h, v19.8h
821 transpose v4.8h, v5.8h, v20.8h, v21.8h
822 transpose v6.8h, v7.8h, v22.8h, v23.8h
824 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
825 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
826 SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
827 SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
// Regroup on 32-bit lane pairs for the second horizontal stage.
829 transpose v0.4s, v2.4s, v16.4s, v18.4s
830 transpose v1.4s, v3.4s, v17.4s, v19.4s
831 transpose v4.4s, v6.4s, v20.4s, v22.4s
832 transpose v5.4s, v7.4s, v21.4s, v23.4s
// Per-lane unsigned maxima of each register pair.
843 umax v0.8h, v0.8h, v2.8h
844 umax v1.8h, v1.8h, v3.8h
845 umax v2.8h, v4.8h, v6.8h
846 umax v3.8h, v5.8h, v7.8h
// x264_pixel_satd_16x8_neon: exported wrapper that calls the local helper
// x264_satd_16x4_neon twice and accumulates the v0..v3 results of each call
// into v30 (v0+v1) and v31 (v2+v3), then adds the two into v0.
// NOTE(review): the link-register save/restore, the final horizontal
// reduction and the return are elided in this excerpt.
851 function x264_pixel_satd_16x8_neon, export=1
854 bl x264_satd_16x4_neon
855 add v30.8h, v0.8h, v1.8h
856 add v31.8h, v2.8h, v3.8h
858 bl x264_satd_16x4_neon
859 add v0.8h, v0.8h, v1.8h
860 add v1.8h, v2.8h, v3.8h
861 add v30.8h, v30.8h, v0.8h
862 add v31.8h, v31.8h, v1.8h
864 add v0.8h, v30.8h, v31.8h
// x264_pixel_satd_16x16_neon: exported wrapper that calls the local helper
// x264_satd_16x4_neon four times. After each call the v0..v3 results are
// folded into the running 16-bit totals v30 (v0+v1) and v31 (v2+v3); the two
// totals are added into v0 at the end.
// NOTE(review): the link-register save/restore, the final horizontal
// reduction and the return are elided in this excerpt.
870 function x264_pixel_satd_16x16_neon, export=1
// First call initialises v30 / v31.
873 bl x264_satd_16x4_neon
874 add v30.8h, v0.8h, v1.8h
875 add v31.8h, v2.8h, v3.8h
877 bl x264_satd_16x4_neon
878 add v0.8h, v0.8h, v1.8h
879 add v1.8h, v2.8h, v3.8h
880 add v30.8h, v30.8h, v0.8h
881 add v31.8h, v31.8h, v1.8h
883 bl x264_satd_16x4_neon
884 add v0.8h, v0.8h, v1.8h
885 add v1.8h, v2.8h, v3.8h
886 add v30.8h, v30.8h, v0.8h
887 add v31.8h, v31.8h, v1.8h
889 bl x264_satd_16x4_neon
890 add v0.8h, v0.8h, v1.8h
891 add v1.8h, v2.8h, v3.8h
892 add v30.8h, v30.8h, v0.8h
893 add v31.8h, v31.8h, v1.8h
895 add v0.8h, v30.8h, v31.8h
// x264_satd_16x4_neon: local helper that loads four 16-byte rows from x0
// (post-increment x1) and x2 (post-increment x3) and prepares the register
// state expected by x264_satd_8x4v_8x8h_neon:
//   v16..v19  widened differences of the low 8 bytes of rows 0..3
//   v20..v23  widened differences of the high 8 bytes of rows 0..3
//   v0..v3    first SUMSUB_AB outputs for v16/v17 and v18/v19
// It then branches (b, not bl) to that routine, which therefore returns
// directly to this helper's caller.
901 function x264_satd_16x4_neon
902 ld1 {v1.16b}, [x2], x3
903 ld1 {v0.16b}, [x0], x1
904 ld1 {v3.16b}, [x2], x3
905 ld1 {v2.16b}, [x0], x1
906 usubl v16.8h, v0.8b, v1.8b
907 usubl2 v20.8h, v0.16b, v1.16b
908 ld1 {v5.16b}, [x2], x3
909 ld1 {v4.16b}, [x0], x1
910 usubl v17.8h, v2.8b, v3.8b
911 usubl2 v21.8h, v2.16b, v3.16b
912 ld1 {v7.16b}, [x2], x3
913 ld1 {v6.16b}, [x0], x1
915 usubl v18.8h, v4.8b, v5.8b
916 usubl2 v22.8h, v4.16b, v5.16b
917 usubl v19.8h, v6.8b, v7.8b
918 usubl2 v23.8h, v6.16b, v7.16b
920 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
921 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
923 b x264_satd_8x4v_8x8h_neon
// x264_pixel_satd_4x16_neon: loads sixteen 4-byte rows from x0
// (post-increment x1) and x2 (post-increment x3) in two batches of eight,
// packing two rows per 64-bit register half (lanes s[0] and s[1]). It builds
// the register state expected by x264_satd_8x4v_8x8h_neon, calls it, and
// sums the v0..v3 results into v0.
// NOTE(review): the link-register save/restore, the final horizontal
// reduction and the return are elided in this excerpt.
926 function x264_pixel_satd_4x16_neon, export=1
// Batch 1: rows 0..3 in lane s[0], rows 4..7 in lane s[1].
928 ld1 {v1.s}[0], [x2], x3
929 ld1 {v0.s}[0], [x0], x1
930 ld1 {v3.s}[0], [x2], x3
931 ld1 {v2.s}[0], [x0], x1
932 ld1 {v5.s}[0], [x2], x3
933 ld1 {v4.s}[0], [x0], x1
934 ld1 {v7.s}[0], [x2], x3
935 ld1 {v6.s}[0], [x0], x1
936 ld1 {v1.s}[1], [x2], x3
937 ld1 {v0.s}[1], [x0], x1
938 ld1 {v3.s}[1], [x2], x3
939 ld1 {v2.s}[1], [x0], x1
940 ld1 {v5.s}[1], [x2], x3
941 ld1 {v4.s}[1], [x0], x1
942 ld1 {v7.s}[1], [x2], x3
943 ld1 {v6.s}[1], [x0], x1
944 usubl v16.8h, v0.8b, v1.8b
945 usubl v17.8h, v2.8b, v3.8b
946 usubl v18.8h, v4.8b, v5.8b
947 usubl v19.8h, v6.8b, v7.8b
// Batch 2: rows 8..11 in lane s[0], rows 12..15 in lane s[1].
948 ld1 {v1.s}[0], [x2], x3
949 ld1 {v0.s}[0], [x0], x1
950 ld1 {v3.s}[0], [x2], x3
951 ld1 {v2.s}[0], [x0], x1
952 ld1 {v5.s}[0], [x2], x3
953 ld1 {v4.s}[0], [x0], x1
954 ld1 {v7.s}[0], [x2], x3
955 ld1 {v6.s}[0], [x0], x1
956 ld1 {v1.s}[1], [x2], x3
957 ld1 {v0.s}[1], [x0], x1
958 ld1 {v3.s}[1], [x2], x3
959 ld1 {v2.s}[1], [x0], x1
960 ld1 {v5.s}[1], [x2], x3
961 ld1 {v4.s}[1], [x0], x1
962 ld1 {v7.s}[1], [x2], x3
963 ld1 {v6.s}[1], [x0], x1
964 usubl v20.8h, v0.8b, v1.8b
965 usubl v21.8h, v2.8b, v3.8b
966 usubl v22.8h, v4.8b, v5.8b
967 usubl v23.8h, v6.8b, v7.8b
// First SUMSUB_AB pair, matching what the shared routine expects in v0..v3.
969 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
970 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
972 bl x264_satd_8x4v_8x8h_neon
974 add v30.8h, v0.8h, v1.8h
975 add v31.8h, v2.8h, v3.8h
976 add v0.8h, v30.8h, v31.8h
// x264_pixel_sa8d_8x8_neon: exported wrapper that calls the local helper
// pixel_sa8d_8x8_neon and adds its two result vectors v0 and v1.
// NOTE(review): the link-register save/restore, the horizontal reduction,
// any final scaling and the return are elided in this excerpt.
982 function x264_pixel_sa8d_8x8_neon, export=1
984 bl pixel_sa8d_8x8_neon
985 add v0.8h, v0.8h, v1.8h
// x264_pixel_sa8d_16x16_neon: exported wrapper that calls the local helper
// pixel_sa8d_8x8_neon four times and accumulates its 16-bit result vectors
// v0 / v1 pairwise into the 32-bit totals v30 / v31 (uadalp), finally adding
// the two totals into v0.
// NOTE(review): the link-register save/restore, the accumulation after the
// first call and part of the second (embedded lines 996-997, 999), the
// column adjustment after the pointer rewind (1003-1004), the final
// reduction and the return are elided in this excerpt.
993 function x264_pixel_sa8d_16x16_neon, export=1
995 bl pixel_sa8d_8x8_neon
998 bl pixel_sa8d_8x8_neon
1000 uadalp v31.4s, v1.8h
// Step both pointers back by 16 rows (stride << 4).
1001 sub x0, x0, x1, lsl #4
1002 sub x2, x2, x3, lsl #4
1005 bl pixel_sa8d_8x8_neon
1006 uadalp v30.4s, v0.8h
1007 uadalp v31.4s, v1.8h
1008 bl pixel_sa8d_8x8_neon
1009 uadalp v30.4s, v0.8h
1010 uadalp v31.4s, v1.8h
1011 add v0.4s, v30.4s, v31.4s
// sa8d_satd_8x8 satd=: emits a local function whose name is pixel_sa8d_,
// followed by the satd argument, followed by 8x8_neon. With the default
// empty argument that is pixel_sa8d_8x8_neon; the callers in this file also
// use pixel_sa8d_satd_8x8_neon.
// Outputs, as consumed by the callers visible in this file:
//   v0, v1    sa8d partial sums (16-bit lanes)
//   v26, v27  satd partial sums (16-bit lanes)
// SUMSUB_AB and transpose are macros defined outside this excerpt.
// NOTE(review): many lines are elided in this excerpt, including the input
// load (embedded 1021-1022), the gaps at 1037, 1042-1051, 1059-1060 and
// 1090-1099, and .endm. Presumably some of those gaps hold assembler
// conditionals keyed on the satd argument -- confirm against the full file.
1019 .macro sa8d_satd_8x8 satd=
1020 function pixel_sa8d_\satd\()8x8_neon
// Vertical pass over the eight difference rows (v16..v19 via v0..v3, and
// v20..v23 through HADAMARD4_V).
1023 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
1024 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
1026 HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
// Section producing v26 / v27: horizontal stages on copies in v0..v7 and
// v24..v27, leaving v16..v23 untouched for the section that produces v0 / v1.
1028 transpose v0.8h, v1.8h, v16.8h, v17.8h
1029 transpose v2.8h, v3.8h, v18.8h, v19.8h
1030 transpose v4.8h, v5.8h, v20.8h, v21.8h
1031 transpose v6.8h, v7.8h, v22.8h, v23.8h
1033 SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
1034 SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
1035 SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
1036 SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
1038 transpose v4.4s, v6.4s, v24.4s, v26.4s
1039 transpose v5.4s, v7.4s, v25.4s, v27.4s
1040 transpose v24.4s, v26.4s, v0.4s, v2.4s
1041 transpose v25.4s, v27.4s, v1.4s, v3.4s
1052 umax v0.8h, v0.8h, v2.8h
1053 umax v1.8h, v1.8h, v3.8h
1054 umax v2.8h, v4.8h, v6.8h
1055 umax v3.8h, v5.8h, v7.8h
1057 add v26.8h, v0.8h, v1.8h
1058 add v27.8h, v2.8h, v3.8h
// Section producing v0 / v1: one more vertical stage joining the upper and
// lower four rows; the differences are written back into v16..v19.
1061 SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
1062 SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
1063 SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
1064 SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
// Three horizontal stages, regrouping on 16-bit, 32-bit and 64-bit lanes.
1066 transpose v20.8h, v21.8h, v16.8h, v17.8h
1067 transpose v4.8h, v5.8h, v0.8h, v1.8h
1068 transpose v22.8h, v23.8h, v18.8h, v19.8h
1069 transpose v6.8h, v7.8h, v2.8h, v3.8h
1071 SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
1072 SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
1073 SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
1074 SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
1076 transpose v20.4s, v22.4s, v2.4s, v0.4s
1077 transpose v21.4s, v23.4s, v3.4s, v1.4s
1078 transpose v16.4s, v18.4s, v24.4s, v4.4s
1079 transpose v17.4s, v19.4s, v25.4s, v5.4s
1081 SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
1082 SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
1083 SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
1084 SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
1086 transpose v16.2d, v20.2d, v0.2d, v4.2d
1087 transpose v17.2d, v21.2d, v1.2d, v5.2d
1088 transpose v18.2d, v22.2d, v2.2d, v6.2d
1089 transpose v19.2d, v23.2d, v3.2d, v7.2d
// Per-lane maxima of each pair, then two partial-sum vectors in v0 / v1.
1100 umax v16.8h, v16.8h, v20.8h
1101 umax v17.8h, v17.8h, v21.8h
1102 umax v18.8h, v18.8h, v22.8h
1103 umax v19.8h, v19.8h, v23.8h
1105 add v0.8h, v16.8h, v17.8h
1106 add v1.8h, v18.8h, v19.8h
// x264_pixel_sa8d_satd_16x16_neon: exported wrapper that calls
// pixel_sa8d_satd_8x8_neon four times and returns two results packed into x0.
//   v30, v31  32-bit totals of the helper's v0 / v1 outputs (sa8d)
//   v28, v29  32-bit totals of the helper's v26 / v27 outputs (satd)
// NOTE(review): the link-register save/restore, the column adjustment after
// the pointer rewind (embedded 1129-1130), the horizontal reductions and
// moves to x0/x1 (1143-1144, 1146-1147) and the return are elided in this
// excerpt.
1115 function x264_pixel_sa8d_satd_16x16_neon, export=1
// First call initialises the totals (uaddlp); later calls accumulate (uadalp).
1117 bl pixel_sa8d_satd_8x8_neon
1118 uaddlp v30.4s, v0.8h
1119 uaddlp v31.4s, v1.8h
1120 uaddlp v28.4s, v26.8h
1121 uaddlp v29.4s, v27.8h
1122 bl pixel_sa8d_satd_8x8_neon
1123 uadalp v30.4s, v0.8h
1124 uadalp v31.4s, v1.8h
1125 uadalp v28.4s, v26.8h
1126 uadalp v29.4s, v27.8h
// Step both pointers back by 16 rows (stride << 4).
1127 sub x0, x0, x1, lsl #4
1128 sub x2, x2, x3, lsl #4
1131 bl pixel_sa8d_satd_8x8_neon
1132 uadalp v30.4s, v0.8h
1133 uadalp v31.4s, v1.8h
1134 uadalp v28.4s, v26.8h
1135 uadalp v29.4s, v27.8h
1136 bl pixel_sa8d_satd_8x8_neon
1137 uadalp v30.4s, v0.8h
1138 uadalp v31.4s, v1.8h
1139 uadalp v28.4s, v26.8h
1140 uadalp v29.4s, v27.8h
1141 add v0.4s, v30.4s, v31.4s // sa8d
1142 add v1.4s, v28.4s, v29.4s // satd
// Rounding right shift by one on the sa8d lanes.
1145 urshr v0.4s, v0.4s, #1
// Pack: x1 is added into bits 63:32 on top of the value in x0.
1148 add x0, x0, x1, lsl #32
// HADAMARD_AC w h: emits the exported function
// x264_pixel_hadamard_ac_<w>x<h>_neon. It loads the two lane masks from
// mask_ac_4_8 into v30 / v31, calls the local helper
// x264_hadamard_ac_8x8_neon up to four times, moving x0 between calls, and
// packs two results into x0.
// NOTE(review): the link-register save/restore, accumulator clearing, the
// assembler conditionals that pick how many helper calls are emitted for a
// given w/h, the column adjustment after the first rewind, the reductions
// into x0/x1, the return and .endm are elided in this excerpt.
1152 .macro HADAMARD_AC w h
1153 function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
1154 movrel x5, mask_ac_4_8
1156 ld1 {v30.8h,v31.8h}, [x5]
1160 bl x264_hadamard_ac_8x8_neon
1162 bl x264_hadamard_ac_8x8_neon
// Step x0 back by 8 rows (stride << 3).
1165 sub x0, x0, x1, lsl #3
1167 bl x264_hadamard_ac_8x8_neon
// Step x0 back by 16 rows (stride << 4).
1170 sub x0, x0, x1, lsl #4
1171 bl x264_hadamard_ac_8x8_neon
// Pack: x1 goes into bits 63:32 of x0.
1180 orr x0, x0, x1, lsl #32
1190 // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
// x264_hadamard_ac_8x8_neon: local helper operating on one 8x8 block read
// from x0 (post-increment x1) only -- there is no second source pointer.
// It adds this block's contribution to the 32-bit accumulators v28 and v29
// (uadalp), after ANDing one vector with the v30 mask and another with the
// v31 mask.
// SUMSUBL_AB, SUMSUB_AB, SUMSUB_ABCD and transpose are macros; SUMSUB_AB and
// transpose are defined outside this excerpt.
// NOTE(review): several groups of lines are elided in this excerpt
// (embedded 1226-1235, 1254-1261, 1274-1275 and the return); in particular
// the lines that refill v0..v7 before the first add group are not visible.
1191 function x264_hadamard_ac_8x8_neon
// Load eight rows, pairing neighbours through SUMSUBL_AB as they arrive.
1192 ld1 {v16.8b}, [x0], x1
1193 ld1 {v17.8b}, [x0], x1
1194 ld1 {v18.8b}, [x0], x1
1195 ld1 {v19.8b}, [x0], x1
1196 SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b
1197 ld1 {v20.8b}, [x0], x1
1198 ld1 {v21.8b}, [x0], x1
1199 SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b
1200 ld1 {v22.8b}, [x0], x1
1201 ld1 {v23.8b}, [x0], x1
1202 SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b
1203 SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b
// Second vertical stage within each group of four rows.
1205 SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
1206 SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// Horizontal stage on 16-bit lane pairs.
1208 transpose v0.8h, v1.8h, v16.8h, v17.8h
1209 transpose v2.8h, v3.8h, v18.8h, v19.8h
1210 transpose v4.8h, v5.8h, v20.8h, v21.8h
1211 transpose v6.8h, v7.8h, v22.8h, v23.8h
1213 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
1214 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
1215 SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
1216 SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
// Horizontal stage on 32-bit lane pairs.
1218 transpose v0.4s, v2.4s, v16.4s, v18.4s
1219 transpose v1.4s, v3.4s, v17.4s, v19.4s
1220 transpose v4.4s, v6.4s, v20.4s, v22.4s
1221 transpose v5.4s, v7.4s, v21.4s, v23.4s
1223 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
1224 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
1225 SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// Sum v0..v7 into v0 / v1; the v30 mask is applied to v0 after its first
// add, clearing the lanes where the mask is zero. Both are then accumulated
// into v28.
1236 add v0.8h, v0.8h, v4.8h
1237 add v1.8h, v1.8h, v5.8h
1238 and v0.16b, v0.16b, v30.16b
1239 add v2.8h, v2.8h, v6.8h
1240 add v3.8h, v3.8h, v7.8h
1241 add v0.8h, v0.8h, v2.8h
1242 add v1.8h, v1.8h, v3.8h
1243 uadalp v28.4s, v0.8h
1244 uadalp v28.4s, v1.8h
// Further stage joining the upper and lower four rows (v16..v19 with
// v20..v23), followed by a 64-bit regroup.
1246 SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
1247 SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
1248 SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
1249 SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
1251 transpose v16.2d, v17.2d, v6.2d, v7.2d
1252 transpose v18.2d, v19.2d, v4.2d, v5.2d
1253 transpose v20.2d, v21.2d, v2.2d, v3.2d
1262 transpose v7.2d, v6.2d, v1.2d, v0.2d
1264 umax v3.8h, v16.8h, v17.8h
1265 umax v2.8h, v18.8h, v19.8h
1266 umax v1.8h, v20.8h, v21.8h
1268 SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
// Combine the maxima, double them, apply the v31 mask to v4, then add v5
// and v4 and accumulate the result into v29.
1270 add v2.8h, v2.8h, v3.8h
1271 add v2.8h, v2.8h, v1.8h
1272 and v4.16b, v4.16b, v31.16b
1273 add v2.8h, v2.8h, v2.8h
1276 add v2.8h, v2.8h, v5.8h
1277 add v2.8h, v2.8h, v4.8h
1278 uadalp v29.4s, v2.8h
// x264_pixel_ssim_4x4x2_core_neon: reads four 8-byte rows from x0
// (post-increment x1) and x2 (post-increment x3) and accumulates, per lane:
//   v0   sum of the x0 bytes                      (16-bit lanes)
//   v1   sum of the x2 bytes                      (16-bit lanes)
//   v16  sum of squares of both inputs, a*a + b*b (32-bit lanes)
//   v17  sum of cross products a*b                (32-bit lanes)
// The four results are pair-reduced and written interleaved to [x4] by st4.
// NOTE(review): the lines between the last uadalp and the first addp
// (embedded 1329-1332, where v0/v1 would need widening to match the .4s
// addp) and the return are elided in this excerpt.
1283 function x264_pixel_ssim_4x4x2_core_neon, export=1
// Row 0 (v0 / v2).
1284 ld1 {v0.8b}, [x0], x1
1285 ld1 {v2.8b}, [x2], x3
1286 umull v16.8h, v0.8b, v0.8b
1287 umull v17.8h, v0.8b, v2.8b
1288 umull v18.8h, v2.8b, v2.8b
// Row 1 (v28 / v29).
1290 ld1 {v28.8b}, [x0], x1
1291 ld1 {v29.8b}, [x2], x3
1292 umull v20.8h, v28.8b, v28.8b
1293 umull v21.8h, v28.8b, v29.8b
1294 umull v22.8h, v29.8b, v29.8b
// Widen row 0's products into the 32-bit accumulators and start the sums.
1296 uaddlp v16.4s, v16.8h
1297 uaddlp v17.4s, v17.8h
1298 uaddl v0.8h, v0.8b, v28.8b
1299 uadalp v16.4s, v18.8h
1300 uaddl v1.8h, v2.8b, v29.8b
// Row 2 (v26 / v27).
1302 ld1 {v26.8b}, [x0], x1
1303 ld1 {v27.8b}, [x2], x3
1304 umull v23.8h, v26.8b, v26.8b
1305 umull v24.8h, v26.8b, v27.8b
1306 umull v25.8h, v27.8b, v27.8b
// Accumulate row 1's products, add row 2 to the byte sums.
1308 uadalp v16.4s, v20.8h
1309 uaddw v0.8h, v0.8h, v26.8b
1310 uadalp v17.4s, v21.8h
1311 uaddw v1.8h, v1.8h, v27.8b
1312 uadalp v16.4s, v22.8h
// Row 3 (v28 / v29 reused).
1314 ld1 {v28.8b}, [x0], x1
1315 ld1 {v29.8b}, [x2], x3
1316 umull v20.8h, v28.8b, v28.8b
1317 umull v21.8h, v28.8b, v29.8b
1318 umull v22.8h, v29.8b, v29.8b
// Accumulate row 2's products, add row 3 to the byte sums.
1320 uadalp v16.4s, v23.8h
1321 uaddw v0.8h, v0.8h, v28.8b
1322 uadalp v17.4s, v24.8h
1323 uaddw v1.8h, v1.8h, v29.8b
1324 uadalp v16.4s, v25.8h
// Accumulate row 3's products.
1326 uadalp v16.4s, v20.8h
1327 uadalp v17.4s, v21.8h
1328 uadalp v16.4s, v22.8h
// Pairwise-add adjacent 32-bit lanes, then store the low two lanes of each
// of the four vectors interleaved.
1333 addp v0.4s, v0.4s, v0.4s
1334 addp v1.4s, v1.4s, v1.4s
1335 addp v2.4s, v16.4s, v16.4s
1336 addp v3.4s, v17.4s, v17.4s
1338 st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
// x264_pixel_ssim_end4_neon: reads five 16-byte records from each of x0 and
// x1, adds corresponding records and then neighbouring sums, transposes the
// four results into per-statistic vectors, and evaluates a ratio of two
// products in floating point before masking and pairwise-adding the lanes.
// NOTE(review): many lines are elided in this excerpt: the setup of x5 and
// of the low half of w4 (embedded 1343, 1346), the broadcast of the
// constants into v30 / v31 (1363-1366), the integer-to-float conversions
// (1386-1391), the branch and mask-address setup around 1396-1400, and the
// final reduction/return. Comments below describe visible lines only.
1342 function x264_pixel_ssim_end4_neon, export=1
1344 ld1 {v16.4s,v17.4s}, [x0], #32
1345 ld1 {v18.4s,v19.4s}, [x1], #32
// x2 = x5 - zero-extended w2, setting the condition flags.
1347 subs x2, x5, w2, uxtw
1348 mov w3, #416 // ssim_c1 = .01*.01*255*255*64
1349 movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63
// Records 0..4 of x0 and x1 are added pairwise; v0..v3 then each hold the
// sum of two neighbouring combined records (0+1, 1+2, 2+3, 3+4).
1350 add v0.4s, v16.4s, v18.4s
1351 add v1.4s, v17.4s, v19.4s
1352 add v0.4s, v0.4s, v1.4s
1353 ld1 {v20.4s,v21.4s}, [x0], #32
1354 ld1 {v22.4s,v23.4s}, [x1], #32
1355 add v2.4s, v20.4s, v22.4s
1356 add v3.4s, v21.4s, v23.4s
1357 add v1.4s, v1.4s, v2.4s
1358 ld1 {v16.4s}, [x0], #16
1359 ld1 {v18.4s}, [x1], #16
1360 add v16.4s, v16.4s, v18.4s
1361 add v2.4s, v2.4s, v3.4s
1362 add v3.4s, v3.4s, v16.4s
// 4x4 transpose: afterwards each of v0..v3 holds one field of all four sums.
1367 transpose v4.4s, v5.4s, v0.4s, v1.4s
1368 transpose v6.4s, v7.4s, v2.4s, v3.4s
1369 transpose v0.2d, v2.2d, v4.2d, v6.2d
1370 transpose v1.2d, v3.2d, v5.2d, v7.2d
1372 mul v16.4s, v0.4s, v1.4s // s1*s2
1373 mul v0.4s, v0.4s, v0.4s
1374 mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2
// Scale v3 by 128 and v2 by 64; v1 = 2 * s1*s2.
1376 shl v3.4s, v3.4s, #7
1377 shl v2.4s, v2.4s, #6
1378 add v1.4s, v16.4s, v16.4s
1380 sub v2.4s, v2.4s, v0.4s // vars
1381 sub v3.4s, v3.4s, v1.4s // covar*2
// Add the v30 constant to v0 and v1, and the v31 constant to v2 and v3
// (presumably ssim_c1 / ssim_c2 broadcast from w3 / w4 -- the broadcast
// lines are elided).
1382 add v0.4s, v0.4s, v30.4s
1383 add v2.4s, v2.4s, v31.4s
1384 add v1.4s, v1.4s, v30.4s
1385 add v3.4s, v3.4s, v31.4s
// Floating-point ratio: (v1 * v3) / (v0 * v2), per lane.
1392 fmul v0.4s, v0.4s, v2.4s
1393 fmul v1.4s, v1.4s, v3.4s
1395 fdiv v0.4s, v1.4s, v0.4s
// x3 += x2 * 4, then AND the result lanes with the mask held in v29.
1399 add x3, x3, x2, lsl #2
1401 and v0.16b, v0.16b, v29.16b
// Pairwise floating-point add of adjacent lanes.
1403 faddp v0.4s, v0.4s, v0.4s