1 /*****************************************************************************
2 * pixel.S: aarch64 pixel metrics
3 *****************************************************************************
4 * Copyright (C) 2009-2014 x264 project
6 * Authors: David Conrad <lessen42@gmail.com>
7 * Janne Grunau <janne-x264@jannau.net>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
// Two rows of eight 16-bit lanes used as AND masks: a lane of 0 clears the
// matching coefficient, a lane of -1 (all bits set) keeps it.
// The first row clears lanes 0 and 4, the second row clears lane 0 only.
// NOTE(review): the label of this table is not visible in this excerpt;
// presumably it is mask_ac_4_8, which the hadamard_ac code loads into
// v30/v31 -- confirm.
39 .short 0, -1, -1, -1, 0, -1, -1, -1
40 .short 0, -1, -1, -1, -1, -1, -1, -1
// NOTE(review): the .macro/.endm lines that delimit the fragments below are
// not visible in this excerpt, so the macro names are not stated here.
// Every fragment with visible loads reads rows from [x2] (post-incremented by
// x3) and [x0] (post-incremented by x1) and keeps 16-bit absolute differences
// in v16 (and v17).
//
// 4 bytes per row, first row pair: two rows are packed into lanes s[0]/s[1]
// of one 8-byte vector per source; uabdl overwrites v16.8h with |v0 - v1|.
44 ld1 {v1.s}[0], [x2], x3
45 ld1 {v0.s}[0], [x0], x1
46 ld1 {v1.s}[1], [x2], x3
47 ld1 {v0.s}[1], [x0], x1
48 uabdl v16.8h, v0.8b, v1.8b
// 4 bytes per row, following row pairs: same loads, but uabal adds the
// absolute differences onto the existing contents of v16.8h.
52 ld1 {v1.s}[0], [x2], x3
53 ld1 {v0.s}[0], [x0], x1
54 ld1 {v1.s}[1], [x2], x3
55 ld1 {v0.s}[1], [x0], x1
56 uabal v16.8h, v0.8b, v1.8b
// 8 bytes per row, first row pair: v16 = |v0 - v1|, v17 = |v2 - v3|.
// NOTE(review): the loads that fill v0..v3 are not visible in this excerpt.
64 uabdl v16.8h, v0.8b, v1.8b
65 uabdl v17.8h, v2.8b, v3.8b
// 8 bytes per row, following row pairs: accumulate into v16 and v17.
73 uabal v16.8h, v0.8b, v1.8b
74 uabal v17.8h, v2.8b, v3.8b
// 16 bytes per row, first row pair: the low eight bytes of both rows go to
// v16 (uabdl, then uabal), the high eight bytes to v17 (uabdl2, then uabal2).
78 ld1 {v1.16b}, [x2], x3
79 ld1 {v0.16b}, [x0], x1
80 ld1 {v3.16b}, [x2], x3
81 ld1 {v2.16b}, [x0], x1
82 uabdl v16.8h, v0.8b, v1.8b
83 uabdl2 v17.8h, v0.16b, v1.16b
84 uabal v16.8h, v2.8b, v3.8b
85 uabal2 v17.8h, v2.16b, v3.16b
// 16 bytes per row, following row pairs: all four steps accumulate.
89 ld1 {v1.16b}, [x2], x3
90 ld1 {v0.16b}, [x0], x1
91 ld1 {v3.16b}, [x2], x3
92 ld1 {v2.16b}, [x0], x1
93 uabal v16.8h, v0.8b, v1.8b
94 uabal2 v17.8h, v0.16b, v1.16b
95 uabal v16.8h, v2.8b, v3.8b
96 uabal2 v17.8h, v2.16b, v3.16b
// SAD_FUNC w, h, name: emits the exported function
// x264_pixel_sad<name>_<w>x<h>_neon.
// NOTE(review): most of the macro body (row processing, final reduction and
// return) is not visible in this excerpt.
99 .macro SAD_FUNC w, h, name
100 function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
// Fold the second partial accumulator v17 into v16.
107 add v16.8h, v16.8h, v17.8h
// SAD_X_4 x, first=uabal: one step over two 4-byte rows for several blocks.
// Two rows from [x0] (stride x7) are packed into v0; two rows from each of
// [x1], [x2], [x3] and [x4] (stride x5) are packed into v1..v4. \first is
// then applied against v0, with results in v16, v17, v18 and v19.
// \first defaults to uabal (accumulate); a caller may pass another
// instruction with the same three-operand shape.
// NOTE(review): parameter x is not referenced in the visible lines, and a
// line is missing before the [x4] step; presumably that step is conditional
// on \x -- confirm.
123 .macro SAD_X_4 x, first=uabal
124 ld1 {v0.s}[0], [x0], x7
125 ld1 {v1.s}[0], [x1], x5
126 ld1 {v0.s}[1], [x0], x7
127 ld1 {v1.s}[1], [x1], x5
128 \first v16.8h, v1.8b, v0.8b
129 ld1 {v2.s}[0], [x2], x5
130 ld1 {v2.s}[1], [x2], x5
131 \first v17.8h, v2.8b, v0.8b
132 ld1 {v3.s}[0], [x3], x5
133 ld1 {v3.s}[1], [x3], x5
134 \first v18.8h, v3.8b, v0.8b
136 ld1 {v4.s}[0], [x4], x5
137 ld1 {v4.s}[1], [x4], x5
138 \first v19.8h, v4.8b, v0.8b
// SAD_X_8 x, first=uabal: one step over two 8-byte rows for several blocks.
// The two rows from [x0] (stride x7) land in v0 and v5. For each of [x1],
// [x2], [x3] (stride x5) the first row is combined with v0 through \first and
// the second row is accumulated against v5 with uabal; results are kept in
// v16, v17 and v18. The [x4] rows are handled the same way into v19.
// Loads are interleaved with arithmetic, so v1..v3 are reused for the second
// row only after their first-row value has been consumed.
// NOTE(review): parameter x is not referenced in the visible lines, and a
// line is missing before the [x4] step; presumably that step is conditional
// on \x -- confirm.
142 .macro SAD_X_8 x, first=uabal
143 ld1 {v0.8b}, [x0], x7
144 ld1 {v1.8b}, [x1], x5
145 \first v16.8h, v1.8b, v0.8b
146 ld1 {v2.8b}, [x2], x5
147 ld1 {v5.8b}, [x0], x7
148 \first v17.8h, v2.8b, v0.8b
149 ld1 {v3.8b}, [x3], x5
150 ld1 {v1.8b}, [x1], x5
151 \first v18.8h, v3.8b, v0.8b
152 uabal v16.8h, v1.8b, v5.8b
153 ld1 {v2.8b}, [x2], x5
154 ld1 {v3.8b}, [x3], x5
155 uabal v17.8h, v2.8b, v5.8b
156 uabal v18.8h, v3.8b, v5.8b
158 ld1 {v4.8b}, [x4], x5
159 \first v19.8h, v4.8b, v0.8b
160 ld1 {v4.8b}, [x4], x5
161 uabal v19.8h, v4.8b, v5.8b
// SAD_X_16 x, first=uabal: one step over two 16-byte rows for several blocks.
// The two rows from [x0] (stride x7) land in v0 and v5. For each of [x1],
// [x2], [x3] and [x4] (stride x5), the low eight bytes are accumulated in
// v16..v19 and the high eight bytes in v20..v23. The first row goes through
// \first and \first\()2 (the "2" suffix selects the upper half), the second
// row always through uabal/uabal2.
// NOTE(review): parameter x is not referenced in the visible lines, and a
// line is missing before the [x4] step; presumably that step is conditional
// on \x -- confirm.
165 .macro SAD_X_16 x, first=uabal
166 ld1 {v0.16b}, [x0], x7
167 ld1 {v1.16b}, [x1], x5
168 \first v16.8h, v1.8b, v0.8b
169 \first\()2 v20.8h, v1.16b, v0.16b
170 ld1 {v2.16b}, [x2], x5
171 ld1 {v5.16b}, [x0], x7
172 \first v17.8h, v2.8b, v0.8b
173 \first\()2 v21.8h, v2.16b, v0.16b
174 ld1 {v3.16b}, [x3], x5
175 ld1 {v1.16b}, [x1], x5
176 \first v18.8h, v3.8b, v0.8b
177 \first\()2 v22.8h, v3.16b, v0.16b
178 uabal v16.8h, v1.8b, v5.8b
179 uabal2 v20.8h, v1.16b, v5.16b
180 ld1 {v2.16b}, [x2], x5
181 ld1 {v3.16b}, [x3], x5
182 uabal v17.8h, v2.8b, v5.8b
183 uabal2 v21.8h, v2.16b, v5.16b
184 uabal v18.8h, v3.8b, v5.8b
185 uabal2 v22.8h, v3.16b, v5.16b
187 ld1 {v4.16b}, [x4], x5
188 \first v19.8h, v4.8b, v0.8b
189 \first\()2 v23.8h, v4.16b, v0.16b
190 ld1 {v4.16b}, [x4], x5
191 uabal v19.8h, v4.8b, v5.8b
192 uabal2 v23.8h, v4.16b, v5.16b
// SAD_X_FUNC x, w, h: emits the exported function
// x264_pixel_sad_x<x>_<w>x<h>_neon.
// NOTE(review): argument setup, the row steps, the final reduction and the
// stores are not visible in this excerpt.
196 .macro SAD_X_FUNC x, w, h
197 function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
// Fold the high-half accumulators v20..v23 into v16..v19.
// NOTE(review): a line is missing before the v19 step; presumably it is
// conditional on \x -- confirm.
211 add v16.8h, v16.8h, v20.8h
212 add v17.8h, v17.8h, v21.8h
213 add v18.8h, v18.8h, v22.8h
215 add v19.8h, v19.8h, v23.8h
// NOTE(review): the .macro/.endm lines that delimit the fragments below are
// not visible in this excerpt, so the macro names are not stated here.
// Every fragment reads rows from [x0] (stride x1) and [x2] (stride x3), widens
// the byte difference to signed 16 bit with usubl, and accumulates the
// squared differences as 32-bit lanes. The next row pair is loaded while the
// current one is being squared.
//
// 4 bytes per row, start: smull overwrites v0.4s with the squares of the low
// four difference lanes; only those four lanes are used.
252 ld1 {v16.s}[0], [x0], x1
253 ld1 {v17.s}[0], [x2], x3
254 usubl v2.8h, v16.8b, v17.8b
255 ld1 {v16.s}[0], [x0], x1
256 ld1 {v17.s}[0], [x2], x3
257 smull v0.4s, v2.4h, v2.4h
// 4 bytes per row, middle: consume the rows loaded earlier, load the next.
261 usubl v2.8h, v16.8b, v17.8b
262 ld1 {v16.s}[0], [x0], x1
263 ld1 {v17.s}[0], [x2], x3
264 smlal v0.4s, v2.4h, v2.4h
// 4 bytes per row, end: consume the last loaded rows without loading more.
268 usubl v2.8h, v16.8b, v17.8b
269 smlal v0.4s, v2.4h, v2.4h
// 8 bytes per row, start: low and high halves of the difference are both
// accumulated into the single accumulator v0 (smull, then smlal2).
273 ld1 {v16.8b}, [x0], x1
274 ld1 {v17.8b}, [x2], x3
275 usubl v2.8h, v16.8b, v17.8b
276 ld1 {v16.8b}, [x0], x1
277 smull v0.4s, v2.4h, v2.4h
278 ld1 {v17.8b}, [x2], x3
279 smlal2 v0.4s, v2.8h, v2.8h
// 8 bytes per row, middle.
283 usubl v2.8h, v16.8b, v17.8b
284 ld1 {v16.8b}, [x0], x1
285 smlal v0.4s, v2.4h, v2.4h
286 ld1 {v17.8b}, [x2], x3
287 smlal2 v0.4s, v2.8h, v2.8h
// 8 bytes per row, end.
291 usubl v2.8h, v16.8b, v17.8b
292 smlal v0.4s, v2.4h, v2.4h
293 smlal2 v0.4s, v2.8h, v2.8h
// 16 bytes per row, start: differences of the low/high eight bytes go to
// v2/v3; two accumulators are used, v0 for the low four lanes of v2 and v3,
// v1 for their high four lanes.
297 ld1 {v16.16b}, [x0], x1
298 ld1 {v17.16b}, [x2], x3
299 usubl v2.8h, v16.8b, v17.8b
300 usubl2 v3.8h, v16.16b, v17.16b
301 ld1 {v16.16b}, [x0], x1
302 smull v0.4s, v2.4h, v2.4h
303 smull2 v1.4s, v2.8h, v2.8h
304 ld1 {v17.16b}, [x2], x3
305 smlal v0.4s, v3.4h, v3.4h
306 smlal2 v1.4s, v3.8h, v3.8h
// 16 bytes per row, middle.
310 usubl v2.8h, v16.8b, v17.8b
311 usubl2 v3.8h, v16.16b, v17.16b
312 ld1 {v16.16b}, [x0], x1
313 smlal v0.4s, v2.4h, v2.4h
314 smlal2 v1.4s, v2.8h, v2.8h
315 ld1 {v17.16b}, [x2], x3
316 smlal v0.4s, v3.4h, v3.4h
317 smlal2 v1.4s, v3.8h, v3.8h
// 16 bytes per row, end: finish the last row, then merge v1 into v0.
321 usubl v2.8h, v16.8b, v17.8b
322 usubl2 v3.8h, v16.16b, v17.16b
323 smlal v0.4s, v2.4h, v2.4h
324 smlal2 v1.4s, v2.8h, v2.8h
325 smlal v0.4s, v3.4h, v3.4h
326 smlal2 v1.4s, v3.8h, v3.8h
327 add v0.4s, v0.4s, v1.4s
// Exported entry point x264_pixel_ssd_<w>x<h>_neon; \w and \h are macro
// arguments.
// NOTE(review): the enclosing .macro line and the function body are not
// visible in this excerpt.
331 function x264_pixel_ssd_\w\()x\h\()_neon, export=1
// x264_pixel_var_8x<h>_neon: walks 8-byte rows at [x0] with stride x1.
// v0.8h keeps a running per-column sum of the pixels (uaddw); each row is
// also squared with umull into a 16-bit temporary (v1, v2, v24..v29).
// Rows rotate through v18/v19 and v20/v21 so that loads are issued ahead of
// the arithmetic that consumes them.
// NOTE(review): the enclosing .macro line, the initialisation of v0, the loop
// control, the folding of the squares into wider accumulators and the exit
// are not visible in this excerpt.
353 function x264_pixel_var_8x\h\()_neon, export=1
354 ld1 {v16.8b}, [x0], x1
355 ld1 {v17.8b}, [x0], x1
357 umull v1.8h, v16.8b, v16.8b
359 umull v2.8h, v17.8b, v17.8b
360 uaddw v0.8h, v0.8h, v17.8b
361 ld1 {v18.8b}, [x0], x1
364 ld1 {v19.8b}, [x0], x1
// Four rows: consume v18..v21 and reload v18/v19 for the next rows.
367 uaddw v0.8h, v0.8h, v18.8b
368 umull v24.8h, v18.8b, v18.8b
369 ld1 {v20.8b}, [x0], x1
370 uaddw v0.8h, v0.8h, v19.8b
371 umull v25.8h, v19.8b, v19.8b
373 ld1 {v21.8b}, [x0], x1
374 uaddw v0.8h, v0.8h, v20.8b
375 umull v26.8h, v20.8b, v20.8b
377 ld1 {v18.8b}, [x0], x1
378 uaddw v0.8h, v0.8h, v21.8b
379 umull v27.8h, v21.8b, v21.8b
381 ld1 {v19.8b}, [x0], x1
// Last two rows, already loaded into v18/v19; no further loads.
385 uaddw v0.8h, v0.8h, v18.8b
386 umull v28.8h, v18.8b, v18.8b
387 uaddw v0.8h, v0.8h, v19.8b
388 umull v29.8h, v19.8b, v19.8b
// x264_pixel_var_16x16_neon: walks 16-byte rows at [x0] with stride x1.
// v0.8h keeps a running sum of the pixels; the low and high eight bytes of
// each row are added with uaddw and uaddw2. Each row is also squared per
// half (umull for the low half, umull2 for the high half) into 16-bit
// temporaries v1..v6.
// NOTE(review): the initialisation of v0, the loop control, the folding of
// the squares into wider accumulators and the exit are not visible in this
// excerpt.
399 function x264_pixel_var_16x16_neon, export=1
400 ld1 {v16.16b}, [x0], x1
401 ld1 {v17.16b}, [x0], x1
403 umull v1.8h, v16.8b, v16.8b
404 umull2 v2.8h, v16.16b, v16.16b
408 uaddw2 v0.8h, v0.8h, v16.16b
// Two rows: consume v17 while loading v18, then consume v18 while
// reloading v17.
411 ld1 {v18.16b}, [x0], x1
412 uaddw v0.8h, v0.8h, v17.8b
413 umull v3.8h, v17.8b, v17.8b
414 uaddw2 v0.8h, v0.8h, v17.16b
415 umull2 v4.8h, v17.16b, v17.16b
419 ld1 {v17.16b}, [x0], x1
420 uaddw v0.8h, v0.8h, v18.8b
421 umull v5.8h, v18.8b, v18.8b
422 uaddw2 v0.8h, v0.8h, v18.16b
423 umull2 v6.8h, v18.16b, v18.16b
// Last row, already loaded into v17.
428 uaddw v0.8h, v0.8h, v17.8b
429 umull v3.8h, v17.8b, v17.8b
430 uaddw2 v0.8h, v0.8h, v17.16b
431 umull2 v4.8h, v17.16b, v17.16b
// x264_var_end: common tail (not exported) that merges the two 32-bit
// accumulators v1 and v2 and packs the result into x0.
// NOTE(review): the reductions that fill x0 and x1 are not visible in this
// excerpt; presumably x0 holds the pixel sum and x1 the sum of squares --
// confirm.
436 function x264_var_end
437 add v1.4s, v1.4s, v2.4s
// Return value: x1 is placed in bits 63..32 of x0, above the existing x0.
442 orr x0, x0, x1, lsl #32
// pixel_var2_8 h: emits the exported function x264_pixel_var2_8x<h>_neon.
// 8-byte rows are read from [x0] (stride x1) and [x2] (stride x3) and their
// difference is widened to signed 16 bit, alternating between v6 and v7.
// v0.8h accumulates the sum of the differences; v2.4s and v3.4s accumulate
// the squared differences of the low and high four lanes.
// NOTE(review): the loop control, the horizontal reductions, the squaring of
// the sum and any stores are not visible in this excerpt.
447 .macro pixel_var2_8 h
448 function x264_pixel_var2_8x\h\()_neon, export=1
449 ld1 {v16.8b}, [x0], x1
450 ld1 {v18.8b}, [x2], x3
451 ld1 {v17.8b}, [x0], x1
452 ld1 {v19.8b}, [x2], x3
454 usubl v6.8h, v16.8b, v18.8b
455 usubl v7.8h, v17.8b, v19.8b
456 ld1 {v16.8b}, [x0], x1
457 ld1 {v18.8b}, [x2], x3
// First two rows initialise the accumulators (smull / add) instead of
// accumulating into them.
458 smull v2.4s, v6.4h, v6.4h
459 smull2 v3.4s, v6.8h, v6.8h
460 add v0.8h, v6.8h, v7.8h
461 smlal v2.4s, v7.4h, v7.4h
462 smlal2 v3.4s, v7.8h, v7.8h
464 usubl v6.8h, v16.8b, v18.8b
// Two rows: v6 is consumed while v7 is produced, then the reverse.
467 ld1 {v17.8b}, [x0], x1
468 ld1 {v19.8b}, [x2], x3
469 smlal v2.4s, v6.4h, v6.4h
470 smlal2 v3.4s, v6.8h, v6.8h
471 usubl v7.8h, v17.8b, v19.8b
472 add v0.8h, v0.8h, v6.8h
473 ld1 {v16.8b}, [x0], x1
474 ld1 {v18.8b}, [x2], x3
475 smlal v2.4s, v7.4h, v7.4h
476 smlal2 v3.4s, v7.8h, v7.8h
477 usubl v6.8h, v16.8b, v18.8b
478 add v0.8h, v0.8h, v7.8h
// Final two rows; no further row is prefetched.
481 ld1 {v17.8b}, [x0], x1
482 ld1 {v19.8b}, [x2], x3
483 smlal v2.4s, v6.4h, v6.4h
484 smlal2 v3.4s, v6.8h, v6.8h
485 usubl v7.8h, v17.8b, v19.8b
486 add v0.8h, v0.8h, v6.8h
487 smlal v2.4s, v7.4h, v7.4h
488 add v0.8h, v0.8h, v7.8h
489 smlal2 v3.4s, v7.8h, v7.8h
// Merge the two squared-difference accumulators.
492 add v2.4s, v2.4s, v3.4s
// Result: x0 = x1 - (x0 >> shift), where shift = 6 + (h >> 4), i.e. 6 for
// h = 8 and 7 for h = 16.
499 sub x0, x1, x0, lsr # 6 + (\h >> 4)
// x264_pixel_satd_4x4_neon: reads four 4-byte rows from [x0] (stride x1) and
// [x2] (stride x3). Rows 0 and 2 share one vector (lanes s[0]/s[1]) and rows
// 1 and 3 share another, so the whole difference block fits in two 8h
// vectors.
// SUMSUB_AB is defined outside this excerpt; by its name and use it yields
// the sum and the difference of its last two operands.
// NOTE(review): lines between the last trn2 and umax, and the final reduction
// and return, are not visible in this excerpt.
509 function x264_pixel_satd_4x4_neon, export=1
510 ld1 {v1.s}[0], [x2], x3
511 ld1 {v0.s}[0], [x0], x1
512 ld1 {v3.s}[0], [x2], x3
513 ld1 {v2.s}[0], [x0], x1
514 ld1 {v1.s}[1], [x2], x3
515 ld1 {v0.s}[1], [x0], x1
516 ld1 {v3.s}[1], [x2], x3
517 ld1 {v2.s}[1], [x0], x1
// Widened differences: v0 = rows 0 and 2, v1 = rows 1 and 3.
519 usubl v0.8h, v0.8b, v1.8b
520 usubl v1.8h, v2.8b, v3.8b
521 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
// Regroup 64-bit halves, then butterfly again.
523 zip1 v0.2d, v2.2d, v3.2d
524 zip2 v1.2d, v2.2d, v3.2d
525 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
// Regroup 16-bit lanes, then butterfly again.
527 trn1 v0.8h, v2.8h, v3.8h
528 trn2 v1.8h, v2.8h, v3.8h
529 SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
// Regroup 32-bit lanes and keep the lane-wise unsigned maximum.
531 trn1 v0.4s, v2.4s, v3.4s
532 trn2 v1.4s, v2.4s, v3.4s
535 umax v0.8h, v0.8h, v1.8h
// x264_pixel_satd_4x8_neon: reads eight 4-byte rows from [x0] (stride x1)
// and [x2] (stride x3). Rows 0..3 fill lane s[0] of v0/v2/v4/v6 (from x0) and
// v1/v3/v5/v7 (from x2); rows 4..7 fill lane s[1] of the same registers.
// It then branches to the shared tail x264_satd_4x8_8x4_end_neon.
542 function x264_pixel_satd_4x8_neon, export=1
543 ld1 {v1.s}[0], [x2], x3
544 ld1 {v0.s}[0], [x0], x1
545 ld1 {v3.s}[0], [x2], x3
546 ld1 {v2.s}[0], [x0], x1
547 ld1 {v5.s}[0], [x2], x3
548 ld1 {v4.s}[0], [x0], x1
549 ld1 {v7.s}[0], [x2], x3
550 ld1 {v6.s}[0], [x0], x1
551 ld1 {v1.s}[1], [x2], x3
552 ld1 {v0.s}[1], [x0], x1
553 ld1 {v3.s}[1], [x2], x3
554 ld1 {v2.s}[1], [x0], x1
555 ld1 {v5.s}[1], [x2], x3
556 ld1 {v4.s}[1], [x0], x1
557 ld1 {v7.s}[1], [x2], x3
558 ld1 {v6.s}[1], [x0], x1
559 b x264_satd_4x8_8x4_end_neon
// x264_pixel_satd_8x4_neon: reads four 8-byte rows from [x0] (stride x1) into
// v0/v2/v4/v6 and from [x2] (stride x3) into v1/v3/v5/v7. This is the same
// register layout that x264_pixel_satd_4x8_neon sets up.
// NOTE(review): no branch or return is visible here; presumably execution
// continues into x264_satd_4x8_8x4_end_neon, which follows -- confirm.
562 function x264_pixel_satd_8x4_neon, export=1
563 ld1 {v1.8b}, [x2], x3
564 ld1 {v0.8b}, [x0], x1
565 ld1 {v3.8b}, [x2], x3
566 ld1 {v2.8b}, [x0], x1
567 ld1 {v5.8b}, [x2], x3
568 ld1 {v4.8b}, [x0], x1
569 ld1 {v7.8b}, [x2], x3
570 ld1 {v6.8b}, [x0], x1
// x264_satd_4x8_8x4_end_neon: shared tail (not exported) for the 4x8 and 8x4
// SATD entry points.
// In:  v0/v2/v4/v6 and v1/v3/v5/v7 hold the two sets of 8-byte rows.
// Out: v0.8h holds the partial sums after the last visible instruction.
// SUMSUB_AB is defined outside this excerpt; by its name and use it yields
// the sum and the difference of its last two operands.
// NOTE(review): lines between the last trn2 and umax, and the final reduction
// and return, are not visible in this excerpt.
573 function x264_satd_4x8_8x4_end_neon
// Widened row differences.
574 usubl v0.8h, v0.8b, v1.8b
575 usubl v1.8h, v2.8b, v3.8b
576 usubl v2.8h, v4.8b, v5.8b
577 usubl v3.8h, v6.8b, v7.8b
// Two butterfly stages across the four row vectors.
579 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
580 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
582 SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
583 SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
// Regroup 16-bit lanes, then butterfly within the rows.
585 trn1 v0.8h, v4.8h, v5.8h
586 trn2 v1.8h, v4.8h, v5.8h
587 trn1 v2.8h, v6.8h, v7.8h
588 trn2 v3.8h, v6.8h, v7.8h
590 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
591 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
// Regroup 32-bit lanes.
593 trn1 v0.4s, v16.4s, v18.4s
594 trn2 v1.4s, v16.4s, v18.4s
595 trn1 v2.4s, v17.4s, v19.4s
596 trn2 v3.4s, v17.4s, v19.4s
// Lane-wise unsigned maximum of each pair, then add the two results.
601 umax v0.8h, v0.8h, v1.8h
602 umax v1.8h, v2.8h, v3.8h
603 add v0.8h, v0.8h, v1.8h
// x264_pixel_satd_8x8_neon: calls x264_satd_8x8_neon once and adds the four
// result vectors v0..v3 into v0.8h.
// NOTE(review): saving of the link register, the final reduction and the
// return are not visible in this excerpt.
609 function x264_pixel_satd_8x8_neon, export=1
612 bl x264_satd_8x8_neon
613 add v0.8h, v0.8h, v1.8h
614 add v1.8h, v2.8h, v3.8h
615 add v0.8h, v0.8h, v1.8h
// x264_pixel_satd_8x16_neon: calls x264_satd_8x8_neon twice. The four result
// vectors of the first call are summed into v30, those of the second call
// into v31, and v0.8h receives v30 + v31.
// v30 must therefore survive the second call.
// NOTE(review): saving of the link register, the final reduction and the
// return are not visible in this excerpt.
621 function x264_pixel_satd_8x16_neon, export=1
624 bl x264_satd_8x8_neon
625 add v0.8h, v0.8h, v1.8h
626 add v1.8h, v2.8h, v3.8h
627 add v30.8h, v0.8h, v1.8h
629 bl x264_satd_8x8_neon
630 add v0.8h, v0.8h, v1.8h
631 add v1.8h, v2.8h, v3.8h
632 add v31.8h, v0.8h, v1.8h
633 add v0.8h, v30.8h, v31.8h
// SUMSUBL_AB sum, sub, a, b: used later in this file with .8b inputs and .8h
// outputs.
// NOTE(review): the macro body is not visible in this excerpt; presumably it
// forms the widened sum and difference of \a and \b -- confirm.
639 .macro SUMSUBL_AB sum, sub, a, b
// load_diff_fly_8x8: loads eight 8-byte rows from [x0] (stride x1) and [x2]
// (stride x3) and leaves the widened row differences in v16..v23 (row 0 in
// v16 through row 7 in v23).
// The load registers v0..v7 are reused for rows 4..7 once rows 0..3 have been
// consumed. The first butterflies are issued before the last two differences:
// SUMSUB_AB puts the sum/difference of v16,v17 in v0,v1 and of v18,v19 in
// v2,v3.
// SUMSUB_AB is defined outside this excerpt.
644 .macro load_diff_fly_8x8
645 ld1 {v1.8b}, [x2], x3
646 ld1 {v0.8b}, [x0], x1
647 ld1 {v3.8b}, [x2], x3
648 ld1 {v2.8b}, [x0], x1
649 usubl v16.8h, v0.8b, v1.8b
650 ld1 {v5.8b}, [x2], x3
651 ld1 {v4.8b}, [x0], x1
652 usubl v17.8h, v2.8b, v3.8b
653 ld1 {v7.8b}, [x2], x3
654 ld1 {v6.8b}, [x0], x1
655 usubl v18.8h, v4.8b, v5.8b
656 ld1 {v1.8b}, [x2], x3
657 ld1 {v0.8b}, [x0], x1
658 usubl v19.8h, v6.8b, v7.8b
659 ld1 {v3.8b}, [x2], x3
660 ld1 {v2.8b}, [x0], x1
661 usubl v20.8h, v0.8b, v1.8b
662 ld1 {v5.8b}, [x2], x3
663 ld1 {v4.8b}, [x0], x1
664 usubl v21.8h, v2.8b, v3.8b
665 ld1 {v7.8b}, [x2], x3
666 ld1 {v6.8b}, [x0], x1
// v0..v3 are free again here: rows 4 and 5 were consumed into v20/v21.
668 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
669 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
671 usubl v22.8h, v4.8b, v5.8b
672 usubl v23.8h, v6.8b, v7.8b
// SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d: two independent SUMSUB_AB steps,
// (\s1, \d1) from (\a, \b) and (\s2, \d2) from (\c, \d).
// SUMSUB_AB is defined outside this excerpt.
675 .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
676 SUMSUB_AB \s1, \d1, \a, \b
677 SUMSUB_AB \s2, \d2, \c, \d
// HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4: two butterfly stages over four
// vectors. Stage one writes \t1,\t2 from (\r1,\r2) and \t3,\t4 from
// (\r3,\r4); stage two writes \r1,\r3 from (\t1,\t3) and \r2,\r4 from
// (\t2,\t4). The result replaces \r1..\r4 and \t1..\t4 are clobbered.
680 .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
681 SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
682 SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
// x264_satd_8x8_neon: internal helper (not exported) called by the 8x8 and
// 8x16 SATD entry points, which expect its results in v0..v3.
// NOTE(review): the body is not visible in this excerpt; presumably it loads
// the rows and continues into x264_satd_8x4v_8x8h_neon -- confirm.
685 function x264_satd_8x8_neon
// x264_satd_8x4v_8x8h_neon: internal transform stage (not exported).
// In:  v0..v3 and v20..v23 are read before being written here.
// Out: v0..v3 hold the lane-wise maxima after the last visible instruction.
// SUMSUB_AB and transpose are macros defined outside this excerpt.
// NOTE(review): lines between the last transpose and the first umax, and the
// return, are not visible in this excerpt.
689 // one vertical hadamard pass and two horizontal
690 function x264_satd_8x4v_8x8h_neon
// Vertical stage: butterflies on v0..v3 and a full HADAMARD4_V on v20..v23
// (v0..v3 become scratch for it).
691 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
692 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
694 HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
// First horizontal stage: regroup 16-bit lanes, then butterfly.
696 transpose v0.8h, v1.8h, v16.8h, v17.8h
697 transpose v2.8h, v3.8h, v18.8h, v19.8h
698 transpose v4.8h, v5.8h, v20.8h, v21.8h
699 transpose v6.8h, v7.8h, v22.8h, v23.8h
701 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
702 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
703 SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
704 SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
// Second horizontal stage: regroup 32-bit lanes; the final butterfly is
// replaced by a lane-wise maximum of each pair.
706 transpose v0.4s, v2.4s, v16.4s, v18.4s
707 transpose v1.4s, v3.4s, v17.4s, v19.4s
708 transpose v4.4s, v6.4s, v20.4s, v22.4s
709 transpose v5.4s, v7.4s, v21.4s, v23.4s
720 umax v0.8h, v0.8h, v2.8h
721 umax v1.8h, v1.8h, v3.8h
722 umax v2.8h, v4.8h, v6.8h
723 umax v3.8h, v5.8h, v7.8h
// x264_pixel_satd_16x8_neon: calls x264_satd_16x4_neon twice. After each call
// the pairs v0+v1 and v2+v3 are added; v30 and v31 carry the running totals
// across the calls and are combined into v0.8h at the end.
// NOTE(review): saving of the link register, the final reduction and the
// return are not visible in this excerpt.
728 function x264_pixel_satd_16x8_neon, export=1
731 bl x264_satd_16x4_neon
// First call initialises the totals.
732 add v30.8h, v0.8h, v1.8h
733 add v31.8h, v2.8h, v3.8h
735 bl x264_satd_16x4_neon
736 add v0.8h, v0.8h, v1.8h
737 add v1.8h, v2.8h, v3.8h
738 add v30.8h, v30.8h, v0.8h
739 add v31.8h, v31.8h, v1.8h
741 add v0.8h, v30.8h, v31.8h
// x264_pixel_satd_16x16_neon: calls x264_satd_16x4_neon four times. After
// each call the pairs v0+v1 and v2+v3 are added; v30 and v31 carry the
// running totals across the calls and are combined into v0.8h at the end.
// NOTE(review): saving of the link register, the final reduction and the
// return are not visible in this excerpt.
747 function x264_pixel_satd_16x16_neon, export=1
750 bl x264_satd_16x4_neon
// First call initialises the totals.
751 add v30.8h, v0.8h, v1.8h
752 add v31.8h, v2.8h, v3.8h
754 bl x264_satd_16x4_neon
755 add v0.8h, v0.8h, v1.8h
756 add v1.8h, v2.8h, v3.8h
757 add v30.8h, v30.8h, v0.8h
758 add v31.8h, v31.8h, v1.8h
760 bl x264_satd_16x4_neon
761 add v0.8h, v0.8h, v1.8h
762 add v1.8h, v2.8h, v3.8h
763 add v30.8h, v30.8h, v0.8h
764 add v31.8h, v31.8h, v1.8h
766 bl x264_satd_16x4_neon
767 add v0.8h, v0.8h, v1.8h
768 add v1.8h, v2.8h, v3.8h
769 add v30.8h, v30.8h, v0.8h
770 add v31.8h, v31.8h, v1.8h
772 add v0.8h, v30.8h, v31.8h
// x264_satd_16x4_neon: internal helper (not exported). Loads four 16-byte
// rows from [x0] (stride x1) and [x2] (stride x3). Differences of the low
// eight bytes of rows 0..3 go to v16..v19, those of the high eight bytes to
// v20..v23.
// It then forms the first butterflies on v16..v19 into v0..v3 and tail-branches
// to x264_satd_8x4v_8x8h_neon, which reads v0..v3 and v20..v23.
778 function x264_satd_16x4_neon
779 ld1 {v1.16b}, [x2], x3
780 ld1 {v0.16b}, [x0], x1
781 ld1 {v3.16b}, [x2], x3
782 ld1 {v2.16b}, [x0], x1
783 usubl v16.8h, v0.8b, v1.8b
784 usubl2 v20.8h, v0.16b, v1.16b
785 ld1 {v5.16b}, [x2], x3
786 ld1 {v4.16b}, [x0], x1
787 usubl v17.8h, v2.8b, v3.8b
788 usubl2 v21.8h, v2.16b, v3.16b
789 ld1 {v7.16b}, [x2], x3
790 ld1 {v6.16b}, [x0], x1
792 usubl v18.8h, v4.8b, v5.8b
793 usubl2 v22.8h, v4.16b, v5.16b
794 usubl v19.8h, v6.8b, v7.8b
795 usubl2 v23.8h, v6.16b, v7.16b
// v0..v3 no longer hold pixel data, so they can take the butterfly output.
797 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
798 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
800 b x264_satd_8x4v_8x8h_neon
// x264_pixel_sa8d_8x8_neon: calls x264_sa8d_8x8_neon and adds its two result
// vectors v0 and v1 into v0.8h.
// NOTE(review): saving of the link register, the final reduction, any
// rounding and the return are not visible in this excerpt.
804 function x264_pixel_sa8d_8x8_neon, export=1
806 bl x264_sa8d_8x8_neon
807 add v0.8h, v0.8h, v1.8h
// x264_pixel_sa8d_16x16_neon: calls x264_sa8d_8x8_neon four times. After the
// second call both row pointers are moved back by 16 rows (stride << 4)
// before the remaining two calls. The totals kept in v30/v31 are combined as
// 32-bit lanes into v0.4s.
// NOTE(review): saving of the link register, the accumulation into v30/v31
// after each call, any horizontal pointer adjustment, the final reduction
// and the return are not visible in this excerpt.
815 function x264_pixel_sa8d_16x16_neon, export=1
817 bl x264_sa8d_8x8_neon
820 bl x264_sa8d_8x8_neon
823 sub x0, x0, x1, lsl #4
824 sub x2, x2, x3, lsl #4
827 bl x264_sa8d_8x8_neon
830 bl x264_sa8d_8x8_neon
833 add v0.4s, v30.4s, v31.4s
// x264_sa8d_8x8_neon: internal 8x8 transform helper (not exported).
// In:  v0..v3 and v20..v23 are read before being written in the visible
//      lines.
// Out: v0.8h and v1.8h hold the two partial sums.
// SUMSUB_AB and transpose are macros defined outside this excerpt.
// NOTE(review): the lines before the first SUMSUB_AB (presumably the loads),
// lines between the last transpose and the first umax, and the return are
// not visible in this excerpt.
841 function x264_sa8d_8x8_neon
// Vertical pass: three butterfly stages across the eight row vectors.
844 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
845 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
847 HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
848 SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
849 SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
850 SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
851 SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
// Horizontal pass, stage 1: regroup 16-bit lanes, then butterfly.
853 transpose v20.8h, v21.8h, v16.8h, v17.8h
854 transpose v4.8h, v5.8h, v0.8h, v1.8h
855 transpose v22.8h, v23.8h, v18.8h, v19.8h
856 transpose v6.8h, v7.8h, v2.8h, v3.8h
858 SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h
859 SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
860 SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
861 SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h
// Stage 2: regroup 32-bit lanes, then butterfly.
863 transpose v20.4s, v22.4s, v28.4s, v0.4s
864 transpose v21.4s, v23.4s, v29.4s, v1.4s
865 transpose v16.4s, v18.4s, v24.4s, v26.4s
866 transpose v17.4s, v19.4s, v25.4s, v27.4s
868 SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
869 SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
870 SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
871 SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
// Stage 3: regroup 64-bit halves; the final butterfly is replaced by a
// lane-wise maximum of each pair.
873 transpose v16.2d, v20.2d, v0.2d, v4.2d
874 transpose v17.2d, v21.2d, v1.2d, v5.2d
875 transpose v18.2d, v22.2d, v2.2d, v6.2d
876 transpose v19.2d, v23.2d, v3.2d, v7.2d
887 umax v16.8h, v16.8h, v20.8h
888 umax v17.8h, v17.8h, v21.8h
889 umax v18.8h, v18.8h, v22.8h
890 umax v19.8h, v19.8h, v23.8h
892 add v0.8h, v16.8h, v17.8h
893 add v1.8h, v18.8h, v19.8h
// HADAMARD_AC w h: emits the exported function
// x264_pixel_hadamard_ac_<w>x<h>_neon.
// The function loads the two mask rows at mask_ac_4_8 into v30/v31 and calls
// x264_hadamard_ac_8x8_neon up to four times, rewinding x0 by 8 or 16 rows
// (stride << 3 or << 4) between calls.
// NOTE(review): the conditionals that select how many calls are made, the
// horizontal pointer step, saving of the link register, clearing of the
// accumulators, the final reduction and the return are not visible in this
// excerpt.
899 .macro HADAMARD_AC w h
900 function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
901 movrel x5, mask_ac_4_8
903 ld1 {v30.8h,v31.8h}, [x5]
907 bl x264_hadamard_ac_8x8_neon
909 bl x264_hadamard_ac_8x8_neon
912 sub x0, x0, x1, lsl #3
914 bl x264_hadamard_ac_8x8_neon
917 sub x0, x0, x1, lsl #4
918 bl x264_hadamard_ac_8x8_neon
// Return value: x1 is placed in bits 63..32 of x0, above the existing x0.
927 orr x0, x0, x1, lsl #32
// x264_hadamard_ac_8x8_neon: internal helper (not exported) that processes
// one 8x8 block read from [x0] with stride x1; only one source is read.
// It accumulates into v29 (and, per the register map below, v28) and uses
// v30/v31 as AND masks that clear selected coefficient lanes.
// SUMSUBL_AB, SUMSUB_AB, SUMSUB_ABCD and transpose are macros; SUMSUB_AB and
// transpose are defined outside this excerpt.
// NOTE(review): several lines are not visible in this excerpt: the lines
// between the third butterfly group and the first add, the accumulation into
// v28, lines between the 2d transposes, lines before the last two adds, and
// the return.
937 // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
938 function x264_hadamard_ac_8x8_neon
// Load eight rows; each pair of rows is widened into a sum and a difference
// as soon as it is available.
939 ld1 {v16.8b}, [x0], x1
940 ld1 {v17.8b}, [x0], x1
941 ld1 {v18.8b}, [x0], x1
942 ld1 {v19.8b}, [x0], x1
943 SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b
944 ld1 {v20.8b}, [x0], x1
945 ld1 {v21.8b}, [x0], x1
946 SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b
947 ld1 {v22.8b}, [x0], x1
948 ld1 {v23.8b}, [x0], x1
949 SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b
950 SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b
// Second vertical butterfly stage within each group of four rows.
952 SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
953 SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// Regroup 16-bit lanes, then butterfly.
955 transpose v0.8h, v1.8h, v16.8h, v17.8h
956 transpose v2.8h, v3.8h, v18.8h, v19.8h
957 transpose v4.8h, v5.8h, v20.8h, v21.8h
958 transpose v6.8h, v7.8h, v22.8h, v23.8h
960 SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
961 SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
962 SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
963 SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
// Regroup 32-bit lanes, then butterfly.
965 transpose v0.4s, v2.4s, v16.4s, v18.4s
966 transpose v1.4s, v3.4s, v17.4s, v19.4s
967 transpose v4.4s, v6.4s, v20.4s, v22.4s
968 transpose v5.4s, v7.4s, v21.4s, v23.4s
970 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
971 SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
972 SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// Sum v0..v7 into v0 and v1; v0 is masked with v30 before the remaining
// terms are added to it.
983 add v0.8h, v0.8h, v4.8h
984 add v1.8h, v1.8h, v5.8h
985 and v0.16b, v0.16b, v30.16b
986 add v2.8h, v2.8h, v6.8h
987 add v3.8h, v3.8h, v7.8h
988 add v0.8h, v0.8h, v2.8h
989 add v1.8h, v1.8h, v3.8h
// Continue from the coefficients still held in v16..v23: one more butterfly
// stage combining the upper and lower groups of four.
993 SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
994 SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
995 SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
996 SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
// Regroup 64-bit halves; three pairs are reduced with a lane-wise maximum,
// the fourth pair gets a real butterfly so that v4 can be masked with v31.
998 transpose v16.2d, v17.2d, v6.2d, v7.2d
999 transpose v18.2d, v19.2d, v4.2d, v5.2d
1000 transpose v20.2d, v21.2d, v2.2d, v3.2d
1009 transpose v7.2d, v6.2d, v1.2d, v0.2d
1011 umax v3.8h, v16.8h, v17.8h
1012 umax v2.8h, v18.8h, v19.8h
1013 umax v1.8h, v20.8h, v21.8h
1015 SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
1017 add v2.8h, v2.8h, v3.8h
1018 add v2.8h, v2.8h, v1.8h
1019 and v4.16b, v4.16b, v31.16b
// Double the maxima total before the butterfly terms v5 and v4 are added.
1020 add v2.8h, v2.8h, v2.8h
1023 add v2.8h, v2.8h, v5.8h
1024 add v2.8h, v2.8h, v4.8h
// Pairwise-widen and accumulate into the 32-bit lanes of v29.
1025 uadalp v29.4s, v2.8h
// x264_pixel_ssim_4x4x2_core_neon: reads four 8-byte rows from [x0] (stride
// x1) and [x2] (stride x3) and gathers four kinds of sums:
//   v0: sum of the [x0] pixels          v1: sum of the [x2] pixels
//   v16: sum of squares of both inputs  v17: sum of the cross products
// The results are stored to [x4] with st4, which interleaves the four
// registers element by element.
// Each row is multiplied into 16-bit temporaries first and folded into the
// 32-bit accumulators with uaddlp/uadalp while the next row is being loaded.
// NOTE(review): lines between the last uadalp and the first addp are not
// visible in this excerpt (v0/v1 are built as .8h but consumed as .4s, so a
// widening step presumably sits there); the return is not visible either.
1030 function x264_pixel_ssim_4x4x2_core_neon, export=1
// Row 0.
1031 ld1 {v0.8b}, [x0], x1
1032 ld1 {v2.8b}, [x2], x3
1033 umull v16.8h, v0.8b, v0.8b
1034 umull v17.8h, v0.8b, v2.8b
1035 umull v18.8h, v2.8b, v2.8b
// Row 1.
1037 ld1 {v28.8b}, [x0], x1
1038 ld1 {v29.8b}, [x2], x3
1039 umull v20.8h, v28.8b, v28.8b
1040 umull v21.8h, v28.8b, v29.8b
1041 umull v22.8h, v29.8b, v29.8b
// Start the 32-bit accumulators from row 0 and the pixel sums from rows 0
// and 1.
1043 uaddlp v16.4s, v16.8h
1044 uaddlp v17.4s, v17.8h
1045 uaddl v0.8h, v0.8b, v28.8b
1046 uadalp v16.4s, v18.8h
1047 uaddl v1.8h, v2.8b, v29.8b
// Row 2.
1049 ld1 {v26.8b}, [x0], x1
1050 ld1 {v27.8b}, [x2], x3
1051 umull v23.8h, v26.8b, v26.8b
1052 umull v24.8h, v26.8b, v27.8b
1053 umull v25.8h, v27.8b, v27.8b
// Fold in the row 1 products and the row 2 pixels.
1055 uadalp v16.4s, v20.8h
1056 uaddw v0.8h, v0.8h, v26.8b
1057 uadalp v17.4s, v21.8h
1058 uaddw v1.8h, v1.8h, v27.8b
1059 uadalp v16.4s, v22.8h
// Row 3.
1061 ld1 {v28.8b}, [x0], x1
1062 ld1 {v29.8b}, [x2], x3
1063 umull v20.8h, v28.8b, v28.8b
1064 umull v21.8h, v28.8b, v29.8b
1065 umull v22.8h, v29.8b, v29.8b
// Fold in the row 2 products and the row 3 pixels.
1067 uadalp v16.4s, v23.8h
1068 uaddw v0.8h, v0.8h, v28.8b
1069 uadalp v17.4s, v24.8h
1070 uaddw v1.8h, v1.8h, v29.8b
1071 uadalp v16.4s, v25.8h
// Fold in the row 3 products.
1073 uadalp v16.4s, v20.8h
1074 uadalp v17.4s, v21.8h
1075 uadalp v16.4s, v22.8h
// Pairwise add so that the low two lanes of each register hold the totals
// of the left and right 4-column halves.
1080 addp v0.4s, v0.4s, v0.4s
1081 addp v1.4s, v1.4s, v1.4s
1082 addp v2.4s, v16.4s, v16.4s
1083 addp v3.4s, v17.4s, v17.4s
1085 st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
// x264_pixel_ssim_end4_neon: reads five 16-byte records from [x0] and five
// from [x1], adds them element-wise (s0..s4), and forms the four sums of
// neighbouring records: v0 = s0+s1, v1 = s1+s2, v2 = s2+s3, v3 = s3+s4.
// The four sums are transposed, combined with the constants in v30/v31,
// multiplied and divided in floating point, optionally masked with v29, and
// added pairwise.
// w2 is a count that is subtracted from x5 to drive the masking.
// NOTE(review): several lines are not visible in this excerpt: the setup of
// x5 and of the low half of w4, the fill of v30/v31 (presumably from w3/w4),
// the integer to float conversions before fmul, the branch that consumes
// the flags from subs, the load of v29 and the address of its table, the
// last reduction and the return.
1089 function x264_pixel_ssim_end4_neon, export=1
1091 ld1 {v16.4s,v17.4s}, [x0], #32
1092 ld1 {v18.4s,v19.4s}, [x1], #32
// x2 = x5 - zero-extended w2; the flags are set for a later test.
1094 subs x2, x5, w2, uxtw
1095 mov w3, #416 // ssim_c1 = .01*.01*255*255*64
1096 movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63
1097 add v0.4s, v16.4s, v18.4s
1098 add v1.4s, v17.4s, v19.4s
1099 add v0.4s, v0.4s, v1.4s
1100 ld1 {v20.4s,v21.4s}, [x0], #32
1101 ld1 {v22.4s,v23.4s}, [x1], #32
1102 add v2.4s, v20.4s, v22.4s
1103 add v3.4s, v21.4s, v23.4s
1104 add v1.4s, v1.4s, v2.4s
1105 ld1 {v16.4s}, [x0], #16
1106 ld1 {v18.4s}, [x1], #16
1107 add v16.4s, v16.4s, v18.4s
1108 add v2.4s, v2.4s, v3.4s
1109 add v3.4s, v3.4s, v16.4s
// 4x4 transpose of 32-bit lanes: afterwards each of v0..v3 holds one field
// of all four sums instead of all fields of one sum.
1114 transpose v4.4s, v5.4s, v0.4s, v1.4s
1115 transpose v6.4s, v7.4s, v2.4s, v3.4s
1116 transpose v0.2d, v2.2d, v4.2d, v6.2d
1117 transpose v1.2d, v3.2d, v5.2d, v7.2d
1119 mul v16.4s, v0.4s, v1.4s // s1*s2
1120 mul v0.4s, v0.4s, v0.4s
1121 mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2
// Scale v3 by 128 and v2 by 64, and double the product s1*s2.
1123 shl v3.4s, v3.4s, #7
1124 shl v2.4s, v2.4s, #6
1125 add v1.4s, v16.4s, v16.4s
1127 sub v2.4s, v2.4s, v0.4s // vars
1128 sub v3.4s, v3.4s, v1.4s // covar*2
// Add the constants: v30 to the s1/s2 terms, v31 to the vars/covar terms.
1129 add v0.4s, v0.4s, v30.4s
1130 add v2.4s, v2.4s, v31.4s
1131 add v1.4s, v1.4s, v30.4s
1132 add v3.4s, v3.4s, v31.4s
// Denominator in v0, numerator in v1, then the per-lane quotient.
1139 fmul v0.4s, v0.4s, v2.4s
1140 fmul v1.4s, v1.4s, v3.4s
1142 fdiv v0.4s, v1.4s, v0.4s
// x3 is advanced by x2 * 4 bytes, and v0 is ANDed with the mask in v29.
1146 add x3, x3, x2, lsl #2
1148 and v0.16b, v0.16b, v29.16b
1150 faddp v0.4s, v0.4s, v0.4s