1 ;******************************************************************************
2 ;* VP9 Intra prediction SIMD optimizations
4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
5 ;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; pshufb byte-shuffle masks operating on 16-bit pixel lanes
32 pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 ; shift words down one lane, repeating the last word (abcdefgh -> bcdefghh)
33 pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 ; pick word 2, then words 4-6, zero the rest (used by vr_4x4)
34 pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 ; keep low 4 words, replicate word 3 into the high 4
43 ; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
44 ; only 3 registers on x86-32, which would make it one cycle faster, but that
45 ; would make the code quite a bit uglier...
; Vertical prediction (16bpp): every output row is a copy of the "above"
; row, held in m0 (m0..m3 for 32x32).  dst/stride describe the output
; block; 'a' points at the above pixels (one 16-bit word per pixel).

86 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
89 DEFINE_ARGS dst, stride, stride3
90 lea stride3q, [strideq*3]
; store the above row (m0) into all 4 rows
91 mova [dstq+strideq*0], m0
92 mova [dstq+strideq*1], m0
93 mova [dstq+strideq*2], m0
94 mova [dstq+stride3q ], m0

98 cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
101 DEFINE_ARGS dst, stride, stride3
102 lea stride3q, [strideq*3]
; 8 rows, written 4 at a time
103 mova [dstq+strideq*0], m0
104 mova [dstq+strideq*1], m0
105 mova [dstq+strideq*2], m0
106 mova [dstq+stride3q ], m0
107 lea dstq, [dstq+strideq*4]
108 mova [dstq+strideq*0], m0
109 mova [dstq+strideq*1], m0
110 mova [dstq+strideq*2], m0
111 mova [dstq+stride3q ], m0

115 cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
119 DEFINE_ARGS dst, stride, stride3, cnt
120 lea stride3q, [strideq*3]
; each 16-pixel row needs two xmm stores (m0/m1); cnt drives the row loop
123 mova [dstq+strideq*0+ 0], m0
124 mova [dstq+strideq*0+16], m1
125 mova [dstq+strideq*1+ 0], m0
126 mova [dstq+strideq*1+16], m1
127 mova [dstq+strideq*2+ 0], m0
128 mova [dstq+strideq*2+16], m1
129 mova [dstq+stride3q + 0], m0
130 mova [dstq+stride3q +16], m1
131 lea dstq, [dstq+strideq*4]

137 cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
; load the full 32-pixel above row into m0..m3
139 mova m0, [aq+mmsize*0]
140 mova m1, [aq+mmsize*1]
141 mova m2, [aq+mmsize*2]
142 mova m3, [aq+mmsize*3]
143 DEFINE_ARGS dst, stride, cnt
; two rows per iteration, four stores per row
146 mova [dstq+strideq*0+ 0], m0
147 mova [dstq+strideq*0+16], m1
148 mova [dstq+strideq*0+32], m2
149 mova [dstq+strideq*0+48], m3
150 mova [dstq+strideq*1+ 0], m0
151 mova [dstq+strideq*1+16], m1
152 mova [dstq+strideq*1+32], m2
153 mova [dstq+strideq*1+48], m3
154 lea dstq, [dstq+strideq*2]
; Horizontal prediction (16bpp): each output row is one left-edge pixel
; broadcast across the row.  m0..m3 hold four consecutive broadcast rows
; (splat setup happens before the stores).

160 cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
162 DEFINE_ARGS dst, stride, stride3
163 lea stride3q, [strideq*3]
; one broadcast register per row
168 mova [dstq+strideq*0], m0
169 mova [dstq+strideq*1], m1
170 mova [dstq+strideq*2], m2
171 mova [dstq+stride3q ], m3

175 cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
177 DEFINE_ARGS dst, stride, stride3
178 lea stride3q, [strideq*3]
; m0/m1 are refilled between each pair of stores with the next two rows
182 mova [dstq+strideq*0], m0
183 mova [dstq+strideq*1], m1
186 mova [dstq+strideq*2], m0
187 mova [dstq+stride3q ], m1
188 lea dstq, [dstq+strideq*4]
192 mova [dstq+strideq*0], m0
193 mova [dstq+strideq*1], m1
196 mova [dstq+strideq*2], m0
197 mova [dstq+stride3q ], m1

201 cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
203 lea stride3q, [strideq*3]
; 16-wide rows: each broadcast register is stored twice per row
211 mova [dstq+strideq*0+ 0], m0
212 mova [dstq+strideq*0+16], m0
213 mova [dstq+strideq*1+ 0], m1
214 mova [dstq+strideq*1+16], m1
215 mova [dstq+strideq*2+ 0], m2
216 mova [dstq+strideq*2+16], m2
217 mova [dstq+stride3q + 0], m3
218 mova [dstq+stride3q +16], m3
219 lea dstq, [dstq+strideq*4]

225 cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
227 lea stride3q, [strideq*3]
; 32-wide rows: four stores of the same broadcast register per row
235 mova [dstq+strideq*0+ 0], m0
236 mova [dstq+strideq*0+16], m0
237 mova [dstq+strideq*0+32], m0
238 mova [dstq+strideq*0+48], m0
239 mova [dstq+strideq*1+ 0], m1
240 mova [dstq+strideq*1+16], m1
241 mova [dstq+strideq*1+32], m1
242 mova [dstq+strideq*1+48], m1
243 mova [dstq+strideq*2+ 0], m2
244 mova [dstq+strideq*2+16], m2
245 mova [dstq+strideq*2+32], m2
246 mova [dstq+strideq*2+48], m2
247 mova [dstq+stride3q + 0], m3
248 mova [dstq+stride3q +16], m3
249 mova [dstq+stride3q +32], m3
250 mova [dstq+stride3q +48], m3
251 lea dstq, [dstq+strideq*4]
; DC prediction (16bpp): sum all left and above edge pixels, round to the
; average, broadcast that single value over the whole block.

257 cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
260 DEFINE_ARGS dst, stride, stride3
261 lea stride3q, [strideq*3]
; m0 = broadcast DC value; fill all 4 rows
268 mova [dstq+strideq*0], m0
269 mova [dstq+strideq*1], m0
270 mova [dstq+strideq*2], m0
271 mova [dstq+stride3q ], m0

275 cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
278 DEFINE_ARGS dst, stride, stride3
279 lea stride3q, [strideq*3]
287 pshuflw m0, m0, q0000 ; replicate the DC word across the low 4 lanes
289 mova [dstq+strideq*0], m0
290 mova [dstq+strideq*1], m0
291 mova [dstq+strideq*2], m0
292 mova [dstq+stride3q ], m0
293 lea dstq, [dstq+strideq*4]
294 mova [dstq+strideq*0], m0
295 mova [dstq+strideq*1], m0
296 mova [dstq+strideq*2], m0
297 mova [dstq+stride3q ], m0

301 cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
; accumulate both halves of the left and above edges into m0
303 paddw m0, [lq+mmsize]
305 paddw m0, [aq+mmsize]
306 DEFINE_ARGS dst, stride, stride3, cnt
307 lea stride3q, [strideq*3]
316 pshuflw m0, m0, q0000 ; replicate DC word across the low 4 lanes
319 mova [dstq+strideq*0+ 0], m0
320 mova [dstq+strideq*0+16], m0
321 mova [dstq+strideq*1+ 0], m0
322 mova [dstq+strideq*1+16], m0
323 mova [dstq+strideq*2+ 0], m0
324 mova [dstq+strideq*2+16], m0
325 mova [dstq+stride3q + 0], m0
326 mova [dstq+stride3q +16], m0
327 lea dstq, [dstq+strideq*4]

333 cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
; sum all 32 left + 32 above pixels (word lanes; reduced to scalar later)
334 mova m0, [lq+mmsize*0]
335 paddw m0, [lq+mmsize*1]
336 paddw m0, [lq+mmsize*2]
337 paddw m0, [lq+mmsize*3]
338 paddw m0, [aq+mmsize*0]
339 paddw m0, [aq+mmsize*1]
340 paddw m0, [aq+mmsize*2]
341 paddw m0, [aq+mmsize*3]
342 DEFINE_ARGS dst, stride, stride3, cnt
343 lea stride3q, [strideq*3]
352 pshuflw m0, m0, q0000 ; replicate DC word across the low 4 lanes
355 mova [dstq+strideq*0+ 0], m0
356 mova [dstq+strideq*0+16], m0
357 mova [dstq+strideq*0+32], m0
358 mova [dstq+strideq*0+48], m0
359 mova [dstq+strideq*1+ 0], m0
360 mova [dstq+strideq*1+16], m0
361 mova [dstq+strideq*1+32], m0
362 mova [dstq+strideq*1+48], m0
363 lea dstq, [dstq+strideq*2]
; Macro-expanded DC variants: %1 is the flavour name (dc_left/dc_top) and
; %2 the single edge pointer (lq or aq) whose pixels are averaged; the
; resulting value is broadcast over the block, same as plain DC above.

370 cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
372 DEFINE_ARGS dst, stride, stride3
373 lea stride3q, [strideq*3]
; m0 = broadcast DC value
380 mova [dstq+strideq*0], m0
381 mova [dstq+strideq*1], m0
382 mova [dstq+strideq*2], m0
383 mova [dstq+stride3q ], m0

387 cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
389 DEFINE_ARGS dst, stride, stride3
390 lea stride3q, [strideq*3]
398 pshuflw m0, m0, q0000 ; replicate DC word across the low 4 lanes
400 mova [dstq+strideq*0], m0
401 mova [dstq+strideq*1], m0
402 mova [dstq+strideq*2], m0
403 mova [dstq+stride3q ], m0
404 lea dstq, [dstq+strideq*4]
405 mova [dstq+strideq*0], m0
406 mova [dstq+strideq*1], m0
407 mova [dstq+strideq*2], m0
408 mova [dstq+stride3q ], m0

412 cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
414 paddw m0, [%2+mmsize] ; add the second half of the edge
415 DEFINE_ARGS dst, stride, stride3, cnt
416 lea stride3q, [strideq*3]
425 pshuflw m0, m0, q0000 ; replicate DC word across the low 4 lanes
428 mova [dstq+strideq*0+ 0], m0
429 mova [dstq+strideq*0+16], m0
430 mova [dstq+strideq*1+ 0], m0
431 mova [dstq+strideq*1+16], m0
432 mova [dstq+strideq*2+ 0], m0
433 mova [dstq+strideq*2+16], m0
434 mova [dstq+stride3q + 0], m0
435 mova [dstq+stride3q +16], m0
436 lea dstq, [dstq+strideq*4]

442 cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
; sum the full 32-pixel edge
443 mova m0, [%2+mmsize*0]
444 paddw m0, [%2+mmsize*1]
445 paddw m0, [%2+mmsize*2]
446 paddw m0, [%2+mmsize*3]
447 DEFINE_ARGS dst, stride, cnt
456 pshuflw m0, m0, q0000 ; replicate DC word across the low 4 lanes
459 mova [dstq+strideq*0+ 0], m0
460 mova [dstq+strideq*0+16], m0
461 mova [dstq+strideq*0+32], m0
462 mova [dstq+strideq*0+48], m0
463 mova [dstq+strideq*1+ 0], m0
464 mova [dstq+strideq*1+16], m0
465 mova [dstq+strideq*1+32], m0
466 mova [dstq+strideq*1+48], m0
467 lea dstq, [dstq+strideq*2]
; TrueMotion prediction.  Separate entry points per bit depth (10/12) are
; needed only for the clamping range; the _12 entry sets up its range and
; then jumps into the shared .body of the _10 function.

477 cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
485 DEFINE_ARGS dst, stride, stride3
486 lea stride3q, [strideq*3]
; m0..m3 hold the four computed rows
504 mova [dstq+strideq*0], m0
505 mova [dstq+strideq*1], m1
506 mova [dstq+strideq*2], m2
507 mova [dstq+stride3q ], m3

510 cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
; tail-call into the 10-bit body with the 12-bit clamp set up
512 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body

515 cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
521 pshuflw m0, m0, q1111 ; broadcast word 1 across the low 4 lanes
524 DEFINE_ARGS dst, stride, l, stride3, cnt
525 lea stride3q, [strideq*3]
546 mova [dstq+strideq*0], m0
547 mova [dstq+strideq*1], m1
548 mova [dstq+strideq*2], m2
549 mova [dstq+stride3q ], m3
550 lea dstq, [dstq+strideq*4]

555 cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
557 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body

560 cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
567 pshuflw m0, m0, q1111 ; broadcast word 1 across the low 4 lanes
571 DEFINE_ARGS dst, stride, l, cnt
; two rows per iteration, two stores per 16-pixel row
590 mova [dstq+strideq*0+ 0], m0
591 mova [dstq+strideq*0+16], m2
592 mova [dstq+strideq*1+ 0], m1
593 mova [dstq+strideq*1+16], m3
594 lea dstq, [dstq+strideq*2]

599 cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
601 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body

604 cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
; on x86-32 the clamp bounds live on the stack instead of in registers
616 %define reg_min [rsp+16]
617 %define reg_max [rsp+ 0]
; m4..m7 = the full 32-pixel above row
620 mova m4, [aq+mmsize*0]
621 mova m5, [aq+mmsize*1]
622 mova m6, [aq+mmsize*2]
623 mova m7, [aq+mmsize*3]
625 pshuflw m0, m0, q1111 ; broadcast word 1 across the low 4 lanes
631 DEFINE_ARGS dst, stride, l, cnt
634 pinsrw m3, [lq+cntq*2], 0 ; fetch current left pixel into lane 0
; one 32-pixel row per iteration
649 mova [dstq+strideq*0+ 0], m0
650 mova [dstq+strideq*0+16], m1
651 mova [dstq+strideq*0+32], m2
652 mova [dstq+strideq*0+48], m3

658 cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
660 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
662 ; Directional intra prediction functions
664 ; in the functions below, 'abcdefgh' refers to above data (sometimes simply
665 ; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
666 ; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
667 ; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; left=(left+2*center+right+2)>>2
671 %macro LOWPASS 3 ; left [dst], center, right

677 ; abcdefgh (src) -> bcdefghh (dst)
678 ; dst/src can be the same register
679 %macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
; SSSE3 path: one shuffle does the word shift with last-lane replication
681 pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
; pre-SSSE3 fallback: byte shift, then re-duplicate the top word
683 psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
684 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh

688 ; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
689 %macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
; SSSE3: apply the shift mask twice for the 1- and 2-word shifts
691 pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
692 pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
; fallback: byte shifts plus high-half fixups
694 psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
695 psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
696 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
697 pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
; Diagonal down-left: lowpass-filter the above row, then each successive
; output row is the previous one shifted down-left by one pixel, with the
; last pixel replicated once the edge runs out.

702 cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
704 movu m1, [aq] ; abcdefgh
705 pshufhw m0, m1, q3310 ; abcdefhh
706 SHIFT_RIGHT m1, m1 ; bcdefghh
707 psrldq m2, m1, 2 ; cdefghh.
708 LOWPASS 0, 1, 2 ; BCDEFGh.
709 pshufd m1, m0, q3321 ; DEFGh...
; rows 0/2 first, then shift and store rows 1/3
710 movh [dstq+strideq*0], m0
711 movh [dstq+strideq*2], m1
713 psrldq m0, 2 ; CDEFGh..
714 psrldq m1, 2 ; EFGh....
715 movh [dstq+strideq*0], m0
716 movh [dstq+strideq*2], m1

719 cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
721 mova m0, [aq] ; abcdefgh
723 mova m4, [pb_2to15_14_15] ; shared shift mask for SHIFT_RIGHT*
725 SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
726 LOWPASS 0, 1, 2 ; BCDEFGHh
727 shufps m1, m0, m2, q3332 ; FGHhhhhh
728 shufps m3, m0, m1, q2121 ; DEFGHhhh
729 DEFINE_ARGS dst, stride, stride5
730 lea stride5q, [strideq*5]
; rows are stored in pairs 4 apart, shifting one pixel between pairs
732 mova [dstq+strideq*0], m0
733 mova [dstq+strideq*4], m1
734 SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
735 pshuflw m1, m1, q3321 ; GHhhhhhh
736 pshufd m2, m0, q3321 ; EFGHhhhh
737 mova [dstq+strideq*1], m0
738 mova [dstq+stride5q ], m1
739 lea dstq, [dstq+strideq*2]
740 pshuflw m1, m1, q3321 ; Hhhhhhhh
741 mova [dstq+strideq*0], m3
742 mova [dstq+strideq*4], m1
743 pshuflw m1, m1, q3321 ; hhhhhhhh
744 mova [dstq+strideq*1], m2
745 mova [dstq+stride5q ], m1

748 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
750 mova m0, [aq] ; abcdefgh
751 mova m3, [aq+mmsize] ; ijklmnop
752 PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
753 PALIGNR m2, m3, m0, 4, m4 ; cdefghij
754 LOWPASS 0, 1, 2 ; BCDEFGHI
756 mova m4, [pb_2to15_14_15]
758 SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
759 LOWPASS 1, 2, 3 ; JKLMNOPp
760 pshufd m2, m2, q3333 ; pppppppp
761 DEFINE_ARGS dst, stride, cnt
; row n and row n+8 share data shifted by 8 pixels
765 mova [dstq+strideq*0+ 0], m0
766 mova [dstq+strideq*0+16], m1
767 mova [dstq+strideq*8+ 0], m1
768 mova [dstq+strideq*8+16], m2
; advance the sliding window by one pixel per row (avx vs sse paths)
771 vpalignr m0, m1, m0, 2
773 PALIGNR m3, m1, m0, 2, m4
776 SHIFT_RIGHT m1, m1, m4
; Diagonal down-left, 32x32: lowpass-filter all four 8-pixel groups of the
; above row, then emit rows as progressively shifted windows; m4 holds the
; replicated final pixel used once the edge is exhausted.
781 cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
783 mova m0, [aq+mmsize*0] ; abcdefgh
784 mova m1, [aq+mmsize*1] ; ijklmnop
785 mova m2, [aq+mmsize*2] ; qrstuvwx
786 mova m3, [aq+mmsize*3] ; yz012345
787 PALIGNR m4, m1, m0, 2, m6
788 PALIGNR m5, m1, m0, 4, m6
789 LOWPASS 0, 4, 5 ; BCDEFGHI
790 PALIGNR m4, m2, m1, 2, m6
791 PALIGNR m5, m2, m1, 4, m6
792 LOWPASS 1, 4, 5 ; JKLMNOPQ
793 PALIGNR m4, m3, m2, 2, m6
794 PALIGNR m5, m3, m2, 4, m6
795 LOWPASS 2, 4, 5 ; RSTUVWXY
797 mova m6, [pb_2to15_14_15]
799 SHIFT_RIGHTx2 m4, m5, m3, m6
800 LOWPASS 3, 4, 5 ; Z0123455
801 pshufd m4, m4, q3333 ; 55555555
802 DEFINE_ARGS dst, stride, stride8, stride24, cnt
804 lea stride8q, [strideq*8]
805 lea stride24q, [stride8q*3]
; rows 0/8/16/24 of the current iteration: each is the previous one
; advanced by 8 pixels, padding with the replicated last pixel (m4)
808 mova [dstq+stride8q*0+ 0], m0
809 mova [dstq+stride8q*0+16], m1
810 mova [dstq+stride8q*0+32], m2
811 mova [dstq+stride8q*0+48], m3
812 mova [dstq+stride8q*1+ 0], m1
813 mova [dstq+stride8q*1+16], m2
814 mova [dstq+stride8q*1+32], m3
815 mova [dstq+stride8q*1+48], m4
816 mova [dstq+stride8q*2+ 0], m2
817 mova [dstq+stride8q*2+16], m3
818 mova [dstq+stride8q*2+32], m4
819 mova [dstq+stride8q*2+48], m4
820 mova [dstq+stride24q + 0], m3
821 mova [dstq+stride24q +16], m4
822 mova [dstq+stride24q +32], m4
823 mova [dstq+stride24q +48], m4
; slide the whole 32-pixel window left by one pixel (avx vs sse paths)
826 vpalignr m0, m1, m0, 2
827 vpalignr m1, m2, m1, 2
828 vpalignr m2, m3, m2, 2
830 PALIGNR m5, m1, m0, 2, m6
832 PALIGNR m5, m2, m1, 2, m6
834 PALIGNR m5, m3, m2, 2, m6
837 SHIFT_RIGHT m3, m3, m6
850 %if HAVE_AVX2_EXTERNAL
; AVX2 down-left 16x16: whole 16-pixel rows fit in one ymm register, so
; each output row is a single vpalignr of the filtered row against its
; 8-pixel-advanced counterpart m2.
852 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
854 mova m0, [aq] ; abcdefghijklmnop
855 vpbroadcastw xm1, [aq+30] ; pppppppp
856 vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
857 vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
858 vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
859 LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
860 vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
861 DEFINE_ARGS dst, stride, stride3, cnt
863 lea stride3q, [strideq*3]
; rows 0..7: shift amount grows by 2 bytes (one pixel) per row
866 mova [dstq+strideq*0], m0
867 vpalignr m3, m2, m0, 2
868 vpalignr m4, m2, m0, 4
869 mova [dstq+strideq*1], m3
870 mova [dstq+strideq*2], m4
871 vpalignr m3, m2, m0, 6
872 vpalignr m4, m2, m0, 8
873 mova [dstq+stride3q ], m3
874 lea dstq, [dstq+strideq*4]
875 mova [dstq+strideq*0], m4
876 vpalignr m3, m2, m0, 10
877 vpalignr m4, m2, m0, 12
878 mova [dstq+strideq*1], m3
879 mova [dstq+strideq*2], m4
880 vpalignr m3, m2, m0, 14
881 mova [dstq+stride3q ], m3
882 lea dstq, [dstq+strideq*4]
884 vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
; AVX2 down-left 32x32: filter both 16-pixel halves of the above row,
; then produce each output row by shifting the two halves in lockstep
; (m5/m2 are the 8-pixel-advanced companions of m0/m1).
889 cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
891 mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop
892 mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345
893 vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555
894 vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx
895 vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq
896 vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr
897 LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ
898 vperm2i128 m5, m1, m4, q0201 ; yz01234555555555
899 vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455
900 vpalignr m3, m5, m1, 4 ; stuvwxyz01234555
901 LOWPASS 1, 2, 3 ; RSTUVWXYZ......5
902 vperm2i128 m2, m1, m4, q0201 ; Z......555555555
903 vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY
904 DEFINE_ARGS dst, stride, stride3, cnt
905 lea stride3q, [strideq*3]
; 8 rows per iteration; byte shift grows by 2 (one pixel) per row
909 mova [dstq+strideq*0 + 0], m0
910 mova [dstq+strideq*0 +32], m1
911 vpalignr m3, m5, m0, 2
912 vpalignr m4, m2, m1, 2
913 mova [dstq+strideq*1 + 0], m3
914 mova [dstq+strideq*1 +32], m4
915 vpalignr m3, m5, m0, 4
916 vpalignr m4, m2, m1, 4
917 mova [dstq+strideq*2 + 0], m3
918 mova [dstq+strideq*2 +32], m4
919 vpalignr m3, m5, m0, 6
920 vpalignr m4, m2, m1, 6
921 mova [dstq+stride3q*1+ 0], m3
922 mova [dstq+stride3q*1+32], m4
923 lea dstq, [dstq+strideq*4]
924 vpalignr m3, m5, m0, 8
925 vpalignr m4, m2, m1, 8
926 mova [dstq+strideq*0 + 0], m3
927 mova [dstq+strideq*0 +32], m4
928 vpalignr m3, m5, m0, 10
929 vpalignr m4, m2, m1, 10
930 mova [dstq+strideq*1 + 0], m3
931 mova [dstq+strideq*1 +32], m4
932 vpalignr m3, m5, m0, 12
933 vpalignr m4, m2, m1, 12
934 mova [dstq+strideq*2+ 0], m3
935 mova [dstq+strideq*2+32], m4
936 vpalignr m3, m5, m0, 14
937 vpalignr m4, m2, m1, 14
938 mova [dstq+stride3q+ 0], m3
939 mova [dstq+stride3q+ 32], m4
; advance the window by 8 pixels for the next iteration
940 vpalignr m3, m5, m0, 16
941 vpalignr m4, m2, m1, 16
942 vperm2i128 m5, m3, m4, q0201
943 vperm2i128 m2, m4, m4, q0101
946 lea dstq, [dstq+strideq*4]
; Diagonal down-right: build the filtered diagonal from left pixels,
; top-left (*/#), and above pixels, then write rows bottom-up, each one
; pixel further along the diagonal than the one below it.
952 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
953 cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
954 movh m0, [lq] ; wxyz....
955 movhps m0, [aq-2] ; wxyz*abc
956 movd m1, [aq+6] ; d.......
957 PALIGNR m1, m0, 2, m2 ; xyz*abcd
958 psrldq m2, m1, 2 ; yz*abcd.
959 LOWPASS 0, 1, 2 ; XYZ#ABC.
960 DEFINE_ARGS dst, stride, stride3
961 lea stride3q, [strideq*3]
; bottom row first; shift one pixel up the diagonal per row
963 movh [dstq+stride3q ], m0
964 psrldq m0, 2 ; YZ#ABC..
965 movh [dstq+strideq*2], m0
966 psrldq m0, 2 ; Z#ABC...
967 movh [dstq+strideq*1], m0
968 psrldq m0, 2 ; #ABC....
969 movh [dstq+strideq*0], m0

972 cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
973 mova m0, [lq] ; stuvwxyz
974 movu m1, [aq-2] ; *abcdefg
975 mova m2, [aq] ; abcdefgh
976 psrldq m3, m2, 2 ; bcdefgh.
977 LOWPASS 3, 2, 1 ; ABCDEFG.
978 PALIGNR m1, m0, 2, m4 ; tuvwxyz*
979 PALIGNR m2, m1, 2, m4 ; uvwxyz*a
980 LOWPASS 2, 1, 0 ; TUVWXYZ#
981 DEFINE_ARGS dst, stride, dst4, stride3
982 lea stride3q, [strideq*3]
983 lea dst4q, [dstq+strideq*4]
; rows 3/7 first, then walk the diagonal up by one pixel per step
985 movhps [dstq +stride3q +0], m2
986 movh [dstq+ stride3q +8], m3
987 mova [dst4q+stride3q +0], m2
988 PALIGNR m1, m3, m2, 2, m0
990 movhps [dstq +strideq*2+0], m1
991 movh [dstq+ strideq*2+8], m3
992 mova [dst4q+strideq*2+0], m1
993 PALIGNR m2, m3, m1, 2, m0
995 movhps [dstq +strideq*1+0], m2
996 movh [dstq+ strideq*1+8], m3
997 mova [dst4q+strideq*1+0], m2
998 PALIGNR m1, m3, m2, 2, m0
1000 movhps [dstq +strideq*0+0], m1
1001 movh [dstq+ strideq*0+8], m3
1002 mova [dst4q+strideq*0+0], m1

1005 cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
1006 mova m0, [lq] ; klmnopqr
1007 mova m1, [lq+mmsize] ; stuvwxyz
1008 movu m2, [aq-2] ; *abcdefg
1009 movu m3, [aq+mmsize-2] ; hijklmno
1010 mova m4, [aq] ; abcdefgh
1011 mova m5, [aq+mmsize] ; ijklmnop
1012 psrldq m6, m5, 2 ; jklmnop.
1013 LOWPASS 6, 5, 3 ; IJKLMNO.
1014 PALIGNR m5, m4, 2, m3 ; bcdefghi
1015 LOWPASS 5, 4, 2 ; ABCDEFGH
1016 PALIGNR m2, m1, 2, m3 ; tuvwxyz*
1017 PALIGNR m4, m2, 2, m3 ; uvwxyz*a
1018 LOWPASS 4, 2, 1 ; TUVWXYZ#
1019 PALIGNR m1, m0, 2, m3 ; lmnopqrs
1020 PALIGNR m2, m1, 2, m3 ; mnopqrst
1021 LOWPASS 2, 1, 0 ; LMNOPQRS
1022 DEFINE_ARGS dst, stride, dst8, cnt
1023 lea dst8q, [dstq+strideq*8]
; rows 0 and 8 of each iteration; then slide all windows by one pixel
1028 mova [dst8q+strideq*0+ 0], m4
1029 mova [dst8q+strideq*0+16], m5
1030 mova [dst8q+strideq*8+ 0], m2
1031 mova [dst8q+strideq*8+16], m4
1033 vpalignr m2, m4, m2, 2
1034 vpalignr m4, m5, m4, 2
1035 vpalignr m5, m6, m5, 2
1037 PALIGNR m0, m4, m2, 2, m1
1039 PALIGNR m0, m5, m4, 2, m1
1041 PALIGNR m0, m6, m5, 2, m1
; Down-right 32x32: filters all four groups of above pixels (A[0-31])
; and left pixels (L[1-31] plus top-left #), spilling intermediates to
; the stack via SCRATCH on register-starved targets (x86-32 / pre-SSSE3).
1049 cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
1050 %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
1051 mova m0, [aq+mmsize*3] ; a[24-31]
1052 movu m1, [aq+mmsize*3-2] ; a[23-30]
1053 psrldq m2, m0, 2 ; a[25-31].
1054 LOWPASS 2, 0, 1 ; A[24-30].
1055 mova m1, [aq+mmsize*2] ; a[16-23]
1056 movu m3, [aq+mmsize*2-2] ; a[15-22]
1057 PALIGNR m0, m1, 2, m4 ; a[17-24]
1058 LOWPASS 0, 1, 3 ; A[16-23]
1059 mova m3, [aq+mmsize*1] ; a[8-15]
1060 movu m4, [aq+mmsize*1-2] ; a[7-14]
1061 PALIGNR m1, m3, 2, m5 ; a[9-16]
1062 LOWPASS 1, 3, 4 ; A[8-15]
1063 mova m4, [aq+mmsize*0] ; a[0-7]
1064 movu m5, [aq+mmsize*0-2] ; *a[0-6]
1065 PALIGNR m3, m4, 2, m6 ; a[1-8]
1066 LOWPASS 3, 4, 5 ; A[0-7]
; park the filtered above data while the left edge is processed
1067 SCRATCH 1, 8, rsp+0*mmsize
1068 SCRATCH 3, 9, rsp+1*mmsize
1069 %if notcpuflag(ssse3)
1070 SCRATCH 0, 10, rsp+2*mmsize
1072 mova m6, [lq+mmsize*3] ; l[24-31]
1073 PALIGNR m5, m6, 2, m0 ; l[25-31]*
1074 PALIGNR m4, m5, 2, m0 ; l[26-31]*a
1075 LOWPASS 4, 5, 6 ; L[25-31]#
1076 mova m7, [lq+mmsize*2] ; l[16-23]
1077 PALIGNR m6, m7, 2, m0 ; l[17-24]
1078 PALIGNR m5, m6, 2, m0 ; l[18-25]
1079 LOWPASS 5, 6, 7 ; L[17-24]
1080 mova m1, [lq+mmsize*1] ; l[8-15]
1081 PALIGNR m7, m1, 2, m0 ; l[9-16]
1082 PALIGNR m6, m7, 2, m0 ; l[10-17]
1083 LOWPASS 6, 7, 1 ; L[9-16]
1084 mova m3, [lq+mmsize*0] ; l[0-7]
1085 PALIGNR m1, m3, 2, m0 ; l[1-8]
1086 PALIGNR m7, m1, 2, m0 ; l[2-9]
1087 LOWPASS 7, 1, 3 ; L[1-8]
1090 UNSCRATCH 1, 8, rsp+0*mmsize
1092 UNSCRATCH 3, 9, rsp+1*mmsize
1094 UNSCRATCH 0, 10, rsp+2*mmsize
1096 DEFINE_ARGS dst8, stride, stride8, stride24, cnt
1097 lea stride8q, [strideq*8]
1098 lea stride24q, [stride8q*3]
1099 lea dst8q, [dst8q+strideq*8]
1105 UNSCRATCH 1, 8, rsp+0*mmsize
1106 %if notcpuflag(ssse3)
1107 UNSCRATCH 3, 9, rsp+1*mmsize
; rows 7/15/23/31 of the iteration: m7..m0 form the 64-pixel diagonal
1110 mova [dst8q+stride8q*0+ 0], m4
1111 mova [dst8q+stride8q*0+16], m3
1112 mova [dst8q+stride8q*0+32], m1
1113 mova [dst8q+stride8q*0+48], m0
1114 mova [dst8q+stride8q*1+ 0], m5
1115 mova [dst8q+stride8q*1+16], m4
1116 mova [dst8q+stride8q*1+32], m3
1117 mova [dst8q+stride8q*1+48], m1
1118 mova [dst8q+stride8q*2+ 0], m6
1119 mova [dst8q+stride8q*2+16], m5
1120 mova [dst8q+stride8q*2+32], m4
1121 mova [dst8q+stride8q*2+48], m3
1122 mova [dst8q+stride24q + 0], m7
1123 mova [dst8q+stride24q +16], m6
1124 mova [dst8q+stride24q +32], m5
1125 mova [dst8q+stride24q +48], m4
; shift the entire diagonal chain up by one pixel (avx path)
1127 vpalignr m7, m6, m7, 2
1128 vpalignr m6, m5, m6, 2
1129 vpalignr m5, m4, m5, 2
1130 vpalignr m4, m3, m4, 2
1131 vpalignr m3, m1, m3, 2
1132 vpalignr m1, m0, m1, 2
1133 vpalignr m0, m2, m0, 2
; sse path: same shift, juggling spill slots to free a scratch register
1135 SCRATCH 2, 8, rsp+0*mmsize
1136 %if notcpuflag(ssse3)
1137 SCRATCH 0, 9, rsp+1*mmsize
1139 PALIGNR m2, m6, m7, 2, m0
1141 PALIGNR m2, m5, m6, 2, m0
1143 PALIGNR m2, m4, m5, 2, m0
1145 PALIGNR m2, m3, m4, 2, m0
1147 PALIGNR m2, m1, m3, 2, m0
1149 %if notcpuflag(ssse3)
1150 UNSCRATCH 0, 9, rsp+1*mmsize
1151 SCRATCH 3, 9, rsp+1*mmsize
1153 PALIGNR m2, m0, m1, 2, m3
1155 UNSCRATCH 2, 8, rsp+0*mmsize
1156 SCRATCH 1, 8, rsp+0*mmsize
1157 PALIGNR m1, m2, m0, 2, m3
1173 %if HAVE_AVX2_EXTERNAL
; AVX2 down-right 16x16: the whole 32-pixel diagonal (L,#,A) lives in
; m0/m5/m1; each row is one vpalignr, stored in the order noted in the
; trailing "; N" comments (N = output row index).
1175 cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
1176 mova m0, [lq] ; klmnopqrstuvwxyz
1177 movu m1, [aq-2] ; *abcdefghijklmno
1178 mova m2, [aq] ; abcdefghijklmnop
1179 vperm2i128 m4, m2, m2, q2001 ; ijklmnop........
1180 vpalignr m5, m4, m2, 2 ; bcdefghijklmnop.
1181 vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg
1182 LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO.
1183 vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz*
1184 vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a
1185 LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ#
1186 vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH
1187 DEFINE_ARGS dst, stride, stride3, stride5, dst3
1188 lea dst3q, [dstq+strideq*4]
1189 lea stride3q, [strideq*3]
1190 lea stride5q, [stride3q+strideq*2]
1192 vpalignr m3, m5, m0, 2
1193 vpalignr m4, m1, m5, 2
1194 mova [dst3q+stride5q*2], m3 ; 14
1195 mova [ dstq+stride3q*2], m4 ; 6
1196 vpalignr m3, m5, m0, 4
1197 vpalignr m4, m1, m5, 4
1199 mova [dst3q+stride5q*2], m3 ; 13
1200 mova [dst3q+strideq*2 ], m4 ; 5
1201 mova [dst3q+stride3q*4], m0 ; 15
1202 vpalignr m3, m5, m0, 6
1203 vpalignr m4, m1, m5, 6
1204 mova [dstq+stride3q*4], m3 ; 12
1205 mova [dst3q+strideq*1], m4 ; 4
1206 vpalignr m3, m5, m0, 8
1207 vpalignr m4, m1, m5, 8
1208 mova [dst3q+strideq*8], m3 ; 11
1209 mova [dst3q+strideq*0], m4 ; 3
1210 vpalignr m3, m5, m0, 10
1211 vpalignr m4, m1, m5, 10
1212 mova [dstq+stride5q*2], m3 ; 10
1213 mova [dstq+strideq*2 ], m4 ; 2
1214 vpalignr m3, m5, m0, 12
1215 vpalignr m4, m1, m5, 12
1216 mova [dst3q+stride3q*2], m3 ; 9
1217 mova [dstq+strideq*1 ], m4 ; 1
1218 vpalignr m3, m5, m0, 14
1219 vpalignr m4, m1, m5, 14
1220 mova [dstq+strideq*8], m3 ; 8
1221 mova [dstq+strideq*0], m4 ; 0
1222 mova [dst3q+strideq*4], m5 ; 7
; AVX2 down-right 32x32: the 64-pixel diagonal L[0-31]/#/A[0-31] is kept
; in m0,m1,m2,m3 with helper registers m4/m5/m8 holding the cross-lane
; shifted companions needed by vpalignr.  Each iteration stores four
; rows (indices in the "; a b c d" comments), one pixel apart.
1226 cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
1227 mova m0, [lq+mmsize*0+0] ; l[0-15]
1228 mova m1, [lq+mmsize*1+0] ; l[16-31]
1229 movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno
1230 mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop
1231 mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345
1232 vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0
1233 vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01
1234 vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012
1235 LOWPASS 0, 6, 7 ; L[0-15]
1236 vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg
1237 vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz*
1238 vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a
1239 LOWPASS 1, 5, 6 ; L[16-31]#
1240 vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx
1241 vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq
1242 LOWPASS 2, 3, 6 ; A[0-15]
1243 movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234
1244 vperm2i128 m6, m4, m4, q2001 ; yz012345........
1245 vpalignr m7, m6, m4, 2 ; rstuvwxyz012345.
1246 LOWPASS 3, 4, 7 ; A[16-31].
1247 vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH
1248 vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23]
1249 vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX
1250 DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
1251 lea stride3q, [strideq*3]
1252 lea stride5q, [stride3q+strideq*2]
1253 lea stride7q, [strideq*4+stride3q]
1254 lea dst24q, [dst8q+stride3q*8]
1255 lea dst8q, [dst8q+strideq*8]
1259 mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7
1260 mova [dst24q+stride7q+32], m1
1261 mova [dst8q+stride7q+0], m1
1262 mova [dst8q+stride7q+32], m2
1263 vpalignr m6, m4, m1, 2
1264 vpalignr m7, m5, m0, 2
1265 vpalignr m9, m8, m2, 2
1266 mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
1267 mova [dst24q+stride3q*2+32], m6
1268 mova [dst8q+stride3q*2+0], m6
1269 mova [dst8q+stride3q*2+32], m9
1270 vpalignr m6, m4, m1, 4
1271 vpalignr m7, m5, m0, 4
1272 vpalignr m9, m8, m2, 4
1273 mova [dst24q+stride5q+0], m7 ; 29 21 13 5
1274 mova [dst24q+stride5q+32], m6
1275 mova [dst8q+stride5q+0], m6
1276 mova [dst8q+stride5q+32], m9
1277 vpalignr m6, m4, m1, 6
1278 vpalignr m7, m5, m0, 6
1279 vpalignr m9, m8, m2, 6
1280 mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4
1281 mova [dst24q+strideq*4+32], m6
1282 mova [dst8q+strideq*4+0], m6
1283 mova [dst8q+strideq*4+32], m9
1284 vpalignr m6, m4, m1, 8
1285 vpalignr m7, m5, m0, 8
1286 vpalignr m9, m8, m2, 8
1287 mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3
1288 mova [dst24q+stride3q+32], m6
1289 mova [dst8q+stride3q+0], m6
1290 mova [dst8q+stride3q+32], m9
1291 vpalignr m6, m4, m1, 10
1292 vpalignr m7, m5, m0, 10
1293 vpalignr m9, m8, m2, 10
1294 mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2
1295 mova [dst24q+strideq*2+32], m6
1296 mova [dst8q+strideq*2+0], m6
1297 mova [dst8q+strideq*2+32], m9
1298 vpalignr m6, m4, m1, 12
1299 vpalignr m7, m5, m0, 12
1300 vpalignr m9, m8, m2, 12
1301 mova [dst24q+strideq+0 ], m7 ; 25 17 9 1
1302 mova [dst24q+strideq+32], m6
1303 mova [dst8q+strideq+0], m6
1304 mova [dst8q+strideq+32], m9
1305 vpalignr m6, m4, m1, 14
1306 vpalignr m7, m5, m0, 14
1307 vpalignr m9, m8, m2, 14
1308 mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0
1309 mova [dst24q+strideq*0+32], m6
1310 mova [dst8q+strideq*0+0], m6
1311 mova [dst8q+strideq*0+32], m9
; step back up for the next group of rows
1318 sub dst24q, stride7q
; Vertical-left: even rows use the rounded average (pavgw) of adjacent
; above pixels, odd rows the 3-tap lowpass; each successive row pair is
; shifted one pixel further left.
1328 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
1329 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
1331 movu m0, [aq] ; abcdefgh
1332 psrldq m1, m0, 2 ; bcdefgh.
1333 psrldq m2, m0, 4 ; cdefgh..
1334 LOWPASS 2, 1, 0 ; BCDEFGH.
1335 pavgw m1, m0 ; ABCDEFG.
1336 DEFINE_ARGS dst, stride, stride3
1337 lea stride3q, [strideq*3]
1339 movh [dstq+strideq*0], m1
1340 movh [dstq+strideq*1], m2
1343 movh [dstq+strideq*2], m1
1344 movh [dstq+stride3q ], m2

1347 cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
1349 mova m0, [aq] ; abcdefgh
1351 mova m3, [pb_2to15_14_15] ; shared shift mask
1353 SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
1354 LOWPASS 2, 1, 0 ; BCDEFGHh
1355 pavgw m1, m0 ; ABCDEFGh
1356 DEFINE_ARGS dst, stride, stride3
1357 lea stride3q, [strideq*3]
; alternate avg/lowpass rows, shifting one pixel every two rows
1359 mova [dstq+strideq*0], m1
1360 mova [dstq+strideq*1], m2
1361 SHIFT_RIGHT m1, m1, m3
1362 SHIFT_RIGHT m2, m2, m3
1363 mova [dstq+strideq*2], m1
1364 mova [dstq+stride3q ], m2
1365 lea dstq, [dstq+strideq*4]
1366 SHIFT_RIGHT m1, m1, m3
1367 SHIFT_RIGHT m2, m2, m3
1368 mova [dstq+strideq*0], m1
1369 mova [dstq+strideq*1], m2
1370 SHIFT_RIGHT m1, m1, m3
1371 SHIFT_RIGHT m2, m2, m3
1372 mova [dstq+strideq*2], m1
1373 mova [dstq+stride3q ], m2

1376 cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
1379 mova m1, [aq+mmsize]
1380 PALIGNR m2, m1, m0, 2, m3
1381 PALIGNR m3, m1, m0, 4, m4
1385 mova m4, [pb_2to15_14_15]
1387 SHIFT_RIGHTx2 m5, m0, m1, m4
1390 DEFINE_ARGS dst, stride, cnt
; m2/m3 are the avg/lowpass left halves, m1/m0 the right halves
1394 mova [dstq+strideq*0+ 0], m2
1395 mova [dstq+strideq*0+16], m1
1396 mova [dstq+strideq*1+ 0], m3
1397 mova [dstq+strideq*1+16], m0
1398 lea dstq, [dstq+strideq*2]
; advance both row pairs by one pixel (avx vs sse paths)
1400 vpalignr m2, m1, m2, 2
1401 vpalignr m3, m0, m3, 2
1403 PALIGNR m5, m1, m2, 2, m4
1405 PALIGNR m5, m0, m3, 2, m4
1408 SHIFT_RIGHT m1, m1, m4
1409 SHIFT_RIGHT m0, m0, m4
; Vertical-left 32x32: avg rows in m6/m4/m2/m0 (plus spill slot 8) and
; lowpass rows in m7/m5/m3/m1; m9 holds the replicated final pixel for
; the right edge of the lower half; register juggling via SCRATCH keeps
; this viable on x86-32 / pre-SSSE3.
1414 cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
1416 mova m0, [aq+mmsize*0]
1417 mova m1, [aq+mmsize*1]
1418 mova m2, [aq+mmsize*2]
1419 PALIGNR m6, m1, m0, 2, m5
1420 PALIGNR m7, m1, m0, 4, m5
1423 SCRATCH 6, 8, rsp+0*mmsize
1424 PALIGNR m4, m2, m1, 2, m0
1425 PALIGNR m5, m2, m1, 4, m0
1428 mova m0, [aq+mmsize*3]
1429 PALIGNR m1, m0, m2, 2, m6
1430 PALIGNR m3, m0, m2, 4, m6
1434 PRELOAD 10, pb_2to15_14_15, shuf
1436 SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
1440 pshufd m9, m6, q3333 ; replicate the final above pixel
1443 UNSCRATCH 6, 8, rsp+0*mmsize
1445 DEFINE_ARGS dst, stride, cnt, stride16, stride17
1446 mov stride16q, strideq
1449 lea stride17q, [stride16q+strideq]
1451 ; FIXME m8 is unused for avx, so we could save one register here for win64
1454 UNSCRATCH 6, 8, rsp+0*mmsize
; row pair n/n+1 and, 16 rows down, the same data shifted 16 pixels
1456 mova [dstq+strideq*0+ 0], m6
1457 mova [dstq+strideq*0+16], m4
1458 mova [dstq+strideq*0+32], m2
1459 mova [dstq+strideq*0+48], m0
1460 mova [dstq+strideq*1+ 0], m7
1461 mova [dstq+strideq*1+16], m5
1462 mova [dstq+strideq*1+32], m3
1463 mova [dstq+strideq*1+48], m1
1464 mova [dstq+stride16q+ 0], m4
1465 mova [dstq+stride16q+16], m2
1466 mova [dstq+stride16q+32], m0
1468 mova [dstq+stride16q+48], m9
1470 mova [dstq+stride17q+ 0], m5
1471 mova [dstq+stride17q+16], m3
1472 mova [dstq+stride17q+32], m1
1474 mova [dstq+stride17q+48], m9
1476 lea dstq, [dstq+strideq*2]
; advance all six live windows by one pixel (avx path)
1478 vpalignr m6, m4, m6, 2
1479 vpalignr m4, m2, m4, 2
1480 vpalignr m2, m0, m2, 2
1481 vpalignr m7, m5, m7, 2
1482 vpalignr m5, m3, m5, 2
1483 vpalignr m3, m1, m3, 2
; sse path: same shifts through the spill slots
1485 SCRATCH 3, 8, rsp+0*mmsize
1486 %if notcpuflag(ssse3)
1487 SCRATCH 1, 10, rsp+1*mmsize
1489 PALIGNR m3, m4, m6, 2, m1
1491 PALIGNR m3, m2, m4, 2, m1
1493 PALIGNR m3, m0, m2, 2, m1
1495 PALIGNR m3, m5, m7, 2, m1
1497 UNSCRATCH 3, 8, rsp+0*mmsize
1498 SCRATCH 6, 8, rsp+0*mmsize
1499 %if notcpuflag(ssse3)
1500 UNSCRATCH 1, 10, rsp+1*mmsize
1501 SCRATCH 7, 10, rsp+1*mmsize
1503 PALIGNR m6, m3, m5, 2, m7
1505 PALIGNR m6, m1, m3, 2, m7
1507 %if notcpuflag(ssse3)
1508 UNSCRATCH 7, 10, rsp+1*mmsize
1511 SHIFT_RIGHT m1, m1, reg_shuf
1512 SHIFT_RIGHT m0, m0, reg_shuf
1517 DEFINE_ARGS dst, stride, stride3
1518 lea stride3q, [strideq*3]
; bottom-right corner: remaining columns are all the replicated pixel
1521 mova [dstq+strideq*0+48], m0
1522 mova [dstq+strideq*1+48], m0
1523 mova [dstq+strideq*2+48], m0
1524 mova [dstq+stride3q +48], m0
1526 lea dstq, [dstq+strideq*4]
; vp9_ipred_vr_4x4_16: vertical-right (VR) intra prediction, 4x4 block, 16 bpp.
; Args: dst = output block, stride = row pitch in bytes, l = left edge, a = above edge.
; NOTE(review): this view is gapped — the loads that fill m0/m1 from [lq]/[aq]
; before the first PALIGNR are not visible here; comments cover only visible lines.
1542 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
; Concatenate left+above pixels, then build the two output row flavors:
; LOWPASS (3-tap filtered diagonal, per the LOWPASS macro defined earlier in
; this file — presumably (a+2b+c+2)>>2, confirm) and pavgw (2-tap average).
1545 PALIGNR m0, m1, 10, m2 ; xyz*abcd
1546 pslldq m1, m0, 2 ; .xyz*abc
1547 pslldq m2, m0, 4 ; ..xyz*ab
1548 LOWPASS 2, 1, 0 ; ..YZ#ABC
1549 pavgw m1, m0 ; ....#ABC
1550 DEFINE_ARGS dst, stride, stride3
1551 lea stride3q, [strideq*3]
; Rows 0/1 come from the high halves of the averaged/filtered vectors.
1553 movhps [dstq+strideq*0], m1
1554 movhps [dstq+strideq*1], m2
; Rows 2/3 shift the pattern down-right by one; the ssse3 path uses the
; pb_4_5_8to13_8x0 shuffle, the non-ssse3 path (gapped here) uses pshuflw.
1555 shufps m0, m2, m1, q3210
1557 pshufb m2, [pb_4_5_8to13_8x0]
1559 pshuflw m2, m2, q2222
1563 movh [dstq+strideq*2], m0
1564 movh [dstq+stride3q ], m2
; vp9_ipred_vr_8x8_16: vertical-right (VR) intra prediction, 8x8 block, 16 bpp.
; m0/m3 hold the two alternating "even/odd row" predictors; m4 holds the
; left-edge words that are shifted in (PALIGNR by 14 = one word) every two rows.
; NOTE(review): gapped view — the LOWPASS/pavgw filtering between the loads and
; the stores (orig lines 1572-1576, 1579, ...) is not visible here.
1567 cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
1568 movu m1, [aq-2] ; *abcdefg
1569 movu m2, [lq] ; stuvwxyz
1570 mova m0, [aq] ; abcdefgh
1571 PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
1574 PALIGNR m1, m2, 2, m4 ; tuvwxyz*
1575 pslldq m4, m2, 2 ; .stuvwxy
1577 DEFINE_ARGS dst, stride, stride3
1578 lea stride3q, [strideq*3]
; Rows 0-3: store predictors, then rotate one left-edge word into each.
1580 mova [dstq+strideq*0], m0
1581 mova [dstq+strideq*1], m3
1582 PALIGNR m0, m4, 14, m1
1584 PALIGNR m3, m4, 14, m1
1586 mova [dstq+strideq*2], m0
1587 mova [dstq+stride3q ], m3
1588 lea dstq, [dstq+strideq*4]
; Rows 4-7: same pattern, unrolled (no loop needed for 8x8).
1589 PALIGNR m0, m4, 14, m1
1591 PALIGNR m3, m4, 14, m1
1593 mova [dstq+strideq*0], m0
1594 mova [dstq+strideq*1], m3
1595 PALIGNR m0, m4, 14, m1
1597 PALIGNR m3, m4, 14, m4
1598 mova [dstq+strideq*2], m0
1599 mova [dstq+stride3q ], m3
; vp9_ipred_vr_16x16_16: vertical-right (VR) intra prediction, 16x16, 16 bpp.
; Edge naming in the comments: a..p = above row, l..z = left column (one
; letter per 16-bit pixel); uppercase = LOWPASS-filtered value.
; NOTE(review): gapped view — several LOWPASS/pavgw lines and the .loop label
; with its counter init/decrement (orig 1610-1613, 1616, 1620-1630, 1638,
; 1641-1643) are not visible; the visible PALIGNRs are the per-iteration
; "shift one left-edge word in" step of that loop.
1602 cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
1603 movu m1, [aq-2] ; *abcdefg
1604 movu m2, [aq+mmsize-2] ; hijklmno
1605 mova m3, [aq] ; abcdefgh
1606 mova m4, [aq+mmsize] ; ijklmnop
1607 mova m5, [lq+mmsize] ; stuvwxyz
1608 PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
1609 movu m6, [aq+mmsize-4] ; ghijklmn
1614 PALIGNR m1, m5, 2, m7 ; tuvwxyz*
1615 movu m7, [lq+mmsize-2] ; rstuvwxy
1617 movu m5, [lq+2] ; lmnopqrs
1618 pslldq m4, m5, 2 ; .lmnopqr
1619 pslldq m7, m5, 4 ; ..lmnopq
1627 DEFINE_ARGS dst, stride, cnt
; Loop body: write two 16-pixel rows (two mmsize halves each), advance dst,
; then rotate the predictors right by one word for the next two rows.
1631 mova [dstq+strideq*0+ 0], m3
1632 mova [dstq+strideq*0+16], m2
1633 mova [dstq+strideq*1+ 0], m0
1634 mova [dstq+strideq*1+16], m6
1635 lea dstq, [dstq+strideq*2]
1636 PALIGNR m2, m3, 14, m4
1637 PALIGNR m3, m7, 14, m4
1639 PALIGNR m6, m0, 14, m4
1640 PALIGNR m0, m5, 14, m4
; vp9_ipred_vr_32x32_16: vertical-right (VR) intra prediction, 32x32, 16 bpp.
; Needs 14 xmm registers on x86-64; on x86-32 six stack slots stand in for
; m8-m13 via the SCRATCH/UNSCRATCH macros (hence 6*mmsize*ARCH_X86_32 stack).
; Comment notation: a[N] = above pixel N, l[N] = left pixel N,
; A[..]/L[..] = LOWPASS-filtered, '#' = filtered corner, '*' = top-left pixel.
; NOTE(review): gapped view — loop labels, counter setup (orig 1704-1710),
; several filter steps (1655, 1660, 1665, 1671, 1679-1684, 1694-1700) and the
; loop-close/jump lines are not visible here.
1646 cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
; --- build the filtered above-row segments, spilling them to m8-m10 ---
1647 movu m0, [aq+mmsize*0-2] ; *a[0-6]
1648 movu m1, [aq+mmsize*1-2] ; a[7-14]
1649 movu m2, [aq+mmsize*2-2] ; a[15-22]
1650 movu m3, [aq+mmsize*3-2] ; a[23-30]
1651 mova m4, [aq+mmsize*3+0] ; a[24-31]
1652 movu m5, [aq+mmsize*3-4] ; a[22-29]
1653 LOWPASS 5, 3, 4 ; A[23-30]
1654 SCRATCH 5, 8, rsp+0*mmsize
1656 mova m4, [aq+mmsize*2+0] ; a[16-23]
1657 movu m6, [aq+mmsize*2-4] ; a[14-21]
1658 LOWPASS 6, 2, 4 ; A[15-22]
1659 SCRATCH 6, 9, rsp+1*mmsize
1661 mova m4, [aq+mmsize*1+0] ; a[8-15]
1662 movu m7, [aq+mmsize*1-4] ; a[6-13]
1663 LOWPASS 7, 1, 4 ; A[7-14]
1664 SCRATCH 7, 10, rsp+2*mmsize
1666 mova m4, [aq+mmsize*0+0] ; a[0-7]
1667 mova m5, [lq+mmsize*3+0] ; l[24-31]
1668 PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
1669 LOWPASS 6, 0, 4 ; #A[0-6]
1670 SCRATCH 6, 11, rsp+3*mmsize
; --- build the filtered left-column segments ---
1672 PALIGNR m0, m5, 2, m7 ; l[25-31]*
1673 movu m7, [lq+mmsize*3-2] ; l[23-30]
1674 LOWPASS 0, 5, 7 ; L[24-31]
1675 movu m5, [lq+mmsize*2-2] ; l[15-22]
1676 mova m7, [lq+mmsize*2+0] ; l[16-23]
1677 movu m6, [lq+mmsize*2+2] ; l[17-24]
1678 LOWPASS 5, 7, 6 ; L[16-23]
1685 SCRATCH 5, 12, rsp+4*mmsize
1686 SCRATCH 6, 13, rsp+5*mmsize
1687 movu m6, [lq+mmsize*1-2] ; l[7-14]
1688 mova m0, [lq+mmsize*1+0] ; l[8-15]
1689 movu m5, [lq+mmsize*1+2] ; l[9-16]
1690 LOWPASS 6, 0, 5 ; L[8-15]
1691 movu m0, [lq+mmsize*0+2] ; l[1-8]
1692 pslldq m5, m0, 2 ; .l[1-7]
1693 pslldq m7, m0, 4 ; ..l[1-6]
1701 UNSCRATCH 6, 13, rsp+5*mmsize
; stride16/stride17 hold 16*stride and 17*stride so one iteration can write
; to both the top half and the bottom half of the 32-row block.
1702 DEFINE_ARGS dst, stride, stride16, cnt, stride17
1703 mov stride16q, strideq
1707 lea stride17q, [stride16q+strideq]
; --- main loop body (label not visible in this view): store 4 rows ---
1711 mova [dstq+strideq*0+ 0], m4
1712 mova [dstq+strideq*0+16], m1
1713 mova [dstq+strideq*0+32], m2
1714 mova [dstq+strideq*0+48], m3
1716 mova [dstq+strideq*1+ 0], m11
1717 mova [dstq+strideq*1+16], m10
1718 mova [dstq+strideq*1+32], m9
1719 mova [dstq+strideq*1+48], m8
1721 mova [dstq+stride16q+ 0], m6
1722 mova [dstq+stride16q+16], m4
1723 mova [dstq+stride16q+32], m1
1724 mova [dstq+stride16q+48], m2
1726 mova [dstq+stride17q+ 0], m12
1727 mova [dstq+stride17q+16], m11
1728 mova [dstq+stride17q+32], m10
1729 mova [dstq+stride17q+48], m9
1731 lea dstq, [dstq+strideq*2]
; Rotate every predictor right by one word (PALIGNR by 14 bytes), feeding
; the next left-edge word in at the low end — the per-2-rows VR shift.
1732 PALIGNR m3, m2, 14, m5
1733 PALIGNR m2, m1, 14, m5
1734 PALIGNR m1, m4, 14, m5
1735 PALIGNR m4, m6, 14, m5
1736 PALIGNR m6, m7, 14, m5
1739 PALIGNR m8, m9, 14, m5
1740 PALIGNR m9, m10, 14, m5
1741 PALIGNR m10, m11, 14, m5
1742 PALIGNR m11, m12, 14, m5
1743 PALIGNR m12, m0, 14, m5
; --- x86-32 fallback path (presumably; register set differs) ---
1750 UNSCRATCH 5, 12, rsp+4*mmsize
1751 UNSCRATCH 4, 11, rsp+3*mmsize
1752 UNSCRATCH 3, 10, rsp+2*mmsize
1753 UNSCRATCH 2, 9, rsp+1*mmsize
1754 UNSCRATCH 1, 8, rsp+0*mmsize
1759 mova [dstq+strideq*0+ 0], m4
1760 mova [dstq+strideq*0+16], m3
1761 mova [dstq+strideq*0+32], m2
1762 mova [dstq+strideq*0+48], m1
1763 mova [dstq+stride16q+ 0], m5
1764 mova [dstq+stride16q+16], m4
1765 mova [dstq+stride16q+32], m3
1766 mova [dstq+stride16q+48], m2
1767 lea dstq, [dstq+strideq*2]
1768 PALIGNR m1, m2, 14, m6
1769 PALIGNR m2, m3, 14, m6
1770 PALIGNR m3, m4, 14, m6
1771 PALIGNR m4, m5, 14, m6
1772 PALIGNR m5, m0, 14, m6
; HU_FUNCS: instantiates the horizontal-up (HU) predictors for one SIMD
; flavor; %1 = stack bytes the 32x32 function needs on 32-bit builds.
1787 %macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; vp9_ipred_hu_4x4_16: horizontal-up intra prediction, 4x4, 16 bpp.
; Only the left edge (l) is used; the last pixel 'd' is replicated to pad.
; NOTE(review): gapped view — the non-ssse3 padding path between orig lines
; 1789 and 1796 is only partially visible.
1788 cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
1789 movh m0, [lq] ; abcd
; ssse3: replicate 'd' with one pshufb; otherwise pshufhw does the padding.
1791 pshufb m0, [pb_0to7_67x4] ; abcddddd
1794 pshufhw m0, m0, q3333 ; abcddddd
1796 psrldq m1, m0, 2 ; bcddddd.
1797 psrldq m2, m0, 4 ; cddddd..
1798 LOWPASS 2, 1, 0 ; BCDddd..
1799 pavgw m1, m0 ; abcddddd
; Interleave avg/filtered values so each row is (avg, lowpass, avg, lowpass).
1800 SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
1801 PALIGNR m2, m1, 4, m0 ; bCcDdddd
1802 DEFINE_ARGS dst, stride, stride3
1803 lea stride3q, [strideq*3]
1805 movh [dstq+strideq*0], m1 ; aBbC
1806 movh [dstq+strideq*1], m2 ; bCcD
1807 movhps [dstq+strideq*2], m1 ; cDdd
1808 movhps [dstq+stride3q ], m2 ; dddd
; vp9_ipred_hu_8x8_16: horizontal-up intra prediction, 8x8, 16 bpp.
; SHIFT_RIGHTx2 (macro defined earlier in this file, not visible here)
; presumably produces the 1- and 2-word right-shifted copies used by
; LOWPASS/pavgw — confirm against the macro definition.
; NOTE(review): gapped view — the [lq] load, the LOWPASS/pavgw pair
; (orig 1812-1818) and the second-half shift for non-avx (1832-1835) are
; not visible.
1811 cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
1814 mova m3, [pb_2to15_14_15]
1816 SHIFT_RIGHTx2 m1, m2, m0, m3
; Interleave the averaged/filtered halves into per-row order.
1819 SBUTTERFLY wd, 1, 2, 0
1820 shufps m0, m1, m2, q1032
1821 pshufd m3, m2, q3332
1822 DEFINE_ARGS dst, stride, stride3
1823 lea stride3q, [strideq*3]
; Even rows 0/2/4/6 of the first pass (odd rows stored in the gapped part).
1825 mova [dstq+strideq *0], m1
1826 mova [dstq+strideq *2], m0
1827 mova [dstq+strideq *4], m2
1828 mova [dstq+stride3q*2], m3
; Shift the diagonal by one pair (avx 3-operand form; PALIGNR fallback below).
1831 vpalignr m1, m2, m1, 4
1833 PALIGNR m0, m2, m1, 4, m3
1836 pshufd m2, m2, q3321
1837 shufps m0, m1, m2, q1032
1838 pshufd m3, m2, q3332
1839 mova [dstq+strideq *0], m1
1840 mova [dstq+strideq *2], m0
1841 mova [dstq+strideq *4], m2
1842 mova [dstq+stride3q*2], m3
; vp9_ipred_hu_16x16_16: horizontal-up intra prediction, 16x16, 16 bpp.
; Uses one extra register when pshufb (ssse3) is unavailable, hence
; "6 + notcpuflag(ssse3)".
; NOTE(review): gapped view — the first-half loads/filters (orig 1846-1851,
; 1853, 1855, 1857-1858), the loop label/counter (1863-1865) and the loop
; close (1874-1875, 1888+) are not visible; the visible stores/shifts are
; the loop body.
1845 cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
1847 mova m3, [lq+mmsize]
1852 SBUTTERFLY wd, 1, 2, 0
1854 mova m5, [pb_2to15_14_15]
1856 SHIFT_RIGHTx2 m0, m4, m3, m5
1859 SBUTTERFLY wd, 3, 4, 5
1860 pshufd m0, m0, q3333 ; broadcast last pixel (bottom padding)
1861 DEFINE_ARGS dst, stride, stride3, cnt
1862 lea stride3q, [strideq*3]
; Loop body: m1..m4 slide down one register per 4 rows; m0 is the pad.
1866 mova [dstq+strideq *0+ 0], m1
1867 mova [dstq+strideq *0+16], m2
1868 mova [dstq+strideq *4+ 0], m2
1869 mova [dstq+strideq *4+16], m3
1870 mova [dstq+strideq *8+ 0], m3
1871 mova [dstq+strideq *8+16], m4
1872 mova [dstq+stride3q*4+ 0], m4
1873 mova [dstq+stride3q*4+16], m0
; Advance the diagonal by one (avg,lowpass) pair: avx 3-operand vpalignr,
; with the PALIGNR-macro fallback path for pre-avx below.
1876 vpalignr m1, m2, m1, 4
1877 vpalignr m2, m3, m2, 4
1878 vpalignr m3, m4, m3, 4
1879 vpalignr m4, m0, m4, 4
1881 PALIGNR m5, m2, m1, 4, m6
1883 PALIGNR m5, m3, m2, 4, m6
1885 PALIGNR m5, m4, m3, 4, m6
1887 PALIGNR m5, m0, m4, 4, m6
; vp9_ipred_hu_32x32_16: horizontal-up intra prediction, 32x32, 16 bpp.
; The whole 32-entry left edge is filtered into 8 interleaved vectors; on
; x86-32 the overflow lives in %1 bytes of stack (SCRATCH slots + two
; explicit rsp spills that juggle m1 between the two half-blocks).
; NOTE(review): gapped view — LOWPASS/pavgw filter lines (orig 1899-1900,
; 1906-1907, 1912-1913, 1917-1922), the loop labels/counters (1932-1937,
; 1940, 1953, 1973-1977, 1980) and the loop closes are not visible.
1894 cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
1895 %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
; --- load + filter the four 8-pixel segments of the left edge ---
1896 mova m2, [lq+mmsize*0+0]
1897 movu m1, [lq+mmsize*0+2]
1898 movu m0, [lq+mmsize*0+4]
1901 SBUTTERFLY wd, 1, 0, 2
1902 SCRATCH 1, 8, rsp+0*mmsize
1903 mova m4, [lq+mmsize*1+0]
1904 movu m3, [lq+mmsize*1+2]
1905 movu m2, [lq+mmsize*1+4]
1908 SBUTTERFLY wd, 3, 2, 4
1909 mova m6, [lq+mmsize*2+0]
1910 movu m5, [lq+mmsize*2+2]
1911 movu m4, [lq+mmsize*2+4]
1914 SBUTTERFLY wd, 5, 4, 6
1915 mova m7, [lq+mmsize*3+0]
1916 SCRATCH 0, 9, rsp+1*mmsize
1918 mova m0, [pb_2to15_14_15]
1920 SHIFT_RIGHTx2 m1, m6, m7, m0
1923 SBUTTERFLY wd, 7, 6, 0
1924 pshufd m1, m1, q3333 ; broadcast last pixel (bottom padding)
1925 UNSCRATCH 0, 9, rsp+1*mmsize
; Precomputed row offsets: 3, 4, 20 and 28 rows (28 = 32-4).
1926 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
1927 lea stride3q, [strideq*3]
1928 lea stride4q, [strideq*4]
1929 lea stride28q, [stride4q*8]
1930 lea stride20q, [stride4q*5]
1931 sub stride28q, stride4q
; --- first half-block loop body: swap m1 pad <-> saved vector via stack ---
1938 mova [rsp+1*mmsize], m1
1939 mova m1, [rsp+0*mmsize]
1941 mova [dstq+strideq *0+ 0], m1
1942 mova [dstq+strideq *0+16], m0
1943 mova [dstq+strideq *0+32], m3
1944 mova [dstq+strideq *0+48], m2
1945 mova [dstq+stride4q*1+ 0], m0
1946 mova [dstq+stride4q*1+16], m3
1947 mova [dstq+stride4q*1+32], m2
1948 mova [dstq+stride4q*1+48], m5
1949 mova [dstq+stride4q*2+ 0], m3
1950 mova [dstq+stride4q*2+16], m2
1951 mova [dstq+stride4q*2+32], m5
1952 mova [dstq+stride4q*2+48], m4
; Advance diagonal by one pair: avx vpalignr, PALIGNR fallback interleaved
; with SCRATCH register juggling for the non-ssse3 case.
1954 vpalignr m1, m0, m1, 4
1955 vpalignr m0, m3, m0, 4
1956 vpalignr m3, m2, m3, 4
1958 SCRATCH 6, 9, rsp+2*mmsize
1959 %if notcpuflag(ssse3)
1960 SCRATCH 7, 10, rsp+3*mmsize
1962 PALIGNR m6, m0, m1, 4, m7
1964 PALIGNR m6, m3, m0, 4, m7
1966 PALIGNR m6, m2, m3, 4, m7
1968 UNSCRATCH 6, 9, rsp+2*mmsize
1969 SCRATCH 0, 9, rsp+2*mmsize
1970 %if notcpuflag(ssse3)
1971 UNSCRATCH 7, 10, rsp+3*mmsize
1972 SCRATCH 3, 10, rsp+3*mmsize
; --- second half-block loop body (rows 12..31); m1 pad swapped back in ---
1978 mova [rsp+0*mmsize], m1
1979 mova m1, [rsp+1*mmsize]
1981 mova [dstq+stride3q*4+ 0], m2
1982 mova [dstq+stride3q*4+16], m5
1983 mova [dstq+stride3q*4+32], m4
1984 mova [dstq+stride3q*4+48], m7
1985 mova [dstq+stride4q*4+ 0], m5
1986 mova [dstq+stride4q*4+16], m4
1987 mova [dstq+stride4q*4+32], m7
1988 mova [dstq+stride4q*4+48], m6
1989 mova [dstq+stride20q + 0], m4
1990 mova [dstq+stride20q +16], m7
1991 mova [dstq+stride20q +32], m6
1992 mova [dstq+stride20q +48], m1
1993 mova [dstq+stride3q*8+ 0], m7
1994 mova [dstq+stride3q*8+16], m6
1995 mova [dstq+stride3q*8+32], m1 ; pad region: replicated last pixel
1996 mova [dstq+stride3q*8+48], m1
1997 mova [dstq+stride28q + 0], m6
1998 mova [dstq+stride28q +16], m1
1999 mova [dstq+stride28q +32], m1
2000 mova [dstq+stride28q +48], m1
2002 vpalignr m2, m5, m2, 4
2003 vpalignr m5, m4, m5, 4
2004 vpalignr m4, m7, m4, 4
2005 vpalignr m7, m6, m7, 4
2006 vpalignr m6, m1, m6, 4
2008 PALIGNR m0, m5, m2, 4, m3
2010 PALIGNR m0, m4, m5, 4, m3
2012 PALIGNR m0, m7, m4, 4, m3
2014 PALIGNR m0, m6, m7, 4, m3
2016 PALIGNR m0, m1, m6, 4, m3
2018 UNSCRATCH 0, 9, rsp+2*mmsize
2019 %if notcpuflag(ssse3)
2020 UNSCRATCH 3, 10, rsp+3*mmsize
; vp9_ipred_hd_4x4_16: horizontal-down (HD) intra prediction, 4x4, 16 bpp.
; HD rows are written bottom-up: the diagonal advances by one (avg,lowpass)
; word pair (PALIGNR by 4 bytes) for each row further up.
; NOTE(review): gapped view — the loads/filter/interleave that produce
; m1 (orig 2038-2044) are not visible here.
2037 cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
2045 DEFINE_ARGS dst, stride, stride3
2046 lea stride3q, [strideq*3]
; Bottom pair of rows from m1's low/high halves...
2048 movh [dstq+stride3q ], m1
2049 movhps [dstq+strideq*1], m1
; ...then shift the diagonal and write the upper pair.
2051 PALIGNR m2, m1, 4, m0
2052 movh [dstq+strideq*2], m2
2053 movhps [dstq+strideq*0], m2
; vp9_ipred_hd_8x8_16: horizontal-down (HD) intra prediction, 8x8, 16 bpp.
; dst8/mstride: dst is moved 8 rows down and a negative stride walks upward,
; so the loop writes rows bottom-to-top (negation of stride is in the
; gapped lines, orig 2069-2073 — confirm).
; NOTE(review): gapped view — the [lq]/[aq] loads and LOWPASS/pavgw
; (orig 2057-2058, 2061-2066) plus loop label/close are not visible.
2056 cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
2059 PALIGNR m2, m1, m0, 2, m3
2060 PALIGNR m3, m1, m0, 4, m4
; Interleave averaged/filtered words into per-row (avg,lowpass) pairs.
2063 SBUTTERFLY wd, 2, 3, 0
2067 DEFINE_ARGS dst8, mstride, cnt
2068 lea dst8q, [dst8q+mstrideq*8]
; Loop body: two rows per iteration, then shift the diagonal by one pair.
2074 mova [dst8q+mstrideq*0], m2
2075 mova [dst8q+mstrideq*4], m3
2077 vpalignr m2, m3, m2, 4
2078 vpalignr m3, m1, m3, 4
2080 PALIGNR m0, m3, m2, 4, m4
2082 PALIGNR m0, m1, m3, 4, m4
; vp9_ipred_hd_16x16_16: horizontal-down (HD) intra prediction, 16x16, 16 bpp.
; Uses a negative stride (mstride) so the diagonal can be stored bottom-up;
; dst is first advanced 16 rows (two mstrideq*8 steps with mstride still
; positive here — the negation is in the gapped lines, confirm).
; NOTE(review): gapped view — loads/filters for m0/m1/m5 (orig 2091-2095,
; 2097, 2100-2101, 2104-2110), stride negation (2114) and the loop
; label/close (2116-2119, 2128, 2144+) are not visible.
2090 cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
2096 mova m4, [lq+mmsize]
2098 PALIGNR m3, m5, m4, 2, m6
2099 PALIGNR m2, m5, m4, 4, m6
; Interleave averaged/filtered words into per-row (avg,lowpass) pairs.
2102 SBUTTERFLY wd, 1, 0, 4
2103 SBUTTERFLY wd, 3, 2, 4
2107 movu m5, [aq+mmsize-2]
2111 DEFINE_ARGS dst, mstride, mstride3, cnt
2112 lea dstq, [dstq+mstrideq*8]
2113 lea dstq, [dstq+mstrideq*8]
2115 lea mstride3q, [mstrideq*3]
; Loop body: four rows per iteration, each 16 pixels = two mmsize stores;
; m1..m4 slide one register per 4 rows, m5 feeds the top edge in.
2120 mova [dstq+mstride3q*4+ 0], m2
2121 mova [dstq+mstride3q*4+16], m4
2122 mova [dstq+mstrideq *8+ 0], m3
2123 mova [dstq+mstrideq *8+16], m2
2124 mova [dstq+mstrideq *4+ 0], m0
2125 mova [dstq+mstrideq *4+16], m3
2126 mova [dstq+mstrideq *0+ 0], m1
2127 mova [dstq+mstrideq *0+16], m0
; Shift the whole diagonal by one (avg,lowpass) pair: avx vpalignr path
; followed by the pre-avx PALIGNR fallback.
2129 vpalignr m1, m0, m1, 4
2130 vpalignr m0, m3, m0, 4
2131 vpalignr m3, m2, m3, 4
2132 vpalignr m2, m4, m2, 4
2133 vpalignr m4, m5, m4, 4
2135 PALIGNR m6, m0, m1, 4, m7
2137 PALIGNR m6, m3, m0, 4, m7
2139 PALIGNR m6, m2, m3, 4, m7
2141 PALIGNR m6, m4, m2, 4, m7
2143 PALIGNR m6, m5, m4, 4, m7
2151 cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
2152 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
2153 mova m2, [lq+mmsize*0+0]
2154 movu m1, [lq+mmsize*0+2]
2155 movu m0, [lq+mmsize*0+4]
2158 SBUTTERFLY wd, 1, 0, 2
2159 mova m4, [lq+mmsize*1+0]
2160 movu m3, [lq+mmsize*1+2]
2161 movu m2, [lq+mmsize*1+4]
2164 SBUTTERFLY wd, 3, 2, 4
2165 SCRATCH 0, 8, rsp+0*mmsize
2166 SCRATCH 1, 9, rsp+1*mmsize
2167 SCRATCH 2, 10, rsp+2*mmsize
2168 SCRATCH 3, 11, rsp+3*mmsize
2169 mova m6, [lq+mmsize*2+0]
2170 movu m5, [lq+mmsize*2+2]
2171 movu m4, [lq+mmsize*2+4]
2174 SBUTTERFLY wd, 5, 4, 6
2175 mova m0, [lq+mmsize*3+0]
2176 movu m1, [aq+mmsize*0-2]
2177 PALIGNR m7, m1, m0, 2, m2
2178 PALIGNR m6, m1, m0, 4, m2
2181 SBUTTERFLY wd, 7, 6, 0
2182 mova m2, [aq+mmsize*0+0]
2183 movu m0, [aq+mmsize*0+2]
2185 movu m1, [aq+mmsize*1-2]
2186 mova m2, [aq+mmsize*1+0]
2187 movu m3, [aq+mmsize*1+2]
2189 SCRATCH 6, 12, rsp+6*mmsize
2190 SCRATCH 7, 13, rsp+7*mmsize
2191 movu m2, [aq+mmsize*2-2]
2192 mova m3, [aq+mmsize*2+0]
2193 movu m6, [aq+mmsize*2+2]
2195 movu m3, [aq+mmsize*3-2]
2199 UNSCRATCH 6, 12, rsp+6*mmsize
2200 UNSCRATCH 7, 13, rsp+7*mmsize
2202 mova [rsp+4*mmsize], m4
2203 mova [rsp+5*mmsize], m5
2204 ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
2205 ; to do it again here
2207 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
2209 lea stride3q, [strideq*3]
2211 lea stride4q, [strideq*4]
2212 lea stride28q, [stride4q*8]
2213 lea stride20q, [stride4q*5]
2214 sub stride28q, stride4q
2218 ; x86-32 doesn't have enough registers, so on that platform, we split
2219 ; the loop in 2... Otherwise you spend most of the loop (un)scratching
2222 mova [dstq+stride28q + 0], m9
2223 mova [dstq+stride28q +16], m8
2224 mova [dstq+stride28q +32], m11
2225 mova [dstq+stride28q +48], m10
2226 mova [dstq+stride3q*8+ 0], m8
2227 mova [dstq+stride3q*8+16], m11
2228 mova [dstq+stride3q*8+32], m10
2229 mova [dstq+stride3q*8+48], m5
2230 mova [dstq+stride20q + 0], m11
2231 mova [dstq+stride20q +16], m10
2232 mova [dstq+stride20q +32], m5
2233 mova [dstq+stride20q +48], m4
2234 mova [dstq+stride4q*4+ 0], m10
2235 mova [dstq+stride4q*4+16], m5
2236 mova [dstq+stride4q*4+32], m4
2237 mova [dstq+stride4q*4+48], m7
2239 mova [dstq+stride3q*4+ 0], m5
2240 mova [dstq+stride3q*4+16], m4
2241 mova [dstq+stride3q*4+32], m7
2242 mova [dstq+stride3q*4+48], m6
2243 mova [dstq+strideq* 8+ 0], m4
2244 mova [dstq+strideq* 8+16], m7
2245 mova [dstq+strideq* 8+32], m6
2246 mova [dstq+strideq* 8+48], m0
2247 mova [dstq+strideq* 4+ 0], m7
2248 mova [dstq+strideq* 4+16], m6
2249 mova [dstq+strideq* 4+32], m0
2250 mova [dstq+strideq* 4+48], m1
2251 mova [dstq+strideq* 0+ 0], m6
2252 mova [dstq+strideq* 0+16], m0
2253 mova [dstq+strideq* 0+32], m1
2254 mova [dstq+strideq* 0+48], m2
2258 vpalignr m9, m8, m9, 4
2259 vpalignr m8, m11, m8, 4
2260 vpalignr m11, m10, m11, 4
2261 vpalignr m10, m5, m10, 4
2263 vpalignr m5, m4, m5, 4
2264 vpalignr m4, m7, m4, 4
2265 vpalignr m7, m6, m7, 4
2266 vpalignr m6, m0, m6, 4
2267 vpalignr m0, m1, m0, 4
2268 vpalignr m1, m2, m1, 4
2269 vpalignr m2, m3, m2, 4
2272 PALIGNR m12, m8, m9, 4, m13
2274 PALIGNR m12, m11, m8, 4, m13
2276 PALIGNR m12, m10, m11, 4, m13
2278 PALIGNR m12, m5, m10, 4, m13
2281 SCRATCH 3, 12, rsp+8*mmsize, sh
2282 %if notcpuflag(ssse3)
2283 SCRATCH 2, 13, rsp+9*mmsize
2285 PALIGNR m3, m4, m5, 4, m2
2287 PALIGNR m3, m7, m4, 4, m2
2289 PALIGNR m3, m6, m7, 4, m2
2291 PALIGNR m3, m0, m6, 4, m2
2293 PALIGNR m3, m1, m0, 4, m2
2295 %if notcpuflag(ssse3)
2296 UNSCRATCH 2, 13, rsp+9*mmsize
2297 SCRATCH 0, 13, rsp+9*mmsize
2299 PALIGNR m3, m2, m1, 4, m0
2301 PALIGNR m3, reg_sh, m2, 4, m0
2303 %if notcpuflag(ssse3)
2304 UNSCRATCH 0, 13, rsp+9*mmsize
2306 UNSCRATCH 3, 12, rsp+8*mmsize, sh
2313 UNSCRATCH 0, 8, rsp+0*mmsize
2314 UNSCRATCH 1, 9, rsp+1*mmsize
2315 UNSCRATCH 2, 10, rsp+2*mmsize
2316 UNSCRATCH 3, 11, rsp+3*mmsize
2317 mova m4, [rsp+4*mmsize]
2318 mova m5, [rsp+5*mmsize]
2319 mova m6, [rsp+6*mmsize]
2320 mova m7, [rsp+7*mmsize]
2321 DEFINE_ARGS dst, stride, stride5, stride3
2322 lea stride5q, [strideq*5]
2323 lea dstq, [dstq+stride5q*4]
2324 DEFINE_ARGS dst, stride, cnt, stride3
2327 mova [dstq+stride3q*4+ 0], m1
2328 mova [dstq+stride3q*4+16], m0
2329 mova [dstq+stride3q*4+32], m3
2330 mova [dstq+stride3q*4+48], m2
2331 mova [dstq+strideq* 8+ 0], m0
2332 mova [dstq+strideq* 8+16], m3
2333 mova [dstq+strideq* 8+32], m2
2334 mova [dstq+strideq* 8+48], m5
2335 mova [dstq+strideq* 4+ 0], m3
2336 mova [dstq+strideq* 4+16], m2
2337 mova [dstq+strideq* 4+32], m5
2338 mova [dstq+strideq* 4+48], m4
2339 mova [dstq+strideq* 0+ 0], m2
2340 mova [dstq+strideq* 0+16], m5
2341 mova [dstq+strideq* 0+32], m4
2342 mova [dstq+strideq* 0+48], m7
2345 vpalignr m1, m0, m1, 4
2346 vpalignr m0, m3, m0, 4
2347 vpalignr m3, m2, m3, 4
2348 vpalignr m2, m5, m2, 4
2349 vpalignr m5, m4, m5, 4
2350 vpalignr m4, m7, m4, 4
2351 vpalignr m7, m6, m7, 4
2353 SCRATCH 6, 12, rsp+8*mmsize, sh
2354 %if notcpuflag(ssse3)
2355 SCRATCH 7, 13, rsp+9*mmsize
2357 PALIGNR m6, m0, m1, 4, m7
2359 PALIGNR m6, m3, m0, 4, m7
2361 PALIGNR m6, m2, m3, 4, m7
2363 PALIGNR m6, m5, m2, 4, m7
2365 PALIGNR m6, m4, m5, 4, m7
2367 %if notcpuflag(ssse3)
2368 UNSCRATCH 7, 13, rsp+9*mmsize
2369 SCRATCH 5, 13, rsp+9*mmsize
2371 PALIGNR m6, m7, m4, 4, m5
2373 PALIGNR m6, reg_sh, m7, 4, m5
2375 %if notcpuflag(ssse3)
2376 UNSCRATCH 5, 13, rsp+9*mmsize
2378 UNSCRATCH 6, 12, rsp+8*mmsize, sh