1 ;******************************************************************************
2 ;* VP9 Intra prediction SIMD optimizations
4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
5 ;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; pshufb byte-shuffle masks (16-bit pixels, so each pixel is a byte pair):
; shift words down by one element, repeating the last word (2,3..14,15,14,15);
; used by the SHIFT_RIGHT/SHIFT_RIGHTx2 macros below ("abcdefgh -> bcdefghh").
32 pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
; pick words 2,4,5,6 then zero the upper half; used in the vr_4x4 predictor.
33 pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
; keep words 0-3, then repeat word 3 four times ("abcd -> abcddddd");
; used in the hu_4x4 predictor to pad the left edge.
34 pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
43 ; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
44 ; only 3 registers on x86-32, which would make it one cycle faster, but that
45 ; would make the code quite a bit uglier...
; ----------------------------------------------------------------------------
; Vertical prediction, 16 bpp: every output row is a copy of the row above
; the block, so the same register(s) are stored to all rows.
; Args: dst, stride, l (left edge, unused here), a (above edge).
; NOTE(review): the loads of m0/m1 from aq, the cnt-driven loop labels and the
; RETs for the 4x4/8x8/16x16 variants are elided in this excerpt — confirm
; against the full file.
86 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
89 DEFINE_ARGS dst, stride, stride3
90 lea stride3q, [strideq*3]
; store the (previously loaded) above row to all 4 rows
91 mova [dstq+strideq*0], m0
92 mova [dstq+strideq*1], m0
93 mova [dstq+strideq*2], m0
94 mova [dstq+stride3q ], m0
98 cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
101 DEFINE_ARGS dst, stride, stride3
102 lea stride3q, [strideq*3]
; 8 rows, written as two groups of 4 with a dst advance in between
103 mova [dstq+strideq*0], m0
104 mova [dstq+strideq*1], m0
105 mova [dstq+strideq*2], m0
106 mova [dstq+stride3q ], m0
107 lea dstq, [dstq+strideq*4]
108 mova [dstq+strideq*0], m0
109 mova [dstq+strideq*1], m0
110 mova [dstq+strideq*2], m0
111 mova [dstq+stride3q ], m0
; 16x16: 32 bytes per row -> two xmm stores (m0 low half, m1 high half) per row
115 cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
119 DEFINE_ARGS dst, stride, stride3, cnt
120 lea stride3q, [strideq*3]
123 mova [dstq+strideq*0+ 0], m0
124 mova [dstq+strideq*0+16], m1
125 mova [dstq+strideq*1+ 0], m0
126 mova [dstq+strideq*1+16], m1
127 mova [dstq+strideq*2+ 0], m0
128 mova [dstq+strideq*2+16], m1
129 mova [dstq+stride3q + 0], m0
130 mova [dstq+stride3q +16], m1
131 lea dstq, [dstq+strideq*4]
; 32x32: 64 bytes per row -> four stores per row, two rows per loop iteration
137 cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
139 mova m0, [aq+mmsize*0]
140 mova m1, [aq+mmsize*1]
141 mova m2, [aq+mmsize*2]
142 mova m3, [aq+mmsize*3]
143 DEFINE_ARGS dst, stride, cnt
146 mova [dstq+strideq*0+ 0], m0
147 mova [dstq+strideq*0+16], m1
148 mova [dstq+strideq*0+32], m2
149 mova [dstq+strideq*0+48], m3
150 mova [dstq+strideq*1+ 0], m0
151 mova [dstq+strideq*1+16], m1
152 mova [dstq+strideq*1+32], m2
153 mova [dstq+strideq*1+48], m3
154 lea dstq, [dstq+strideq*2]
; ----------------------------------------------------------------------------
; Horizontal prediction, 16 bpp: each output row is filled with one left-edge
; pixel, so one register (m0..m3) is stored across a whole row.
; NOTE(review): the per-row broadcasts of the left pixels into m0..m3 are
; elided in this excerpt — presumably splat from [lq]; confirm in full file.
160 cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
162 DEFINE_ARGS dst, stride, stride3
163 lea stride3q, [strideq*3]
; one broadcast left pixel per row
168 mova [dstq+strideq*0], m0
169 mova [dstq+strideq*1], m1
170 mova [dstq+strideq*2], m2
171 mova [dstq+stride3q ], m3
175 cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
177 DEFINE_ARGS dst, stride, stride3
178 lea stride3q, [strideq*3]
182 mova [dstq+strideq*0], m0
183 mova [dstq+strideq*1], m1
; m0/m1 are re-filled with the next left pixels between stores (lines elided)
186 mova [dstq+strideq*2], m0
187 mova [dstq+stride3q ], m1
188 lea dstq, [dstq+strideq*4]
192 mova [dstq+strideq*0], m0
193 mova [dstq+strideq*1], m1
196 mova [dstq+strideq*2], m0
197 mova [dstq+stride3q ], m1
; 16x16: two 16-byte stores per 32-byte row, 4 rows per loop iteration
201 cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
203 lea stride3q, [strideq*3]
211 mova [dstq+strideq*0+ 0], m0
212 mova [dstq+strideq*0+16], m0
213 mova [dstq+strideq*1+ 0], m1
214 mova [dstq+strideq*1+16], m1
215 mova [dstq+strideq*2+ 0], m2
216 mova [dstq+strideq*2+16], m2
217 mova [dstq+stride3q + 0], m3
218 mova [dstq+stride3q +16], m3
219 lea dstq, [dstq+strideq*4]
; 32x32: four stores per 64-byte row, 4 rows per loop iteration
225 cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
227 lea stride3q, [strideq*3]
235 mova [dstq+strideq*0+ 0], m0
236 mova [dstq+strideq*0+16], m0
237 mova [dstq+strideq*0+32], m0
238 mova [dstq+strideq*0+48], m0
239 mova [dstq+strideq*1+ 0], m1
240 mova [dstq+strideq*1+16], m1
241 mova [dstq+strideq*1+32], m1
242 mova [dstq+strideq*1+48], m1
243 mova [dstq+strideq*2+ 0], m2
244 mova [dstq+strideq*2+16], m2
245 mova [dstq+strideq*2+32], m2
246 mova [dstq+strideq*2+48], m2
247 mova [dstq+stride3q + 0], m3
248 mova [dstq+stride3q +16], m3
249 mova [dstq+stride3q +32], m3
250 mova [dstq+stride3q +48], m3
251 lea dstq, [dstq+strideq*4]
; ----------------------------------------------------------------------------
; DC prediction, 16 bpp: fill the whole block with the average of the left
; and above edge pixels. The visible paddw chains accumulate the edge words;
; the horizontal reduction, rounding/shift and broadcast into m0 are partly
; elided in this excerpt (pshuflw q0000 broadcasts the low word).
257 cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
260 DEFINE_ARGS dst, stride, stride3
261 lea stride3q, [strideq*3]
; m0 holds the broadcast DC value by this point
268 mova [dstq+strideq*0], m0
269 mova [dstq+strideq*1], m0
270 mova [dstq+strideq*2], m0
271 mova [dstq+stride3q ], m0
275 cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
278 DEFINE_ARGS dst, stride, stride3
279 lea stride3q, [strideq*3]
; broadcast the computed DC word to all 4 low words (upper half elided)
287 pshuflw m0, m0, q0000
289 mova [dstq+strideq*0], m0
290 mova [dstq+strideq*1], m0
291 mova [dstq+strideq*2], m0
292 mova [dstq+stride3q ], m0
293 lea dstq, [dstq+strideq*4]
294 mova [dstq+strideq*0], m0
295 mova [dstq+strideq*1], m0
296 mova [dstq+strideq*2], m0
297 mova [dstq+stride3q ], m0
; 16x16: sum both halves of the left and above edges (2 vectors each)
301 cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
303 paddw m0, [lq+mmsize]
305 paddw m0, [aq+mmsize]
306 DEFINE_ARGS dst, stride, stride3, cnt
307 lea stride3q, [strideq*3]
316 pshuflw m0, m0, q0000
319 mova [dstq+strideq*0+ 0], m0
320 mova [dstq+strideq*0+16], m0
321 mova [dstq+strideq*1+ 0], m0
322 mova [dstq+strideq*1+16], m0
323 mova [dstq+strideq*2+ 0], m0
324 mova [dstq+strideq*2+16], m0
325 mova [dstq+stride3q + 0], m0
326 mova [dstq+stride3q +16], m0
327 lea dstq, [dstq+strideq*4]
; 32x32: sum all 4 vectors of each edge (64 edge pixels total)
333 cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
334 mova m0, [lq+mmsize*0]
335 paddw m0, [lq+mmsize*1]
336 paddw m0, [lq+mmsize*2]
337 paddw m0, [lq+mmsize*3]
338 paddw m0, [aq+mmsize*0]
339 paddw m0, [aq+mmsize*1]
340 paddw m0, [aq+mmsize*2]
341 paddw m0, [aq+mmsize*3]
342 DEFINE_ARGS dst, stride, stride3, cnt
343 lea stride3q, [strideq*3]
352 pshuflw m0, m0, q0000
355 mova [dstq+strideq*0+ 0], m0
356 mova [dstq+strideq*0+16], m0
357 mova [dstq+strideq*0+32], m0
358 mova [dstq+strideq*0+48], m0
359 mova [dstq+strideq*1+ 0], m0
360 mova [dstq+strideq*1+16], m0
361 mova [dstq+strideq*1+32], m0
362 mova [dstq+strideq*1+48], m0
363 lea dstq, [dstq+strideq*2]
; ----------------------------------------------------------------------------
; dc_top / dc_left template functions. These lines sit inside a %macro whose
; opener is elided from this excerpt: %1 is presumably "top"/"left" and %2 the
; matching edge pointer (aq/lq) — confirm against the full file. Structure
; mirrors the two-edge DC functions above, but only one edge is summed.
370 cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
372 DEFINE_ARGS dst, stride, stride3
373 lea stride3q, [strideq*3]
; m0 holds the broadcast single-edge DC value
380 mova [dstq+strideq*0], m0
381 mova [dstq+strideq*1], m0
382 mova [dstq+strideq*2], m0
383 mova [dstq+stride3q ], m0
387 cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
389 DEFINE_ARGS dst, stride, stride3
390 lea stride3q, [strideq*3]
398 pshuflw m0, m0, q0000
400 mova [dstq+strideq*0], m0
401 mova [dstq+strideq*1], m0
402 mova [dstq+strideq*2], m0
403 mova [dstq+stride3q ], m0
404 lea dstq, [dstq+strideq*4]
405 mova [dstq+strideq*0], m0
406 mova [dstq+strideq*1], m0
407 mova [dstq+strideq*2], m0
408 mova [dstq+stride3q ], m0
; 16x16: accumulate the second half of the single edge
412 cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
414 paddw m0, [%2+mmsize]
415 DEFINE_ARGS dst, stride, stride3, cnt
416 lea stride3q, [strideq*3]
425 pshuflw m0, m0, q0000
428 mova [dstq+strideq*0+ 0], m0
429 mova [dstq+strideq*0+16], m0
430 mova [dstq+strideq*1+ 0], m0
431 mova [dstq+strideq*1+16], m0
432 mova [dstq+strideq*2+ 0], m0
433 mova [dstq+strideq*2+16], m0
434 mova [dstq+stride3q + 0], m0
435 mova [dstq+stride3q +16], m0
436 lea dstq, [dstq+strideq*4]
; 32x32: accumulate all four vectors of the single edge
442 cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
443 mova m0, [%2+mmsize*0]
444 paddw m0, [%2+mmsize*1]
445 paddw m0, [%2+mmsize*2]
446 paddw m0, [%2+mmsize*3]
447 DEFINE_ARGS dst, stride, cnt
456 pshuflw m0, m0, q0000
459 mova [dstq+strideq*0+ 0], m0
460 mova [dstq+strideq*0+16], m0
461 mova [dstq+strideq*0+32], m0
462 mova [dstq+strideq*0+48], m0
463 mova [dstq+strideq*1+ 0], m0
464 mova [dstq+strideq*1+16], m0
465 mova [dstq+strideq*1+32], m0
466 mova [dstq+strideq*1+48], m0
467 lea dstq, [dstq+strideq*2]
; ----------------------------------------------------------------------------
; TrueMotion prediction: pred[y][x] = clip(left[y] + above[x] - topleft),
; with the clip range depending on bit depth — hence separate _10/_12 entry
; points. The _12 variants set up their clip constants (elided here) and then
; jump into the shared .body of the _10 function.
; NOTE(review): the clamp/pmin/pmax arithmetic and loop labels are elided in
; this excerpt — confirm against the full file.
477 cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
485 DEFINE_ARGS dst, stride, stride3
486 lea stride3q, [strideq*3]
504 mova [dstq+strideq*0], m0
505 mova [dstq+strideq*1], m1
506 mova [dstq+strideq*2], m2
507 mova [dstq+stride3q ], m3
; 12-bit entry: reuses the 10-bit body after (elided) clip-range setup
510 cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
512 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
515 cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
; broadcast word 1 of m0 — presumably the top-left pixel; confirm
521 pshuflw m0, m0, q1111
524 DEFINE_ARGS dst, stride, l, stride3, cnt
525 lea stride3q, [strideq*3]
546 mova [dstq+strideq*0], m0
547 mova [dstq+strideq*1], m1
548 mova [dstq+strideq*2], m2
549 mova [dstq+stride3q ], m3
550 lea dstq, [dstq+strideq*4]
555 cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
557 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
560 cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
567 pshuflw m0, m0, q1111
571 DEFINE_ARGS dst, stride, l, cnt
; two rows of 32 bytes per iteration (m0/m2 = row 0, m1/m3 = row 1)
590 mova [dstq+strideq*0+ 0], m0
591 mova [dstq+strideq*0+16], m2
592 mova [dstq+strideq*1+ 0], m1
593 mova [dstq+strideq*1+16], m3
594 lea dstq, [dstq+strideq*2]
599 cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
601 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
; 32x32: needs 10 xmm regs plus 32 bytes of stack on x86-32 for the clip
; constants (reg_min/reg_max live in registers on x86-64, on stack otherwise)
604 cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
616 %define reg_min [rsp+16]
617 %define reg_max [rsp+ 0]
620 mova m4, [aq+mmsize*0]
621 mova m5, [aq+mmsize*1]
622 mova m6, [aq+mmsize*2]
623 mova m7, [aq+mmsize*3]
625 pshuflw m0, m0, q1111
631 DEFINE_ARGS dst, stride, l, cnt
; fetch the current row's left pixel (cnt counts rows downwards)
634 pinsrw m3, [lq+cntq*2], 0
649 mova [dstq+strideq*0+ 0], m0
650 mova [dstq+strideq*0+16], m1
651 mova [dstq+strideq*0+32], m2
652 mova [dstq+strideq*0+48], m3
658 cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
660 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
662 ; Directional intra prediction functions
664 ; in the functions below, 'abcdefgh' refers to above data (sometimes simply
665 ; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
666 ; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
667 ; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; 3-tap (1,2,1)/4 smoothing filter used by all directional predictors:
; left=(left+2*center+right+2)>>2
671 %macro LOWPASS 3 ; left [dst], center, right
; shift a vector of 8 words down by one element, repeating the last word;
; abcdefgh (src) -> bcdefghh (dst)
; dst/src can be the same register
679 %macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
; ssse3 path: a single pshufb with the pb_2to15_14_15 mask does the job
681 pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
; pre-ssse3 fallback: byte-shift then re-duplicate the top word
683 psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
684 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
; same, producing both the 1-element and the 2-element shift:
; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
689 %macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
; ssse3: apply the shuffle twice (second shift reuses dst1)
691 pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
692 pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
694 psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
695 psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
696 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
697 pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
; ----------------------------------------------------------------------------
; Diagonal-down-left prediction: each row is the low-pass filtered above row,
; shifted one pixel further left per row, padded with the last above pixel.
; Only the above edge is used (l is never read).
702 cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
704 movu m1, [aq] ; abcdefgh
705 pshufhw m0, m1, q3310 ; abcdefhh
706 SHIFT_RIGHT m1, m1 ; bcdefghh
707 psrldq m2, m1, 2 ; cdefghh.
708 LOWPASS 0, 1, 2 ; BCDEFGh.
709 pshufd m1, m0, q3321 ; DEFGh...
; rows 0/2 come from m0/m1; rows 1/3 from the 1-word shifts below
710 movh [dstq+strideq*0], m0
711 movh [dstq+strideq*2], m1
713 psrldq m0, 2 ; CDEFGh..
714 psrldq m1, 2 ; EFGh....
715 movh [dstq+strideq*0], m0
716 movh [dstq+strideq*2], m1
719 cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
721 mova m0, [aq] ; abcdefgh
; keep the shuffle mask in a register: SHIFT_RIGHT is used repeatedly below
723 mova m4, [pb_2to15_14_15]
725 SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
726 LOWPASS 0, 1, 2 ; BCDEFGHh
727 shufps m1, m0, m2, q3332 ; FGHhhhhh
728 shufps m3, m0, m1, q2121 ; DEFGHhhh
729 DEFINE_ARGS dst, stride, stride5
730 lea stride5q, [strideq*5]
; interleave stores of rows 0..7: each row is the previous shifted by 1 word
732 mova [dstq+strideq*0], m0
733 mova [dstq+strideq*4], m1
734 SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
735 pshuflw m1, m1, q3321 ; GHhhhhhh
736 pshufd m2, m0, q3321 ; EFGHhhhh
737 mova [dstq+strideq*1], m0
738 mova [dstq+stride5q ], m1
739 lea dstq, [dstq+strideq*2]
740 pshuflw m1, m1, q3321 ; Hhhhhhhh
741 mova [dstq+strideq*0], m3
742 mova [dstq+strideq*4], m1
743 pshuflw m1, m1, q3321 ; hhhhhhhh
744 mova [dstq+strideq*1], m2
745 mova [dstq+stride5q ], m1
748 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
750 mova m0, [aq] ; abcdefgh
751 mova m3, [aq+mmsize] ; ijklmnop
752 PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
753 PALIGNR m2, m3, m0, 4, m4 ; cdefghij
754 LOWPASS 0, 1, 2 ; BCDEFGHI
756 mova m4, [pb_2to15_14_15]
758 SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
759 LOWPASS 1, 2, 3 ; JKLMNOPp
760 pshufd m2, m2, q3333 ; pppppppp
761 DEFINE_ARGS dst, stride, cnt
; rows 0 and 8 share data shifted by a whole register width
765 mova [dstq+strideq*0+ 0], m0
766 mova [dstq+strideq*0+16], m1
767 mova [dstq+strideq*8+ 0], m1
768 mova [dstq+strideq*8+16], m2
; avx path shifts m0/m1 in-place; sse path uses PALIGNR + SHIFT_RIGHT
771 vpalignr m0, m1, m0, 2
773 PALIGNR m3, m1, m0, 2, m4
776 SHIFT_RIGHT m1, m1, m4
; ----------------------------------------------------------------------------
; Diagonal-down-left 32x32: low-pass filter all 32 above pixels into m0..m3
; (last vector padded with pixel 31), then emit rows; every 8 rows the whole
; row pattern shifts left by one register (16 bytes), so row n+8 reuses the
; vectors of row n shifted over, with m4 = broadcast last pixel as filler.
781 cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
783 mova m0, [aq+mmsize*0] ; abcdefgh
784 mova m1, [aq+mmsize*1] ; ijklmnop
785 mova m2, [aq+mmsize*2] ; qrstuvwx
786 mova m3, [aq+mmsize*3] ; yz012345
787 PALIGNR m4, m1, m0, 2, m6
788 PALIGNR m5, m1, m0, 4, m6
789 LOWPASS 0, 4, 5 ; BCDEFGHI
790 PALIGNR m4, m2, m1, 2, m6
791 PALIGNR m5, m2, m1, 4, m6
792 LOWPASS 1, 4, 5 ; JKLMNOPQ
793 PALIGNR m4, m3, m2, 2, m6
794 PALIGNR m5, m3, m2, 4, m6
795 LOWPASS 2, 4, 5 ; RSTUVWXY
797 mova m6, [pb_2to15_14_15]
799 SHIFT_RIGHTx2 m4, m5, m3, m6
800 LOWPASS 3, 4, 5 ; Z0123455
801 pshufd m4, m4, q3333 ; 55555555
802 DEFINE_ARGS dst, stride, stride8, stride24, cnt
804 lea stride8q, [strideq*8]
805 lea stride24q, [stride8q*3]
; four rows spaced 8 apart per iteration; note the register pattern shifting
; one slot left every 8 rows (m4 pads the tail)
808 mova [dstq+stride8q*0+ 0], m0
809 mova [dstq+stride8q*0+16], m1
810 mova [dstq+stride8q*0+32], m2
811 mova [dstq+stride8q*0+48], m3
812 mova [dstq+stride8q*1+ 0], m1
813 mova [dstq+stride8q*1+16], m2
814 mova [dstq+stride8q*1+32], m3
815 mova [dstq+stride8q*1+48], m4
816 mova [dstq+stride8q*2+ 0], m2
817 mova [dstq+stride8q*2+16], m3
818 mova [dstq+stride8q*2+32], m4
819 mova [dstq+stride8q*2+48], m4
820 mova [dstq+stride24q + 0], m3
821 mova [dstq+stride24q +16], m4
822 mova [dstq+stride24q +32], m4
823 mova [dstq+stride24q +48], m4
; advance the whole 64-pixel row by one word for the next iteration
826 vpalignr m0, m1, m0, 2
827 vpalignr m1, m2, m1, 2
828 vpalignr m2, m3, m2, 2
830 PALIGNR m5, m1, m0, 2, m6
832 PALIGNR m5, m2, m1, 2, m6
834 PALIGNR m5, m3, m2, 2, m6
837 SHIFT_RIGHT m3, m3, m6
; ----------------------------------------------------------------------------
; AVX2 variant of diagonal-down-left 16x16: the whole 16-pixel above row fits
; in one ymm register, so each output row is just a cross-lane vpalignr of the
; filtered row against its by-16 continuation (m2).
850 %if HAVE_AVX2_EXTERNAL
852 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
854 mova m0, [aq] ; abcdefghijklmnop
; broadcast last above pixel (offset 30 = word 15) for padding
855 vpbroadcastw xm1, [aq+30] ; pppppppp
856 vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
857 vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
858 vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
859 LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
860 vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
861 DEFINE_ARGS dst, stride, stride3, cnt
863 lea stride3q, [strideq*3]
; row n = filtered row shifted left by n words (vpalignr by 2*n bytes)
865 mova [dstq+strideq*0], m0
866 vpalignr m3, m2, m0, 2
867 vpalignr m4, m2, m0, 4
868 mova [dstq+strideq*1], m3
869 mova [dstq+strideq*2], m4
870 vpalignr m3, m2, m0, 6
871 vpalignr m4, m2, m0, 8
872 mova [dstq+stride3q ], m3
873 lea dstq, [dstq+strideq*4]
874 mova [dstq+strideq*0], m4
875 vpalignr m3, m2, m0, 10
876 vpalignr m4, m2, m0, 12
877 mova [dstq+strideq*1], m3
878 mova [dstq+strideq*2], m4
879 vpalignr m3, m2, m0, 14
880 mova [dstq+stride3q ], m3
881 lea dstq, [dstq+strideq*4]
; beyond row 8 everything is the padding pixel
883 vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
; ----------------------------------------------------------------------------
; Diagonal-down-right predictors (template macro, instantiated per ISA).
; These use both edges: filtered left (uppercase L range), the filtered
; top-left corner (#) and filtered above (uppercase A range); each row shifts
; the combined edge one pixel to the right, moving up the diagonal.
889 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
890 cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
891 movh m0, [lq] ; wxyz....
892 movhps m0, [aq-2] ; wxyz*abc
893 movd m1, [aq+6] ; d.......
894 PALIGNR m1, m0, 2, m2 ; xyz*abcd
895 psrldq m2, m1, 2 ; yz*abcd.
896 LOWPASS 0, 1, 2 ; XYZ#ABC.
897 DEFINE_ARGS dst, stride, stride3
898 lea stride3q, [strideq*3]
; emit bottom row first; each earlier row is shifted one word further
900 movh [dstq+stride3q ], m0
901 psrldq m0, 2 ; YZ#ABC..
902 movh [dstq+strideq*2], m0
903 psrldq m0, 2 ; Z#ABC...
904 movh [dstq+strideq*1], m0
905 psrldq m0, 2 ; #ABC....
906 movh [dstq+strideq*0], m0
909 cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
910 mova m0, [lq] ; stuvwxyz
911 movu m1, [aq-2] ; *abcdefg
912 mova m2, [aq] ; abcdefgh
913 psrldq m3, m2, 2 ; bcdefgh.
914 LOWPASS 3, 2, 1 ; ABCDEFG.
915 PALIGNR m1, m0, 2, m4 ; tuvwxyz*
916 PALIGNR m2, m1, 2, m4 ; uvwxyz*a
917 LOWPASS 2, 1, 0 ; TUVWXYZ#
918 DEFINE_ARGS dst, stride, dst4, stride3
919 lea stride3q, [strideq*3]
920 lea dst4q, [dstq+strideq*4]
; rows 3/7 share the left part; upper rows realign left/above by one word
922 movhps [dstq +stride3q +0], m2
923 movh [dstq+ stride3q +8], m3
924 mova [dst4q+stride3q +0], m2
925 PALIGNR m1, m3, m2, 2, m0
927 movhps [dstq +strideq*2+0], m1
928 movh [dstq+ strideq*2+8], m3
929 mova [dst4q+strideq*2+0], m1
930 PALIGNR m2, m3, m1, 2, m0
932 movhps [dstq +strideq*1+0], m2
933 movh [dstq+ strideq*1+8], m3
934 mova [dst4q+strideq*1+0], m2
935 PALIGNR m1, m3, m2, 2, m0
937 movhps [dstq +strideq*0+0], m1
938 movh [dstq+ strideq*0+8], m3
939 mova [dst4q+strideq*0+0], m1
942 cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
943 mova m0, [lq] ; klmnopqr
944 mova m1, [lq+mmsize] ; stuvwxyz
945 movu m2, [aq-2] ; *abcdefg
946 movu m3, [aq+mmsize-2] ; hijklmno
947 mova m4, [aq] ; abcdefgh
948 mova m5, [aq+mmsize] ; ijklmnop
949 psrldq m6, m5, 2 ; jklmnop.
950 LOWPASS 6, 5, 3 ; IJKLMNO.
951 PALIGNR m5, m4, 2, m3 ; bcdefghi
952 LOWPASS 5, 4, 2 ; ABCDEFGH
953 PALIGNR m2, m1, 2, m3 ; tuvwxyz*
954 PALIGNR m4, m2, 2, m3 ; uvwxyz*a
955 LOWPASS 4, 2, 1 ; TUVWXYZ#
956 PALIGNR m1, m0, 2, m3 ; lmnopqrs
957 PALIGNR m2, m1, 2, m3 ; mnopqrst
958 LOWPASS 2, 1, 0 ; LMNOPQRS
959 DEFINE_ARGS dst, stride, dst8, cnt
960 lea dst8q, [dstq+strideq*8]
; row n and row n+8 differ by one whole register of the edge sequence
965 mova [dst8q+strideq*0+ 0], m4
966 mova [dst8q+strideq*0+16], m5
967 mova [dst8q+strideq*8+ 0], m2
968 mova [dst8q+strideq*8+16], m4
; shift the edge chain (m2->m4->m5->m6) right one word for the next row
970 vpalignr m2, m4, m2, 2
971 vpalignr m4, m5, m4, 2
972 vpalignr m5, m6, m5, 2
974 PALIGNR m0, m4, m2, 2, m1
976 PALIGNR m0, m5, m4, 2, m1
978 PALIGNR m0, m6, m5, 2, m1
; ----------------------------------------------------------------------------
; Diagonal-down-right 32x32: filter all 4 above vectors and all 4 left vectors
; (plus the corner), keeping the overflow in m8-m10 — SCRATCH/UNSCRATCH map
; those to real registers on x86-64 and to the reserved stack slots on x86-32
; (hence the "%1 * ARCH_X86_32 * -mmsize" stack allocation; %1 comes from the
; enclosing DR_FUNCS macro parameter).
986 cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
987 %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
; filter the above edge, highest vector first
988 mova m0, [aq+mmsize*3] ; a[24-31]
989 movu m1, [aq+mmsize*3-2] ; a[23-30]
990 psrldq m2, m0, 2 ; a[25-31].
991 LOWPASS 2, 0, 1 ; A[24-30].
992 mova m1, [aq+mmsize*2] ; a[16-23]
993 movu m3, [aq+mmsize*2-2] ; a[15-22]
994 PALIGNR m0, m1, 2, m4 ; a[17-24]
995 LOWPASS 0, 1, 3 ; A[16-23]
996 mova m3, [aq+mmsize*1] ; a[8-15]
997 movu m4, [aq+mmsize*1-4] ; a[7-14]
998 PALIGNR m1, m3, 2, m5 ; a[9-16]
999 LOWPASS 1, 3, 4 ; A[8-15]
1000 mova m4, [aq+mmsize*0] ; a[0-7]
1001 movu m5, [aq+mmsize*0-2] ; *a[0-6]
1002 PALIGNR m3, m4, 2, m6 ; a[1-8]
1003 LOWPASS 3, 4, 5 ; A[0-7]
1004 SCRATCH 1, 8, rsp+0*mmsize
1005 SCRATCH 3, 9, rsp+1*mmsize
; pre-ssse3 PALIGNR needs an extra temp register, spill one more
1006 %if notcpuflag(ssse3)
1007 SCRATCH 0, 10, rsp+2*mmsize
; filter the left edge (top-left corner folded in as #)
1009 mova m6, [lq+mmsize*3] ; l[24-31]
1010 PALIGNR m5, m6, 2, m0 ; l[25-31]*
1011 PALIGNR m4, m5, 2, m0 ; l[26-31]*a
1012 LOWPASS 4, 5, 6 ; L[25-31]#
1013 mova m7, [lq+mmsize*2] ; l[16-23]
1014 PALIGNR m6, m7, 2, m0 ; l[17-24]
1015 PALIGNR m5, m6, 2, m0 ; l[18-25]
1016 LOWPASS 5, 6, 7 ; L[17-24]
1017 mova m1, [lq+mmsize*1] ; l[8-15]
1018 PALIGNR m7, m1, 2, m0 ; l[9-16]
1019 PALIGNR m6, m7, 2, m0 ; l[10-17]
1020 LOWPASS 6, 7, 1 ; L[9-16]
1021 mova m3, [lq+mmsize*0] ; l[0-7]
1022 PALIGNR m1, m3, 2, m0 ; l[1-8]
1023 PALIGNR m7, m1, 2, m0 ; l[2-9]
1024 LOWPASS 7, 1, 3 ; L[1-8]
1027 UNSCRATCH 1, 8, rsp+0*mmsize
1029 UNSCRATCH 3, 9, rsp+1*mmsize
1031 UNSCRATCH 0, 10, rsp+2*mmsize
1033 DEFINE_ARGS dst8, stride, stride8, stride24, cnt
1034 lea stride8q, [strideq*8]
1035 lea stride24q, [stride8q*3]
1036 lea dst8q, [dst8q+strideq*8]
1042 UNSCRATCH 1, 8, rsp+0*mmsize
1043 %if notcpuflag(ssse3)
1044 UNSCRATCH 3, 9, rsp+1*mmsize
; four rows spaced 8 apart; register pattern m7..m0 walks up the diagonal
1047 mova [dst8q+stride8q*0+ 0], m4
1048 mova [dst8q+stride8q*0+16], m3
1049 mova [dst8q+stride8q*0+32], m1
1050 mova [dst8q+stride8q*0+48], m0
1051 mova [dst8q+stride8q*1+ 0], m5
1052 mova [dst8q+stride8q*1+16], m4
1053 mova [dst8q+stride8q*1+32], m3
1054 mova [dst8q+stride8q*1+48], m1
1055 mova [dst8q+stride8q*2+ 0], m6
1056 mova [dst8q+stride8q*2+16], m5
1057 mova [dst8q+stride8q*2+32], m4
1058 mova [dst8q+stride8q*2+48], m3
1059 mova [dst8q+stride24q + 0], m7
1060 mova [dst8q+stride24q +16], m6
1061 mova [dst8q+stride24q +32], m5
1062 mova [dst8q+stride24q +48], m4
; shift the whole filtered edge chain right by one word (avx three-operand
; path; the PALIGNR sequence below is the sse equivalent with scratch juggling)
1064 vpalignr m7, m6, m7, 2
1065 vpalignr m6, m5, m6, 2
1066 vpalignr m5, m4, m5, 2
1067 vpalignr m4, m3, m4, 2
1068 vpalignr m3, m1, m3, 2
1069 vpalignr m1, m0, m1, 2
1070 vpalignr m0, m2, m0, 2
1072 SCRATCH 2, 8, rsp+0*mmsize
1073 %if notcpuflag(ssse3)
1074 SCRATCH 0, 9, rsp+1*mmsize
1076 PALIGNR m2, m6, m7, 2, m0
1078 PALIGNR m2, m5, m6, 2, m0
1080 PALIGNR m2, m4, m5, 2, m0
1082 PALIGNR m2, m3, m4, 2, m0
1084 PALIGNR m2, m1, m3, 2, m0
1086 %if notcpuflag(ssse3)
1087 UNSCRATCH 0, 9, rsp+1*mmsize
1088 SCRATCH 3, 9, rsp+1*mmsize
1090 PALIGNR m2, m0, m1, 2, m3
1092 UNSCRATCH 2, 8, rsp+0*mmsize
1093 SCRATCH 1, 8, rsp+0*mmsize
1094 PALIGNR m1, m2, m0, 2, m3
; ----------------------------------------------------------------------------
; Vertical-left predictors (template macro): even rows use the 2-tap average
; of the above row (pavgw), odd rows use the 3-tap LOWPASS, each pair of rows
; shifting one pixel further left. Only the above edge is used.
1110 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
1111 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
1113 movu m0, [aq] ; abcdefgh
1114 psrldq m1, m0, 2 ; bcdefgh.
1115 psrldq m2, m0, 4 ; cdefgh..
1116 LOWPASS 2, 1, 0 ; BCDEFGH.
; pavgw = (a+b+1)>>1: the 2-tap filter for even rows
1117 pavgw m1, m0 ; ABCDEFG.
1118 DEFINE_ARGS dst, stride, stride3
1119 lea stride3q, [strideq*3]
1121 movh [dstq+strideq*0], m1
1122 movh [dstq+strideq*1], m2
1125 movh [dstq+strideq*2], m1
1126 movh [dstq+stride3q ], m2
1129 cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
1131 mova m0, [aq] ; abcdefgh
1133 mova m3, [pb_2to15_14_15]
1135 SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
1136 LOWPASS 2, 1, 0 ; BCDEFGHh
1137 pavgw m1, m0 ; ABCDEFGh
1138 DEFINE_ARGS dst, stride, stride3
1139 lea stride3q, [strideq*3]
; m1 = even rows, m2 = odd rows, both shifted one word every row pair
1141 mova [dstq+strideq*0], m1
1142 mova [dstq+strideq*1], m2
1143 SHIFT_RIGHT m1, m1, m3
1144 SHIFT_RIGHT m2, m2, m3
1145 mova [dstq+strideq*2], m1
1146 mova [dstq+stride3q ], m2
1147 lea dstq, [dstq+strideq*4]
1148 SHIFT_RIGHT m1, m1, m3
1149 SHIFT_RIGHT m2, m2, m3
1150 mova [dstq+strideq*0], m1
1151 mova [dstq+strideq*1], m2
1152 SHIFT_RIGHT m1, m1, m3
1153 SHIFT_RIGHT m2, m2, m3
1154 mova [dstq+strideq*2], m1
1155 mova [dstq+stride3q ], m2
1158 cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
1161 mova m1, [aq+mmsize]
1162 PALIGNR m2, m1, m0, 2, m3
1163 PALIGNR m3, m1, m0, 4, m4
1167 mova m4, [pb_2to15_14_15]
1169 SHIFT_RIGHTx2 m5, m0, m1, m4
1172 DEFINE_ARGS dst, stride, cnt
; m2/m1 = even row (low/high half), m3/m0 = odd row
1176 mova [dstq+strideq*0+ 0], m2
1177 mova [dstq+strideq*0+16], m1
1178 mova [dstq+strideq*1+ 0], m3
1179 mova [dstq+strideq*1+16], m0
1180 lea dstq, [dstq+strideq*2]
; shift both row images one word left for the next pair (avx vs sse paths)
1182 vpalignr m2, m1, m2, 2
1183 vpalignr m3, m0, m3, 2
1185 PALIGNR m5, m1, m2, 2, m4
1187 PALIGNR m5, m0, m3, 2, m4
1190 SHIFT_RIGHT m1, m1, m4
1191 SHIFT_RIGHT m0, m0, m4
; ----------------------------------------------------------------------------
; Vertical-left 32x32: builds the 2-tap (even-row) and 3-tap (odd-row) images
; of the full 64-byte above edge, then stores row pairs at offsets 0 and 16
; (stride16q/stride17q address row n and n+16/n+17, which reuse the same data
; shifted one register over). Uses m8-m10 via SCRATCH/PRELOAD — stack-backed
; on x86-32 (the "%1 * mmsize * ARCH_X86_32" allocation).
1196 cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
1198 mova m0, [aq+mmsize*0]
1199 mova m1, [aq+mmsize*1]
1200 mova m2, [aq+mmsize*2]
1201 PALIGNR m6, m1, m0, 2, m5
1202 PALIGNR m7, m1, m0, 4, m5
1205 SCRATCH 6, 8, rsp+0*mmsize
1206 PALIGNR m4, m2, m1, 2, m0
1207 PALIGNR m5, m2, m1, 4, m0
1210 mova m0, [aq+mmsize*3]
1211 PALIGNR m1, m0, m2, 2, m6
1212 PALIGNR m3, m0, m2, 4, m6
; keep the SHIFT_RIGHT shuffle mask live as reg_shuf (m10)
1216 PRELOAD 10, pb_2to15_14_15, shuf
1218 SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
; broadcast last pixel for the right-edge filler (m9)
1222 pshufd m9, m6, q3333
1225 UNSCRATCH 6, 8, rsp+0*mmsize
1227 DEFINE_ARGS dst, stride, cnt, stride16, stride17
1228 mov stride16q, strideq
1231 lea stride17q, [stride16q+strideq]
1233 ; FIXME m8 is unused for avx, so we could save one register here for win64
1236 UNSCRATCH 6, 8, rsp+0*mmsize
; row n (even image m6/m4/m2/m0, odd image m7/m5/m3/m1) and rows n+16/n+17,
; which are the same data advanced one register, padded with m9
1238 mova [dstq+strideq*0+ 0], m6
1239 mova [dstq+strideq*0+16], m4
1240 mova [dstq+strideq*0+32], m2
1241 mova [dstq+strideq*0+48], m0
1242 mova [dstq+strideq*1+ 0], m7
1243 mova [dstq+strideq*1+16], m5
1244 mova [dstq+strideq*1+32], m3
1245 mova [dstq+strideq*1+48], m1
1246 mova [dstq+stride16q+ 0], m4
1247 mova [dstq+stride16q+16], m2
1248 mova [dstq+stride16q+32], m0
1250 mova [dstq+stride16q+48], m9
1252 mova [dstq+stride17q+ 0], m5
1253 mova [dstq+stride17q+16], m3
1254 mova [dstq+stride17q+32], m1
1256 mova [dstq+stride17q+48], m9
1258 lea dstq, [dstq+strideq*2]
; advance both images one word left (avx path, then sse PALIGNR path)
1260 vpalignr m6, m4, m6, 2
1261 vpalignr m4, m2, m4, 2
1262 vpalignr m2, m0, m2, 2
1263 vpalignr m7, m5, m7, 2
1264 vpalignr m5, m3, m5, 2
1265 vpalignr m3, m1, m3, 2
1267 SCRATCH 3, 8, rsp+0*mmsize
1268 %if notcpuflag(ssse3)
1269 SCRATCH 1, 10, rsp+1*mmsize
1271 PALIGNR m3, m4, m6, 2, m1
1273 PALIGNR m3, m2, m4, 2, m1
1275 PALIGNR m3, m0, m2, 2, m1
1277 PALIGNR m3, m5, m7, 2, m1
1279 UNSCRATCH 3, 8, rsp+0*mmsize
1280 SCRATCH 6, 8, rsp+0*mmsize
1281 %if notcpuflag(ssse3)
1282 UNSCRATCH 1, 10, rsp+1*mmsize
1283 SCRATCH 7, 10, rsp+1*mmsize
1285 PALIGNR m6, m3, m5, 2, m7
1287 PALIGNR m6, m1, m3, 2, m7
1289 %if notcpuflag(ssse3)
1290 UNSCRATCH 7, 10, rsp+1*mmsize
1293 SHIFT_RIGHT m1, m1, reg_shuf
1294 SHIFT_RIGHT m0, m0, reg_shuf
; tail: the rightmost 16 bytes of the last rows are all the padding pixel
1299 DEFINE_ARGS dst, stride, stride3
1300 lea stride3q, [strideq*3]
1303 mova [dstq+strideq*0+48], m0
1304 mova [dstq+strideq*1+48], m0
1305 mova [dstq+strideq*2+48], m0
1306 mova [dstq+stride3q +48], m0
1308 lea dstq, [dstq+strideq*4]
; ----------------------------------------------------------------------------
; Vertical-right predictors: combine the corner/above edge (2-tap average for
; even rows, 3-tap for odd rows) with the filtered left edge; each pair of
; rows shifts right, pulling one more left pixel in via pslldq/PALIGNR.
1324 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
1327 PALIGNR m0, m1, 10, m2 ; xyz*abcd
1328 pslldq m1, m0, 2 ; .xyz*abc
1329 pslldq m2, m0, 4 ; ..xyz*ab
1330 LOWPASS 2, 1, 0 ; ..YZ#ABC
1331 pavgw m1, m0 ; ....#ABC
1332 DEFINE_ARGS dst, stride, stride3
1333 lea stride3q, [strideq*3]
1335 movhps [dstq+strideq*0], m1
1336 movhps [dstq+strideq*1], m2
1337 shufps m0, m2, m1, q3210
; ssse3 path uses the dedicated shuffle constant; fallback uses pshuflw
1339 pshufb m2, [pb_4_5_8to13_8x0]
1341 pshuflw m2, m2, q2222
1345 movh [dstq+strideq*2], m0
1346 movh [dstq+stride3q ], m2
1349 cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
1350 movu m1, [aq-2] ; *abcdefg
1351 movu m2, [lq] ; stuvwxyz
1352 mova m0, [aq] ; abcdefgh
1353 PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
1356 PALIGNR m1, m2, 2, m4 ; tuvwxyz*
1357 pslldq m4, m2, 2 ; .stuvwxy
1359 DEFINE_ARGS dst, stride, stride3
1360 lea stride3q, [strideq*3]
; m0 = even rows, m3 = odd rows; each pair pulls one filtered left pixel
; from m4 via PALIGNR by 14
1362 mova [dstq+strideq*0], m0
1363 mova [dstq+strideq*1], m3
1364 PALIGNR m0, m4, 14, m1
1366 PALIGNR m3, m4, 14, m1
1368 mova [dstq+strideq*2], m0
1369 mova [dstq+stride3q ], m3
1370 lea dstq, [dstq+strideq*4]
1371 PALIGNR m0, m4, 14, m1
1373 PALIGNR m3, m4, 14, m1
1375 mova [dstq+strideq*0], m0
1376 mova [dstq+strideq*1], m3
1377 PALIGNR m0, m4, 14, m1
1379 PALIGNR m3, m4, 14, m4
1380 mova [dstq+strideq*2], m0
1381 mova [dstq+stride3q ], m3
1384 cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
1385 movu m1, [aq-2] ; *abcdefg
1386 movu m2, [aq+mmsize-2] ; hijklmno
1387 mova m3, [aq] ; abcdefgh
1388 mova m4, [aq+mmsize] ; ijklmnop
1389 mova m5, [lq+mmsize] ; stuvwxyz
1390 PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
1391 movu m6, [aq+mmsize-4] ; ghijklmn
1396 PALIGNR m1, m5, 2, m7 ; tuvwxyz*
1397 movu m7, [lq+mmsize-2] ; rstuvwxy
1399 movu m5, [lq+2] ; lmnopqrs
1400 pslldq m4, m5, 2 ; .lmnopqr
1401 pslldq m7, m5, 4 ; ..lmnopq
1409 DEFINE_ARGS dst, stride, cnt
; m3/m2 = even row halves, m0/m6 = odd row halves
1413 mova [dstq+strideq*0+ 0], m3
1414 mova [dstq+strideq*0+16], m2
1415 mova [dstq+strideq*1+ 0], m0
1416 mova [dstq+strideq*1+16], m6
1417 lea dstq, [dstq+strideq*2]
; shift both row images right one word, feeding from m7/m5 (filtered left)
1418 PALIGNR m2, m3, 14, m4
1419 PALIGNR m3, m7, 14, m4
1421 PALIGNR m6, m0, 14, m4
1422 PALIGNR m0, m5, 14, m4
; ----------------------------------------------------------------------------
; Vertical-right 32x32: filter the full above edge (A[..], spilled to m8-m11),
; the corner (#A), and the full left edge (L[..], m12-m13); rows n and n+16
; are emitted together (stride16q/stride17q), and every row pair shifts the
; whole 64-byte image right one word via PALIGNR-by-14 chains. SCRATCH slots
; are stack-backed on x86-32 (6 * mmsize reservation).
1428 cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
1429 movu m0, [aq+mmsize*0-2] ; *a[0-6]
1430 movu m1, [aq+mmsize*1-2] ; a[7-14]
1431 movu m2, [aq+mmsize*2-2] ; a[15-22]
1432 movu m3, [aq+mmsize*3-2] ; a[23-30]
1433 mova m4, [aq+mmsize*3+0] ; a[24-31]
1434 movu m5, [aq+mmsize*3-4] ; a[22-29]
1435 LOWPASS 5, 3, 4 ; A[23-30]
1436 SCRATCH 5, 8, rsp+0*mmsize
1438 mova m4, [aq+mmsize*2+0] ; a[16-23]
1439 movu m6, [aq+mmsize*2-4] ; a[14-21]
1440 LOWPASS 6, 2, 4 ; A[15-22]
1441 SCRATCH 6, 9, rsp+1*mmsize
1443 mova m4, [aq+mmsize*1+0] ; a[8-15]
1444 movu m7, [aq+mmsize*1-4] ; a[6-13]
1445 LOWPASS 7, 1, 4 ; A[7-14]
1446 SCRATCH 7, 10, rsp+2*mmsize
1448 mova m4, [aq+mmsize*0+0] ; a[0-7]
1449 mova m5, [lq+mmsize*3+0] ; l[24-31]
1450 PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
1451 LOWPASS 6, 0, 4 ; #A[0-6]
1452 SCRATCH 6, 11, rsp+3*mmsize
; now filter the left edge, highest vector first
1454 PALIGNR m0, m5, 2, m7 ; l[25-31]*
1455 movu m7, [lq+mmsize*3-2] ; l[23-30]
1456 LOWPASS 0, 5, 7 ; L[24-31]
1457 movu m5, [lq+mmsize*2-2] ; l[15-22]
1458 mova m7, [lq+mmsize*2+0] ; l[16-23]
1459 movu m6, [lq+mmsize*2+2] ; l[17-24]
1460 LOWPASS 5, 7, 6 ; L[16-23]
1467 SCRATCH 5, 12, rsp+4*mmsize
1468 SCRATCH 6, 13, rsp+5*mmsize
1469 movu m6, [lq+mmsize*1-2] ; l[7-14]
1470 mova m0, [lq+mmsize*1+0] ; l[8-15]
1471 movu m5, [lq+mmsize*1+2] ; l[9-16]
1472 LOWPASS 6, 0, 5 ; L[8-15]
1473 movu m0, [lq+mmsize*0+2] ; l[1-8]
1474 pslldq m5, m0, 2 ; .l[1-7]
1475 pslldq m7, m0, 4 ; ..l[1-6]
1483 UNSCRATCH 6, 13, rsp+5*mmsize
1484 DEFINE_ARGS dst, stride, stride16, cnt, stride17
1485 mov stride16q, strideq
1489 lea stride17q, [stride16q+strideq]
; even row n: averaged image m4/m1/m2/m3; odd row n: 3-tap image m11/m10/m9/m8
1493 mova [dstq+strideq*0+ 0], m4
1494 mova [dstq+strideq*0+16], m1
1495 mova [dstq+strideq*0+32], m2
1496 mova [dstq+strideq*0+48], m3
1498 mova [dstq+strideq*1+ 0], m11
1499 mova [dstq+strideq*1+16], m10
1500 mova [dstq+strideq*1+32], m9
1501 mova [dstq+strideq*1+48], m8
; rows n+16/n+17: same images advanced one register, fed by the left edge
1503 mova [dstq+stride16q+ 0], m6
1504 mova [dstq+stride16q+16], m4
1505 mova [dstq+stride16q+32], m1
1506 mova [dstq+stride16q+48], m2
1508 mova [dstq+stride17q+ 0], m12
1509 mova [dstq+stride17q+16], m11
1510 mova [dstq+stride17q+32], m10
1511 mova [dstq+stride17q+48], m9
1513 lea dstq, [dstq+strideq*2]
; shift both images right one word for the next row pair
1514 PALIGNR m3, m2, 14, m5
1515 PALIGNR m2, m1, 14, m5
1516 PALIGNR m1, m4, 14, m5
1517 PALIGNR m4, m6, 14, m5
1518 PALIGNR m6, m7, 14, m5
1521 PALIGNR m8, m9, 14, m5
1522 PALIGNR m9, m10, 14, m5
1523 PALIGNR m10, m11, 14, m5
1524 PALIGNR m11, m12, 14, m5
1525 PALIGNR m12, m0, 14, m5
; second phase: restore the spilled filtered vectors and finish lower rows
1532 UNSCRATCH 5, 12, rsp+4*mmsize
1533 UNSCRATCH 4, 11, rsp+3*mmsize
1534 UNSCRATCH 3, 10, rsp+2*mmsize
1535 UNSCRATCH 2, 9, rsp+1*mmsize
1536 UNSCRATCH 1, 8, rsp+0*mmsize
1541 mova [dstq+strideq*0+ 0], m4
1542 mova [dstq+strideq*0+16], m3
1543 mova [dstq+strideq*0+32], m2
1544 mova [dstq+strideq*0+48], m1
1545 mova [dstq+stride16q+ 0], m5
1546 mova [dstq+stride16q+16], m4
1547 mova [dstq+stride16q+32], m3
1548 mova [dstq+stride16q+48], m2
1549 lea dstq, [dstq+strideq*2]
1550 PALIGNR m1, m2, 14, m6
1551 PALIGNR m2, m3, 14, m6
1552 PALIGNR m3, m4, 14, m6
1553 PALIGNR m4, m5, 14, m6
1554 PALIGNR m5, m0, 14, m6
; ----------------------------------------------------------------------------
; Horizontal-up predictors (template macro): built from the LEFT edge only.
; SBUTTERFLY interleaves the 2-tap-averaged and 3-tap-filtered left pixels,
; and each row advances two interleaved words; past the edge everything is
; the last left pixel.
1569 %macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
1570 cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
1571 movh m0, [lq] ; abcd
; pad with the last pixel: ssse3 pshufb vs pshufhw fallback
1573 pshufb m0, [pb_0to7_67x4] ; abcddddd
1576 pshufhw m0, m0, q3333 ; abcddddd
1578 psrldq m1, m0, 2 ; bcddddd.
1579 psrldq m2, m0, 4 ; cddddd..
1580 LOWPASS 2, 1, 0 ; BCDddd..
1581 pavgw m1, m0 ; abcddddd
; interleave avg (m1) and lowpass (m2) words: row = aB bC cD dd ...
1582 SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
1583 PALIGNR m2, m1, 4, m0 ; bCcDdddd
1584 DEFINE_ARGS dst, stride, stride3
1585 lea stride3q, [strideq*3]
1587 movh [dstq+strideq*0], m1 ; aBbC
1588 movh [dstq+strideq*1], m2 ; bCcD
1589 movhps [dstq+strideq*2], m1 ; cDdd
1590 movhps [dstq+stride3q ], m2 ; dddd
1593 cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
1596 mova m3, [pb_2to15_14_15]
1598 SHIFT_RIGHTx2 m1, m2, m0, m3
1601 SBUTTERFLY wd, 1, 2, 0
; m0/m3 are rows offset by 1/3 interleaved pairs from m1/m2
1602 shufps m0, m1, m2, q1032
1603 pshufd m3, m2, q3332
1604 DEFINE_ARGS dst, stride, stride3
1605 lea stride3q, [strideq*3]
1607 mova [dstq+strideq *0], m1
1608 mova [dstq+strideq *2], m0
1609 mova [dstq+strideq *4], m2
1610 mova [dstq+stride3q*2], m3
; advance the interleaved stream by two pairs for the odd rows
1613 vpalignr m1, m2, m1, 4
1615 PALIGNR m0, m2, m1, 4, m3
1618 pshufd m2, m2, q3321
1619 shufps m0, m1, m2, q1032
1620 pshufd m3, m2, q3332
1621 mova [dstq+strideq *0], m1
1622 mova [dstq+strideq *2], m0
1623 mova [dstq+strideq *4], m2
1624 mova [dstq+stride3q*2], m3
1627 cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
1629 mova m3, [lq+mmsize]
1634 SBUTTERFLY wd, 1, 2, 0
1636 mova m5, [pb_2to15_14_15]
1638 SHIFT_RIGHTx2 m0, m4, m3, m5
1641 SBUTTERFLY wd, 3, 4, 5
; m0 = broadcast last pixel, used as filler once the edge is exhausted
1642 pshufd m0, m0, q3333
1643 DEFINE_ARGS dst, stride, stride3, cnt
1644 lea stride3q, [strideq*3]
; rows 0/4/8/12: consecutive registers of the interleaved stream m1..m4,m0
1648 mova [dstq+strideq *0+ 0], m1
1649 mova [dstq+strideq *0+16], m2
1650 mova [dstq+strideq *4+ 0], m2
1651 mova [dstq+strideq *4+16], m3
1652 mova [dstq+strideq *8+ 0], m3
1653 mova [dstq+strideq *8+16], m4
1654 mova [dstq+stride3q*4+ 0], m4
1655 mova [dstq+stride3q*4+16], m0
; advance the stream by one pair (4 bytes) for the next set of rows
1658 vpalignr m1, m2, m1, 4
1659 vpalignr m2, m3, m2, 4
1660 vpalignr m3, m4, m3, 4
1661 vpalignr m4, m0, m4, 4
1663 PALIGNR m5, m2, m1, 4, m6
1665 PALIGNR m5, m3, m2, 4, m6
1667 PALIGNR m5, m4, m3, 4, m6
1669 PALIGNR m5, m0, m4, 4, m6
; void ff_vp9_ipred_hu_32x32_16_*(uint16_t *dst, ptrdiff_t stride,
;                                 const uint8_t *l, const uint8_t *a);
; 32x32 hu predictor. On x86-32 there are not enough xmm registers, so
; SCRATCH/UNSCRATCH spill to %1 stack slots (on x86-64 they map to xmm8+).
; NOTE(review): several %if blocks below lack their matching %else/%endif
; in this excerpt — they appear elided by the extraction, not missing in
; the real file.
1676 cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
1677 %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
; load the 32 left pixels in 4 groups of 8, each with +1/+2 shifted copies
; feeding the (elided) LOWPASS filter, then interleave with SBUTTERFLY
1678 mova m2, [lq+mmsize*0+0]
1679 movu m1, [lq+mmsize*0+2]
1680 movu m0, [lq+mmsize*0+4]
1683 SBUTTERFLY wd, 1, 0, 2
1684 SCRATCH 1, 8, rsp+0*mmsize
1685 mova m4, [lq+mmsize*1+0]
1686 movu m3, [lq+mmsize*1+2]
1687 movu m2, [lq+mmsize*1+4]
1690 SBUTTERFLY wd, 3, 2, 4
1691 mova m6, [lq+mmsize*2+0]
1692 movu m5, [lq+mmsize*2+2]
1693 movu m4, [lq+mmsize*2+4]
1696 SBUTTERFLY wd, 5, 4, 6
1697 mova m7, [lq+mmsize*3+0]
1698 SCRATCH 0, 9, rsp+1*mmsize
1700 mova m0, [pb_2to15_14_15]
1702 SHIFT_RIGHTx2 m1, m6, m7, m0
1705 SBUTTERFLY wd, 7, 6, 0
; m1 = last left pixel replicated — fills everything past the diagonal end
1706 pshufd m1, m1, q3333
1707 UNSCRATCH 0, 9, rsp+1*mmsize
; precompute the odd row strides used by the 64-byte-wide stores below
1708 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
1709 lea stride3q, [strideq*3]
1710 lea stride4q, [strideq*4]
1711 lea stride28q, [stride4q*8]
1712 lea stride20q, [stride4q*5]
1713 sub stride28q, stride4q ; stride28 = 32*stride - 4*stride
; loop (label/cnt init elided): swap the replicated-pixel reg with the
; spilled first output reg via the two stack slots
1720 mova [rsp+1*mmsize], m1
1721 mova m1, [rsp+0*mmsize]
; rows 0, 4, 8 — each row is 4 stores of 16 bytes (32 pixels)
1723 mova [dstq+strideq *0+ 0], m1
1724 mova [dstq+strideq *0+16], m0
1725 mova [dstq+strideq *0+32], m3
1726 mova [dstq+strideq *0+48], m2
1727 mova [dstq+stride4q*1+ 0], m0
1728 mova [dstq+stride4q*1+16], m3
1729 mova [dstq+stride4q*1+32], m2
1730 mova [dstq+stride4q*1+48], m5
1731 mova [dstq+stride4q*2+ 0], m3
1732 mova [dstq+stride4q*2+16], m2
1733 mova [dstq+stride4q*2+32], m5
1734 mova [dstq+stride4q*2+48], m4
; slide the first half of the register chain 2 pixels along the diagonal
1736 vpalignr m1, m0, m1, 4
1737 vpalignr m0, m3, m0, 4
1738 vpalignr m3, m2, m3, 4
; pre-avx path: needs extra spill slots for the PALIGNR scratch regs
1740 SCRATCH 6, 9, rsp+2*mmsize
1741 %if notcpuflag(ssse3)
1742 SCRATCH 7, 10, rsp+3*mmsize
1744 PALIGNR m6, m0, m1, 4, m7
1746 PALIGNR m6, m3, m0, 4, m7
1748 PALIGNR m6, m2, m3, 4, m7
1750 UNSCRATCH 6, 9, rsp+2*mmsize
1751 SCRATCH 0, 9, rsp+2*mmsize
1752 %if notcpuflag(ssse3)
1753 UNSCRATCH 7, 10, rsp+3*mmsize
1754 SCRATCH 3, 10, rsp+3*mmsize
; swap stack slots back so the next iteration sees the updated chain
1760 mova [rsp+0*mmsize], m1
1761 mova m1, [rsp+1*mmsize] ; m1 = replicated last pixel again
; rows 12, 16, 20, 24, 28 — the repeated m1 stores at the bottom are the
; intended replication of the last left pixel past the diagonal
1763 mova [dstq+stride3q*4+ 0], m2
1764 mova [dstq+stride3q*4+16], m5
1765 mova [dstq+stride3q*4+32], m4
1766 mova [dstq+stride3q*4+48], m7
1767 mova [dstq+stride4q*4+ 0], m5
1768 mova [dstq+stride4q*4+16], m4
1769 mova [dstq+stride4q*4+32], m7
1770 mova [dstq+stride4q*4+48], m6
1771 mova [dstq+stride20q + 0], m4
1772 mova [dstq+stride20q +16], m7
1773 mova [dstq+stride20q +32], m6
1774 mova [dstq+stride20q +48], m1
1775 mova [dstq+stride3q*8+ 0], m7
1776 mova [dstq+stride3q*8+16], m6
1777 mova [dstq+stride3q*8+32], m1
1778 mova [dstq+stride3q*8+48], m1
1779 mova [dstq+stride28q + 0], m6
1780 mova [dstq+stride28q +16], m1
1781 mova [dstq+stride28q +32], m1
1782 mova [dstq+stride28q +48], m1
; slide the second half of the register chain 2 pixels
1784 vpalignr m2, m5, m2, 4
1785 vpalignr m5, m4, m5, 4
1786 vpalignr m4, m7, m4, 4
1787 vpalignr m7, m6, m7, 4
1788 vpalignr m6, m1, m6, 4
1790 PALIGNR m0, m5, m2, 4, m3
1792 PALIGNR m0, m4, m5, 4, m3
1794 PALIGNR m0, m7, m4, 4, m3
1796 PALIGNR m0, m6, m7, 4, m3
1798 PALIGNR m0, m1, m6, 4, m3
1800 UNSCRATCH 0, 9, rsp+2*mmsize
1801 %if notcpuflag(ssse3)
1802 UNSCRATCH 3, 10, rsp+3*mmsize
; void ff_vp9_ipred_hd_4x4_16_*(uint16_t *dst, ptrdiff_t stride,
;                               const uint8_t *l, const uint8_t *a);
; 4x4 horizontal-down ("hd") predictor. The enclosing macro header and the
; load/LOWPASS lines building m1 (and m2's source) are elided from this
; excerpt. Rows are written bottom-up (stride3 first), each 4-pixel row
; being one 8-byte half of m1/m2.
1819 cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
1827 DEFINE_ARGS dst, stride, stride3
1828 lea stride3q, [strideq*3]
1830 movh [dstq+stride3q ], m1
1831 movhps [dstq+strideq*1], m1
; shift the sample chain 2 pixels to obtain the remaining two rows
1833 PALIGNR m2, m1, 4, m0
1834 movh [dstq+strideq*2], m2
1835 movhps [dstq+strideq*0], m2
; void ff_vp9_ipred_hd_8x8_16_*(uint16_t *dst, ptrdiff_t stride,
;                               const uint8_t *l, const uint8_t *a);
; 8x8 hd predictor. Loads and LOWPASS lines are elided from this excerpt.
1838 cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
; +1/+2 word-shifted copies of the edge feeding the 3-tap filter
1841 PALIGNR m2, m1, m0, 2, m3
1842 PALIGNR m3, m1, m0, 4, m4
1845 SBUTTERFLY wd, 2, 3, 0
; mstride appears to be the negated stride (negation elided from this
; view — TODO confirm); dst8 points 8 rows down, rows written bottom-up
1849 DEFINE_ARGS dst8, mstride, cnt
1850 lea dst8q, [dst8q+mstrideq*8]
; loop body (label/cnt init elided): store two rows per iteration...
1856 mova [dst8q+mstrideq*0], m2
1857 mova [dst8q+mstrideq*4], m3
; ...then slide the chain 2 pixels (avx form, or PALIGNR emulation —
; the %if/%else guards are elided here)
1859 vpalignr m2, m3, m2, 4
1860 vpalignr m3, m1, m3, 4
1862 PALIGNR m0, m3, m2, 4, m4
1864 PALIGNR m0, m1, m3, 4, m4
; void ff_vp9_ipred_hd_16x16_16_*(uint16_t *dst, ptrdiff_t stride,
;                                 const uint8_t *l, const uint8_t *a);
; 16x16 hd predictor; uses both the left edge (lq) and, unlike hu, the
; above edge (aq). Several load/LOWPASS lines are elided from this excerpt.
1872 cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
1878 mova m4, [lq+mmsize]
1880 PALIGNR m3, m5, m4, 2, m6
1881 PALIGNR m2, m5, m4, 4, m6
1884 SBUTTERFLY wd, 1, 0, 4
1885 SBUTTERFLY wd, 3, 2, 4
; unaligned read crossing into the second half of the above edge
1889 movu m5, [aq+mmsize-2]
; mstride appears to be the negated stride (negation elided — TODO
; confirm); dst is advanced 16 rows down so rows are written bottom-up
1893 DEFINE_ARGS dst, mstride, mstride3, cnt
1894 lea dstq, [dstq+mstrideq*8]
1895 lea dstq, [dstq+mstrideq*8]
1897 lea mstride3q, [mstrideq*3]
; loop body (label/cnt init elided): 4 rows of 16 pixels per iteration
1902 mova [dstq+mstride3q*4+ 0], m2
1903 mova [dstq+mstride3q*4+16], m4
1904 mova [dstq+mstrideq *8+ 0], m3
1905 mova [dstq+mstrideq *8+16], m2
1906 mova [dstq+mstrideq *4+ 0], m0
1907 mova [dstq+mstrideq *4+16], m3
1908 mova [dstq+mstrideq *0+ 0], m1
1909 mova [dstq+mstrideq *0+16], m0
; slide the whole sample chain 2 pixels for the next 4 rows (avx form,
; then the PALIGNR emulation; %if/%else guards elided)
1911 vpalignr m1, m0, m1, 4
1912 vpalignr m0, m3, m0, 4
1913 vpalignr m3, m2, m3, 4
1914 vpalignr m2, m4, m2, 4
1915 vpalignr m4, m5, m4, 4
1917 PALIGNR m6, m0, m1, 4, m7
1919 PALIGNR m6, m3, m0, 4, m7
1921 PALIGNR m6, m2, m3, 4, m7
1923 PALIGNR m6, m4, m2, 4, m7
1925 PALIGNR m6, m5, m4, 4, m7
; void ff_vp9_ipred_hd_32x32_16_*(uint16_t *dst, ptrdiff_t stride,
;                                 const uint8_t *l, const uint8_t *a);
; 32x32 hd predictor. Uses 14 xmm regs on x86-64; on x86-32 the extra
; state lives in 10 stack slots via SCRATCH/UNSCRATCH, and the main loop
; is split in two (see comment below). NOTE(review): this function
; continues past the end of this excerpt, and several %if/%else/%endif
; and loop-label lines within it appear elided by the extraction.
1933 cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
1934 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
; filter the 32 left pixels in 8-pixel groups (aligned + 2/4-byte-shifted
; loads feed the elided LOWPASS), interleaving results with SBUTTERFLY
1935 mova m2, [lq+mmsize*0+0]
1936 movu m1, [lq+mmsize*0+2]
1937 movu m0, [lq+mmsize*0+4]
1940 SBUTTERFLY wd, 1, 0, 2
1941 mova m4, [lq+mmsize*1+0]
1942 movu m3, [lq+mmsize*1+2]
1943 movu m2, [lq+mmsize*1+4]
1946 SBUTTERFLY wd, 3, 2, 4
; spill the first four result regs (stack on x86-32, xmm8-11 on x86-64)
1947 SCRATCH 0, 8, rsp+0*mmsize
1948 SCRATCH 1, 9, rsp+1*mmsize
1949 SCRATCH 2, 10, rsp+2*mmsize
1950 SCRATCH 3, 11, rsp+3*mmsize
1951 mova m6, [lq+mmsize*2+0]
1952 movu m5, [lq+mmsize*2+2]
1953 movu m4, [lq+mmsize*2+4]
1956 SBUTTERFLY wd, 5, 4, 6
; last left group joins the above edge: read aq-2 to get the corner pixel
1957 mova m0, [lq+mmsize*3+0]
1958 movu m1, [aq+mmsize*0-2]
1959 PALIGNR m7, m1, m0, 2, m2
1960 PALIGNR m6, m1, m0, 4, m2
1963 SBUTTERFLY wd, 7, 6, 0
; filter the 64-pixel above edge, one 8-pixel group at a time
1964 mova m2, [aq+mmsize*0+0]
1965 movu m0, [aq+mmsize*0+2]
1967 movu m1, [aq+mmsize*1-2]
1968 mova m2, [aq+mmsize*1+0]
1969 movu m3, [aq+mmsize*1+2]
1971 SCRATCH 6, 12, rsp+6*mmsize
1972 SCRATCH 7, 13, rsp+7*mmsize
1973 movu m2, [aq+mmsize*2-2]
1974 mova m3, [aq+mmsize*2+0]
1975 movu m6, [aq+mmsize*2+2]
1977 movu m3, [aq+mmsize*3-2]
1981 UNSCRATCH 6, 12, rsp+6*mmsize
1982 UNSCRATCH 7, 13, rsp+7*mmsize
; stash m4/m5 unconditionally; m6/m7 were already written to slots 6/7
; by the SCRATCH above when on x86-32
1984 mova [rsp+4*mmsize], m4
1985 mova [rsp+5*mmsize], m5
1986 ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
1987 ; to do it again here
; odd row strides for the 64-byte-wide stores in the loop below
1989 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
1991 lea stride3q, [strideq*3]
1993 lea stride4q, [strideq*4]
1994 lea stride28q, [stride4q*8]
1995 lea stride20q, [stride4q*5]
1996 sub stride28q, stride4q ; stride28 = 32*stride - 4*stride
2000 ; x86-32 doesn't have enough registers, so on that platform, we split
2001 ; the loop in 2... Otherwise you spend most of the loop (un)scratching
; --- x86-64 loop (label/cnt init elided): 8 rows of 32 pixels per pass,
; written bottom-up; m8-m11 hold the deeper (left-edge) samples
2004 mova [dstq+stride28q + 0], m9
2005 mova [dstq+stride28q +16], m8
2006 mova [dstq+stride28q +32], m11
2007 mova [dstq+stride28q +48], m10
2008 mova [dstq+stride3q*8+ 0], m8
2009 mova [dstq+stride3q*8+16], m11
2010 mova [dstq+stride3q*8+32], m10
2011 mova [dstq+stride3q*8+48], m5
2012 mova [dstq+stride20q + 0], m11
2013 mova [dstq+stride20q +16], m10
2014 mova [dstq+stride20q +32], m5
2015 mova [dstq+stride20q +48], m4
2016 mova [dstq+stride4q*4+ 0], m10
2017 mova [dstq+stride4q*4+16], m5
2018 mova [dstq+stride4q*4+32], m4
2019 mova [dstq+stride4q*4+48], m7
2021 mova [dstq+stride3q*4+ 0], m5
2022 mova [dstq+stride3q*4+16], m4
2023 mova [dstq+stride3q*4+32], m7
2024 mova [dstq+stride3q*4+48], m6
2025 mova [dstq+strideq* 8+ 0], m4
2026 mova [dstq+strideq* 8+16], m7
2027 mova [dstq+strideq* 8+32], m6
2028 mova [dstq+strideq* 8+48], m0
2029 mova [dstq+strideq* 4+ 0], m7
2030 mova [dstq+strideq* 4+16], m6
2031 mova [dstq+strideq* 4+32], m0
2032 mova [dstq+strideq* 4+48], m1
2033 mova [dstq+strideq* 0+ 0], m6
2034 mova [dstq+strideq* 0+16], m0
2035 mova [dstq+strideq* 0+32], m1
2036 mova [dstq+strideq* 0+48], m2
; slide the full 12-register diagonal chain 2 pixels for the next pass
; (avx 3-operand form; the pre-avx emulation follows)
2040 vpalignr m9, m8, m9, 4
2041 vpalignr m8, m11, m8, 4
2042 vpalignr m11, m10, m11, 4
2043 vpalignr m10, m5, m10, 4
2045 vpalignr m5, m4, m5, 4
2046 vpalignr m4, m7, m4, 4
2047 vpalignr m7, m6, m7, 4
2048 vpalignr m6, m0, m6, 4
2049 vpalignr m0, m1, m0, 4
2050 vpalignr m1, m2, m1, 4
2051 vpalignr m2, m3, m2, 4
2054 PALIGNR m12, m8, m9, 4, m13
2056 PALIGNR m12, m11, m8, 4, m13
2058 PALIGNR m12, m10, m11, 4, m13
2060 PALIGNR m12, m5, m10, 4, m13
; 'sh' gives the spilled reg a reg_sh alias usable as a PALIGNR source
2063 SCRATCH 3, 12, rsp+8*mmsize, sh
2064 %if notcpuflag(ssse3)
2065 SCRATCH 2, 13, rsp+9*mmsize
2067 PALIGNR m3, m4, m5, 4, m2
2069 PALIGNR m3, m7, m4, 4, m2
2071 PALIGNR m3, m6, m7, 4, m2
2073 PALIGNR m3, m0, m6, 4, m2
2075 PALIGNR m3, m1, m0, 4, m2
2077 %if notcpuflag(ssse3)
2078 UNSCRATCH 2, 13, rsp+9*mmsize
2079 SCRATCH 0, 13, rsp+9*mmsize
2081 PALIGNR m3, m2, m1, 4, m0
2083 PALIGNR m3, reg_sh, m2, 4, m0
2085 %if notcpuflag(ssse3)
2086 UNSCRATCH 0, 13, rsp+9*mmsize
2088 UNSCRATCH 3, 12, rsp+8*mmsize, sh
; --- x86-32 second half-loop: restore the spilled left-edge state and
; re-run over the lower 16 output rows
2095 UNSCRATCH 0, 8, rsp+0*mmsize
2096 UNSCRATCH 1, 9, rsp+1*mmsize
2097 UNSCRATCH 2, 10, rsp+2*mmsize
2098 UNSCRATCH 3, 11, rsp+3*mmsize
2099 mova m4, [rsp+4*mmsize]
2100 mova m5, [rsp+5*mmsize]
2101 mova m6, [rsp+6*mmsize]
2102 mova m7, [rsp+7*mmsize]
; advance dst by 20 rows via a temporary stride5 arg name
2103 DEFINE_ARGS dst, stride, stride5, stride3
2104 lea stride5q, [strideq*5]
2105 lea dstq, [dstq+stride5q*4]
2106 DEFINE_ARGS dst, stride, cnt, stride3
; loop body (label/cnt init elided): 4 rows of 32 pixels, bottom-up
2109 mova [dstq+stride3q*4+ 0], m1
2110 mova [dstq+stride3q*4+16], m0
2111 mova [dstq+stride3q*4+32], m3
2112 mova [dstq+stride3q*4+48], m2
2113 mova [dstq+strideq* 8+ 0], m0
2114 mova [dstq+strideq* 8+16], m3
2115 mova [dstq+strideq* 8+32], m2
2116 mova [dstq+strideq* 8+48], m5
2117 mova [dstq+strideq* 4+ 0], m3
2118 mova [dstq+strideq* 4+16], m2
2119 mova [dstq+strideq* 4+32], m5
2120 mova [dstq+strideq* 4+48], m4
2121 mova [dstq+strideq* 0+ 0], m2
2122 mova [dstq+strideq* 0+16], m5
2123 mova [dstq+strideq* 0+32], m4
2124 mova [dstq+strideq* 0+48], m7
; slide the 8-register chain 2 pixels for the next pass
2127 vpalignr m1, m0, m1, 4
2128 vpalignr m0, m3, m0, 4
2129 vpalignr m3, m2, m3, 4
2130 vpalignr m2, m5, m2, 4
2131 vpalignr m5, m4, m5, 4
2132 vpalignr m4, m7, m4, 4
2133 vpalignr m7, m6, m7, 4
2135 SCRATCH 6, 12, rsp+8*mmsize, sh
2136 %if notcpuflag(ssse3)
2137 SCRATCH 7, 13, rsp+9*mmsize
2139 PALIGNR m6, m0, m1, 4, m7
2141 PALIGNR m6, m3, m0, 4, m7
2143 PALIGNR m6, m2, m3, 4, m7
2145 PALIGNR m6, m5, m2, 4, m7
2147 PALIGNR m6, m4, m5, 4, m7
2149 %if notcpuflag(ssse3)
2150 UNSCRATCH 7, 13, rsp+9*mmsize
2151 SCRATCH 5, 13, rsp+9*mmsize
2153 PALIGNR m6, m7, m4, 4, m5
2155 PALIGNR m6, reg_sh, m7, 4, m5
2157 %if notcpuflag(ssse3)
2158 UNSCRATCH 5, 13, rsp+9*mmsize
2160 UNSCRATCH 6, 12, rsp+8*mmsize, sh