1 ;******************************************************************************
2 ;* VP9 Intra prediction SIMD optimizations
4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
5 ;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; pshufb / byte-selection constants for the 16 bpp intra predictors below.
32 pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 ; drop first word, repeat last: abcdefgh -> bcdefghh (see SHIFT_RIGHT)
33 pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 ; select words 2,4,5,6 into lanes 0-3, zero the rest (used by vr_4x4)
34 pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 ; keep words 0-3, replicate word 3: abcd.... -> abcddddd (used by hu_4x4)
43 ; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
44 ; only 3 registers on x86-32, which would make it one cycle faster, but that
45 ; would make the code quite a bit uglier...
; 4x4 vertical prediction, 16 bpp: every output row is a copy of the above row in m0.
; NOTE(review): elided listing — the load of m0 (presumably from [aq]) and the RET are not shown.
86 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
89 DEFINE_ARGS dst, stride, stride3
90 lea stride3q, [strideq*3]
91 mova [dstq+strideq*0], m0
92 mova [dstq+strideq*1], m0
93 mova [dstq+strideq*2], m0
94 mova [dstq+stride3q ], m0
; 8x8 vertical prediction, 16 bpp: all 8 rows = the above row held in m0 (8 words = 16 bytes).
; NOTE(review): elided listing — the load of m0 and the RET are not shown.
98 cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
101 DEFINE_ARGS dst, stride, stride3
102 lea stride3q, [strideq*3]
103 mova [dstq+strideq*0], m0
104 mova [dstq+strideq*1], m0
105 mova [dstq+strideq*2], m0
106 mova [dstq+stride3q ], m0
107 lea dstq, [dstq+strideq*4] ; advance to rows 4-7
108 mova [dstq+strideq*0], m0
109 mova [dstq+strideq*1], m0
110 mova [dstq+strideq*2], m0
111 mova [dstq+stride3q ], m0
; 16x16 vertical prediction, 16 bpp: each row = above row (m0|m1, 32 bytes), 4 rows per loop pass.
; NOTE(review): elided listing — m0/m1 loads, cnt init, the loop label/branch and RET are not shown.
115 cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
119 DEFINE_ARGS dst, stride, stride3, cnt
120 lea stride3q, [strideq*3]
123 mova [dstq+strideq*0+ 0], m0
124 mova [dstq+strideq*0+16], m1
125 mova [dstq+strideq*1+ 0], m0
126 mova [dstq+strideq*1+16], m1
127 mova [dstq+strideq*2+ 0], m0
128 mova [dstq+strideq*2+16], m1
129 mova [dstq+stride3q + 0], m0
130 mova [dstq+stride3q +16], m1
131 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; 32x32 vertical prediction, 16 bpp: each row = above row (m0..m3, 64 bytes), 2 rows per loop pass.
; NOTE(review): elided listing — cnt init, the loop label/branch and RET are not shown.
137 cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
139 mova m0, [aq+mmsize*0]
140 mova m1, [aq+mmsize*1]
141 mova m2, [aq+mmsize*2]
142 mova m3, [aq+mmsize*3]
143 DEFINE_ARGS dst, stride, cnt
146 mova [dstq+strideq*0+ 0], m0
147 mova [dstq+strideq*0+16], m1
148 mova [dstq+strideq*0+32], m2
149 mova [dstq+strideq*0+48], m3
150 mova [dstq+strideq*1+ 0], m0
151 mova [dstq+strideq*1+16], m1
152 mova [dstq+strideq*1+32], m2
153 mova [dstq+strideq*1+48], m3
154 lea dstq, [dstq+strideq*2] ; advance 2 rows per (elided) loop iteration
; 4x4 horizontal prediction, 16 bpp: each row is one replicated left pixel (m0..m3).
; NOTE(review): elided listing — the building of m0..m3 from [lq] and the RET are not shown.
160 cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
162 DEFINE_ARGS dst, stride, stride3
163 lea stride3q, [strideq*3]
168 mova [dstq+strideq*0], m0
169 mova [dstq+strideq*1], m1
170 mova [dstq+strideq*2], m2
171 mova [dstq+stride3q ], m3
; 8x8 horizontal prediction, 16 bpp: two rows at a time from splatted left pixels in m0/m1.
; NOTE(review): elided listing — m0/m1 are (re)built from [lq] in the gaps between each store
; pair (original lines 184-185, 190-191, 194-195); loads and RET are not shown.
175 cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
177 DEFINE_ARGS dst, stride, stride3
178 lea stride3q, [strideq*3]
182 mova [dstq+strideq*0], m0
183 mova [dstq+strideq*1], m1
186 mova [dstq+strideq*2], m0
187 mova [dstq+stride3q ], m1
188 lea dstq, [dstq+strideq*4]
192 mova [dstq+strideq*0], m0
193 mova [dstq+strideq*1], m1
196 mova [dstq+strideq*2], m0
197 mova [dstq+stride3q ], m1
; 16x16 horizontal prediction, 16 bpp: rows 0-3 of each pass come from splatted left pixels
; m0..m3; each 16-byte register is stored twice to fill the 32-byte row.
; NOTE(review): elided listing — cnt init, the splat of m0..m3, loop branch and RET not shown.
201 cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
203 lea stride3q, [strideq*3]
211 mova [dstq+strideq*0+ 0], m0
212 mova [dstq+strideq*0+16], m0
213 mova [dstq+strideq*1+ 0], m1
214 mova [dstq+strideq*1+16], m1
215 mova [dstq+strideq*2+ 0], m2
216 mova [dstq+strideq*2+16], m2
217 mova [dstq+stride3q + 0], m3
218 mova [dstq+stride3q +16], m3
219 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; 32x32 horizontal prediction, 16 bpp: as 16x16 but each splatted register fills a 64-byte row.
; NOTE(review): elided listing — cnt init, the splat of m0..m3, loop branch and RET not shown.
225 cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
227 lea stride3q, [strideq*3]
235 mova [dstq+strideq*0+ 0], m0
236 mova [dstq+strideq*0+16], m0
237 mova [dstq+strideq*0+32], m0
238 mova [dstq+strideq*0+48], m0
239 mova [dstq+strideq*1+ 0], m1
240 mova [dstq+strideq*1+16], m1
241 mova [dstq+strideq*1+32], m1
242 mova [dstq+strideq*1+48], m1
243 mova [dstq+strideq*2+ 0], m2
244 mova [dstq+strideq*2+16], m2
245 mova [dstq+strideq*2+32], m2
246 mova [dstq+strideq*2+48], m2
247 mova [dstq+stride3q + 0], m3
248 mova [dstq+stride3q +16], m3
249 mova [dstq+stride3q +32], m3
250 mova [dstq+stride3q +48], m3
251 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; 4x4 DC prediction, 16 bpp: m0 = rounded average of the 4 left + 4 above pixels, splatted.
; NOTE(review): elided listing — the sum/shift/splat that builds m0 and the RET are not shown.
257 cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
260 DEFINE_ARGS dst, stride, stride3
261 lea stride3q, [strideq*3]
268 mova [dstq+strideq*0], m0
269 mova [dstq+strideq*1], m0
270 mova [dstq+strideq*2], m0
271 mova [dstq+stride3q ], m0
; 8x8 DC prediction, 16 bpp: m0 = average of 8 left + 8 above pixels, splatted to all 8 words.
; NOTE(review): elided listing — the horizontal sum before the splat and the RET are not shown.
275 cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
278 DEFINE_ARGS dst, stride, stride3
279 lea stride3q, [strideq*3]
287 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes (high-half splat elided)
289 mova [dstq+strideq*0], m0
290 mova [dstq+strideq*1], m0
291 mova [dstq+strideq*2], m0
292 mova [dstq+stride3q ], m0
293 lea dstq, [dstq+strideq*4]
294 mova [dstq+strideq*0], m0
295 mova [dstq+strideq*1], m0
296 mova [dstq+strideq*2], m0
297 mova [dstq+stride3q ], m0
; 16x16 DC prediction, 16 bpp: sum both 8-word halves of left and above, average, splat, loop.
; NOTE(review): elided listing — initial load of m0, horizontal reduction, cnt init, loop
; branch and RET are not shown.
301 cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
303 paddw m0, [lq+mmsize] ; accumulate second half of left column
305 paddw m0, [aq+mmsize] ; accumulate second half of above row
306 DEFINE_ARGS dst, stride, stride3, cnt
307 lea stride3q, [strideq*3]
316 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes
319 mova [dstq+strideq*0+ 0], m0
320 mova [dstq+strideq*0+16], m0
321 mova [dstq+strideq*1+ 0], m0
322 mova [dstq+strideq*1+16], m0
323 mova [dstq+strideq*2+ 0], m0
324 mova [dstq+strideq*2+16], m0
325 mova [dstq+stride3q + 0], m0
326 mova [dstq+stride3q +16], m0
327 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; 32x32 DC prediction, 16 bpp: sum all four 8-word vectors of left and above, average, splat.
; NOTE(review): elided listing — horizontal reduction, cnt init, loop branch and RET not shown.
333 cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
334 mova m0, [lq+mmsize*0]
335 paddw m0, [lq+mmsize*1]
336 paddw m0, [lq+mmsize*2]
337 paddw m0, [lq+mmsize*3]
338 paddw m0, [aq+mmsize*0]
339 paddw m0, [aq+mmsize*1]
340 paddw m0, [aq+mmsize*2]
341 paddw m0, [aq+mmsize*3]
342 DEFINE_ARGS dst, stride, stride3, cnt
343 lea stride3q, [strideq*3]
352 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes
355 mova [dstq+strideq*0+ 0], m0
356 mova [dstq+strideq*0+16], m0
357 mova [dstq+strideq*0+32], m0
358 mova [dstq+strideq*0+48], m0
359 mova [dstq+strideq*1+ 0], m0
360 mova [dstq+strideq*1+16], m0
361 mova [dstq+strideq*1+32], m0
362 mova [dstq+strideq*1+48], m0
363 lea dstq, [dstq+strideq*2] ; advance 2 rows per (elided) loop iteration
; Macro-generated 4x4 DC variant (%1 = variant name; presumably dc_left/dc_top — the
; enclosing %macro header is elided from this listing, as are the averaging code and RET).
370 cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
372 DEFINE_ARGS dst, stride, stride3
373 lea stride3q, [strideq*3]
380 mova [dstq+strideq*0], m0
381 mova [dstq+strideq*1], m0
382 mova [dstq+strideq*2], m0
383 mova [dstq+stride3q ], m0
; Macro-generated 8x8 DC variant averaging only one edge (%1 = variant name; presumably
; dc_left/dc_top — macro header, reduction code and RET elided from this listing).
387 cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
389 DEFINE_ARGS dst, stride, stride3
390 lea stride3q, [strideq*3]
398 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes
400 mova [dstq+strideq*0], m0
401 mova [dstq+strideq*1], m0
402 mova [dstq+strideq*2], m0
403 mova [dstq+stride3q ], m0
404 lea dstq, [dstq+strideq*4]
405 mova [dstq+strideq*0], m0
406 mova [dstq+strideq*1], m0
407 mova [dstq+strideq*2], m0
408 mova [dstq+stride3q ], m0
; Macro-generated 16x16 single-edge DC variant (%2 = lq or aq, the edge being averaged;
; macro header, initial load, reduction, cnt init, loop branch and RET elided).
412 cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
414 paddw m0, [%2+mmsize] ; accumulate the second half of the chosen edge
415 DEFINE_ARGS dst, stride, stride3, cnt
416 lea stride3q, [strideq*3]
425 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes
428 mova [dstq+strideq*0+ 0], m0
429 mova [dstq+strideq*0+16], m0
430 mova [dstq+strideq*1+ 0], m0
431 mova [dstq+strideq*1+16], m0
432 mova [dstq+strideq*2+ 0], m0
433 mova [dstq+strideq*2+16], m0
434 mova [dstq+stride3q + 0], m0
435 mova [dstq+stride3q +16], m0
436 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; Macro-generated 32x32 single-edge DC variant (%2 = lq or aq; macro header, reduction,
; cnt init, loop branch and RET elided from this listing).
442 cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
443 mova m0, [%2+mmsize*0]
444 paddw m0, [%2+mmsize*1]
445 paddw m0, [%2+mmsize*2]
446 paddw m0, [%2+mmsize*3]
447 DEFINE_ARGS dst, stride, cnt
456 pshuflw m0, m0, q0000 ; splat the DC word into the low 4 lanes
459 mova [dstq+strideq*0+ 0], m0
460 mova [dstq+strideq*0+16], m0
461 mova [dstq+strideq*0+32], m0
462 mova [dstq+strideq*0+48], m0
463 mova [dstq+strideq*1+ 0], m0
464 mova [dstq+strideq*1+16], m0
465 mova [dstq+strideq*1+32], m0
466 mova [dstq+strideq*1+48], m0
467 lea dstq, [dstq+strideq*2] ; advance 2 rows per (elided) loop iteration
; 4x4 TrueMotion prediction, 10 bpp — presumably pred[y][x] = clip(l[y] + a[x] - tl),
; the standard TM formula; the per-row computation of m0..m3 is elided from this listing.
; The body carries a .body label (elided) shared with the 12 bpp entry below.
477 cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
485 DEFINE_ARGS dst, stride, stride3
486 lea stride3q, [strideq*3]
504 mova [dstq+strideq*0], m0
505 mova [dstq+strideq*1], m1
506 mova [dstq+strideq*2], m2
507 mova [dstq+stride3q ], m3
; 12 bpp TM entry: presumably widens the clip range (setup line elided), then tail-jumps
; into the shared 10 bpp body above.
510 cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
512 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
; 8x8 TrueMotion prediction, 10 bpp; 4 rows per loop pass, left pixels consumed via cnt.
; NOTE(review): elided listing — loads, clip bounds, per-row math, loop branch and RET
; are not shown; the .body label shared with the 12 bpp entry is also elided.
515 cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
521 pshuflw m0, m0, q1111 ; splat word 1 (presumably the top-left pixel) — context elided
524 DEFINE_ARGS dst, stride, l, stride3, cnt
525 lea stride3q, [strideq*3]
546 mova [dstq+strideq*0], m0
547 mova [dstq+strideq*1], m1
548 mova [dstq+strideq*2], m2
549 mova [dstq+stride3q ], m3
550 lea dstq, [dstq+strideq*4] ; advance 4 rows per (elided) loop iteration
; 12 bpp TM entry: presumably widens the clip range (setup line elided), then tail-jumps
; into the shared 10 bpp body above.
555 cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
557 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
; 16x16 TrueMotion prediction, 10 bpp; 2 rows (32 bytes each) per loop pass.
; NOTE(review): elided listing — loads, clip bounds, per-row math, loop branch and RET
; are not shown.
560 cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
567 pshuflw m0, m0, q1111 ; splat word 1 (presumably the top-left pixel) — context elided
571 DEFINE_ARGS dst, stride, l, cnt
590 mova [dstq+strideq*0+ 0], m0
591 mova [dstq+strideq*0+16], m2
592 mova [dstq+strideq*1+ 0], m1
593 mova [dstq+strideq*1+16], m3
594 lea dstq, [dstq+strideq*2] ; advance 2 rows per (elided) loop iteration
; 12 bpp TM entry: presumably widens the clip range (setup line elided), then tail-jumps
; into the shared 10 bpp body above.
599 cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
601 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
; 32x32 TrueMotion prediction, 10 bpp. On x86-32 the clip min/max are spilled to 32 bytes
; of stack (reg_min/reg_max below); the above row lives in m4..m7 across the whole loop.
; NOTE(review): heavily elided listing — clip setup, the x86-64 register aliases for
; reg_min/reg_max, per-row math, loop control and RET are not shown.
604 cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
616 %define reg_min [rsp+16]
617 %define reg_max [rsp+ 0]
620 mova m4, [aq+mmsize*0]
621 mova m5, [aq+mmsize*1]
622 mova m6, [aq+mmsize*2]
623 mova m7, [aq+mmsize*3]
625 pshuflw m0, m0, q1111 ; splat word 1 (presumably the top-left pixel) — context elided
631 DEFINE_ARGS dst, stride, l, cnt
634 pinsrw m3, [lq+cntq*2], 0 ; fetch the current row's left pixel
649 mova [dstq+strideq*0+ 0], m0
650 mova [dstq+strideq*0+16], m1
651 mova [dstq+strideq*0+32], m2
652 mova [dstq+strideq*0+48], m3
; 12 bpp TM entry: presumably widens the clip range (setup line elided), then tail-jumps
; into the shared 10 bpp body above.
658 cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
660 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
662 ; Directional intra prediction functions
664 ; in the functions below, 'abcdefgh' refers to above data (sometimes simply
665 ; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
666 ; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
667 ; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
670 ; left=(left+2*center+right+2)>>2
; 3-tap smoothing filter used by all directional predictors (formula in the comment above).
; NOTE(review): the macro body and %endmacro are elided from this listing.
671 %macro LOWPASS 3 ; left [dst], center, right
677 ; abcdefgh (src) -> bcdefghh (dst)
678 ; dst/src can be the same register
679 %macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
; SSSE3 path — single shuffle (the %if cpuflag(ssse3) guard is elided from this listing):
681 pshufb %1, %2, %3 ; abcdefgh -> bcdefghh
; pre-SSSE3 path — shift then re-duplicate the last word (%else/%endif/%endmacro elided):
683 psrldq %1, %2, 2 ; abcdefgh -> bcdefgh.
684 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
688 ; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
689 %macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
; SSSE3 path — shuffle twice (the %if cpuflag(ssse3) guard is elided from this listing):
691 pshufb %1, %3, %4 ; abcdefgh -> bcdefghh
692 pshufb %2, %1, %4 ; bcdefghh -> cdefghhh
; pre-SSSE3 path (%else/%endif/%endmacro elided):
694 psrldq %1, %3, 2 ; abcdefgh -> bcdefgh.
695 psrldq %2, %3, 4 ; abcdefgh -> cdefgh..
696 pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh
697 pshufhw %2, %2, q1110 ; cdefgh.. -> cdefghhh
; 4x4 down-left prediction, 16 bpp: lowpass the above row, then emit progressively
; left-shifted diagonals.
; NOTE(review): elided listing — the dstq advance by one row between the two store pairs
; (original line 712) and the RET are not shown.
702 cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
704 movu m1, [aq] ; abcdefgh
705 pshufhw m0, m1, q3310 ; abcdefhh
706 SHIFT_RIGHT m1, m1 ; bcdefghh
707 psrldq m2, m1, 2 ; cdefghh.
708 LOWPASS 0, 1, 2 ; BCDEFGh.
709 pshufd m1, m0, q3321 ; DEFGh...
710 movh [dstq+strideq*0], m0
711 movh [dstq+strideq*2], m1
; (elided: dstq advanced one row here)
713 psrldq m0, 2 ; CDEFGh..
714 psrldq m1, 2 ; EFGh....
715 movh [dstq+strideq*0], m0
716 movh [dstq+strideq*2], m1
; 8x8 down-left prediction, 16 bpp: one lowpassed above vector, rows produced by
; successive single-word shifts (the trailing pixel is replicated).
; NOTE(review): elided listing — the non-SSSE3 alternative to loading pb_2to15_14_15
; (%if guards) and the RET are not shown.
719 cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
721 mova m0, [aq] ; abcdefgh
723 mova m4, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
725 SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh
726 LOWPASS 0, 1, 2 ; BCDEFGHh
727 shufps m1, m0, m2, q3332 ; FGHhhhhh
728 shufps m3, m0, m1, q2121 ; DEFGHhhh
729 DEFINE_ARGS dst, stride, stride5
730 lea stride5q, [strideq*5]
732 mova [dstq+strideq*0], m0
733 mova [dstq+strideq*4], m1
734 SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh
735 pshuflw m1, m1, q3321 ; GHhhhhhh
736 pshufd m2, m0, q3321 ; EFGHhhhh
737 mova [dstq+strideq*1], m0
738 mova [dstq+stride5q ], m1
739 lea dstq, [dstq+strideq*2]
740 pshuflw m1, m1, q3321 ; Hhhhhhhh
741 mova [dstq+strideq*0], m3
742 mova [dstq+strideq*4], m1
743 pshuflw m1, m1, q3321 ; hhhhhhhh
744 mova [dstq+strideq*1], m2
745 mova [dstq+stride5q ], m1
; 16x16 down-left prediction, 16 bpp: lowpass both above vectors, then loop shifting the
; 32-byte diagonal left one word per row.
; NOTE(review): heavily elided listing — cnt init, the %if avx / %else split around the
; vpalignr vs PALIGNR row-shift variants, loop control and RET are not shown.
748 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
750 mova m0, [aq] ; abcdefgh
751 mova m3, [aq+mmsize] ; ijklmnop
752 PALIGNR m1, m3, m0, 2, m4 ; bcdefghi
753 PALIGNR m2, m3, m0, 4, m4 ; cdefghij
754 LOWPASS 0, 1, 2 ; BCDEFGHI
756 mova m4, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
758 SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp
759 LOWPASS 1, 2, 3 ; JKLMNOPp
760 pshufd m2, m2, q3333 ; pppppppp
761 DEFINE_ARGS dst, stride, cnt
765 mova [dstq+strideq*0+ 0], m0
766 mova [dstq+strideq*0+16], m1
767 mova [dstq+strideq*8+ 0], m1
768 mova [dstq+strideq*8+16], m2
; row-shift step, avx form (surrounding %if avx guard elided):
771 vpalignr m0, m1, m0, 2
; row-shift step, pre-avx form (%else/%endif elided):
773 PALIGNR m3, m1, m0, 2, m4
776 SHIFT_RIGHT m1, m1, m4
; 32x32 down-left prediction, 16 bpp: lowpass all four above vectors (m0..m3), replicate
; the last pixel into m4, store 4 diagonally-staggered rows per pass at 8-row offsets,
; then shift the whole 64-byte diagonal left one word per row.
; NOTE(review): heavily elided listing — %if guards, cnt init, loop control and RET are
; not shown.
781 cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
783 mova m0, [aq+mmsize*0] ; abcdefgh
784 mova m1, [aq+mmsize*1] ; ijklmnop
785 mova m2, [aq+mmsize*2] ; qrstuvwx
786 mova m3, [aq+mmsize*3] ; yz012345
787 PALIGNR m4, m1, m0, 2, m6
788 PALIGNR m5, m1, m0, 4, m6
789 LOWPASS 0, 4, 5 ; BCDEFGHI
790 PALIGNR m4, m2, m1, 2, m6
791 PALIGNR m5, m2, m1, 4, m6
792 LOWPASS 1, 4, 5 ; JKLMNOPQ
793 PALIGNR m4, m3, m2, 2, m6
794 PALIGNR m5, m3, m2, 4, m6
795 LOWPASS 2, 4, 5 ; RSTUVWXY
797 mova m6, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
799 SHIFT_RIGHTx2 m4, m5, m3, m6
800 LOWPASS 3, 4, 5 ; Z0123455
801 pshufd m4, m4, q3333 ; 55555555
802 DEFINE_ARGS dst, stride, stride8, stride24, cnt
804 lea stride8q, [strideq*8]
805 lea stride24q, [stride8q*3]
808 mova [dstq+stride8q*0+ 0], m0
809 mova [dstq+stride8q*0+16], m1
810 mova [dstq+stride8q*0+32], m2
811 mova [dstq+stride8q*0+48], m3
812 mova [dstq+stride8q*1+ 0], m1
813 mova [dstq+stride8q*1+16], m2
814 mova [dstq+stride8q*1+32], m3
815 mova [dstq+stride8q*1+48], m4
816 mova [dstq+stride8q*2+ 0], m2
817 mova [dstq+stride8q*2+16], m3
818 mova [dstq+stride8q*2+32], m4
819 mova [dstq+stride8q*2+48], m4
820 mova [dstq+stride24q + 0], m3
821 mova [dstq+stride24q +16], m4
822 mova [dstq+stride24q +32], m4
823 mova [dstq+stride24q +48], m4
; row-shift step, avx form (surrounding %if avx guard elided):
826 vpalignr m0, m1, m0, 2
827 vpalignr m1, m2, m1, 2
828 vpalignr m2, m3, m2, 2
; row-shift step, pre-avx form (%else/%endif elided):
830 PALIGNR m5, m1, m0, 2, m6
832 PALIGNR m5, m2, m1, 2, m6
834 PALIGNR m5, m3, m2, 2, m6
837 SHIFT_RIGHT m3, m3, m6
; DR_FUNCS generates the down-right predictors; %1 sizes the x86-32 stack scratch for the
; 32x32 version. NOTE(review): %endmacro and the instantiations are outside this view.
850 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; 4x4 down-right: build the filtered diagonal XYZ#ABC, then store it bottom-up, shifting
; one word per row (RET elided from this listing).
851 cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
852 movh m0, [lq] ; wxyz....
853 movhps m0, [aq-2] ; wxyz*abc
854 movd m1, [aq+6] ; d.......
855 PALIGNR m1, m0, 2, m2 ; xyz*abcd
856 psrldq m2, m1, 2 ; yz*abcd.
857 LOWPASS 0, 1, 2 ; XYZ#ABC.
858 DEFINE_ARGS dst, stride, stride3
859 lea stride3q, [strideq*3]
861 movh [dstq+stride3q ], m0
862 psrldq m0, 2 ; YZ#ABC..
863 movh [dstq+strideq*2], m0
864 psrldq m0, 2 ; Z#ABC...
865 movh [dstq+strideq*1], m0
866 psrldq m0, 2 ; #ABC....
867 movh [dstq+strideq*0], m0
; 8x8 down-right prediction, 16 bpp: filtered left in m2 (TUVWXYZ#), filtered above in m3
; (ABCDEFG.); rows are the concatenation shifted one word per row, written to both the
; upper 4 rows (split movhps/movh) and lower 4 rows (whole mova).
; NOTE(review): elided listing — the avx/pre-avx %if split around each PALIGNR row shift
; and the RET are not shown.
870 cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
871 mova m0, [lq] ; stuvwxyz
872 movu m1, [aq-2] ; *abcdefg
873 mova m2, [aq] ; abcdefgh
874 psrldq m3, m2, 2 ; bcdefgh.
875 LOWPASS 3, 2, 1 ; ABCDEFG.
876 PALIGNR m1, m0, 2, m4 ; tuvwxyz*
877 PALIGNR m2, m1, 2, m4 ; uvwxyz*a
878 LOWPASS 2, 1, 0 ; TUVWXYZ#
879 DEFINE_ARGS dst, stride, dst4, stride3
880 lea stride3q, [strideq*3]
881 lea dst4q, [dstq+strideq*4]
883 movhps [dstq +stride3q +0], m2
884 movh [dstq+ stride3q +8], m3
885 mova [dst4q+stride3q +0], m2
886 PALIGNR m1, m3, m2, 2, m0
888 movhps [dstq +strideq*2+0], m1
889 movh [dstq+ strideq*2+8], m3
890 mova [dst4q+strideq*2+0], m1
891 PALIGNR m2, m3, m1, 2, m0
893 movhps [dstq +strideq*1+0], m2
894 movh [dstq+ strideq*1+8], m3
895 mova [dst4q+strideq*1+0], m2
896 PALIGNR m1, m3, m2, 2, m0
898 movhps [dstq +strideq*0+0], m1
899 movh [dstq+ strideq*0+8], m3
900 mova [dst4q+strideq*0+0], m1
; 16x16 down-right prediction, 16 bpp: build four filtered vectors spanning
; L[..]#A[..] (m2,m4,m5,m6), then loop shifting the diagonal one word per row.
; NOTE(review): heavily elided listing — cnt init, the avx/pre-avx %if split on the
; per-row shifts, loop control and RET are not shown.
903 cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
904 mova m0, [lq] ; klmnopqr
905 mova m1, [lq+mmsize] ; stuvwxyz
906 movu m2, [aq-2] ; *abcdefg
907 movu m3, [aq+mmsize-2] ; hijklmno
908 mova m4, [aq] ; abcdefgh
909 mova m5, [aq+mmsize] ; ijklmnop
910 psrldq m6, m5, 2 ; jklmnop.
911 LOWPASS 6, 5, 3 ; IJKLMNO.
912 PALIGNR m5, m4, 2, m3 ; bcdefghi
913 LOWPASS 5, 4, 2 ; ABCDEFGH
914 PALIGNR m2, m1, 2, m3 ; tuvwxyz*
915 PALIGNR m4, m2, 2, m3 ; uvwxyz*a
916 LOWPASS 4, 2, 1 ; TUVWXYZ#
917 PALIGNR m1, m0, 2, m3 ; lmnopqrs
918 PALIGNR m2, m1, 2, m3 ; mnopqrst
919 LOWPASS 2, 1, 0 ; LMNOPQRS
920 DEFINE_ARGS dst, stride, dst8, cnt
921 lea dst8q, [dstq+strideq*8]
926 mova [dst8q+strideq*0+ 0], m4
927 mova [dst8q+strideq*0+16], m5
928 mova [dst8q+strideq*8+ 0], m2
929 mova [dst8q+strideq*8+16], m4
; per-row diagonal shift, avx form (surrounding %if avx guard elided):
931 vpalignr m2, m4, m2, 2
932 vpalignr m4, m5, m4, 2
933 vpalignr m5, m6, m5, 2
; per-row diagonal shift, pre-avx form (%else/%endif elided):
935 PALIGNR m0, m4, m2, 2, m1
937 PALIGNR m0, m5, m4, 2, m1
939 PALIGNR m0, m6, m5, 2, m1
; 32x32 down-right prediction, 16 bpp: filter all four above vectors and all four left
; vectors into eight 8-word diagonal segments (some spilled to stack via SCRATCH on
; register-poor targets), then loop shifting the whole 128-byte diagonal one word per row.
; NOTE(review): heavily elided listing — many %if guards, cnt init, loop control and RET
; are not shown; register/stack juggling differs between ssse3 and pre-ssse3 builds.
947 cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
948 %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
949 mova m0, [aq+mmsize*3] ; a[24-31]
950 movu m1, [aq+mmsize*3-2] ; a[23-30]
951 psrldq m2, m0, 2 ; a[25-31].
952 LOWPASS 2, 0, 1 ; A[24-30].
953 mova m1, [aq+mmsize*2] ; a[16-23]
954 movu m3, [aq+mmsize*2-2] ; a[15-22]
955 PALIGNR m0, m1, 2, m4 ; a[17-24]
956 LOWPASS 0, 1, 3 ; A[16-23]
957 mova m3, [aq+mmsize*1] ; a[8-15]
958 movu m4, [aq+mmsize*1-2] ; a[7-14]
959 PALIGNR m1, m3, 2, m5 ; a[9-16]
960 LOWPASS 1, 3, 4 ; A[8-15]
961 mova m4, [aq+mmsize*0] ; a[0-7]
962 movu m5, [aq+mmsize*0-2] ; *a[0-6]
963 PALIGNR m3, m4, 2, m6 ; a[1-8]
964 LOWPASS 3, 4, 5 ; A[0-7]
965 SCRATCH 1, 8, rsp+0*mmsize
966 SCRATCH 3, 9, rsp+1*mmsize
967 %if notcpuflag(ssse3)
968 SCRATCH 0, 10, rsp+2*mmsize
; (elided: matching %endif)
970 mova m6, [lq+mmsize*3] ; l[24-31]
971 PALIGNR m5, m6, 2, m0 ; l[25-31]*
972 PALIGNR m4, m5, 2, m0 ; l[26-31]*a
973 LOWPASS 4, 5, 6 ; L[25-31]#
974 mova m7, [lq+mmsize*2] ; l[16-23]
975 PALIGNR m6, m7, 2, m0 ; l[17-24]
976 PALIGNR m5, m6, 2, m0 ; l[18-25]
977 LOWPASS 5, 6, 7 ; L[17-24]
978 mova m1, [lq+mmsize*1] ; l[8-15]
979 PALIGNR m7, m1, 2, m0 ; l[9-16]
980 PALIGNR m6, m7, 2, m0 ; l[10-17]
981 LOWPASS 6, 7, 1 ; L[9-16]
982 mova m3, [lq+mmsize*0] ; l[0-7]
983 PALIGNR m1, m3, 2, m0 ; l[1-8]
984 PALIGNR m7, m1, 2, m0 ; l[2-9]
985 LOWPASS 7, 1, 3 ; L[1-8]
988 UNSCRATCH 1, 8, rsp+0*mmsize
990 UNSCRATCH 3, 9, rsp+1*mmsize
992 UNSCRATCH 0, 10, rsp+2*mmsize
994 DEFINE_ARGS dst8, stride, stride8, stride24, cnt
995 lea stride8q, [strideq*8]
996 lea stride24q, [stride8q*3]
997 lea dst8q, [dst8q+strideq*8]
; loop body (label and cnt handling elided):
1003 UNSCRATCH 1, 8, rsp+0*mmsize
1004 %if notcpuflag(ssse3)
1005 UNSCRATCH 3, 9, rsp+1*mmsize
; (elided: matching %endif)
1008 mova [dst8q+stride8q*0+ 0], m4
1009 mova [dst8q+stride8q*0+16], m3
1010 mova [dst8q+stride8q*0+32], m1
1011 mova [dst8q+stride8q*0+48], m0
1012 mova [dst8q+stride8q*1+ 0], m5
1013 mova [dst8q+stride8q*1+16], m4
1014 mova [dst8q+stride8q*1+32], m3
1015 mova [dst8q+stride8q*1+48], m1
1016 mova [dst8q+stride8q*2+ 0], m6
1017 mova [dst8q+stride8q*2+16], m5
1018 mova [dst8q+stride8q*2+32], m4
1019 mova [dst8q+stride8q*2+48], m3
1020 mova [dst8q+stride24q + 0], m7
1021 mova [dst8q+stride24q +16], m6
1022 mova [dst8q+stride24q +32], m5
1023 mova [dst8q+stride24q +48], m4
; per-row diagonal shift, avx form (surrounding %if avx guard elided):
1025 vpalignr m7, m6, m7, 2
1026 vpalignr m6, m5, m6, 2
1027 vpalignr m5, m4, m5, 2
1028 vpalignr m4, m3, m4, 2
1029 vpalignr m3, m1, m3, 2
1030 vpalignr m1, m0, m1, 2
1031 vpalignr m0, m2, m0, 2
; per-row diagonal shift, pre-avx form (%else/%endif elided):
1033 SCRATCH 2, 8, rsp+0*mmsize
1034 %if notcpuflag(ssse3)
1035 SCRATCH 0, 9, rsp+1*mmsize
; (elided: matching %endif)
1037 PALIGNR m2, m6, m7, 2, m0
1039 PALIGNR m2, m5, m6, 2, m0
1041 PALIGNR m2, m4, m5, 2, m0
1043 PALIGNR m2, m3, m4, 2, m0
1045 PALIGNR m2, m1, m3, 2, m0
1047 %if notcpuflag(ssse3)
1048 UNSCRATCH 0, 9, rsp+1*mmsize
1049 SCRATCH 3, 9, rsp+1*mmsize
; (elided: matching %endif)
1051 PALIGNR m2, m0, m1, 2, m3
1053 UNSCRATCH 2, 8, rsp+0*mmsize
1054 SCRATCH 1, 8, rsp+0*mmsize
1055 PALIGNR m1, m2, m0, 2, m3
; VL_FUNCS generates the vertical-left predictors; %1 sizes the x86-32 stack scratch for
; the 32x32 version. NOTE(review): %endmacro and instantiations are outside this view.
1071 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; 4x4 vertical-left: even rows use the 2-tap average (m1), odd rows the 3-tap lowpass
; (m2), each shifted left one word per row pair (shift and RET elided from this listing).
1072 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
1074 movu m0, [aq] ; abcdefgh
1075 psrldq m1, m0, 2 ; bcdefgh.
1076 psrldq m2, m0, 4 ; cdefgh..
1077 LOWPASS 2, 1, 0 ; BCDEFGH.
1078 pavgw m1, m0 ; ABCDEFG.
1079 DEFINE_ARGS dst, stride, stride3
1080 lea stride3q, [strideq*3]
1082 movh [dstq+strideq*0], m1
1083 movh [dstq+strideq*1], m2
; (elided: m1/m2 shifted left one word before the next row pair)
1086 movh [dstq+strideq*2], m1
1087 movh [dstq+stride3q ], m2
; 8x8 vertical-left prediction, 16 bpp: m1 = 2-tap average row, m2 = 3-tap lowpass row;
; each subsequent row pair shifts both one word right (last pixel replicated).
; NOTE(review): elided listing — the %if ssse3 guard around the pb_2to15_14_15 load and
; the RET are not shown.
1090 cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
1092 mova m0, [aq] ; abcdefgh
1094 mova m3, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
1096 SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh
1097 LOWPASS 2, 1, 0 ; BCDEFGHh
1098 pavgw m1, m0 ; ABCDEFGh
1099 DEFINE_ARGS dst, stride, stride3
1100 lea stride3q, [strideq*3]
1102 mova [dstq+strideq*0], m1
1103 mova [dstq+strideq*1], m2
1104 SHIFT_RIGHT m1, m1, m3
1105 SHIFT_RIGHT m2, m2, m3
1106 mova [dstq+strideq*2], m1
1107 mova [dstq+stride3q ], m2
1108 lea dstq, [dstq+strideq*4]
1109 SHIFT_RIGHT m1, m1, m3
1110 SHIFT_RIGHT m2, m2, m3
1111 mova [dstq+strideq*0], m1
1112 mova [dstq+strideq*1], m2
1113 SHIFT_RIGHT m1, m1, m3
1114 SHIFT_RIGHT m2, m2, m3
1115 mova [dstq+strideq*2], m1
1116 mova [dstq+stride3q ], m2
; 16x16 vertical-left prediction, 16 bpp: 2-tap rows in m2|m1, 3-tap rows in m3|m0,
; shifted one word per row pair in a loop.
; NOTE(review): heavily elided listing — the initial [aq] load into m0, both LOWPASS/pavgw
; stages, cnt init, the avx/pre-avx %if split on the shifts, loop control and RET are
; not shown.
1119 cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
1122 mova m1, [aq+mmsize]
1123 PALIGNR m2, m1, m0, 2, m3
1124 PALIGNR m3, m1, m0, 4, m4
1128 mova m4, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
1130 SHIFT_RIGHTx2 m5, m0, m1, m4
1133 DEFINE_ARGS dst, stride, cnt
1137 mova [dstq+strideq*0+ 0], m2
1138 mova [dstq+strideq*0+16], m1
1139 mova [dstq+strideq*1+ 0], m3
1140 mova [dstq+strideq*1+16], m0
1141 lea dstq, [dstq+strideq*2]
; per-pair shift, avx form (surrounding %if avx guard elided):
1143 vpalignr m2, m1, m2, 2
1144 vpalignr m3, m0, m3, 2
; per-pair shift, pre-avx form (%else/%endif elided):
1146 PALIGNR m5, m1, m2, 2, m4
1148 PALIGNR m5, m0, m3, 2, m4
1151 SHIFT_RIGHT m1, m1, m4
1152 SHIFT_RIGHT m0, m0, m4
; 32x32 vertical-left prediction, 16 bpp: builds interleaved 2-tap (m6,m4,m2,m0) and
; 3-tap (m7,m5,m3,m1) row sets over the 64-byte above edge, spilling to stack on
; register-poor targets, with a separate tail that replicates the final pixel (m9/m0)
; into the rightmost 16 bytes of the last rows.
; NOTE(review): heavily elided listing — the [aq] load into m0 before line 1162, the
; LOWPASS/pavgw stages, cnt control, many %if guards and the RET are not shown.
1157 cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
1159 mova m0, [aq+mmsize*0]
1160 mova m1, [aq+mmsize*1]
1161 mova m2, [aq+mmsize*2]
1162 PALIGNR m6, m1, m0, 2, m5
1163 PALIGNR m7, m1, m0, 4, m5
1166 SCRATCH 6, 8, rsp+0*mmsize
1167 PALIGNR m4, m2, m1, 2, m0
1168 PALIGNR m5, m2, m1, 4, m0
1171 mova m0, [aq+mmsize*3]
1172 PALIGNR m1, m0, m2, 2, m6
1173 PALIGNR m3, m0, m2, 4, m6
1177 PRELOAD 10, pb_2to15_14_15, shuf
1179 SHIFT_RIGHTx2 m6, m1, m0, reg_shuf
1183 pshufd m9, m6, q3333 ; replicate the last above pixel for the tail columns
1186 UNSCRATCH 6, 8, rsp+0*mmsize
1188 DEFINE_ARGS dst, stride, cnt, stride16, stride17
1189 mov stride16q, strideq
1192 lea stride17q, [stride16q+strideq]
1194 ; FIXME m8 is unused for avx, so we could save one register here for win64
1197 UNSCRATCH 6, 8, rsp+0*mmsize
1199 mova [dstq+strideq*0+ 0], m6
1200 mova [dstq+strideq*0+16], m4
1201 mova [dstq+strideq*0+32], m2
1202 mova [dstq+strideq*0+48], m0
1203 mova [dstq+strideq*1+ 0], m7
1204 mova [dstq+strideq*1+16], m5
1205 mova [dstq+strideq*1+32], m3
1206 mova [dstq+strideq*1+48], m1
1207 mova [dstq+stride16q+ 0], m4
1208 mova [dstq+stride16q+16], m2
1209 mova [dstq+stride16q+32], m0
1211 mova [dstq+stride16q+48], m9
1213 mova [dstq+stride17q+ 0], m5
1214 mova [dstq+stride17q+16], m3
1215 mova [dstq+stride17q+32], m1
1217 mova [dstq+stride17q+48], m9
1219 lea dstq, [dstq+strideq*2]
; per-pair shift, avx form (surrounding %if avx guard elided):
1221 vpalignr m6, m4, m6, 2
1222 vpalignr m4, m2, m4, 2
1223 vpalignr m2, m0, m2, 2
1224 vpalignr m7, m5, m7, 2
1225 vpalignr m5, m3, m5, 2
1226 vpalignr m3, m1, m3, 2
; per-pair shift, pre-avx form (%else/%endif elided):
1228 SCRATCH 3, 8, rsp+0*mmsize
1229 %if notcpuflag(ssse3)
1230 SCRATCH 1, 10, rsp+1*mmsize
; (elided: matching %endif)
1232 PALIGNR m3, m4, m6, 2, m1
1234 PALIGNR m3, m2, m4, 2, m1
1236 PALIGNR m3, m0, m2, 2, m1
1238 PALIGNR m3, m5, m7, 2, m1
1240 UNSCRATCH 3, 8, rsp+0*mmsize
1241 SCRATCH 6, 8, rsp+0*mmsize
1242 %if notcpuflag(ssse3)
1243 UNSCRATCH 1, 10, rsp+1*mmsize
1244 SCRATCH 7, 10, rsp+1*mmsize
; (elided: matching %endif)
1246 PALIGNR m6, m3, m5, 2, m7
1248 PALIGNR m6, m1, m3, 2, m7
1250 %if notcpuflag(ssse3)
1251 UNSCRATCH 7, 10, rsp+1*mmsize
; (elided: matching %endif)
1254 SHIFT_RIGHT m1, m1, reg_shuf
1255 SHIFT_RIGHT m0, m0, reg_shuf
; tail: fill the last +48 column with the replicated final pixel:
1260 DEFINE_ARGS dst, stride, stride3
1261 lea stride3q, [strideq*3]
1264 mova [dstq+strideq*0+48], m0
1265 mova [dstq+strideq*1+48], m0
1266 mova [dstq+strideq*2+48], m0
1267 mova [dstq+stride3q +48], m0
1269 lea dstq, [dstq+strideq*4]
; 4x4 vertical-right prediction, 16 bpp: top two rows from the 2-tap (m1) and 3-tap (m2)
; filters; bottom rows prepend filtered left pixels.
; NOTE(review): elided listing — the loads that populate m0/m1 before line 1288, the
; %if ssse3 split between the pshufb and pshuflw variants, and the RET are not shown.
1285 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
1288 PALIGNR m0, m1, 10, m2 ; xyz*abcd
1289 pslldq m1, m0, 2 ; .xyz*abc
1290 pslldq m2, m0, 4 ; ..xyz*ab
1291 LOWPASS 2, 1, 0 ; ..YZ#ABC
1292 pavgw m1, m0 ; ....#ABC
1293 DEFINE_ARGS dst, stride, stride3
1294 lea stride3q, [strideq*3]
1296 movhps [dstq+strideq*0], m1
1297 movhps [dstq+strideq*1], m2
1298 shufps m0, m2, m1, q3210
; ssse3 variant of the left-pixel selection (guard elided):
1300 pshufb m2, [pb_4_5_8to13_8x0]
; pre-ssse3 variant (%else/%endif elided):
1302 pshuflw m2, m2, q2222
1306 movh [dstq+strideq*2], m0
1307 movh [dstq+stride3q ], m2
; 8x8 vertical-right prediction, 16 bpp: row pair (m0, m3) from the 2-tap/3-tap filters,
; with filtered left pixels (held in m4) shifted in from the left for each lower pair.
; NOTE(review): elided listing — the LOWPASS/pavgw stages between lines 1314-1318 and
; after 1318, and the RET, are not shown.
1310 cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
1311 movu m1, [aq-2] ; *abcdefg
1312 movu m2, [lq] ; stuvwxyz
1313 mova m0, [aq] ; abcdefgh
1314 PALIGNR m3, m1, m2, 14, m4 ; z*abcdef
1317 PALIGNR m1, m2, 2, m4 ; tuvwxyz*
1318 pslldq m4, m2, 2 ; .stuvwxy
1320 DEFINE_ARGS dst, stride, stride3
1321 lea stride3q, [strideq*3]
1323 mova [dstq+strideq*0], m0
1324 mova [dstq+strideq*1], m3
1325 PALIGNR m0, m4, 14, m1 ; shift next left pixel into row
1327 PALIGNR m3, m4, 14, m1
1329 mova [dstq+strideq*2], m0
1330 mova [dstq+stride3q ], m3
1331 lea dstq, [dstq+strideq*4]
1332 PALIGNR m0, m4, 14, m1
1334 PALIGNR m3, m4, 14, m1
1336 mova [dstq+strideq*0], m0
1337 mova [dstq+strideq*1], m3
1338 PALIGNR m0, m4, 14, m1
1340 PALIGNR m3, m4, 14, m4
1341 mova [dstq+strideq*2], m0
1342 mova [dstq+stride3q ], m3
; 16x16 vertical-right prediction, 16 bpp: 32-byte rows (m3|m2 then m0|m6) with filtered
; left pixels (m7/m5) shifted in one word per row pair, looped.
; NOTE(review): heavily elided listing — several LOWPASS/pavgw stages, cnt init, loop
; control and RET are not shown.
1345 cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
1346 movu m1, [aq-2] ; *abcdefg
1347 movu m2, [aq+mmsize-2] ; hijklmno
1348 mova m3, [aq] ; abcdefgh
1349 mova m4, [aq+mmsize] ; ijklmnop
1350 mova m5, [lq+mmsize] ; stuvwxyz
1351 PALIGNR m0, m1, m5, 14, m6 ; z*abcdef
1352 movu m6, [aq+mmsize-4] ; ghijklmn
1357 PALIGNR m1, m5, 2, m7 ; tuvwxyz*
1358 movu m7, [lq+mmsize-2] ; rstuvwxy
1360 movu m5, [lq+2] ; lmnopqrs
1361 pslldq m4, m5, 2 ; .lmnopqr
1362 pslldq m7, m5, 4 ; ..lmnopq
1370 DEFINE_ARGS dst, stride, cnt
1374 mova [dstq+strideq*0+ 0], m3
1375 mova [dstq+strideq*0+16], m2
1376 mova [dstq+strideq*1+ 0], m0
1377 mova [dstq+strideq*1+16], m6
1378 lea dstq, [dstq+strideq*2]
1379 PALIGNR m2, m3, 14, m4 ; shift next left pixel into each row
1380 PALIGNR m3, m7, 14, m4
1382 PALIGNR m6, m0, 14, m4
1383 PALIGNR m0, m5, 14, m4
; 32x32 vertical-right prediction, 16 bpp: filters the full 64-byte above edge (A[..],
; spilled via SCRATCH m8-m11) and the left edge (L[..], m12-m13 plus live registers),
; then loops shifting left pixels into 64-byte rows; a second phase (from line 1493)
; handles the lower half using the remaining filtered-left segments.
; NOTE(review): heavily elided listing — several LOWPASS/pavgw stages, cnt control,
; the x86-32 stack fallbacks for m8-m13, loop branches and RET are not shown.
1389 cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
1390 movu m0, [aq+mmsize*0-2] ; *a[0-6]
1391 movu m1, [aq+mmsize*1-2] ; a[7-14]
1392 movu m2, [aq+mmsize*2-2] ; a[15-22]
1393 movu m3, [aq+mmsize*3-2] ; a[23-30]
1394 mova m4, [aq+mmsize*3+0] ; a[24-31]
1395 movu m5, [aq+mmsize*3-4] ; a[22-29]
1396 LOWPASS 5, 3, 4 ; A[23-30]
1397 SCRATCH 5, 8, rsp+0*mmsize
1399 mova m4, [aq+mmsize*2+0] ; a[16-23]
1400 movu m6, [aq+mmsize*2-4] ; a[14-21]
1401 LOWPASS 6, 2, 4 ; A[15-22]
1402 SCRATCH 6, 9, rsp+1*mmsize
1404 mova m4, [aq+mmsize*1+0] ; a[8-15]
1405 movu m7, [aq+mmsize*1-4] ; a[6-13]
1406 LOWPASS 7, 1, 4 ; A[7-14]
1407 SCRATCH 7, 10, rsp+2*mmsize
1409 mova m4, [aq+mmsize*0+0] ; a[0-7]
1410 mova m5, [lq+mmsize*3+0] ; l[24-31]
1411 PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5]
1412 LOWPASS 6, 0, 4 ; #A[0-6]
1413 SCRATCH 6, 11, rsp+3*mmsize
1415 PALIGNR m0, m5, 2, m7 ; l[25-31]*
1416 movu m7, [lq+mmsize*3-2] ; l[23-30]
1417 LOWPASS 0, 5, 7 ; L[24-31]
1418 movu m5, [lq+mmsize*2-2] ; l[15-22]
1419 mova m7, [lq+mmsize*2+0] ; l[16-23]
1420 movu m6, [lq+mmsize*2+2] ; l[17-24]
1421 LOWPASS 5, 7, 6 ; L[16-23]
1428 SCRATCH 5, 12, rsp+4*mmsize
1429 SCRATCH 6, 13, rsp+5*mmsize
1430 movu m6, [lq+mmsize*1-2] ; l[7-14]
1431 mova m0, [lq+mmsize*1+0] ; l[8-15]
1432 movu m5, [lq+mmsize*1+2] ; l[9-16]
1433 LOWPASS 6, 0, 5 ; L[8-15]
1434 movu m0, [lq+mmsize*0+2] ; l[1-8]
1435 pslldq m5, m0, 2 ; .l[1-7]
1436 pslldq m7, m0, 4 ; ..l[1-6]
1444 UNSCRATCH 6, 13, rsp+5*mmsize
1445 DEFINE_ARGS dst, stride, stride16, cnt, stride17
1446 mov stride16q, strideq
1450 lea stride17q, [stride16q+strideq]
; upper-half loop body (label and cnt handling elided):
1454 mova [dstq+strideq*0+ 0], m4
1455 mova [dstq+strideq*0+16], m1
1456 mova [dstq+strideq*0+32], m2
1457 mova [dstq+strideq*0+48], m3
1459 mova [dstq+strideq*1+ 0], m11
1460 mova [dstq+strideq*1+16], m10
1461 mova [dstq+strideq*1+32], m9
1462 mova [dstq+strideq*1+48], m8
1464 mova [dstq+stride16q+ 0], m6
1465 mova [dstq+stride16q+16], m4
1466 mova [dstq+stride16q+32], m1
1467 mova [dstq+stride16q+48], m2
1469 mova [dstq+stride17q+ 0], m12
1470 mova [dstq+stride17q+16], m11
1471 mova [dstq+stride17q+32], m10
1472 mova [dstq+stride17q+48], m9
1474 lea dstq, [dstq+strideq*2]
1475 PALIGNR m3, m2, 14, m5 ; shift next left pixel into the 2-tap row chain
1476 PALIGNR m2, m1, 14, m5
1477 PALIGNR m1, m4, 14, m5
1478 PALIGNR m4, m6, 14, m5
1479 PALIGNR m6, m7, 14, m5
; same shift for the 3-tap row chain (x86-64 register form; x86-32 variant elided):
1482 PALIGNR m8, m9, 14, m5
1483 PALIGNR m9, m10, 14, m5
1484 PALIGNR m10, m11, 14, m5
1485 PALIGNR m11, m12, 14, m5
1486 PALIGNR m12, m0, 14, m5
; lower-half phase — restore filtered segments and continue shifting:
1493 UNSCRATCH 5, 12, rsp+4*mmsize
1494 UNSCRATCH 4, 11, rsp+3*mmsize
1495 UNSCRATCH 3, 10, rsp+2*mmsize
1496 UNSCRATCH 2, 9, rsp+1*mmsize
1497 UNSCRATCH 1, 8, rsp+0*mmsize
1502 mova [dstq+strideq*0+ 0], m4
1503 mova [dstq+strideq*0+16], m3
1504 mova [dstq+strideq*0+32], m2
1505 mova [dstq+strideq*0+48], m1
1506 mova [dstq+stride16q+ 0], m5
1507 mova [dstq+stride16q+16], m4
1508 mova [dstq+stride16q+32], m3
1509 mova [dstq+stride16q+48], m2
1510 lea dstq, [dstq+strideq*2]
1511 PALIGNR m1, m2, 14, m6
1512 PALIGNR m2, m3, 14, m6
1513 PALIGNR m3, m4, 14, m6
1514 PALIGNR m4, m5, 14, m6
1515 PALIGNR m5, m0, 14, m6
; HU_FUNCS generates the horizontal-up predictors; %1 sizes the x86-32 stack scratch for
; the 32x32 version. NOTE(review): %endmacro and the instantiations are outside this view.
1530 %macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; 4x4 horizontal-up: interleave 2-tap and 3-tap filtered left pixels (SBUTTERFLY) and
; emit overlapping 4-pixel windows per row (RET elided from this listing).
1531 cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
1532 movh m0, [lq] ; abcd
; ssse3 form of the last-pixel replication (the %if guard is elided):
1534 pshufb m0, [pb_0to7_67x4] ; abcddddd
; pre-ssse3 form (%else/%endif elided; a punpcklqdq-style widen is also elided):
1537 pshufhw m0, m0, q3333 ; abcddddd
1539 psrldq m1, m0, 2 ; bcddddd.
1540 psrldq m2, m0, 4 ; cddddd..
1541 LOWPASS 2, 1, 0 ; BCDddd..
1542 pavgw m1, m0 ; abcddddd
1543 SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd
1544 PALIGNR m2, m1, 4, m0 ; bCcDdddd
1545 DEFINE_ARGS dst, stride, stride3
1546 lea stride3q, [strideq*3]
1548 movh [dstq+strideq*0], m1 ; aBbC
1549 movh [dstq+strideq*1], m2 ; bCcD
1550 movhps [dstq+strideq*2], m1 ; cDdd
1551 movhps [dstq+stride3q ], m2 ; dddd
; 8x8 horizontal-up prediction, 16 bpp: interleaved 2-tap/3-tap left pixels in m1/m2,
; rows are 8-word windows sliding 2 words per row; second half re-derives the windows
; after shifting 2 pixels.
; NOTE(review): elided listing — the [lq] load into m0, the LOWPASS/pavgw stages, the
; avx/pre-avx %if split on the shift, and the RET are not shown.
1554 cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
1557 mova m3, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
1559 SHIFT_RIGHTx2 m1, m2, m0, m3
1562 SBUTTERFLY wd, 1, 2, 0
1563 shufps m0, m1, m2, q1032
1564 pshufd m3, m2, q3332
1565 DEFINE_ARGS dst, stride, stride3
1566 lea stride3q, [strideq*3]
1568 mova [dstq+strideq *0], m1
1569 mova [dstq+strideq *2], m0
1570 mova [dstq+strideq *4], m2
1571 mova [dstq+stride3q*2], m3
; shift by one pixel pair, avx form (surrounding %if avx guard elided):
1574 vpalignr m1, m2, m1, 4
; pre-avx form (%else/%endif elided):
1576 PALIGNR m0, m2, m1, 4, m3
1579 pshufd m2, m2, q3321
1580 shufps m0, m1, m2, q1032
1581 pshufd m3, m2, q3332
1582 mova [dstq+strideq *0], m1
1583 mova [dstq+strideq *2], m0
1584 mova [dstq+strideq *4], m2
1585 mova [dstq+stride3q*2], m3
; 16x16 horizontal-up prediction, 16 bpp: interleaved filtered left pixels in m1..m4 with
; the replicated last pixel in m0; rows at stride*0/4/8/12 offsets, windows sliding 2
; words per loop pass.
; NOTE(review): heavily elided listing — the [lq] load, LOWPASS/pavgw stages, cnt init,
; the avx/pre-avx %if split on the shifts, loop control and RET are not shown.
1588 cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
1590 mova m3, [lq+mmsize]
1595 SBUTTERFLY wd, 1, 2, 0
1597 mova m5, [pb_2to15_14_15] ; shared shuffle constant for SHIFT_RIGHT(x2)
1599 SHIFT_RIGHTx2 m0, m4, m3, m5
1602 SBUTTERFLY wd, 3, 4, 5
1603 pshufd m0, m0, q3333 ; replicate the last left pixel
1604 DEFINE_ARGS dst, stride, stride3, cnt
1605 lea stride3q, [strideq*3]
1609 mova [dstq+strideq *0+ 0], m1
1610 mova [dstq+strideq *0+16], m2
1611 mova [dstq+strideq *4+ 0], m2
1612 mova [dstq+strideq *4+16], m3
1613 mova [dstq+strideq *8+ 0], m3
1614 mova [dstq+strideq *8+16], m4
1615 mova [dstq+stride3q*4+ 0], m4
1616 mova [dstq+stride3q*4+16], m0
; window shift, avx form (surrounding %if avx guard elided):
1619 vpalignr m1, m2, m1, 4
1620 vpalignr m2, m3, m2, 4
1621 vpalignr m3, m4, m3, 4
1622 vpalignr m4, m0, m4, 4
; window shift, pre-avx form (%else/%endif elided):
1624 PALIGNR m5, m2, m1, 4, m6
1626 PALIGNR m5, m3, m2, 4, m6
1628 PALIGNR m5, m4, m3, 4, m6
1630 PALIGNR m5, m0, m4, 4, m6
; 32x32 horizontal-up (HU) intra predictor, 16 bpp.
; 10 xmm registers (+1 extra without SSSE3 for PALIGNR emulation); the
; stack-space expression uses %1, i.e. this code lives inside a %macro
; whose parameter sizes the x86-32 spill area.
1637 cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
1638 %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
; Load the left edge in four mmsize chunks.  For each chunk, the +2/+4
; unaligned loads are the same data shifted by one and two 16-bit
; pixels — the three taps of the smoothing window (the LOWPASS/pavgw
; combining steps are elided from this excerpt).
1639 mova m2, [lq+mmsize*0+0]
1640 movu m1, [lq+mmsize*0+2]
1641 movu m0, [lq+mmsize*0+4]
1644 SBUTTERFLY wd, 1, 0, 2
; SCRATCH: register m8+ on x86-64, spilled to the stack on x86-32.
1645 SCRATCH 1, 8, rsp+0*mmsize
1646 mova m4, [lq+mmsize*1+0]
1647 movu m3, [lq+mmsize*1+2]
1648 movu m2, [lq+mmsize*1+4]
1651 SBUTTERFLY wd, 3, 2, 4
1652 mova m6, [lq+mmsize*2+0]
1653 movu m5, [lq+mmsize*2+2]
1654 movu m4, [lq+mmsize*2+4]
1657 SBUTTERFLY wd, 5, 4, 6
1658 mova m7, [lq+mmsize*3+0]
1659 SCRATCH 0, 9, rsp+1*mmsize
; Final chunk uses the shift-with-edge-replication mask so the last
; pixel is extended past the end of the left edge.
1661 mova m0, [pb_2to15_14_15]
1663 SHIFT_RIGHTx2 m1, m6, m7, m0
1666 SBUTTERFLY wd, 7, 6, 0
; Broadcast the final pixel — the constant the bottom rows converge to.
1667 pshufd m1, m1, q3333
1668 UNSCRATCH 0, 9, rsp+1*mmsize
; Precompute row offsets: stride4 = 4 lines, stride20 = 20 lines,
; stride28 = 32 - 4 = 28 lines, so eight rows 0..28 in steps of 4 are
; addressable from one base pointer.
1669 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
1670 lea stride3q, [strideq*3]
1671 lea stride4q, [strideq*4]
1672 lea stride28q, [stride4q*8]
1673 lea stride20q, [stride4q*5]
1674 sub stride28q, stride4q
; Swap the broadcast tail (m1) with the saved first output register via
; the two stack slots — not enough live registers to keep both.
1681 mova [rsp+1*mmsize], m1
1682 mova m1, [rsp+0*mmsize]
; Rows 0, 4, 8: each 64-byte row is the previous one shifted along the
; m1,m0,m3,m2,m5,m4,... register chain by one mmsize.
1684 mova [dstq+strideq *0+ 0], m1
1685 mova [dstq+strideq *0+16], m0
1686 mova [dstq+strideq *0+32], m3
1687 mova [dstq+strideq *0+48], m2
1688 mova [dstq+stride4q*1+ 0], m0
1689 mova [dstq+stride4q*1+16], m3
1690 mova [dstq+stride4q*1+32], m2
1691 mova [dstq+stride4q*1+48], m5
1692 mova [dstq+stride4q*2+ 0], m3
1693 mova [dstq+stride4q*2+16], m2
1694 mova [dstq+stride4q*2+32], m5
1695 mova [dstq+stride4q*2+48], m4
; Shift the front of the pipeline by 4 bytes (two pixels) — AVX path.
1697 vpalignr m1, m0, m1, 4
1698 vpalignr m0, m3, m0, 4
1699 vpalignr m3, m2, m3, 4
1701 SCRATCH 6, 9, rsp+2*mmsize
; Pre-SSSE3 PALIGNR emulation needs m7 as scratch, so back it up first.
1702 %if notcpuflag(ssse3)
1703 SCRATCH 7, 10, rsp+3*mmsize
; Non-AVX fallback for the same 4-byte shifts (guards elided here).
1705 PALIGNR m6, m0, m1, 4, m7
1707 PALIGNR m6, m3, m0, 4, m7
1709 PALIGNR m6, m2, m3, 4, m7
1711 UNSCRATCH 6, 9, rsp+2*mmsize
1712 SCRATCH 0, 9, rsp+2*mmsize
1713 %if notcpuflag(ssse3)
1714 UNSCRATCH 7, 10, rsp+3*mmsize
1715 SCRATCH 3, 10, rsp+3*mmsize
; Second register/stack swap, mirroring lines 1681/1682 above.
1721 mova [rsp+0*mmsize], m1
1722 mova m1, [rsp+1*mmsize]
; Rows 12..28: the tail rows progressively fill with the broadcast
; final pixel (m1), ending with three fully-constant quarters at row 28.
1724 mova [dstq+stride3q*4+ 0], m2
1725 mova [dstq+stride3q*4+16], m5
1726 mova [dstq+stride3q*4+32], m4
1727 mova [dstq+stride3q*4+48], m7
1728 mova [dstq+stride4q*4+ 0], m5
1729 mova [dstq+stride4q*4+16], m4
1730 mova [dstq+stride4q*4+32], m7
1731 mova [dstq+stride4q*4+48], m6
1732 mova [dstq+stride20q + 0], m4
1733 mova [dstq+stride20q +16], m7
1734 mova [dstq+stride20q +32], m6
1735 mova [dstq+stride20q +48], m1
1736 mova [dstq+stride3q*8+ 0], m7
1737 mova [dstq+stride3q*8+16], m6
1738 mova [dstq+stride3q*8+32], m1
1739 mova [dstq+stride3q*8+48], m1
1740 mova [dstq+stride28q + 0], m6
1741 mova [dstq+stride28q +16], m1
1742 mova [dstq+stride28q +32], m1
1743 mova [dstq+stride28q +48], m1
; Shift the back of the pipeline by two pixels — AVX path...
1745 vpalignr m2, m5, m2, 4
1746 vpalignr m5, m4, m5, 4
1747 vpalignr m4, m7, m4, 4
1748 vpalignr m7, m6, m7, 4
1749 vpalignr m6, m1, m6, 4
; ...and the non-AVX fallback through temp m0 / scratch m3.
1751 PALIGNR m0, m5, m2, 4, m3
1753 PALIGNR m0, m4, m5, 4, m3
1755 PALIGNR m0, m7, m4, 4, m3
1757 PALIGNR m0, m6, m7, 4, m3
1759 PALIGNR m0, m1, m6, 4, m3
1761 UNSCRATCH 0, 9, rsp+2*mmsize
1762 %if notcpuflag(ssse3)
1763 UNSCRATCH 3, 10, rsp+3*mmsize
; 4x4 horizontal-down (HD) intra predictor, 16 bpp.
; Args: dst, stride, l (left edge), a (above edge); 4 GPRs, 4 xmm regs.
; Rows are written bottom-up: each row up is the one below it shifted by
; two pixels, so the same register pair serves two rows via movh/movhps.
1780 cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
1788 DEFINE_ARGS dst, stride, stride3
1789 lea stride3q, [strideq*3]
; m1 low half -> row 3, high half -> row 1.
1791 movh [dstq+stride3q ], m1
1792 movhps [dstq+strideq*1], m1
; Shift by 4 bytes (two pixels) to produce rows 2 and 0.
1794 PALIGNR m2, m1, 4, m0
1795 movh [dstq+strideq*2], m2
1796 movhps [dstq+strideq*0], m2
; 8x8 horizontal-down (HD) intra predictor, 16 bpp.
1799 cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
; m2/m3 = source pixels shifted by one and two 16-bit pixels (the
; 2- and 4-byte PALIGNR offsets) — the neighbor taps used by HD.
1802 PALIGNR m2, m1, m0, 2, m3
1803 PALIGNR m3, m1, m0, 4, m4
; Interleave words of the two taps so each register holds ready rows.
1806 SBUTTERFLY wd, 2, 3, 0
; mstride presumably holds the negated stride (negation happens in a
; line elided from this excerpt — confirm against full source), so
; dst8 = one row past the bottom and rows are addressed upwards.
1810 DEFINE_ARGS dst8, mstride, cnt
1811 lea dst8q, [dst8q+mstrideq*8]
; Store two rows per iteration (loop label/branch elided): offsets 0
; and 4 lines above the current base.
1817 mova [dst8q+mstrideq*0], m2
1818 mova [dst8q+mstrideq*4], m3
; Advance both row registers by two pixels — AVX path...
1820 vpalignr m2, m3, m2, 4
1821 vpalignr m3, m1, m3, 4
; ...and the pre-AVX fallback through temp m0 / scratch m4.
1823 PALIGNR m0, m3, m2, 4, m4
1825 PALIGNR m0, m1, m3, 4, m4
; 16x16 horizontal-down (HD) intra predictor, 16 bpp.
1833 cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
; Second half of the left edge plus its 1- and 2-pixel shifted taps.
1839 mova m4, [lq+mmsize]
1841 PALIGNR m3, m5, m4, 2, m6
1842 PALIGNR m2, m5, m4, 4, m6
; Interleave words so each register holds two-pixel-shifted row pairs.
1845 SBUTTERFLY wd, 1, 0, 4
1846 SBUTTERFLY wd, 3, 2, 4
; Above edge shifted back one pixel: supplies the top-right continuation
; pixels that rotate in as rows shift upward.
1850 movu m5, [aq+mmsize-2]
; mstride presumably = -stride (negation elided from this excerpt —
; confirm); dst is moved 16 rows down so rows are addressed bottom-up.
1854 DEFINE_ARGS dst, mstride, mstride3, cnt
1855 lea dstq, [dstq+mstrideq*8]
1856 lea dstq, [dstq+mstrideq*8]
1858 lea mstride3q, [mstrideq*3]
; Store four 32-byte rows per iteration (loop control elided), at line
; offsets 12, 8, 4, 0 above the base; consecutive rows share registers
; with a one-register lag, giving the HD diagonal.
1863 mova [dstq+mstride3q*4+ 0], m2
1864 mova [dstq+mstride3q*4+16], m4
1865 mova [dstq+mstrideq *8+ 0], m3
1866 mova [dstq+mstrideq *8+16], m2
1867 mova [dstq+mstrideq *4+ 0], m0
1868 mova [dstq+mstrideq *4+16], m3
1869 mova [dstq+mstrideq *0+ 0], m1
1870 mova [dstq+mstrideq *0+16], m0
; Advance the m1,m0,m3,m2,m4 pipeline by two pixels, pulling from m5
; (the above edge) at the end — AVX path...
1872 vpalignr m1, m0, m1, 4
1873 vpalignr m0, m3, m0, 4
1874 vpalignr m3, m2, m3, 4
1875 vpalignr m2, m4, m2, 4
1876 vpalignr m4, m5, m4, 4
; ...and the pre-AVX fallback through temp m6 / scratch m7.
1878 PALIGNR m6, m0, m1, 4, m7
1880 PALIGNR m6, m3, m0, 4, m7
1882 PALIGNR m6, m2, m3, 4, m7
1884 PALIGNR m6, m4, m2, 4, m7
1886 PALIGNR m6, m5, m4, 4, m7
; 32x32 horizontal-down (HD) intra predictor, 16 bpp.
; x86-64 gets 3 extra GPRs and 14 xmm registers; x86-32 instead gets a
; 10*mmsize stack spill area driven by the SCRATCH/UNSCRATCH macros.
1894 cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
1895 10 * -mmsize * ARCH_X86_32, dst, stride, l, a
; Left edge, chunk 0, plus its one- and two-pixel shifted taps (the +2
; and +4 byte offsets); the combining/filter steps are elided here.
1896 mova m2, [lq+mmsize*0+0]
1897 movu m1, [lq+mmsize*0+2]
1898 movu m0, [lq+mmsize*0+4]
1901 SBUTTERFLY wd, 1, 0, 2
; Left edge, chunk 1.
1902 mova m4, [lq+mmsize*1+0]
1903 movu m3, [lq+mmsize*1+2]
1904 movu m2, [lq+mmsize*1+4]
1907 SBUTTERFLY wd, 3, 2, 4
; Park the first four row registers (m8-m11 on x86-64, stack on x86-32).
1908 SCRATCH 0, 8, rsp+0*mmsize
1909 SCRATCH 1, 9, rsp+1*mmsize
1910 SCRATCH 2, 10, rsp+2*mmsize
1911 SCRATCH 3, 11, rsp+3*mmsize
; Left edge, chunk 2.
1912 mova m6, [lq+mmsize*2+0]
1913 movu m5, [lq+mmsize*2+2]
1914 movu m4, [lq+mmsize*2+4]
1917 SBUTTERFLY wd, 5, 4, 6
; Left edge, chunk 3; its shifted taps cross into the above edge
; (aq-2), stitching the left and top neighbors at the corner.
1918 mova m0, [lq+mmsize*3+0]
1919 movu m1, [aq+mmsize*0-2]
1920 PALIGNR m7, m1, m0, 2, m2
1921 PALIGNR m6, m1, m0, 4, m2
1924 SBUTTERFLY wd, 7, 6, 0
; Above edge, chunks 0-3, each with a -2/+2 shifted pair around it
; (filter steps elided between the loads).
1925 mova m2, [aq+mmsize*0+0]
1926 movu m0, [aq+mmsize*0+2]
1928 movu m1, [aq+mmsize*1-2]
1929 mova m2, [aq+mmsize*1+0]
1930 movu m3, [aq+mmsize*1+2]
1932 SCRATCH 6, 12, rsp+6*mmsize
1933 SCRATCH 7, 13, rsp+7*mmsize
1934 movu m2, [aq+mmsize*2-2]
1935 mova m3, [aq+mmsize*2+0]
1936 movu m6, [aq+mmsize*2+2]
1938 movu m3, [aq+mmsize*3-2]
1942 UNSCRATCH 6, 12, rsp+6*mmsize
1943 UNSCRATCH 7, 13, rsp+7*mmsize
; Stash m4/m5 in plain stack slots (needed again in the second half).
1945 mova [rsp+4*mmsize], m4
1946 mova [rsp+5*mmsize], m5
1947 ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
1948 ; to do it again here
; Precompute row offsets: stride4 = 4 lines, stride20 = 20 lines,
; stride28 = 32 - 4 = 28 lines, covering rows 0..28 in steps of 4.
1950 DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
1952 lea stride3q, [strideq*3]
1954 lea stride4q, [strideq*4]
1955 lea stride28q, [stride4q*8]
1956 lea stride20q, [stride4q*5]
1957 sub stride28q, stride4q
1961 ; x86-32 doesn't have enough registers, so on that platform, we split
1962 ; the loop in 2... Otherwise you spend most of the loop (un)scratching
; registers to and from the spill area instead of doing useful work.
; x86-64 loop body (loop label/branch elided): store eight 64-byte rows
; bottom-up (28, 24, 20, 16, 12, 8, 4, 0); each row up reuses the row
; below shifted by one register along the m9,m8,m11,m10,m5,m4,m7,m6,
; m0,m1,m2 chain — the HD diagonal.
1965 mova [dstq+stride28q + 0], m9
1966 mova [dstq+stride28q +16], m8
1967 mova [dstq+stride28q +32], m11
1968 mova [dstq+stride28q +48], m10
1969 mova [dstq+stride3q*8+ 0], m8
1970 mova [dstq+stride3q*8+16], m11
1971 mova [dstq+stride3q*8+32], m10
1972 mova [dstq+stride3q*8+48], m5
1973 mova [dstq+stride20q + 0], m11
1974 mova [dstq+stride20q +16], m10
1975 mova [dstq+stride20q +32], m5
1976 mova [dstq+stride20q +48], m4
1977 mova [dstq+stride4q*4+ 0], m10
1978 mova [dstq+stride4q*4+16], m5
1979 mova [dstq+stride4q*4+32], m4
1980 mova [dstq+stride4q*4+48], m7
1982 mova [dstq+stride3q*4+ 0], m5
1983 mova [dstq+stride3q*4+16], m4
1984 mova [dstq+stride3q*4+32], m7
1985 mova [dstq+stride3q*4+48], m6
1986 mova [dstq+strideq* 8+ 0], m4
1987 mova [dstq+strideq* 8+16], m7
1988 mova [dstq+strideq* 8+32], m6
1989 mova [dstq+strideq* 8+48], m0
1990 mova [dstq+strideq* 4+ 0], m7
1991 mova [dstq+strideq* 4+16], m6
1992 mova [dstq+strideq* 4+32], m0
1993 mova [dstq+strideq* 4+48], m1
1994 mova [dstq+strideq* 0+ 0], m6
1995 mova [dstq+strideq* 0+16], m0
1996 mova [dstq+strideq* 0+32], m1
1997 mova [dstq+strideq* 0+48], m2
; Advance the entire pipeline by 4 bytes (two 16-bit pixels), each
; register pulling in from its successor — AVX path.
2001 vpalignr m9, m8, m9, 4
2002 vpalignr m8, m11, m8, 4
2003 vpalignr m11, m10, m11, 4
2004 vpalignr m10, m5, m10, 4
2006 vpalignr m5, m4, m5, 4
2007 vpalignr m4, m7, m4, 4
2008 vpalignr m7, m6, m7, 4
2009 vpalignr m6, m0, m6, 4
2010 vpalignr m0, m1, m0, 4
2011 vpalignr m1, m2, m1, 4
2012 vpalignr m2, m3, m2, 4
; Non-AVX fallback for the high registers, via temp m12 / scratch m13.
2015 PALIGNR m12, m8, m9, 4, m13
2017 PALIGNR m12, m11, m8, 4, m13
2019 PALIGNR m12, m10, m11, 4, m13
2021 PALIGNR m12, m5, m10, 4, m13
; ...and for the low registers; pre-SSSE3 PALIGNR emulation needs an
; extra temp, hence the additional SCRATCH/UNSCRATCH juggling.
2024 SCRATCH 3, 12, rsp+8*mmsize, sh
2025 %if notcpuflag(ssse3)
2026 SCRATCH 2, 13, rsp+9*mmsize
2028 PALIGNR m3, m4, m5, 4, m2
2030 PALIGNR m3, m7, m4, 4, m2
2032 PALIGNR m3, m6, m7, 4, m2
2034 PALIGNR m3, m0, m6, 4, m2
2036 PALIGNR m3, m1, m0, 4, m2
2038 %if notcpuflag(ssse3)
2039 UNSCRATCH 2, 13, rsp+9*mmsize
2040 SCRATCH 0, 13, rsp+9*mmsize
2042 PALIGNR m3, m2, m1, 4, m0
2044 PALIGNR m3, reg_sh, m2, 4, m0
2046 %if notcpuflag(ssse3)
2047 UNSCRATCH 0, 13, rsp+9*mmsize
2049 UNSCRATCH 3, 12, rsp+8*mmsize, sh
; --- x86-32 second half-loop: restore the spilled row registers ---
2056 UNSCRATCH 0, 8, rsp+0*mmsize
2057 UNSCRATCH 1, 9, rsp+1*mmsize
2058 UNSCRATCH 2, 10, rsp+2*mmsize
2059 UNSCRATCH 3, 11, rsp+3*mmsize
2060 mova m4, [rsp+4*mmsize]
2061 mova m5, [rsp+5*mmsize]
2062 mova m6, [rsp+6*mmsize]
2063 mova m7, [rsp+7*mmsize]
; Rewind dst by 20 lines (stride5*4) for the upper half of the block.
2064 DEFINE_ARGS dst, stride, stride5, stride3
2065 lea stride5q, [strideq*5]
2066 lea dstq, [dstq+stride5q*4]
2067 DEFINE_ARGS dst, stride, cnt, stride3
; Same bottom-up four-rows-per-iteration store pattern as above, now on
; the m1,m0,m3,m2,m5,m4,m7 chain (loop control elided).
2070 mova [dstq+stride3q*4+ 0], m1
2071 mova [dstq+stride3q*4+16], m0
2072 mova [dstq+stride3q*4+32], m3
2073 mova [dstq+stride3q*4+48], m2
2074 mova [dstq+strideq* 8+ 0], m0
2075 mova [dstq+strideq* 8+16], m3
2076 mova [dstq+strideq* 8+32], m2
2077 mova [dstq+strideq* 8+48], m5
2078 mova [dstq+strideq* 4+ 0], m3
2079 mova [dstq+strideq* 4+16], m2
2080 mova [dstq+strideq* 4+32], m5
2081 mova [dstq+strideq* 4+48], m4
2082 mova [dstq+strideq* 0+ 0], m2
2083 mova [dstq+strideq* 0+16], m5
2084 mova [dstq+strideq* 0+32], m4
2085 mova [dstq+strideq* 0+48], m7
; Two-pixel pipeline advance — AVX path...
2088 vpalignr m1, m0, m1, 4
2089 vpalignr m0, m3, m0, 4
2090 vpalignr m3, m2, m3, 4
2091 vpalignr m2, m5, m2, 4
2092 vpalignr m5, m4, m5, 4
2093 vpalignr m4, m7, m4, 4
2094 vpalignr m7, m6, m7, 4
; ...and the non-AVX/non-SSSE3 fallback with the same scratch juggling
; as the first half.
2096 SCRATCH 6, 12, rsp+8*mmsize, sh
2097 %if notcpuflag(ssse3)
2098 SCRATCH 7, 13, rsp+9*mmsize
2100 PALIGNR m6, m0, m1, 4, m7
2102 PALIGNR m6, m3, m0, 4, m7
2104 PALIGNR m6, m2, m3, 4, m7
2106 PALIGNR m6, m5, m2, 4, m7
2108 PALIGNR m6, m4, m5, 4, m7
2110 %if notcpuflag(ssse3)
2111 UNSCRATCH 7, 13, rsp+9*mmsize
2112 SCRATCH 5, 13, rsp+9*mmsize
2114 PALIGNR m6, m7, m4, 4, m5
2116 PALIGNR m6, reg_sh, m7, 4, m5
2118 %if notcpuflag(ssse3)
2119 UNSCRATCH 5, 13, rsp+9*mmsize
2121 UNSCRATCH 6, 12, rsp+8*mmsize, sh