1 ;******************************************************************************
2 ;* VP9 Intra prediction SIMD optimizations
4 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
7 ;* H.264 intra prediction asm optimizations
8 ;* Copyright (c) 2010 Fiona Glaser
9 ;* Copyright (c) 2010 Holger Lubitz
10 ;* Copyright (c) 2010 Loren Merritt
11 ;* Copyright (c) 2010 Ronald S. Bultje
13 ;* This file is part of FFmpeg.
15 ;* FFmpeg is free software; you can redistribute it and/or
16 ;* modify it under the terms of the GNU Lesser General Public
17 ;* License as published by the Free Software Foundation; either
18 ;* version 2.1 of the License, or (at your option) any later version.
20 ;* FFmpeg is distributed in the hope that it will be useful,
21 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 ;* Lesser General Public License for more details.
25 ;* You should have received a copy of the GNU Lesser General Public
26 ;* License along with FFmpeg; if not, write to the Free Software
27 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 ;******************************************************************************
30 %include "libavutil/x86/x86util.asm"
; Constant tables read by the code below, e.g. as pmulhrsw multipliers,
; pshufb index masks and pand bit masks.
34 pw_m256: times 16 dw -256 ; 0xFF00 per word: as a pshufb mask, each word = source byte 0, zero-extended
35 pw_m255: times 16 dw -255 ; 0xFF01 per word: as a pshufb mask, each word = source byte 1, zero-extended
36 pw_4096: times 8 dw 4096 ; pmulhrsw by 4096 is a rounded shift: (x + 4) >> 3
38 pb_4x3_4x2_4x1_4x0: times 4 db 3
42 pb_8x1_8x0: times 8 db 1
44 pb_8x3_8x2: times 8 db 3
46 pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
48 pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
50 pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
53 pb_2to6_11x7: db 2, 3, 4, 5, 6
55 pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
56 pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
57 pb_13456_3xm1: db 1, 3, 4, 5, 6
59 pb_6012_4xm1: db 6, 0, 1, 2
61 pb_6xm1_246_8toE: times 6 db -1
62 db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
63 pb_6xm1_BDF_0to6: times 6 db -1
64 db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
65 pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
67 pb_15x0_1xm1: times 15 db 0
69 pb_0to2_5x3: db 0, 1, 2
71 pb_6xm1_2x0: times 6 db -1
73 pb_6x0_2xm1: times 6 db 0
93 ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; DC_4to8_FUNCS declares the 4x4 and 8x8 DC predictors. Each one scales the
; words in m0 with pmulhrsw and then writes one store of m0 per row.
95 %macro DC_4to8_FUNCS 0
96 cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
102 pmulhrsw m0, [pw_4096] ; rounded shift: (x + 4) >> 3
; rows 0-1, then advance dst by two rows and write rows 2-3 (4 bytes each)
110 movd [dstq+strideq*0], m0
111 movd [dstq+strideq*1], m0
112 lea dstq, [dstq+strideq*2]
113 movd [dstq+strideq*0], m0
114 movd [dstq+strideq*1], m0
117 cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
120 DEFINE_ARGS dst, stride, stride3
121 lea stride3q, [strideq*3] ; stride3 = 3*stride, so four rows are reachable from one base
127 pmulhrsw m0, [pw_2048] ; pw_2048 is defined outside this file view; presumably 2048 per word, i.e. (x + 8) >> 4 - confirm
; rows 0-3, then advance dst by four rows and write rows 4-7 (8 bytes each)
135 movq [dstq+strideq*0], m0
136 movq [dstq+strideq*1], m0
137 movq [dstq+strideq*2], m0
138 movq [dstq+stride3q ], m0
139 lea dstq, [dstq+strideq*4]
140 movq [dstq+strideq*0], m0
141 movq [dstq+strideq*1], m0
142 movq [dstq+strideq*2], m0
143 movq [dstq+stride3q ], m0
; DC_16to32_FUNCS declares the 16x16 and 32x32 DC predictors. Both write
; four rows from m0 and then advance dst by four rows; the 32x32 version
; writes two 16-byte halves per row.
152 %macro DC_16to32_FUNCS 0
153 cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
156 DEFINE_ARGS dst, stride, stride3, cnt
157 lea stride3q, [strideq*3] ; stride3 = 3*stride
165 pmulhrsw m0, [pw_1024] ; pw_1024 is defined outside this file view; presumably a rounded >>5 - confirm
171 pshuflw m0, m0, q0000 ; replicate word 0 across the four low words
176 mova [dstq+strideq*0], m0
177 mova [dstq+strideq*1], m0
178 mova [dstq+strideq*2], m0
179 mova [dstq+stride3q ], m0
180 lea dstq, [dstq+strideq*4]
185 cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
190 DEFINE_ARGS dst, stride, stride3, cnt
191 lea stride3q, [strideq*3] ; stride3 = 3*stride
203 pmulhrsw m0, [pw_512] ; pw_512 is defined outside this file view; presumably a rounded >>6 - confirm
209 pshuflw m0, m0, q0000 ; replicate word 0 across the four low words
; each row is written as two stores of m0, at byte offsets 0 and 16
214 mova [dstq+strideq*0+ 0], m0
215 mova [dstq+strideq*0+16], m0
216 mova [dstq+strideq*1+ 0], m0
217 mova [dstq+strideq*1+16], m0
218 mova [dstq+strideq*2+ 0], m0
219 mova [dstq+strideq*2+16], m0
220 mova [dstq+stride3q + 0], m0
221 mova [dstq+stride3q +16], m0
222 lea dstq, [dstq+strideq*4]
; AVX2 build of the 32x32 DC predictor (only assembled when
; HAVE_AVX2_EXTERNAL is set). One store of m0 per row, eight rows per
; visible group. NOTE(review): m0 is presumably a 32-byte ymm register
; here - the INIT_* line is outside this view.
233 %if HAVE_AVX2_EXTERNAL
235 cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
238 DEFINE_ARGS dst, stride, stride3, cnt
239 lea stride3q, [strideq*3] ; stride3 = 3*stride
244 vextracti128 xm1, m0, 1 ; xm1 = upper 128-bit lane of m0
248 pmulhrsw xm0, [pw_512] ; pw_512 is defined outside this file view; presumably a rounded >>6 - confirm
252 mova [dstq+strideq*0], m0
253 mova [dstq+strideq*1], m0
254 mova [dstq+strideq*2], m0
255 mova [dstq+stride3q ], m0
256 lea dstq, [dstq+strideq*4]
257 mova [dstq+strideq*0], m0
258 mova [dstq+strideq*1], m0
259 mova [dstq+strideq*2], m0
260 mova [dstq+stride3q ], m0
261 lea dstq, [dstq+strideq*4]
267 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; DC_1D_4to8_FUNCS declares vp9_ipred_dc_<dir>_4x4 and _8x8, where %1 is
; pasted into the function name. %2 names the edge argument (a or l) per
; the macro comment; its use is not visible in these lines.
269 %macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
270 cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
275 pmulhrsw m0, [pw_8192] ; pw_8192 is defined outside this file view; presumably a rounded >>2 - confirm
; four rows of 4 bytes, two rows per dst advance
283 movd [dstq+strideq*0], m0
284 movd [dstq+strideq*1], m0
285 lea dstq, [dstq+strideq*2]
286 movd [dstq+strideq*0], m0
287 movd [dstq+strideq*1], m0
290 cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
292 DEFINE_ARGS dst, stride, stride3
293 lea stride3q, [strideq*3] ; stride3 = 3*stride
297 pmulhrsw m0, [pw_4096] ; rounded shift: (x + 4) >> 3
; eight rows of 8 bytes, four rows per dst advance
305 movq [dstq+strideq*0], m0
306 movq [dstq+strideq*1], m0
307 movq [dstq+strideq*2], m0
308 movq [dstq+stride3q ], m0
309 lea dstq, [dstq+strideq*4]
310 movq [dstq+strideq*0], m0
311 movq [dstq+strideq*1], m0
312 movq [dstq+strideq*2], m0
313 movq [dstq+stride3q ], m0
; Instantiate the 1-D DC 4x4/8x8 predictors for the top and left edges.
; NOTE(review): the pair appears twice; presumably each pair is assembled
; under a different INIT_* CPU setting that is outside this view - confirm.
318 DC_1D_4to8_FUNCS top, a
319 DC_1D_4to8_FUNCS left, l
321 DC_1D_4to8_FUNCS top, a
322 DC_1D_4to8_FUNCS left, l
; DC_1D_16to32_FUNCS declares vp9_ipred_dc_<dir>_16x16 and _32x32, where
; %1 is pasted into the function name. Both write four rows from m0 and
; then advance dst by four rows.
324 %macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
325 cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
327 DEFINE_ARGS dst, stride, stride3, cnt
328 lea stride3q, [strideq*3] ; stride3 = 3*stride
334 pmulhrsw m0, [pw_2048] ; pw_2048 is defined outside this file view; presumably a rounded >>4 - confirm
340 pshuflw m0, m0, q0000 ; replicate word 0 across the four low words
345 mova [dstq+strideq*0], m0
346 mova [dstq+strideq*1], m0
347 mova [dstq+strideq*2], m0
348 mova [dstq+stride3q ], m0
349 lea dstq, [dstq+strideq*4]
354 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
357 DEFINE_ARGS dst, stride, stride3, cnt
358 lea stride3q, [strideq*3] ; stride3 = 3*stride
366 pmulhrsw m0, [pw_1024] ; pw_1024 is defined outside this file view; presumably a rounded >>5 - confirm
372 pshuflw m0, m0, q0000 ; replicate word 0 across the four low words
; each row is written as two stores of m0, at byte offsets 0 and 16
377 mova [dstq+strideq*0+ 0], m0
378 mova [dstq+strideq*0+16], m0
379 mova [dstq+strideq*1+ 0], m0
380 mova [dstq+strideq*1+16], m0
381 mova [dstq+strideq*2+ 0], m0
382 mova [dstq+strideq*2+16], m0
383 mova [dstq+stride3q + 0], m0
384 mova [dstq+stride3q +16], m0
385 lea dstq, [dstq+strideq*4]
; Instantiate the 1-D DC 16x16/32x32 predictors for the top and left edges.
; NOTE(review): the pair appears twice; presumably each pair is assembled
; under a different INIT_* CPU setting that is outside this view - confirm.
392 DC_1D_16to32_FUNCS top, a
393 DC_1D_16to32_FUNCS left, l
395 DC_1D_16to32_FUNCS top, a
396 DC_1D_16to32_FUNCS left, l
; DC_1D_AVX2_FUNCS declares vp9_ipred_dc_<dir>_32x32 when
; HAVE_AVX2_EXTERNAL is set; %1 is pasted into the function name.
; One store of m0 per row, eight rows per visible group.
398 %macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
399 %if HAVE_AVX2_EXTERNAL
400 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
402 DEFINE_ARGS dst, stride, stride3, cnt
403 lea stride3q, [strideq*3] ; stride3 = 3*stride
406 vextracti128 xm1, m0, 1 ; xm1 = upper 128-bit lane of m0
410 pmulhrsw xm0, [pw_1024] ; pw_1024 is defined outside this file view; presumably a rounded >>5 - confirm
414 mova [dstq+strideq*0], m0
415 mova [dstq+strideq*1], m0
416 mova [dstq+strideq*2], m0
417 mova [dstq+stride3q ], m0
418 lea dstq, [dstq+strideq*4]
419 mova [dstq+strideq*0], m0
420 mova [dstq+strideq*1], m0
421 mova [dstq+strideq*2], m0
422 mova [dstq+stride3q ], m0
423 lea dstq, [dstq+strideq*4]
; Instantiate the AVX2 1-D DC 32x32 predictors for the top and left edges.
431 DC_1D_AVX2_FUNCS top, a
432 DC_1D_AVX2_FUNCS left, l
; 8x8 "v" predictor: writes the low 8 bytes of m0 to each of eight rows.
; NOTE(review): m0 is presumably loaded from the `a` argument; that load
; is not visible here - confirm.
437 cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
439 DEFINE_ARGS dst, stride, stride3
440 lea stride3q, [strideq*3] ; stride3 = 3*stride
441 movq [dstq+strideq*0], m0
442 movq [dstq+strideq*1], m0
443 movq [dstq+strideq*2], m0
444 movq [dstq+stride3q ], m0
445 lea dstq, [dstq+strideq*4] ; advance to rows 4-7
446 movq [dstq+strideq*0], m0
447 movq [dstq+strideq*1], m0
448 movq [dstq+strideq*2], m0
449 movq [dstq+stride3q ], m0
; 16x16 "v" predictor: writes m0 to four rows, then advances dst by four
; rows. The `cnt` argument suggests this group is repeated in a loop whose
; control instructions are not visible here.
453 cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
455 DEFINE_ARGS dst, stride, stride3, cnt
456 lea stride3q, [strideq*3] ; stride3 = 3*stride
459 mova [dstq+strideq*0], m0
460 mova [dstq+strideq*1], m0
461 mova [dstq+strideq*2], m0
462 mova [dstq+stride3q ], m0
463 lea dstq, [dstq+strideq*4]
; 32x32 "v" predictor, two-register version: every row gets m0 at byte
; offset 0 and m1 at byte offset 16. Four rows are written, then dst
; advances by four rows.
469 cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
472 DEFINE_ARGS dst, stride, stride3, cnt
473 lea stride3q, [strideq*3] ; stride3 = 3*stride
476 mova [dstq+strideq*0+ 0], m0
477 mova [dstq+strideq*0+16], m1
478 mova [dstq+strideq*1+ 0], m0
479 mova [dstq+strideq*1+16], m1
480 mova [dstq+strideq*2+ 0], m0
481 mova [dstq+strideq*2+16], m1
482 mova [dstq+stride3q + 0], m0
483 mova [dstq+stride3q +16], m1
484 lea dstq, [dstq+strideq*4]
; 32x32 "v" predictor, single-register version: one store of m0 per row,
; eight rows per visible group. NOTE(review): presumably the AVX2 build
; with m0 as a 32-byte register - the INIT_* line is outside this view.
490 cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
492 DEFINE_ARGS dst, stride, stride3, cnt
493 lea stride3q, [strideq*3] ; stride3 = 3*stride
496 mova [dstq+strideq*0], m0
497 mova [dstq+strideq*1], m0
498 mova [dstq+strideq*2], m0
499 mova [dstq+stride3q ], m0
500 lea dstq, [dstq+strideq*4]
501 mova [dstq+strideq*0], m0
502 mova [dstq+strideq*1], m0
503 mova [dstq+strideq*2], m0
504 mova [dstq+stride3q ], m0
505 lea dstq, [dstq+strideq*4]
; 4x4 "h" predictor: takes three arguments (dst, stride, l) plus one extra
; register for stride3, and writes one 4-byte store per row.
; NOTE(review): the pshufb and pshuflw lines are presumably alternative
; code paths selected by a cpuflag test that is not visible - confirm.
514 cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
517 pshufb m0, [pb_4x3_4x2_4x1_4x0]
520 pshuflw m0, m0, q0123 ; reverse the order of the four low words
523 lea stride3q, [strideq*3] ; stride3 = 3*stride
524 movd [dstq+strideq*0], m0
526 movd [dstq+strideq*1], m0
528 movd [dstq+strideq*2], m0
530 movd [dstq+stride3q ], m0
; 8x8 "h" predictor. %1 (register count) comes from an enclosing macro
; whose header is not visible here. Four rows are written per group:
; low/high halves of m1, then low/high halves of m0.
534 cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
536 mova m2, [pb_8x1_8x0]
537 mova m3, [pb_8x3_8x2]
539 lea stride3q, [strideq*3] ; stride3 = 3*stride
552 movq [dstq+strideq*0], m1 ; low 8 bytes of m1
553 movhps [dstq+strideq*1], m1 ; high 8 bytes of m1
554 movq [dstq+strideq*2], m0 ; low 8 bytes of m0
555 movhps [dstq+stride3q ], m0 ; high 8 bytes of m0
556 lea dstq, [dstq+strideq*4]
; 16x16 "h" predictor. %2 (register count) comes from an enclosing macro
; whose header is not visible here. Each of m0..m3 supplies one full row;
; dst then advances by four rows.
561 cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
568 lea stride3q, [strideq*3] ; stride3 = 3*stride
581 mova [dstq+strideq*0], m0
582 mova [dstq+strideq*1], m1
590 mova [dstq+strideq*2], m2
591 mova [dstq+stride3q ], m3
592 lea dstq, [dstq+strideq*4]
; 32x32 "h" predictor. %2 (register count) comes from an enclosing macro
; whose header is not visible here. Each of m0..m3 is stored twice (byte
; offsets 0 and 16) to fill one row; dst then advances by four rows.
597 cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
604 lea stride3q, [strideq*3] ; stride3 = 3*stride
617 mova [dstq+strideq*0+ 0], m0
618 mova [dstq+strideq*0+16], m0
619 mova [dstq+strideq*1+ 0], m1
620 mova [dstq+strideq*1+16], m1
628 mova [dstq+strideq*2+ 0], m2
629 mova [dstq+strideq*2+16], m2
630 mova [dstq+stride3q + 0], m3
631 mova [dstq+stride3q +16], m3
632 lea dstq, [dstq+strideq*4]
; AVX2 build of the 32x32 "h" predictor (only assembled when
; HAVE_AVX2_EXTERNAL is set). Four bytes of `l` are fetched per group of
; four rows; m0..m3 each supply one row.
645 %if HAVE_AVX2_EXTERNAL
647 cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
652 lea stride3q, [strideq*3] ; stride3 = 3*stride
655 movd xm3, [lq+cntq*4] ; load 4 bytes from l at byte offset cnt*4
656 vinserti128 m3, m3, xm3, 1 ; copy the low lane into the high lane
659 mova [dstq+strideq*0], m0
660 mova [dstq+strideq*1], m1
663 mova [dstq+strideq*2], m2
664 mova [dstq+stride3q ], m3
665 lea dstq, [dstq+strideq*4]
; TM_MMX_FUNCS declares the 4x4 "tm" predictor. Two rows (m4, m2) are
; written per group, after which dst advances by two rows.
673 %macro TM_MMX_FUNCS 0
674 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
679 DEFINE_ARGS dst, stride, l, cnt ; the 4th register is reused as `cnt` from here on
691 pinsrw m2, [lq+cntq*2], 0 ; word 0 of m2 = the two bytes at l + cnt*2
704 movd [dstq+strideq*0], m4
705 movd [dstq+strideq*1], m2
706 lea dstq, [dstq+strideq*2]
; TM_XMM_FUNCS declares the XMM "tm" predictors, starting with 8x8.
; tm_8x8 writes two rows per group from the low and high halves of m4.
717 %macro TM_XMM_FUNCS 0
718 cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
723 DEFINE_ARGS dst, stride, l, cnt ; the 4th register is reused as `cnt` from here on
736 pinsrw m2, [lq+cntq*2], 0 ; word 0 of m2 = the two bytes at l + cnt*2
749 movh [dstq+strideq*0], m4 ; low half of m4 -> first row
750 movhps [dstq+strideq*1], m4 ; high half of m4 -> second row
751 lea dstq, [dstq+strideq*2]
; 16x16 "tm" predictor: two full rows (m2, m6) are written per group,
; after which dst advances by two rows.
756 cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
762 DEFINE_ARGS dst, stride, l, cnt ; the 4th register is reused as `cnt` from here on
776 pinsrw m7, [lq+cntq*2], 0 ; word 0 of m7 = the two bytes at l + cnt*2
792 mova [dstq+strideq*0], m2
793 mova [dstq+strideq*1], m6
794 lea dstq, [dstq+strideq*2]
; 32x32 "tm" predictor. Reads three 16-byte slots from the stack
; ([rsp+0*16] .. [rsp+2*16]) and adds them to two per-row word vectors
; (m7 and m3), producing two rows per group.
804 cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
809 DEFINE_ARGS dst, stride, l, cnt
; pw_m256_reg / pw_m255_reg are defined twice: once as registers m12/m13
; and once as memory operands. NOTE(review): presumably the two branches of
; an %if/%else on register availability that is not visible here - confirm.
814 %define pw_m256_reg m12
815 %define pw_m255_reg m13
817 %define pw_m256_reg [pw_m256]
818 %define pw_m255_reg [pw_m255]
820 pshufb m4, pw_m256_reg ; every word = byte 0 of m4, zero-extended
847 pinsrw m3, [lq+cntq*2], 0 ; word 0 of m3 = the two bytes at l + cnt*2
849 pshufb m7, m3, pw_m255_reg ; m7: every word = byte 1 of m3, zero-extended
850 pshufb m3, pw_m256_reg ; m3: every word = byte 0 of m3, zero-extended
; first row of the pair: stack slots + m7
868 paddw m4, m7, [rsp+0*16]
869 paddw m5, m7, [rsp+1*16]
870 paddw m6, m7, [rsp+2*16]
; second row of the pair: stack slots + m3
872 paddw m0, m3, [rsp+0*16]
873 paddw m1, m3, [rsp+1*16]
874 paddw m2, m3, [rsp+2*16]
881 mova [dstq+strideq*0+ 0], m4
882 mova [dstq+strideq*0+16], m6
883 mova [dstq+strideq*1+ 0], m0
884 mova [dstq+strideq*1+16], m2
885 lea dstq, [dstq+strideq*2]
; AVX2 build of the 32x32 "tm" predictor (only assembled when
; HAVE_AVX2_EXTERNAL is set). Two rows (m2, m6) are written per group.
901 %if HAVE_AVX2_EXTERNAL
903 cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
905 pinsrw xm2, [aq-1], 0 ; word 0 of xm2 = the bytes a[-1], a[0]
906 vinserti128 m2, m2, xm2, 1 ; copy the low lane into the high lane
908 DEFINE_ARGS dst, stride, l, cnt ; the 4th register is reused as `cnt` from here on
918 pinsrw xm7, [lq+cntq*2], 0 ; word 0 of xm7 = the two bytes at l + cnt*2
919 vinserti128 m7, m7, xm7, 1 ; copy the low lane into the high lane
928 mova [dstq+strideq*0], m2
929 mova [dstq+strideq*1], m6
930 lea dstq, [dstq+strideq*2]
938 %macro LOWPASS 4 ; left [dst], center, right, tmp
; DL_MMX_FUNCS declares the 4x4 "dl" predictor. Two shifted copies of m1
; with the last byte (7) replicated are built in m0 and m2.
; NOTE(review): the pshufb pair and the punpckhbw/pand/psrlq/por sequence
; build the same byte patterns; presumably they are alternative cpuflag
; paths whose %if lines are not visible - confirm.
946 %macro DL_MMX_FUNCS 0
947 cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
950 pshufb m0, m1, [pb_0to5_2x7]
951 pshufb m2, m1, [pb_2to6_3x7]
953 punpckhbw m3, m1, m1 ; 44556677
954 pand m0, m1, [pb_6xm1_2x0] ; 012345__
955 pand m3, [pb_6x0_2xm1] ; ______77
956 psrlq m2, m1, 16 ; 234567__
957 por m0, m3 ; 01234577
958 por m2, m3 ; 23456777
; rows are written in pairs two rows apart (offsets 0 and 2*stride)
964 movd [dstq+strideq*0], m0
965 movd [dstq+strideq*2], m1
969 movd [dstq+strideq*0], m0
970 movd [dstq+strideq*2], m1
; DL_XMM_FUNCS declares the XMM "dl" predictors, starting with 8x8.
; dl_8x8 reuses the 3rd register as stride5 (= 5*stride) and writes rows
; in pairs four rows apart: (0,4), (1,5), then after dst += 2*stride the
; same offsets again, i.e. (2,6), (3,7).
979 %macro DL_XMM_FUNCS 0
980 cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
982 lea stride5q, [strideq*5]
984 pshufb m1, m0, [pb_1to6_10x7]
986 punpcklbw m1, m0, m0 ; 0011223344556677
987 punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7
990 %if notcpuflag(ssse3)
998 movq [dstq+strideq*0], m0
999 movq [dstq+strideq*4], m1
1002 movq [dstq+strideq*1], m0
1003 movq [dstq+stride5q ], m1
1004 lea dstq, [dstq+strideq*2]
1007 movq [dstq+strideq*0], m0
1008 movq [dstq+strideq*4], m1
1011 movq [dstq+strideq*1], m0
1012 movq [dstq+stride5q ], m1
; 16x16 "dl" predictor. Builds copies of m0 shifted by one and two bytes
; with byte F replicated at the end, then writes rows in pairs eight rows
; apart (offsets 0/8 and 1/9), advancing dst by two rows per group.
; NOTE(review): the pshufb lines and the pand/psrldq/por sequence are
; presumably alternative cpuflag paths - the %if lines are not visible.
1015 cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
1018 mova m5, [pb_1toE_2xF]
1021 pshufb m4, m0, [pb_15]
1023 pand m5, m0, [pb_15x0_1xm1] ; _______________F
1024 psrldq m1, m0, 1 ; 123456789ABCDEF_
1025 por m1, m5 ; 123456789ABCDEFF
1026 psrldq m2, m1, 1 ; 23456789ABCDEFF_
1027 por m2, m5 ; 23456789ABCDEFFF
1028 pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF
1031 DEFINE_ARGS dst, stride, cnt, stride9
1032 lea stride9q, [strideq+strideq*8] ; stride9 = 9*stride
1037 mova [dstq+strideq*0], m0
1044 mova [dstq+strideq*8], m4
1046 mova [dstq+strideq*1], m0
1053 mova [dstq+stride9q ], m4
1054 lea dstq, [dstq+strideq*2]
; 32x32 "dl" predictor. Uses a second row pointer, dst16, so that rows
; n, n+8, n+16 and n+24 can be written in one pass.
; PALIGNR is a wrapper macro defined outside this file (see the x86util
; include).
1059 cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
1062 PALIGNR m2, m1, m0, 1, m4
1063 PALIGNR m3, m1, m0, 2, m4
1066 mova m5, [pb_1toE_2xF]
1069 pshufb m6, m1, [pb_15]
1072 pand m5, m1, [pb_15x0_1xm1] ; _______________F
1073 psrldq m2, m1, 1 ; 123456789ABCDEF_
1074 por m2, m5 ; 123456789ABCDEFF
1075 psrldq m3, m2, 1 ; 23456789ABCDEFF_
1076 por m3, m5 ; 23456789ABCDEFFF
1077 pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF
1078 pshufd m6, m7, q3333 ; replicate dword 3 of m7 across m6
; the two lea lines together give dst16 = dst + 16*stride
1081 lea dst16q, [dstq +strideq*8]
1083 lea dst16q, [dst16q+strideq*8]
1086 mova [dstq +strideq*0+ 0], m0 ; row n
1087 mova [dstq +strideq*0+16], m1
1088 movhps [dstq+strideq*8+ 0], m0 ; row n+8: high half of m0, low half of m1, then m7
1089 movq [dstq +strideq*8+ 8], m1
1090 mova [dstq +strideq*8+16], m7
1091 mova [dst16q+strideq*0+ 0], m1 ; row n+16
1092 mova [dst16q+strideq*0+16], m6
1093 mova [dst16q+strideq*8+ 0], m7 ; row n+24
1094 mova [dst16q+strideq*8+16], m6
1096 vpalignr m0, m1, m0, 1
1098 %elif cpuflag(ssse3)
1099 palignr m2, m1, m0, 1
; DR_MMX_FUNCS declares the 4x4 "dr" predictor. Rows are written bottom-up
; (row 3 first, row 0 last), one 4-byte store each.
; PALIGNR is a wrapper macro defined outside this file (see the x86util
; include).
1126 %macro DR_MMX_FUNCS 0
1127 cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
1129 punpckldq m0, [aq-1] ; high dword of m0 = the 4 bytes starting at a[-1]
1131 DEFINE_ARGS dst, stride, stride3
1132 lea stride3q, [strideq*3] ; stride3 = 3*stride
1133 PALIGNR m1, m0, 1, m3
1137 movd [dstq+stride3q ], m0
1139 movd [dstq+strideq*2], m0
1141 movd [dstq+strideq*1], m0
1143 movd [dstq+strideq*0], m0
; DR_XMM_FUNCS declares the XMM "dr" predictors, starting with 8x8.
; dr_8x8 writes every row from the high 8 bytes of m0 (movhps): rows 0-3,
; then rows 4-7 after dst advances by four rows.
1152 %macro DR_XMM_FUNCS 0
1153 cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
1157 DEFINE_ARGS dst, stride, stride3
1158 lea stride3q, [strideq*3] ; stride3 = 3*stride
1160 PALIGNR m2, m1, 1, m3
1163 movhps [dstq+strideq*0], m0
1165 movhps [dstq+strideq*1], m0
1167 movhps [dstq+strideq*2], m0
1169 movhps [dstq+stride3q ], m0
1171 lea dstq, [dstq+strideq*4]
1172 movhps [dstq+strideq*0], m0
1174 movhps [dstq+strideq*1], m0
1176 movhps [dstq+strideq*2], m0
1178 movhps [dstq+stride3q ], m0
; 16x16 "dr" predictor. Rows n and n+8 are written together: row n is m3,
; row n+8 is the high half of m0 followed by the low half of m3.
1181 cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
1185 DEFINE_ARGS dst, stride, stride9, cnt
; the two lea lines together give stride9 = 9*stride
1186 lea stride9q, [strideq *3]
1188 lea stride9q, [stride9q*3]
1189 PALIGNR m4, m2, 1, m5
1190 PALIGNR m3, m2, m1, 15, m5
1193 PALIGNR m2, m1, 1, m4
1197 mova [dstq+strideq*0 ], m3
1198 movhps [dstq+strideq*8+0], m0
1199 movq [dstq+strideq*8+8], m3
1200 PALIGNR m3, m0, 15, m1
1202 mova [dstq+strideq*1 ], m3
1203 movhps [dstq+stride9q +0], m0
1204 movq [dstq+stride9q +8], m3
1205 PALIGNR m3, m0, 15, m1
1207 lea dstq, [dstq+strideq*2]
; 32x32 "dr" predictor. stride8 = 8*stride, so stride8q*2 addresses the
; row 16 below the current one; rows n (m4|m5) and n+16 (m3|m4) are
; visible here.
1212 cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
1218 DEFINE_ARGS dst, stride, stride8, cnt
1219 lea stride8q, [strideq*8]
1220 PALIGNR m5, m4, 1, m7
1221 PALIGNR m6, m4, m3, 15, m7
1223 PALIGNR m4, m3, 1, m7
1224 PALIGNR m6, m3, m2, 15, m7
1226 PALIGNR m3, m2, 1, m7
1227 PALIGNR m6, m2, m1, 15, m7
1229 PALIGNR m2, m1, 1, m6
1236 mova [dstq+stride8q*0+ 0], m4
1237 mova [dstq+stride8q*0+16], m5
1238 mova [dstq+stride8q*2+ 0], m3
1239 mova [dstq+stride8q*2+16], m4
1240 PALIGNR m5, m4, 15, m6
1241 PALIGNR m4, m3, 15, m6
1242 PALIGNR m3, m2, 15, m6
; 4x4 "vl" predictor: rows alternate between m1 and m2, with dst advanced
; by two rows between the two visible pairs of stores.
1260 cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
1266 movd [dstq+strideq*0], m1
1267 movd [dstq+strideq*1], m2
1268 lea dstq, [dstq+strideq*2]
1271 movd [dstq+strideq*0], m1
1272 movd [dstq+strideq*1], m2
; VL_XMM_FUNCS declares the XMM "vl" predictors, starting with 8x8.
; vl_8x8 writes rows alternately from m1 (even rows) and m2 (odd rows),
; eight rows in total.
1275 %macro VL_XMM_FUNCS 0
1276 cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
1279 pshufb m0, [pb_0to6_9x7]
1281 punpcklbw m1, m0, m0
1283 shufps m0, m1, q3310
1285 DEFINE_ARGS dst, stride, stride3
1286 lea stride3q, [strideq*3] ; stride3 = 3*stride
1292 movq [dstq+strideq*0], m1
1293 movq [dstq+strideq*1], m2
1296 movq [dstq+strideq*2], m1
1297 movq [dstq+stride3q ], m2
1298 lea dstq, [dstq+strideq*4]
1301 movq [dstq+strideq*0], m1
1302 movq [dstq+strideq*1], m2
1305 movq [dstq+strideq*2], m1
1306 movq [dstq+stride3q ], m2
; 16x16 "vl" predictor. Builds copies of m0 shifted by one and two bytes
; with byte F replicated at the end, then writes rows alternately from m1
; and m2, four rows per group.
1309 cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
1311 DEFINE_ARGS dst, stride, stride3, cnt
1312 lea stride3q, [strideq*3] ; stride3 = 3*stride
1314 mova m4, [pb_1toE_2xF]
1318 pand m4, m0, [pb_15x0_1xm1] ; _______________F
1319 psrldq m1, m0, 1 ; 123456789ABCDEF_
1320 por m1, m4 ; 123456789ABCDEFF
1321 psrldq m2, m1, 1 ; 23456789ABCDEFF_
1322 por m2, m4 ; 23456789ABCDEFFF
1328 mova [dstq+strideq*0], m1
1329 mova [dstq+strideq*1], m2
1339 mova [dstq+strideq*2], m1
1340 mova [dstq+stride3q ], m2
1350 lea dstq, [dstq+strideq*4]
; 32x32 "vl" predictor. Uses a second row pointer, dst16, which the two
; lea lines below move to dst + 16*stride, so that rows n and n+16 are
; written together.
; NOTE(review): the stores that use %1/%2/%3 are presumably the body of a
; nested helper macro (row offset, two data registers) whose %macro line
; and invocations are not visible here - confirm.
1355 cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
1358 DEFINE_ARGS dst, stride, dst16, cnt
1359 PALIGNR m2, m5, m0, 1, m4
1360 PALIGNR m3, m5, m0, 2, m4
1361 lea dst16q, [dstq +strideq*8]
1365 mova m4, [pb_1toE_2xF]
1369 pand m4, m5, [pb_15x0_1xm1] ; _______________F
1370 psrldq m0, m5, 1 ; 123456789ABCDEF_
1371 por m0, m4 ; 123456789ABCDEFF
1372 psrldq m1, m0, 1 ; 23456789ABCDEFF_
1373 por m1, m4 ; 23456789ABCDEFFF
1375 lea dst16q, [dst16q+strideq*8]
1381 punpckhbw m5, m4, m4
1382 pshufhw m5, m5, q3333 ; replicate word 7 across the four high words
1389 mova [dstq+stride%1+ 0], %2
1390 mova [dstq+stride%1+16], %3
1391 movhps [dst16q+stride%1 ], %2 ; row n+16: high half of %2, all of %3, then 8 bytes of m5
1392 movu [dst16q+stride%1+ 8], %3
1393 movq [dst16q+stride%1+24], m5
1395 palignr %2, %3, %2, 1
1397 %elif cpuflag(ssse3)
1398 palignr m6, %3, %2, 1
1412 lea dstq, [dstq +strideq*2]
1413 lea dst16q, [dst16q+strideq*2]
; VR_MMX_FUNCS declares the 4x4 "vr" predictor.
; NOTE(review): two store sequences are visible (one built around pshufb,
; one around PALIGNR); presumably they are alternative cpuflag paths whose
; %if lines are not visible - confirm.
1428 %macro VR_MMX_FUNCS 0
1429 cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
1433 DEFINE_ARGS dst, stride, stride3
1434 lea stride3q, [strideq*3] ; stride3 = 3*stride
1436 PALIGNR m1, m2, 5, m3
1441 ; ABCD <- for the following predictor:
1443 ; IABC | m0 contains ABCDxxxx
1444 ; JEFG | m2 contains xJIEFGHx
1448 pshufb m2, [pb_13456_3xm1]
1449 movd [dstq+strideq*0], m0
1450 pshufb m0, [pb_6012_4xm1]
1451 movd [dstq+stride3q ], m2
1453 movd [dstq+strideq*2], m0
1454 movd [dstq+strideq*1], m2
1458 movd [dstq+strideq*0], m0
1459 movd [dstq+strideq*1], m2
1460 PALIGNR m0, m1, 7, m3
1462 PALIGNR m2, m1, 7, m3
1463 movd [dstq+strideq*2], m0
1464 movd [dstq+stride3q ], m2
; VR_XMM_FUNCS declares the XMM "vr" predictors, starting with 8x8; %1 is
; the XMM register count passed on to vr_16x16.
; vr_8x8 writes rows 0-1 from m0 (low half) and m1 (high half), and the
; remaining rows from the high halves of m0 and m1.
1474 %macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
1475 cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
1479 DEFINE_ARGS dst, stride, stride3
1480 lea stride3q, [strideq*3] ; stride3 = 3*stride
1482 PALIGNR m1, m2, 9, m3
1487 ; ABCDEFGH <- for the following predictor:
1489 ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
1490 ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
1497 punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
1499 movq [dstq+strideq*0], m0
1500 movhps [dstq+strideq*1], m1
1502 pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
1503 pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
1505 psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx
1506 pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx
1507 packuswb m3, m2 ; xVTRxxxxxUSQxxxx
1508 pslldq m3, 4 ; xxxxxVTRxxxxxUSQ
1509 PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG
1512 PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO
1514 movhps [dstq+strideq*2], m0
1515 movhps [dstq+stride3q ], m1
1516 lea dstq, [dstq+strideq*4]
1519 movhps [dstq+strideq*0], m0
1520 movhps [dstq+strideq*1], m1
1523 movhps [dstq+strideq*2], m0
1524 movhps [dstq+stride3q ], m1
; 16x16 "vr" predictor; its XMM register count is the enclosing macro's %1.
; Four rows are written per group: m0 and m3 directly, then m4 and m5,
; which are derived from them by a 15-byte PALIGNR.
1527 cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
1531 DEFINE_ARGS dst, stride, stride3, cnt
1532 lea stride3q, [strideq*3] ; stride3 = 3*stride
1533 PALIGNR m3, m1, m2, 15, m6
1536 PALIGNR m1, m2, 1, m6
1540 pshufb m1, [pb_02468ACE_13579BDF] ; even-indexed bytes to the low half, odd-indexed to the high half
1550 mova [dstq+strideq*0], m0
1551 mova [dstq+strideq*1], m3
1552 PALIGNR m4, m0, m1, 15, m6
1553 PALIGNR m5, m3, m2, 15, m6
1554 mova [dstq+strideq*2], m4
1555 mova [dstq+stride3q ], m5
1556 lea dstq, [dstq+strideq*4]
1557 PALIGNR m0, m1, 14, m6
1558 PALIGNR m3, m2, 14, m6
; 32x32 "vr" predictor. Uses a second row pointer, dst16 = dst + 16*stride,
; so that rows n and n+16 are written together. The store sequence is a
; local helper, %%write, which is invoked once for the even row and once
; for the odd row of each pair.
; PALIGNR and SBUTTERFLY are wrapper macros defined outside this file (see
; the x86util include).
1564 cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
1568 PALIGNR m3, m2, m0, 15, m6
1569 PALIGNR m4, m2, m0, 14, m6
1573 PALIGNR m5, m1, m2, 15, m6
1582 PALIGNR m1, m2, 1, m0
1583 PALIGNR m7, m2, m6, 15, m0
1585 PALIGNR m2, m6, 1, m0
1589 pshufb m1, [pb_02468ACE_13579BDF] ; even-indexed bytes to the low half, odd-indexed to the high half
1590 pshufb m2, [pb_02468ACE_13579BDF]
1599 DEFINE_ARGS dst, stride, dst16, cnt
; the two lea lines together give dst16 = dst + 16*stride
1600 lea dst16q, [dstq +strideq*8]
1601 lea dst16q, [dst16q+strideq*8]
1602 SBUTTERFLY qdq, 2, 1, 6
1611 ; even lines (0, 2, 4, ...): m1 | m0, m3
1612 ; odd lines (1, 3, 5, ...): m2 | m5, m4
; %%write body: %1 = row offset suffix (q*0 or q*1), %2..%4 = data registers.
; Row n gets %3|%4; row n+16 gets high half of %2, all of %3, low half of %4.
1614 mova [dstq+stride%1+ 0], %3
1615 mova [dstq+stride%1+16], %4
1616 movhps [dst16q+stride%1 ], %2
1617 movu [dst16q+stride%1+ 8], %3
1618 movq [dst16q+stride%1+24], %4
1619 PALIGNR %4, %3, 15, m6
1620 PALIGNR %3, %2, 15, m6
1624 %%write q*0, m1, m0, m3
1625 %%write q*1, m2, m5, m4
1626 lea dstq, [dstq +strideq*2]
1627 lea dst16q, [dst16q+strideq*2]
; 4x4 "hd" predictor. Rows are written in the order 3, 1, 2, 0, taking
; 4 bytes alternately from m1 and m0.
1643 cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
1645 punpckldq m0, [aq-1] ; high dword of m0 = the 4 bytes starting at a[-1]
1646 DEFINE_ARGS dst, stride, stride3
1647 lea stride3q, [strideq*3] ; stride3 = 3*stride
1653 ; DHIJ <- for the following predictor:
1655 ; BFCG | m1 contains ABCDxxxx
1656 ; AEBF | m2 contains EFGHIJxx
1659 punpckhdq m0, m1, m2
1661 ; m1 contains AEBFCGDH
1662 ; m0 contains CGDHIJxx
1664 movd [dstq+stride3q ], m1
1665 movd [dstq+strideq*1], m0
1668 movd [dstq+strideq*2], m1
1669 movd [dstq+strideq*0], m0
; HD_XMM_FUNCS declares the XMM "hd" predictors, starting with 8x8.
; hd_8x8 keeps a second row pointer, dst4 = dst + 4*stride, and writes
; rows bottom-up: each step stores the high half of a register to a row in
; the upper four and the low half to the matching row in the lower four.
1672 %macro HD_XMM_FUNCS 0
1673 cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
1676 DEFINE_ARGS dst, stride, stride3, dst4
1677 lea stride3q, [strideq*3] ; stride3 = 3*stride
1678 lea dst4q, [dstq+strideq*4]
1684 ; HPQRSTUV <- for the following predictor
1686 ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
1687 ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
1696 ; m1 contains AIBJCKDLEMFNGOHP
1697 ; m2 contains QRSTUVxxxxxxxxxx
1699 movhps [dstq +stride3q ], m1
1700 movq [dst4q+stride3q ], m1
1701 PALIGNR m3, m2, m1, 2, m4
1702 movhps [dstq +strideq*2], m3
1703 movq [dst4q+strideq*2], m3
1704 PALIGNR m3, m2, m1, 4, m4
1705 movhps [dstq +strideq*1], m3
1706 movq [dst4q+strideq*1], m3
1707 PALIGNR m2, m1, 6, m4
1708 movhps [dstq +strideq*0], m2
1709 movq [dst4q+strideq*0], m2
; 16x16 "hd" predictor. Four row pointers (dst, dst4, dst8, dst12) are set
; up four rows apart so that one row in each quarter of the block can be
; written per step.
1712 cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
1715 DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
1716 lea stride4q, [strideq*4]
1717 lea dst4q, [dstq +stride4q]
1718 lea dst8q, [dst4q+stride4q]
1719 lea dst12q, [dst8q+stride4q]
1723 PALIGNR m1, m3, m0, 1, m6
1724 PALIGNR m2, m3, m0, 2, m6
1727 SBUTTERFLY bw, 1, 2, 6
1729 ; I PROBABLY INVERTED L0 and L16 here
1732 sub stride4q, strideq ; stride4q now holds 3*stride: the last row of each quarter
1733 movhps [dstq +stride4q +0], m2
1734 movq [dstq +stride4q +8], m5
1735 mova [dst4q+stride4q ], m2
1736 movhps [dst8q+stride4q +0], m1
1737 movq [dst8q+stride4q +8], m2
1738 mova [dst12q+stride4q ], m1
1740 palignr m1, m2, m1, 2
1741 palignr m2, m5, m2, 2
1742 %elif cpuflag(ssse3)
1743 palignr m3, m2, m1, 2
1744 palignr m0, m5, m2, 2
1748 ; slightly modified version of PALIGNR
; 32x32 "hd" predictor. Four row pointers (dst, dst8, dst16, dst24) are
; set up eight rows apart so that one row in each quarter of the block can
; be written per step; consecutive register pairs (m3|m4, m2|m3, m1|m2,
; m0|m1) supply the four rows.
1762 cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
1767 DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
1768 lea stride8q, [strideq*8]
1769 lea dst8q, [dstq +stride8q]
1770 lea dst16q, [dst8q +stride8q]
1771 lea dst24q, [dst16q+stride8q]
1775 PALIGNR m4, m3, m2, 2, m6
1776 PALIGNR m3, m2, 1, m6
1778 PALIGNR m3, m2, m1, 2, m6
1779 PALIGNR m2, m1, 1, m6
1782 PALIGNR m6, m1, m0, 1, m7
1783 PALIGNR m1, m0, 2, m7
1786 SBUTTERFLY bw, 2, 3, 6
1787 SBUTTERFLY bw, 0, 1, 6
1789 ; m0, m1, m2, m3, m4, m5
1791 sub stride8q, strideq ; stride8q now holds 7*stride: the last row of each quarter
1792 mova [dstq +stride8q+ 0], m3
1793 mova [dstq +stride8q+16], m4
1794 mova [dst8q +stride8q+ 0], m2
1795 mova [dst8q +stride8q+16], m3
1796 mova [dst16q+stride8q+ 0], m1
1797 mova [dst16q+stride8q+16], m2
1798 mova [dst24q+stride8q+ 0], m0
1799 mova [dst24q+stride8q+16], m1
; slide the whole m0..m5 chain down by two bytes
1801 palignr m0, m1, m0, 2
1802 palignr m1, m2, m1, 2
1803 palignr m2, m3, m2, 2
1804 palignr m3, m4, m3, 2
1805 palignr m4, m5, m4, 2
1807 %elif cpuflag(ssse3)
1821 ; sort of a half-integrated version of PALIGNR
; HU_MMX_FUNCS declares the 4x4 "hu" predictor, which takes only dst,
; stride and l. Rows alternate between m1 and m2.
; NOTE(review): the pshufb line and the punpcklbw/pshufw/punpckldq
; sequence are presumably alternative cpuflag paths - the %if lines are
; not visible.
1850 %macro HU_MMX_FUNCS 0
1851 cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
1854 pshufb m0, [pb_0to2_5x3]
1856 punpcklbw m1, m0, m0 ; 00112233
1857 pshufw m1, m1, q3333 ; 33333333
1858 punpckldq m0, m1 ; 01233333
1864 DEFINE_ARGS dst, stride, stride3 ; the `l` register is reused as stride3 from here on
1865 lea stride3q, [strideq*3]
1866 SBUTTERFLY bw, 1, 2, 0
1867 PALIGNR m2, m1, 2, m0
1868 movd [dstq+strideq*0], m1
1869 movd [dstq+strideq*1], m2
1872 movd [dstq+strideq*2], m1
1873 movd [dstq+stride3q ], m2
; HU_XMM_FUNCS declares the XMM "hu" predictors, starting with 8x8; %1 is
; the XMM register count passed on to hu_32x32.
; hu_8x8 keeps a second row pointer, dst4 = dst + 4*stride; each step
; stores the low half of a register to row n and the high half to row n+4.
1882 %macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
1883 cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
1886 pshufb m0, [pb_0to6_9x7]
1888 punpcklbw m1, m0, m0 ; 0011223344556677
1889 punpckhwd m1, m1 ; 4444555566667777
1890 shufps m0, m1, q3310 ; 0123456777777777
1896 DEFINE_ARGS dst, stride, stride3, dst4
1897 lea stride3q, [strideq*3] ; stride3 = 3*stride
1898 lea dst4q, [dstq+strideq*4]
1899 SBUTTERFLY bw, 1, 2, 0
1900 movq [dstq +strideq*0], m1
1901 movhps [dst4q+strideq*0], m1
1902 PALIGNR m0, m2, m1, 2, m3
1903 movq [dstq +strideq*1], m0
1904 movhps [dst4q+strideq*1], m0
1905 PALIGNR m0, m2, m1, 4, m3
1906 movq [dstq +strideq*2], m0
1907 movhps [dst4q+strideq*2], m0
1908 PALIGNR m2, m1, 6, m3
1909 movq [dstq +stride3q ], m2
1910 movhps [dst4q+stride3q ], m2
; 16x16 "hu" predictor. Rows are written in pairs eight rows apart
; (offsets 0/8 and 1/9), advancing dst by two rows per group.
1913 cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
1916 mova m3, [pb_2toE_3xF]
1917 pshufb m1, m0, [pb_1toE_2xF] ; m0 shifted down one byte, last byte replicated
1920 pand m3, m0, [pb_15x0_1xm1] ; keep only the last byte of m0
1929 DEFINE_ARGS dst, stride, stride9, cnt
1930 lea stride9q, [strideq*8+strideq] ; stride9 = 9*stride
1932 SBUTTERFLY bw, 1, 2, 0
1935 mova [dstq+strideq*0], m1
1936 mova [dstq+strideq*8], m2
1937 PALIGNR m0, m2, m1, 2, m4
1944 mova [dstq+strideq*1], m0
1945 mova [dstq+stride9q ], m2
1946 PALIGNR m1, m2, m0, 2, m4
1953 lea dstq, [dstq+strideq*2]
; 32x32 "hu" predictor; its XMM register count is the enclosing macro's %1.
; Four row pointers (dst, dst8, dst16, dst24) are set up eight rows apart
; and share one running offset, stride0, which starts at zero and grows by
; one stride per step. Consecutive register pairs (m2|m3, m3|m0, m0|m1,
; m1|m6) supply the four rows.
1958 cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
1961 PALIGNR m2, m0, m1, 1, m5
1962 PALIGNR m3, m0, m1, 2, m5
1966 mova m4, [pb_2toE_3xF]
1967 pshufb m5, m0, [pb_1toE_2xF] ; m0 shifted down one byte, last byte replicated
1970 pand m4, m0, [pb_15x0_1xm1] ; keep only the last byte of m0
1979 DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
1981 xor stride0q, stride0q ; running row offset starts at 0
1982 lea dst8q, [dstq +strideq*8]
1983 lea dst16q, [dst8q +strideq*8]
1984 lea dst24q, [dst16q+strideq*8]
1985 SBUTTERFLY bw, 0, 1, 5
1986 SBUTTERFLY bw, 2, 3, 5
1988 pshufb m6, m1, [pb_15]
1990 pshufhw m6, m4, q3333 ; replicate word 7 of m4 across the four high words
1995 mova [dstq +stride0q+ 0], m2
1996 mova [dstq +stride0q+16], m3
1997 mova [dst8q +stride0q+ 0], m3
1998 mova [dst8q +stride0q+16], m0
1999 mova [dst16q+stride0q+ 0], m0
2000 mova [dst16q+stride0q+16], m1
2001 mova [dst24q+stride0q+ 0], m1
2002 mova [dst24q+stride0q+16], m6
; slide the m2 -> m3 -> m0 -> m1 chain down by two bytes
2004 palignr m2, m3, m2, 2
2005 palignr m3, m0, m3, 2
2006 palignr m0, m1, m0, 2
2008 %elif cpuflag(ssse3)
2018 ; half-integrated version of PALIGNR
2031 add stride0q, strideq ; step to the next row in every quarter
2044 ; FIXME 127, 128, 129 ?