1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Fiona Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Constant tables used by the prediction routines below.
; tm_shuf is loaded with movdqa further down, so it must stay 16-byte aligned.
; The byte pair (0x03, 0x80) repeated 8 times looks like a pshufb control
; mask -- TODO confirm at the use site.
29 tm_shuf: times 8 db 0x03, 0x80
; Eight words of 0xff00; also loaded with movdqa (16-byte alignment required).
30 pw_ff00: times 8 dw 0xff00
; Signed byte weights -8..-1, 1..8: the pmaddubsw operand that produces the
; 16x16 plane "H coefficients".
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
; 8x8 counterpart (weights -4..-1 and 1..4). The zero weights presumably
; cancel the extra bytes picked up by the over-wide movhps load -- confirm.
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
; Word ramps multiplied in with pmullw. Accesses of the form [table+8] select
; the upper four words of a table.
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
51 ;-----------------------------------------------------------------------------
52 ; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
53 ;-----------------------------------------------------------------------------
; pred16x16_vertical_8 is declared twice with the same cglobal signature
; (2 arguments loaded, 3 GPRs); presumably one build per instruction set --
; TODO confirm against the surrounding INIT_* directives.
; r0 and r1 are the first two arguments (src and stride in the C prototype).
56 cglobal pred16x16_vertical_8, 2,3
72 cglobal pred16x16_vertical_8, 2,3
77 movaps [r0+r1*1], xmm0 ; aligned 16-byte store of xmm0 to the row at r0+r1
78 movaps [r0+r1*2], xmm0 ; same vector to the row at r0+2*r1
80 movaps [r0+r1*1], xmm0
81 movaps [r0+r1*2], xmm0
87 ;-----------------------------------------------------------------------------
88 ; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
89 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments loaded into r0/r1 (src, stride in the C prototype),
; 3 GPRs requested in total.
92 cglobal pred16x16_horizontal_8, 2,3
128 ;-----------------------------------------------------------------------------
129 ; void ff_pred16x16_dc_8(uint8_t *src, int stride)
130 ;-----------------------------------------------------------------------------
; PRED16x16_DC: parameterless macro that emits pred16x16_dc_8
; (2 arguments in r0/r1, 7 GPRs requested).
132 %macro PRED16x16_DC 0
133 cglobal pred16x16_dc_8, 2,7
141 movzx r5d, byte [r0+r1*1] ; zero-extend one pixel from the row at r0+r1
146 movzx r2d, byte [r0+r1*0] ; zero-extend one pixel from the row at r0
147 movzx r3d, byte [r0+r1*1]
152 movzx r2d, byte [r0+r1*0]
; SPLATB_REG comes from x86util.asm; presumably it broadcasts the low byte of
; r2 into every byte of m0 using m1 as scratch -- TODO confirm.
159 SPLATB_REG m0, r2, m1
190 ;-----------------------------------------------------------------------------
191 ; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
192 ;-----------------------------------------------------------------------------
; PRED16x16_TM: parameterless macro emitting pred16x16_tm_vp8_8
; (2 arguments, 5 GPRs). A second definition of the same function follows
; with 6 GPRs and 6 XMM registers declared.
194 %macro PRED16x16_TM 0
195 cglobal pred16x16_tm_vp8_8, 2,5
206 movzx r3d, byte [r0-1] ; pixel immediately left of r0
209 movzx r2d, byte [r0+r1-1] ; pixel immediately left of the row at r0+r1
236 cglobal pred16x16_tm_vp8_8, 2,6,6
243 movzx r4d, byte [r0-1] ; pixel immediately left of r0
246 movzx r2d, byte [r0+r1*1-1] ; left neighbour of the row at r0+r1
247 movzx r3d, byte [r0+r1*2-1] ; left neighbour of the row at r0+2*r1
; pshuflw with immediate 0 copies word 0 into the four low words, and
; punpcklqdq reg,reg duplicates the low qword, so each pair below broadcasts
; word 0 to all eight words of the register.
252 pshuflw xmm2, xmm2, 0
253 pshuflw xmm4, xmm4, 0
254 punpcklqdq xmm2, xmm2
255 punpcklqdq xmm4, xmm4
; movdqa stores require the destination rows to be 16-byte aligned.
264 movdqa [r0+r1*1], xmm2
265 movdqa [r0+r1*2], xmm4
271 ;-----------------------------------------------------------------------------
272 ; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
273 ;-----------------------------------------------------------------------------
; H264_PRED16x16_PLANE <codec>: emits pred16x16_plane_<codec>_8
; (2 arguments, 9 GPRs, 7 vector registers). %1 supplies the codec suffix of
; the emitted function name.
; Outline, following the existing per-line comments:
;   - H: horizontal gradient, formed with pmullw against the pw_m8tom1 /
;        pw_1to8 ramps or with pmaddubsw against plane_shuf;
;   - V: vertical gradient, accumulated in GPRs from single-byte movzx loads;
;   - both are scaled with lea (x5) followed by sar 4;
;   - output rows are derived from a + {0..15}*H.
; e_reg is a symbol defined outside this block.
275 %macro H264_PRED16x16_PLANE 1
276 cglobal pred16x16_plane_%1_8, 2,9,7
290 pmullw m0, [pw_m8tom1 ]
291 pmullw m1, [pw_m8tom1+8]
292 pmullw m2, [pw_1to8 ]
293 pmullw m3, [pw_1to8 +8]
298 movhps m0, [r0+r1 +8]
299 pmaddubsw m0, [plane_shuf] ; H coefficients
305 pmullw m0, [pw_m8tom1]
325 paddw m0, m1 ; sum of H coefficients
337 movzx e_reg, byte [r3+r2*2 ]
338 movzx r5, byte [r4+r1 ]
341 movzx e_reg, byte [r3+r2 ]
346 movzx e_reg, byte [r3+r1 ]
347 movzx r6, byte [r4+r2*2 ]
351 movzx e_reg, byte [r3 ]
353 movzx r7, byte [r4+r2 ]
356 movzx r6, byte [r4+r2 ]
365 movzx r4, byte [e_reg+r2 ]
377 movzx r4, byte [e_reg ]
379 movzx r7, byte [r3 +r2 ]
383 movzx r6, byte [r3 +r2 ]
389 movzx r4, byte [e_reg+r1 ]
390 movzx r6, byte [r3 +r2*2]
397 movzx r4, byte [e_reg+r2*2]
398 movzx r6, byte [r3 +r1 ]
401 add r5, r6 ; sum of V coefficients
418 lea r5, [r5*5] ; 5*(V/4)
422 sar r5, 4 ; (5*(V/4))/16
425 movzx r4, byte [r0+r1 +15]
426 movzx r3, byte [r3+r2*2 ]
444 lea r1d, [r1d*5] ; 5*(H/4)
448 sar r1d, 4 ; (5*(H/4))/16
469 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
478 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
479 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
481 paddw m5, m0 ; a + {8,9,10,11}*H
482 paddw m6, m0 ; a + {12,13,14,15}*H
487 mova m3, m0 ; b[0..7]
488 mova m4, m2 ; b[8..15]
494 mova m3, m5 ; b[8..11]
495 mova m4, m6 ; b[12..15]
508 mova m3, m0 ; b[0..7]
509 mova m4, m2 ; b[8..15]
515 mova m3, m5 ; b[8..11]
516 mova m4, m6 ; b[12..15]
; The macro is instantiated for h264, rv40 and svq3 four times over;
; presumably each group is assembled for a different instruction set --
; TODO confirm against the INIT_* directives around the invocations.
536 H264_PRED16x16_PLANE h264
537 H264_PRED16x16_PLANE rv40
538 H264_PRED16x16_PLANE svq3
540 H264_PRED16x16_PLANE h264
541 H264_PRED16x16_PLANE rv40
542 H264_PRED16x16_PLANE svq3
544 H264_PRED16x16_PLANE h264
545 H264_PRED16x16_PLANE rv40
546 H264_PRED16x16_PLANE svq3
548 H264_PRED16x16_PLANE h264
549 H264_PRED16x16_PLANE rv40
550 H264_PRED16x16_PLANE svq3
552 ;-----------------------------------------------------------------------------
553 ; void ff_pred8x8_plane_8(uint8_t *src, int stride)
554 ;-----------------------------------------------------------------------------
; H264_PRED8x8_PLANE: parameterless macro emitting pred8x8_plane_8
; (2 arguments, 9 GPRs, 7 vector registers). Same overall shape as the 16x16
; plane predictor: an H gradient from the pw_m4to4 ramp (pmullw) or from
; plane8_shuf (pmaddubsw), single-byte movzx loads into GPRs, then rows built
; from a + {0..7}*H.
; e_reg is a symbol defined outside this block.
556 %macro H264_PRED8x8_PLANE 0
557 cglobal pred8x8_plane_8, 2,9,7
567 pmullw m0, [pw_m4to4]
568 pmullw m1, [pw_m4to4+8]
571 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
572 pmaddubsw m0, [plane8_shuf] ; H coefficients
578 pmullw m0, [pw_m4to4]
; Assembled only when the target instruction set lacks SSSE3.
584 %if notcpuflag(ssse3)
600 paddw m0, m1 ; sum of H coefficients
612 movzx e_reg, byte [r3+r2*2 ]
613 movzx r5, byte [r4+r1 ]
616 movzx e_reg, byte [r3 ]
618 movzx r7, byte [r4+r2 ]
622 movzx r6, byte [r4+r2 ]
628 movzx e_reg, byte [r3+r1 ]
629 movzx r6, byte [r4+r2*2 ]
636 movzx e_reg, byte [r3+r2 ]
649 movzx r3, byte [r4+r2*2 ]
650 movzx r4, byte [r0+r1 +7]
672 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
673 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
676 paddw m2, m0 ; a + {4,5,6,7}*H
683 mova m3, m0 ; b[0..7]
686 mova m4, m0 ; V+b[0..7]
693 mova m3, m0 ; b[0..3]
694 mova m4, m2 ; b[4..7]
699 mova m5, m0 ; V+b[0..3]
700 mova m6, m2 ; V+b[4..7]
726 ;-----------------------------------------------------------------------------
727 ; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
728 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments loaded into r0/r1 (src, stride in the C prototype);
; no additional GPRs are requested.
731 cglobal pred8x8_vertical_8, 2,2
743 ;-----------------------------------------------------------------------------
744 ; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
745 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments in r0/r1 (src, stride in the C prototype), 3 GPRs.
748 cglobal pred8x8_horizontal_8, 2,3
; SPLATB_LOAD comes from x86util.asm; presumably it broadcasts the byte at the
; given address (here the pixel just left of each row) across the destination
; register, with m2 as an auxiliary register -- TODO confirm.
754 SPLATB_LOAD m0, r0+r1*0-1, m2
755 SPLATB_LOAD m1, r0+r1*1-1, m2
771 ;-----------------------------------------------------------------------------
772 ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
773 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments in r0/r1 (src, stride in the C prototype), 5 GPRs.
775 cglobal pred8x8_top_dc_8, 2,5
; pshufw with immediate 0 copies word 0 of mm0 into all four words.
792 pshufw mm0, mm0, 0 ; dc0 (w)
; packuswb narrows words to unsigned-saturated bytes: mm0 fills the low four
; bytes of the result and mm1 the high four.
793 packuswb mm0, mm1 ; dc0,dc1 (b)
805 ;-----------------------------------------------------------------------------
806 ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
807 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments in r0/r1 (src, stride in the C prototype), 5 GPRs.
; The movzx loads read the pixel immediately left of the rows at r0+r1 and
; r0+2*r1; the repetition suggests r0 is advanced between groups -- confirm.
810 cglobal pred8x8_dc_8, 2,5
819 movzx r2d, byte [r0+r1*1-1]
820 movzx r3d, byte [r0+r1*2-1]
823 movzx r3d, byte [r0+r1*1-1]
825 movzx r3d, byte [r0+r1*2-1]
829 movzx r2d, byte [r0+r1*1-1]
830 movzx r3d, byte [r0+r1*2-1]
833 movzx r3d, byte [r0+r1*1-1]
835 movzx r3d, byte [r0+r1*2-1]
; s0..s3 are partial sums held as words (naming taken from the comments).
842 punpckldq m0, m2 ; s0, s1, s2, s3
843 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
845 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
; pavgw computes the rounded average (a + b + 1) >> 1 of each word pair.
849 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
866 ;-----------------------------------------------------------------------------
867 ; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
868 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments in r0/r1 (src, stride in the C prototype), 7 GPRs.
; The loads below zero-extend single pixels from the rows at r0 and r0+r1.
871 cglobal pred8x8_dc_rv40_8, 2,7
877 movzx r5d, byte [r0+r1*1]
881 movzx r2d, byte [r0+r1*0]
882 movzx r3d, byte [r0+r1*1]
887 movzx r2d, byte [r0+r1*0]
903 ;-----------------------------------------------------------------------------
904 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
905 ;-----------------------------------------------------------------------------
; Three definitions of pred8x8_tm_vp8_8 follow, differing in declared
; register usage: (2 args, 6 GPRs), (2 args, 6 GPRs, 4 XMM) and
; (2 args, 3 GPRs, 6 XMM). Presumably one per instruction set -- confirm
; against the surrounding INIT_* directives.
908 cglobal pred8x8_tm_vp8_8, 2,6
915 movzx r4d, byte [r0-1] ; pixel immediately left of r0
918 movzx r2d, byte [r0+r1*1-1] ; left neighbour of the row at r0+r1
919 movzx r3d, byte [r0+r1*2-1] ; left neighbour of the row at r0+2*r1
948 cglobal pred8x8_tm_vp8_8, 2,6,4
953 movzx r4d, byte [r0-1]
956 movzx r2d, byte [r0+r1*1-1]
957 movzx r3d, byte [r0+r1*2-1]
; pshuflw imm 0 + punpcklqdq reg,reg broadcasts word 0 to all eight words.
962 pshuflw xmm2, xmm2, 0
963 pshuflw xmm3, xmm3, 0
964 punpcklqdq xmm2, xmm2
965 punpcklqdq xmm3, xmm3
970 movhps [r0+r1*2], xmm2 ; store the upper 8 bytes of xmm2 to the row at r0+2*r1
977 cglobal pred8x8_tm_vp8_8, 2,3,6
979 movdqa xmm4, [tm_shuf] ; aligned load of the tm_shuf table
; The dword at [row-4] spans bytes row-4..row-1, so the left neighbour is
; byte 3 of the loaded value (the index that tm_shuf repeats).
987 movd xmm2, [r0+r1*1-4]
988 movd xmm3, [r0+r1*2-4]
997 movhps [r0+r1*2], xmm2 ; store the upper 8 bytes of xmm2 to the row at r0+2*r1
1003 ; PRED4x4_LOWPASS dest, left, right, src, tmp
1004 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Going by the parameter names, %2 and %3 hold the two neighbour vectors,
; %4 the centre vector t[n], and %5 is scratch.
1005 %macro PRED4x4_LOWPASS 5
1015 ;-----------------------------------------------------------------------------
1016 ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1018 ;-----------------------------------------------------------------------------
; PRED8x8L_TOP_DC: parameterless macro emitting pred8x8l_top_dc_8
; (4 arguments in r0..r3, no extra GPRs). r1 and r2 are tested as the
; top_left / top_right availability flags.
1019 %macro PRED8x8L_TOP_DC 0
1020 cglobal pred8x8l_top_dc_8, 4,4
; PALIGNR is the x86util.asm wrapper; its 4th operand is presumably a scratch
; register for the fallback used without SSSE3 -- TODO confirm.
1028 PALIGNR mm2, mm0, 7, mm0
1029 PALIGNR mm1, mm4, 1, mm4
1030 test r1d, r1d ; top_left
1032 test r2d, r2d ; top_right
1041 test r2d, r2d ; top_right
; 3-tap (1,2,1) smoothing of the edge row held in mm3, result in mm0.
1050 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1071 ;-----------------------------------------------------------------------------
1072 ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1074 ;-----------------------------------------------------------------------------
; PRED8x8L_DC: parameterless macro emitting pred8x8l_dc_8
; (4 arguments in r0..r3, 5 GPRs; r3 is the stride argument).
; Each movq below fetches the 8 bytes that end just left of a row, so the
; left-neighbour pixel is the top byte of the register; punpckhbw then
; interleaves the high halves of two such rows.
1076 %macro PRED8x8L_DC 0
1077 cglobal pred8x8l_dc_8, 4,5
1080 movq mm0, [r0+r3*1-8]
1081 punpckhbw mm0, [r0+r3*0-8]
1082 movq mm1, [r4+r3*1-8]
1083 punpckhbw mm1, [r0+r3*2-8]
1087 movq mm2, [r0+r3*1-8]
1088 punpckhbw mm2, [r0+r3*0-8]
1090 movq mm3, [r0+r3*1-8]
1091 punpckhbw mm3, [r0+r3*0-8]
1095 movq mm0, [r0+r3*0-8]
1100 PALIGNR mm4, mm0, 7, mm0
1101 PALIGNR mm1, mm2, 1, mm2
; (1,2,1) low-pass filtering of the gathered edge vectors.
1128 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1131 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1133 PALIGNR mm7, mm1, 7, mm3
1139 PALIGNR mm2, mm0, 7, mm0
1140 PALIGNR mm1, mm4, 1, mm4
1147 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1175 ;-----------------------------------------------------------------------------
1176 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1177 ; int has_topright, int stride)
1178 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL: parameterless macro emitting pred8x8l_horizontal_8
; (4 arguments in r0..r3, no extra GPRs; r3 is the stride argument).
; r1 and r2 appear as row base addresses below, so they have presumably been
; repurposed from the has_topleft/has_topright arguments -- confirm.
1180 %macro PRED8x8L_HORIZONTAL 0
1181 cglobal pred8x8l_horizontal_8, 4,4
; Each movq fetches the 8 bytes ending just left of a row (left neighbour in
; the top byte); punpckhbw interleaves the high halves of two rows.
1184 movq mm0, [r0+r3*1-8]
1188 punpckhbw mm0, [r1+r3*0-8]
1189 movq mm1, [r2+r3*1-8]
1190 punpckhbw mm1, [r0+r3*2-8]
1194 movq mm2, [r0+r3*1-8]
1195 punpckhbw mm2, [r0+r3*0-8]
1197 movq mm3, [r0+r3*1-8]
1198 punpckhbw mm3, [r0+r3*0-8]
1202 movq mm0, [r0+r3*0-8]
1203 movq mm1, [r1+r3*0-8]
1207 PALIGNR mm4, mm0, 7, mm0
1208 PALIGNR mm1, mm2, 1, mm2
; (1,2,1) low-pass filtering of the gathered left edge.
1210 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1213 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1215 PALIGNR mm7, mm1, 7, mm3
; pshufw immediates 0xff / 0xaa / 0x55 / 0x00 broadcast word 3 / 2 / 1 / 0 of
; the source to all four words of the destination. mm3 and mm7 are shuffled
; last because they are also the sources.
1221 pshufw mm0, mm3, 0xff
1222 pshufw mm1, mm3, 0xaa
1224 pshufw mm2, mm3, 0x55
1225 pshufw mm3, mm3, 0x00
1226 pshufw mm4, mm7, 0xff
1227 pshufw mm5, mm7, 0xaa
1228 pshufw mm6, mm7, 0x55
1229 pshufw mm7, mm7, 0x00
1247 ;-----------------------------------------------------------------------------
1248 ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1250 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL: parameterless macro emitting pred8x8l_vertical_8
; (4 arguments in r0..r3, no extra GPRs). r1 and r2 are tested as the
; top_left / top_right availability flags.
1252 %macro PRED8x8L_VERTICAL 0
1253 cglobal pred8x8l_vertical_8, 4,4
1260 PALIGNR mm2, mm0, 7, mm0
1261 PALIGNR mm1, mm4, 1, mm4
1262 test r1d, r1d ; top_left
1264 test r2d, r2d ; top_right
1273 test r2d, r2d ; top_right
; 3-tap (1,2,1) smoothing of the edge row held in mm3, result in mm0.
1282 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1298 ;-----------------------------------------------------------------------------
1299 ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1300 ; int has_topright, int stride)
1301 ;-----------------------------------------------------------------------------
; pred8x8l_down_left_8 is defined twice: a stand-alone version (4 arguments,
; 5 GPRs) and one emitted by the PRED8x8L_DOWN_LEFT macro (4 arguments,
; 4 GPRs) whose final filtering and stores use XMM registers.
; r3 is the stride argument of the C prototype.
1304 cglobal pred8x8l_down_left_8, 4,5
1311 PALIGNR mm2, mm0, 7, mm0
1312 PALIGNR mm1, mm4, 1, mm4
1335 pshufw mm1, mm3, 0xFF ; broadcast word 3 of mm3
1338 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1347 PALIGNR mm2, mm3, 7, mm3
1348 PALIGNR mm5, mm4, 1, mm4
1349 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1357 PALIGNR mm2, mm7, 1, mm0
1359 PALIGNR mm3, mm7, 7, mm0
1360 PALIGNR mm4, mm6, 1, mm0
1366 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1367 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1411 %macro PRED8x8L_DOWN_LEFT 0
1412 cglobal pred8x8l_down_left_8, 4,4
1419 PALIGNR mm2, mm0, 7, mm0
1420 PALIGNR mm1, mm4, 1, mm4
1421 test r1d, r1d ; top_left
1423 test r2d, r2d ; top_right
1432 test r2d, r2d ; top_right
1443 pshufw mm1, mm3, 0xFF ; broadcast word 3 of mm3
1446 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1448 test r2d, r2d ; top_right
1455 PALIGNR mm2, mm3, 7, mm3
1456 PALIGNR mm5, mm4, 1, mm4
1457 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1473 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Each movq writes the low 8 bytes of xmm0 to one output row. xmm0 is
; presumably shifted between stores, and r1/r2 appear to have been
; repurposed as row pointers by this point -- confirm.
1475 movq [r0+r3*1], xmm0
1477 movq [r0+r3*2], xmm0
1480 movq [r1+r3*1], xmm0
1482 movq [r1+r3*2], xmm0
1484 movq [r2+r3*1], xmm0
1486 movq [r2+r3*2], xmm0
1488 movq [r0+r3*1], xmm0
1490 movq [r0+r3*2], xmm0
1499 ;-----------------------------------------------------------------------------
1500 ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1501 ; int has_topright, int stride)
1502 ;-----------------------------------------------------------------------------
; pred8x8l_down_right_8 is defined twice: a stand-alone version and one
; emitted by the PRED8x8L_DOWN_RIGHT macro whose final filtering and stores
; use XMM registers. Both declare 4 arguments and 5 GPRs; r3 is the stride
; argument and r4 is used as an additional row base pointer.
; The movq/punpckhbw pairs fetch the 8 bytes ending just left of a row (left
; neighbour in the top byte) and interleave the high halves of two rows.
1505 cglobal pred8x8l_down_right_8, 4,5
1508 movq mm0, [r0+r3*1-8]
1509 punpckhbw mm0, [r0+r3*0-8]
1510 movq mm1, [r4+r3*1-8]
1511 punpckhbw mm1, [r0+r3*2-8]
1515 movq mm2, [r0+r3*1-8]
1516 punpckhbw mm2, [r0+r3*0-8]
1518 movq mm3, [r0+r3*1-8]
1519 punpckhbw mm3, [r0+r3*0-8]
1523 movq mm0, [r0+r3*0-8]
1528 PALIGNR mm4, mm0, 7, mm0
1529 PALIGNR mm1, mm2, 1, mm2
1530 test r1d, r1d ; top_left
1534 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1538 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1540 PALIGNR mm7, mm1, 7, mm3
1546 PALIGNR mm2, mm0, 7, mm0
1547 PALIGNR mm1, mm4, 1, mm4
1548 test r1d, r1d ; top_left
1550 test r2d, r2d ; top_right
1553 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1569 test r2d, r2d ; top_right
1585 PALIGNR mm2, mm6, 1, mm0
1587 PALIGNR mm3, mm6, 7, mm0
1591 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1592 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1636 %macro PRED8x8L_DOWN_RIGHT 0
1637 cglobal pred8x8l_down_right_8, 4,5
1640 movq mm0, [r0+r3*1-8]
1641 punpckhbw mm0, [r0+r3*0-8]
1642 movq mm1, [r4+r3*1-8]
1643 punpckhbw mm1, [r0+r3*2-8]
1647 movq mm2, [r0+r3*1-8]
1648 punpckhbw mm2, [r0+r3*0-8]
1650 movq mm3, [r0+r3*1-8]
1651 punpckhbw mm3, [r0+r3*0-8]
1655 movq mm0, [r0+r3*0-8]
1660 PALIGNR mm4, mm0, 7, mm0
1661 PALIGNR mm1, mm2, 1, mm2
1689 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1693 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1695 PALIGNR mm7, mm1, 7, mm3
1702 PALIGNR mm2, mm0, 7, mm0
1703 PALIGNR mm1, mm4, 1, mm4
1709 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1726 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Rows are written in pairs from the low 8 bytes of xmm0 and xmm1, addressed
; from the row bases r0, r2, r1 and r4.
1729 movq [r0+r3*2], xmm0
1730 movq [r0+r3*1], xmm1
1733 movq [r2+r3*2], xmm0
1734 movq [r2+r3*1], xmm1
1737 movq [r1+r3*2], xmm0
1738 movq [r1+r3*1], xmm1
1741 movq [r4+r3*2], xmm0
1742 movq [r4+r3*1], xmm1
1751 ;-----------------------------------------------------------------------------
1752 ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1753 ; int has_topright, int stride)
1754 ;-----------------------------------------------------------------------------
; pred8x8l_vertical_right_8 is defined as a stand-alone version
; (4 arguments, 5 GPRs) and through the PRED8x8L_VERTICAL_RIGHT macro
; (4 arguments, 5 GPRs, 7 XMM registers), which is instantiated twice at the
; end of this block. r3 is the stride argument; r4 is an extra row base.
; The movq/punpckhbw pairs fetch the 8 bytes ending just left of a row (left
; neighbour in the top byte) and interleave the high halves of two rows.
1757 cglobal pred8x8l_vertical_right_8, 4,5
1760 movq mm0, [r0+r3*1-8]
1761 punpckhbw mm0, [r0+r3*0-8]
1762 movq mm1, [r4+r3*1-8]
1763 punpckhbw mm1, [r0+r3*2-8]
1767 movq mm2, [r0+r3*1-8]
1768 punpckhbw mm2, [r0+r3*0-8]
1770 movq mm3, [r0+r3*1-8]
1771 punpckhbw mm3, [r0+r3*0-8]
1775 movq mm0, [r0+r3*0-8]
1780 PALIGNR mm4, mm0, 7, mm0
1781 PALIGNR mm1, mm2, 1, mm2
1809 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1816 PALIGNR mm2, mm0, 7, mm0
1817 PALIGNR mm1, mm4, 1, mm4
1823 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1827 PALIGNR mm3, mm7, 7, mm0
1828 PALIGNR mm6, mm7, 6, mm1
1832 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1843 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1844 PALIGNR mm6, mm0, 7, mm2
1847 PALIGNR mm5, mm0, 7, mm1
1850 PALIGNR mm6, mm0, 7, mm2
1853 PALIGNR mm5, mm0, 7, mm1
1856 PALIGNR mm6, mm0, 7, mm2
1859 PALIGNR mm5, mm0, 7, mm1
1863 %macro PRED8x8L_VERTICAL_RIGHT 0
1864 cglobal pred8x8l_vertical_right_8, 4,5,7
1865 ; manually spill XMM registers for Win64 because
1866 ; the code here is initialized with INIT_MMX
1870 movq mm0, [r0+r3*1-8]
1871 punpckhbw mm0, [r0+r3*0-8]
1872 movq mm1, [r4+r3*1-8]
1873 punpckhbw mm1, [r0+r3*2-8]
1877 movq mm2, [r0+r3*1-8]
1878 punpckhbw mm2, [r0+r3*0-8]
1880 movq mm3, [r0+r3*1-8]
1881 punpckhbw mm3, [r0+r3*0-8]
1885 movq mm0, [r0+r3*0-8]
1890 PALIGNR mm4, mm0, 7, mm0
1891 PALIGNR mm1, mm2, 1, mm2
1918 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1925 PALIGNR mm2, mm0, 7, mm0
1926 PALIGNR mm1, mm4, 1, mm4
1932 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1937 movdqa xmm6, [pw_ff00] ; aligned load of the 0xff00 word mask
1946 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; movhps stores the upper 8 bytes of a register, movq the lower 8 bytes.
1952 movhps [r0+r3*2], xmm5
1953 movhps [r0+r3*1], xmm2
1961 movq [r0+r3*2], xmm5
1962 movq [r0+r3*1], xmm2
1965 movq [r2+r3*2], xmm5
1966 movq [r2+r3*1], xmm2
1969 movq [r1+r3*2], xmm5
1970 movq [r1+r3*1], xmm2
1975 PRED8x8L_VERTICAL_RIGHT
1977 PRED8x8L_VERTICAL_RIGHT
1979 ;-----------------------------------------------------------------------------
1980 ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
1981 ; int has_topright, int stride)
1982 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL_LEFT: parameterless macro emitting
; pred8x8l_vertical_left_8 (4 arguments in r0..r3, no extra GPRs; r3 is the
; stride argument). The macro is instantiated twice at the end of this block.
1984 %macro PRED8x8L_VERTICAL_LEFT 0
1985 cglobal pred8x8l_vertical_left_8, 4,4
1992 PALIGNR mm2, mm0, 7, mm0
1993 PALIGNR mm1, mm4, 1, mm4
2016 pshufw mm1, mm3, 0xFF ; broadcast word 3 of mm3
2019 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2028 PALIGNR mm2, mm3, 7, mm3
2029 PALIGNR mm5, mm4, 1, mm4
2030 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2044 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
; Rows are written in pairs from the low 8 bytes of xmm3 and xmm0. The
; registers are presumably shifted between pairs, and r1/r2 appear to have
; been repurposed as row pointers by this point -- confirm.
2046 movq [r0+r3*1], xmm3
2047 movq [r0+r3*2], xmm0
2051 movq [r1+r3*1], xmm3
2052 movq [r1+r3*2], xmm0
2055 movq [r2+r3*1], xmm3
2056 movq [r2+r3*2], xmm0
2059 movq [r0+r3*1], xmm3
2060 movq [r0+r3*2], xmm0
2065 PRED8x8L_VERTICAL_LEFT
2067 PRED8x8L_VERTICAL_LEFT
2069 ;-----------------------------------------------------------------------------
2070 ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2071 ; int has_topright, int stride)
2072 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL_UP: parameterless macro emitting
; pred8x8l_horizontal_up_8 (4 arguments in r0..r3, no extra GPRs; r3 is the
; stride argument). The macro is instantiated twice at the end of this block.
; r1 and r2 appear as row base addresses below, so they have presumably been
; repurposed from the has_topleft/has_topright arguments -- confirm.
; In the existing comments l0..l7 name the left-edge pixels and p1..p8 the
; filtered values.
2074 %macro PRED8x8L_HORIZONTAL_UP 0
2075 cglobal pred8x8l_horizontal_up_8, 4,4
2078 movq mm0, [r0+r3*1-8]
2082 punpckhbw mm0, [r1+r3*0-8]
2083 movq mm1, [r2+r3*1-8]
2084 punpckhbw mm1, [r0+r3*2-8]
2088 movq mm2, [r0+r3*1-8]
2089 punpckhbw mm2, [r0+r3*0-8]
2091 movq mm3, [r0+r3*1-8]
2092 punpckhbw mm3, [r0+r3*0-8]
2096 movq mm0, [r0+r3*0-8]
2097 movq mm1, [r1+r3*0-8]
2101 PALIGNR mm4, mm0, 7, mm0
2102 PALIGNR mm1, mm2, 1, mm2
2104 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2107 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2109 PALIGNR mm7, mm1, 7, mm3
2111 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2112 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2116 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2123 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2125 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2127 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2129 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2130 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2134 PALIGNR mm5, mm4, 2, mm1
2135 pshufw mm1, mm6, 11111001b
2136 PALIGNR mm6, mm4, 4, mm2
2137 pshufw mm2, mm7, 11111110b
2138 PALIGNR mm7, mm4, 6, mm3
2139 pshufw mm3, mm0, 11111111b
2153 PRED8x8L_HORIZONTAL_UP
2155 PRED8x8L_HORIZONTAL_UP
2157 ;-----------------------------------------------------------------------------
2158 ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2159 ; int has_topright, int stride)
2160 ;-----------------------------------------------------------------------------
; pred8x8l_horizontal_down_8 is defined as a stand-alone version and through
; the PRED8x8L_HORIZONTAL_DOWN macro (instantiated twice at the end of this
; block). Both declare 4 arguments and 5 GPRs; r3 is the stride argument and
; r4 is an extra row base pointer.
; The movq/punpckhbw pairs fetch the 8 bytes ending just left of a row (left
; neighbour in the top byte) and interleave the high halves of two rows.
2163 cglobal pred8x8l_horizontal_down_8, 4,5
2166 movq mm0, [r0+r3*1-8]
2167 punpckhbw mm0, [r0+r3*0-8]
2168 movq mm1, [r4+r3*1-8]
2169 punpckhbw mm1, [r0+r3*2-8]
2173 movq mm2, [r0+r3*1-8]
2174 punpckhbw mm2, [r0+r3*0-8]
2176 movq mm3, [r0+r3*1-8]
2177 punpckhbw mm3, [r0+r3*0-8]
2181 movq mm0, [r0+r3*0-8]
2186 PALIGNR mm4, mm0, 7, mm0
2187 PALIGNR mm1, mm2, 1, mm2
2214 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2218 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2220 PALIGNR mm7, mm1, 7, mm3
2226 PALIGNR mm2, mm0, 7, mm0
2227 PALIGNR mm1, mm4, 1, mm4
2233 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2240 PALIGNR mm2, mm6, 7, mm5
2241 PALIGNR mm6, mm7, 7, mm0
2243 PALIGNR mm4, mm3, 1, mm7
2246 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2252 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2260 PALIGNR mm7, mm3, 2, mm5
2262 PALIGNR mm1, mm3, 4, mm5
2264 PALIGNR mm0, mm3, 6, mm3
2269 PALIGNR mm6, mm4, 2, mm5
2271 PALIGNR mm2, mm4, 4, mm5
2273 PALIGNR mm3, mm4, 6, mm4
2277 %macro PRED8x8L_HORIZONTAL_DOWN 0
2278 cglobal pred8x8l_horizontal_down_8, 4,5
2281 movq mm0, [r0+r3*1-8]
2282 punpckhbw mm0, [r0+r3*0-8]
2283 movq mm1, [r4+r3*1-8]
2284 punpckhbw mm1, [r0+r3*2-8]
2288 movq mm2, [r0+r3*1-8]
2289 punpckhbw mm2, [r0+r3*0-8]
2291 movq mm3, [r0+r3*1-8]
2292 punpckhbw mm3, [r0+r3*0-8]
2296 movq mm0, [r0+r3*0-8]
2301 PALIGNR mm4, mm0, 7, mm0
2302 PALIGNR mm1, mm2, 1, mm2
2329 pshufw mm1, mm3, 0xFF ; broadcast word 3 of mm3
2333 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2337 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2347 PALIGNR mm2, mm0, 7, mm0
2348 PALIGNR mm1, mm4, 1, mm4
2354 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2363 PALIGNR mm2, mm3, 7, mm3
2364 PALIGNR mm5, mm4, 1, mm4
2365 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2374 PALIGNR xmm1, xmm0, 7, xmm4
2375 PALIGNR xmm2, xmm0, 9, xmm5
2377 PALIGNR xmm3, xmm0, 8, xmm0
2381 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2382 punpcklbw xmm4, xmm0 ; interleave the low bytes of xmm4 with those of xmm0
; Rows are written in pairs from the low 8 bytes of xmm4 and xmm0, addressed
; from the row bases r0/r2 and r1/r4.
2384 movq [r0+r3*2], xmm4
2385 movq [r2+r3*2], xmm0
2388 movq [r0+r3*1], xmm4
2389 movq [r2+r3*1], xmm0
2392 movq [r1+r3*2], xmm4
2393 movq [r4+r3*2], xmm0
2396 movq [r1+r3*1], xmm4
2397 movq [r4+r3*1], xmm0
2402 PRED8x8L_HORIZONTAL_DOWN
2404 PRED8x8L_HORIZONTAL_DOWN
2406 ;-------------------------------------------------------------------------------
2407 ; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2408 ;-------------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), 5 GPRs. r1 (the topright argument) is overwritten and used as
; scratch for the left-neighbour pixels.
2411 cglobal pred4x4_dc_8, 3,5
2417 movzx r1d, byte [r0+r2*1-1] ; left neighbour of the row at r0+r2
2420 movzx r1d, byte [r0+r2*2-1] ; left neighbour of the row at r0+2*r2
2423 movzx r1d, byte [r0+r2*1-1]
2425 movzx r1d, byte [r0+r2*2-1]
; Multiplying by 0x01010101 replicates the low byte of r3d into all four
; bytes (exact as long as r3d <= 0xff).
2429 imul r3d, 0x01010101
2436 ;-----------------------------------------------------------------------------
2437 ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2439 ;-----------------------------------------------------------------------------
; Two definitions of pred4x4_tm_vp8_8: one with 3 arguments and 6 GPRs that
; fetches neighbours with scalar movzx loads, and one with 3 arguments and
; no extra GPRs that fetches them with movd. r2 is the stride argument.
2442 cglobal pred4x4_tm_vp8_8, 3,6
2447 movzx r4d, byte [r0-1] ; pixel immediately left of r0
2450 movzx r1d, byte [r0+r2*1-1] ; left neighbour of the row at r0+r2 (r1 reused as scratch)
2451 movzx r3d, byte [r0+r2*2-1] ; left neighbour of the row at r0+2*r2
2483 cglobal pred4x4_tm_vp8_8, 3,3
; Each dword at [row-4] spans bytes row-4..row-1, so the left neighbour is
; byte 3 of the loaded value. r1 is used as a second row base here.
2492 movd mm2, [r0+r2*1-4]
2493 movd mm3, [r0+r2*2-4]
2494 movd mm4, [r1+r2*1-4]
2495 movd mm5, [r1+r2*2-4]
2515 ;-----------------------------------------------------------------------------
2516 ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2518 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs. In the existing comments t0..t7 name the top and
; top-right pixels; [r1] reads from the topright pointer.
2521 cglobal pred4x4_vertical_vp8_8, 3,3
2525 mova m2, m0 ;t0 t1 t2 t3
2526 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2528 psrlq m0, 8 ;t1 t2 t3 t4
; (1,2,1) low-pass of the top row: centre m2, neighbours m1 and m0.
2529 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2536 ;-----------------------------------------------------------------------------
2537 ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2539 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs.
2541 cglobal pred4x4_down_left_8, 3,3
; (1,2,1) low-pass: centre m3, neighbours m1 and m2, scratch m4, result m0.
2551 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2563 ;------------------------------------------------------------------------------
2564 ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2566 ;------------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs.
2569 cglobal pred4x4_vertical_left_8, 3,3
; (1,2,1) low-pass: centre m3, neighbours m1 and m2, scratch m5, result m0.
2579 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2589 ;------------------------------------------------------------------------------
2590 ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2592 ;------------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs. r1 is used as a second row base below, so it has
; presumably been repurposed from the topright argument -- confirm.
2595 cglobal pred4x4_horizontal_up_8, 3,3
; Each access covers the 4 bytes ending just left of a row; punpcklbw
; interleaves the bytes of two rows.
2598 movd m0, [r0+r2*1-4]
2599 punpcklbw m0, [r0+r2*2-4]
2600 movd m1, [r1+r2*1-4]
2601 punpcklbw m1, [r1+r2*2-4]
; (1,2,1) low-pass: centre m3, neighbours m0 and m2, scratch m5, result m4.
2613 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2623 ;------------------------------------------------------------------------------
2624 ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2625 ; const uint8_t *topright, int stride)
2626 ;------------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs. In the existing comments lt is the top-left
; pixel, t0..t3 the top row and l0..l3 the left column. r1 is used as a
; second row base, so it has presumably been repurposed from topright.
2629 cglobal pred4x4_horizontal_down_8, 3,3
2632 movh m0, [r0-4] ; lt ..
2633 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2634 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2635 movd m1, [r1+r2*2-4] ; l3
2636 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2637 movd m2, [r0+r2*2-4] ; l1
2638 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2639 punpckhwd m1, m2 ; l0 l1 l2 l3
2640 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2644 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2645 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; (1,2,1) low-pass: centre m2, neighbours m1 and m0, scratch m4, result m3.
2647 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2650 PALIGNR m3, m5, 6, m4
2659 ;-----------------------------------------------------------------------------
2660 ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2661 ; const uint8_t *topright, int stride)
2662 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs. The PALIGNR chain shifts one neighbour byte at a
; time into the low end of m0, as traced by the existing comments (lt is the
; top-left pixel, t0..t3 the top row, l0..l2 the left column).
2665 cglobal pred4x4_vertical_right_8, 3,3
2668 movh m0, [r0] ; ........t3t2t1t0
2670 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2672 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2674 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2676 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; (1,2,1) low-pass: centre m2, neighbours m1 and m0, scratch m4, result m3.
2677 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2683 PALIGNR m5, m1, 7, m2
2686 PALIGNR m3, m1, 7, m1
2690 ;-----------------------------------------------------------------------------
2691 ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2693 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments in r0..r2 (src, topright, stride in the C
; prototype), no extra GPRs. r1 is used as a second row base below, so it has
; presumably been repurposed from the topright argument -- confirm.
2696 cglobal pred4x4_down_right_8, 3,3
; 8-byte loads ending just left of a row put the left neighbour in the top
; byte; punpckhbw interleaves the high halves of the two rows.
2700 movq m2, [r0+r2*1-8]
2701 punpckhbw m2, [r0-8]
2704 PALIGNR m3, m1, 5, m1
2706 PALIGNR m3, [r1+r2*1-8], 7, m4
2708 PALIGNR m3, [r1+r2*2-8], 7, m4
; (1,2,1) low-pass: centre m2, neighbours m3 and m1, scratch m4, result m0.
2709 PRED4x4_LOWPASS m0, m3, m1, m2, m4