1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Constant tables referenced by the prediction routines in this file.
; tm_shuf: eight repetitions of the byte pair (0x03, 0x80); loaded into xmm4 by
; one pred8x8_tm_vp8_8 variant (presumably used as a byte-shuffle mask - confirm).
29 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words, each 0xff00.
30 pw_ff00: times 8 dw 0xff00
; plane_shuf: 16 signed byte weights (-8..-1, then 1..8), consumed by pmaddubsw
; in the 16x16 plane predictor.
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: signed byte weights (-4..-1 and 1..4, each group padded with four
; zeros), consumed by pmaddubsw in the 8x8 plane predictor.
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
; Word tables used as pmullw multipliers by the plane predictors.
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
51 ;-----------------------------------------------------------------------------
52 ; void pred16x16_vertical_8(uint8_t *src, int stride)
53 ;-----------------------------------------------------------------------------
; pred16x16_vertical_8 is declared twice; each declaration requests 2 arguments
; (r0 = src, r1 = stride per the prototype) and 3 general-purpose registers.
56 cglobal pred16x16_vertical_8, 2,3
72 cglobal pred16x16_vertical_8, 2,3
; Write the 16 bytes held in xmm0 to the rows at r0+stride and r0+2*stride.
; movaps faults on a destination that is not 16-byte aligned.
77 movaps [r0+r1*1], xmm0
78 movaps [r0+r1*2], xmm0
; Same pair of stores again; NOTE(review): r0 is presumably advanced between the
; two pairs - the pointer update is not among these lines.
80 movaps [r0+r1*1], xmm0
81 movaps [r0+r1*2], xmm0
87 ;-----------------------------------------------------------------------------
88 ; void pred16x16_horizontal_8(uint8_t *src, int stride)
89 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype), 3 general-purpose
; registers requested.
92 cglobal pred16x16_horizontal_8, 2,3
128 ;-----------------------------------------------------------------------------
129 ; void pred16x16_dc_8(uint8_t *src, int stride)
130 ;-----------------------------------------------------------------------------
; PRED16x16_DC (no parameters): emits pred16x16_dc_8, declared with 2 arguments
; and 7 general-purpose registers.
132 %macro PRED16x16_DC 0
133 cglobal pred16x16_dc_8, 2,7
; Zero-extended single-byte loads from the current row (r0) and the next row
; (r0+stride). NOTE(review): presumably the edge pixels being summed for the DC
; value - confirm against the full body.
141 movzx r5d, byte [r0+r1*1]
146 movzx r2d, byte [r0+r1*0]
147 movzx r3d, byte [r0+r1*1]
152 movzx r2d, byte [r0+r1*0]
; Splat the value in r2 across m0, with m1 as scratch (x86util SPLATB_REG;
; argument roles assumed from the macro name - confirm).
159 SPLATB_REG m0, r2, m1
190 ;-----------------------------------------------------------------------------
191 ; void pred16x16_tm_vp8_8(uint8_t *src, int stride)
192 ;-----------------------------------------------------------------------------
; PRED16x16_TM (no parameters): emits pred16x16_tm_vp8_8 with 2 arguments and
; 5 general-purpose registers.
194 %macro PRED16x16_TM 0
195 cglobal pred16x16_tm_vp8_8, 2,5
; r3d = byte at src[-1], r2d = byte at src[stride-1], both zero-extended.
206 movzx r3d, byte [r0-1]
209 movzx r2d, byte [r0+r1-1]
; Second definition of the same symbol: 2 arguments, 6 GPRs, 6 XMM registers.
236 cglobal pred16x16_tm_vp8_8, 2,6,6
; r4d = src[-1], r2d = src[stride-1], r3d = src[2*stride-1] (zero-extended).
243 movzx r4d, byte [r0-1]
246 movzx r2d, byte [r0+r1*1-1]
247 movzx r3d, byte [r0+r1*2-1]
; Broadcast word 0 of xmm2 / xmm4 to all eight words: pshuflw fills the low four
; words, punpcklqdq then copies the low quadword into the high quadword.
252 pshuflw xmm2, xmm2, 0
253 pshuflw xmm4, xmm4, 0
254 punpcklqdq xmm2, xmm2
255 punpcklqdq xmm4, xmm4
; Aligned 16-byte stores of xmm2 / xmm4 to the rows at r0+stride and r0+2*stride.
264 movdqa [r0+r1*1], xmm2
265 movdqa [r0+r1*2], xmm4
271 ;-----------------------------------------------------------------------------
272 ; void pred16x16_plane_*_8(uint8_t *src, int stride)
273 ;-----------------------------------------------------------------------------
; H264_PRED16x16_PLANE <codec>: emits pred16x16_plane_<codec>_8; %1 is pasted
; into the symbol name. Declared with 2 arguments, 9 GPRs and 7 vector registers.
275 %macro H264_PRED16x16_PLANE 1
276 cglobal pred16x16_plane_%1_8, 2,9,7
; Word-wise multiply of m0..m3 by the weights -8..-1 and 1..8; the +8 byte
; offset selects the upper four words of each table.
290 pmullw m0, [pw_m8tom1 ]
291 pmullw m1, [pw_m8tom1+8]
292 pmullw m2, [pw_1to8 ]
293 pmullw m3, [pw_1to8 +8]
; Load 8 bytes from r0+r1+8 into the high half of m0; pmaddubsw then multiplies
; the unsigned pixel bytes by the signed plane_shuf weights and adds adjacent
; products into words.
298 movhps m0, [r0+r1 +8]
299 pmaddubsw m0, [plane_shuf] ; H coefficients
305 pmullw m0, [pw_m8tom1]
325 paddw m0, m1 ; sum of H coefficients
; Zero-extended single-byte loads; NOTE(review): presumably the left-edge pixels
; that feed the V sum formed in r5 below - confirm. e_reg is a register alias
; defined outside the lines shown here.
337 movzx e_reg, byte [r3+r2*2 ]
338 movzx r5, byte [r4+r1 ]
341 movzx e_reg, byte [r3+r2 ]
346 movzx e_reg, byte [r3+r1 ]
347 movzx r6, byte [r4+r2*2 ]
351 movzx e_reg, byte [r3 ]
353 movzx r7, byte [r4+r2 ]
356 movzx r6, byte [r4+r2 ]
365 movzx r4, byte [e_reg+r2 ]
377 movzx r4, byte [e_reg ]
379 movzx r7, byte [r3 +r2 ]
383 movzx r6, byte [r3 +r2 ]
389 movzx r4, byte [e_reg+r1 ]
390 movzx r6, byte [r3 +r2*2]
397 movzx r4, byte [e_reg+r2*2]
398 movzx r6, byte [r3 +r1 ]
401 add r5, r6 ; sum of V coefficients
; Multiply by 5 with lea (leaves flags untouched), then arithmetic shift right,
; which keeps the sign of a negative coefficient.
418 lea r5, [r5*5] ; 5*(V/4)
422 sar r5, 4 ; (5*(V/4))/16
; Two more single-pixel loads. NOTE(review): presumably the two corner pixels
; used for the constant term "a" referenced below - confirm.
425 movzx r4, byte [r0+r1 +15]
426 movzx r3, byte [r3+r2*2 ]
; Same multiply-by-5 and arithmetic-shift scaling, applied to H in 32-bit form.
444 lea r1d, [r1d*5] ; 5*(H/4)
448 sar r1d, 4 ; (5*(H/4))/16
; Scale m0 by the column index 0..7, then add the base term to form the
; per-column values listed in the comments.
469 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
478 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
479 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
481 paddw m5, m0 ; a + {8,9,10,11}*H
482 paddw m6, m0 ; a + {12,13,14,15}*H
; Copy the row accumulators into m3/m4. Two register layouts appear (m0/m2 with
; eight words each, m5/m6 with four words each), and the whole group is repeated
; once more below.
487 mova m3, m0 ; b[0..7]
488 mova m4, m2 ; b[8..15]
494 mova m3, m5 ; b[8..11]
495 mova m4, m6 ; b[12..15]
508 mova m3, m0 ; b[0..7]
509 mova m4, m2 ; b[8..15]
515 mova m3, m5 ; b[8..11]
516 mova m4, m6 ; b[12..15]
; Instantiate the 16x16 plane predictor for the h264, rv40 and svq3 variants.
; The trio is expanded four times. NOTE(review): each trio is presumably preceded
; by a different instruction-set selection (INIT_*) that is not among these
; lines - confirm.
536 H264_PRED16x16_PLANE h264
537 H264_PRED16x16_PLANE rv40
538 H264_PRED16x16_PLANE svq3
540 H264_PRED16x16_PLANE h264
541 H264_PRED16x16_PLANE rv40
542 H264_PRED16x16_PLANE svq3
544 H264_PRED16x16_PLANE h264
545 H264_PRED16x16_PLANE rv40
546 H264_PRED16x16_PLANE svq3
548 H264_PRED16x16_PLANE h264
549 H264_PRED16x16_PLANE rv40
550 H264_PRED16x16_PLANE svq3
552 ;-----------------------------------------------------------------------------
553 ; void pred8x8_plane_8(uint8_t *src, int stride)
554 ;-----------------------------------------------------------------------------
; H264_PRED8x8_PLANE (no parameters): emits pred8x8_plane_8, declared with
; 2 arguments, 9 GPRs and 7 vector registers.
556 %macro H264_PRED8x8_PLANE 0
557 cglobal pred8x8_plane_8, 2,9,7
; Word-wise multiply by the weights -4..-1, 1..4; the +8 byte offset selects the
; upper four words (1..4) of pw_m4to4.
567 pmullw m0, [pw_m4to4]
568 pmullw m1, [pw_m4to4+8]
571 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
; pmaddubsw multiplies the unsigned pixel bytes by the signed plane8_shuf weights
; and adds adjacent products into words.
572 pmaddubsw m0, [plane8_shuf] ; H coefficients
578 pmullw m0, [pw_m4to4]
; Code following this test is assembled only when SSSE3 is not selected.
584 %if notcpuflag(ssse3)
600 paddw m0, m1 ; sum of H coefficients
; Zero-extended single-byte loads; NOTE(review): presumably the left-edge pixels
; for the vertical gradient - confirm. e_reg is a register alias defined outside
; the lines shown here.
612 movzx e_reg, byte [r3+r2*2 ]
613 movzx r5, byte [r4+r1 ]
616 movzx e_reg, byte [r3 ]
618 movzx r7, byte [r4+r2 ]
622 movzx r6, byte [r4+r2 ]
628 movzx e_reg, byte [r3+r1 ]
629 movzx r6, byte [r4+r2*2 ]
636 movzx e_reg, byte [r3+r2 ]
; Two more single-pixel loads. NOTE(review): presumably the corner pixels used
; for the constant term "a" referenced below - confirm.
649 movzx r3, byte [r4+r2*2 ]
650 movzx r4, byte [r0+r1 +7]
; Scale by the column index 0..7 and add the base term.
672 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
673 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
676 paddw m2, m0 ; a + {4,5,6,7}*H
; Row accumulator copies. Two register layouts appear: a single register holding
; eight words (m0), or a pair holding four words each (m0/m2).
683 mova m3, m0 ; b[0..7]
686 mova m4, m0 ; V+b[0..7]
693 mova m3, m0 ; b[0..3]
694 mova m4, m2 ; b[4..7]
699 mova m5, m0 ; V+b[0..3]
700 mova m6, m2 ; V+b[4..7]
726 ;-----------------------------------------------------------------------------
727 ; void pred8x8_vertical_8(uint8_t *src, int stride)
728 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype) and 2 general-purpose
; registers, i.e. no scratch GPR beyond the arguments is requested.
731 cglobal pred8x8_vertical_8, 2,2
743 ;-----------------------------------------------------------------------------
744 ; void pred8x8_horizontal_8(uint8_t *src, int stride)
745 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype), 3 GPRs requested.
748 cglobal pred8x8_horizontal_8, 2,3
; Load the byte just left of row 0 (src-1) into m0 and the byte just left of
; row 1 (src+stride-1) into m1, splatted across the register; m2 is scratch
; (x86util SPLATB_LOAD; argument roles assumed from the macro name - confirm).
754 SPLATB_LOAD m0, r0+r1*0-1, m2
755 SPLATB_LOAD m1, r0+r1*1-1, m2
771 ;-----------------------------------------------------------------------------
772 ; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
773 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype), 5 GPRs requested.
775 cglobal pred8x8_top_dc_8, 2,5
; Broadcast word 0 of mm0 to all four words.
792 pshufw mm0, mm0, 0 ; dc0 (w)
; Pack the words of mm0 (low half) and mm1 (high half) into bytes with unsigned
; saturation.
793 packuswb mm0, mm1 ; dc0,dc1 (b)
805 ;-----------------------------------------------------------------------------
806 ; void pred8x8_dc_8_mmxext(uint8_t *src, int stride)
807 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype), 5 GPRs requested.
810 cglobal pred8x8_dc_8, 2,5
; Zero-extended loads of the byte just left of the rows at r0+stride and
; r0+2*stride (offset -1). The pattern repeats; NOTE(review): r0 is presumably
; advanced between groups to walk down the left edge - confirm.
819 movzx r2d, byte [r0+r1*1-1]
820 movzx r3d, byte [r0+r1*2-1]
823 movzx r3d, byte [r0+r1*1-1]
825 movzx r3d, byte [r0+r1*2-1]
829 movzx r2d, byte [r0+r1*1-1]
830 movzx r3d, byte [r0+r1*2-1]
833 movzx r3d, byte [r0+r1*1-1]
835 movzx r3d, byte [r0+r1*2-1]
; Combine the partial sums: the low dwords of m0 and m2 are interleaved, then
; pshufw builds the two word permutations named in the comments.
842 punpckldq m0, m2 ; s0, s1, s2, s3
843 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
845 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
; pavgw computes the rounded unsigned word average (a + b + 1) >> 1.
849 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
866 ;-----------------------------------------------------------------------------
867 ; void pred8x8_dc_rv40_8(uint8_t *src, int stride)
868 ;-----------------------------------------------------------------------------
; Entry point: 2 arguments (src, stride per the prototype), 7 GPRs requested.
871 cglobal pred8x8_dc_rv40_8, 2,7
; Zero-extended single-byte loads from the current row (r0) and the next row
; (r0+stride). NOTE(review): presumably edge pixels being summed for the DC
; value - confirm against the full body.
877 movzx r5d, byte [r0+r1*1]
881 movzx r2d, byte [r0+r1*0]
882 movzx r3d, byte [r0+r1*1]
887 movzx r2d, byte [r0+r1*0]
903 ;-----------------------------------------------------------------------------
904 ; void pred8x8_tm_vp8_8(uint8_t *src, int stride)
905 ;-----------------------------------------------------------------------------
; First definition: 2 arguments, 6 GPRs.
908 cglobal pred8x8_tm_vp8_8, 2,6
; r4d = src[-1], r2d = src[stride-1], r3d = src[2*stride-1] (zero-extended).
915 movzx r4d, byte [r0-1]
918 movzx r2d, byte [r0+r1*1-1]
919 movzx r3d, byte [r0+r1*2-1]
; Second definition: 2 arguments, 6 GPRs, 4 XMM registers.
948 cglobal pred8x8_tm_vp8_8, 2,6,4
953 movzx r4d, byte [r0-1]
956 movzx r2d, byte [r0+r1*1-1]
957 movzx r3d, byte [r0+r1*2-1]
; Broadcast word 0 of xmm2 / xmm3 to all eight words (pshuflw fills the low
; quadword, punpcklqdq duplicates it into the high quadword).
962 pshuflw xmm2, xmm2, 0
963 pshuflw xmm3, xmm3, 0
964 punpcklqdq xmm2, xmm2
965 punpcklqdq xmm3, xmm3
; Store the high 8 bytes of xmm2 to the row at r0+2*stride.
970 movhps [r0+r1*2], xmm2
; Third definition: 2 arguments, 3 GPRs, 6 XMM registers.
977 cglobal pred8x8_tm_vp8_8, 2,3,6
; Aligned load of the tm_shuf table into xmm4.
979 movdqa xmm4, [tm_shuf]
; Load the 4 bytes ending just left of each row (offset -4); byte 3 of each
; load is that row's left-neighbour pixel.
987 movd xmm2, [r0+r1*1-4]
988 movd xmm3, [r0+r1*2-4]
; Store the high 8 bytes of xmm2 to the row at r0+2*stride.
997 movhps [r0+r1*2], xmm2
; PRED4x4_LOWPASS: three-tap low-pass filter macro taking five operands.
; Operand order (%1..%5):
1003 ; dest, left, right, src, tmp
1004 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1005 %macro PRED4x4_LOWPASS 5
1015 ;-----------------------------------------------------------------------------
1016 ; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1017 ;-----------------------------------------------------------------------------
; PRED8x8L_TOP_DC (no parameters): emits pred8x8l_top_dc_8 with 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 4 GPRs.
1018 %macro PRED8x8L_TOP_DC 0
1019 cglobal pred8x8l_top_dc_8, 4,4
; Build byte-shifted copies of the top row for the low-pass filter (x86util
; PALIGNR; operands presumably dst, src, byte shift, tmp - confirm).
1027 PALIGNR mm2, mm0, 7, mm0
1028 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments; the dependent branches are not
; among these lines.
1029 test r1, r1 ; top_left
1031 test r2, r2 ; top_right
1040 test r2, r2 ; top_right
; mm0 = low-pass filter output with left = mm2, right = mm1, centre = mm3
; (mm5 is scratch), following the operand order documented on PRED4x4_LOWPASS.
1049 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1070 ;-----------------------------------------------------------------------------
1071 ; void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1072 ;-----------------------------------------------------------------------------
; PRED8x8L_DC (no parameters): emits pred8x8l_dc_8 with 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 5 GPRs.
1074 %macro PRED8x8L_DC 0
1075 cglobal pred8x8l_dc_8, 4,5
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r4 is a second row pointer set up outside the lines shown here.
1078 movq mm0, [r0+r3*1-8]
1079 punpckhbw mm0, [r0+r3*0-8]
1080 movq mm1, [r4+r3*1-8]
1081 punpckhbw mm1, [r0+r3*2-8]
1085 movq mm2, [r0+r3*1-8]
1086 punpckhbw mm2, [r0+r3*0-8]
1088 movq mm3, [r0+r3*1-8]
1089 punpckhbw mm3, [r0+r3*0-8]
1093 movq mm0, [r0+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1098 PALIGNR mm4, mm0, 7, mm0
1099 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
1126 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1129 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1131 PALIGNR mm7, mm1, 7, mm3
1137 PALIGNR mm2, mm0, 7, mm0
1138 PALIGNR mm1, mm4, 1, mm4
1145 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1173 ;-----------------------------------------------------------------------------
1174 ; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1175 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL (no parameters): emits pred8x8l_horizontal_8 with
; 4 arguments (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per
; the prototype) and 4 GPRs.
1177 %macro PRED8x8L_HORIZONTAL 0
1178 cglobal pred8x8l_horizontal_8, 4,4
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r1 and r2 are used as row pointers here, so they no longer hold the
; has_topleft / has_topright arguments; they are set up outside these lines.
1181 movq mm0, [r0+r3*1-8]
1185 punpckhbw mm0, [r1+r3*0-8]
1186 movq mm1, [r2+r3*1-8]
1187 punpckhbw mm1, [r0+r3*2-8]
1191 movq mm2, [r0+r3*1-8]
1192 punpckhbw mm2, [r0+r3*0-8]
1194 movq mm3, [r0+r3*1-8]
1195 punpckhbw mm3, [r0+r3*0-8]
1199 movq mm0, [r0+r3*0-8]
1200 movq mm1, [r1+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1204 PALIGNR mm4, mm0, 7, mm0
1205 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
1207 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1210 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1212 PALIGNR mm7, mm1, 7, mm3
; Splat individual words of mm3 and mm7 across a whole register: selector 0xff
; picks word 3, 0xaa word 2, 0x55 word 1, 0x00 word 0. The source registers
; mm3 / mm7 are overwritten last, after every other word has been extracted.
1218 pshufw mm0, mm3, 0xff
1219 pshufw mm1, mm3, 0xaa
1221 pshufw mm2, mm3, 0x55
1222 pshufw mm3, mm3, 0x00
1223 pshufw mm4, mm7, 0xff
1224 pshufw mm5, mm7, 0xaa
1225 pshufw mm6, mm7, 0x55
1226 pshufw mm7, mm7, 0x00
1244 ;-----------------------------------------------------------------------------
1245 ; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1246 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL (no parameters): emits pred8x8l_vertical_8 with 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 4 GPRs.
1248 %macro PRED8x8L_VERTICAL 0
1249 cglobal pred8x8l_vertical_8, 4,4
; Byte-shifted copies of the top row for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1256 PALIGNR mm2, mm0, 7, mm0
1257 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments; the dependent branches are not
; among these lines.
1258 test r1, r1 ; top_left
1260 test r2, r2 ; top_right
1269 test r2, r2 ; top_right
; mm0 = low-pass filter output with left = mm2, right = mm1, centre = mm3
; (mm5 is scratch), following the operand order documented on PRED4x4_LOWPASS.
1278 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1294 ;-----------------------------------------------------------------------------
1295 ; void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1296 ;-----------------------------------------------------------------------------
; First definition, working entirely in MMX registers: 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 5 GPRs.
1299 cglobal pred8x8l_down_left_8, 4,5
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1306 PALIGNR mm2, mm0, 7, mm0
1307 PALIGNR mm1, mm4, 1, mm4
; Broadcast word 3 of mm3 into all four words of mm1.
1330 pshufw mm1, mm3, 0xFF
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
1333 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1342 PALIGNR mm2, mm3, 7, mm3
1343 PALIGNR mm5, mm4, 1, mm4
1344 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1352 PALIGNR mm2, mm7, 1, mm0
1354 PALIGNR mm3, mm7, 7, mm0
1355 PALIGNR mm4, mm6, 1, mm0
1361 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1362 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; PRED8x8L_DOWN_LEFT (no parameters): second implementation of the same symbol,
; declared with 4 arguments and 4 GPRs. Edge filtering is done in MMX registers
; and the final filter and stores use XMM registers.
1406 %macro PRED8x8L_DOWN_LEFT 0
1407 cglobal pred8x8l_down_left_8, 4,4
1414 PALIGNR mm2, mm0, 7, mm0
1415 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments; the dependent branches are not
; among these lines.
1416 test r1, r1 ; top_left
1418 test r2, r2 ; top_right
1427 test r2, r2 ; top_right
; Broadcast word 3 of mm3 into all four words of mm1.
1438 pshufw mm1, mm3, 0xFF
1441 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1443 test r2, r2 ; top_right
1450 PALIGNR mm2, mm3, 7, mm3
1451 PALIGNR mm5, mm4, 1, mm4
1452 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1468 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Each movq writes the low 8 bytes of xmm0 to one output row. r1 and r2 are
; used as row base pointers here, so they no longer hold the has_topleft /
; has_topright arguments. NOTE(review): xmm0 is presumably shifted between
; stores and r0 advanced before the last pair - those instructions are not
; among these lines.
1470 movq [r0+r3*1], xmm0
1472 movq [r0+r3*2], xmm0
1475 movq [r1+r3*1], xmm0
1477 movq [r1+r3*2], xmm0
1479 movq [r2+r3*1], xmm0
1481 movq [r2+r3*2], xmm0
1483 movq [r0+r3*1], xmm0
1485 movq [r0+r3*2], xmm0
1494 ;-----------------------------------------------------------------------------
1495 ; void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1496 ;-----------------------------------------------------------------------------
; First definition, working entirely in MMX registers: 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 5 GPRs.
1499 cglobal pred8x8l_down_right_8, 4,5
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r4 is a second row pointer set up outside the lines shown here.
1502 movq mm0, [r0+r3*1-8]
1503 punpckhbw mm0, [r0+r3*0-8]
1504 movq mm1, [r4+r3*1-8]
1505 punpckhbw mm1, [r0+r3*2-8]
1509 movq mm2, [r0+r3*1-8]
1510 punpckhbw mm2, [r0+r3*0-8]
1512 movq mm3, [r0+r3*1-8]
1513 punpckhbw mm3, [r0+r3*0-8]
1517 movq mm0, [r0+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1522 PALIGNR mm4, mm0, 7, mm0
1523 PALIGNR mm1, mm2, 1, mm2
1524 test r1, r1 ; top_left
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
1528 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1532 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1534 PALIGNR mm7, mm1, 7, mm3
1540 PALIGNR mm2, mm0, 7, mm0
1541 PALIGNR mm1, mm4, 1, mm4
1542 test r1, r1 ; top_left
1544 test r2, r2 ; top_right
1547 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1563 test r2, r2 ; top_right
1579 PALIGNR mm2, mm6, 1, mm0
1581 PALIGNR mm3, mm6, 7, mm0
1585 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1586 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; PRED8x8L_DOWN_RIGHT (no parameters): second implementation of the same symbol,
; declared with 4 arguments and 5 GPRs. Edge gathering is done in MMX registers
; and the final filter and stores use XMM registers.
1630 %macro PRED8x8L_DOWN_RIGHT 0
1631 cglobal pred8x8l_down_right_8, 4,5
; Same left-edge gather as in the first definition.
1634 movq mm0, [r0+r3*1-8]
1635 punpckhbw mm0, [r0+r3*0-8]
1636 movq mm1, [r4+r3*1-8]
1637 punpckhbw mm1, [r0+r3*2-8]
1641 movq mm2, [r0+r3*1-8]
1642 punpckhbw mm2, [r0+r3*0-8]
1644 movq mm3, [r0+r3*1-8]
1645 punpckhbw mm3, [r0+r3*0-8]
1649 movq mm0, [r0+r3*0-8]
1654 PALIGNR mm4, mm0, 7, mm0
1655 PALIGNR mm1, mm2, 1, mm2
1683 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1687 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1689 PALIGNR mm7, mm1, 7, mm3
1696 PALIGNR mm2, mm0, 7, mm0
1697 PALIGNR mm1, mm4, 1, mm4
1703 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1720 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Each pair writes the low 8 bytes of xmm0 / xmm1 to two adjacent rows.
; r1, r2 and r4 are used as row base pointers here. NOTE(review): the vectors
; are presumably shifted between pairs - those instructions are not among
; these lines.
1723 movq [r0+r3*2], xmm0
1724 movq [r0+r3*1], xmm1
1727 movq [r2+r3*2], xmm0
1728 movq [r2+r3*1], xmm1
1731 movq [r1+r3*2], xmm0
1732 movq [r1+r3*1], xmm1
1735 movq [r4+r3*2], xmm0
1736 movq [r4+r3*1], xmm1
1745 ;-----------------------------------------------------------------------------
1746 ; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1747 ;-----------------------------------------------------------------------------
; First definition, working entirely in MMX registers: 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 5 GPRs.
1750 cglobal pred8x8l_vertical_right_8, 4,5
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r4 is a second row pointer set up outside the lines shown here.
1753 movq mm0, [r0+r3*1-8]
1754 punpckhbw mm0, [r0+r3*0-8]
1755 movq mm1, [r4+r3*1-8]
1756 punpckhbw mm1, [r0+r3*2-8]
1760 movq mm2, [r0+r3*1-8]
1761 punpckhbw mm2, [r0+r3*0-8]
1763 movq mm3, [r0+r3*1-8]
1764 punpckhbw mm3, [r0+r3*0-8]
1768 movq mm0, [r0+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1773 PALIGNR mm4, mm0, 7, mm0
1774 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
1802 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1809 PALIGNR mm2, mm0, 7, mm0
1810 PALIGNR mm1, mm4, 1, mm4
1816 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1820 PALIGNR mm3, mm7, 7, mm0
1821 PALIGNR mm6, mm7, 6, mm1
1825 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1836 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
; Alternating 7-byte alignments into mm6 and mm5, each drawing on mm0.
; NOTE(review): presumably one output row is produced per step - the stores
; are not among these lines.
1837 PALIGNR mm6, mm0, 7, mm2
1840 PALIGNR mm5, mm0, 7, mm1
1843 PALIGNR mm6, mm0, 7, mm2
1846 PALIGNR mm5, mm0, 7, mm1
1849 PALIGNR mm6, mm0, 7, mm2
1852 PALIGNR mm5, mm0, 7, mm1
; PRED8x8L_VERTICAL_RIGHT (no parameters): second implementation of the same
; symbol, declared with 4 arguments, 5 GPRs and 7 vector registers.
1856 %macro PRED8x8L_VERTICAL_RIGHT 0
1857 cglobal pred8x8l_vertical_right_8, 4,5,7
1858 ; manually spill XMM registers for Win64 because
1859 ; the code here is initialized with INIT_MMX
; Same left-edge gather as in the first definition.
1863 movq mm0, [r0+r3*1-8]
1864 punpckhbw mm0, [r0+r3*0-8]
1865 movq mm1, [r4+r3*1-8]
1866 punpckhbw mm1, [r0+r3*2-8]
1870 movq mm2, [r0+r3*1-8]
1871 punpckhbw mm2, [r0+r3*0-8]
1873 movq mm3, [r0+r3*1-8]
1874 punpckhbw mm3, [r0+r3*0-8]
1878 movq mm0, [r0+r3*0-8]
1883 PALIGNR mm4, mm0, 7, mm0
1884 PALIGNR mm1, mm2, 1, mm2
1911 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1918 PALIGNR mm2, mm0, 7, mm0
1919 PALIGNR mm1, mm4, 1, mm4
1925 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Aligned load of the pw_ff00 table (eight words of 0xff00) into xmm6.
1930 movdqa xmm6, [pw_ff00]
1939 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; Row stores: movhps writes the high 8 bytes of a register, movq the low 8.
; r1 and r2 are used as row base pointers here.
1945 movhps [r0+r3*2], xmm5
1946 movhps [r0+r3*1], xmm2
1954 movq [r0+r3*2], xmm5
1955 movq [r0+r3*1], xmm2
1958 movq [r2+r3*2], xmm5
1959 movq [r2+r3*1], xmm2
1962 movq [r1+r3*2], xmm5
1963 movq [r1+r3*1], xmm2
; The macro is expanded twice. NOTE(review): presumably under two different
; instruction-set selections that are not among these lines - confirm.
1968 PRED8x8L_VERTICAL_RIGHT
1970 PRED8x8L_VERTICAL_RIGHT
1972 ;-----------------------------------------------------------------------------
1973 ; void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
1974 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL_LEFT (no parameters): emits pred8x8l_vertical_left_8 with
; 4 arguments (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per
; the prototype) and 4 GPRs.
1976 %macro PRED8x8L_VERTICAL_LEFT 0
1977 cglobal pred8x8l_vertical_left_8, 4,4
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
1984 PALIGNR mm2, mm0, 7, mm0
1985 PALIGNR mm1, mm4, 1, mm4
; Broadcast word 3 of mm3 into all four words of mm1.
2008 pshufw mm1, mm3, 0xFF
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
2011 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2020 PALIGNR mm2, mm3, 7, mm3
2021 PALIGNR mm5, mm4, 1, mm4
2022 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2036 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
; Each pair writes the low 8 bytes of xmm3 and xmm0 to two adjacent rows.
; r1 and r2 are used as row base pointers here, so they no longer hold the
; has_topleft / has_topright arguments. NOTE(review): the vectors are
; presumably shifted between pairs - those instructions are not among these
; lines.
2038 movq [r0+r3*1], xmm3
2039 movq [r0+r3*2], xmm0
2043 movq [r1+r3*1], xmm3
2044 movq [r1+r3*2], xmm0
2047 movq [r2+r3*1], xmm3
2048 movq [r2+r3*2], xmm0
2051 movq [r0+r3*1], xmm3
2052 movq [r0+r3*2], xmm0
; The macro is expanded twice. NOTE(review): presumably under two different
; instruction-set selections that are not among these lines - confirm.
2057 PRED8x8L_VERTICAL_LEFT
2059 PRED8x8L_VERTICAL_LEFT
2061 ;-----------------------------------------------------------------------------
2062 ; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride)
2063 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL_UP (no parameters): emits pred8x8l_horizontal_up_8 with
; 4 arguments (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per
; the prototype) and 4 GPRs.
2065 %macro PRED8x8L_HORIZONTAL_UP 0
2066 cglobal pred8x8l_horizontal_up_8, 4,4
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r1 and r2 are used as row pointers here, so they no longer hold the
; has_topleft / has_topright arguments; they are set up outside these lines.
2069 movq mm0, [r0+r3*1-8]
2073 punpckhbw mm0, [r1+r3*0-8]
2074 movq mm1, [r2+r3*1-8]
2075 punpckhbw mm1, [r0+r3*2-8]
2079 movq mm2, [r0+r3*1-8]
2080 punpckhbw mm2, [r0+r3*0-8]
2082 movq mm3, [r0+r3*1-8]
2083 punpckhbw mm3, [r0+r3*0-8]
2087 movq mm0, [r0+r3*0-8]
2088 movq mm1, [r1+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
2092 PALIGNR mm4, mm0, 7, mm0
2093 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
2095 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2098 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2100 PALIGNR mm7, mm1, 7, mm3
; Reverse the order of the four words of mm7 into mm0, then isolate the lowest
; byte of mm7 in the top byte position (shift left by 56 bits).
2102 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2103 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2107 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
; OR in the replicated l7 so the vacated top bytes of the shifted copies are
; padded with the last left pixel.
2114 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2116 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2118 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
; Interleave the bytes of mm4 / mm5 with the filtered bytes in mm1.
2120 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2121 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; Derive further vectors at 2-, 4- and 6-byte offsets. The pshufw selectors
; keep word 3 in the upper positions: 11111001b -> words 1,2,3,3;
; 11111110b -> words 2,3,3,3; 11111111b -> word 3 in every position.
2125 PALIGNR mm5, mm4, 2, mm1
2126 pshufw mm1, mm6, 11111001b
2127 PALIGNR mm6, mm4, 4, mm2
2128 pshufw mm2, mm7, 11111110b
2129 PALIGNR mm7, mm4, 6, mm3
2130 pshufw mm3, mm0, 11111111b
; The macro is expanded twice. NOTE(review): presumably under two different
; instruction-set selections that are not among these lines - confirm.
2144 PRED8x8L_HORIZONTAL_UP
2146 PRED8x8L_HORIZONTAL_UP
2148 ;-----------------------------------------------------------------------------
2149 ; void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
2150 ;-----------------------------------------------------------------------------
; First definition, working entirely in MMX registers: 4 arguments
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride per the prototype)
; and 5 GPRs.
2153 cglobal pred8x8l_horizontal_down_8, 4,5
; Gather the left edge: each load reads the 8 bytes ending just left of a row
; (offset -8), so byte 7 is that row's left-neighbour pixel; punpckhbw
; interleaves the high four bytes of two such rows.
; r4 is a second row pointer set up outside the lines shown here.
2156 movq mm0, [r0+r3*1-8]
2157 punpckhbw mm0, [r0+r3*0-8]
2158 movq mm1, [r4+r3*1-8]
2159 punpckhbw mm1, [r0+r3*2-8]
2163 movq mm2, [r0+r3*1-8]
2164 punpckhbw mm2, [r0+r3*0-8]
2166 movq mm3, [r0+r3*1-8]
2167 punpckhbw mm3, [r0+r3*0-8]
2171 movq mm0, [r0+r3*0-8]
; Byte-shifted neighbour vectors for the filter (x86util PALIGNR; operands
; presumably dst, src, byte shift, tmp - confirm).
2176 PALIGNR mm4, mm0, 7, mm0
2177 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter calls; operand order is dest, left, right, src, tmp as
; documented on PRED4x4_LOWPASS.
2204 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2208 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2210 PALIGNR mm7, mm1, 7, mm3
2216 PALIGNR mm2, mm0, 7, mm0
2217 PALIGNR mm1, mm4, 1, mm4
2223 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2230 PALIGNR mm2, mm6, 7, mm5
2231 PALIGNR mm6, mm7, 7, mm0
2233 PALIGNR mm4, mm3, 1, mm7
2236 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2242 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; Derive vectors at 2-, 4- and 6-byte offsets from mm3, then from mm4.
; NOTE(review): presumably one output row each - the stores are not among
; these lines.
2250 PALIGNR mm7, mm3, 2, mm5
2252 PALIGNR mm1, mm3, 4, mm5
2254 PALIGNR mm0, mm3, 6, mm3
2259 PALIGNR mm6, mm4, 2, mm5
2261 PALIGNR mm2, mm4, 4, mm5
2263 PALIGNR mm3, mm4, 6, mm4
; PRED8x8L_HORIZONTAL_DOWN (no parameters): second implementation of the same
; symbol, declared with 4 arguments and 5 GPRs. Edge gathering is done in MMX
; registers and the final filter and stores use XMM registers.
2267 %macro PRED8x8L_HORIZONTAL_DOWN 0
2268 cglobal pred8x8l_horizontal_down_8, 4,5
; Same left-edge gather as in the first definition.
2271 movq mm0, [r0+r3*1-8]
2272 punpckhbw mm0, [r0+r3*0-8]
2273 movq mm1, [r4+r3*1-8]
2274 punpckhbw mm1, [r0+r3*2-8]
2278 movq mm2, [r0+r3*1-8]
2279 punpckhbw mm2, [r0+r3*0-8]
2281 movq mm3, [r0+r3*1-8]
2282 punpckhbw mm3, [r0+r3*0-8]
2286 movq mm0, [r0+r3*0-8]
2291 PALIGNR mm4, mm0, 7, mm0
2292 PALIGNR mm1, mm2, 1, mm2
; Broadcast word 3 of mm3 into all four words of mm1.
2319 pshufw mm1, mm3, 0xFF
2323 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2327 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2337 PALIGNR mm2, mm0, 7, mm0
2338 PALIGNR mm1, mm4, 1, mm4
2344 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2353 PALIGNR mm2, mm3, 7, mm3
2354 PALIGNR mm5, mm4, 1, mm4
2355 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; 16-byte alignments of xmm0 at byte offsets 7, 9 and 8 feed the filter call
; below as left / right / centre respectively.
2364 PALIGNR xmm1, xmm0, 7, xmm4
2365 PALIGNR xmm2, xmm0, 9, xmm5
2367 PALIGNR xmm3, xmm0, 8, xmm0
2371 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
; Interleave the low bytes of xmm4 with the filtered bytes in xmm0.
2372 punpcklbw xmm4, xmm0
; Each pair writes the low 8 bytes of xmm4 and xmm0 to two rows.
; r1, r2 and r4 are used as row base pointers here. NOTE(review): the vectors
; are presumably shifted between pairs - those instructions are not among
; these lines.
2374 movq [r0+r3*2], xmm4
2375 movq [r2+r3*2], xmm0
2378 movq [r0+r3*1], xmm4
2379 movq [r2+r3*1], xmm0
2382 movq [r1+r3*2], xmm4
2383 movq [r4+r3*2], xmm0
2386 movq [r1+r3*1], xmm4
2387 movq [r4+r3*1], xmm0
; The macro is expanded twice. NOTE(review): presumably under two different
; instruction-set selections that are not among these lines - confirm.
2392 PRED8x8L_HORIZONTAL_DOWN
2394 PRED8x8L_HORIZONTAL_DOWN
2396 ;-----------------------------------------------------------------------------
2397 ; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2398 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 5 GPRs requested.
2401 cglobal pred4x4_dc_8, 3,5
; Zero-extended loads of the byte just left of the rows at r0+stride and
; r0+2*stride. r1d is overwritten as scratch, so the topright pointer is not
; preserved. NOTE(review): r0 is presumably advanced between the two pairs -
; confirm.
2407 movzx r1d, byte [r0+r2*1-1]
2410 movzx r1d, byte [r0+r2*2-1]
2413 movzx r1d, byte [r0+r2*1-1]
2415 movzx r1d, byte [r0+r2*2-1]
; Multiplying by 0x01010101 replicates the low byte of r3d into all four bytes
; (assumes r3d holds a value below 256 at this point - confirm).
2419 imul r3d, 0x01010101
2426 ;-----------------------------------------------------------------------------
2427 ; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2428 ;-----------------------------------------------------------------------------
; First definition: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 6 GPRs.
2431 cglobal pred4x4_tm_vp8_8, 3,6
; r4d = src[-1], r1d = src[stride-1], r3d = src[2*stride-1] (zero-extended);
; r1 is reused as scratch, so the topright pointer is not preserved.
2436 movzx r4d, byte [r0-1]
2439 movzx r1d, byte [r0+r2*1-1]
2440 movzx r3d, byte [r0+r2*2-1]
; Second definition: 3 arguments, 3 GPRs.
2472 cglobal pred4x4_tm_vp8_8, 3,3
; Load the 4 bytes ending just left of each of four rows (offset -4); byte 3
; of each load is that row's left-neighbour pixel. r1 serves as a second row
; base pointer here (set up outside the lines shown).
2481 movd mm2, [r0+r2*1-4]
2482 movd mm3, [r0+r2*2-4]
2483 movd mm4, [r1+r2*1-4]
2484 movd mm5, [r1+r2*2-4]
2504 ;-----------------------------------------------------------------------------
2505 ; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2506 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2509 cglobal pred4x4_vertical_vp8_8, 3,3
; Keep a copy of the top row in m2, then append the four bytes at [r1]
; (the topright pointer) above it to form an 8-pixel row.
2513 mova m2, m0 ;t0 t1 t2 t3
2514 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
; Shift right by one byte so each lane holds its right-hand neighbour.
2516 psrlq m0, 8 ;t1 t2 t3 t4
; m3 = low-pass filter output with left = m1, right = m0, centre = m2
; (m4 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2517 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2524 ;-----------------------------------------------------------------------------
2525 ; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2526 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2528 cglobal pred4x4_down_left_8, 3,3
; m0 = low-pass filter output with left = m1, right = m2, centre = m3
; (m4 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2538 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2550 ;-----------------------------------------------------------------------------
2551 ; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2552 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2555 cglobal pred4x4_vertical_left_8, 3,3
; m0 = low-pass filter output with left = m1, right = m2, centre = m3
; (m5 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2565 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2575 ;-----------------------------------------------------------------------------
2576 ; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2577 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2580 cglobal pred4x4_horizontal_up_8, 3,3
; Gather the left edge: each access reads the 4 bytes ending just left of a
; row (offset -4), so byte 3 is that row's left-neighbour pixel; punpcklbw
; interleaves the bytes of two rows. r1 serves as a second row base pointer
; here (set up outside the lines shown), not as the topright pointer.
2583 movd m0, [r0+r2*1-4]
2584 punpcklbw m0, [r0+r2*2-4]
2585 movd m1, [r1+r2*1-4]
2586 punpcklbw m1, [r1+r2*2-4]
; m4 = low-pass filter output with left = m0, right = m2, centre = m3
; (m5 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2598 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2608 ;-----------------------------------------------------------------------------
2609 ; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2610 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs. The inline comments list register contents from the most
; significant byte on the left; lt = top-left, tN = top row, lN = left column.
2613 cglobal pred4x4_horizontal_down_8, 3,3
; Assemble the top row plus the top-left pixel in the high half of m0.
2616 movh m0, [r0-4] ; lt ..
2617 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2618 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
; Gather the four left-column pixels; each access reads the 4 bytes ending just
; left of a row, so the wanted pixel is the top byte. r1 serves as a second row
; base pointer here (set up outside the lines shown).
2619 movd m1, [r1+r2*2-4] ; l3
2620 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2621 movd m2, [r0+r2*2-4] ; l1
2622 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2623 punpckhwd m1, m2 ; l0 l1 l2 l3
; Join the top-row and left-column halves into one 8-pixel edge vector.
2624 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; Shifted copies of the edge vector supply the filter's neighbour inputs.
2628 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2629 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; m3 = low-pass filter output with left = m1, right = m0, centre = m2
; (m4 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2631 PRED4x4_LOWPASS m3, m1, m0, m2, m4
; Combine m3 and m5 at a 6-byte offset (x86util PALIGNR; operands presumably
; dst, src, byte shift, tmp - confirm).
2634 PALIGNR m3, m5, 6, m4
2643 ;-----------------------------------------------------------------------------
2644 ; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2645 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2648 cglobal pred4x4_vertical_right_8, 3,3
; Build the edge vector one pixel at a time: each PALIGNR with shift 7 pulls in
; the top byte of an 8-byte load ending just left of a row (offset -8), i.e.
; that row's left-neighbour pixel. r1 serves as a second row base pointer here
; (set up outside the lines shown).
2651 movh m0, [r0] ; ........t3t2t1t0
2653 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2655 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2657 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2659 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; m3 = low-pass filter output with left = m1, right = m0, centre = m2
; (m4 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2660 PRED4x4_LOWPASS m3, m1, m0, m2, m4
; Further 7-byte alignments against m1. NOTE(review): presumably these form the
; lower output rows - the stores are not among these lines.
2666 PALIGNR m5, m1, 7, m2
2669 PALIGNR m3, m1, 7, m1
2673 ;-----------------------------------------------------------------------------
2674 ; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2675 ;-----------------------------------------------------------------------------
; Entry point: 3 arguments (r0 = src, r1 = topright, r2 = stride per the
; prototype), 3 GPRs.
2678 cglobal pred4x4_down_right_8, 3,3
; Load the 8 bytes ending just left of row 1 and interleave their high four
; bytes with those of the 8 bytes ending just left of row 0.
2682 movq m2, [r0+r2*1-8]
2683 punpckhbw m2, [r0-8]
; Extend the edge vector: shift 7 pulls in the top byte of each 8-byte load,
; i.e. the left-neighbour pixel of that row. r1 serves as a second row base
; pointer here (set up outside the lines shown).
2686 PALIGNR m3, m1, 5, m1
2688 PALIGNR m3, [r1+r2*1-8], 7, m4
2690 PALIGNR m3, [r1+r2*2-8], 7, m4
; m0 = low-pass filter output with left = m3, right = m1, centre = m2
; (m4 is scratch), following the operand order documented on PRED4x4_LOWPASS.
2691 PRED4x4_LOWPASS m0, m3, m1, m2, m4