1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Constant tables referenced by the prediction routines in this file.
; tm_shuf: the byte pair (0x03, 0x80) repeated 8 times; loaded into xmm4 by
; pred8x8_tm_vp8_8_ssse3 (presumably a pshufb control mask -- confirm).
29 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00; loaded into xmm6 by PRED8x8L_VERTICAL_RIGHT.
30 pw_ff00: times 8 dw 0xff00
; plane_shuf / plane8_shuf: signed byte weights used as the pmaddubsw operand
; when accumulating the "H coefficients" of the 16x16 and 8x8 plane predictors.
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
; Word tables used with pmullw by the plane predictors:
; pw_0to7 yields {0..7}*H; the others weight the H coefficients.
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
51 ;-----------------------------------------------------------------------------
52 ; void pred16x16_vertical(uint8_t *src, int stride)
53 ;-----------------------------------------------------------------------------
; Two variants are declared, one MMX and one SSE. Both are declared with
; "2,3", which per the x86inc cglobal convention means 2 arguments and
; 3 general-purpose registers.
55 cglobal pred16x16_vertical_8_mmx, 2,3
70 cglobal pred16x16_vertical_8_sse, 2,3
; SSE variant: the 16-byte value in xmm0 is written with aligned stores to the
; rows at r0+r1 and r0+r1*2.
; NOTE(review): r0/r1 are presumably src/stride, and the pointer is presumably
; advanced between the two store pairs -- neither is visible in this view.
75 movaps [r0+r1*1], xmm0
76 movaps [r0+r1*2], xmm0
78 movaps [r0+r1*1], xmm0
79 movaps [r0+r1*2], xmm0
85 ;-----------------------------------------------------------------------------
86 ; void pred16x16_horizontal(uint8_t *src, int stride)
87 ;-----------------------------------------------------------------------------
; Declared with 2 arguments and 3 general-purpose registers. The name carries
; no CPU suffix here; presumably one is appended by the active INIT_* setting
; (not visible in this view -- confirm).
90 cglobal pred16x16_horizontal_8, 2,3
127 ;-----------------------------------------------------------------------------
128 ; void pred16x16_dc(uint8_t *src, int stride)
129 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_dc_8 (2 arguments, 7 GPRs).
131 %macro PRED16x16_DC 0
132 cglobal pred16x16_dc_8, 2,7
; Single bytes at [r0] and [r0+r1] are zero-extended into 32-bit registers.
; NOTE(review): these look like the left-column samples being summed, with r0
; pre-offset by elided code -- confirm against the full source.
140 movzx r5d, byte [r0+r1*1]
145 movzx r2d, byte [r0+r1*0]
146 movzx r3d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
; SPLATB_REG is defined outside this view; presumably it broadcasts the low
; byte of r2 to every byte of m0, using m1 as scratch -- confirm.
158 SPLATB_REG m0, r2, m1
190 ;-----------------------------------------------------------------------------
191 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
192 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_tm_vp8_8 (2 arguments, 5 GPRs).
194 %macro PRED16x16_TM_MMX 0
195 cglobal pred16x16_tm_vp8_8, 2,5
; Zero-extending byte loads from one byte before r0 and one byte before r0+r1.
; NOTE(review): presumably the top-left sample and the current row's left
; sample -- depends on elided pointer setup.
206 movzx r3d, byte [r0-1]
209 movzx r2d, byte [r0+r1-1]
; SSE2 variant of pred16x16_tm_vp8: 2 arguments, 6 GPRs, 6 XMM registers.
236 cglobal pred16x16_tm_vp8_8_sse2, 2,6,6
; Byte loads from column -1: at r0, and at the rows r0+r1 and r0+r1*2.
243 movzx r4d, byte [r0-1]
246 movzx r2d, byte [r0+r1*1-1]
247 movzx r3d, byte [r0+r1*2-1]
; pshuflw with selector 0 copies word 0 into the four low words, and
; punpcklqdq reg,reg duplicates the low quadword, so word 0 ends up in all
; eight word lanes of xmm2 and xmm4.
252 pshuflw xmm2, xmm2, 0
253 pshuflw xmm4, xmm4, 0
254 punpcklqdq xmm2, xmm2
255 punpcklqdq xmm4, xmm4
; Aligned 16-byte stores of two output rows.
264 movdqa [r0+r1*1], xmm2
265 movdqa [r0+r1*2], xmm4
271 ;-----------------------------------------------------------------------------
272 ; void pred16x16_plane(uint8_t *src, int stride)
273 ;-----------------------------------------------------------------------------
; One-parameter macro: %1 is spliced into the function name
; (pred16x16_plane_%1_8). It is instantiated below with h264, rv40 and svq3.
; The function is declared with 2 arguments, 9 GPRs and 7 vector registers.
275 %macro H264_PRED16x16_PLANE 1
276 cglobal pred16x16_plane_%1_8, 2,9,7
; Weight word lanes by -8..-1 and 1..8. The +8 byte offset selects the second
; half of each table.
290 pmullw m0, [pw_m8tom1 ]
291 pmullw m1, [pw_m8tom1+8]
292 pmullw m2, [pw_1to8 ]
293 pmullw m3, [pw_1to8 +8]
298 movhps m0, [r0+r1 +8]
299 pmaddubsw m0, [plane_shuf] ; H coefficients
305 pmullw m0, [pw_m8tom1]
325 paddw m0, m1 ; sum of H coefficients
; Scalar byte loads that feed the V sum accumulated in r5 (see "sum of V
; coefficients" below). e_reg is a name defined outside this view.
337 movzx e_reg, byte [r3+r2*2 ]
338 movzx r5, byte [r4+r1 ]
341 movzx e_reg, byte [r3+r2 ]
346 movzx e_reg, byte [r3+r1 ]
347 movzx r6, byte [r4+r2*2 ]
351 movzx e_reg, byte [r3 ]
353 movzx r7, byte [r4+r2 ]
356 movzx r6, byte [r4+r2 ]
365 movzx r4, byte [e_reg+r2 ]
377 movzx r4, byte [e_reg ]
379 movzx r7, byte [r3 +r2 ]
383 movzx r6, byte [r3 +r2 ]
389 movzx r4, byte [e_reg+r1 ]
390 movzx r6, byte [r3 +r2*2]
397 movzx r4, byte [e_reg+r2*2]
398 movzx r6, byte [r3 +r1 ]
401 add r5, r6 ; sum of V coefficients
; Scale the V sum: multiply by 5, then arithmetic shift right by 4.
418 lea r5, [r5*5] ; 5*(V/4)
422 sar r5, 4 ; (5*(V/4))/16
425 movzx r4, byte [r0+r1 +15]
426 movzx r3, byte [r3+r2*2 ]
; The same scaling is applied to the H sum, held in r1d.
444 lea r1d, [r1d*5] ; 5*(H/4)
448 sar r1d, 4 ; (5*(H/4))/16
469 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
478 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
479 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
481 paddw m5, m0 ; a + {8,9,10,11}*H
482 paddw m6, m0 ; a + {12,13,14,15}*H
; Row values are copied into m3/m4 before use. The 8-wide and 4-wide comment
; pairs presumably belong to different register-width code paths selected by
; elided %if directives -- confirm.
487 mova m3, m0 ; b[0..7]
488 mova m4, m2 ; b[8..15]
494 mova m3, m5 ; b[8..11]
495 mova m4, m6 ; b[12..15]
508 mova m3, m0 ; b[0..7]
509 mova m4, m2 ; b[8..15]
515 mova m3, m5 ; b[8..11]
516 mova m4, m6 ; b[12..15]
; The plane macro is instantiated four times for each of the three flavours
; (h264, rv40, svq3).
; NOTE(review): each group of three presumably follows a different INIT_* CPU
; selection that is not visible in this view -- confirm.
536 H264_PRED16x16_PLANE h264
537 H264_PRED16x16_PLANE rv40
538 H264_PRED16x16_PLANE svq3
540 H264_PRED16x16_PLANE h264
541 H264_PRED16x16_PLANE rv40
542 H264_PRED16x16_PLANE svq3
544 H264_PRED16x16_PLANE h264
545 H264_PRED16x16_PLANE rv40
546 H264_PRED16x16_PLANE svq3
548 H264_PRED16x16_PLANE h264
549 H264_PRED16x16_PLANE rv40
550 H264_PRED16x16_PLANE svq3
553 ;-----------------------------------------------------------------------------
554 ; void pred8x8_plane(uint8_t *src, int stride)
555 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8_plane_8 (2 arguments, 9 GPRs,
; 7 vector registers). It follows the same outline as the 16x16 plane macro:
; a weighted H sum, a scalar V sum, then per-row a + k*H values.
557 %macro H264_PRED8x8_PLANE 0
558 cglobal pred8x8_plane_8, 2,9,7
; Weight word lanes by -4..-1, 1..4. The +8 byte offset selects the second
; half of the table.
568 pmullw m0, [pw_m4to4]
569 pmullw m1, [pw_m4to4+8]
572 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
573 pmaddubsw m0, [plane8_shuf] ; H coefficients
579 pmullw m0, [pw_m4to4]
; Code assembled only when the target CPU lacks SSSE3.
585 %if notcpuflag(ssse3)
601 paddw m0, m1 ; sum of H coefficients
; Scalar byte loads. e_reg is a name defined outside this view.
613 movzx e_reg, byte [r3+r2*2 ]
614 movzx r5, byte [r4+r1 ]
617 movzx e_reg, byte [r3 ]
619 movzx r7, byte [r4+r2 ]
623 movzx r6, byte [r4+r2 ]
629 movzx e_reg, byte [r3+r1 ]
630 movzx r6, byte [r4+r2*2 ]
637 movzx e_reg, byte [r3+r2 ]
650 movzx r3, byte [r4+r2*2 ]
651 movzx r4, byte [r0+r1 +7]
673 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
674 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
677 paddw m2, m0 ; a + {4,5,6,7}*H
684 mova m3, m0 ; b[0..7]
687 mova m4, m0 ; V+b[0..7]
694 mova m3, m0 ; b[0..3]
695 mova m4, m2 ; b[4..7]
700 mova m5, m0 ; V+b[0..3]
701 mova m6, m2 ; V+b[4..7]
728 ;-----------------------------------------------------------------------------
729 ; void pred8x8_vertical(uint8_t *src, int stride)
730 ;-----------------------------------------------------------------------------
; MMX variant, declared with 2 arguments and 2 GPRs, so no scratch GPR is
; requested beyond the two argument registers.
732 cglobal pred8x8_vertical_8_mmx, 2,2
744 ;-----------------------------------------------------------------------------
745 ; void pred8x8_horizontal(uint8_t *src, int stride)
746 ;-----------------------------------------------------------------------------
; Declared with 2 arguments and 3 GPRs.
749 cglobal pred8x8_horizontal_8, 2,3
; SPLATB_LOAD is defined outside this view. It is passed the address of the
; byte at column -1 of rows r0 and r0+r1; presumably it broadcasts that byte
; across m0/m1 with m2 as a helper register -- confirm.
755 SPLATB_LOAD m0, r0+r1*0-1, m2
756 SPLATB_LOAD m1, r0+r1*1-1, m2
773 ;-----------------------------------------------------------------------------
774 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
775 ;-----------------------------------------------------------------------------
; Declared with 2 arguments and 5 GPRs.
776 cglobal pred8x8_top_dc_8_mmxext, 2,5
; pshufw with selector 0 replicates word 0 into all four words of mm0.
; packuswb then narrows the words of mm0 and mm1 to bytes with unsigned
; saturation, giving dc0 in the low four bytes and dc1 in the high four.
793 pshufw mm0, mm0, 0 ; dc0 (w)
794 packuswb mm0, mm1 ; dc0,dc1 (b)
806 ;-----------------------------------------------------------------------------
807 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
808 ;-----------------------------------------------------------------------------
; Declared with 2 arguments and 5 GPRs.
811 cglobal pred8x8_dc_8_mmxext, 2,5
; Two groups of zero-extending byte loads from column -1 of rows r0+r1 and
; r0+r1*2.
; NOTE(review): presumably the upper and lower halves of the left column, with
; r0 advanced and the values summed by elided instructions -- confirm.
820 movzx r2d, byte [r0+r1*1-1]
821 movzx r3d, byte [r0+r1*2-1]
824 movzx r3d, byte [r0+r1*1-1]
826 movzx r3d, byte [r0+r1*2-1]
830 movzx r2d, byte [r0+r1*1-1]
831 movzx r3d, byte [r0+r1*2-1]
834 movzx r3d, byte [r0+r1*1-1]
836 movzx r3d, byte [r0+r1*2-1]
; The four partial sums s0..s3 are packed as words. The pshufw selectors pick
; (s2,s1,s3,s3) and (s0,s1,s3,s1) from low word to high word.
843 punpckldq m0, m2 ; s0, s1, s2, s3
844 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
846 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
850 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
867 ;-----------------------------------------------------------------------------
868 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
869 ;-----------------------------------------------------------------------------
; Declared with 2 arguments and 7 GPRs. The visible loads have the same shape
; as those in the PRED16x16_DC macro.
871 cglobal pred8x8_dc_rv40_8_mmxext, 2,7
; Zero-extending byte loads from [r0] and [r0+r1].
877 movzx r5d, byte [r0+r1*1]
881 movzx r2d, byte [r0+r1*0]
882 movzx r3d, byte [r0+r1*1]
887 movzx r2d, byte [r0+r1*0]
903 ;-----------------------------------------------------------------------------
904 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
905 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8_tm_vp8_8 (2 arguments, 6 GPRs).
907 %macro PRED8x8_TM_MMX 0
908 cglobal pred8x8_tm_vp8_8, 2,6
; Byte loads from column -1: one at r0, then at rows r0+r1 and r0+r1*2.
915 movzx r4d, byte [r0-1]
918 movzx r2d, byte [r0+r1*1-1]
919 movzx r3d, byte [r0+r1*2-1]
; SSE2 variant of pred8x8_tm_vp8: 2 arguments, 6 GPRs, 4 XMM registers.
948 cglobal pred8x8_tm_vp8_8_sse2, 2,6,4
; Byte loads from column -1 (r0, r0+r1, r0+r1*2).
953 movzx r4d, byte [r0-1]
956 movzx r2d, byte [r0+r1*1-1]
957 movzx r3d, byte [r0+r1*2-1]
; Broadcast word 0 of xmm2 and xmm3 to all eight word lanes.
962 pshuflw xmm2, xmm2, 0
963 pshuflw xmm3, xmm3, 0
964 punpcklqdq xmm2, xmm2
965 punpcklqdq xmm3, xmm3
; movhps stores the upper 8 bytes of xmm2 to the row at r0+r1*2.
970 movhps [r0+r1*2], xmm2
; SSSE3 variant of pred8x8_tm_vp8: 2 arguments, 3 GPRs, 6 XMM registers.
976 cglobal pred8x8_tm_vp8_8_ssse3, 2,3,6
; Load the tm_shuf constant with an aligned load (presumably used as a pshufb
; mask by elided code -- confirm).
978 movdqa xmm4, [tm_shuf]
; 4-byte loads that end at column -1 of rows r0+r1 and r0+r1*2, so the byte
; at column -1 is byte 3 of each register.
986 movd xmm2, [r0+r1*1-4]
987 movd xmm3, [r0+r1*2-4]
; movhps stores the upper 8 bytes of xmm2 to the row at r0+r1*2.
996 movhps [r0+r1*2], xmm2
; Five-operand helper macro used throughout the 4x4 and 8x8l predictors.
; Operands, in order:
1002 ; dest, left, right, src, tmp
1003 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; i.e. a rounded 1-2-1 filter. The macro body is not visible in this view.
1004 %macro PRED4x4_LOWPASS 5
1014 ;-----------------------------------------------------------------------------
1015 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1016 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and 4 GPRs.
1017 %macro PRED8x8L_TOP_DC 1
1018 cglobal pred8x8l_top_dc_8_%1, 4,4
; Build the two neighbour vectors (byte shifts by 7 and by 1) that feed the
; low-pass filter below.
1026 PALIGNR mm2, mm0, 7, mm0
1027 PALIGNR mm1, mm4, 1, mm4
; r1 = has_topleft, r2 = has_topright. The branches taken on these tests are
; not visible in this view.
1028 test r1, r1 ; top_left
1030 test r2, r2 ; top_right
1039 test r2, r2 ; top_right
1048 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Instantiate twice, choosing the PALIGNR implementation for each build.
1065 %define PALIGNR PALIGNR_MMX
1066 PRED8x8L_TOP_DC mmxext
1067 %define PALIGNR PALIGNR_SSSE3
1068 PRED8x8L_TOP_DC ssse3
1070 ;-----------------------------------------------------------------------------
1071 ; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1072 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and 5 GPRs (r4 is used as an extra address base below).
1074 %macro PRED8x8L_DC 1
1075 cglobal pred8x8l_dc_8_%1, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
; punpckhbw interleaves the high halves of two rows.
; NOTE(review): r0/r4 are presumably moved between these groups by elided
; lea/sub instructions -- confirm.
1078 movq mm0, [r0+r3*1-8]
1079 punpckhbw mm0, [r0+r3*0-8]
1080 movq mm1, [r4+r3*1-8]
1081 punpckhbw mm1, [r0+r3*2-8]
1085 movq mm2, [r0+r3*1-8]
1086 punpckhbw mm2, [r0+r3*0-8]
1088 movq mm3, [r0+r3*1-8]
1089 punpckhbw mm3, [r0+r3*0-8]
1093 movq mm0, [r0+r3*0-8]
1098 PALIGNR mm4, mm0, 7, mm0
1099 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter passes (see PRED4x4_LOWPASS for the formula).
1126 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1129 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1131 PALIGNR mm7, mm1, 7, mm3
1137 PALIGNR mm2, mm0, 7, mm0
1138 PALIGNR mm1, mm4, 1, mm4
1145 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; PALIGNR is switched between the MMX and SSSE3 implementations. The macro
; invocations that follow each %define are not visible in this view.
1168 %define PALIGNR PALIGNR_MMX
1170 %define PALIGNR PALIGNR_SSSE3
1173 ;-----------------------------------------------------------------------------
1174 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1175 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and only 4 GPRs.
; NOTE(review): r1 and r2 appear as address bases below, so they have
; presumably been repurposed as row pointers by elided code -- confirm.
1177 %macro PRED8x8L_HORIZONTAL 1
1178 cglobal pred8x8l_horizontal_8_%1, 4,4
; Gather bytes from the 8 bytes ending at column -1 of several rows.
1181 movq mm0, [r0+r3*1-8]
1185 punpckhbw mm0, [r1+r3*0-8]
1186 movq mm1, [r2+r3*1-8]
1187 punpckhbw mm1, [r0+r3*2-8]
1191 movq mm2, [r0+r3*1-8]
1192 punpckhbw mm2, [r0+r3*0-8]
1194 movq mm3, [r0+r3*1-8]
1195 punpckhbw mm3, [r0+r3*0-8]
1199 movq mm0, [r0+r3*0-8]
1200 movq mm1, [r1+r3*0-8]
1204 PALIGNR mm4, mm0, 7, mm0
1205 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter passes (see PRED4x4_LOWPASS for the formula).
1207 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1210 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1212 PALIGNR mm7, mm1, 7, mm3
; pshufw selectors 0xff/0xaa/0x55/0x00 broadcast word 3/2/1/0 of the source
; into all four words of the destination. The sources are mm3 and mm7.
1218 pshufw mm0, mm3, 0xff
1219 pshufw mm1, mm3, 0xaa
1221 pshufw mm2, mm3, 0x55
1222 pshufw mm3, mm3, 0x00
1223 pshufw mm4, mm7, 0xff
1224 pshufw mm5, mm7, 0xaa
1225 pshufw mm6, mm7, 0x55
1226 pshufw mm7, mm7, 0x00
; Instantiate twice, choosing the PALIGNR implementation for each build.
1240 %define PALIGNR PALIGNR_MMX
1241 PRED8x8L_HORIZONTAL mmxext
1242 %define PALIGNR PALIGNR_SSSE3
1243 PRED8x8L_HORIZONTAL ssse3
1245 ;-----------------------------------------------------------------------------
1246 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1247 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and 4 GPRs. The visible instructions match PRED8x8L_TOP_DC.
1249 %macro PRED8x8L_VERTICAL 1
1250 cglobal pred8x8l_vertical_8_%1, 4,4
1257 PALIGNR mm2, mm0, 7, mm0
1258 PALIGNR mm1, mm4, 1, mm4
; r1 = has_topleft, r2 = has_topright. The branch targets are not visible.
1259 test r1, r1 ; top_left
1261 test r2, r2 ; top_right
1270 test r2, r2 ; top_right
1279 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Instantiate twice, choosing the PALIGNR implementation for each build.
1291 %define PALIGNR PALIGNR_MMX
1292 PRED8x8L_VERTICAL mmxext
1293 %define PALIGNR PALIGNR_SSSE3
1294 PRED8x8L_VERTICAL ssse3
1296 ;-----------------------------------------------------------------------------
1297 ; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1298 ;-----------------------------------------------------------------------------
; MMXEXT variant, written directly rather than through a macro. PALIGNR is
; bound to the MMX implementation. Declared with 4 arguments and 5 GPRs.
1301 %define PALIGNR PALIGNR_MMX
1302 cglobal pred8x8l_down_left_8_mmxext, 4,5
1309 PALIGNR mm2, mm0, 7, mm0
1310 PALIGNR mm1, mm4, 1, mm4
; Selector 0xFF broadcasts word 3 of mm3 into all four words of mm1.
1333 pshufw mm1, mm3, 0xFF
1336 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1345 PALIGNR mm2, mm3, 7, mm3
1346 PALIGNR mm5, mm4, 1, mm4
1347 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1355 PALIGNR mm2, mm7, 1, mm0
1357 PALIGNR mm3, mm7, 7, mm0
1358 PALIGNR mm4, mm6, 1, mm0
; Two further low-pass passes, with results in mm0 and mm1.
1364 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1365 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; XMM-based variants of pred8x8l_down_left. %1 is the CPU suffix spliced into
; the function name. Declared with 4 arguments and 4 GPRs.
; NOTE(review): r1 and r2 appear as store bases near the end, so they have
; presumably been repurposed as row pointers by elided code -- confirm.
1409 %macro PRED8x8L_DOWN_LEFT 1
1410 cglobal pred8x8l_down_left_8_%1, 4,4
1417 PALIGNR mm2, mm0, 7, mm0
1418 PALIGNR mm1, mm4, 1, mm4
; r1 = has_topleft, r2 = has_topright. The branch targets are not visible.
1419 test r1, r1 ; top_left
1421 test r2, r2 ; top_right
1430 test r2, r2 ; top_right
; Selector 0xFF broadcasts word 3 of mm3 into all four words of mm1.
1441 pshufw mm1, mm3, 0xFF
1444 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1446 test r2, r2 ; top_right
1453 PALIGNR mm2, mm3, 7, mm3
1454 PALIGNR mm5, mm4, 1, mm4
1455 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final filter pass on 128-bit registers.
1471 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Each movq stores the low 8 bytes of xmm0 to one output row. xmm0 is
; presumably shifted between stores by elided instructions -- confirm.
1473 movq [r0+r3*1], xmm0
1475 movq [r0+r3*2], xmm0
1478 movq [r1+r3*1], xmm0
1480 movq [r1+r3*2], xmm0
1482 movq [r2+r3*1], xmm0
1484 movq [r2+r3*2], xmm0
1486 movq [r0+r3*1], xmm0
1488 movq [r0+r3*2], xmm0
; Instantiate for SSE2 (MMX-style PALIGNR emulation) and for SSSE3.
1493 %define PALIGNR PALIGNR_MMX
1494 PRED8x8L_DOWN_LEFT sse2
1496 %define PALIGNR PALIGNR_SSSE3
1497 PRED8x8L_DOWN_LEFT ssse3
1499 ;-----------------------------------------------------------------------------
1500 ; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1501 ;-----------------------------------------------------------------------------
; MMXEXT variant, written directly rather than through a macro. PALIGNR is
; bound to the MMX implementation. Declared with 4 arguments and 5 GPRs
; (r4 is used as an extra address base).
1504 %define PALIGNR PALIGNR_MMX
1505 cglobal pred8x8l_down_right_8_mmxext, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
1508 movq mm0, [r0+r3*1-8]
1509 punpckhbw mm0, [r0+r3*0-8]
1510 movq mm1, [r4+r3*1-8]
1511 punpckhbw mm1, [r0+r3*2-8]
1515 movq mm2, [r0+r3*1-8]
1516 punpckhbw mm2, [r0+r3*0-8]
1518 movq mm3, [r0+r3*1-8]
1519 punpckhbw mm3, [r0+r3*0-8]
1523 movq mm0, [r0+r3*0-8]
1528 PALIGNR mm4, mm0, 7, mm0
1529 PALIGNR mm1, mm2, 1, mm2
; r1 = has_topleft, r2 = has_topright. The branch targets are not visible.
1530 test r1, r1 ; top_left
1534 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1538 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1540 PALIGNR mm7, mm1, 7, mm3
1546 PALIGNR mm2, mm0, 7, mm0
1547 PALIGNR mm1, mm4, 1, mm4
1548 test r1, r1 ; top_left
1550 test r2, r2 ; top_right
1553 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1569 test r2, r2 ; top_right
1585 PALIGNR mm2, mm6, 1, mm0
1587 PALIGNR mm3, mm6, 7, mm0
; Two further low-pass passes, with results in mm0 and mm1.
1591 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1592 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; XMM-based variants of pred8x8l_down_right. %1 is the CPU suffix spliced into
; the function name. Declared with 4 arguments and 5 GPRs.
1636 %macro PRED8x8L_DOWN_RIGHT 1
1637 cglobal pred8x8l_down_right_8_%1, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
1640 movq mm0, [r0+r3*1-8]
1641 punpckhbw mm0, [r0+r3*0-8]
1642 movq mm1, [r4+r3*1-8]
1643 punpckhbw mm1, [r0+r3*2-8]
1647 movq mm2, [r0+r3*1-8]
1648 punpckhbw mm2, [r0+r3*0-8]
1650 movq mm3, [r0+r3*1-8]
1651 punpckhbw mm3, [r0+r3*0-8]
1655 movq mm0, [r0+r3*0-8]
1660 PALIGNR mm4, mm0, 7, mm0
1661 PALIGNR mm1, mm2, 1, mm2
1689 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1693 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1695 PALIGNR mm7, mm1, 7, mm3
1702 PALIGNR mm2, mm0, 7, mm0
1703 PALIGNR mm1, mm4, 1, mm4
1709 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
; Final filter pass on 128-bit registers.
1726 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Row stores come in pairs: the low 8 bytes of xmm0 go to base+r3*2 and the
; low 8 bytes of xmm1 go to base+r3*1, for the bases r0, r2, r1 and r4.
; NOTE(review): the registers are presumably shifted between pairs by elided
; instructions -- confirm.
1729 movq [r0+r3*2], xmm0
1730 movq [r0+r3*1], xmm1
1733 movq [r2+r3*2], xmm0
1734 movq [r2+r3*1], xmm1
1737 movq [r1+r3*2], xmm0
1738 movq [r1+r3*1], xmm1
1741 movq [r4+r3*2], xmm0
1742 movq [r4+r3*1], xmm1
; Instantiate for SSE2 (MMX-style PALIGNR emulation) and for SSSE3.
1747 %define PALIGNR PALIGNR_MMX
1748 PRED8x8L_DOWN_RIGHT sse2
1750 %define PALIGNR PALIGNR_SSSE3
1751 PRED8x8L_DOWN_RIGHT ssse3
1753 ;-----------------------------------------------------------------------------
1754 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1755 ;-----------------------------------------------------------------------------
; MMXEXT variant, written directly rather than through a macro. PALIGNR is
; bound to the MMX implementation. Declared with 4 arguments and 5 GPRs.
1758 %define PALIGNR PALIGNR_MMX
1759 cglobal pred8x8l_vertical_right_8_mmxext, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
1762 movq mm0, [r0+r3*1-8]
1763 punpckhbw mm0, [r0+r3*0-8]
1764 movq mm1, [r4+r3*1-8]
1765 punpckhbw mm1, [r0+r3*2-8]
1769 movq mm2, [r0+r3*1-8]
1770 punpckhbw mm2, [r0+r3*0-8]
1772 movq mm3, [r0+r3*1-8]
1773 punpckhbw mm3, [r0+r3*0-8]
1777 movq mm0, [r0+r3*0-8]
1782 PALIGNR mm4, mm0, 7, mm0
1783 PALIGNR mm1, mm2, 1, mm2
1811 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1818 PALIGNR mm2, mm0, 7, mm0
1819 PALIGNR mm1, mm4, 1, mm4
1825 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1829 PALIGNR mm3, mm7, 7, mm0
1830 PALIGNR mm6, mm7, 6, mm1
1834 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1845 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
; Alternating PALIGNR steps into mm6 and mm5.
; NOTE(review): presumably each step produces one output row, with the stores
; and shifts between them elided -- confirm.
1846 PALIGNR mm6, mm0, 7, mm2
1849 PALIGNR mm5, mm0, 7, mm1
1852 PALIGNR mm6, mm0, 7, mm2
1855 PALIGNR mm5, mm0, 7, mm1
1858 PALIGNR mm6, mm0, 7, mm2
1861 PALIGNR mm5, mm0, 7, mm1
; XMM-based variants of pred8x8l_vertical_right. %1 is the CPU suffix spliced
; into the function name. Declared with 4 arguments, 5 GPRs and 7 vector
; registers.
1865 %macro PRED8x8L_VERTICAL_RIGHT 1
1866 cglobal pred8x8l_vertical_right_8_%1, 4,5,7
1867 ; manually spill XMM registers for Win64 because
1868 ; the code here is initialized with INIT_MMX
; Gather bytes from the 8 bytes ending at column -1 of several rows.
1872 movq mm0, [r0+r3*1-8]
1873 punpckhbw mm0, [r0+r3*0-8]
1874 movq mm1, [r4+r3*1-8]
1875 punpckhbw mm1, [r0+r3*2-8]
1879 movq mm2, [r0+r3*1-8]
1880 punpckhbw mm2, [r0+r3*0-8]
1882 movq mm3, [r0+r3*1-8]
1883 punpckhbw mm3, [r0+r3*0-8]
1887 movq mm0, [r0+r3*0-8]
1892 PALIGNR mm4, mm0, 7, mm0
1893 PALIGNR mm1, mm2, 1, mm2
1920 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1927 PALIGNR mm2, mm0, 7, mm0
1928 PALIGNR mm1, mm4, 1, mm4
1934 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Load the 0xff00 word mask with an aligned load.
1939 movdqa xmm6, [pw_ff00]
1948 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; The first row pair is written from the upper 8 bytes (movhps) of xmm5 and
; xmm2; the remaining pairs are written from the lower 8 bytes (movq).
1954 movhps [r0+r3*2], xmm5
1955 movhps [r0+r3*1], xmm2
1963 movq [r0+r3*2], xmm5
1964 movq [r0+r3*1], xmm2
1967 movq [r2+r3*2], xmm5
1968 movq [r2+r3*1], xmm2
1971 movq [r1+r3*2], xmm5
1972 movq [r1+r3*1], xmm2
; Instantiate for SSE2 (MMX-style PALIGNR emulation) and for SSSE3.
1977 %define PALIGNR PALIGNR_MMX
1978 PRED8x8L_VERTICAL_RIGHT sse2
1980 %define PALIGNR PALIGNR_SSSE3
1981 PRED8x8L_VERTICAL_RIGHT ssse3
1983 ;-----------------------------------------------------------------------------
1984 ; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1985 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and 4 GPRs.
; NOTE(review): r1 and r2 appear as store bases below, so they have presumably
; been repurposed as row pointers by elided code -- confirm.
1987 %macro PRED8x8L_VERTICAL_LEFT 1
1988 cglobal pred8x8l_vertical_left_8_%1, 4,4
1995 PALIGNR mm2, mm0, 7, mm0
1996 PALIGNR mm1, mm4, 1, mm4
; Selector 0xFF broadcasts word 3 of mm3 into all four words of mm1.
2019 pshufw mm1, mm3, 0xFF
2022 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2031 PALIGNR mm2, mm3, 7, mm3
2032 PALIGNR mm5, mm4, 1, mm4
2033 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final filter pass on 128-bit registers.
2047 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
; Row stores come in pairs: the low 8 bytes of xmm3 go to base+r3*1 and the
; low 8 bytes of xmm0 go to base+r3*2.
2049 movq [r0+r3*1], xmm3
2050 movq [r0+r3*2], xmm0
2054 movq [r1+r3*1], xmm3
2055 movq [r1+r3*2], xmm0
2058 movq [r2+r3*1], xmm3
2059 movq [r2+r3*2], xmm0
2062 movq [r0+r3*1], xmm3
2063 movq [r0+r3*2], xmm0
; Instantiate for SSE2 (MMX-style PALIGNR emulation) and for SSSE3.
2068 %define PALIGNR PALIGNR_MMX
2069 PRED8x8L_VERTICAL_LEFT sse2
2070 %define PALIGNR PALIGNR_SSSE3
2072 PRED8x8L_VERTICAL_LEFT ssse3
2074 ;-----------------------------------------------------------------------------
2075 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2076 ;-----------------------------------------------------------------------------
; %1 is the CPU suffix spliced into the function name. Declared with
; 4 arguments and only 4 GPRs.
; NOTE(review): r1 and r2 appear as address bases below, so they have
; presumably been repurposed as row pointers by elided code -- confirm.
2078 %macro PRED8x8L_HORIZONTAL_UP 1
2079 cglobal pred8x8l_horizontal_up_8_%1, 4,4
; Gather bytes from the 8 bytes ending at column -1 of several rows.
2082 movq mm0, [r0+r3*1-8]
2086 punpckhbw mm0, [r1+r3*0-8]
2087 movq mm1, [r2+r3*1-8]
2088 punpckhbw mm1, [r0+r3*2-8]
2092 movq mm2, [r0+r3*1-8]
2093 punpckhbw mm2, [r0+r3*0-8]
2095 movq mm3, [r0+r3*1-8]
2096 punpckhbw mm3, [r0+r3*0-8]
2100 movq mm0, [r0+r3*0-8]
2101 movq mm1, [r1+r3*0-8]
2105 PALIGNR mm4, mm0, 7, mm0
2106 PALIGNR mm1, mm2, 1, mm2
2108 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2111 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2113 PALIGNR mm7, mm1, 7, mm3
; Rearrange the filtered left column (l0..l7). The trailing comments list
; bytes from the most significant byte to the least significant.
2115 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2116 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2120 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2127 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2129 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2131 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2133 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2134 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; Build further rows by shifting in 2, 4 and 6 bytes (PALIGNR) and by word
; shuffles (pshufw).
2138 PALIGNR mm5, mm4, 2, mm1
2139 pshufw mm1, mm6, 11111001b
2140 PALIGNR mm6, mm4, 4, mm2
2141 pshufw mm2, mm7, 11111110b
2142 PALIGNR mm7, mm4, 6, mm3
2143 pshufw mm3, mm0, 11111111b
; Instantiate twice, choosing the PALIGNR implementation for each build.
2157 %define PALIGNR PALIGNR_MMX
2158 PRED8x8L_HORIZONTAL_UP mmxext
2159 %define PALIGNR PALIGNR_SSSE3
2160 PRED8x8L_HORIZONTAL_UP ssse3
2162 ;-----------------------------------------------------------------------------
2163 ; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2164 ;-----------------------------------------------------------------------------
; MMXEXT variant, written directly rather than through a macro. PALIGNR is
; bound to the MMX implementation. Declared with 4 arguments and 5 GPRs.
2167 %define PALIGNR PALIGNR_MMX
2168 cglobal pred8x8l_horizontal_down_8_mmxext, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
2171 movq mm0, [r0+r3*1-8]
2172 punpckhbw mm0, [r0+r3*0-8]
2173 movq mm1, [r4+r3*1-8]
2174 punpckhbw mm1, [r0+r3*2-8]
2178 movq mm2, [r0+r3*1-8]
2179 punpckhbw mm2, [r0+r3*0-8]
2181 movq mm3, [r0+r3*1-8]
2182 punpckhbw mm3, [r0+r3*0-8]
2186 movq mm0, [r0+r3*0-8]
2191 PALIGNR mm4, mm0, 7, mm0
2192 PALIGNR mm1, mm2, 1, mm2
2219 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2223 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2225 PALIGNR mm7, mm1, 7, mm3
2231 PALIGNR mm2, mm0, 7, mm0
2232 PALIGNR mm1, mm4, 1, mm4
2238 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2245 PALIGNR mm2, mm6, 7, mm5
2246 PALIGNR mm6, mm7, 7, mm0
2248 PALIGNR mm4, mm3, 1, mm7
2251 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2257 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; Two runs of PALIGNR by 2, 4 and 6 bytes, taking mm3 and then mm4 as the
; source.
; NOTE(review): presumably each step forms one output row -- confirm.
2265 PALIGNR mm7, mm3, 2, mm5
2267 PALIGNR mm1, mm3, 4, mm5
2269 PALIGNR mm0, mm3, 6, mm3
2274 PALIGNR mm6, mm4, 2, mm5
2276 PALIGNR mm2, mm4, 4, mm5
2278 PALIGNR mm3, mm4, 6, mm4
; XMM-based variants of pred8x8l_horizontal_down. %1 is the CPU suffix spliced
; into the function name. Declared with 4 arguments and 5 GPRs.
2282 %macro PRED8x8L_HORIZONTAL_DOWN 1
2283 cglobal pred8x8l_horizontal_down_8_%1, 4,5
; Gather bytes from the 8 bytes ending at column -1 of several rows.
2286 movq mm0, [r0+r3*1-8]
2287 punpckhbw mm0, [r0+r3*0-8]
2288 movq mm1, [r4+r3*1-8]
2289 punpckhbw mm1, [r0+r3*2-8]
2293 movq mm2, [r0+r3*1-8]
2294 punpckhbw mm2, [r0+r3*0-8]
2296 movq mm3, [r0+r3*1-8]
2297 punpckhbw mm3, [r0+r3*0-8]
2301 movq mm0, [r0+r3*0-8]
2306 PALIGNR mm4, mm0, 7, mm0
2307 PALIGNR mm1, mm2, 1, mm2
; Selector 0xFF broadcasts word 3 of mm3 into all four words of mm1.
2334 pshufw mm1, mm3, 0xFF
2338 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2342 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2352 PALIGNR mm2, mm0, 7, mm0
2353 PALIGNR mm1, mm4, 1, mm4
2359 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2368 PALIGNR mm2, mm3, 7, mm3
2369 PALIGNR mm5, mm4, 1, mm4
2370 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; 128-bit stage: byte shifts by 7, 9 and 8 feed the final filter pass.
2379 PALIGNR xmm1, xmm0, 7, xmm4
2380 PALIGNR xmm2, xmm0, 9, xmm5
2382 PALIGNR xmm3, xmm0, 8, xmm0
2386 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
; Interleave the low bytes of xmm4 with the low bytes of the filtered xmm0.
2387 punpcklbw xmm4, xmm0
; Row stores come in pairs: the low 8 bytes of xmm4 go to one row and the low
; 8 bytes of xmm0 go to another.
; NOTE(review): the registers are presumably shifted between pairs by elided
; instructions -- confirm.
2389 movq [r0+r3*2], xmm4
2390 movq [r2+r3*2], xmm0
2393 movq [r0+r3*1], xmm4
2394 movq [r2+r3*1], xmm0
2397 movq [r1+r3*2], xmm4
2398 movq [r4+r3*2], xmm0
2401 movq [r1+r3*1], xmm4
2402 movq [r4+r3*1], xmm0
; Instantiate for SSE2 (MMX-style PALIGNR emulation) and for SSSE3.
2407 %define PALIGNR PALIGNR_MMX
2408 PRED8x8L_HORIZONTAL_DOWN sse2
2410 %define PALIGNR PALIGNR_SSSE3
2411 PRED8x8L_HORIZONTAL_DOWN ssse3
2413 ;-----------------------------------------------------------------------------
2414 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2415 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 5 GPRs. In the 4x4 functions the stride is the
; third argument, so it is held in r2.
2417 cglobal pred4x4_dc_8_mmxext, 3,5
; Byte loads from column -1. They target r1d, which overwrites the topright
; argument register.
2423 movzx r1d, byte [r0+r2*1-1]
2426 movzx r1d, byte [r0+r2*2-1]
2429 movzx r1d, byte [r0+r2*1-1]
2431 movzx r1d, byte [r0+r2*2-1]
; Multiplying by 0x01010101 replicates a value in 0..255 into all four bytes
; of r3d.
2435 imul r3d, 0x01010101
2442 ;-----------------------------------------------------------------------------
2443 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2444 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred4x4_tm_vp8_8 (3 arguments, 6 GPRs).
2446 %macro PRED4x4_TM_MMX 0
2447 cglobal pred4x4_tm_vp8_8, 3,6
; Byte loads from column -1: one at r0, then at rows r0+r2 and r0+r2*2
; (r2 = stride).
2452 movzx r4d, byte [r0-1]
2455 movzx r1d, byte [r0+r2*1-1]
2456 movzx r3d, byte [r0+r2*2-1]
; SSSE3 variant of pred4x4_tm_vp8: 3 arguments, 3 GPRs.
2488 cglobal pred4x4_tm_vp8_8_ssse3, 3,3
; 4-byte loads that end at column -1 of four rows.
; NOTE(review): r1 is used as a second row base here, so it has presumably
; been repointed by elided code -- confirm.
2497 movd mm2, [r0+r2*1-4]
2498 movd mm3, [r0+r2*2-4]
2499 movd mm4, [r1+r2*1-4]
2500 movd mm5, [r1+r2*2-4]
2523 ;-----------------------------------------------------------------------------
2524 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2525 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. The top row is low-pass filtered
; before use: m2 holds t0..t3, [r1] supplies t4..t7, and a one-byte right
; shift yields the t1.. neighbour vector.
2528 cglobal pred4x4_vertical_vp8_8_mmxext, 3,3
2532 mova m2, m0 ;t0 t1 t2 t3
2533 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2535 psrlq m0, 8 ;t1 t2 t3 t4
2536 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2543 ;-----------------------------------------------------------------------------
2544 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2545 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. Only the filter step is visible:
; m0 = lowpass(left = m1, right = m2, src = m3), with m4 as scratch.
2547 cglobal pred4x4_down_left_8_mmxext, 3,3
2557 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2569 ;-----------------------------------------------------------------------------
2570 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2571 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. Only the filter step is visible:
; m0 = lowpass(left = m1, right = m2, src = m3), with m5 as scratch.
2574 cglobal pred4x4_vertical_left_8_mmxext, 3,3
2584 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2594 ;-----------------------------------------------------------------------------
2595 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2596 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs.
2599 cglobal pred4x4_horizontal_up_8_mmxext, 3,3
; Load the 4 bytes ending at column -1 from four rows and interleave them
; pairwise.
; NOTE(review): r1 is used as a second row base, so it has presumably been
; repointed by elided code -- confirm.
2602 movd m0, [r0+r2*1-4]
2603 punpcklbw m0, [r0+r2*2-4]
2604 movd m1, [r1+r2*1-4]
2605 punpcklbw m1, [r1+r2*2-4]
2617 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2627 ;-----------------------------------------------------------------------------
2628 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2629 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. PALIGNR is bound to the MMX
; implementation. The trailing comments track register contents from the most
; significant byte to the least (lt = top-left, tN = top row, lN = left
; column).
2632 %define PALIGNR PALIGNR_MMX
2633 cglobal pred4x4_horizontal_down_8_mmxext, 3,3
2636 movh m0, [r0-4] ; lt ..
2637 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2638 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
; NOTE(review): r1 is used as a row base for l2/l3, so it has presumably been
; repointed by elided code -- confirm.
2639 movd m1, [r1+r2*2-4] ; l3
2640 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2641 movd m2, [r0+r2*2-4] ; l1
2642 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2643 punpckhwd m1, m2 ; l0 l1 l2 l3
2644 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2648 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2649 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2651 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2654 PALIGNR m3, m5, 6, m4
2663 ;-----------------------------------------------------------------------------
2664 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2665 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. PALIGNR is bound to the MMX
; implementation. Each PALIGNR by 7 shifts in one more neighbour byte at the
; low end, as the trailing comments show.
2668 %define PALIGNR PALIGNR_MMX
2669 cglobal pred4x4_vertical_right_8_mmxext, 3,3
2672 movh m0, [r0] ; ........t3t2t1t0
2674 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2676 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2678 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2680 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2681 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2687 PALIGNR m5, m1, 7, m2
2690 PALIGNR m3, m1, 7, m1
2694 ;-----------------------------------------------------------------------------
2695 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2696 ;-----------------------------------------------------------------------------
; Declared with 3 arguments and 3 GPRs. PALIGNR is bound to the MMX
; implementation.
2699 %define PALIGNR PALIGNR_MMX
2700 cglobal pred4x4_down_right_8_mmxext, 3,3
; Interleave the high bytes of the 8-byte words ending at column -1 of row
; r0+r2 and row r0.
2704 movq m2, [r0+r2*1-8]
2705 punpckhbw m2, [r0-8]
2708 PALIGNR m3, m1, 5, m1
; Shift in one more byte from each of two further rows addressed through r1.
; NOTE(review): r1 has presumably been repointed by elided code -- confirm.
2710 PALIGNR m3, [r1+r2*1-8], 7, m4
2712 PALIGNR m3, [r1+r2*2-8], 7, m4
2713 PRED4x4_LOWPASS m0, m3, m1, m2, m4