1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables referenced by the prediction routines in this file.
; Every table below is exactly 16 bytes long.
30 tm_shuf: times 8 db 0x03, 0x80 ; byte pair 0x03,0x80 repeated 8x; loaded with movdqa by pred8x8_tm_vp8_ssse3 (presumably a pshufb mask that zero-extends byte 3 into every word - confirm)
31 pw_ff00: times 8 dw 0xff00 ; 0xff00 in each of 8 words (selects the high byte of a word); loaded into xmm6 inside the PRED8x8L_VERTICAL_RIGHT macro
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 ; signed byte weights -8..-1 ...
33 db 1, 2, 3, 4, 5, 6, 7, 8 ; ... then 1..8: memory operand of pmaddubsw for the 16x16 plane "H coefficients"
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 ; signed byte weights -4..-1 padded with four zeros ...
35 db 1, 2, 3, 4, 0, 0, 0, 0 ; ... then 1..4 padded with four zeros: memory operand of pmaddubsw for the 8x8 plane "H coefficients"
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 ; words 0..7: pmullw with the splatted H yields 0*H..7*H in both plane predictors
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 ; words 1..8: pmullw weights in the 16x16 plane predictor (upper half also addressed as pw_1to8+8)
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 ; words -8..-1: pmullw weights in the 16x16 plane predictor (upper half also addressed as pw_m8tom1+8)
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 ; words -4..-1 and 1..4 (no zero entry): pmullw weights in the 8x8 plane predictor (upper half also addressed as pw_m4to4+8)
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_mmx, 2,3
71 cglobal pred16x16_vertical_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
91 cglobal pred16x16_horizontal, 2,3
135 ;-----------------------------------------------------------------------------
136 ; void pred16x16_dc(uint8_t *src, int stride)
137 ;-----------------------------------------------------------------------------
139 %macro PRED16x16_DC 0
140 cglobal pred16x16_dc, 2,7
148 movzx r5d, byte [r0+r1*1]
153 movzx r2d, byte [r0+r1*0]
154 movzx r3d, byte [r0+r1*1]
159 movzx r2d, byte [r0+r1*0]
208 ;-----------------------------------------------------------------------------
209 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
210 ;-----------------------------------------------------------------------------
212 %macro PRED16x16_TM_MMX 0
213 cglobal pred16x16_tm_vp8, 2,5
224 movzx r3d, byte [r0-1]
227 movzx r2d, byte [r0+r1-1]
259 cglobal pred16x16_tm_vp8_sse2, 2,6,6
266 movzx r4d, byte [r0-1]
269 movzx r2d, byte [r0+r1*1-1]
270 movzx r3d, byte [r0+r1*2-1]
275 pshuflw xmm2, xmm2, 0
276 pshuflw xmm4, xmm4, 0
277 punpcklqdq xmm2, xmm2
278 punpcklqdq xmm4, xmm4
287 movdqa [r0+r1*1], xmm2
288 movdqa [r0+r1*2], xmm4
294 ;-----------------------------------------------------------------------------
295 ; void pred16x16_plane(uint8_t *src, int stride)
296 ;-----------------------------------------------------------------------------
298 %macro H264_PRED16x16_PLANE 1
299 cglobal pred16x16_plane_%1, 2,9,7
313 pmullw m0, [pw_m8tom1 ]
314 pmullw m1, [pw_m8tom1+8]
315 pmullw m2, [pw_1to8 ]
316 pmullw m3, [pw_1to8 +8]
321 movhps m0, [r0+r1 +8]
322 pmaddubsw m0, [plane_shuf] ; H coefficients
328 pmullw m0, [pw_m8tom1]
352 paddw m0, m1 ; sum of H coefficients
364 movzx e_reg, byte [r3+r2*2 ]
365 movzx r5, byte [r4+r1 ]
368 movzx e_reg, byte [r3+r2 ]
373 movzx e_reg, byte [r3+r1 ]
374 movzx r6, byte [r4+r2*2 ]
378 movzx e_reg, byte [r3 ]
380 movzx r7, byte [r4+r2 ]
383 movzx r6, byte [r4+r2 ]
392 movzx r4, byte [e_reg+r2 ]
404 movzx r4, byte [e_reg ]
406 movzx r7, byte [r3 +r2 ]
410 movzx r6, byte [r3 +r2 ]
416 movzx r4, byte [e_reg+r1 ]
417 movzx r6, byte [r3 +r2*2]
424 movzx r4, byte [e_reg+r2*2]
425 movzx r6, byte [r3 +r1 ]
428 add r5, r6 ; sum of V coefficients
445 lea r5, [r5*5] ; 5*(V/4)
449 sar r5, 4 ; (5*(V/4))/16
452 movzx r4, byte [r0+r1 +15]
453 movzx r3, byte [r3+r2*2 ]
471 lea r1d, [r1d*5] ; 5*(H/4)
475 sar r1d, 4 ; (5*(H/4))/16
490 punpcklqdq m0, m0 ; splat H (words)
491 punpcklqdq m1, m1 ; splat V (words)
492 punpcklqdq m3, m3 ; splat a (words)
512 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
521 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
522 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
524 paddw m5, m0 ; a + {8,9,10,11}*H
525 paddw m6, m0 ; a + {12,13,14,15}*H
530 mova m3, m0 ; b[0..7]
531 mova m4, m2 ; b[8..15]
537 mova m3, m5 ; b[8..11]
538 mova m4, m6 ; b[12..15]
551 mova m3, m0 ; b[0..7]
552 mova m4, m2 ; b[8..15]
558 mova m3, m5 ; b[8..11]
559 mova m4, m6 ; b[12..15]
579 H264_PRED16x16_PLANE h264
580 H264_PRED16x16_PLANE rv40
581 H264_PRED16x16_PLANE svq3
583 H264_PRED16x16_PLANE h264
584 H264_PRED16x16_PLANE rv40
585 H264_PRED16x16_PLANE svq3
587 H264_PRED16x16_PLANE h264
588 H264_PRED16x16_PLANE rv40
589 H264_PRED16x16_PLANE svq3
591 H264_PRED16x16_PLANE h264
592 H264_PRED16x16_PLANE rv40
593 H264_PRED16x16_PLANE svq3
596 ;-----------------------------------------------------------------------------
597 ; void pred8x8_plane(uint8_t *src, int stride)
598 ;-----------------------------------------------------------------------------
600 %macro H264_PRED8x8_PLANE 0
601 cglobal pred8x8_plane, 2,9,7
611 pmullw m0, [pw_m4to4]
612 pmullw m1, [pw_m4to4+8]
615 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
616 pmaddubsw m0, [plane8_shuf] ; H coefficients
622 pmullw m0, [pw_m4to4]
628 %if notcpuflag(ssse3)
629 %if cpuflag(sse2) ; mmsize == 16
648 paddw m0, m1 ; sum of H coefficients
660 movzx e_reg, byte [r3+r2*2 ]
661 movzx r5, byte [r4+r1 ]
664 movzx e_reg, byte [r3 ]
666 movzx r7, byte [r4+r2 ]
670 movzx r6, byte [r4+r2 ]
676 movzx e_reg, byte [r3+r1 ]
677 movzx r6, byte [r4+r2*2 ]
684 movzx e_reg, byte [r3+r2 ]
697 movzx r3, byte [r4+r2*2 ]
698 movzx r4, byte [r0+r1 +7]
718 punpcklqdq m0, m0 ; splat H (words)
719 punpcklqdq m1, m1 ; splat V (words)
720 punpcklqdq m3, m3 ; splat a (words)
736 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
737 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
740 paddw m2, m0 ; a + {4,5,6,7}*H
747 mova m3, m0 ; b[0..7]
750 mova m4, m0 ; V+b[0..7]
757 mova m3, m0 ; b[0..3]
758 mova m4, m2 ; b[4..7]
763 mova m5, m0 ; V+b[0..3]
764 mova m6, m2 ; V+b[4..7]
791 ;-----------------------------------------------------------------------------
792 ; void pred8x8_vertical(uint8_t *src, int stride)
793 ;-----------------------------------------------------------------------------
795 cglobal pred8x8_vertical_mmx, 2,2
807 ;-----------------------------------------------------------------------------
808 ; void pred8x8_horizontal(uint8_t *src, int stride)
809 ;-----------------------------------------------------------------------------
812 cglobal pred8x8_horizontal, 2,3
852 ;-----------------------------------------------------------------------------
853 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
854 ;-----------------------------------------------------------------------------
855 cglobal pred8x8_top_dc_mmxext, 2,5
872 pshufw mm0, mm0, 0 ; dc0 (w)
873 packuswb mm0, mm1 ; dc0,dc1 (b)
885 ;-----------------------------------------------------------------------------
886 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
887 ;-----------------------------------------------------------------------------
890 cglobal pred8x8_dc_mmxext, 2,5
899 movzx r2d, byte [r0+r1*1-1]
900 movzx r3d, byte [r0+r1*2-1]
903 movzx r3d, byte [r0+r1*1-1]
905 movzx r3d, byte [r0+r1*2-1]
909 movzx r2d, byte [r0+r1*1-1]
910 movzx r3d, byte [r0+r1*2-1]
913 movzx r3d, byte [r0+r1*1-1]
915 movzx r3d, byte [r0+r1*2-1]
922 punpckldq m0, m2 ; s0, s1, s2, s3
923 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
925 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
929 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
946 ;-----------------------------------------------------------------------------
947 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
948 ;-----------------------------------------------------------------------------
950 cglobal pred8x8_dc_rv40_mmxext, 2,7
956 movzx r5d, byte [r0+r1*1]
960 movzx r2d, byte [r0+r1*0]
961 movzx r3d, byte [r0+r1*1]
966 movzx r2d, byte [r0+r1*0]
982 ;-----------------------------------------------------------------------------
983 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
984 ;-----------------------------------------------------------------------------
986 %macro PRED8x8_TM_MMX 0
987 cglobal pred8x8_tm_vp8, 2,6
994 movzx r4d, byte [r0-1]
997 movzx r2d, byte [r0+r1*1-1]
998 movzx r3d, byte [r0+r1*2-1]
1034 cglobal pred8x8_tm_vp8_sse2, 2,6,4
1038 punpcklbw xmm0, xmm1
1039 movzx r4d, byte [r0-1]
1042 movzx r2d, byte [r0+r1*1-1]
1043 movzx r3d, byte [r0+r1*2-1]
1048 pshuflw xmm2, xmm2, 0
1049 pshuflw xmm3, xmm3, 0
1050 punpcklqdq xmm2, xmm2
1051 punpcklqdq xmm3, xmm3
1055 movq [r0+r1*1], xmm2
1056 movhps [r0+r1*2], xmm2
1062 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
1064 movdqa xmm4, [tm_shuf]
1067 punpcklbw xmm0, xmm1
1072 movd xmm2, [r0+r1*1-4]
1073 movd xmm3, [r0+r1*2-4]
1081 movq [r0+r1*1], xmm2
1082 movhps [r0+r1*2], xmm2
1088 ; dest, left, right, src, tmp
1089 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1090 %macro PRED4x4_LOWPASS 5
1100 ;-----------------------------------------------------------------------------
1101 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1102 ;-----------------------------------------------------------------------------
1103 %macro PRED8x8L_TOP_DC 1
1104 cglobal pred8x8l_top_dc_%1, 4,4
1112 PALIGNR mm2, mm0, 7, mm0
1113 PALIGNR mm1, mm4, 1, mm4
1114 test r1, r1 ; top_left
1116 test r2, r2 ; top_right
1125 test r2, r2 ; top_right
1134 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Expand PRED8x8L_TOP_DC once per instruction set. The macro body invokes
; PALIGNR, so whichever %define is in force at the point of expansion selects
; the implementation used; the macro argument becomes the suffix of the
; cglobal name (pred8x8l_top_dc_mmxext, pred8x8l_top_dc_ssse3).
; PALIGNR_MMX / PALIGNR_SSSE3 are not defined in the visible source
; (presumably provided by x86util.asm - confirm).
1151 %define PALIGNR PALIGNR_MMX
1152 PRED8x8L_TOP_DC mmxext
1153 %define PALIGNR PALIGNR_SSSE3
1154 PRED8x8L_TOP_DC ssse3
1156 ;-----------------------------------------------------------------------------
1157 ; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1158 ;-----------------------------------------------------------------------------
1160 %macro PRED8x8L_DC 1
1161 cglobal pred8x8l_dc_%1, 4,5
1164 movq mm0, [r0+r3*1-8]
1165 punpckhbw mm0, [r0+r3*0-8]
1166 movq mm1, [r4+r3*1-8]
1167 punpckhbw mm1, [r0+r3*2-8]
1171 movq mm2, [r0+r3*1-8]
1172 punpckhbw mm2, [r0+r3*0-8]
1174 movq mm3, [r0+r3*1-8]
1175 punpckhbw mm3, [r0+r3*0-8]
1179 movq mm0, [r0+r3*0-8]
1184 PALIGNR mm4, mm0, 7, mm0
1185 PALIGNR mm1, mm2, 1, mm2
1212 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1215 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1217 PALIGNR mm7, mm1, 7, mm3
1223 PALIGNR mm2, mm0, 7, mm0
1224 PALIGNR mm1, mm4, 1, mm4
1231 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1254 %define PALIGNR PALIGNR_MMX
1256 %define PALIGNR PALIGNR_SSSE3
1259 ;-----------------------------------------------------------------------------
1260 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1261 ;-----------------------------------------------------------------------------
1263 %macro PRED8x8L_HORIZONTAL 1
1264 cglobal pred8x8l_horizontal_%1, 4,4
1267 movq mm0, [r0+r3*1-8]
1271 punpckhbw mm0, [r1+r3*0-8]
1272 movq mm1, [r2+r3*1-8]
1273 punpckhbw mm1, [r0+r3*2-8]
1277 movq mm2, [r0+r3*1-8]
1278 punpckhbw mm2, [r0+r3*0-8]
1280 movq mm3, [r0+r3*1-8]
1281 punpckhbw mm3, [r0+r3*0-8]
1285 movq mm0, [r0+r3*0-8]
1286 movq mm1, [r1+r3*0-8]
1290 PALIGNR mm4, mm0, 7, mm0
1291 PALIGNR mm1, mm2, 1, mm2
1293 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1296 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1298 PALIGNR mm7, mm1, 7, mm3
1304 pshufw mm0, mm3, 0xff
1305 pshufw mm1, mm3, 0xaa
1307 pshufw mm2, mm3, 0x55
1308 pshufw mm3, mm3, 0x00
1309 pshufw mm4, mm7, 0xff
1310 pshufw mm5, mm7, 0xaa
1311 pshufw mm6, mm7, 0x55
1312 pshufw mm7, mm7, 0x00
; Expand PRED8x8L_HORIZONTAL once per instruction set. The macro body invokes
; PALIGNR, so whichever %define is in force at the point of expansion selects
; the implementation used; the macro argument becomes the suffix of the
; cglobal name (pred8x8l_horizontal_mmxext, pred8x8l_horizontal_ssse3).
; PALIGNR_MMX / PALIGNR_SSSE3 are not defined in the visible source
; (presumably provided by x86util.asm - confirm).
1326 %define PALIGNR PALIGNR_MMX
1327 PRED8x8L_HORIZONTAL mmxext
1328 %define PALIGNR PALIGNR_SSSE3
1329 PRED8x8L_HORIZONTAL ssse3
1331 ;-----------------------------------------------------------------------------
1332 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1333 ;-----------------------------------------------------------------------------
1335 %macro PRED8x8L_VERTICAL 1
1336 cglobal pred8x8l_vertical_%1, 4,4
1343 PALIGNR mm2, mm0, 7, mm0
1344 PALIGNR mm1, mm4, 1, mm4
1345 test r1, r1 ; top_left
1347 test r2, r2 ; top_right
1356 test r2, r2 ; top_right
1365 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Expand PRED8x8L_VERTICAL once per instruction set. The macro body invokes
; PALIGNR, so whichever %define is in force at the point of expansion selects
; the implementation used; the macro argument becomes the suffix of the
; cglobal name (pred8x8l_vertical_mmxext, pred8x8l_vertical_ssse3).
; PALIGNR_MMX / PALIGNR_SSSE3 are not defined in the visible source
; (presumably provided by x86util.asm - confirm).
1377 %define PALIGNR PALIGNR_MMX
1378 PRED8x8L_VERTICAL mmxext
1379 %define PALIGNR PALIGNR_SSSE3
1380 PRED8x8L_VERTICAL ssse3
1382 ;-----------------------------------------------------------------------------
1383 ; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1384 ;-----------------------------------------------------------------------------
1387 %define PALIGNR PALIGNR_MMX
1388 cglobal pred8x8l_down_left_mmxext, 4,5
1395 PALIGNR mm2, mm0, 7, mm0
1396 PALIGNR mm1, mm4, 1, mm4
1419 pshufw mm1, mm3, 0xFF
1422 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1431 PALIGNR mm2, mm3, 7, mm3
1432 PALIGNR mm5, mm4, 1, mm4
1433 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1441 PALIGNR mm2, mm7, 1, mm0
1443 PALIGNR mm3, mm7, 7, mm0
1444 PALIGNR mm4, mm6, 1, mm0
1450 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1451 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1495 %macro PRED8x8L_DOWN_LEFT 1
1496 cglobal pred8x8l_down_left_%1, 4,4
1503 PALIGNR mm2, mm0, 7, mm0
1504 PALIGNR mm1, mm4, 1, mm4
1505 test r1, r1 ; top_left
1507 test r2, r2 ; top_right
1516 test r2, r2 ; top_right
1527 pshufw mm1, mm3, 0xFF
1530 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1532 test r2, r2 ; top_right
1539 PALIGNR mm2, mm3, 7, mm3
1540 PALIGNR mm5, mm4, 1, mm4
1541 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1557 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1559 movq [r0+r3*1], xmm0
1561 movq [r0+r3*2], xmm0
1564 movq [r1+r3*1], xmm0
1566 movq [r1+r3*2], xmm0
1568 movq [r2+r3*1], xmm0
1570 movq [r2+r3*2], xmm0
1572 movq [r0+r3*1], xmm0
1574 movq [r0+r3*2], xmm0
1579 %define PALIGNR PALIGNR_MMX
1580 PRED8x8L_DOWN_LEFT sse2
1582 %define PALIGNR PALIGNR_SSSE3
1583 PRED8x8L_DOWN_LEFT ssse3
1585 ;-----------------------------------------------------------------------------
1586 ; void pred8x8l_down_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1587 ;-----------------------------------------------------------------------------
1590 %define PALIGNR PALIGNR_MMX
1591 cglobal pred8x8l_down_right_mmxext, 4,5
1594 movq mm0, [r0+r3*1-8]
1595 punpckhbw mm0, [r0+r3*0-8]
1596 movq mm1, [r4+r3*1-8]
1597 punpckhbw mm1, [r0+r3*2-8]
1601 movq mm2, [r0+r3*1-8]
1602 punpckhbw mm2, [r0+r3*0-8]
1604 movq mm3, [r0+r3*1-8]
1605 punpckhbw mm3, [r0+r3*0-8]
1609 movq mm0, [r0+r3*0-8]
1614 PALIGNR mm4, mm0, 7, mm0
1615 PALIGNR mm1, mm2, 1, mm2
1616 test r1, r1 ; top_left
1620 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1624 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1626 PALIGNR mm7, mm1, 7, mm3
1632 PALIGNR mm2, mm0, 7, mm0
1633 PALIGNR mm1, mm4, 1, mm4
1634 test r1, r1 ; top_left
1636 test r2, r2 ; top_right
1639 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1655 test r2, r2 ; top_right
1671 PALIGNR mm2, mm6, 1, mm0
1673 PALIGNR mm3, mm6, 7, mm0
1677 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1678 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1722 %macro PRED8x8L_DOWN_RIGHT 1
1723 cglobal pred8x8l_down_right_%1, 4,5
1726 movq mm0, [r0+r3*1-8]
1727 punpckhbw mm0, [r0+r3*0-8]
1728 movq mm1, [r4+r3*1-8]
1729 punpckhbw mm1, [r0+r3*2-8]
1733 movq mm2, [r0+r3*1-8]
1734 punpckhbw mm2, [r0+r3*0-8]
1736 movq mm3, [r0+r3*1-8]
1737 punpckhbw mm3, [r0+r3*0-8]
1741 movq mm0, [r0+r3*0-8]
1746 PALIGNR mm4, mm0, 7, mm0
1747 PALIGNR mm1, mm2, 1, mm2
1775 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1779 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1781 PALIGNR mm7, mm1, 7, mm3
1788 PALIGNR mm2, mm0, 7, mm0
1789 PALIGNR mm1, mm4, 1, mm4
1795 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1812 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1815 movq [r0+r3*2], xmm0
1816 movq [r0+r3*1], xmm1
1819 movq [r2+r3*2], xmm0
1820 movq [r2+r3*1], xmm1
1823 movq [r1+r3*2], xmm0
1824 movq [r1+r3*1], xmm1
1827 movq [r4+r3*2], xmm0
1828 movq [r4+r3*1], xmm1
1833 %define PALIGNR PALIGNR_MMX
1834 PRED8x8L_DOWN_RIGHT sse2
1836 %define PALIGNR PALIGNR_SSSE3
1837 PRED8x8L_DOWN_RIGHT ssse3
1839 ;-----------------------------------------------------------------------------
1840 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1841 ;-----------------------------------------------------------------------------
1844 %define PALIGNR PALIGNR_MMX
1845 cglobal pred8x8l_vertical_right_mmxext, 4,5
1848 movq mm0, [r0+r3*1-8]
1849 punpckhbw mm0, [r0+r3*0-8]
1850 movq mm1, [r4+r3*1-8]
1851 punpckhbw mm1, [r0+r3*2-8]
1855 movq mm2, [r0+r3*1-8]
1856 punpckhbw mm2, [r0+r3*0-8]
1858 movq mm3, [r0+r3*1-8]
1859 punpckhbw mm3, [r0+r3*0-8]
1863 movq mm0, [r0+r3*0-8]
1868 PALIGNR mm4, mm0, 7, mm0
1869 PALIGNR mm1, mm2, 1, mm2
1897 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1904 PALIGNR mm2, mm0, 7, mm0
1905 PALIGNR mm1, mm4, 1, mm4
1911 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1915 PALIGNR mm3, mm7, 7, mm0
1916 PALIGNR mm6, mm7, 6, mm1
1920 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1931 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1932 PALIGNR mm6, mm0, 7, mm2
1935 PALIGNR mm5, mm0, 7, mm1
1938 PALIGNR mm6, mm0, 7, mm2
1941 PALIGNR mm5, mm0, 7, mm1
1944 PALIGNR mm6, mm0, 7, mm2
1947 PALIGNR mm5, mm0, 7, mm1
1951 %macro PRED8x8L_VERTICAL_RIGHT 1
1952 cglobal pred8x8l_vertical_right_%1, 4,5,7
1953 ; manually spill XMM registers for Win64 because
1954 ; the code here is initialized with INIT_MMX
1958 movq mm0, [r0+r3*1-8]
1959 punpckhbw mm0, [r0+r3*0-8]
1960 movq mm1, [r4+r3*1-8]
1961 punpckhbw mm1, [r0+r3*2-8]
1965 movq mm2, [r0+r3*1-8]
1966 punpckhbw mm2, [r0+r3*0-8]
1968 movq mm3, [r0+r3*1-8]
1969 punpckhbw mm3, [r0+r3*0-8]
1973 movq mm0, [r0+r3*0-8]
1978 PALIGNR mm4, mm0, 7, mm0
1979 PALIGNR mm1, mm2, 1, mm2
2006 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2013 PALIGNR mm2, mm0, 7, mm0
2014 PALIGNR mm1, mm4, 1, mm4
2020 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
2025 movdqa xmm6, [pw_ff00]
2034 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
2040 movhps [r0+r3*2], xmm5
2041 movhps [r0+r3*1], xmm2
2049 movq [r0+r3*2], xmm5
2050 movq [r0+r3*1], xmm2
2053 movq [r2+r3*2], xmm5
2054 movq [r2+r3*1], xmm2
2057 movq [r1+r3*2], xmm5
2058 movq [r1+r3*1], xmm2
2063 %define PALIGNR PALIGNR_MMX
2064 PRED8x8L_VERTICAL_RIGHT sse2
2066 %define PALIGNR PALIGNR_SSSE3
2067 PRED8x8L_VERTICAL_RIGHT ssse3
2069 ;-----------------------------------------------------------------------------
2070 ; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2071 ;-----------------------------------------------------------------------------
2073 %macro PRED8x8L_VERTICAL_LEFT 1
2074 cglobal pred8x8l_vertical_left_%1, 4,4
2081 PALIGNR mm2, mm0, 7, mm0
2082 PALIGNR mm1, mm4, 1, mm4
2105 pshufw mm1, mm3, 0xFF
2108 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2117 PALIGNR mm2, mm3, 7, mm3
2118 PALIGNR mm5, mm4, 1, mm4
2119 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2133 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2135 movq [r0+r3*1], xmm3
2136 movq [r0+r3*2], xmm0
2140 movq [r1+r3*1], xmm3
2141 movq [r1+r3*2], xmm0
2144 movq [r2+r3*1], xmm3
2145 movq [r2+r3*2], xmm0
2148 movq [r0+r3*1], xmm3
2149 movq [r0+r3*2], xmm0
2154 %define PALIGNR PALIGNR_MMX
2155 PRED8x8L_VERTICAL_LEFT sse2
2156 %define PALIGNR PALIGNR_SSSE3
2158 PRED8x8L_VERTICAL_LEFT ssse3
2160 ;-----------------------------------------------------------------------------
2161 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2162 ;-----------------------------------------------------------------------------
2164 %macro PRED8x8L_HORIZONTAL_UP 1
2165 cglobal pred8x8l_horizontal_up_%1, 4,4
2168 movq mm0, [r0+r3*1-8]
2172 punpckhbw mm0, [r1+r3*0-8]
2173 movq mm1, [r2+r3*1-8]
2174 punpckhbw mm1, [r0+r3*2-8]
2178 movq mm2, [r0+r3*1-8]
2179 punpckhbw mm2, [r0+r3*0-8]
2181 movq mm3, [r0+r3*1-8]
2182 punpckhbw mm3, [r0+r3*0-8]
2186 movq mm0, [r0+r3*0-8]
2187 movq mm1, [r1+r3*0-8]
2191 PALIGNR mm4, mm0, 7, mm0
2192 PALIGNR mm1, mm2, 1, mm2
2194 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2197 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2199 PALIGNR mm7, mm1, 7, mm3
2201 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2202 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2206 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2213 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2215 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2217 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2219 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2220 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2224 PALIGNR mm5, mm4, 2, mm1
2225 pshufw mm1, mm6, 11111001b
2226 PALIGNR mm6, mm4, 4, mm2
2227 pshufw mm2, mm7, 11111110b
2228 PALIGNR mm7, mm4, 6, mm3
2229 pshufw mm3, mm0, 11111111b
; Expand PRED8x8L_HORIZONTAL_UP once per instruction set. The macro body
; invokes PALIGNR, so whichever %define is in force at the point of expansion
; selects the implementation used; the macro argument becomes the suffix of
; the cglobal name (pred8x8l_horizontal_up_mmxext, pred8x8l_horizontal_up_ssse3).
; PALIGNR_MMX / PALIGNR_SSSE3 are not defined in the visible source
; (presumably provided by x86util.asm - confirm).
2243 %define PALIGNR PALIGNR_MMX
2244 PRED8x8L_HORIZONTAL_UP mmxext
2245 %define PALIGNR PALIGNR_SSSE3
2246 PRED8x8L_HORIZONTAL_UP ssse3
2248 ;-----------------------------------------------------------------------------
2249 ; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2250 ;-----------------------------------------------------------------------------
2253 %define PALIGNR PALIGNR_MMX
2254 cglobal pred8x8l_horizontal_down_mmxext, 4,5
2257 movq mm0, [r0+r3*1-8]
2258 punpckhbw mm0, [r0+r3*0-8]
2259 movq mm1, [r4+r3*1-8]
2260 punpckhbw mm1, [r0+r3*2-8]
2264 movq mm2, [r0+r3*1-8]
2265 punpckhbw mm2, [r0+r3*0-8]
2267 movq mm3, [r0+r3*1-8]
2268 punpckhbw mm3, [r0+r3*0-8]
2272 movq mm0, [r0+r3*0-8]
2277 PALIGNR mm4, mm0, 7, mm0
2278 PALIGNR mm1, mm2, 1, mm2
2305 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2309 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2311 PALIGNR mm7, mm1, 7, mm3
2317 PALIGNR mm2, mm0, 7, mm0
2318 PALIGNR mm1, mm4, 1, mm4
2324 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2331 PALIGNR mm2, mm6, 7, mm5
2332 PALIGNR mm6, mm7, 7, mm0
2334 PALIGNR mm4, mm3, 1, mm7
2337 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2343 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2351 PALIGNR mm7, mm3, 2, mm5
2353 PALIGNR mm1, mm3, 4, mm5
2355 PALIGNR mm0, mm3, 6, mm3
2360 PALIGNR mm6, mm4, 2, mm5
2362 PALIGNR mm2, mm4, 4, mm5
2364 PALIGNR mm3, mm4, 6, mm4
2368 %macro PRED8x8L_HORIZONTAL_DOWN 1
2369 cglobal pred8x8l_horizontal_down_%1, 4,5
2372 movq mm0, [r0+r3*1-8]
2373 punpckhbw mm0, [r0+r3*0-8]
2374 movq mm1, [r4+r3*1-8]
2375 punpckhbw mm1, [r0+r3*2-8]
2379 movq mm2, [r0+r3*1-8]
2380 punpckhbw mm2, [r0+r3*0-8]
2382 movq mm3, [r0+r3*1-8]
2383 punpckhbw mm3, [r0+r3*0-8]
2387 movq mm0, [r0+r3*0-8]
2392 PALIGNR mm4, mm0, 7, mm0
2393 PALIGNR mm1, mm2, 1, mm2
2420 pshufw mm1, mm3, 0xFF
2424 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2428 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2438 PALIGNR mm2, mm0, 7, mm0
2439 PALIGNR mm1, mm4, 1, mm4
2445 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2454 PALIGNR mm2, mm3, 7, mm3
2455 PALIGNR mm5, mm4, 1, mm4
2456 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2465 PALIGNR xmm1, xmm0, 7, xmm4
2466 PALIGNR xmm2, xmm0, 9, xmm5
2468 PALIGNR xmm3, xmm0, 8, xmm0
2472 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2473 punpcklbw xmm4, xmm0
2475 movq [r0+r3*2], xmm4
2476 movq [r2+r3*2], xmm0
2479 movq [r0+r3*1], xmm4
2480 movq [r2+r3*1], xmm0
2483 movq [r1+r3*2], xmm4
2484 movq [r4+r3*2], xmm0
2487 movq [r1+r3*1], xmm4
2488 movq [r4+r3*1], xmm0
2493 %define PALIGNR PALIGNR_MMX
2494 PRED8x8L_HORIZONTAL_DOWN sse2
2496 %define PALIGNR PALIGNR_SSSE3
2497 PRED8x8L_HORIZONTAL_DOWN ssse3
2499 ;-----------------------------------------------------------------------------
2500 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2501 ;-----------------------------------------------------------------------------
2503 cglobal pred4x4_dc_mmxext, 3,5
2509 movzx r1d, byte [r0+r2*1-1]
2512 movzx r1d, byte [r0+r2*2-1]
2515 movzx r1d, byte [r0+r2*1-1]
2517 movzx r1d, byte [r0+r2*2-1]
2521 imul r3d, 0x01010101
2528 ;-----------------------------------------------------------------------------
2529 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2530 ;-----------------------------------------------------------------------------
2532 %macro PRED4x4_TM_MMX 0
2533 cglobal pred4x4_tm_vp8, 3,6
2538 movzx r4d, byte [r0-1]
2541 movzx r1d, byte [r0+r2*1-1]
2542 movzx r3d, byte [r0+r2*2-1]
2574 cglobal pred4x4_tm_vp8_ssse3, 3,3
2583 movd mm2, [r0+r2*1-4]
2584 movd mm3, [r0+r2*2-4]
2585 movd mm4, [r1+r2*1-4]
2586 movd mm5, [r1+r2*2-4]
2609 ;-----------------------------------------------------------------------------
2610 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2611 ;-----------------------------------------------------------------------------
2614 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2618 mova m2, m0 ;t0 t1 t2 t3
2619 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2621 psrlq m0, 8 ;t1 t2 t3 t4
2622 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2629 ;-----------------------------------------------------------------------------
2630 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2631 ;-----------------------------------------------------------------------------
2633 cglobal pred4x4_down_left_mmxext, 3,3
2643 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2655 ;-----------------------------------------------------------------------------
2656 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2657 ;-----------------------------------------------------------------------------
2660 cglobal pred4x4_vertical_left_mmxext, 3,3
2670 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2680 ;-----------------------------------------------------------------------------
2681 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2682 ;-----------------------------------------------------------------------------
2685 cglobal pred4x4_horizontal_up_mmxext, 3,3
2688 movd m0, [r0+r2*1-4]
2689 punpcklbw m0, [r0+r2*2-4]
2690 movd m1, [r1+r2*1-4]
2691 punpcklbw m1, [r1+r2*2-4]
2703 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2713 ;-----------------------------------------------------------------------------
2714 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2715 ;-----------------------------------------------------------------------------
2718 %define PALIGNR PALIGNR_MMX
2719 cglobal pred4x4_horizontal_down_mmxext, 3,3
2722 movh m0, [r0-4] ; lt ..
2723 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2724 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2725 movd m1, [r1+r2*2-4] ; l3
2726 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2727 movd m2, [r0+r2*2-4] ; l1
2728 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2729 punpckhwd m1, m2 ; l0 l1 l2 l3
2730 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2734 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2735 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2737 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2740 PALIGNR m3, m5, 6, m4
2749 ;-----------------------------------------------------------------------------
2750 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2751 ;-----------------------------------------------------------------------------
2754 %define PALIGNR PALIGNR_MMX
2755 cglobal pred4x4_vertical_right_mmxext, 3,3
2758 movh m0, [r0] ; ........t3t2t1t0
2760 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2762 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2764 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2766 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2767 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2773 PALIGNR m5, m1, 7, m2
2776 PALIGNR m3, m1, 7, m1
2780 ;-----------------------------------------------------------------------------
2781 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2782 ;-----------------------------------------------------------------------------
2785 %define PALIGNR PALIGNR_MMX
2786 cglobal pred4x4_down_right_mmxext, 3,3
2790 movq m2, [r0+r2*1-8]
2791 punpckhbw m2, [r0-8]
2794 PALIGNR m3, m1, 5, m1
2796 PALIGNR m3, [r1+r2*1-8], 7, m4
2798 PALIGNR m3, [r1+r2*2-8], 7, m4
2799 PRED4x4_LOWPASS m0, m3, m1, m2, m4