1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86inc.asm"
26 %include "libavutil/x86/x86util.asm"
; ---------------------------------------------------------------------------
; Constant tables referenced by the predictors below.
; ---------------------------------------------------------------------------
; tm_shuf: 16 bytes, the pair (0x03, 0x80) repeated 8 times. Loaded into xmm4
; by pred8x8_tm_vp8_ssse3.
; NOTE(review): the 0x03/0x80 pattern looks like a pshufb control (take byte 3,
; then zero-fill) - the shuffle instruction is not visible here; confirm.
30 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00. Loaded into xmm6 by PRED8x8L_VERTICAL_RIGHT.
; NOTE(review): presumably a per-word high-byte mask - confirm at the use site.
31 pw_ff00: times 8 dw 0xff00
; plane_shuf: 16 signed bytes -8..-1 followed by 1..8. Despite the name it is
; used as the memory operand of pmaddubsw in the 16x16 plane predictor, on the
; line annotated "H coefficients".
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
33 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: 8x8 counterpart of plane_shuf: -4..-1 and 1..4, each group
; padded with four zero bytes. Used as the pmaddubsw operand in pred8x8_plane.
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
35 db 1, 2, 3, 4, 0, 0, 0, 0
; pw_0to7: words 0..7. Multiplied (pmullw) with the splatted H value to form
; 0*H, 1*H, ..., 7*H in both plane predictors.
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
; pw_1to8: words 1..8. pmullw operand in the 16x16 plane predictor, read as
; two 8-byte halves ([pw_1to8] and [pw_1to8+8]).
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
; pw_m8tom1: words -8..-1. pmullw operand in the 16x16 plane predictor, read
; either whole or as two 8-byte halves ([pw_m8tom1], [pw_m8tom1+8]).
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
; pw_m4to4: words -4..-1 then 1..4 (zero is skipped). pmullw operand in
; pred8x8_plane, read either whole or as two 8-byte halves.
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_mmx, 2,3
71 cglobal pred16x16_vertical_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
91 cglobal pred16x16_horizontal_%1, 2,3
133 ;-----------------------------------------------------------------------------
134 ; void pred16x16_dc(uint8_t *src, int stride)
135 ;-----------------------------------------------------------------------------
137 %macro PRED16x16_DC 1
138 cglobal pred16x16_dc_%1, 2,7
146 movzx r5d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
152 movzx r3d, byte [r0+r1*1]
157 movzx r2d, byte [r0+r1*0]
204 ;-----------------------------------------------------------------------------
205 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
206 ;-----------------------------------------------------------------------------
208 %macro PRED16x16_TM_MMX 1
209 cglobal pred16x16_tm_vp8_%1, 2,5
220 movzx r3d, byte [r0-1]
223 movzx r2d, byte [r0+r1-1]
250 PRED16x16_TM_MMX mmxext
252 cglobal pred16x16_tm_vp8_sse2, 2,6,6
259 movzx r4d, byte [r0-1]
262 movzx r2d, byte [r0+r1*1-1]
263 movzx r3d, byte [r0+r1*2-1]
268 pshuflw xmm2, xmm2, 0
269 pshuflw xmm4, xmm4, 0
270 punpcklqdq xmm2, xmm2
271 punpcklqdq xmm4, xmm4
280 movdqa [r0+r1*1], xmm2
281 movdqa [r0+r1*2], xmm4
287 ;-----------------------------------------------------------------------------
288 ; void pred16x16_plane(uint8_t *src, int stride)
289 ;-----------------------------------------------------------------------------
291 %macro H264_PRED16x16_PLANE 3
292 cglobal pred16x16_plane_%3_%1, 2, 9, %2
306 pmullw m0, [pw_m8tom1 ]
307 pmullw m1, [pw_m8tom1+8]
308 pmullw m2, [pw_1to8 ]
309 pmullw m3, [pw_1to8 +8]
318 pmullw m0, [pw_m8tom1]
322 movhps m0, [r0+r1 +8]
323 pmaddubsw m0, [plane_shuf] ; H coefficients
345 paddw m0, m1 ; sum of H coefficients
357 movzx e_reg, byte [r3+r2*2 ]
358 movzx r5, byte [r4+r1 ]
361 movzx e_reg, byte [r3+r2 ]
366 movzx e_reg, byte [r3+r1 ]
367 movzx r6, byte [r4+r2*2 ]
371 movzx e_reg, byte [r3 ]
373 movzx r7, byte [r4+r2 ]
376 movzx r6, byte [r4+r2 ]
385 movzx r4, byte [e_reg+r2 ]
397 movzx r4, byte [e_reg ]
399 movzx r7, byte [r3 +r2 ]
403 movzx r6, byte [r3 +r2 ]
409 movzx r4, byte [e_reg+r1 ]
410 movzx r6, byte [r3 +r2*2]
417 movzx r4, byte [e_reg+r2*2]
418 movzx r6, byte [r3 +r1 ]
421 add r5, r6 ; sum of V coefficients
438 lea r5, [r5*5] ; 5*(V/4)
442 sar r5, 4 ; (5*(V/4))/16
445 movzx r4, byte [r0+r1 +15]
446 movzx r3, byte [r3+r2*2 ]
464 lea r1d, [r1d*5] ; 5*(H/4)
468 sar r1d, 4 ; (5*(H/4))/16
494 punpcklqdq m0, m0 ; splat H (words)
495 punpcklqdq m1, m1 ; splat V (words)
496 punpcklqdq m3, m3 ; splat a (words)
505 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
514 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
515 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
517 paddw m5, m0 ; a + {8,9,10,11}*H
518 paddw m6, m0 ; a + {12,13,14,15}*H
523 mova m3, m0 ; b[0..7]
524 mova m4, m2 ; b[8..15]
530 mova m3, m5 ; b[8..11]
531 mova m4, m6 ; b[12..15]
544 mova m3, m0 ; b[0..7]
545 mova m4, m2 ; b[8..15]
551 mova m3, m5 ; b[8..11]
552 mova m4, m6 ; b[12..15]
; Instantiate H264_PRED16x16_PLANE once per (instruction set, codec) pair.
; Arguments are: %1 = instruction-set suffix, %2 = value forwarded as the last
; argument of the macro's cglobal line, %3 = codec variant (h264, rv40, svq3).
; The macro declares each function as "cglobal pred16x16_plane_%3_%1", so
; "mmx, 0, h264" produces pred16x16_plane_h264_mmx.
; %2 is 0 for the mmx/mmx2 builds and 8 for the sse2/ssse3 builds.
572 H264_PRED16x16_PLANE mmx, 0, h264
573 H264_PRED16x16_PLANE mmx, 0, rv40
574 H264_PRED16x16_PLANE mmx, 0, svq3
575 H264_PRED16x16_PLANE mmx2, 0, h264
576 H264_PRED16x16_PLANE mmx2, 0, rv40
577 H264_PRED16x16_PLANE mmx2, 0, svq3
579 H264_PRED16x16_PLANE sse2, 8, h264
580 H264_PRED16x16_PLANE sse2, 8, rv40
581 H264_PRED16x16_PLANE sse2, 8, svq3
582 H264_PRED16x16_PLANE ssse3, 8, h264
583 H264_PRED16x16_PLANE ssse3, 8, rv40
584 H264_PRED16x16_PLANE ssse3, 8, svq3
586 ;-----------------------------------------------------------------------------
587 ; void pred8x8_plane(uint8_t *src, int stride)
588 ;-----------------------------------------------------------------------------
590 %macro H264_PRED8x8_PLANE 2
591 cglobal pred8x8_plane_%1, 2, 9, %2
601 pmullw m0, [pw_m4to4]
602 pmullw m1, [pw_m4to4+8]
609 pmullw m0, [pw_m4to4]
611 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
612 pmaddubsw m0, [plane8_shuf] ; H coefficients
638 paddw m0, m1 ; sum of H coefficients
650 movzx e_reg, byte [r3+r2*2 ]
651 movzx r5, byte [r4+r1 ]
654 movzx e_reg, byte [r3 ]
656 movzx r7, byte [r4+r2 ]
660 movzx r6, byte [r4+r2 ]
666 movzx e_reg, byte [r3+r1 ]
667 movzx r6, byte [r4+r2*2 ]
674 movzx e_reg, byte [r3+r2 ]
687 movzx r3, byte [r4+r2*2 ]
688 movzx r4, byte [r0+r1 +7]
719 punpcklqdq m0, m0 ; splat H (words)
720 punpcklqdq m1, m1 ; splat V (words)
721 punpcklqdq m3, m3 ; splat a (words)
726 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
727 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
730 paddw m2, m0 ; a + {4,5,6,7}*H
737 mova m3, m0 ; b[0..7]
740 mova m4, m0 ; V+b[0..7]
747 mova m3, m0 ; b[0..3]
748 mova m4, m2 ; b[4..7]
753 mova m5, m0 ; V+b[0..3]
754 mova m6, m2 ; V+b[4..7]
; Instantiate H264_PRED8x8_PLANE once per instruction set.
; Arguments are: %1 = instruction-set suffix, %2 = value forwarded as the last
; argument of the macro's cglobal line. The macro declares each function as
; "cglobal pred8x8_plane_%1", so "mmx, 0" produces pred8x8_plane_mmx.
; %2 is 0 for the mmx/mmx2 builds and 8 for the sse2/ssse3 builds.
772 H264_PRED8x8_PLANE mmx, 0
773 H264_PRED8x8_PLANE mmx2, 0
775 H264_PRED8x8_PLANE sse2, 8
776 H264_PRED8x8_PLANE ssse3, 8
778 ;-----------------------------------------------------------------------------
779 ; void pred8x8_vertical(uint8_t *src, int stride)
780 ;-----------------------------------------------------------------------------
782 cglobal pred8x8_vertical_mmx, 2,2
794 ;-----------------------------------------------------------------------------
795 ; void pred8x8_horizontal(uint8_t *src, int stride)
796 ;-----------------------------------------------------------------------------
799 cglobal pred8x8_horizontal_%1, 2,3
836 ;-----------------------------------------------------------------------------
837 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838 ;-----------------------------------------------------------------------------
839 cglobal pred8x8_top_dc_mmxext, 2,5
856 pshufw mm0, mm0, 0 ; dc0 (w)
857 packuswb mm0, mm1 ; dc0,dc1 (b)
869 ;-----------------------------------------------------------------------------
870 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
871 ;-----------------------------------------------------------------------------
874 cglobal pred8x8_dc_mmxext, 2,5
883 movzx r2d, byte [r0+r1*1-1]
884 movzx r3d, byte [r0+r1*2-1]
887 movzx r3d, byte [r0+r1*1-1]
889 movzx r3d, byte [r0+r1*2-1]
893 movzx r2d, byte [r0+r1*1-1]
894 movzx r3d, byte [r0+r1*2-1]
897 movzx r3d, byte [r0+r1*1-1]
899 movzx r3d, byte [r0+r1*2-1]
906 punpckldq m0, m2 ; s0, s1, s2, s3
907 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
909 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
913 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
930 ;-----------------------------------------------------------------------------
931 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
932 ;-----------------------------------------------------------------------------
934 cglobal pred8x8_dc_rv40_mmxext, 2,7
940 movzx r5d, byte [r0+r1*1]
944 movzx r2d, byte [r0+r1*0]
945 movzx r3d, byte [r0+r1*1]
950 movzx r2d, byte [r0+r1*0]
966 ;-----------------------------------------------------------------------------
967 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
968 ;-----------------------------------------------------------------------------
970 %macro PRED8x8_TM_MMX 1
971 cglobal pred8x8_tm_vp8_%1, 2,6
978 movzx r4d, byte [r0-1]
981 movzx r2d, byte [r0+r1*1-1]
982 movzx r3d, byte [r0+r1*2-1]
1013 PRED8x8_TM_MMX mmxext
1015 cglobal pred8x8_tm_vp8_sse2, 2,6,4
1019 punpcklbw xmm0, xmm1
1020 movzx r4d, byte [r0-1]
1023 movzx r2d, byte [r0+r1*1-1]
1024 movzx r3d, byte [r0+r1*2-1]
1029 pshuflw xmm2, xmm2, 0
1030 pshuflw xmm3, xmm3, 0
1031 punpcklqdq xmm2, xmm2
1032 punpcklqdq xmm3, xmm3
1036 movq [r0+r1*1], xmm2
1037 movhps [r0+r1*2], xmm2
1043 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
1045 movdqa xmm4, [tm_shuf]
1048 punpcklbw xmm0, xmm1
1053 movd xmm2, [r0+r1*1-4]
1054 movd xmm3, [r0+r1*2-4]
1062 movq [r0+r1*1], xmm2
1063 movhps [r0+r1*2], xmm2
1069 ; dest, left, right, src, tmp
1070 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1071 %macro PRED4x4_LOWPASS 5
1081 ;-----------------------------------------------------------------------------
1082 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1083 ;-----------------------------------------------------------------------------
1084 %macro PRED8x8L_TOP_DC 1
1085 cglobal pred8x8l_top_dc_%1, 4,4
1093 PALIGNR mm2, mm0, 7, mm0
1094 PALIGNR mm1, mm4, 1, mm4
1095 test r1, r1 ; top_left
1097 test r2, r2 ; top_right
1106 test r2, r2 ; top_right
1115 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1132 %define PALIGNR PALIGNR_MMX
1133 PRED8x8L_TOP_DC mmxext
1134 %define PALIGNR PALIGNR_SSSE3
1135 PRED8x8L_TOP_DC ssse3
1137 ;-----------------------------------------------------------------------------
1138 ; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1139 ;-----------------------------------------------------------------------------
1141 %macro PRED8x8L_DC 1
1142 cglobal pred8x8l_dc_%1, 4,5
1145 movq mm0, [r0+r3*1-8]
1146 punpckhbw mm0, [r0+r3*0-8]
1147 movq mm1, [r4+r3*1-8]
1148 punpckhbw mm1, [r0+r3*2-8]
1152 movq mm2, [r0+r3*1-8]
1153 punpckhbw mm2, [r0+r3*0-8]
1155 movq mm3, [r0+r3*1-8]
1156 punpckhbw mm3, [r0+r3*0-8]
1160 movq mm0, [r0+r3*0-8]
1165 PALIGNR mm4, mm0, 7, mm0
1166 PALIGNR mm1, mm2, 1, mm2
1193 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1196 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1198 PALIGNR mm7, mm1, 7, mm3
1204 PALIGNR mm2, mm0, 7, mm0
1205 PALIGNR mm1, mm4, 1, mm4
1212 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1235 %define PALIGNR PALIGNR_MMX
1237 %define PALIGNR PALIGNR_SSSE3
1240 ;-----------------------------------------------------------------------------
1241 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1242 ;-----------------------------------------------------------------------------
1244 %macro PRED8x8L_HORIZONTAL 1
1245 cglobal pred8x8l_horizontal_%1, 4,4
1248 movq mm0, [r0+r3*1-8]
1252 punpckhbw mm0, [r1+r3*0-8]
1253 movq mm1, [r2+r3*1-8]
1254 punpckhbw mm1, [r0+r3*2-8]
1258 movq mm2, [r0+r3*1-8]
1259 punpckhbw mm2, [r0+r3*0-8]
1261 movq mm3, [r0+r3*1-8]
1262 punpckhbw mm3, [r0+r3*0-8]
1266 movq mm0, [r0+r3*0-8]
1267 movq mm1, [r1+r3*0-8]
1271 PALIGNR mm4, mm0, 7, mm0
1272 PALIGNR mm1, mm2, 1, mm2
1274 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1277 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1279 PALIGNR mm7, mm1, 7, mm3
1285 pshufw mm0, mm3, 0xff
1286 pshufw mm1, mm3, 0xaa
1288 pshufw mm2, mm3, 0x55
1289 pshufw mm3, mm3, 0x00
1290 pshufw mm4, mm7, 0xff
1291 pshufw mm5, mm7, 0xaa
1292 pshufw mm6, mm7, 0x55
1293 pshufw mm7, mm7, 0x00
1307 %define PALIGNR PALIGNR_MMX
1308 PRED8x8L_HORIZONTAL mmxext
1309 %define PALIGNR PALIGNR_SSSE3
1310 PRED8x8L_HORIZONTAL ssse3
1312 ;-----------------------------------------------------------------------------
1313 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1314 ;-----------------------------------------------------------------------------
1316 %macro PRED8x8L_VERTICAL 1
1317 cglobal pred8x8l_vertical_%1, 4,4
1324 PALIGNR mm2, mm0, 7, mm0
1325 PALIGNR mm1, mm4, 1, mm4
1326 test r1, r1 ; top_left
1328 test r2, r2 ; top_right
1337 test r2, r2 ; top_right
1346 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1358 %define PALIGNR PALIGNR_MMX
1359 PRED8x8L_VERTICAL mmxext
1360 %define PALIGNR PALIGNR_SSSE3
1361 PRED8x8L_VERTICAL ssse3
1363 ;-----------------------------------------------------------------------------
1364 ; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1365 ;-----------------------------------------------------------------------------
1368 %define PALIGNR PALIGNR_MMX
1369 cglobal pred8x8l_down_left_mmxext, 4,5
1376 PALIGNR mm2, mm0, 7, mm0
1377 PALIGNR mm1, mm4, 1, mm4
1400 pshufw mm1, mm3, 0xFF
1403 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1412 PALIGNR mm2, mm3, 7, mm3
1413 PALIGNR mm5, mm4, 1, mm4
1414 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1422 PALIGNR mm2, mm7, 1, mm0
1424 PALIGNR mm3, mm7, 7, mm0
1425 PALIGNR mm4, mm6, 1, mm0
1431 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1432 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1476 %macro PRED8x8L_DOWN_LEFT 1
1477 cglobal pred8x8l_down_left_%1, 4,4
1484 PALIGNR mm2, mm0, 7, mm0
1485 PALIGNR mm1, mm4, 1, mm4
1486 test r1, r1 ; top_left
1488 test r2, r2 ; top_right
1497 test r2, r2 ; top_right
1508 pshufw mm1, mm3, 0xFF
1511 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1513 test r2, r2 ; top_right
1520 PALIGNR mm2, mm3, 7, mm3
1521 PALIGNR mm5, mm4, 1, mm4
1522 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1538 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1540 movq [r0+r3*1], xmm0
1542 movq [r0+r3*2], xmm0
1545 movq [r1+r3*1], xmm0
1547 movq [r1+r3*2], xmm0
1549 movq [r2+r3*1], xmm0
1551 movq [r2+r3*2], xmm0
1553 movq [r0+r3*1], xmm0
1555 movq [r0+r3*2], xmm0
1560 %define PALIGNR PALIGNR_MMX
1561 PRED8x8L_DOWN_LEFT sse2
1563 %define PALIGNR PALIGNR_SSSE3
1564 PRED8x8L_DOWN_LEFT ssse3
1566 ;-----------------------------------------------------------------------------
1567 ; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1568 ;-----------------------------------------------------------------------------
1571 %define PALIGNR PALIGNR_MMX
1572 cglobal pred8x8l_down_right_mmxext, 4,5
1575 movq mm0, [r0+r3*1-8]
1576 punpckhbw mm0, [r0+r3*0-8]
1577 movq mm1, [r4+r3*1-8]
1578 punpckhbw mm1, [r0+r3*2-8]
1582 movq mm2, [r0+r3*1-8]
1583 punpckhbw mm2, [r0+r3*0-8]
1585 movq mm3, [r0+r3*1-8]
1586 punpckhbw mm3, [r0+r3*0-8]
1590 movq mm0, [r0+r3*0-8]
1595 PALIGNR mm4, mm0, 7, mm0
1596 PALIGNR mm1, mm2, 1, mm2
1597 test r1, r1 ; top_left
1601 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1605 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1607 PALIGNR mm7, mm1, 7, mm3
1613 PALIGNR mm2, mm0, 7, mm0
1614 PALIGNR mm1, mm4, 1, mm4
1615 test r1, r1 ; top_left
1617 test r2, r2 ; top_right
1620 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1636 test r2, r2 ; top_right
1652 PALIGNR mm2, mm6, 1, mm0
1654 PALIGNR mm3, mm6, 7, mm0
1658 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1659 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1703 %macro PRED8x8L_DOWN_RIGHT 1
1704 cglobal pred8x8l_down_right_%1, 4,5
1707 movq mm0, [r0+r3*1-8]
1708 punpckhbw mm0, [r0+r3*0-8]
1709 movq mm1, [r4+r3*1-8]
1710 punpckhbw mm1, [r0+r3*2-8]
1714 movq mm2, [r0+r3*1-8]
1715 punpckhbw mm2, [r0+r3*0-8]
1717 movq mm3, [r0+r3*1-8]
1718 punpckhbw mm3, [r0+r3*0-8]
1722 movq mm0, [r0+r3*0-8]
1727 PALIGNR mm4, mm0, 7, mm0
1728 PALIGNR mm1, mm2, 1, mm2
1756 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1760 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1762 PALIGNR mm7, mm1, 7, mm3
1769 PALIGNR mm2, mm0, 7, mm0
1770 PALIGNR mm1, mm4, 1, mm4
1776 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1793 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1796 movq [r0+r3*2], xmm0
1797 movq [r0+r3*1], xmm1
1800 movq [r2+r3*2], xmm0
1801 movq [r2+r3*1], xmm1
1804 movq [r1+r3*2], xmm0
1805 movq [r1+r3*1], xmm1
1808 movq [r4+r3*2], xmm0
1809 movq [r4+r3*1], xmm1
1814 %define PALIGNR PALIGNR_MMX
1815 PRED8x8L_DOWN_RIGHT sse2
1817 %define PALIGNR PALIGNR_SSSE3
1818 PRED8x8L_DOWN_RIGHT ssse3
1820 ;-----------------------------------------------------------------------------
1821 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1822 ;-----------------------------------------------------------------------------
1825 %define PALIGNR PALIGNR_MMX
1826 cglobal pred8x8l_vertical_right_mmxext, 4,5
1829 movq mm0, [r0+r3*1-8]
1830 punpckhbw mm0, [r0+r3*0-8]
1831 movq mm1, [r4+r3*1-8]
1832 punpckhbw mm1, [r0+r3*2-8]
1836 movq mm2, [r0+r3*1-8]
1837 punpckhbw mm2, [r0+r3*0-8]
1839 movq mm3, [r0+r3*1-8]
1840 punpckhbw mm3, [r0+r3*0-8]
1844 movq mm0, [r0+r3*0-8]
1849 PALIGNR mm4, mm0, 7, mm0
1850 PALIGNR mm1, mm2, 1, mm2
1878 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1885 PALIGNR mm2, mm0, 7, mm0
1886 PALIGNR mm1, mm4, 1, mm4
1892 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1896 PALIGNR mm3, mm7, 7, mm0
1897 PALIGNR mm6, mm7, 6, mm1
1901 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1912 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1913 PALIGNR mm6, mm0, 7, mm2
1916 PALIGNR mm5, mm0, 7, mm1
1919 PALIGNR mm6, mm0, 7, mm2
1922 PALIGNR mm5, mm0, 7, mm1
1925 PALIGNR mm6, mm0, 7, mm2
1928 PALIGNR mm5, mm0, 7, mm1
1932 %macro PRED8x8L_VERTICAL_RIGHT 1
1933 cglobal pred8x8l_vertical_right_%1, 4,5,7
1934 ; manually spill XMM registers for Win64 because
1935 ; the code here is initialized with INIT_MMX
1939 movq mm0, [r0+r3*1-8]
1940 punpckhbw mm0, [r0+r3*0-8]
1941 movq mm1, [r4+r3*1-8]
1942 punpckhbw mm1, [r0+r3*2-8]
1946 movq mm2, [r0+r3*1-8]
1947 punpckhbw mm2, [r0+r3*0-8]
1949 movq mm3, [r0+r3*1-8]
1950 punpckhbw mm3, [r0+r3*0-8]
1954 movq mm0, [r0+r3*0-8]
1959 PALIGNR mm4, mm0, 7, mm0
1960 PALIGNR mm1, mm2, 1, mm2
1987 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1994 PALIGNR mm2, mm0, 7, mm0
1995 PALIGNR mm1, mm4, 1, mm4
2001 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
2006 movdqa xmm6, [pw_ff00]
2015 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
2021 movhps [r0+r3*2], xmm5
2022 movhps [r0+r3*1], xmm2
2030 movq [r0+r3*2], xmm5
2031 movq [r0+r3*1], xmm2
2034 movq [r2+r3*2], xmm5
2035 movq [r2+r3*1], xmm2
2038 movq [r1+r3*2], xmm5
2039 movq [r1+r3*1], xmm2
2044 %define PALIGNR PALIGNR_MMX
2045 PRED8x8L_VERTICAL_RIGHT sse2
2047 %define PALIGNR PALIGNR_SSSE3
2048 PRED8x8L_VERTICAL_RIGHT ssse3
2050 ;-----------------------------------------------------------------------------
2051 ; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2052 ;-----------------------------------------------------------------------------
2054 %macro PRED8x8L_VERTICAL_LEFT 1
2055 cglobal pred8x8l_vertical_left_%1, 4,4
2062 PALIGNR mm2, mm0, 7, mm0
2063 PALIGNR mm1, mm4, 1, mm4
2086 pshufw mm1, mm3, 0xFF
2089 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2098 PALIGNR mm2, mm3, 7, mm3
2099 PALIGNR mm5, mm4, 1, mm4
2100 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2114 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2116 movq [r0+r3*1], xmm3
2117 movq [r0+r3*2], xmm0
2121 movq [r1+r3*1], xmm3
2122 movq [r1+r3*2], xmm0
2125 movq [r2+r3*1], xmm3
2126 movq [r2+r3*2], xmm0
2129 movq [r0+r3*1], xmm3
2130 movq [r0+r3*2], xmm0
2135 %define PALIGNR PALIGNR_MMX
2136 PRED8x8L_VERTICAL_LEFT sse2
2137 %define PALIGNR PALIGNR_SSSE3
2139 PRED8x8L_VERTICAL_LEFT ssse3
2141 ;-----------------------------------------------------------------------------
2142 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2143 ;-----------------------------------------------------------------------------
2145 %macro PRED8x8L_HORIZONTAL_UP 1
2146 cglobal pred8x8l_horizontal_up_%1, 4,4
2149 movq mm0, [r0+r3*1-8]
2153 punpckhbw mm0, [r1+r3*0-8]
2154 movq mm1, [r2+r3*1-8]
2155 punpckhbw mm1, [r0+r3*2-8]
2159 movq mm2, [r0+r3*1-8]
2160 punpckhbw mm2, [r0+r3*0-8]
2162 movq mm3, [r0+r3*1-8]
2163 punpckhbw mm3, [r0+r3*0-8]
2167 movq mm0, [r0+r3*0-8]
2168 movq mm1, [r1+r3*0-8]
2172 PALIGNR mm4, mm0, 7, mm0
2173 PALIGNR mm1, mm2, 1, mm2
2175 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2178 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2180 PALIGNR mm7, mm1, 7, mm3
2182 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2183 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2187 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2194 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2196 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2198 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2200 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2201 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2205 PALIGNR mm5, mm4, 2, mm1
2206 pshufw mm1, mm6, 11111001b
2207 PALIGNR mm6, mm4, 4, mm2
2208 pshufw mm2, mm7, 11111110b
2209 PALIGNR mm7, mm4, 6, mm3
2210 pshufw mm3, mm0, 11111111b
2224 %define PALIGNR PALIGNR_MMX
2225 PRED8x8L_HORIZONTAL_UP mmxext
2226 %define PALIGNR PALIGNR_SSSE3
2227 PRED8x8L_HORIZONTAL_UP ssse3
2229 ;-----------------------------------------------------------------------------
2230 ; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2231 ;-----------------------------------------------------------------------------
2234 %define PALIGNR PALIGNR_MMX
2235 cglobal pred8x8l_horizontal_down_mmxext, 4,5
2238 movq mm0, [r0+r3*1-8]
2239 punpckhbw mm0, [r0+r3*0-8]
2240 movq mm1, [r4+r3*1-8]
2241 punpckhbw mm1, [r0+r3*2-8]
2245 movq mm2, [r0+r3*1-8]
2246 punpckhbw mm2, [r0+r3*0-8]
2248 movq mm3, [r0+r3*1-8]
2249 punpckhbw mm3, [r0+r3*0-8]
2253 movq mm0, [r0+r3*0-8]
2258 PALIGNR mm4, mm0, 7, mm0
2259 PALIGNR mm1, mm2, 1, mm2
2286 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2290 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2292 PALIGNR mm7, mm1, 7, mm3
2298 PALIGNR mm2, mm0, 7, mm0
2299 PALIGNR mm1, mm4, 1, mm4
2305 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2312 PALIGNR mm2, mm6, 7, mm5
2313 PALIGNR mm6, mm7, 7, mm0
2315 PALIGNR mm4, mm3, 1, mm7
2318 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2324 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2332 PALIGNR mm7, mm3, 2, mm5
2334 PALIGNR mm1, mm3, 4, mm5
2336 PALIGNR mm0, mm3, 6, mm3
2341 PALIGNR mm6, mm4, 2, mm5
2343 PALIGNR mm2, mm4, 4, mm5
2345 PALIGNR mm3, mm4, 6, mm4
2349 %macro PRED8x8L_HORIZONTAL_DOWN 1
2350 cglobal pred8x8l_horizontal_down_%1, 4,5
2353 movq mm0, [r0+r3*1-8]
2354 punpckhbw mm0, [r0+r3*0-8]
2355 movq mm1, [r4+r3*1-8]
2356 punpckhbw mm1, [r0+r3*2-8]
2360 movq mm2, [r0+r3*1-8]
2361 punpckhbw mm2, [r0+r3*0-8]
2363 movq mm3, [r0+r3*1-8]
2364 punpckhbw mm3, [r0+r3*0-8]
2368 movq mm0, [r0+r3*0-8]
2373 PALIGNR mm4, mm0, 7, mm0
2374 PALIGNR mm1, mm2, 1, mm2
2401 pshufw mm1, mm3, 0xFF
2405 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2409 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2419 PALIGNR mm2, mm0, 7, mm0
2420 PALIGNR mm1, mm4, 1, mm4
2426 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2435 PALIGNR mm2, mm3, 7, mm3
2436 PALIGNR mm5, mm4, 1, mm4
2437 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2446 PALIGNR xmm1, xmm0, 7, xmm4
2447 PALIGNR xmm2, xmm0, 9, xmm5
2449 PALIGNR xmm3, xmm0, 8, xmm0
2453 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2454 punpcklbw xmm4, xmm0
2456 movq [r0+r3*2], xmm4
2457 movq [r2+r3*2], xmm0
2460 movq [r0+r3*1], xmm4
2461 movq [r2+r3*1], xmm0
2464 movq [r1+r3*2], xmm4
2465 movq [r4+r3*2], xmm0
2468 movq [r1+r3*1], xmm4
2469 movq [r4+r3*1], xmm0
2474 %define PALIGNR PALIGNR_MMX
2475 PRED8x8L_HORIZONTAL_DOWN sse2
2477 %define PALIGNR PALIGNR_SSSE3
2478 PRED8x8L_HORIZONTAL_DOWN ssse3
2480 ;-----------------------------------------------------------------------------
2481 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2482 ;-----------------------------------------------------------------------------
2484 cglobal pred4x4_dc_mmxext, 3,5
2490 movzx r1d, byte [r0+r2*1-1]
2493 movzx r1d, byte [r0+r2*2-1]
2496 movzx r1d, byte [r0+r2*1-1]
2498 movzx r1d, byte [r0+r2*2-1]
2502 imul r3d, 0x01010101
2509 ;-----------------------------------------------------------------------------
2510 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2511 ;-----------------------------------------------------------------------------
2513 %macro PRED4x4_TM_MMX 1
2514 cglobal pred4x4_tm_vp8_%1, 3,6
2519 movzx r4d, byte [r0-1]
2522 movzx r1d, byte [r0+r2*1-1]
2523 movzx r3d, byte [r0+r2*2-1]
2550 PRED4x4_TM_MMX mmxext
2552 cglobal pred4x4_tm_vp8_ssse3, 3,3
2561 movd mm2, [r0+r2*1-4]
2562 movd mm3, [r0+r2*2-4]
2563 movd mm4, [r1+r2*1-4]
2564 movd mm5, [r1+r2*2-4]
2587 ;-----------------------------------------------------------------------------
2588 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2589 ;-----------------------------------------------------------------------------
2592 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2596 mova m2, m0 ;t0 t1 t2 t3
2597 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2599 psrlq m0, 8 ;t1 t2 t3 t4
2600 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2607 ;-----------------------------------------------------------------------------
2608 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2609 ;-----------------------------------------------------------------------------
2611 cglobal pred4x4_down_left_mmxext, 3,3
2621 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2633 ;-----------------------------------------------------------------------------
2634 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2635 ;-----------------------------------------------------------------------------
2638 cglobal pred4x4_vertical_left_mmxext, 3,3
2648 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2658 ;-----------------------------------------------------------------------------
2659 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2660 ;-----------------------------------------------------------------------------
2663 cglobal pred4x4_horizontal_up_mmxext, 3,3
2666 movd m0, [r0+r2*1-4]
2667 punpcklbw m0, [r0+r2*2-4]
2668 movd m1, [r1+r2*1-4]
2669 punpcklbw m1, [r1+r2*2-4]
2681 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2691 ;-----------------------------------------------------------------------------
2692 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2693 ;-----------------------------------------------------------------------------
2696 %define PALIGNR PALIGNR_MMX
2697 cglobal pred4x4_horizontal_down_mmxext, 3,3
2700 movh m0, [r0-4] ; lt ..
2701 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2702 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2703 movd m1, [r1+r2*2-4] ; l3
2704 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2705 movd m2, [r0+r2*2-4] ; l1
2706 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2707 punpckhwd m1, m2 ; l0 l1 l2 l3
2708 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2712 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2713 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2715 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2718 PALIGNR m3, m5, 6, m4
2727 ;-----------------------------------------------------------------------------
2728 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2729 ;-----------------------------------------------------------------------------
2732 %define PALIGNR PALIGNR_MMX
2733 cglobal pred4x4_vertical_right_mmxext, 3,3
2736 movh m0, [r0] ; ........t3t2t1t0
2738 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2740 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2742 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2744 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2745 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2751 PALIGNR m5, m1, 7, m2
2754 PALIGNR m3, m1, 7, m1
2758 ;-----------------------------------------------------------------------------
2759 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2760 ;-----------------------------------------------------------------------------
2763 %define PALIGNR PALIGNR_MMX
2764 cglobal pred4x4_down_right_mmxext, 3,3
2768 movq m2, [r0+r2*1-8]
2769 punpckhbw m2, [r0-8]
2772 PALIGNR m3, m1, 5, m1
2774 PALIGNR m3, [r1+r2*1-8], 7, m4
2776 PALIGNR m3, [r1+r2*2-8], 7, m4
2777 PRED4x4_LOWPASS m0, m3, m1, m2, m4