1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; ---------------------------------------------------------------------------
; Constant tables shared by the predictors below.
; ---------------------------------------------------------------------------
; tm_shuf: the byte pair 0x03,0x80 repeated 8 times (16 bytes). Loaded into
; xmm4 by pred8x8_tm_vp8_ssse3.
; NOTE(review): presumably a pshufb control mask - confirm against the
; instructions that consume xmm4.
30 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00. Loaded into xmm6 by the sse2/ssse3
; pred8x8l_vertical_right macro.
31 pw_ff00: times 8 dw 0xff00
; plane_shuf: 16 signed byte weights (-8..-1, then 1..8). Used as the memory
; operand of pmaddubsw when the 16x16 plane predictor forms its H coefficients.
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
33 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: 8x8 counterpart of plane_shuf: weights -4..-1 and 1..4, each
; group followed by four zero bytes. Used with pmaddubsw in the 8x8 plane
; predictor.
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
35 db 1, 2, 3, 4, 0, 0, 0, 0
; pw_0to7: word ramp 0..7. Multiplied (pmullw) by the splatted H value to give
; 0*H, 1*H, ..., 7*H in both plane predictors.
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
; pw_1to8 / pw_m8tom1: word weights for the 16x16 plane predictor (pmullw).
; Some code paths also address them at offset +8, i.e. starting at the fifth
; word.
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
; pw_m4to4: word weights -4..-1, 1..4 (no zero entry) for the 8x8 plane
; predictor (pmullw). Also addressed at offset +8.
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_mmx, 2,3
71 cglobal pred16x16_vertical_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
91 cglobal pred16x16_horizontal_%1, 2,3
133 ;-----------------------------------------------------------------------------
134 ; void pred16x16_dc(uint8_t *src, int stride)
135 ;-----------------------------------------------------------------------------
137 %macro PRED16x16_DC 1
138 cglobal pred16x16_dc_%1, 2,7
146 movzx r5d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
152 movzx r3d, byte [r0+r1*1]
157 movzx r2d, byte [r0+r1*0]
204 ;-----------------------------------------------------------------------------
205 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
206 ;-----------------------------------------------------------------------------
208 %macro PRED16x16_TM_MMX 1
209 cglobal pred16x16_tm_vp8_%1, 2,5
220 movzx r3d, byte [r0-1]
223 movzx r2d, byte [r0+r1-1]
250 PRED16x16_TM_MMX mmxext
252 cglobal pred16x16_tm_vp8_sse2, 2,6,6
259 movzx r4d, byte [r0-1]
262 movzx r2d, byte [r0+r1*1-1]
263 movzx r3d, byte [r0+r1*2-1]
268 pshuflw xmm2, xmm2, 0
269 pshuflw xmm4, xmm4, 0
270 punpcklqdq xmm2, xmm2
271 punpcklqdq xmm4, xmm4
280 movdqa [r0+r1*1], xmm2
281 movdqa [r0+r1*2], xmm4
287 ;-----------------------------------------------------------------------------
288 ; void pred16x16_plane(uint8_t *src, int stride)
289 ;-----------------------------------------------------------------------------
291 %macro H264_PRED16x16_PLANE 3
292 cglobal pred16x16_plane_%3_%1, 2, 7, %2
306 pmullw m0, [pw_m8tom1 ]
307 pmullw m1, [pw_m8tom1+8]
308 pmullw m2, [pw_1to8 ]
309 pmullw m3, [pw_1to8 +8]
318 pmullw m0, [pw_m8tom1]
322 movhps m0, [r0+r1 +8]
323 pmaddubsw m0, [plane_shuf] ; H coefficients
345 paddw m0, m1 ; sum of H coefficients
357 movzx e_reg, byte [r3+r2*2 ]
358 movzx r5, byte [r4+r1 ]
361 movzx e_reg, byte [r3+r2 ]
366 movzx e_reg, byte [r3+r1 ]
367 movzx r6, byte [r4+r2*2 ]
371 movzx e_reg, byte [r3 ]
373 movzx r10, byte [r4+r2 ]
376 movzx r6, byte [r4+r2 ]
385 movzx r4, byte [e_reg+r2 ]
397 movzx r4, byte [e_reg ]
399 movzx r10, byte [r3 +r2 ]
403 movzx r6, byte [r3 +r2 ]
409 movzx r4, byte [e_reg+r1 ]
410 movzx r6, byte [r3 +r2*2]
417 movzx r4, byte [e_reg+r2*2]
418 movzx r6, byte [r3 +r1 ]
421 add r5, r6 ; sum of V coefficients
438 lea r5, [r5*5] ; 5*(V/4)
442 sar r5, 4 ; (5*(V/4))/16
445 movzx r4, byte [r0+r1 +15]
446 movzx r3, byte [r3+r2*2 ]
464 lea r1d, [r1d*5] ; 5*(H/4)
468 sar r1d, 4 ; (5*(H/4))/16
494 punpcklqdq m0, m0 ; splat H (words)
495 punpcklqdq m1, m1 ; splat V (words)
496 punpcklqdq m3, m3 ; splat a (words)
505 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
514 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
515 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
517 paddw m5, m0 ; a + {8,9,10,11}*H
518 paddw m6, m0 ; a + {12,13,14,15}*H
523 mova m3, m0 ; b[0..7]
524 mova m4, m2 ; b[8..15]
530 mova m3, m5 ; b[8..11]
531 mova m4, m6 ; b[12..15]
544 mova m3, m0 ; b[0..7]
545 mova m4, m2 ; b[8..15]
551 mova m3, m5 ; b[8..11]
552 mova m4, m6 ; b[12..15]
; Instantiate the 16x16 plane predictor once per instruction-set suffix
; (mmx, mmx2, sse2, ssse3) and per codec variant (h264, rv40, svq3).
; Macro arguments: %1 = instruction-set suffix, %2 = value forwarded as the
; last cglobal operand, %3 = codec variant. The emitted function is named
; pred16x16_plane_<variant>_<suffix>.
; NOTE(review): the 0 / 8 in %2 is presumably the XMM register count in the
; x86inc cglobal convention - confirm against x86inc.asm.
572 H264_PRED16x16_PLANE mmx, 0, h264
573 H264_PRED16x16_PLANE mmx, 0, rv40
574 H264_PRED16x16_PLANE mmx, 0, svq3
575 H264_PRED16x16_PLANE mmx2, 0, h264
576 H264_PRED16x16_PLANE mmx2, 0, rv40
577 H264_PRED16x16_PLANE mmx2, 0, svq3
579 H264_PRED16x16_PLANE sse2, 8, h264
580 H264_PRED16x16_PLANE sse2, 8, rv40
581 H264_PRED16x16_PLANE sse2, 8, svq3
582 H264_PRED16x16_PLANE ssse3, 8, h264
583 H264_PRED16x16_PLANE ssse3, 8, rv40
584 H264_PRED16x16_PLANE ssse3, 8, svq3
586 ;-----------------------------------------------------------------------------
587 ; void pred8x8_plane(uint8_t *src, int stride)
588 ;-----------------------------------------------------------------------------
590 %macro H264_PRED8x8_PLANE 2
591 cglobal pred8x8_plane_%1, 2, 7, %2
601 pmullw m0, [pw_m4to4]
602 pmullw m1, [pw_m4to4+8]
609 pmullw m0, [pw_m4to4]
611 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
612 pmaddubsw m0, [plane8_shuf] ; H coefficients
638 paddw m0, m1 ; sum of H coefficients
654 movzx e_reg, byte [r3+r2*2 ]
655 movzx r5, byte [r4+r1 ]
658 movzx e_reg, byte [r3 ]
660 movzx r10, byte [r4+r2 ]
664 movzx r6, byte [r4+r2 ]
670 movzx e_reg, byte [r3+r1 ]
671 movzx r6, byte [r4+r2*2 ]
678 movzx e_reg, byte [r3+r2 ]
691 movzx r3, byte [r4+r2*2 ]
692 movzx r4, byte [r0+r1 +7]
719 punpcklqdq m0, m0 ; splat H (words)
720 punpcklqdq m1, m1 ; splat V (words)
721 punpcklqdq m3, m3 ; splat a (words)
726 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
727 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
730 paddw m2, m0 ; a + {4,5,6,7}*H
737 mova m3, m0 ; b[0..7]
740 mova m4, m0 ; V+b[0..7]
747 mova m3, m0 ; b[0..3]
748 mova m4, m2 ; b[4..7]
753 mova m5, m0 ; V+b[0..3]
754 mova m6, m2 ; V+b[4..7]
; Instantiate the 8x8 plane predictor for each instruction-set suffix.
; Macro arguments: %1 = suffix (function is named pred8x8_plane_<suffix>),
; %2 = value forwarded as the last cglobal operand.
; NOTE(review): %2 is presumably the XMM register count in the x86inc cglobal
; convention - confirm against x86inc.asm.
772 H264_PRED8x8_PLANE mmx, 0
773 H264_PRED8x8_PLANE mmx2, 0
775 H264_PRED8x8_PLANE sse2, 8
776 H264_PRED8x8_PLANE ssse3, 8
778 ;-----------------------------------------------------------------------------
779 ; void pred8x8_vertical(uint8_t *src, int stride)
780 ;-----------------------------------------------------------------------------
782 cglobal pred8x8_vertical_mmx, 2,2
794 ;-----------------------------------------------------------------------------
795 ; void pred8x8_horizontal(uint8_t *src, int stride)
796 ;-----------------------------------------------------------------------------
799 cglobal pred8x8_horizontal_%1, 2,3
836 ;-----------------------------------------------------------------------------
837 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838 ;-----------------------------------------------------------------------------
840 cglobal pred8x8_top_dc_mmxext, 2,5
857 pshufw mm0, mm0, 0 ; dc0 (w)
858 packuswb mm0, mm1 ; dc0,dc1 (b)
870 ;-----------------------------------------------------------------------------
871 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
872 ;-----------------------------------------------------------------------------
875 cglobal pred8x8_dc_mmxext, 2,5
884 movzx r2d, byte [r0+r1*1-1]
885 movzx r3d, byte [r0+r1*2-1]
888 movzx r3d, byte [r0+r1*1-1]
890 movzx r3d, byte [r0+r1*2-1]
894 movzx r2d, byte [r0+r1*1-1]
895 movzx r3d, byte [r0+r1*2-1]
898 movzx r3d, byte [r0+r1*1-1]
900 movzx r3d, byte [r0+r1*2-1]
907 punpckldq m0, m2 ; s0, s1, s2, s3
908 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
910 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
914 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
932 ;-----------------------------------------------------------------------------
933 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
934 ;-----------------------------------------------------------------------------
936 cglobal pred8x8_dc_rv40_mmxext, 2,7
942 movzx r5d, byte [r0+r1*1]
946 movzx r2d, byte [r0+r1*0]
947 movzx r3d, byte [r0+r1*1]
952 movzx r2d, byte [r0+r1*0]
968 ;-----------------------------------------------------------------------------
969 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
970 ;-----------------------------------------------------------------------------
972 %macro PRED8x8_TM_MMX 1
973 cglobal pred8x8_tm_vp8_%1, 2,6
980 movzx r4d, byte [r0-1]
983 movzx r2d, byte [r0+r1*1-1]
984 movzx r3d, byte [r0+r1*2-1]
1015 PRED8x8_TM_MMX mmxext
1017 cglobal pred8x8_tm_vp8_sse2, 2,6,4
1021 punpcklbw xmm0, xmm1
1022 movzx r4d, byte [r0-1]
1025 movzx r2d, byte [r0+r1*1-1]
1026 movzx r3d, byte [r0+r1*2-1]
1031 pshuflw xmm2, xmm2, 0
1032 pshuflw xmm3, xmm3, 0
1033 punpcklqdq xmm2, xmm2
1034 punpcklqdq xmm3, xmm3
1038 movq [r0+r1*1], xmm2
1039 movhps [r0+r1*2], xmm2
1045 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
1047 movdqa xmm4, [tm_shuf]
1050 punpcklbw xmm0, xmm1
1055 movd xmm2, [r0+r1*1-4]
1056 movd xmm3, [r0+r1*2-4]
1064 movq [r0+r1*1], xmm2
1065 movhps [r0+r1*2], xmm2
1071 ; dest, left, right, src, tmp
1072 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1073 %macro PRED4x4_LOWPASS 5
1083 ;-----------------------------------------------------------------------------
1084 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1085 ;-----------------------------------------------------------------------------
1087 %macro PRED8x8L_TOP_DC 1
1088 cglobal pred8x8l_top_dc_%1, 4,4
1096 PALIGNR mm2, mm0, 7, mm0
1097 PALIGNR mm1, mm4, 1, mm4
1098 test r1, r1 ; top_left
1100 test r2, r2 ; top_right
1109 test r2, r2 ; top_right
1118 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Expand PRED8x8L_TOP_DC twice. The macro body invokes PALIGNR, so PALIGNR is
; redefined before each expansion to select the implementation that variant
; uses: PALIGNR_MMX for the mmxext build, PALIGNR_SSSE3 for the ssse3 build.
1135 %define PALIGNR PALIGNR_MMX
1136 PRED8x8L_TOP_DC mmxext
1137 %define PALIGNR PALIGNR_SSSE3
1138 PRED8x8L_TOP_DC ssse3
1140 ;-----------------------------------------------------------------------------
1141 ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1142 ;-----------------------------------------------------------------------------
1144 %macro PRED8x8L_DC 1
1145 cglobal pred8x8l_dc_%1, 4,5
1148 movq mm0, [r0+r3*1-8]
1149 punpckhbw mm0, [r0+r3*0-8]
1150 movq mm1, [r4+r3*1-8]
1151 punpckhbw mm1, [r0+r3*2-8]
1155 movq mm2, [r0+r3*1-8]
1156 punpckhbw mm2, [r0+r3*0-8]
1158 movq mm3, [r0+r3*1-8]
1159 punpckhbw mm3, [r0+r3*0-8]
1163 movq mm0, [r0+r3*0-8]
1168 PALIGNR mm4, mm0, 7, mm0
1169 PALIGNR mm1, mm2, 1, mm2
1196 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1199 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1201 PALIGNR mm7, mm1, 7, mm3
1207 PALIGNR mm2, mm0, 7, mm0
1208 PALIGNR mm1, mm4, 1, mm4
1215 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1238 %define PALIGNR PALIGNR_MMX
1240 %define PALIGNR PALIGNR_SSSE3
1243 ;-----------------------------------------------------------------------------
1244 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1245 ;-----------------------------------------------------------------------------
1247 %macro PRED8x8L_HORIZONTAL 1
1248 cglobal pred8x8l_horizontal_%1, 4,4
1251 movq mm0, [r0+r3*1-8]
1252 punpckhbw mm0, [r0+r3*0-8]
1253 movq mm1, [r2+r3*1-8]
1254 punpckhbw mm1, [r0+r3*2-8]
1258 movq mm2, [r0+r3*1-8]
1259 punpckhbw mm2, [r0+r3*0-8]
1261 movq mm3, [r0+r3*1-8]
1262 punpckhbw mm3, [r0+r3*0-8]
1266 movq mm0, [r0+r3*0-8]
1271 PALIGNR mm4, mm0, 7, mm0
1272 PALIGNR mm1, mm2, 1, mm2
1273 test r1, r1 ; top_left
1283 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1286 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1288 PALIGNR mm7, mm1, 7, mm3
1294 pshufw mm0, mm3, 0xff
1295 pshufw mm1, mm3, 0xaa
1297 pshufw mm2, mm3, 0x55
1298 pshufw mm3, mm3, 0x00
1299 pshufw mm4, mm7, 0xff
1300 pshufw mm5, mm7, 0xaa
1301 pshufw mm6, mm7, 0x55
1302 pshufw mm7, mm7, 0x00
1316 %define PALIGNR PALIGNR_MMX
1317 PRED8x8L_HORIZONTAL mmxext
1318 %define PALIGNR PALIGNR_SSSE3
1319 PRED8x8L_HORIZONTAL ssse3
1321 ;-----------------------------------------------------------------------------
1322 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1323 ;-----------------------------------------------------------------------------
1325 %macro PRED8x8L_VERTICAL 1
1326 cglobal pred8x8l_vertical_%1, 4,4
1333 PALIGNR mm2, mm0, 7, mm0
1334 PALIGNR mm1, mm4, 1, mm4
1335 test r1, r1 ; top_left
1337 test r2, r2 ; top_right
1346 test r2, r2 ; top_right
1355 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1367 %define PALIGNR PALIGNR_MMX
1368 PRED8x8L_VERTICAL mmxext
1369 %define PALIGNR PALIGNR_SSSE3
1370 PRED8x8L_VERTICAL ssse3
1372 ;-----------------------------------------------------------------------------
1373 ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1374 ;-----------------------------------------------------------------------------
1377 %define PALIGNR PALIGNR_MMX
1378 cglobal pred8x8l_down_left_mmxext, 4,5
1385 PALIGNR mm2, mm0, 7, mm0
1386 PALIGNR mm1, mm4, 1, mm4
1409 pshufw mm1, mm3, 0xFF
1412 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1421 PALIGNR mm2, mm3, 7, mm3
1422 PALIGNR mm5, mm4, 1, mm4
1423 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1431 PALIGNR mm2, mm7, 1, mm0
1433 PALIGNR mm3, mm7, 7, mm0
1434 PALIGNR mm4, mm6, 1, mm0
1440 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1441 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1485 %macro PRED8x8L_DOWN_LEFT 1
1486 cglobal pred8x8l_down_left_%1, 4,4
1493 PALIGNR mm2, mm0, 7, mm0
1494 PALIGNR mm1, mm4, 1, mm4
1495 test r1, r1 ; top_left
1497 test r2, r2 ; top_right
1506 test r2, r2 ; top_right
1517 pshufw mm1, mm3, 0xFF
1520 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1522 test r2, r2 ; top_right
1529 PALIGNR mm2, mm3, 7, mm3
1530 PALIGNR mm5, mm4, 1, mm4
1531 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1547 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1549 movq [r0+r3*1], xmm0
1551 movq [r0+r3*2], xmm0
1554 movq [r1+r3*1], xmm0
1556 movq [r1+r3*2], xmm0
1558 movq [r2+r3*1], xmm0
1560 movq [r2+r3*2], xmm0
1562 movq [r0+r3*1], xmm0
1564 movq [r0+r3*2], xmm0
1569 %define PALIGNR PALIGNR_MMX
1570 PRED8x8L_DOWN_LEFT sse2
1572 %define PALIGNR PALIGNR_SSSE3
1573 PRED8x8L_DOWN_LEFT ssse3
1575 ;-----------------------------------------------------------------------------
1576 ; void pred8x8l_down_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1577 ;-----------------------------------------------------------------------------
1580 %define PALIGNR PALIGNR_MMX
1581 cglobal pred8x8l_down_right_mmxext, 4,5
1584 movq mm0, [r0+r3*1-8]
1585 punpckhbw mm0, [r0+r3*0-8]
1586 movq mm1, [r4+r3*1-8]
1587 punpckhbw mm1, [r0+r3*2-8]
1591 movq mm2, [r0+r3*1-8]
1592 punpckhbw mm2, [r0+r3*0-8]
1594 movq mm3, [r0+r3*1-8]
1595 punpckhbw mm3, [r0+r3*0-8]
1599 movq mm0, [r0+r3*0-8]
1604 PALIGNR mm4, mm0, 7, mm0
1605 PALIGNR mm1, mm2, 1, mm2
1606 test r1, r1 ; top_left
1610 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1614 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1616 PALIGNR mm7, mm1, 7, mm3
1622 PALIGNR mm2, mm0, 7, mm0
1623 PALIGNR mm1, mm4, 1, mm4
1624 test r1, r1 ; top_left
1626 test r2, r2 ; top_right
1629 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1645 test r2, r2 ; top_right
1661 PALIGNR mm2, mm6, 1, mm0
1663 PALIGNR mm3, mm6, 7, mm0
1667 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1668 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1712 %macro PRED8x8L_DOWN_RIGHT 1
1713 cglobal pred8x8l_down_right_%1, 4,5
1716 movq mm0, [r0+r3*1-8]
1717 punpckhbw mm0, [r0+r3*0-8]
1718 movq mm1, [r4+r3*1-8]
1719 punpckhbw mm1, [r0+r3*2-8]
1723 movq mm2, [r0+r3*1-8]
1724 punpckhbw mm2, [r0+r3*0-8]
1726 movq mm3, [r0+r3*1-8]
1727 punpckhbw mm3, [r0+r3*0-8]
1731 movq mm0, [r0+r3*0-8]
1736 PALIGNR mm4, mm0, 7, mm0
1737 PALIGNR mm1, mm2, 1, mm2
1765 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1769 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1771 PALIGNR mm7, mm1, 7, mm3
1778 PALIGNR mm2, mm0, 7, mm0
1779 PALIGNR mm1, mm4, 1, mm4
1785 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1802 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1805 movq [r0+r3*2], xmm0
1806 movq [r0+r3*1], xmm1
1809 movq [r2+r3*2], xmm0
1810 movq [r2+r3*1], xmm1
1813 movq [r1+r3*2], xmm0
1814 movq [r1+r3*1], xmm1
1817 movq [r4+r3*2], xmm0
1818 movq [r4+r3*1], xmm1
1823 %define PALIGNR PALIGNR_MMX
1824 PRED8x8L_DOWN_RIGHT sse2
1826 %define PALIGNR PALIGNR_SSSE3
1827 PRED8x8L_DOWN_RIGHT ssse3
1829 ;-----------------------------------------------------------------------------
1830 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1831 ;-----------------------------------------------------------------------------
1834 %define PALIGNR PALIGNR_MMX
1835 cglobal pred8x8l_vertical_right_mmxext, 4,5
1838 movq mm0, [r0+r3*1-8]
1839 punpckhbw mm0, [r0+r3*0-8]
1840 movq mm1, [r4+r3*1-8]
1841 punpckhbw mm1, [r0+r3*2-8]
1845 movq mm2, [r0+r3*1-8]
1846 punpckhbw mm2, [r0+r3*0-8]
1848 movq mm3, [r0+r3*1-8]
1849 punpckhbw mm3, [r0+r3*0-8]
1853 movq mm0, [r0+r3*0-8]
1858 PALIGNR mm4, mm0, 7, mm0
1859 PALIGNR mm1, mm2, 1, mm2
1887 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1894 PALIGNR mm2, mm0, 7, mm0
1895 PALIGNR mm1, mm4, 1, mm4
1901 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1905 PALIGNR mm3, mm7, 7, mm0
1906 PALIGNR mm6, mm7, 6, mm1
1910 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1921 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1922 PALIGNR mm6, mm0, 7, mm2
1925 PALIGNR mm5, mm0, 7, mm1
1928 PALIGNR mm6, mm0, 7, mm2
1931 PALIGNR mm5, mm0, 7, mm1
1934 PALIGNR mm6, mm0, 7, mm2
1937 PALIGNR mm5, mm0, 7, mm1
1941 %macro PRED8x8L_VERTICAL_RIGHT 1
1942 cglobal pred8x8l_vertical_right_%1, 4,5,7
1945 movq mm0, [r0+r3*1-8]
1946 punpckhbw mm0, [r0+r3*0-8]
1947 movq mm1, [r4+r3*1-8]
1948 punpckhbw mm1, [r0+r3*2-8]
1952 movq mm2, [r0+r3*1-8]
1953 punpckhbw mm2, [r0+r3*0-8]
1955 movq mm3, [r0+r3*1-8]
1956 punpckhbw mm3, [r0+r3*0-8]
1960 movq mm0, [r0+r3*0-8]
1965 PALIGNR mm4, mm0, 7, mm0
1966 PALIGNR mm1, mm2, 1, mm2
1993 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2000 PALIGNR mm2, mm0, 7, mm0
2001 PALIGNR mm1, mm4, 1, mm4
2007 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
2012 movdqa xmm6, [pw_ff00]
2021 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
2027 movhps [r0+r3*2], xmm5
2028 movhps [r0+r3*1], xmm2
2036 movq [r0+r3*2], xmm5
2037 movq [r0+r3*1], xmm2
2040 movq [r2+r3*2], xmm5
2041 movq [r2+r3*1], xmm2
2044 movq [r1+r3*2], xmm5
2045 movq [r1+r3*1], xmm2
2050 %define PALIGNR PALIGNR_MMX
2051 PRED8x8L_VERTICAL_RIGHT sse2
2053 %define PALIGNR PALIGNR_SSSE3
2054 PRED8x8L_VERTICAL_RIGHT ssse3
2056 ;-----------------------------------------------------------------------------
2057 ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2058 ;-----------------------------------------------------------------------------
2060 %macro PRED8x8L_VERTICAL_LEFT 1
2061 cglobal pred8x8l_vertical_left_%1, 4,4
2068 PALIGNR mm2, mm0, 7, mm0
2069 PALIGNR mm1, mm4, 1, mm4
2092 pshufw mm1, mm3, 0xFF
2095 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2104 PALIGNR mm2, mm3, 7, mm3
2105 PALIGNR mm5, mm4, 1, mm4
2106 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2120 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2122 movq [r0+r3*1], xmm3
2123 movq [r0+r3*2], xmm0
2127 movq [r1+r3*1], xmm3
2128 movq [r1+r3*2], xmm0
2131 movq [r2+r3*1], xmm3
2132 movq [r2+r3*2], xmm0
2135 movq [r0+r3*1], xmm3
2136 movq [r0+r3*2], xmm0
2141 %define PALIGNR PALIGNR_MMX
2142 PRED8x8L_VERTICAL_LEFT sse2
2143 %define PALIGNR PALIGNR_SSSE3
2145 PRED8x8L_VERTICAL_LEFT ssse3
2147 ;-----------------------------------------------------------------------------
2148 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2149 ;-----------------------------------------------------------------------------
2151 %macro PRED8x8L_HORIZONTAL_UP 1
2152 cglobal pred8x8l_horizontal_up_%1, 4,4
2155 movq mm0, [r0+r3*1-8]
2156 punpckhbw mm0, [r0+r3*0-8]
2157 movq mm1, [r2+r3*1-8]
2158 punpckhbw mm1, [r0+r3*2-8]
2162 movq mm2, [r0+r3*1-8]
2163 punpckhbw mm2, [r0+r3*0-8]
2165 movq mm3, [r0+r3*1-8]
2166 punpckhbw mm3, [r0+r3*0-8]
2170 movq mm0, [r0+r3*0-8]
2175 PALIGNR mm4, mm0, 7, mm0
2176 PALIGNR mm1, mm2, 1, mm2
2187 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2190 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2192 PALIGNR mm7, mm1, 7, mm3
2194 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2195 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2199 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2206 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2208 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2210 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2212 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2213 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2217 PALIGNR mm5, mm4, 2, mm1
2218 pshufw mm1, mm6, 11111001b
2219 PALIGNR mm6, mm4, 4, mm2
2220 pshufw mm2, mm7, 11111110b
2221 PALIGNR mm7, mm4, 6, mm3
2222 pshufw mm3, mm0, 11111111b
2236 %define PALIGNR PALIGNR_MMX
2237 PRED8x8L_HORIZONTAL_UP mmxext
2238 %define PALIGNR PALIGNR_SSSE3
2239 PRED8x8L_HORIZONTAL_UP ssse3
2241 ;-----------------------------------------------------------------------------
2242 ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2243 ;-----------------------------------------------------------------------------
2246 %define PALIGNR PALIGNR_MMX
2247 cglobal pred8x8l_horizontal_down_mmxext, 4,5
2250 movq mm0, [r0+r3*1-8]
2251 punpckhbw mm0, [r0+r3*0-8]
2252 movq mm1, [r4+r3*1-8]
2253 punpckhbw mm1, [r0+r3*2-8]
2257 movq mm2, [r0+r3*1-8]
2258 punpckhbw mm2, [r0+r3*0-8]
2260 movq mm3, [r0+r3*1-8]
2261 punpckhbw mm3, [r0+r3*0-8]
2265 movq mm0, [r0+r3*0-8]
2270 PALIGNR mm4, mm0, 7, mm0
2271 PALIGNR mm1, mm2, 1, mm2
2298 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2302 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2304 PALIGNR mm7, mm1, 7, mm3
2310 PALIGNR mm2, mm0, 7, mm0
2311 PALIGNR mm1, mm4, 1, mm4
2317 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2324 PALIGNR mm2, mm6, 7, mm5
2325 PALIGNR mm6, mm7, 7, mm0
2327 PALIGNR mm4, mm3, 1, mm7
2330 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2336 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2344 PALIGNR mm7, mm3, 2, mm5
2346 PALIGNR mm1, mm3, 4, mm5
2348 PALIGNR mm0, mm3, 6, mm3
2353 PALIGNR mm6, mm4, 2, mm5
2355 PALIGNR mm2, mm4, 4, mm5
2357 PALIGNR mm3, mm4, 6, mm4
2361 %macro PRED8x8L_HORIZONTAL_DOWN 1
2362 cglobal pred8x8l_horizontal_down_%1, 4,5
2365 movq mm0, [r0+r3*1-8]
2366 punpckhbw mm0, [r0+r3*0-8]
2367 movq mm1, [r4+r3*1-8]
2368 punpckhbw mm1, [r0+r3*2-8]
2372 movq mm2, [r0+r3*1-8]
2373 punpckhbw mm2, [r0+r3*0-8]
2375 movq mm3, [r0+r3*1-8]
2376 punpckhbw mm3, [r0+r3*0-8]
2380 movq mm0, [r0+r3*0-8]
2385 PALIGNR mm4, mm0, 7, mm0
2386 PALIGNR mm1, mm2, 1, mm2
2413 pshufw mm1, mm3, 0xFF
2417 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2421 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2431 PALIGNR mm2, mm0, 7, mm0
2432 PALIGNR mm1, mm4, 1, mm4
2438 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2447 PALIGNR mm2, mm3, 7, mm3
2448 PALIGNR mm5, mm4, 1, mm4
2449 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2458 PALIGNR xmm1, xmm0, 7, xmm4
2459 PALIGNR xmm2, xmm0, 9, xmm5
2461 PALIGNR xmm3, xmm0, 8, xmm0
2465 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2466 punpcklbw xmm4, xmm0
2468 movq [r0+r3*2], xmm4
2469 movq [r2+r3*2], xmm0
2472 movq [r0+r3*1], xmm4
2473 movq [r2+r3*1], xmm0
2476 movq [r1+r3*2], xmm4
2477 movq [r4+r3*2], xmm0
2480 movq [r1+r3*1], xmm4
2481 movq [r4+r3*1], xmm0
2486 %define PALIGNR PALIGNR_MMX
2487 PRED8x8L_HORIZONTAL_DOWN sse2
2489 %define PALIGNR PALIGNR_SSSE3
2490 PRED8x8L_HORIZONTAL_DOWN ssse3
2493 ;-----------------------------------------------------------------------------
2494 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2495 ;-----------------------------------------------------------------------------
2497 cglobal pred4x4_dc_mmxext, 3,5
2503 movzx r1d, byte [r0+r2*1-1]
2506 movzx r1d, byte [r0+r2*2-1]
2509 movzx r1d, byte [r0+r2*1-1]
2511 movzx r1d, byte [r0+r2*2-1]
2515 imul r3d, 0x01010101
2522 ;-----------------------------------------------------------------------------
2523 ; void pred4x4_tm_vp8(uint8_t *src, const uint8_t *topright, int stride)
2524 ;-----------------------------------------------------------------------------
2526 %macro PRED4x4_TM_MMX 1
2527 cglobal pred4x4_tm_vp8_%1, 3,6
2532 movzx r4d, byte [r0-1]
2535 movzx r1d, byte [r0+r2*1-1]
2536 movzx r3d, byte [r0+r2*2-1]
2563 PRED4x4_TM_MMX mmxext
2565 cglobal pred4x4_tm_vp8_ssse3, 3,3
2574 movd mm2, [r0+r2*1-4]
2575 movd mm3, [r0+r2*2-4]
2576 movd mm4, [r1+r2*1-4]
2577 movd mm5, [r1+r2*2-4]
2600 ;-----------------------------------------------------------------------------
2601 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2602 ;-----------------------------------------------------------------------------
2605 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2609 mova m2, m0 ;t0 t1 t2 t3
2610 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2612 psrlq m0, 8 ;t1 t2 t3 t4
2613 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2620 ;-----------------------------------------------------------------------------
2621 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2622 ;-----------------------------------------------------------------------------
2625 cglobal pred4x4_down_left_mmxext, 3,3
2636 PRED4x4_LOWPASS m0, m1, m3, m4, m5
2648 ;-----------------------------------------------------------------------------
2649 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2650 ;-----------------------------------------------------------------------------
2653 cglobal pred4x4_vertical_left_mmxext, 3,3
2663 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2673 ;-----------------------------------------------------------------------------
2674 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2675 ;-----------------------------------------------------------------------------
2678 cglobal pred4x4_horizontal_up_mmxext, 3,3
2681 movd m0, [r0+r2*1-4]
2682 punpcklbw m0, [r0+r2*2-4]
2683 movd m1, [r1+r2*1-4]
2684 punpcklbw m1, [r1+r2*2-4]
2696 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2706 ;-----------------------------------------------------------------------------
2707 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2708 ;-----------------------------------------------------------------------------
2711 %define PALIGNR PALIGNR_MMX
2712 cglobal pred4x4_horizontal_down_mmxext, 3,3
2715 movh m0, [r0-4] ; lt ..
2716 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2717 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2718 movd m1, [r1+r2*2-4] ; l3
2719 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2720 movd m2, [r0+r2*2-4] ; l1
2721 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2722 punpckhwd m1, m2 ; l0 l1 l2 l3
2723 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2727 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2728 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2730 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2733 PALIGNR m3, m5, 6, m4
2742 ;-----------------------------------------------------------------------------
2743 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2744 ;-----------------------------------------------------------------------------
2747 %define PALIGNR PALIGNR_MMX
2748 cglobal pred4x4_vertical_right_mmxext, 3,3
2751 movh m0, [r0] ; ........t3t2t1t0
2753 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2755 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2757 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2759 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2760 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2766 PALIGNR m5, m1, 7, m2
2769 PALIGNR m3, m1, 7, m1
2773 ;-----------------------------------------------------------------------------
2774 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2775 ;-----------------------------------------------------------------------------
2778 %define PALIGNR PALIGNR_MMX
2779 cglobal pred4x4_down_right_mmxext, 3,3
2783 movq m2, [r0+r2*1-8]
2784 punpckhbw m2, [r0-8]
2787 PALIGNR m3, m1, 5, m1
2789 PALIGNR m3, [r1+r2*1-8], 7, m4
2791 PALIGNR m3, [r1+r2*2-8], 7, m4
2792 PRED4x4_LOWPASS m0, m3, m1, m2, m4