1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables referenced by the prediction routines in this file.
; tm_shuf: the byte pair (0x03, 0x80) repeated 8 times = 16 bytes; loaded
; into xmm4 by pred8x8_tm_vp8_ssse3.
30 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00; loaded into xmm6 by PRED8x8L_VERTICAL_RIGHT.
31 pw_ff00: times 8 dw 0xff00
; plane_shuf: signed byte weights -8..-1 followed by 1..8 (16 bytes); the
; pmaddubsw operand that yields the H coefficients in H264_PRED16x16_PLANE.
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
33 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: weights -4..-1 and 1..4, each group padded with four zero
; bytes; the pmaddubsw operand in H264_PRED8x8_PLANE.
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
35 db 1, 2, 3, 4, 0, 0, 0, 0
; pw_0to7: word ramp 0..7; multiplied by the splatted H value in both plane
; macros to form 0*H .. 7*H.
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
; pw_1to8 / pw_m8tom1: word weights for the pmullw path of
; H264_PRED16x16_PLANE. The "+8" used there is a byte offset, i.e. it
; addresses words 4..7 of the table.
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
; pw_m4to4: word weights -4..-1, 1..4 for the pmullw path of
; H264_PRED8x8_PLANE.
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
; NOTE(review): only part of each body is visible in this listing. Presumably
; r0 = src and r1 = stride (the two declared arguments) -- confirm against
; the cglobal definition in x86inc.asm.
56 cglobal pred16x16_vertical_mmx, 2,3
71 cglobal pred16x16_vertical_sse, 2,3
; Store the 16 bytes held in xmm0 to the rows at r0+r1 and r0+2*r1.
; movaps faults if the address is not 16-byte aligned.
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
; Same pair of stores again; the instruction between the two pairs is not
; visible (presumably it advances r0 -- TODO confirm).
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
; %1 is a macro parameter appended to the exported symbol name; the enclosing
; %macro line and the routine body are not visible in this listing.
91 cglobal pred16x16_horizontal_%1, 2,3
133 ;-----------------------------------------------------------------------------
134 ; void pred16x16_dc(uint8_t *src, int stride)
135 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
137 %macro PRED16x16_DC 1
138 cglobal pred16x16_dc_%1, 2,7
; The visible loads zero-extend single bytes from the row at r0 (r1*0) and
; from the row at r0+r1 (r1*1) into 32-bit registers.
; NOTE(review): presumably these are left-column pixels being accumulated
; for the DC value, with r0 adjusted by lines not shown -- TODO confirm.
146 movzx r5d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
152 movzx r3d, byte [r0+r1*1]
157 movzx r2d, byte [r0+r1*0]
204 ;-----------------------------------------------------------------------------
205 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
206 ;-----------------------------------------------------------------------------
; MMX-register variant, generated by macro; %1 is the symbol suffix
; (instantiated below with "mmxext").
208 %macro PRED16x16_TM_MMX 1
209 cglobal pred16x16_tm_vp8_%1, 2,5
; Zero-extended byte immediately before r0.
220 movzx r3d, byte [r0-1]
; Zero-extended byte at offset -1 of the row at r0+r1.
223 movzx r2d, byte [r0+r1-1]
250 PRED16x16_TM_MMX mmxext
; SSE2 variant; the third cglobal operand after the name (6) is presumably
; the number of XMM registers used -- confirm against x86inc.asm.
252 cglobal pred16x16_tm_vp8_sse2, 2,6,6
; Zero-extended bytes at offset -1: at r0, and on the rows r0+r1 and r0+2*r1.
259 movzx r4d, byte [r0-1]
262 movzx r2d, byte [r0+r1*1-1]
263 movzx r3d, byte [r0+r1*2-1]
; pshuflw with immediate 0 copies word 0 into the four low words; punpcklqdq
; of a register with itself copies the low quadword into the high one.
; Together they broadcast word 0 of xmm2 / xmm4 to all eight words.
268 pshuflw xmm2, xmm2, 0
269 pshuflw xmm4, xmm4, 0
270 punpcklqdq xmm2, xmm2
271 punpcklqdq xmm4, xmm4
; Aligned 16-byte stores of the two result rows (movdqa requires 16-byte
; aligned addresses).
280 movdqa [r0+r1*1], xmm2
281 movdqa [r0+r1*2], xmm4
287 ;-----------------------------------------------------------------------------
288 ; void pred16x16_plane(uint8_t *src, int stride)
289 ;-----------------------------------------------------------------------------
; Macro parameters, as used on the cglobal line and in the instantiations at
; the end of this block:
;   %1 = instruction-set suffix of the symbol (mmx, mmx2, sse2, ssse3)
;   %2 = last cglobal operand (0 for mmx/mmx2, 8 for sse2/ssse3; presumably
;        the XMM register count -- confirm against x86inc.asm)
;   %3 = codec variant in the symbol name (h264, rv40, svq3)
; NOTE(review): many lines of the body, including its conditional-assembly
; directives, are not visible in this listing.
291 %macro H264_PRED16x16_PLANE 3
292 cglobal pred16x16_plane_%3_%1, 2, 7, %2
; Word multiplies by the weight tables; "+8" is a byte offset selecting
; words 4..7 of the table.
306 pmullw m0, [pw_m8tom1 ]
307 pmullw m1, [pw_m8tom1+8]
308 pmullw m2, [pw_1to8 ]
309 pmullw m3, [pw_1to8 +8]
318 pmullw m0, [pw_m8tom1]
; movhps fills the upper 8 bytes of m0 from memory; pmaddubsw then multiplies
; bytes by the signed weights in plane_shuf and adds adjacent pairs.
322 movhps m0, [r0+r1 +8]
323 pmaddubsw m0, [plane_shuf] ; H coefficients
345 paddw m0, m1 ; sum of H coefficients
; e_reg is not an architectural register name; it is an alias whose
; definition lies outside the visible lines.
357 movzx e_reg, byte [r3+r2*2 ]
358 movzx r5, byte [r4+r1 ]
361 movzx e_reg, byte [r3+r2 ]
366 movzx e_reg, byte [r3+r1 ]
367 movzx r6, byte [r4+r2*2 ]
371 movzx e_reg, byte [r3 ]
; The next two lines load the same byte into r10 and r6 respectively;
; presumably they sit in alternative conditional-assembly branches -- confirm.
373 movzx r10, byte [r4+r2 ]
376 movzx r6, byte [r4+r2 ]
385 movzx r4, byte [e_reg+r2 ]
397 movzx r4, byte [e_reg ]
; Same r10 / r6 duplication as above.
399 movzx r10, byte [r3 +r2 ]
403 movzx r6, byte [r3 +r2 ]
409 movzx r4, byte [e_reg+r1 ]
410 movzx r6, byte [r3 +r2*2]
417 movzx r4, byte [e_reg+r2*2]
418 movzx r6, byte [r3 +r1 ]
421 add r5, r6 ; sum of V coefficients
438 lea r5, [r5*5] ; 5*(V/4)
442 sar r5, 4 ; (5*(V/4))/16
445 movzx r4, byte [r0+r1 +15]
446 movzx r3, byte [r3+r2*2 ]
464 lea r1d, [r1d*5] ; 5*(H/4)
468 sar r1d, 4 ; (5*(H/4))/16
494 punpcklqdq m0, m0 ; splat H (words)
495 punpcklqdq m1, m1 ; splat V (words)
496 punpcklqdq m3, m3 ; splat a (words)
505 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
514 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
515 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
517 paddw m5, m0 ; a + {8,9,10,11}*H
518 paddw m6, m0 ; a + {12,13,14,15}*H
523 mova m3, m0 ; b[0..7]
524 mova m4, m2 ; b[8..15]
530 mova m3, m5 ; b[8..11]
531 mova m4, m6 ; b[12..15]
544 mova m3, m0 ; b[0..7]
545 mova m4, m2 ; b[8..15]
551 mova m3, m5 ; b[8..11]
552 mova m4, m6 ; b[12..15]
; Instantiations: 4 instruction sets x 3 codec variants = 12 functions.
572 H264_PRED16x16_PLANE mmx, 0, h264
573 H264_PRED16x16_PLANE mmx, 0, rv40
574 H264_PRED16x16_PLANE mmx, 0, svq3
575 H264_PRED16x16_PLANE mmx2, 0, h264
576 H264_PRED16x16_PLANE mmx2, 0, rv40
577 H264_PRED16x16_PLANE mmx2, 0, svq3
579 H264_PRED16x16_PLANE sse2, 8, h264
580 H264_PRED16x16_PLANE sse2, 8, rv40
581 H264_PRED16x16_PLANE sse2, 8, svq3
582 H264_PRED16x16_PLANE ssse3, 8, h264
583 H264_PRED16x16_PLANE ssse3, 8, rv40
584 H264_PRED16x16_PLANE ssse3, 8, svq3
586 ;-----------------------------------------------------------------------------
587 ; void pred8x8_plane(uint8_t *src, int stride)
588 ;-----------------------------------------------------------------------------
; Macro parameters, as used on the cglobal line and in the instantiations at
; the end of this block:
;   %1 = instruction-set suffix of the symbol (mmx, mmx2, sse2, ssse3)
;   %2 = last cglobal operand (0 for mmx/mmx2, 8 for sse2/ssse3; presumably
;        the XMM register count -- confirm against x86inc.asm)
; NOTE(review): many lines of the body are not visible in this listing.
590 %macro H264_PRED8x8_PLANE 2
591 cglobal pred8x8_plane_%1, 2, 7, %2
; Word multiplies by the -4..-1 / 1..4 weights; "+8" is a byte offset
; selecting words 4..7 of pw_m4to4.
601 pmullw m0, [pw_m4to4]
602 pmullw m1, [pw_m4to4+8]
609 pmullw m0, [pw_m4to4]
611 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
612 pmaddubsw m0, [plane8_shuf] ; H coefficients
638 paddw m0, m1 ; sum of H coefficients
; e_reg is an alias whose definition lies outside the visible lines.
650 movzx e_reg, byte [r3+r2*2 ]
651 movzx r5, byte [r4+r1 ]
654 movzx e_reg, byte [r3 ]
; The next two lines load the same byte into r10 and r6 respectively;
; presumably alternative conditional-assembly branches -- confirm.
656 movzx r10, byte [r4+r2 ]
660 movzx r6, byte [r4+r2 ]
666 movzx e_reg, byte [r3+r1 ]
667 movzx r6, byte [r4+r2*2 ]
674 movzx e_reg, byte [r3+r2 ]
687 movzx r3, byte [r4+r2*2 ]
688 movzx r4, byte [r0+r1 +7]
719 punpcklqdq m0, m0 ; splat H (words)
720 punpcklqdq m1, m1 ; splat V (words)
721 punpcklqdq m3, m3 ; splat a (words)
726 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
727 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
730 paddw m2, m0 ; a + {4,5,6,7}*H
737 mova m3, m0 ; b[0..7]
740 mova m4, m0 ; V+b[0..7]
747 mova m3, m0 ; b[0..3]
748 mova m4, m2 ; b[4..7]
753 mova m5, m0 ; V+b[0..3]
754 mova m6, m2 ; V+b[4..7]
; Instantiations: one function per instruction set.
772 H264_PRED8x8_PLANE mmx, 0
773 H264_PRED8x8_PLANE mmx2, 0
775 H264_PRED8x8_PLANE sse2, 8
776 H264_PRED8x8_PLANE ssse3, 8
778 ;-----------------------------------------------------------------------------
779 ; void pred8x8_vertical(uint8_t *src, int stride)
780 ;-----------------------------------------------------------------------------
; Only the declaration is visible in this listing; the body is not shown.
782 cglobal pred8x8_vertical_mmx, 2,2
794 ;-----------------------------------------------------------------------------
795 ; void pred8x8_horizontal(uint8_t *src, int stride)
796 ;-----------------------------------------------------------------------------
; %1 is a macro parameter appended to the exported symbol name; the enclosing
; %macro line and the routine body are not visible in this listing.
799 cglobal pred8x8_horizontal_%1, 2,3
836 ;-----------------------------------------------------------------------------
837 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838 ;-----------------------------------------------------------------------------
840 cglobal pred8x8_top_dc_mmxext, 2,5
; pshufw with immediate 0 replicates word 0 to all four words of mm0;
; packuswb then narrows the words of mm0 and mm1 to unsigned bytes with
; saturation, mm0's bytes in the low half and mm1's in the high half.
857 pshufw mm0, mm0, 0 ; dc0 (w)
858 packuswb mm0, mm1 ; dc0,dc1 (b)
870 ;-----------------------------------------------------------------------------
871 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
872 ;-----------------------------------------------------------------------------
875 cglobal pred8x8_dc_mmxext, 2,5
; Zero-extended loads of the byte at offset -1 on the rows r0+r1 and
; r0+2*r1. The same pair of addresses recurs because r0 is presumably
; advanced by lines not shown -- TODO confirm.
884 movzx r2d, byte [r0+r1*1-1]
885 movzx r3d, byte [r0+r1*2-1]
888 movzx r3d, byte [r0+r1*1-1]
890 movzx r3d, byte [r0+r1*2-1]
894 movzx r2d, byte [r0+r1*1-1]
895 movzx r3d, byte [r0+r1*2-1]
898 movzx r3d, byte [r0+r1*1-1]
900 movzx r3d, byte [r0+r1*2-1]
; pshufw selects one source word per destination word, two immediate bits
; each, lowest destination word first: 11110110b -> words 2,1,3,3 and
; 01110100b -> words 0,1,3,1, matching the s-index comments below.
907 punpckldq m0, m2 ; s0, s1, s2, s3
908 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
910 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
914 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
932 ;-----------------------------------------------------------------------------
933 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
934 ;-----------------------------------------------------------------------------
936 cglobal pred8x8_dc_rv40_mmxext, 2,7
; The visible loads zero-extend single bytes from the row at r0 (r1*0) and
; from the row at r0+r1 (r1*1); this is the same load pattern as the visible
; part of PRED16x16_DC.
; NOTE(review): presumably left-column pixels being accumulated, with r0
; adjusted by lines not shown -- TODO confirm.
942 movzx r5d, byte [r0+r1*1]
946 movzx r2d, byte [r0+r1*0]
947 movzx r3d, byte [r0+r1*1]
952 movzx r2d, byte [r0+r1*0]
968 ;-----------------------------------------------------------------------------
969 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
970 ;-----------------------------------------------------------------------------
; MMX-register variant, generated by macro; %1 is the symbol suffix
; (instantiated below with "mmxext").
972 %macro PRED8x8_TM_MMX 1
973 cglobal pred8x8_tm_vp8_%1, 2,6
; Zero-extended bytes at offset -1: at r0, and on the rows r0+r1 and r0+2*r1.
980 movzx r4d, byte [r0-1]
983 movzx r2d, byte [r0+r1*1-1]
984 movzx r3d, byte [r0+r1*2-1]
1015 PRED8x8_TM_MMX mmxext
; SSE2 variant.
1017 cglobal pred8x8_tm_vp8_sse2, 2,6,4
; Interleave the low 8 bytes of xmm0 with those of xmm1.
; NOTE(review): presumably xmm1 is zero so this widens bytes to words --
; the instruction that sets xmm1 is not visible; confirm.
1021 punpcklbw xmm0, xmm1
1022 movzx r4d, byte [r0-1]
1025 movzx r2d, byte [r0+r1*1-1]
1026 movzx r3d, byte [r0+r1*2-1]
; Broadcast word 0 of xmm2 / xmm3 to all eight words (pshuflw imm 0 fills
; the low four words, punpcklqdq duplicates the low quadword).
1031 pshuflw xmm2, xmm2, 0
1032 pshuflw xmm3, xmm3, 0
1033 punpcklqdq xmm2, xmm2
1034 punpcklqdq xmm3, xmm3
; Two 8-byte rows come out of one register: movq stores the low 8 bytes of
; xmm2, movhps stores its high 8 bytes.
1038 movq [r0+r1*1], xmm2
1039 movhps [r0+r1*2], xmm2
; SSSE3 variant.
1045 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
; Load the 16-byte tm_shuf table (aligned load).
; NOTE(review): presumably used as a pshufb control -- the pshufb itself is
; not visible in this listing; confirm.
1047 movdqa xmm4, [tm_shuf]
1050 punpcklbw xmm0, xmm1
; 4-byte loads covering offsets -4..-1 of the rows r0+r1 and r0+2*r1; the
; byte at offset -1 ends up as byte 3 of the register.
1055 movd xmm2, [r0+r1*1-4]
1056 movd xmm3, [r0+r1*2-4]
; Low half of xmm2 to one row, high half to the next.
1064 movq [r0+r1*1], xmm2
1065 movhps [r0+r1*2], xmm2
1071 ; dest, left, right, src, tmp
1072 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; The macro takes five parameters, in the order named in the first comment
; line above. Its body is not visible in this listing.
1073 %macro PRED4x4_LOWPASS 5
1083 ;-----------------------------------------------------------------------------
1084 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1085 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; PALIGNR is not a fixed instruction here: it is a preprocessor alias that is
; re-pointed (PALIGNR_MMX / PALIGNR_SSSE3) before each instantiation below.
; NOTE(review): its fourth operand is presumably a scratch register needed
; by the MMX fallback -- confirm in x86util.asm.
1087 %macro PRED8x8L_TOP_DC 1
1088 cglobal pred8x8l_top_dc_%1, 4,4
1096 PALIGNR mm2, mm0, 7, mm0
1097 PALIGNR mm1, mm4, 1, mm4
; r1 and r2 are tested for zero; per the existing comments they carry the
; has_topleft / has_topright flags. The branches that follow each test are
; not visible in this listing.
1098 test r1, r1 ; top_left
1100 test r2, r2 ; top_right
1109 test r2, r2 ; top_right
; 3-tap filter (see the PRED4x4_LOWPASS header): result in mm0, mm5 is the
; temporary.
1118 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; One instantiation per PALIGNR implementation.
1135 %define PALIGNR PALIGNR_MMX
1136 PRED8x8L_TOP_DC mmxext
1137 %define PALIGNR PALIGNR_SSSE3
1138 PRED8x8L_TOP_DC ssse3
1140 ;-----------------------------------------------------------------------------
1141 ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1142 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; r3 is the row step in every address below, consistent with stride being
; the fourth argument.
1144 %macro PRED8x8L_DC 1
1145 cglobal pred8x8l_dc_%1, 4,5
; Left-column gather: each movq reads the 8 bytes that end at offset -1 of a
; row, so the byte at offset -1 is the top byte of the register; punpckhbw
; then interleaves the upper halves of two such rows.
; NOTE(review): r0 and r4 are presumably re-based between these pairs by
; lines not shown -- TODO confirm.
1148 movq mm0, [r0+r3*1-8]
1149 punpckhbw mm0, [r0+r3*0-8]
1150 movq mm1, [r4+r3*1-8]
1151 punpckhbw mm1, [r0+r3*2-8]
1155 movq mm2, [r0+r3*1-8]
1156 punpckhbw mm2, [r0+r3*0-8]
1158 movq mm3, [r0+r3*1-8]
1159 punpckhbw mm3, [r0+r3*0-8]
1163 movq mm0, [r0+r3*0-8]
; PALIGNR is a preprocessor alias selected by the %define lines at the end
; of this block.
1168 PALIGNR mm4, mm0, 7, mm0
1169 PALIGNR mm1, mm2, 1, mm2
; 3-tap filter passes (see the PRED4x4_LOWPASS header); first operand is the
; destination, last is the temporary.
1196 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1199 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1201 PALIGNR mm7, mm1, 7, mm3
1207 PALIGNR mm2, mm0, 7, mm0
1208 PALIGNR mm1, mm4, 1, mm4
1215 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; The macro invocations that follow each %define are not visible here.
1238 %define PALIGNR PALIGNR_MMX
1240 %define PALIGNR PALIGNR_SSSE3
1243 ;-----------------------------------------------------------------------------
1244 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1245 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; Only 4 general registers are declared (4,4), and r1 / r2 appear below as
; address bases, so the argument registers are reused as pointers here.
1247 %macro PRED8x8L_HORIZONTAL 1
1248 cglobal pred8x8l_horizontal_%1, 4,4
; Left-column gather: each movq reads the 8 bytes ending at offset -1 of a
; row; punpckhbw interleaves the upper halves of two such rows.
1251 movq mm0, [r0+r3*1-8]
1255 punpckhbw mm0, [r1+r3*0-8]
1256 movq mm1, [r2+r3*1-8]
1257 punpckhbw mm1, [r0+r3*2-8]
1261 movq mm2, [r0+r3*1-8]
1262 punpckhbw mm2, [r0+r3*0-8]
1264 movq mm3, [r0+r3*1-8]
1265 punpckhbw mm3, [r0+r3*0-8]
1269 movq mm0, [r0+r3*0-8]
1270 movq mm1, [r1+r3*0-8]
; PALIGNR is a preprocessor alias selected before each instantiation below.
1274 PALIGNR mm4, mm0, 7, mm0
1275 PALIGNR mm1, mm2, 1, mm2
1277 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1280 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1282 PALIGNR mm7, mm1, 7, mm3
; pshufw immediates 0xff / 0xaa / 0x55 / 0x00 replicate word 3 / 2 / 1 / 0
; of the source across all four words. mm3 and mm7 are overwritten last
; (immediate 0x00) because they are also the sources.
1288 pshufw mm0, mm3, 0xff
1289 pshufw mm1, mm3, 0xaa
1291 pshufw mm2, mm3, 0x55
1292 pshufw mm3, mm3, 0x00
1293 pshufw mm4, mm7, 0xff
1294 pshufw mm5, mm7, 0xaa
1295 pshufw mm6, mm7, 0x55
1296 pshufw mm7, mm7, 0x00
; One instantiation per PALIGNR implementation.
1310 %define PALIGNR PALIGNR_MMX
1311 PRED8x8L_HORIZONTAL mmxext
1312 %define PALIGNR PALIGNR_SSSE3
1313 PRED8x8L_HORIZONTAL ssse3
1315 ;-----------------------------------------------------------------------------
1316 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1317 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; The visible lines are the same sequence as the visible part of
; PRED8x8L_TOP_DC: two PALIGNR shifts, the flag tests, one low-pass filter.
1319 %macro PRED8x8L_VERTICAL 1
1320 cglobal pred8x8l_vertical_%1, 4,4
; PALIGNR is a preprocessor alias selected before each instantiation below.
1327 PALIGNR mm2, mm0, 7, mm0
1328 PALIGNR mm1, mm4, 1, mm4
; Zero tests of the has_topleft / has_topright flags (per the existing
; comments); the branches that follow are not visible in this listing.
1329 test r1, r1 ; top_left
1331 test r2, r2 ; top_right
1340 test r2, r2 ; top_right
; 3-tap filter (see the PRED4x4_LOWPASS header): result in mm0.
1349 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1361 %define PALIGNR PALIGNR_MMX
1362 PRED8x8L_VERTICAL mmxext
1363 %define PALIGNR PALIGNR_SSSE3
1364 PRED8x8L_VERTICAL ssse3
1366 ;-----------------------------------------------------------------------------
1367 ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1368 ;-----------------------------------------------------------------------------
; Two implementations follow: a stand-alone mmxext function that works in
; MMX registers only, and a macro whose visible tail works in XMM registers
; (instantiated as sse2 and ssse3).
; PALIGNR is a preprocessor alias; it is bound to PALIGNR_MMX for the mmxext
; function.
1371 %define PALIGNR PALIGNR_MMX
1372 cglobal pred8x8l_down_left_mmxext, 4,5
1379 PALIGNR mm2, mm0, 7, mm0
1380 PALIGNR mm1, mm4, 1, mm4
; Immediate 0xFF replicates word 3 of mm3 across mm1.
1403 pshufw mm1, mm3, 0xFF
1406 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1415 PALIGNR mm2, mm3, 7, mm3
1416 PALIGNR mm5, mm4, 1, mm4
1417 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1425 PALIGNR mm2, mm7, 1, mm0
1427 PALIGNR mm3, mm7, 7, mm0
1428 PALIGNR mm4, mm6, 1, mm0
; Two filter passes, results in mm0 and mm1; mm6 is the temporary for both.
1434 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1435 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; Macro parameter %1: suffix appended to the exported symbol name.
1479 %macro PRED8x8L_DOWN_LEFT 1
1480 cglobal pred8x8l_down_left_%1, 4,4
1487 PALIGNR mm2, mm0, 7, mm0
1488 PALIGNR mm1, mm4, 1, mm4
; Zero tests of the has_topleft / has_topright flags (per the existing
; comments); the branches that follow are not visible in this listing.
1489 test r1, r1 ; top_left
1491 test r2, r2 ; top_right
1500 test r2, r2 ; top_right
1511 pshufw mm1, mm3, 0xFF
1514 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1516 test r2, r2 ; top_right
1523 PALIGNR mm2, mm3, 7, mm3
1524 PALIGNR mm5, mm4, 1, mm4
1525 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final filter on full XMM registers.
1541 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Each movq writes the low 8 bytes of xmm0 as one row. The same register is
; stored eight times, so xmm0 is presumably shifted between stores by lines
; not shown (and r0/r1/r2 used as row bases) -- TODO confirm.
1543 movq [r0+r3*1], xmm0
1545 movq [r0+r3*2], xmm0
1548 movq [r1+r3*1], xmm0
1550 movq [r1+r3*2], xmm0
1552 movq [r2+r3*1], xmm0
1554 movq [r2+r3*2], xmm0
1556 movq [r0+r3*1], xmm0
1558 movq [r0+r3*2], xmm0
; The sse2 build uses the MMX PALIGNR emulation, the ssse3 build the native
; instruction.
1563 %define PALIGNR PALIGNR_MMX
1564 PRED8x8L_DOWN_LEFT sse2
1566 %define PALIGNR PALIGNR_SSSE3
1567 PRED8x8L_DOWN_LEFT ssse3
1569 ;-----------------------------------------------------------------------------
1570 ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1571 ;-----------------------------------------------------------------------------
; Two implementations follow: a stand-alone mmxext function and a macro
; whose visible tail works in XMM registers (instantiated as sse2, ssse3).
; PALIGNR is a preprocessor alias; it is bound to PALIGNR_MMX for the mmxext
; function.
1574 %define PALIGNR PALIGNR_MMX
1575 cglobal pred8x8l_down_right_mmxext, 4,5
; Left-column gather: each movq reads the 8 bytes ending at offset -1 of a
; row; punpckhbw interleaves the upper halves of two such rows. r3 is the
; row step throughout.
1578 movq mm0, [r0+r3*1-8]
1579 punpckhbw mm0, [r0+r3*0-8]
1580 movq mm1, [r4+r3*1-8]
1581 punpckhbw mm1, [r0+r3*2-8]
1585 movq mm2, [r0+r3*1-8]
1586 punpckhbw mm2, [r0+r3*0-8]
1588 movq mm3, [r0+r3*1-8]
1589 punpckhbw mm3, [r0+r3*0-8]
1593 movq mm0, [r0+r3*0-8]
1598 PALIGNR mm4, mm0, 7, mm0
1599 PALIGNR mm1, mm2, 1, mm2
1600 test r1, r1 ; top_left
1604 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1608 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1610 PALIGNR mm7, mm1, 7, mm3
1616 PALIGNR mm2, mm0, 7, mm0
1617 PALIGNR mm1, mm4, 1, mm4
; Zero tests of the availability flags; the branches are not visible here.
1618 test r1, r1 ; top_left
1620 test r2, r2 ; top_right
1623 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1639 test r2, r2 ; top_right
1655 PALIGNR mm2, mm6, 1, mm0
1657 PALIGNR mm3, mm6, 7, mm0
; Two filter passes, results in mm0 and mm1; mm6 is the temporary for both.
1661 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1662 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; Macro parameter %1: suffix appended to the exported symbol name.
1706 %macro PRED8x8L_DOWN_RIGHT 1
1707 cglobal pred8x8l_down_right_%1, 4,5
; Same left-column gather as in the mmxext function above.
1710 movq mm0, [r0+r3*1-8]
1711 punpckhbw mm0, [r0+r3*0-8]
1712 movq mm1, [r4+r3*1-8]
1713 punpckhbw mm1, [r0+r3*2-8]
1717 movq mm2, [r0+r3*1-8]
1718 punpckhbw mm2, [r0+r3*0-8]
1720 movq mm3, [r0+r3*1-8]
1721 punpckhbw mm3, [r0+r3*0-8]
1725 movq mm0, [r0+r3*0-8]
1730 PALIGNR mm4, mm0, 7, mm0
1731 PALIGNR mm1, mm2, 1, mm2
1759 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1763 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1765 PALIGNR mm7, mm1, 7, mm3
1772 PALIGNR mm2, mm0, 7, mm0
1773 PALIGNR mm1, mm4, 1, mm4
1779 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
; Final filter on full XMM registers.
1796 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Rows are written in pairs, low 8 bytes of xmm0 and xmm1, through four
; different base registers. Both registers are presumably shifted between
; pairs by lines not shown -- TODO confirm.
1799 movq [r0+r3*2], xmm0
1800 movq [r0+r3*1], xmm1
1803 movq [r2+r3*2], xmm0
1804 movq [r2+r3*1], xmm1
1807 movq [r1+r3*2], xmm0
1808 movq [r1+r3*1], xmm1
1811 movq [r4+r3*2], xmm0
1812 movq [r4+r3*1], xmm1
; The sse2 build uses the MMX PALIGNR emulation, the ssse3 build the native
; instruction.
1817 %define PALIGNR PALIGNR_MMX
1818 PRED8x8L_DOWN_RIGHT sse2
1820 %define PALIGNR PALIGNR_SSSE3
1821 PRED8x8L_DOWN_RIGHT ssse3
1823 ;-----------------------------------------------------------------------------
1824 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1825 ;-----------------------------------------------------------------------------
; Two implementations follow: a stand-alone mmxext function and a macro
; whose visible tail works in XMM registers (instantiated as sse2, ssse3).
; PALIGNR is a preprocessor alias; it is bound to PALIGNR_MMX for the mmxext
; function.
1828 %define PALIGNR PALIGNR_MMX
1829 cglobal pred8x8l_vertical_right_mmxext, 4,5
; Left-column gather: each movq reads the 8 bytes ending at offset -1 of a
; row; punpckhbw interleaves the upper halves of two such rows. r3 is the
; row step throughout.
1832 movq mm0, [r0+r3*1-8]
1833 punpckhbw mm0, [r0+r3*0-8]
1834 movq mm1, [r4+r3*1-8]
1835 punpckhbw mm1, [r0+r3*2-8]
1839 movq mm2, [r0+r3*1-8]
1840 punpckhbw mm2, [r0+r3*0-8]
1842 movq mm3, [r0+r3*1-8]
1843 punpckhbw mm3, [r0+r3*0-8]
1847 movq mm0, [r0+r3*0-8]
1852 PALIGNR mm4, mm0, 7, mm0
1853 PALIGNR mm1, mm2, 1, mm2
1881 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1888 PALIGNR mm2, mm0, 7, mm0
1889 PALIGNR mm1, mm4, 1, mm4
1895 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1899 PALIGNR mm3, mm7, 7, mm0
1900 PALIGNR mm6, mm7, 6, mm1
1904 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1915 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
; The same two PALIGNR forms alternate three times, shifting by 7 bytes into
; mm6 and mm5 in turn.
; NOTE(review): presumably each one produces the next output row, with the
; stores and mm0 updates in lines not shown -- TODO confirm.
1916 PALIGNR mm6, mm0, 7, mm2
1919 PALIGNR mm5, mm0, 7, mm1
1922 PALIGNR mm6, mm0, 7, mm2
1925 PALIGNR mm5, mm0, 7, mm1
1928 PALIGNR mm6, mm0, 7, mm2
1931 PALIGNR mm5, mm0, 7, mm1
; Macro parameter %1: suffix appended to the exported symbol name.
1935 %macro PRED8x8L_VERTICAL_RIGHT 1
1936 cglobal pred8x8l_vertical_right_%1, 4,5,7
; Same left-column gather as in the mmxext function above.
1939 movq mm0, [r0+r3*1-8]
1940 punpckhbw mm0, [r0+r3*0-8]
1941 movq mm1, [r4+r3*1-8]
1942 punpckhbw mm1, [r0+r3*2-8]
1946 movq mm2, [r0+r3*1-8]
1947 punpckhbw mm2, [r0+r3*0-8]
1949 movq mm3, [r0+r3*1-8]
1950 punpckhbw mm3, [r0+r3*0-8]
1954 movq mm0, [r0+r3*0-8]
1959 PALIGNR mm4, mm0, 7, mm0
1960 PALIGNR mm1, mm2, 1, mm2
1987 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1994 PALIGNR mm2, mm0, 7, mm0
1995 PALIGNR mm1, mm4, 1, mm4
2001 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Load the 0xff00-per-word constant (aligned load); how xmm6 is applied is
; not visible in this listing.
2006 movdqa xmm6, [pw_ff00]
2015 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; First row pair: movhps stores the HIGH 8 bytes of xmm5 / xmm2.
2021 movhps [r0+r3*2], xmm5
2022 movhps [r0+r3*1], xmm2
; Remaining row pairs: movq stores the LOW 8 bytes of the same registers.
; NOTE(review): the registers are presumably shifted between pairs by lines
; not shown -- TODO confirm.
2030 movq [r0+r3*2], xmm5
2031 movq [r0+r3*1], xmm2
2034 movq [r2+r3*2], xmm5
2035 movq [r2+r3*1], xmm2
2038 movq [r1+r3*2], xmm5
2039 movq [r1+r3*1], xmm2
; The sse2 build uses the MMX PALIGNR emulation, the ssse3 build the native
; instruction.
2044 %define PALIGNR PALIGNR_MMX
2045 PRED8x8L_VERTICAL_RIGHT sse2
2047 %define PALIGNR PALIGNR_SSSE3
2048 PRED8x8L_VERTICAL_RIGHT ssse3
2050 ;-----------------------------------------------------------------------------
2051 ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2052 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; PALIGNR is a preprocessor alias selected before each instantiation below.
2054 %macro PRED8x8L_VERTICAL_LEFT 1
2055 cglobal pred8x8l_vertical_left_%1, 4,4
2062 PALIGNR mm2, mm0, 7, mm0
2063 PALIGNR mm1, mm4, 1, mm4
; Immediate 0xFF replicates word 3 of mm3 across mm1.
2086 pshufw mm1, mm3, 0xFF
2089 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2098 PALIGNR mm2, mm3, 7, mm3
2099 PALIGNR mm5, mm4, 1, mm4
2100 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final filter on full XMM registers, result in xmm0.
2114 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
; Rows are written in pairs: the low 8 bytes of xmm3 to the first row of
; each pair and of xmm0 to the second.
; NOTE(review): both registers are presumably shifted between pairs, and the
; base registers advanced, by lines not shown -- TODO confirm.
2116 movq [r0+r3*1], xmm3
2117 movq [r0+r3*2], xmm0
2121 movq [r1+r3*1], xmm3
2122 movq [r1+r3*2], xmm0
2125 movq [r2+r3*1], xmm3
2126 movq [r2+r3*2], xmm0
2129 movq [r0+r3*1], xmm3
2130 movq [r0+r3*2], xmm0
; The sse2 build uses the MMX PALIGNR emulation, the ssse3 build the native
; instruction.
2135 %define PALIGNR PALIGNR_MMX
2136 PRED8x8L_VERTICAL_LEFT sse2
2137 %define PALIGNR PALIGNR_SSSE3
2139 PRED8x8L_VERTICAL_LEFT ssse3
2141 ;-----------------------------------------------------------------------------
2142 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2143 ;-----------------------------------------------------------------------------
; Macro parameter %1: suffix appended to the exported symbol name.
; Only 4 general registers are declared (4,4), and r1 / r2 appear below as
; address bases, so the argument registers are reused as pointers here.
2145 %macro PRED8x8L_HORIZONTAL_UP 1
2146 cglobal pred8x8l_horizontal_up_%1, 4,4
; Left-column gather: each movq reads the 8 bytes ending at offset -1 of a
; row; punpckhbw interleaves the upper halves of two such rows.
2149 movq mm0, [r0+r3*1-8]
2153 punpckhbw mm0, [r1+r3*0-8]
2154 movq mm1, [r2+r3*1-8]
2155 punpckhbw mm1, [r0+r3*2-8]
2159 movq mm2, [r0+r3*1-8]
2160 punpckhbw mm2, [r0+r3*0-8]
2162 movq mm3, [r0+r3*1-8]
2163 punpckhbw mm3, [r0+r3*0-8]
2167 movq mm0, [r0+r3*0-8]
2168 movq mm1, [r1+r3*0-8]
; PALIGNR is a preprocessor alias selected before each instantiation below.
2172 PALIGNR mm4, mm0, 7, mm0
2173 PALIGNR mm1, mm2, 1, mm2
2175 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2178 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2180 PALIGNR mm7, mm1, 7, mm3
; Immediate 00011011b reverses the order of the four words.
2182 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
; Shifting left by 56 bits leaves only the lowest byte, moved to the top.
2183 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2187 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2194 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2196 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2198 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2200 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2201 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; Byte-granular shifts by 2, 4 and 6 interleaved with word shuffles:
; 11111001b -> words 1,2,3,3; 11111110b -> words 2,3,3,3;
; 11111111b -> word 3 in every position.
2205 PALIGNR mm5, mm4, 2, mm1
2206 pshufw mm1, mm6, 11111001b
2207 PALIGNR mm6, mm4, 4, mm2
2208 pshufw mm2, mm7, 11111110b
2209 PALIGNR mm7, mm4, 6, mm3
2210 pshufw mm3, mm0, 11111111b
; One instantiation per PALIGNR implementation.
2224 %define PALIGNR PALIGNR_MMX
2225 PRED8x8L_HORIZONTAL_UP mmxext
2226 %define PALIGNR PALIGNR_SSSE3
2227 PRED8x8L_HORIZONTAL_UP ssse3
2229 ;-----------------------------------------------------------------------------
2230 ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2231 ;-----------------------------------------------------------------------------
; Two implementations follow: a stand-alone mmxext function and a macro
; whose visible tail works in XMM registers (instantiated as sse2, ssse3).
; PALIGNR is a preprocessor alias; it is bound to PALIGNR_MMX for the mmxext
; function.
2234 %define PALIGNR PALIGNR_MMX
2235 cglobal pred8x8l_horizontal_down_mmxext, 4,5
; Left-column gather: each movq reads the 8 bytes ending at offset -1 of a
; row; punpckhbw interleaves the upper halves of two such rows. r3 is the
; row step throughout.
2238 movq mm0, [r0+r3*1-8]
2239 punpckhbw mm0, [r0+r3*0-8]
2240 movq mm1, [r4+r3*1-8]
2241 punpckhbw mm1, [r0+r3*2-8]
2245 movq mm2, [r0+r3*1-8]
2246 punpckhbw mm2, [r0+r3*0-8]
2248 movq mm3, [r0+r3*1-8]
2249 punpckhbw mm3, [r0+r3*0-8]
2253 movq mm0, [r0+r3*0-8]
2258 PALIGNR mm4, mm0, 7, mm0
2259 PALIGNR mm1, mm2, 1, mm2
2286 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2290 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2292 PALIGNR mm7, mm1, 7, mm3
2298 PALIGNR mm2, mm0, 7, mm0
2299 PALIGNR mm1, mm4, 1, mm4
2305 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2312 PALIGNR mm2, mm6, 7, mm5
2313 PALIGNR mm6, mm7, 7, mm0
2315 PALIGNR mm4, mm3, 1, mm7
2318 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2324 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; Two groups of byte shifts by 2, 4 and 6, taken against mm3 and then
; against mm4.
; NOTE(review): presumably each shift yields one output row, stored by lines
; not shown -- TODO confirm.
2332 PALIGNR mm7, mm3, 2, mm5
2334 PALIGNR mm1, mm3, 4, mm5
2336 PALIGNR mm0, mm3, 6, mm3
2341 PALIGNR mm6, mm4, 2, mm5
2343 PALIGNR mm2, mm4, 4, mm5
2345 PALIGNR mm3, mm4, 6, mm4
; Macro parameter %1: suffix appended to the exported symbol name.
2349 %macro PRED8x8L_HORIZONTAL_DOWN 1
2350 cglobal pred8x8l_horizontal_down_%1, 4,5
; Same left-column gather as in the mmxext function above.
2353 movq mm0, [r0+r3*1-8]
2354 punpckhbw mm0, [r0+r3*0-8]
2355 movq mm1, [r4+r3*1-8]
2356 punpckhbw mm1, [r0+r3*2-8]
2360 movq mm2, [r0+r3*1-8]
2361 punpckhbw mm2, [r0+r3*0-8]
2363 movq mm3, [r0+r3*1-8]
2364 punpckhbw mm3, [r0+r3*0-8]
2368 movq mm0, [r0+r3*0-8]
2373 PALIGNR mm4, mm0, 7, mm0
2374 PALIGNR mm1, mm2, 1, mm2
; Immediate 0xFF replicates word 3 of mm3 across mm1.
2401 pshufw mm1, mm3, 0xFF
2405 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2409 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2419 PALIGNR mm2, mm0, 7, mm0
2420 PALIGNR mm1, mm4, 1, mm4
2426 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2435 PALIGNR mm2, mm3, 7, mm3
2436 PALIGNR mm5, mm4, 1, mm4
2437 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; XMM-width byte shifts by 7, 9 and 8 against xmm0 feed the final filter.
2446 PALIGNR xmm1, xmm0, 7, xmm4
2447 PALIGNR xmm2, xmm0, 9, xmm5
2449 PALIGNR xmm3, xmm0, 8, xmm0
2453 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
; Interleave the low 8 bytes of xmm4 with the filtered bytes in xmm0.
2454 punpcklbw xmm4, xmm0
; Rows are written in pairs, low 8 bytes of xmm4 and xmm0, through four
; base registers.
; NOTE(review): both registers are presumably shifted between pairs by lines
; not shown -- TODO confirm.
2456 movq [r0+r3*2], xmm4
2457 movq [r2+r3*2], xmm0
2460 movq [r0+r3*1], xmm4
2461 movq [r2+r3*1], xmm0
2464 movq [r1+r3*2], xmm4
2465 movq [r4+r3*2], xmm0
2468 movq [r1+r3*1], xmm4
2469 movq [r4+r3*1], xmm0
; The sse2 build uses the MMX PALIGNR emulation, the ssse3 build the native
; instruction.
2474 %define PALIGNR PALIGNR_MMX
2475 PRED8x8L_HORIZONTAL_DOWN sse2
2477 %define PALIGNR PALIGNR_SSSE3
2478 PRED8x8L_HORIZONTAL_DOWN ssse3
2481 ;-----------------------------------------------------------------------------
2482 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2483 ;-----------------------------------------------------------------------------
; For the pred4x4 functions the stride is the THIRD argument, so r2 (not r1)
; is the row step in the addresses below; r1 is reused as a scratch register.
2485 cglobal pred4x4_dc_mmxext, 3,5
; Zero-extended loads of the byte at offset -1 on the rows r0+r2 and
; r0+2*r2. The address pair recurs, so r0 is presumably advanced by lines
; not shown -- TODO confirm.
2491 movzx r1d, byte [r0+r2*1-1]
2494 movzx r1d, byte [r0+r2*2-1]
2497 movzx r1d, byte [r0+r2*1-1]
2499 movzx r1d, byte [r0+r2*2-1]
; Multiplying by 0x01010101 copies the low byte of r3d into all four bytes,
; provided r3d is in 0..255 at this point (not verifiable from the visible
; lines).
2503 imul r3d, 0x01010101
2510 ;-----------------------------------------------------------------------------
2511 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2512 ;-----------------------------------------------------------------------------
; MMX variant, generated by macro; %1 is the symbol suffix (instantiated
; below with "mmxext"). r2 is the row step (stride is the third argument).
2514 %macro PRED4x4_TM_MMX 1
2515 cglobal pred4x4_tm_vp8_%1, 3,6
; Zero-extended bytes at offset -1: at r0, and on the rows r0+r2 and r0+2*r2.
2520 movzx r4d, byte [r0-1]
2523 movzx r1d, byte [r0+r2*1-1]
2524 movzx r3d, byte [r0+r2*2-1]
2551 PRED4x4_TM_MMX mmxext
; SSSE3 variant; it still operates on MMX registers in the visible lines.
2553 cglobal pred4x4_tm_vp8_ssse3, 3,3
; 4-byte loads covering offsets -4..-1 of four rows; the byte at offset -1
; ends up as byte 3 of each register.
; NOTE(review): r1 is used as a second row base here, so it was presumably
; repointed from topright by a line not shown -- TODO confirm.
2562 movd mm2, [r0+r2*1-4]
2563 movd mm3, [r0+r2*2-4]
2564 movd mm4, [r1+r2*1-4]
2565 movd mm5, [r1+r2*2-4]
2588 ;-----------------------------------------------------------------------------
2589 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2590 ;-----------------------------------------------------------------------------
2593 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2597 mova m2, m0 ;t0 t1 t2 t3
; The memory operand [r1] supplies the upper four bytes (t4..t7 per the
; existing comment); r1 is the topright argument.
2598 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
; Shift right by one byte so every position holds its right-hand neighbour.
2600 psrlq m0, 8 ;t1 t2 t3 t4
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m3, m4 is the
; temporary.
2601 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2608 ;-----------------------------------------------------------------------------
2609 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2610 ;-----------------------------------------------------------------------------
; Only the declaration and one filter invocation are visible in this listing.
2613 cglobal pred4x4_down_left_mmxext, 3,3
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m0, m5 is the
; temporary.
2624 PRED4x4_LOWPASS m0, m1, m3, m4, m5
2636 ;-----------------------------------------------------------------------------
2637 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2638 ;-----------------------------------------------------------------------------
; Only the declaration and one filter invocation are visible in this listing.
2641 cglobal pred4x4_vertical_left_mmxext, 3,3
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m0, m5 is the
; temporary.
2651 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2661 ;-----------------------------------------------------------------------------
2662 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2663 ;-----------------------------------------------------------------------------
2666 cglobal pred4x4_horizontal_up_mmxext, 3,3
; Left-column gather: each movd reads offsets -4..-1 of a row, and
; punpcklbw interleaves those four bytes with the same span of the next row.
; r2 is the row step (stride is the third argument).
; NOTE(review): r1 is used as a second row base, so it was presumably
; repointed from topright by a line not shown -- TODO confirm.
2669 movd m0, [r0+r2*1-4]
2670 punpcklbw m0, [r0+r2*2-4]
2671 movd m1, [r1+r2*1-4]
2672 punpcklbw m1, [r1+r2*2-4]
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m4, m5 is the
; temporary.
2684 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2694 ;-----------------------------------------------------------------------------
2695 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2696 ;-----------------------------------------------------------------------------
; PALIGNR is a preprocessor alias, bound here to the MMX emulation.
; The register-content comments below list bytes from most significant to
; least significant: lt = top-left, tN = top row, lN = left column.
2699 %define PALIGNR PALIGNR_MMX
2700 cglobal pred4x4_horizontal_down_mmxext, 3,3
; NOTE(review): [r0-4] / [r0] read relative to r0 directly, so r0 was
; presumably moved up one row by a line not shown -- TODO confirm.
2703 movh m0, [r0-4] ; lt ..
2704 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2705 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
; Left-column gather, bottom rows first; r2 is the row step and r1 is used
; as a second row base.
2706 movd m1, [r1+r2*2-4] ; l3
2707 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2708 movd m2, [r0+r2*2-4] ; l1
2709 punpcklbw m2, [r0+r2*1-4] ; l0 l1
; Merge the two column halves, then put the top-row bytes above them.
2710 punpckhwd m1, m2 ; l0 l1 l2 l3
2711 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; Shifted copies by 2 bytes and 1 byte provide the filter's neighbour taps.
2715 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2716 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m3, m4 is the
; temporary.
2718 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2721 PALIGNR m3, m5, 6, m4
2730 ;-----------------------------------------------------------------------------
2731 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2732 ;-----------------------------------------------------------------------------
; PALIGNR is a preprocessor alias, bound here to the MMX emulation.
2735 %define PALIGNR PALIGNR_MMX
2736 cglobal pred4x4_vertical_right_mmxext, 3,3
2739 movh m0, [r0] ; ........t3t2t1t0
; Each PALIGNR by 7 shifts m0 up one byte and brings in the top byte of the
; 8-byte memory operand, i.e. the byte at offset -1 of that row. This
; appends lt, l0, l1, l2 one at a time, as the existing comments show.
2741 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2743 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2745 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2747 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m3, m4 is the
; temporary.
2748 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2754 PALIGNR m5, m1, 7, m2
2757 PALIGNR m3, m1, 7, m1
2761 ;-----------------------------------------------------------------------------
2762 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2763 ;-----------------------------------------------------------------------------
; PALIGNR is a preprocessor alias, bound here to the MMX emulation.
2766 %define PALIGNR PALIGNR_MMX
2767 cglobal pred4x4_down_right_mmxext, 3,3
; Read the 8 bytes ending at offset -1 of the row at r0+r2, then interleave
; their upper half with the same span of the row at r0.
2771 movq m2, [r0+r2*1-8]
2772 punpckhbw m2, [r0-8]
2775 PALIGNR m3, m1, 5, m1
; Each PALIGNR by 7 shifts m3 up one byte and brings in the byte at offset
; -1 of the addressed row (the top byte of the 8-byte operand). r1 is the
; row base here and r2 the row step.
2777 PALIGNR m3, [r1+r2*1-8], 7, m4
2779 PALIGNR m3, [r1+r2*2-8], 7, m4
; 3-tap filter (see the PRED4x4_LOWPASS header): result in m0, m4 is the
; temporary.
2780 PRED4x4_LOWPASS m0, m3, m1, m2, m4