1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables referenced by the prediction routines in this file.
;
; tm_shuf: the byte pair (0x03, 0x80) repeated 8 times (16 bytes); loaded into
; xmm4 by pred8x8_tm_vp8_ssse3.
; NOTE(review): looks like a pshufb control that copies source byte 3 into the
; low byte of every word and zeroes the high byte -- confirm at the use site.
30 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00; loaded into xmm6 by the
; PRED8x8L_VERTICAL_RIGHT macro.
31 pw_ff00: times 8 dw 0xff00
; plane_shuf: signed byte weights -8..-1 followed by 1..8; the pmaddubsw
; multiplier used for the "H coefficients" in the 16x16 plane predictor.
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
33 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: signed byte weights -4..-1 and 1..4, each group padded with
; four zero weights; the pmaddubsw multiplier in the 8x8 plane predictor.
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
35 db 1, 2, 3, 4, 0, 0, 0, 0
; Word tables used as pmullw multipliers by the plane predictors.
; pw_0to7 turns a splatted H into {0*H .. 7*H}.
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
; MMX entry point for 16x16 vertical prediction (prototype in the banner above).
; NOTE(review): "2,3" is presumably the x86inc cglobal pair
; (argument count, GPR count) -- confirm against x86inc.asm.
56 cglobal pred16x16_vertical_mmx, 2,3
; SSE entry point for 16x16 vertical prediction.
; The stores below write the 16 bytes held in xmm0 to the rows at r0+r1 and
; r0+r1*2. movaps requires a 16-byte-aligned address.
; NOTE(review): assumes r0 = src and r1 = stride per the prototype above, and
; that callers guarantee 16-byte-aligned rows -- confirm.
71 cglobal pred16x16_vertical_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
; Same pair of row stores again.
; NOTE(review): presumably r0 is advanced between the two pairs -- confirm.
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
; Entry point for 16x16 horizontal prediction. %1 is a macro parameter that is
; substituted into the symbol name, so one symbol is emitted per instantiation.
; NOTE(review): "2,3" is presumably (argument count, GPR count) -- confirm.
91 cglobal pred16x16_horizontal_%1, 2,3
133 ;-----------------------------------------------------------------------------
134 ; void pred16x16_dc(uint8_t *src, int stride)
135 ;-----------------------------------------------------------------------------
; PRED16x16_DC %1: emits pred16x16_dc_%1, where %1 is the symbol-name suffix.
137 %macro PRED16x16_DC 1
138 cglobal pred16x16_dc_%1, 2,7
; Each movzx below zero-extends one byte from r0 (+ r1) into a 32-bit register.
; NOTE(review): presumably these are left-neighbour pixels being accumulated
; into the DC sum, with r0 stepped between loads -- confirm.
146 movzx r5d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
152 movzx r3d, byte [r0+r1*1]
157 movzx r2d, byte [r0+r1*0]
204 ;-----------------------------------------------------------------------------
205 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
206 ;-----------------------------------------------------------------------------
; PRED16x16_TM_MMX %1: emits pred16x16_tm_vp8_%1; instantiated below with
; %1 = mmxext.
208 %macro PRED16x16_TM_MMX 1
209 cglobal pred16x16_tm_vp8_%1, 2,5
; r3d = byte at r0-1.
; NOTE(review): presumably the top-left pixel -- confirm.
220 movzx r3d, byte [r0-1]
; r2d = byte at r0+r1-1.
; NOTE(review): presumably the left neighbour of the row at r0+r1 -- confirm.
223 movzx r2d, byte [r0+r1-1]
250 PRED16x16_TM_MMX mmxext
; SSE2 entry point for the 16x16 VP8 "TM" predictor.
; NOTE(review): "2,6,6" is presumably (args, GPRs, XMM registers) -- confirm.
252 cglobal pred16x16_tm_vp8_sse2, 2,6,6
; Scalar byte loads from offset -1 of r0, r0+r1 and r0+r1*2.
; NOTE(review): presumably the top-left pixel and two left neighbours -- confirm.
259 movzx r4d, byte [r0-1]
262 movzx r2d, byte [r0+r1*1-1]
263 movzx r3d, byte [r0+r1*2-1]
; Splat the low word of xmm2 and xmm4 across all eight word lanes:
; pshuflw ..., 0 fills the low four words, punpcklqdq copies them to the high half.
268 pshuflw xmm2, xmm2, 0
269 pshuflw xmm4, xmm4, 0
270 punpcklqdq xmm2, xmm2
271 punpcklqdq xmm4, xmm4
; Aligned 16-byte stores of two output rows (movdqa needs 16-byte alignment).
280 movdqa [r0+r1*1], xmm2
281 movdqa [r0+r1*2], xmm4
287 ;-----------------------------------------------------------------------------
288 ; void pred16x16_plane(uint8_t *src, int stride)
289 ;-----------------------------------------------------------------------------
; H264_PRED16x16_PLANE %1, %2, %3: emits pred16x16_plane_%3_%1.
;   %1 = CPU-variant suffix (mmx / mmx2 / sse2 / ssse3 in the instantiations)
;   %2 = value forwarded as the last cglobal field (0 or 8 below).
;        NOTE(review): presumably the XMM register count -- confirm.
;   %3 = codec tag (h264 / rv40 / svq3)
291 %macro H264_PRED16x16_PLANE 3
292 cglobal pred16x16_plane_%3_%1, 2, 7, %2
; Weight the words in m0..m3 by -8..-1 and 1..8.
; The +8 byte offset addresses words 4..7 of each table.
306 pmullw m0, [pw_m8tom1 ]
307 pmullw m1, [pw_m8tom1+8]
308 pmullw m2, [pw_1to8 ]
309 pmullw m3, [pw_1to8 +8]
; Variant that applies the whole pw_m8tom1 table in one multiply.
318 pmullw m0, [pw_m8tom1]
; Variant that loads 8 bytes from r0+r1+8 into the high half of m0 and
; multiply-adds byte pairs against the signed weights in plane_shuf.
322 movhps m0, [r0+r1 +8]
323 pmaddubsw m0, [plane_shuf] ; H coefficients
345 paddw m0, m1 ; sum of H coefficients
; Scalar byte loads feeding the V sum computed at "sum of V coefficients" below.
; NOTE(review): e_reg and r10 are symbolic register names whose definitions
; should be checked; r3/r4 presumably walk the left column -- confirm.
357 movzx e_reg, byte [r3+r2*2 ]
358 movzx r5, byte [r4+r1 ]
361 movzx e_reg, byte [r3+r2 ]
366 movzx e_reg, byte [r3+r1 ]
367 movzx r6, byte [r4+r2*2 ]
371 movzx e_reg, byte [r3 ]
373 movzx r10, byte [r4+r2 ]
376 movzx r6, byte [r4+r2 ]
385 movzx r4, byte [e_reg+r2 ]
397 movzx r4, byte [e_reg ]
399 movzx r10, byte [r3 +r2 ]
403 movzx r6, byte [r3 +r2 ]
409 movzx r4, byte [e_reg+r1 ]
410 movzx r6, byte [r3 +r2*2]
417 movzx r4, byte [e_reg+r2*2]
418 movzx r6, byte [r3 +r1 ]
421 add r5, r6 ; sum of V coefficients
; Scale V: multiply by 5 with lea, then arithmetic shift right by 4.
438 lea r5, [r5*5] ; 5*(V/4)
442 sar r5, 4 ; (5*(V/4))/16
; Two more scalar byte loads.
; NOTE(review): presumably the corner pixels used for the constant term "a" -- confirm.
445 movzx r4, byte [r0+r1 +15]
446 movzx r3, byte [r3+r2*2 ]
; Scale H the same way, in 32-bit registers.
464 lea r1d, [r1d*5] ; 5*(H/4)
468 sar r1d, 4 ; (5*(H/4))/16
494 punpcklqdq m0, m0 ; splat H (words)
495 punpcklqdq m1, m1 ; splat V (words)
496 punpcklqdq m3, m3 ; splat a (words)
505 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
514 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
515 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
517 paddw m5, m0 ; a + {8,9,10,11}*H
518 paddw m6, m0 ; a + {12,13,14,15}*H
; Copies of the running row values b[], made before they are consumed.
523 mova m3, m0 ; b[0..7]
524 mova m4, m2 ; b[8..15]
530 mova m3, m5 ; b[8..11]
531 mova m4, m6 ; b[12..15]
544 mova m3, m0 ; b[0..7]
545 mova m4, m2 ; b[8..15]
551 mova m3, m5 ; b[8..11]
552 mova m4, m6 ; b[12..15]
; Instantiations: one function per (CPU variant, codec) pair.
572 H264_PRED16x16_PLANE mmx, 0, h264
573 H264_PRED16x16_PLANE mmx, 0, rv40
574 H264_PRED16x16_PLANE mmx, 0, svq3
575 H264_PRED16x16_PLANE mmx2, 0, h264
576 H264_PRED16x16_PLANE mmx2, 0, rv40
577 H264_PRED16x16_PLANE mmx2, 0, svq3
579 H264_PRED16x16_PLANE sse2, 8, h264
580 H264_PRED16x16_PLANE sse2, 8, rv40
581 H264_PRED16x16_PLANE sse2, 8, svq3
582 H264_PRED16x16_PLANE ssse3, 8, h264
583 H264_PRED16x16_PLANE ssse3, 8, rv40
584 H264_PRED16x16_PLANE ssse3, 8, svq3
586 ;-----------------------------------------------------------------------------
587 ; void pred8x8_plane(uint8_t *src, int stride)
588 ;-----------------------------------------------------------------------------
; H264_PRED8x8_PLANE %1, %2: emits pred8x8_plane_%1.
;   %1 = CPU-variant suffix (mmx / mmx2 / sse2 / ssse3 in the instantiations)
;   %2 = value forwarded as the last cglobal field (0 or 8 below).
;        NOTE(review): presumably the XMM register count -- confirm.
590 %macro H264_PRED8x8_PLANE 2
591 cglobal pred8x8_plane_%1, 2, 7, %2
; Weight words by -4..-1 (first four table words) and 1..4 (the +8 byte
; offset addresses words 4..7).
601 pmullw m0, [pw_m4to4]
602 pmullw m1, [pw_m4to4+8]
; Variant that applies the whole table in one multiply.
609 pmullw m0, [pw_m4to4]
; Variant that multiply-adds byte pairs against plane8_shuf; the zero
; weights in that table cancel the bytes they are paired with.
611 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
612 pmaddubsw m0, [plane8_shuf] ; H coefficients
638 paddw m0, m1 ; sum of H coefficients
; Scalar byte loads.
; NOTE(review): presumably left-column pixels for the V sum; e_reg and r10
; are symbolic register names whose definitions should be checked -- confirm.
650 movzx e_reg, byte [r3+r2*2 ]
651 movzx r5, byte [r4+r1 ]
654 movzx e_reg, byte [r3 ]
656 movzx r10, byte [r4+r2 ]
660 movzx r6, byte [r4+r2 ]
666 movzx e_reg, byte [r3+r1 ]
667 movzx r6, byte [r4+r2*2 ]
674 movzx e_reg, byte [r3+r2 ]
; NOTE(review): presumably the two corner pixels for the constant term "a" -- confirm.
687 movzx r3, byte [r4+r2*2 ]
688 movzx r4, byte [r0+r1 +7]
719 punpcklqdq m0, m0 ; splat H (words)
720 punpcklqdq m1, m1 ; splat V (words)
721 punpcklqdq m3, m3 ; splat a (words)
726 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
727 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
730 paddw m2, m0 ; a + {4,5,6,7}*H
; Copies of the running row values, made before they are consumed.
737 mova m3, m0 ; b[0..7]
740 mova m4, m0 ; V+b[0..7]
747 mova m3, m0 ; b[0..3]
748 mova m4, m2 ; b[4..7]
753 mova m5, m0 ; V+b[0..3]
754 mova m6, m2 ; V+b[4..7]
; Instantiations: one function per CPU variant.
772 H264_PRED8x8_PLANE mmx, 0
773 H264_PRED8x8_PLANE mmx2, 0
775 H264_PRED8x8_PLANE sse2, 8
776 H264_PRED8x8_PLANE ssse3, 8
778 ;-----------------------------------------------------------------------------
779 ; void pred8x8_vertical(uint8_t *src, int stride)
780 ;-----------------------------------------------------------------------------
; MMX entry point for 8x8 vertical prediction (prototype in the banner above).
; NOTE(review): "2,2" is presumably (argument count, GPR count) -- confirm.
782 cglobal pred8x8_vertical_mmx, 2,2
794 ;-----------------------------------------------------------------------------
795 ; void pred8x8_horizontal(uint8_t *src, int stride)
796 ;-----------------------------------------------------------------------------
; Entry point for 8x8 horizontal prediction. %1 is a macro parameter that is
; substituted into the symbol name, so one symbol is emitted per instantiation.
; NOTE(review): "2,3" is presumably (argument count, GPR count) -- confirm.
799 cglobal pred8x8_horizontal_%1, 2,3
836 ;-----------------------------------------------------------------------------
837 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 top-DC prediction.
839 cglobal pred8x8_top_dc_mmxext, 2,5
; Broadcast word 0 of mm0 to all four word lanes (shuffle immediate 0).
856 pshufw mm0, mm0, 0 ; dc0 (w)
; Pack the words of mm0 and mm1 into unsigned-saturated bytes:
; mm0's bytes land in the low half, mm1's in the high half.
857 packuswb mm0, mm1 ; dc0,dc1 (b)
869 ;-----------------------------------------------------------------------------
870 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
871 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 DC prediction.
874 cglobal pred8x8_dc_mmxext, 2,5
; Scalar loads of the byte at offset -1 of the rows at r0+r1 and r0+r1*2.
; NOTE(review): presumably the left-neighbour column, summed in two groups
; with r0 advanced between loads -- confirm.
883 movzx r2d, byte [r0+r1*1-1]
884 movzx r3d, byte [r0+r1*2-1]
887 movzx r3d, byte [r0+r1*1-1]
889 movzx r3d, byte [r0+r1*2-1]
893 movzx r2d, byte [r0+r1*1-1]
894 movzx r3d, byte [r0+r1*2-1]
897 movzx r3d, byte [r0+r1*1-1]
899 movzx r3d, byte [r0+r1*2-1]
; Combine the four partial sums s0..s3 into the per-quadrant DC terms; the
; lane layout after each step is given in the trailing comments.
906 punpckldq m0, m2 ; s0, s1, s2, s3
907 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
909 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
913 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
930 ;-----------------------------------------------------------------------------
931 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
932 ;-----------------------------------------------------------------------------
; MMXEXT entry point for the RV40 flavour of 8x8 DC prediction.
934 cglobal pred8x8_dc_rv40_mmxext, 2,7
; Each movzx zero-extends one byte from r0 (+ r1) into a 32-bit register.
; NOTE(review): presumably left-neighbour pixels accumulated into the DC sum,
; with r0 stepped between loads -- confirm.
940 movzx r5d, byte [r0+r1*1]
944 movzx r2d, byte [r0+r1*0]
945 movzx r3d, byte [r0+r1*1]
950 movzx r2d, byte [r0+r1*0]
966 ;-----------------------------------------------------------------------------
967 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
968 ;-----------------------------------------------------------------------------
; PRED8x8_TM_MMX %1: emits pred8x8_tm_vp8_%1; instantiated below with
; %1 = mmxext.
970 %macro PRED8x8_TM_MMX 1
971 cglobal pred8x8_tm_vp8_%1, 2,6
; Scalar byte loads from offset -1 of r0, r0+r1 and r0+r1*2.
; NOTE(review): presumably the top-left pixel and two left neighbours -- confirm.
978 movzx r4d, byte [r0-1]
981 movzx r2d, byte [r0+r1*1-1]
982 movzx r3d, byte [r0+r1*2-1]
1013 PRED8x8_TM_MMX mmxext
; SSE2 entry point for the 8x8 VP8 "TM" predictor.
; NOTE(review): "2,6,4" is presumably (args, GPRs, XMM registers) -- confirm.
1015 cglobal pred8x8_tm_vp8_sse2, 2,6,4
; Interleave the low bytes of xmm0 with those of xmm1.
; NOTE(review): presumably xmm1 is zero so this widens bytes to words -- confirm.
1019 punpcklbw xmm0, xmm1
; Scalar byte loads from offset -1 of r0, r0+r1 and r0+r1*2.
; NOTE(review): presumably the top-left pixel and two left neighbours -- confirm.
1020 movzx r4d, byte [r0-1]
1023 movzx r2d, byte [r0+r1*1-1]
1024 movzx r3d, byte [r0+r1*2-1]
; Splat the low word of xmm2 and xmm3 across all eight word lanes.
1029 pshuflw xmm2, xmm2, 0
1030 pshuflw xmm3, xmm3, 0
1031 punpcklqdq xmm2, xmm2
1032 punpcklqdq xmm3, xmm3
; Store two 8-byte rows from one register: low half to the row at r0+r1,
; high half to the row at r0+r1*2.
1036 movq [r0+r1*1], xmm2
1037 movhps [r0+r1*2], xmm2
; SSSE3 entry point for the 8x8 VP8 "TM" predictor.
; NOTE(review): "2,3,6" is presumably (args, GPRs, XMM registers) -- confirm.
1043 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
; Load the 16-byte tm_shuf table (aligned load).
; NOTE(review): presumably used as a pshufb control later on -- confirm.
1045 movdqa xmm4, [tm_shuf]
; Interleave the low bytes of xmm0 with those of xmm1.
; NOTE(review): presumably xmm1 is zero so this widens bytes to words -- confirm.
1048 punpcklbw xmm0, xmm1
; Load the 4 bytes that end just before the rows at r0+r1 and r0+r1*2, so
; the byte at offset -1 ends up as byte 3 of each register.
1053 movd xmm2, [r0+r1*1-4]
1054 movd xmm3, [r0+r1*2-4]
; Store two 8-byte rows from one register: low half, then high half.
1062 movq [r0+r1*1], xmm2
1063 movhps [r0+r1*2], xmm2
; PRED4x4_LOWPASS %1..%5: 3-tap low-pass filter used by the directional
; predictors in this file.
; Parameter order:
1069 ; dest, left, right, src, tmp
1070 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1071 %macro PRED4x4_LOWPASS 5
1081 ;-----------------------------------------------------------------------------
1082 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1083 ;-----------------------------------------------------------------------------
; PRED8x8L_TOP_DC %1: emits pred8x8l_top_dc_%1. It is instantiated twice
; below, once per PALIGNR implementation (mmxext, ssse3).
1084 %macro PRED8x8L_TOP_DC 1
1085 cglobal pred8x8l_top_dc_%1, 4,4
; Build byte-shifted copies with PALIGNR.
; NOTE(review): presumably the left/right neighbour vectors of the top row,
; and the fourth operand is a scratch register for the MMX emulation -- confirm.
1093 PALIGNR mm2, mm0, 7, mm0
1094 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments.
1095 test r1, r1 ; top_left
1097 test r2, r2 ; top_right
1106 test r2, r2 ; top_right
; Low-pass filter into mm0 (see the PRED4x4_LOWPASS formula).
1115 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Select the PALIGNR implementation before each instantiation.
1132 %define PALIGNR PALIGNR_MMX
1133 PRED8x8L_TOP_DC mmxext
1134 %define PALIGNR PALIGNR_SSSE3
1135 PRED8x8L_TOP_DC ssse3
1137 ;-----------------------------------------------------------------------------
1138 ; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1139 ;-----------------------------------------------------------------------------
; PRED8x8L_DC %1: emits pred8x8l_dc_%1, where %1 is the symbol-name suffix.
1141 %macro PRED8x8L_DC 1
1142 cglobal pred8x8l_dc_%1, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row, so the byte
; just left of the row is the top byte. punpckhbw then interleaves the high
; bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column, with r0/r4
; stepped between pairs -- confirm.
1145 movq mm0, [r0+r3*1-8]
1146 punpckhbw mm0, [r0+r3*0-8]
1147 movq mm1, [r4+r3*1-8]
1148 punpckhbw mm1, [r0+r3*2-8]
1152 movq mm2, [r0+r3*1-8]
1153 punpckhbw mm2, [r0+r3*0-8]
1155 movq mm3, [r0+r3*1-8]
1156 punpckhbw mm3, [r0+r3*0-8]
1160 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1165 PALIGNR mm4, mm0, 7, mm0
1166 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula); results in mm2 and mm1.
1193 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1196 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1198 PALIGNR mm7, mm1, 7, mm3
1204 PALIGNR mm2, mm0, 7, mm0
1205 PALIGNR mm1, mm4, 1, mm4
; NOTE(review): presumably the filtered top edge, result in mm6 -- confirm.
1212 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Select the PALIGNR implementation for the instantiations.
1235 %define PALIGNR PALIGNR_MMX
1237 %define PALIGNR PALIGNR_SSSE3
1240 ;-----------------------------------------------------------------------------
1241 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1242 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL %1: emits pred8x8l_horizontal_%1. It is instantiated
; twice below, once per PALIGNR implementation (mmxext, ssse3).
; NOTE(review): r1 and r2 are used as address bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
1244 %macro PRED8x8L_HORIZONTAL 1
1245 cglobal pred8x8l_horizontal_%1, 4,4
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
1248 movq mm0, [r0+r3*1-8]
1252 punpckhbw mm0, [r1+r3*0-8]
1253 movq mm1, [r2+r3*1-8]
1254 punpckhbw mm1, [r0+r3*2-8]
1258 movq mm2, [r0+r3*1-8]
1259 punpckhbw mm2, [r0+r3*0-8]
1261 movq mm3, [r0+r3*1-8]
1262 punpckhbw mm3, [r0+r3*0-8]
1266 movq mm0, [r0+r3*0-8]
1267 movq mm1, [r1+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1271 PALIGNR mm4, mm0, 7, mm0
1272 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1274 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1277 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1279 PALIGNR mm7, mm1, 7, mm3
; Broadcast one word of mm3 / mm7 per destination register:
; immediate 0xff selects word 3, 0xaa word 2, 0x55 word 1, 0x00 word 0.
; mm3 and mm7 are overwritten last because they are also the sources.
1285 pshufw mm0, mm3, 0xff
1286 pshufw mm1, mm3, 0xaa
1288 pshufw mm2, mm3, 0x55
1289 pshufw mm3, mm3, 0x00
1290 pshufw mm4, mm7, 0xff
1291 pshufw mm5, mm7, 0xaa
1292 pshufw mm6, mm7, 0x55
1293 pshufw mm7, mm7, 0x00
; Select the PALIGNR implementation before each instantiation.
1307 %define PALIGNR PALIGNR_MMX
1308 PRED8x8L_HORIZONTAL mmxext
1309 %define PALIGNR PALIGNR_SSSE3
1310 PRED8x8L_HORIZONTAL ssse3
1312 ;-----------------------------------------------------------------------------
1313 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1314 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL %1: emits pred8x8l_vertical_%1. It is instantiated twice
; below, once per PALIGNR implementation (mmxext, ssse3).
1316 %macro PRED8x8L_VERTICAL 1
1317 cglobal pred8x8l_vertical_%1, 4,4
; Byte-shifted copies feeding the low-pass filter below.
; NOTE(review): presumably the neighbours of the top row -- confirm.
1324 PALIGNR mm2, mm0, 7, mm0
1325 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments.
1326 test r1, r1 ; top_left
1328 test r2, r2 ; top_right
1337 test r2, r2 ; top_right
; Low-pass filter into mm0 (see the PRED4x4_LOWPASS formula).
1346 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Select the PALIGNR implementation before each instantiation.
1358 %define PALIGNR PALIGNR_MMX
1359 PRED8x8L_VERTICAL mmxext
1360 %define PALIGNR PALIGNR_SSSE3
1361 PRED8x8L_VERTICAL ssse3
1363 ;-----------------------------------------------------------------------------
1364 ; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1365 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 luma "down-left" prediction; uses the MMX
; implementation of PALIGNR.
1368 %define PALIGNR PALIGNR_MMX
1369 cglobal pred8x8l_down_left_mmxext, 4,5
; Byte-shifted copies feeding the low-pass filters below.
1376 PALIGNR mm2, mm0, 7, mm0
1377 PALIGNR mm1, mm4, 1, mm4
; Broadcast word 3 of mm3 into mm1.
; NOTE(review): presumably pads past the last available top pixel -- confirm.
1400 pshufw mm1, mm3, 0xFF
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1403 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1412 PALIGNR mm2, mm3, 7, mm3
1413 PALIGNR mm5, mm4, 1, mm4
1414 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Shifted combinations of mm7 / mm6, then two more low-pass filters with
; results in mm0 and mm1.
1422 PALIGNR mm2, mm7, 1, mm0
1424 PALIGNR mm3, mm7, 7, mm0
1425 PALIGNR mm4, mm6, 1, mm0
1431 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1432 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; PRED8x8L_DOWN_LEFT %1: emits pred8x8l_down_left_%1. It is instantiated for
; sse2 with the MMX PALIGNR implementation, and for ssse3 with the SSSE3 one.
; NOTE(review): r1 and r2 are used as store bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers after the flag tests -- confirm.
1476 %macro PRED8x8L_DOWN_LEFT 1
1477 cglobal pred8x8l_down_left_%1, 4,4
; Byte-shifted copies feeding the low-pass filter below.
1484 PALIGNR mm2, mm0, 7, mm0
1485 PALIGNR mm1, mm4, 1, mm4
; Set flags from the availability arguments.
1486 test r1, r1 ; top_left
1488 test r2, r2 ; top_right
1497 test r2, r2 ; top_right
; Broadcast word 3 of mm3 into mm1.
1508 pshufw mm1, mm3, 0xFF
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1511 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1513 test r2, r2 ; top_right
1520 PALIGNR mm2, mm3, 7, mm3
1521 PALIGNR mm5, mm4, 1, mm4
1522 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final low-pass filter on the full 16-byte edge, result in xmm0.
1538 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Store the low 8 bytes of xmm0 to eight output rows.
; NOTE(review): presumably xmm0 is shifted by one byte between stores -- confirm.
1540 movq [r0+r3*1], xmm0
1542 movq [r0+r3*2], xmm0
1545 movq [r1+r3*1], xmm0
1547 movq [r1+r3*2], xmm0
1549 movq [r2+r3*1], xmm0
1551 movq [r2+r3*2], xmm0
1553 movq [r0+r3*1], xmm0
1555 movq [r0+r3*2], xmm0
; Select the PALIGNR implementation before each instantiation.
1560 %define PALIGNR PALIGNR_MMX
1561 PRED8x8L_DOWN_LEFT sse2
1563 %define PALIGNR PALIGNR_SSSE3
1564 PRED8x8L_DOWN_LEFT ssse3
1566 ;-----------------------------------------------------------------------------
1567 ; void pred8x8l_down_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1568 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 luma "down-right" prediction; uses the MMX
; implementation of PALIGNR.
1571 %define PALIGNR PALIGNR_MMX
1572 cglobal pred8x8l_down_right_mmxext, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column, with r0/r4
; stepped between pairs -- confirm.
1575 movq mm0, [r0+r3*1-8]
1576 punpckhbw mm0, [r0+r3*0-8]
1577 movq mm1, [r4+r3*1-8]
1578 punpckhbw mm1, [r0+r3*2-8]
1582 movq mm2, [r0+r3*1-8]
1583 punpckhbw mm2, [r0+r3*0-8]
1585 movq mm3, [r0+r3*1-8]
1586 punpckhbw mm3, [r0+r3*0-8]
1590 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1595 PALIGNR mm4, mm0, 7, mm0
1596 PALIGNR mm1, mm2, 1, mm2
1597 test r1, r1 ; top_left
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1601 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1605 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1607 PALIGNR mm7, mm1, 7, mm3
; NOTE(review): presumably the top edge, handled the same way -- confirm.
1613 PALIGNR mm2, mm0, 7, mm0
1614 PALIGNR mm1, mm4, 1, mm4
1615 test r1, r1 ; top_left
1617 test r2, r2 ; top_right
1620 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1636 test r2, r2 ; top_right
; Shifted combinations of mm6, then two more low-pass filters with results
; in mm0 and mm1.
1652 PALIGNR mm2, mm6, 1, mm0
1654 PALIGNR mm3, mm6, 7, mm0
1658 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1659 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; PRED8x8L_DOWN_RIGHT %1: emits pred8x8l_down_right_%1. It is instantiated for
; sse2 with the MMX PALIGNR implementation, and for ssse3 with the SSSE3 one.
; NOTE(review): r1 and r2 are used as store bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
1703 %macro PRED8x8L_DOWN_RIGHT 1
1704 cglobal pred8x8l_down_right_%1, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
1707 movq mm0, [r0+r3*1-8]
1708 punpckhbw mm0, [r0+r3*0-8]
1709 movq mm1, [r4+r3*1-8]
1710 punpckhbw mm1, [r0+r3*2-8]
1714 movq mm2, [r0+r3*1-8]
1715 punpckhbw mm2, [r0+r3*0-8]
1717 movq mm3, [r0+r3*1-8]
1718 punpckhbw mm3, [r0+r3*0-8]
1722 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1727 PALIGNR mm4, mm0, 7, mm0
1728 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1756 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1760 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1762 PALIGNR mm7, mm1, 7, mm3
1769 PALIGNR mm2, mm0, 7, mm0
1770 PALIGNR mm1, mm4, 1, mm4
1776 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
; Final low-pass filter on the 16-byte edge, result in xmm0.
1793 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
; Store the low 8 bytes of xmm0 / xmm1 to pairs of output rows.
; NOTE(review): presumably both registers are shifted between pairs -- confirm.
1796 movq [r0+r3*2], xmm0
1797 movq [r0+r3*1], xmm1
1800 movq [r2+r3*2], xmm0
1801 movq [r2+r3*1], xmm1
1804 movq [r1+r3*2], xmm0
1805 movq [r1+r3*1], xmm1
1808 movq [r4+r3*2], xmm0
1809 movq [r4+r3*1], xmm1
; Select the PALIGNR implementation before each instantiation.
1814 %define PALIGNR PALIGNR_MMX
1815 PRED8x8L_DOWN_RIGHT sse2
1817 %define PALIGNR PALIGNR_SSSE3
1818 PRED8x8L_DOWN_RIGHT ssse3
1820 ;-----------------------------------------------------------------------------
1821 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1822 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 luma "vertical-right" prediction; uses the MMX
; implementation of PALIGNR.
1825 %define PALIGNR PALIGNR_MMX
1826 cglobal pred8x8l_vertical_right_mmxext, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
1829 movq mm0, [r0+r3*1-8]
1830 punpckhbw mm0, [r0+r3*0-8]
1831 movq mm1, [r4+r3*1-8]
1832 punpckhbw mm1, [r0+r3*2-8]
1836 movq mm2, [r0+r3*1-8]
1837 punpckhbw mm2, [r0+r3*0-8]
1839 movq mm3, [r0+r3*1-8]
1840 punpckhbw mm3, [r0+r3*0-8]
1844 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1849 PALIGNR mm4, mm0, 7, mm0
1850 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1878 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1885 PALIGNR mm2, mm0, 7, mm0
1886 PALIGNR mm1, mm4, 1, mm4
1892 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Shifted combinations (by 7 and 6 bytes) of mm7, then further filtering.
1896 PALIGNR mm3, mm7, 7, mm0
1897 PALIGNR mm6, mm7, 6, mm1
1901 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1912 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
; Alternating 7-byte PALIGNR steps that combine mm0 into mm6 and mm5.
; NOTE(review): presumably each step produces the next pair of output rows,
; with mm0 shifted in between -- confirm.
1913 PALIGNR mm6, mm0, 7, mm2
1916 PALIGNR mm5, mm0, 7, mm1
1919 PALIGNR mm6, mm0, 7, mm2
1922 PALIGNR mm5, mm0, 7, mm1
1925 PALIGNR mm6, mm0, 7, mm2
1928 PALIGNR mm5, mm0, 7, mm1
; PRED8x8L_VERTICAL_RIGHT %1: emits pred8x8l_vertical_right_%1. It is
; instantiated for sse2 with the MMX PALIGNR implementation, and for ssse3
; with the SSSE3 one.
; NOTE(review): "4,5,7" is presumably (args, GPRs, XMM registers) -- confirm.
; NOTE(review): r1 and r2 are used as store bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
1932 %macro PRED8x8L_VERTICAL_RIGHT 1
1933 cglobal pred8x8l_vertical_right_%1, 4,5,7
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
1936 movq mm0, [r0+r3*1-8]
1937 punpckhbw mm0, [r0+r3*0-8]
1938 movq mm1, [r4+r3*1-8]
1939 punpckhbw mm1, [r0+r3*2-8]
1943 movq mm2, [r0+r3*1-8]
1944 punpckhbw mm2, [r0+r3*0-8]
1946 movq mm3, [r0+r3*1-8]
1947 punpckhbw mm3, [r0+r3*0-8]
1951 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
1956 PALIGNR mm4, mm0, 7, mm0
1957 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
1984 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1991 PALIGNR mm2, mm0, 7, mm0
1992 PALIGNR mm1, mm4, 1, mm4
1998 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; Load the 0xff00-per-word constant (aligned 16-byte load).
; NOTE(review): presumably a byte-select mask -- confirm at its use site.
2003 movdqa xmm6, [pw_ff00]
2012 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; Store output rows: the first pair from the high halves of xmm5 / xmm2,
; the remaining pairs from the low halves.
2018 movhps [r0+r3*2], xmm5
2019 movhps [r0+r3*1], xmm2
2027 movq [r0+r3*2], xmm5
2028 movq [r0+r3*1], xmm2
2031 movq [r2+r3*2], xmm5
2032 movq [r2+r3*1], xmm2
2035 movq [r1+r3*2], xmm5
2036 movq [r1+r3*1], xmm2
; Select the PALIGNR implementation before each instantiation.
2041 %define PALIGNR PALIGNR_MMX
2042 PRED8x8L_VERTICAL_RIGHT sse2
2044 %define PALIGNR PALIGNR_SSSE3
2045 PRED8x8L_VERTICAL_RIGHT ssse3
2047 ;-----------------------------------------------------------------------------
2048 ; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2049 ;-----------------------------------------------------------------------------
; PRED8x8L_VERTICAL_LEFT %1: emits pred8x8l_vertical_left_%1. It is
; instantiated for sse2 with the MMX PALIGNR implementation, and for ssse3
; with the SSSE3 one.
; NOTE(review): r1 and r2 are used as store bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
2051 %macro PRED8x8L_VERTICAL_LEFT 1
2052 cglobal pred8x8l_vertical_left_%1, 4,4
; Byte-shifted copies feeding the low-pass filters below.
2059 PALIGNR mm2, mm0, 7, mm0
2060 PALIGNR mm1, mm4, 1, mm4
; Broadcast word 3 of mm3 into mm1.
2083 pshufw mm1, mm3, 0xFF
; Low-pass filter (see the PRED4x4_LOWPASS formula).
2086 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2095 PALIGNR mm2, mm3, 7, mm3
2096 PALIGNR mm5, mm4, 1, mm4
2097 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final low-pass filter on the 16-byte edge, result in xmm0.
2111 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
; Store row pairs: xmm3 to the row at +r3*1, xmm0 to the row at +r3*2.
; NOTE(review): presumably both registers are shifted between pairs -- confirm.
2113 movq [r0+r3*1], xmm3
2114 movq [r0+r3*2], xmm0
2118 movq [r1+r3*1], xmm3
2119 movq [r1+r3*2], xmm0
2122 movq [r2+r3*1], xmm3
2123 movq [r2+r3*2], xmm0
2126 movq [r0+r3*1], xmm3
2127 movq [r0+r3*2], xmm0
; Select the PALIGNR implementation before each instantiation.
2132 %define PALIGNR PALIGNR_MMX
2133 PRED8x8L_VERTICAL_LEFT sse2
2134 %define PALIGNR PALIGNR_SSSE3
2136 PRED8x8L_VERTICAL_LEFT ssse3
2138 ;-----------------------------------------------------------------------------
2139 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2140 ;-----------------------------------------------------------------------------
; PRED8x8L_HORIZONTAL_UP %1: emits pred8x8l_horizontal_up_%1. It is
; instantiated twice below, once per PALIGNR implementation (mmxext, ssse3).
; NOTE(review): r1 and r2 are used as address bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
2142 %macro PRED8x8L_HORIZONTAL_UP 1
2143 cglobal pred8x8l_horizontal_up_%1, 4,4
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
2146 movq mm0, [r0+r3*1-8]
2150 punpckhbw mm0, [r1+r3*0-8]
2151 movq mm1, [r2+r3*1-8]
2152 punpckhbw mm1, [r0+r3*2-8]
2156 movq mm2, [r0+r3*1-8]
2157 punpckhbw mm2, [r0+r3*0-8]
2159 movq mm3, [r0+r3*1-8]
2160 punpckhbw mm3, [r0+r3*0-8]
2164 movq mm0, [r0+r3*0-8]
2165 movq mm1, [r1+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
2169 PALIGNR mm4, mm0, 7, mm0
2170 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
2172 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2175 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2177 PALIGNR mm7, mm1, 7, mm3
; Reorder the filtered left pixels l0..l7 and build copies shifted by one
; and two positions with l7 replicated; layouts are in the trailing comments.
2179 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2180 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2184 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2191 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2193 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2195 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
; Interleave mm4 / mm5 with the filtered values in mm1 to form p1..p8.
2197 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2198 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; Derive further rows from the p vectors with 2/4/6-byte shifts and word shuffles.
; NOTE(review): presumably one register per output row -- confirm at the stores.
2202 PALIGNR mm5, mm4, 2, mm1
2203 pshufw mm1, mm6, 11111001b
2204 PALIGNR mm6, mm4, 4, mm2
2205 pshufw mm2, mm7, 11111110b
2206 PALIGNR mm7, mm4, 6, mm3
2207 pshufw mm3, mm0, 11111111b
; Select the PALIGNR implementation before each instantiation.
2221 %define PALIGNR PALIGNR_MMX
2222 PRED8x8L_HORIZONTAL_UP mmxext
2223 %define PALIGNR PALIGNR_SSSE3
2224 PRED8x8L_HORIZONTAL_UP ssse3
2226 ;-----------------------------------------------------------------------------
2227 ; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2228 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 8x8 luma "horizontal-down" prediction; uses the MMX
; implementation of PALIGNR.
2231 %define PALIGNR PALIGNR_MMX
2232 cglobal pred8x8l_horizontal_down_mmxext, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
2235 movq mm0, [r0+r3*1-8]
2236 punpckhbw mm0, [r0+r3*0-8]
2237 movq mm1, [r4+r3*1-8]
2238 punpckhbw mm1, [r0+r3*2-8]
2242 movq mm2, [r0+r3*1-8]
2243 punpckhbw mm2, [r0+r3*0-8]
2245 movq mm3, [r0+r3*1-8]
2246 punpckhbw mm3, [r0+r3*0-8]
2250 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
2255 PALIGNR mm4, mm0, 7, mm0
2256 PALIGNR mm1, mm2, 1, mm2
; Low-pass filter (see the PRED4x4_LOWPASS formula).
2283 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2287 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2289 PALIGNR mm7, mm1, 7, mm3
2295 PALIGNR mm2, mm0, 7, mm0
2296 PALIGNR mm1, mm4, 1, mm4
2302 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
; Combine the filtered edge vectors and filter again.
2309 PALIGNR mm2, mm6, 7, mm5
2310 PALIGNR mm6, mm7, 7, mm0
2312 PALIGNR mm4, mm3, 1, mm7
2315 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2321 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; Derive further rows by shifting in 2, 4 and 6 bytes from mm3, then from mm4.
; NOTE(review): presumably one register per output row -- confirm at the stores.
2329 PALIGNR mm7, mm3, 2, mm5
2331 PALIGNR mm1, mm3, 4, mm5
2333 PALIGNR mm0, mm3, 6, mm3
2338 PALIGNR mm6, mm4, 2, mm5
2340 PALIGNR mm2, mm4, 4, mm5
2342 PALIGNR mm3, mm4, 6, mm4
; PRED8x8L_HORIZONTAL_DOWN %1: emits pred8x8l_horizontal_down_%1. It is
; instantiated for sse2 with the MMX PALIGNR implementation, and for ssse3
; with the SSSE3 one.
; NOTE(review): r1 and r2 are used as store bases below although the
; prototype lists them as has_topleft / has_topright; presumably they are
; overwritten with row pointers first -- confirm.
2346 %macro PRED8x8L_HORIZONTAL_DOWN 1
2347 cglobal pred8x8l_horizontal_down_%1, 4,5
; Each movq reads the 8 bytes ending at offset -1 of a row; punpckhbw
; interleaves the high bytes of two such loads.
; NOTE(review): presumably this gathers the left-neighbour column -- confirm.
2350 movq mm0, [r0+r3*1-8]
2351 punpckhbw mm0, [r0+r3*0-8]
2352 movq mm1, [r4+r3*1-8]
2353 punpckhbw mm1, [r0+r3*2-8]
2357 movq mm2, [r0+r3*1-8]
2358 punpckhbw mm2, [r0+r3*0-8]
2360 movq mm3, [r0+r3*1-8]
2361 punpckhbw mm3, [r0+r3*0-8]
2365 movq mm0, [r0+r3*0-8]
; Byte-shifted copies feeding the low-pass filters below.
2370 PALIGNR mm4, mm0, 7, mm0
2371 PALIGNR mm1, mm2, 1, mm2
; Broadcast word 3 of mm3 into mm1.
2398 pshufw mm1, mm3, 0xFF
; Low-pass filter (see the PRED4x4_LOWPASS formula).
2402 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2406 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2416 PALIGNR mm2, mm0, 7, mm0
2417 PALIGNR mm1, mm4, 1, mm4
2423 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2432 PALIGNR mm2, mm3, 7, mm3
2433 PALIGNR mm5, mm4, 1, mm4
2434 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; 16-byte stage: build copies of the edge vector in xmm0 shifted by 7, 9
; and 8 bytes, then low-pass filter them into xmm0.
2443 PALIGNR xmm1, xmm0, 7, xmm4
2444 PALIGNR xmm2, xmm0, 9, xmm5
2446 PALIGNR xmm3, xmm0, 8, xmm0
2450 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
; Interleave the low bytes of xmm4 with the filtered bytes in xmm0.
2451 punpcklbw xmm4, xmm0
; Store the low 8 bytes of xmm4 / xmm0 to the eight output rows.
; NOTE(review): presumably both registers are shifted between stores -- confirm.
2453 movq [r0+r3*2], xmm4
2454 movq [r2+r3*2], xmm0
2457 movq [r0+r3*1], xmm4
2458 movq [r2+r3*1], xmm0
2461 movq [r1+r3*2], xmm4
2462 movq [r4+r3*2], xmm0
2465 movq [r1+r3*1], xmm4
2466 movq [r4+r3*1], xmm0
; Select the PALIGNR implementation before each instantiation.
2471 %define PALIGNR PALIGNR_MMX
2472 PRED8x8L_HORIZONTAL_DOWN sse2
2474 %define PALIGNR PALIGNR_SSSE3
2475 PRED8x8L_HORIZONTAL_DOWN ssse3
2477 ;-----------------------------------------------------------------------------
2478 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2479 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 DC prediction.
; NOTE(review): r1 (topright in the prototype) is overwritten by the loads
; below, so it is used as a scratch register here.
2481 cglobal pred4x4_dc_mmxext, 3,5
; Scalar loads of the byte at offset -1 of the rows at r0+r2 and r0+r2*2.
; NOTE(review): presumably the four left neighbours, with r0 advanced
; between pairs -- confirm.
2487 movzx r1d, byte [r0+r2*1-1]
2490 movzx r1d, byte [r0+r2*2-1]
2493 movzx r1d, byte [r0+r2*1-1]
2495 movzx r1d, byte [r0+r2*2-1]
; Replicate the low byte of r3d into all four bytes of the dword.
; NOTE(review): exact only when r3d is in 0..255 -- confirm.
2499 imul r3d, 0x01010101
2506 ;-----------------------------------------------------------------------------
2507 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2508 ;-----------------------------------------------------------------------------
; PRED4x4_TM_MMX %1: emits pred4x4_tm_vp8_%1; instantiated below with
; %1 = mmxext.
2510 %macro PRED4x4_TM_MMX 1
2511 cglobal pred4x4_tm_vp8_%1, 3,6
; Scalar byte loads from offset -1 of r0, r0+r2 and r0+r2*2.
; NOTE(review): presumably the top-left pixel and two left neighbours;
; r1 (topright in the prototype) is reused as scratch -- confirm.
2516 movzx r4d, byte [r0-1]
2519 movzx r1d, byte [r0+r2*1-1]
2520 movzx r3d, byte [r0+r2*2-1]
2547 PRED4x4_TM_MMX mmxext
; SSSE3 entry point for the 4x4 VP8 "TM" predictor.
2549 cglobal pred4x4_tm_vp8_ssse3, 3,3
; Load the 4 bytes that end just before each of four rows, so the byte at
; offset -1 ends up as byte 3 of each register.
; NOTE(review): r1 is used as a second row base here although the prototype
; lists it as topright; presumably it is recomputed first -- confirm.
2558 movd mm2, [r0+r2*1-4]
2559 movd mm3, [r0+r2*2-4]
2560 movd mm4, [r1+r2*1-4]
2561 movd mm5, [r1+r2*2-4]
2584 ;-----------------------------------------------------------------------------
2585 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2586 ;-----------------------------------------------------------------------------
; MMXEXT entry point for VP8 4x4 vertical prediction.
2589 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2593 mova m2, m0 ;t0 t1 t2 t3
; Append the four bytes at r1 (topright in the prototype) after t0..t3.
2594 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
; Shift right by one byte so each lane holds its right-hand neighbour.
2596 psrlq m0, 8 ;t1 t2 t3 t4
; Low-pass filter into m3 (see the PRED4x4_LOWPASS formula).
2597 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2604 ;-----------------------------------------------------------------------------
2605 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2606 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "down-left" prediction.
2608 cglobal pred4x4_down_left_mmxext, 3,3
; Low-pass filter into m0 (see the PRED4x4_LOWPASS formula).
; Operand order is dest, left, right, src, tmp.
2619 PRED4x4_LOWPASS m0, m1, m3, m4, m5
2631 ;-----------------------------------------------------------------------------
2632 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2633 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "vertical-left" prediction.
2636 cglobal pred4x4_vertical_left_mmxext, 3,3
; Low-pass filter into m0 (see the PRED4x4_LOWPASS formula).
; Operand order is dest, left, right, src, tmp.
2646 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2656 ;-----------------------------------------------------------------------------
2657 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2658 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "horizontal-up" prediction.
2661 cglobal pred4x4_horizontal_up_mmxext, 3,3
; Load the 4 bytes ending at offset -1 of each of four rows and interleave
; them pairwise.
; NOTE(review): presumably the left neighbours; r1 is used as a second row
; base although the prototype lists it as topright -- confirm.
2664 movd m0, [r0+r2*1-4]
2665 punpcklbw m0, [r0+r2*2-4]
2666 movd m1, [r1+r2*1-4]
2667 punpcklbw m1, [r1+r2*2-4]
; Low-pass filter into m4 (see the PRED4x4_LOWPASS formula).
2679 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2689 ;-----------------------------------------------------------------------------
2690 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2691 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "horizontal-down" prediction; uses the MMX
; implementation of PALIGNR.
; The trailing comments track register contents, most-significant byte
; first (lt = top-left, t0..t3 = top row, l0..l3 = left column).
; NOTE(review): r1 is used as a second row base although the prototype lists
; it as topright; presumably it is recomputed first -- confirm.
2694 %define PALIGNR PALIGNR_MMX
2695 cglobal pred4x4_horizontal_down_mmxext, 3,3
; Build one vector holding the top row, the top-left pixel and the left column.
2698 movh m0, [r0-4] ; lt ..
2699 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2700 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2701 movd m1, [r1+r2*2-4] ; l3
2702 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2703 movd m2, [r0+r2*2-4] ; l1
2704 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2705 punpckhwd m1, m2 ; l0 l1 l2 l3
2706 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; Copies shifted by two bytes and one byte for the filter below.
2710 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2711 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; Low-pass filter into m3 (see the PRED4x4_LOWPASS formula).
2713 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2716 PALIGNR m3, m5, 6, m4
2725 ;-----------------------------------------------------------------------------
2726 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2727 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "vertical-right" prediction; uses the MMX
; implementation of PALIGNR.
; NOTE(review): r1 is used as a row base although the prototype lists it as
; topright; presumably it is recomputed first -- confirm.
2730 %define PALIGNR PALIGNR_MMX
2731 cglobal pred4x4_vertical_right_mmxext, 3,3
; Start with the top row, then shift in one edge byte per PALIGNR step:
; first the top-left pixel, then the left neighbours l0..l2. Each memory
; operand is the 8 bytes ending at offset -1 of a row.
2734 movh m0, [r0] ; ........t3t2t1t0
2736 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2738 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2740 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2742 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; Low-pass filter into m3 (see the PRED4x4_LOWPASS formula).
2743 PRED4x4_LOWPASS m3, m1, m0, m2, m4
; NOTE(review): presumably these shifts form the lower output rows -- confirm.
2749 PALIGNR m5, m1, 7, m2
2752 PALIGNR m3, m1, 7, m1
2756 ;-----------------------------------------------------------------------------
2757 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2758 ;-----------------------------------------------------------------------------
; MMXEXT entry point for 4x4 "down-right" prediction; uses the MMX
; implementation of PALIGNR.
; NOTE(review): r1 is used as a row base although the prototype lists it as
; topright; presumably it is recomputed first -- confirm.
2761 %define PALIGNR PALIGNR_MMX
2762 cglobal pred4x4_down_right_mmxext, 3,3
; Read the 8 bytes ending at offset -1 of the rows at r0+r2 and r0, and
; interleave their high bytes.
; NOTE(review): presumably pairs the first left neighbour with the top-left
; pixel -- confirm.
2766 movq m2, [r0+r2*1-8]
2767 punpckhbw m2, [r0-8]
; Shift further edge bytes into m3, one byte per step for the two memory operands.
2770 PALIGNR m3, m1, 5, m1
2772 PALIGNR m3, [r1+r2*1-8], 7, m4
2774 PALIGNR m3, [r1+r2*2-8], 7, m4
; Low-pass filter into m0 (see the PRED4x4_LOWPASS formula).
2775 PRED4x4_LOWPASS m0, m3, m1, m2, m4