1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables referenced by the predictors in this file.
30 tm_shuf: times 8 db 0x03, 0x80 ; 16 bytes, the pair {0x03, 0x80} repeated 8x; loaded into xmm4 by pred8x8_tm_vp8_8_ssse3. NOTE(review): looks like a byte-shuffle control (index 3 / 0x80) - confirm at the use site
31 pw_ff00: times 8 dw 0xff00 ; eight words of 0xff00; loaded into xmm6 by the PRED8x8L_VERTICAL_RIGHT variants
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 ; signed byte weights, first 8 of 16; pmaddubsw operand for the 16x16 plane "H coefficients"
33 db 1, 2, 3, 4, 5, 6, 7, 8 ; second 8 bytes of plane_shuf (positive weights)
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 ; 8x8 counterpart of plane_shuf; the zero weights make the padded bytes contribute nothing to pmaddubsw
35 db 1, 2, 3, 4, 0, 0, 0, 0 ; second 8 bytes of plane8_shuf
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 ; word ramp; pmullw by it yields 0*H, 1*H, ..., 7*H in both plane predictors
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 ; word weights for the pmullw path of the 16x16 plane predictor; "+8" addresses words 4..7
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 ; negative word weights for the same path; "+8" addresses words 4..7
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 ; word weights for the pmullw path of the 8x8 plane predictor (note: no zero entry)
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_8_mmx, 2,3
71 cglobal pred16x16_vertical_8_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
91 cglobal pred16x16_horizontal_8, 2,3
128 ;-----------------------------------------------------------------------------
129 ; void pred16x16_dc(uint8_t *src, int stride)
130 ;-----------------------------------------------------------------------------
132 %macro PRED16x16_DC 0
133 cglobal pred16x16_dc_8, 2,7
141 movzx r5d, byte [r0+r1*1]
146 movzx r2d, byte [r0+r1*0]
147 movzx r3d, byte [r0+r1*1]
152 movzx r2d, byte [r0+r1*0]
159 SPLATB_REG m0, r2, m1
191 ;-----------------------------------------------------------------------------
192 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
193 ;-----------------------------------------------------------------------------
195 %macro PRED16x16_TM_MMX 0
196 cglobal pred16x16_tm_vp8_8, 2,5
207 movzx r3d, byte [r0-1]
210 movzx r2d, byte [r0+r1-1]
237 cglobal pred16x16_tm_vp8_8_sse2, 2,6,6
244 movzx r4d, byte [r0-1]
247 movzx r2d, byte [r0+r1*1-1]
248 movzx r3d, byte [r0+r1*2-1]
253 pshuflw xmm2, xmm2, 0
254 pshuflw xmm4, xmm4, 0
255 punpcklqdq xmm2, xmm2
256 punpcklqdq xmm4, xmm4
265 movdqa [r0+r1*1], xmm2
266 movdqa [r0+r1*2], xmm4
272 ;-----------------------------------------------------------------------------
273 ; void pred16x16_plane(uint8_t *src, int stride)
274 ;-----------------------------------------------------------------------------
276 %macro H264_PRED16x16_PLANE 1
277 cglobal pred16x16_plane_%1_8, 2,9,7
291 pmullw m0, [pw_m8tom1 ]
292 pmullw m1, [pw_m8tom1+8]
293 pmullw m2, [pw_1to8 ]
294 pmullw m3, [pw_1to8 +8]
299 movhps m0, [r0+r1 +8]
300 pmaddubsw m0, [plane_shuf] ; H coefficients
306 pmullw m0, [pw_m8tom1]
326 paddw m0, m1 ; sum of H coefficients
338 movzx e_reg, byte [r3+r2*2 ]
339 movzx r5, byte [r4+r1 ]
342 movzx e_reg, byte [r3+r2 ]
347 movzx e_reg, byte [r3+r1 ]
348 movzx r6, byte [r4+r2*2 ]
352 movzx e_reg, byte [r3 ]
354 movzx r7, byte [r4+r2 ]
357 movzx r6, byte [r4+r2 ]
366 movzx r4, byte [e_reg+r2 ]
378 movzx r4, byte [e_reg ]
380 movzx r7, byte [r3 +r2 ]
384 movzx r6, byte [r3 +r2 ]
390 movzx r4, byte [e_reg+r1 ]
391 movzx r6, byte [r3 +r2*2]
398 movzx r4, byte [e_reg+r2*2]
399 movzx r6, byte [r3 +r1 ]
402 add r5, r6 ; sum of V coefficients
419 lea r5, [r5*5] ; 5*(V/4)
423 sar r5, 4 ; (5*(V/4))/16
426 movzx r4, byte [r0+r1 +15]
427 movzx r3, byte [r3+r2*2 ]
445 lea r1d, [r1d*5] ; 5*(H/4)
449 sar r1d, 4 ; (5*(H/4))/16
470 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
479 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
480 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
482 paddw m5, m0 ; a + {8,9,10,11}*H
483 paddw m6, m0 ; a + {12,13,14,15}*H
488 mova m3, m0 ; b[0..7]
489 mova m4, m2 ; b[8..15]
495 mova m3, m5 ; b[8..11]
496 mova m4, m6 ; b[12..15]
509 mova m3, m0 ; b[0..7]
510 mova m4, m2 ; b[8..15]
516 mova m3, m5 ; b[8..11]
517 mova m4, m6 ; b[12..15]
; Instantiate the 16x16 plane predictor once per codec flavour. The macro
; argument (h264 / rv40 / svq3) is pasted into the symbol name
; pred16x16_plane_%1_8 declared by the macro's cglobal line.
; NOTE(review): the same triple is instantiated four times; each repetition
; has to be assembled under a different cpu/INIT_* selection, otherwise the
; repeated cglobal names would collide - confirm against the full file.
537 H264_PRED16x16_PLANE h264
538 H264_PRED16x16_PLANE rv40
539 H264_PRED16x16_PLANE svq3
541 H264_PRED16x16_PLANE h264
542 H264_PRED16x16_PLANE rv40
543 H264_PRED16x16_PLANE svq3
545 H264_PRED16x16_PLANE h264
546 H264_PRED16x16_PLANE rv40
547 H264_PRED16x16_PLANE svq3
549 H264_PRED16x16_PLANE h264
550 H264_PRED16x16_PLANE rv40
551 H264_PRED16x16_PLANE svq3
554 ;-----------------------------------------------------------------------------
555 ; void pred8x8_plane(uint8_t *src, int stride)
556 ;-----------------------------------------------------------------------------
558 %macro H264_PRED8x8_PLANE 0
559 cglobal pred8x8_plane_8, 2,9,7
569 pmullw m0, [pw_m4to4]
570 pmullw m1, [pw_m4to4+8]
573 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
574 pmaddubsw m0, [plane8_shuf] ; H coefficients
580 pmullw m0, [pw_m4to4]
586 %if notcpuflag(ssse3)
602 paddw m0, m1 ; sum of H coefficients
614 movzx e_reg, byte [r3+r2*2 ]
615 movzx r5, byte [r4+r1 ]
618 movzx e_reg, byte [r3 ]
620 movzx r7, byte [r4+r2 ]
624 movzx r6, byte [r4+r2 ]
630 movzx e_reg, byte [r3+r1 ]
631 movzx r6, byte [r4+r2*2 ]
638 movzx e_reg, byte [r3+r2 ]
651 movzx r3, byte [r4+r2*2 ]
652 movzx r4, byte [r0+r1 +7]
674 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
675 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
678 paddw m2, m0 ; a + {4,5,6,7}*H
685 mova m3, m0 ; b[0..7]
688 mova m4, m0 ; V+b[0..7]
695 mova m3, m0 ; b[0..3]
696 mova m4, m2 ; b[4..7]
701 mova m5, m0 ; V+b[0..3]
702 mova m6, m2 ; V+b[4..7]
729 ;-----------------------------------------------------------------------------
730 ; void pred8x8_vertical(uint8_t *src, int stride)
731 ;-----------------------------------------------------------------------------
733 cglobal pred8x8_vertical_8_mmx, 2,2
745 ;-----------------------------------------------------------------------------
746 ; void pred8x8_horizontal(uint8_t *src, int stride)
747 ;-----------------------------------------------------------------------------
750 cglobal pred8x8_horizontal_8, 2,3
756 SPLATB_LOAD m0, r0+r1*0-1, m2
757 SPLATB_LOAD m1, r0+r1*1-1, m2
774 ;-----------------------------------------------------------------------------
775 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
776 ;-----------------------------------------------------------------------------
777 cglobal pred8x8_top_dc_8_mmxext, 2,5
794 pshufw mm0, mm0, 0 ; dc0 (w)
795 packuswb mm0, mm1 ; dc0,dc1 (b)
807 ;-----------------------------------------------------------------------------
808 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
809 ;-----------------------------------------------------------------------------
812 cglobal pred8x8_dc_8_mmxext, 2,5
821 movzx r2d, byte [r0+r1*1-1]
822 movzx r3d, byte [r0+r1*2-1]
825 movzx r3d, byte [r0+r1*1-1]
827 movzx r3d, byte [r0+r1*2-1]
831 movzx r2d, byte [r0+r1*1-1]
832 movzx r3d, byte [r0+r1*2-1]
835 movzx r3d, byte [r0+r1*1-1]
837 movzx r3d, byte [r0+r1*2-1]
844 punpckldq m0, m2 ; s0, s1, s2, s3
845 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
847 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
851 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
868 ;-----------------------------------------------------------------------------
869 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
870 ;-----------------------------------------------------------------------------
872 cglobal pred8x8_dc_rv40_8_mmxext, 2,7
878 movzx r5d, byte [r0+r1*1]
882 movzx r2d, byte [r0+r1*0]
883 movzx r3d, byte [r0+r1*1]
888 movzx r2d, byte [r0+r1*0]
904 ;-----------------------------------------------------------------------------
905 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
906 ;-----------------------------------------------------------------------------
908 %macro PRED8x8_TM_MMX 0
909 cglobal pred8x8_tm_vp8_8, 2,6
916 movzx r4d, byte [r0-1]
919 movzx r2d, byte [r0+r1*1-1]
920 movzx r3d, byte [r0+r1*2-1]
949 cglobal pred8x8_tm_vp8_8_sse2, 2,6,4
954 movzx r4d, byte [r0-1]
957 movzx r2d, byte [r0+r1*1-1]
958 movzx r3d, byte [r0+r1*2-1]
963 pshuflw xmm2, xmm2, 0
964 pshuflw xmm3, xmm3, 0
965 punpcklqdq xmm2, xmm2
966 punpcklqdq xmm3, xmm3
971 movhps [r0+r1*2], xmm2
977 cglobal pred8x8_tm_vp8_8_ssse3, 2,3,6
979 movdqa xmm4, [tm_shuf]
987 movd xmm2, [r0+r1*1-4]
988 movd xmm3, [r0+r1*2-4]
997 movhps [r0+r1*2], xmm2
1003 ; dest, left, right, src, tmp
1004 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1005 %macro PRED4x4_LOWPASS 5
1015 ;-----------------------------------------------------------------------------
1016 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1017 ;-----------------------------------------------------------------------------
1018 %macro PRED8x8L_TOP_DC 1
1019 cglobal pred8x8l_top_dc_8_%1, 4,4
1027 PALIGNR mm2, mm0, 7, mm0
1028 PALIGNR mm1, mm4, 1, mm4
1029 test r1, r1 ; top_left
1031 test r2, r2 ; top_right
1040 test r2, r2 ; top_right
1049 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Emit pred8x8l_top_dc_8_mmxext and pred8x8l_top_dc_8_ssse3.
; PALIGNR is a single-line macro that is resolved when the PRED8x8L_TOP_DC
; body is expanded, so redefining it before each instantiation selects the
; byte-align implementation used inside that variant.
1066 %define PALIGNR PALIGNR_MMX
1067 PRED8x8L_TOP_DC mmxext
1068 %define PALIGNR PALIGNR_SSSE3
1069 PRED8x8L_TOP_DC ssse3
1071 ;-----------------------------------------------------------------------------
1072 ; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1073 ;-----------------------------------------------------------------------------
1075 %macro PRED8x8L_DC 1
1076 cglobal pred8x8l_dc_8_%1, 4,5
1079 movq mm0, [r0+r3*1-8]
1080 punpckhbw mm0, [r0+r3*0-8]
1081 movq mm1, [r4+r3*1-8]
1082 punpckhbw mm1, [r0+r3*2-8]
1086 movq mm2, [r0+r3*1-8]
1087 punpckhbw mm2, [r0+r3*0-8]
1089 movq mm3, [r0+r3*1-8]
1090 punpckhbw mm3, [r0+r3*0-8]
1094 movq mm0, [r0+r3*0-8]
1099 PALIGNR mm4, mm0, 7, mm0
1100 PALIGNR mm1, mm2, 1, mm2
1127 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1130 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1132 PALIGNR mm7, mm1, 7, mm3
1138 PALIGNR mm2, mm0, 7, mm0
1139 PALIGNR mm1, mm4, 1, mm4
1146 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; PALIGNR implementation selection for the PRED8x8L_DC variants.
; NOTE(review): a %define only affects macro expansions that follow it, so
; each of these two definitions should be followed by its own PRED8x8L_DC
; instantiation - verify that both invocations are present.
1169 %define PALIGNR PALIGNR_MMX
1171 %define PALIGNR PALIGNR_SSSE3
1174 ;-----------------------------------------------------------------------------
1175 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1176 ;-----------------------------------------------------------------------------
1178 %macro PRED8x8L_HORIZONTAL 1
1179 cglobal pred8x8l_horizontal_8_%1, 4,4
1182 movq mm0, [r0+r3*1-8]
1186 punpckhbw mm0, [r1+r3*0-8]
1187 movq mm1, [r2+r3*1-8]
1188 punpckhbw mm1, [r0+r3*2-8]
1192 movq mm2, [r0+r3*1-8]
1193 punpckhbw mm2, [r0+r3*0-8]
1195 movq mm3, [r0+r3*1-8]
1196 punpckhbw mm3, [r0+r3*0-8]
1200 movq mm0, [r0+r3*0-8]
1201 movq mm1, [r1+r3*0-8]
1205 PALIGNR mm4, mm0, 7, mm0
1206 PALIGNR mm1, mm2, 1, mm2
1208 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1211 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1213 PALIGNR mm7, mm1, 7, mm3
1219 pshufw mm0, mm3, 0xff
1220 pshufw mm1, mm3, 0xaa
1222 pshufw mm2, mm3, 0x55
1223 pshufw mm3, mm3, 0x00
1224 pshufw mm4, mm7, 0xff
1225 pshufw mm5, mm7, 0xaa
1226 pshufw mm6, mm7, 0x55
1227 pshufw mm7, mm7, 0x00
; Emit pred8x8l_horizontal_8_mmxext and pred8x8l_horizontal_8_ssse3.
; PALIGNR is resolved when the macro body is expanded, so the %define in
; effect at each instantiation picks that variant's byte-align implementation.
1241 %define PALIGNR PALIGNR_MMX
1242 PRED8x8L_HORIZONTAL mmxext
1243 %define PALIGNR PALIGNR_SSSE3
1244 PRED8x8L_HORIZONTAL ssse3
1246 ;-----------------------------------------------------------------------------
1247 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1248 ;-----------------------------------------------------------------------------
1250 %macro PRED8x8L_VERTICAL 1
1251 cglobal pred8x8l_vertical_8_%1, 4,4
1258 PALIGNR mm2, mm0, 7, mm0
1259 PALIGNR mm1, mm4, 1, mm4
1260 test r1, r1 ; top_left
1262 test r2, r2 ; top_right
1271 test r2, r2 ; top_right
1280 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; Emit pred8x8l_vertical_8_mmxext and pred8x8l_vertical_8_ssse3.
; PALIGNR is resolved when the macro body is expanded, so the %define in
; effect at each instantiation picks that variant's byte-align implementation.
1292 %define PALIGNR PALIGNR_MMX
1293 PRED8x8L_VERTICAL mmxext
1294 %define PALIGNR PALIGNR_SSSE3
1295 PRED8x8L_VERTICAL ssse3
1297 ;-----------------------------------------------------------------------------
1298 ; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1299 ;-----------------------------------------------------------------------------
1302 %define PALIGNR PALIGNR_MMX
1303 cglobal pred8x8l_down_left_8_mmxext, 4,5
1310 PALIGNR mm2, mm0, 7, mm0
1311 PALIGNR mm1, mm4, 1, mm4
1334 pshufw mm1, mm3, 0xFF
1337 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1346 PALIGNR mm2, mm3, 7, mm3
1347 PALIGNR mm5, mm4, 1, mm4
1348 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1356 PALIGNR mm2, mm7, 1, mm0
1358 PALIGNR mm3, mm7, 7, mm0
1359 PALIGNR mm4, mm6, 1, mm0
1365 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1366 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1410 %macro PRED8x8L_DOWN_LEFT 1
1411 cglobal pred8x8l_down_left_8_%1, 4,4
1418 PALIGNR mm2, mm0, 7, mm0
1419 PALIGNR mm1, mm4, 1, mm4
1420 test r1, r1 ; top_left
1422 test r2, r2 ; top_right
1431 test r2, r2 ; top_right
1442 pshufw mm1, mm3, 0xFF
1445 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1447 test r2, r2 ; top_right
1454 PALIGNR mm2, mm3, 7, mm3
1455 PALIGNR mm5, mm4, 1, mm4
1456 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1472 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1474 movq [r0+r3*1], xmm0
1476 movq [r0+r3*2], xmm0
1479 movq [r1+r3*1], xmm0
1481 movq [r1+r3*2], xmm0
1483 movq [r2+r3*1], xmm0
1485 movq [r2+r3*2], xmm0
1487 movq [r0+r3*1], xmm0
1489 movq [r0+r3*2], xmm0
; Emit pred8x8l_down_left_8_sse2 and pred8x8l_down_left_8_ssse3.
; The sse2 variant is assembled with PALIGNR mapped to PALIGNR_MMX and the
; ssse3 variant with PALIGNR_SSSE3; the mapping is resolved when the macro
; body is expanded.
; NOTE(review): presumably sse2 takes the PALIGNR_MMX path because the
; palignr instruction itself requires SSSE3 - confirm in x86util.asm.
1494 %define PALIGNR PALIGNR_MMX
1495 PRED8x8L_DOWN_LEFT sse2
1497 %define PALIGNR PALIGNR_SSSE3
1498 PRED8x8L_DOWN_LEFT ssse3
1500 ;-----------------------------------------------------------------------------
1501 ; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1502 ;-----------------------------------------------------------------------------
1505 %define PALIGNR PALIGNR_MMX
1506 cglobal pred8x8l_down_right_8_mmxext, 4,5
1509 movq mm0, [r0+r3*1-8]
1510 punpckhbw mm0, [r0+r3*0-8]
1511 movq mm1, [r4+r3*1-8]
1512 punpckhbw mm1, [r0+r3*2-8]
1516 movq mm2, [r0+r3*1-8]
1517 punpckhbw mm2, [r0+r3*0-8]
1519 movq mm3, [r0+r3*1-8]
1520 punpckhbw mm3, [r0+r3*0-8]
1524 movq mm0, [r0+r3*0-8]
1529 PALIGNR mm4, mm0, 7, mm0
1530 PALIGNR mm1, mm2, 1, mm2
1531 test r1, r1 ; top_left
1535 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1539 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1541 PALIGNR mm7, mm1, 7, mm3
1547 PALIGNR mm2, mm0, 7, mm0
1548 PALIGNR mm1, mm4, 1, mm4
1549 test r1, r1 ; top_left
1551 test r2, r2 ; top_right
1554 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1570 test r2, r2 ; top_right
1586 PALIGNR mm2, mm6, 1, mm0
1588 PALIGNR mm3, mm6, 7, mm0
1592 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1593 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1637 %macro PRED8x8L_DOWN_RIGHT 1
1638 cglobal pred8x8l_down_right_8_%1, 4,5
1641 movq mm0, [r0+r3*1-8]
1642 punpckhbw mm0, [r0+r3*0-8]
1643 movq mm1, [r4+r3*1-8]
1644 punpckhbw mm1, [r0+r3*2-8]
1648 movq mm2, [r0+r3*1-8]
1649 punpckhbw mm2, [r0+r3*0-8]
1651 movq mm3, [r0+r3*1-8]
1652 punpckhbw mm3, [r0+r3*0-8]
1656 movq mm0, [r0+r3*0-8]
1661 PALIGNR mm4, mm0, 7, mm0
1662 PALIGNR mm1, mm2, 1, mm2
1690 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1694 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1696 PALIGNR mm7, mm1, 7, mm3
1703 PALIGNR mm2, mm0, 7, mm0
1704 PALIGNR mm1, mm4, 1, mm4
1710 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1727 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1730 movq [r0+r3*2], xmm0
1731 movq [r0+r3*1], xmm1
1734 movq [r2+r3*2], xmm0
1735 movq [r2+r3*1], xmm1
1738 movq [r1+r3*2], xmm0
1739 movq [r1+r3*1], xmm1
1742 movq [r4+r3*2], xmm0
1743 movq [r4+r3*1], xmm1
; Emit pred8x8l_down_right_8_sse2 and pred8x8l_down_right_8_ssse3.
; The sse2 variant is assembled with PALIGNR mapped to PALIGNR_MMX and the
; ssse3 variant with PALIGNR_SSSE3; the mapping is resolved when the macro
; body is expanded.
; NOTE(review): presumably sse2 takes the PALIGNR_MMX path because the
; palignr instruction itself requires SSSE3 - confirm in x86util.asm.
1748 %define PALIGNR PALIGNR_MMX
1749 PRED8x8L_DOWN_RIGHT sse2
1751 %define PALIGNR PALIGNR_SSSE3
1752 PRED8x8L_DOWN_RIGHT ssse3
1754 ;-----------------------------------------------------------------------------
1755 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1756 ;-----------------------------------------------------------------------------
1759 %define PALIGNR PALIGNR_MMX
1760 cglobal pred8x8l_vertical_right_8_mmxext, 4,5
1763 movq mm0, [r0+r3*1-8]
1764 punpckhbw mm0, [r0+r3*0-8]
1765 movq mm1, [r4+r3*1-8]
1766 punpckhbw mm1, [r0+r3*2-8]
1770 movq mm2, [r0+r3*1-8]
1771 punpckhbw mm2, [r0+r3*0-8]
1773 movq mm3, [r0+r3*1-8]
1774 punpckhbw mm3, [r0+r3*0-8]
1778 movq mm0, [r0+r3*0-8]
1783 PALIGNR mm4, mm0, 7, mm0
1784 PALIGNR mm1, mm2, 1, mm2
1812 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1819 PALIGNR mm2, mm0, 7, mm0
1820 PALIGNR mm1, mm4, 1, mm4
1826 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1830 PALIGNR mm3, mm7, 7, mm0
1831 PALIGNR mm6, mm7, 6, mm1
1835 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1846 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1847 PALIGNR mm6, mm0, 7, mm2
1850 PALIGNR mm5, mm0, 7, mm1
1853 PALIGNR mm6, mm0, 7, mm2
1856 PALIGNR mm5, mm0, 7, mm1
1859 PALIGNR mm6, mm0, 7, mm2
1862 PALIGNR mm5, mm0, 7, mm1
1866 %macro PRED8x8L_VERTICAL_RIGHT 1
1867 cglobal pred8x8l_vertical_right_8_%1, 4,5,7
1868 ; manually spill XMM registers for Win64 because
1869 ; the code here is initialized with INIT_MMX
1873 movq mm0, [r0+r3*1-8]
1874 punpckhbw mm0, [r0+r3*0-8]
1875 movq mm1, [r4+r3*1-8]
1876 punpckhbw mm1, [r0+r3*2-8]
1880 movq mm2, [r0+r3*1-8]
1881 punpckhbw mm2, [r0+r3*0-8]
1883 movq mm3, [r0+r3*1-8]
1884 punpckhbw mm3, [r0+r3*0-8]
1888 movq mm0, [r0+r3*0-8]
1893 PALIGNR mm4, mm0, 7, mm0
1894 PALIGNR mm1, mm2, 1, mm2
1921 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1928 PALIGNR mm2, mm0, 7, mm0
1929 PALIGNR mm1, mm4, 1, mm4
1935 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1940 movdqa xmm6, [pw_ff00]
1949 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
1955 movhps [r0+r3*2], xmm5
1956 movhps [r0+r3*1], xmm2
1964 movq [r0+r3*2], xmm5
1965 movq [r0+r3*1], xmm2
1968 movq [r2+r3*2], xmm5
1969 movq [r2+r3*1], xmm2
1972 movq [r1+r3*2], xmm5
1973 movq [r1+r3*1], xmm2
; Emit pred8x8l_vertical_right_8_sse2 and pred8x8l_vertical_right_8_ssse3.
; The sse2 variant is assembled with PALIGNR mapped to PALIGNR_MMX and the
; ssse3 variant with PALIGNR_SSSE3; the mapping is resolved when the macro
; body is expanded.
; NOTE(review): presumably sse2 takes the PALIGNR_MMX path because the
; palignr instruction itself requires SSSE3 - confirm in x86util.asm.
1978 %define PALIGNR PALIGNR_MMX
1979 PRED8x8L_VERTICAL_RIGHT sse2
1981 %define PALIGNR PALIGNR_SSSE3
1982 PRED8x8L_VERTICAL_RIGHT ssse3
1984 ;-----------------------------------------------------------------------------
1985 ; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1986 ;-----------------------------------------------------------------------------
1988 %macro PRED8x8L_VERTICAL_LEFT 1
1989 cglobal pred8x8l_vertical_left_8_%1, 4,4
1996 PALIGNR mm2, mm0, 7, mm0
1997 PALIGNR mm1, mm4, 1, mm4
2020 pshufw mm1, mm3, 0xFF
2023 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2032 PALIGNR mm2, mm3, 7, mm3
2033 PALIGNR mm5, mm4, 1, mm4
2034 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2048 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2050 movq [r0+r3*1], xmm3
2051 movq [r0+r3*2], xmm0
2055 movq [r1+r3*1], xmm3
2056 movq [r1+r3*2], xmm0
2059 movq [r2+r3*1], xmm3
2060 movq [r2+r3*2], xmm0
2063 movq [r0+r3*1], xmm3
2064 movq [r0+r3*2], xmm0
; Emit pred8x8l_vertical_left_8_sse2 and pred8x8l_vertical_left_8_ssse3.
; The sse2 variant is assembled with PALIGNR mapped to PALIGNR_MMX and the
; ssse3 variant with PALIGNR_SSSE3; the mapping is resolved when the macro
; body is expanded.
; NOTE(review): presumably sse2 takes the PALIGNR_MMX path because the
; palignr instruction itself requires SSSE3 - confirm in x86util.asm.
2069 %define PALIGNR PALIGNR_MMX
2070 PRED8x8L_VERTICAL_LEFT sse2
2071 %define PALIGNR PALIGNR_SSSE3
2073 PRED8x8L_VERTICAL_LEFT ssse3
2075 ;-----------------------------------------------------------------------------
2076 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2077 ;-----------------------------------------------------------------------------
2079 %macro PRED8x8L_HORIZONTAL_UP 1
2080 cglobal pred8x8l_horizontal_up_8_%1, 4,4
2083 movq mm0, [r0+r3*1-8]
2087 punpckhbw mm0, [r1+r3*0-8]
2088 movq mm1, [r2+r3*1-8]
2089 punpckhbw mm1, [r0+r3*2-8]
2093 movq mm2, [r0+r3*1-8]
2094 punpckhbw mm2, [r0+r3*0-8]
2096 movq mm3, [r0+r3*1-8]
2097 punpckhbw mm3, [r0+r3*0-8]
2101 movq mm0, [r0+r3*0-8]
2102 movq mm1, [r1+r3*0-8]
2106 PALIGNR mm4, mm0, 7, mm0
2107 PALIGNR mm1, mm2, 1, mm2
2109 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2112 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2114 PALIGNR mm7, mm1, 7, mm3
2116 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2117 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2121 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2128 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2130 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2132 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2134 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2135 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2139 PALIGNR mm5, mm4, 2, mm1
2140 pshufw mm1, mm6, 11111001b
2141 PALIGNR mm6, mm4, 4, mm2
2142 pshufw mm2, mm7, 11111110b
2143 PALIGNR mm7, mm4, 6, mm3
2144 pshufw mm3, mm0, 11111111b
; Emit pred8x8l_horizontal_up_8_mmxext and pred8x8l_horizontal_up_8_ssse3.
; PALIGNR is resolved when the macro body is expanded, so the %define in
; effect at each instantiation picks that variant's byte-align implementation.
2158 %define PALIGNR PALIGNR_MMX
2159 PRED8x8L_HORIZONTAL_UP mmxext
2160 %define PALIGNR PALIGNR_SSSE3
2161 PRED8x8L_HORIZONTAL_UP ssse3
2163 ;-----------------------------------------------------------------------------
2164 ; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2165 ;-----------------------------------------------------------------------------
2168 %define PALIGNR PALIGNR_MMX
2169 cglobal pred8x8l_horizontal_down_8_mmxext, 4,5
2172 movq mm0, [r0+r3*1-8]
2173 punpckhbw mm0, [r0+r3*0-8]
2174 movq mm1, [r4+r3*1-8]
2175 punpckhbw mm1, [r0+r3*2-8]
2179 movq mm2, [r0+r3*1-8]
2180 punpckhbw mm2, [r0+r3*0-8]
2182 movq mm3, [r0+r3*1-8]
2183 punpckhbw mm3, [r0+r3*0-8]
2187 movq mm0, [r0+r3*0-8]
2192 PALIGNR mm4, mm0, 7, mm0
2193 PALIGNR mm1, mm2, 1, mm2
2220 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2224 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2226 PALIGNR mm7, mm1, 7, mm3
2232 PALIGNR mm2, mm0, 7, mm0
2233 PALIGNR mm1, mm4, 1, mm4
2239 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2246 PALIGNR mm2, mm6, 7, mm5
2247 PALIGNR mm6, mm7, 7, mm0
2249 PALIGNR mm4, mm3, 1, mm7
2252 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2258 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2266 PALIGNR mm7, mm3, 2, mm5
2268 PALIGNR mm1, mm3, 4, mm5
2270 PALIGNR mm0, mm3, 6, mm3
2275 PALIGNR mm6, mm4, 2, mm5
2277 PALIGNR mm2, mm4, 4, mm5
2279 PALIGNR mm3, mm4, 6, mm4
2283 %macro PRED8x8L_HORIZONTAL_DOWN 1
2284 cglobal pred8x8l_horizontal_down_8_%1, 4,5
2287 movq mm0, [r0+r3*1-8]
2288 punpckhbw mm0, [r0+r3*0-8]
2289 movq mm1, [r4+r3*1-8]
2290 punpckhbw mm1, [r0+r3*2-8]
2294 movq mm2, [r0+r3*1-8]
2295 punpckhbw mm2, [r0+r3*0-8]
2297 movq mm3, [r0+r3*1-8]
2298 punpckhbw mm3, [r0+r3*0-8]
2302 movq mm0, [r0+r3*0-8]
2307 PALIGNR mm4, mm0, 7, mm0
2308 PALIGNR mm1, mm2, 1, mm2
2335 pshufw mm1, mm3, 0xFF
2339 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2343 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2353 PALIGNR mm2, mm0, 7, mm0
2354 PALIGNR mm1, mm4, 1, mm4
2360 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2369 PALIGNR mm2, mm3, 7, mm3
2370 PALIGNR mm5, mm4, 1, mm4
2371 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2380 PALIGNR xmm1, xmm0, 7, xmm4
2381 PALIGNR xmm2, xmm0, 9, xmm5
2383 PALIGNR xmm3, xmm0, 8, xmm0
2387 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2388 punpcklbw xmm4, xmm0
2390 movq [r0+r3*2], xmm4
2391 movq [r2+r3*2], xmm0
2394 movq [r0+r3*1], xmm4
2395 movq [r2+r3*1], xmm0
2398 movq [r1+r3*2], xmm4
2399 movq [r4+r3*2], xmm0
2402 movq [r1+r3*1], xmm4
2403 movq [r4+r3*1], xmm0
; Emit pred8x8l_horizontal_down_8_sse2 and pred8x8l_horizontal_down_8_ssse3.
; The macro body applies PALIGNR to both mm and xmm operands, so whichever
; implementation is selected here must accept both register widths.
; NOTE(review): presumably sse2 takes the PALIGNR_MMX path because the
; palignr instruction itself requires SSSE3 - confirm in x86util.asm that
; PALIGNR_MMX handles xmm registers.
2408 %define PALIGNR PALIGNR_MMX
2409 PRED8x8L_HORIZONTAL_DOWN sse2
2411 %define PALIGNR PALIGNR_SSSE3
2412 PRED8x8L_HORIZONTAL_DOWN ssse3
2414 ;-----------------------------------------------------------------------------
2415 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2416 ;-----------------------------------------------------------------------------
2418 cglobal pred4x4_dc_8_mmxext, 3,5
2424 movzx r1d, byte [r0+r2*1-1]
2427 movzx r1d, byte [r0+r2*2-1]
2430 movzx r1d, byte [r0+r2*1-1]
2432 movzx r1d, byte [r0+r2*2-1]
2436 imul r3d, 0x01010101
2443 ;-----------------------------------------------------------------------------
2444 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2445 ;-----------------------------------------------------------------------------
2447 %macro PRED4x4_TM_MMX 0
2448 cglobal pred4x4_tm_vp8_8, 3,6
2453 movzx r4d, byte [r0-1]
2456 movzx r1d, byte [r0+r2*1-1]
2457 movzx r3d, byte [r0+r2*2-1]
2489 cglobal pred4x4_tm_vp8_8_ssse3, 3,3
2498 movd mm2, [r0+r2*1-4]
2499 movd mm3, [r0+r2*2-4]
2500 movd mm4, [r1+r2*1-4]
2501 movd mm5, [r1+r2*2-4]
2524 ;-----------------------------------------------------------------------------
2525 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2526 ;-----------------------------------------------------------------------------
2529 cglobal pred4x4_vertical_vp8_8_mmxext, 3,3
2533 mova m2, m0 ;t0 t1 t2 t3
2534 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2536 psrlq m0, 8 ;t1 t2 t3 t4
2537 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2544 ;-----------------------------------------------------------------------------
2545 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2546 ;-----------------------------------------------------------------------------
2548 cglobal pred4x4_down_left_8_mmxext, 3,3
2558 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2570 ;-----------------------------------------------------------------------------
2571 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2572 ;-----------------------------------------------------------------------------
2575 cglobal pred4x4_vertical_left_8_mmxext, 3,3
2585 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2595 ;-----------------------------------------------------------------------------
2596 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2597 ;-----------------------------------------------------------------------------
2600 cglobal pred4x4_horizontal_up_8_mmxext, 3,3
2603 movd m0, [r0+r2*1-4]
2604 punpcklbw m0, [r0+r2*2-4]
2605 movd m1, [r1+r2*1-4]
2606 punpcklbw m1, [r1+r2*2-4]
2618 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2628 ;-----------------------------------------------------------------------------
2629 ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2630 ;-----------------------------------------------------------------------------
2633 %define PALIGNR PALIGNR_MMX
2634 cglobal pred4x4_horizontal_down_8_mmxext, 3,3
2637 movh m0, [r0-4] ; lt ..
2638 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2639 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2640 movd m1, [r1+r2*2-4] ; l3
2641 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2642 movd m2, [r0+r2*2-4] ; l1
2643 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2644 punpckhwd m1, m2 ; l0 l1 l2 l3
2645 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2649 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2650 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2652 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2655 PALIGNR m3, m5, 6, m4
2664 ;-----------------------------------------------------------------------------
2665 ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2666 ;-----------------------------------------------------------------------------
2669 %define PALIGNR PALIGNR_MMX
2670 cglobal pred4x4_vertical_right_8_mmxext, 3,3
2673 movh m0, [r0] ; ........t3t2t1t0
2675 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2677 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2679 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2681 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2682 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2688 PALIGNR m5, m1, 7, m2
2691 PALIGNR m3, m1, 7, m1
2695 ;-----------------------------------------------------------------------------
2696 ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2697 ;-----------------------------------------------------------------------------
2700 %define PALIGNR PALIGNR_MMX
2701 cglobal pred4x4_down_right_8_mmxext, 3,3
2705 movq m2, [r0+r2*1-8]
2706 punpckhbw m2, [r0-8]
2709 PALIGNR m3, m1, 5, m1
2711 PALIGNR m3, [r1+r2*1-8], 7, m4
2713 PALIGNR m3, [r1+r2*2-8], 7, m4
2714 PRED4x4_LOWPASS m0, m3, m1, m2, m4