1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
30 tm_shuf: times 8 db 0x03, 0x80
31 pw_ff00: times 8 dw 0xff00
32 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
33 db 1, 2, 3, 4, 5, 6, 7, 8
34 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
35 db 1, 2, 3, 4, 0, 0, 0, 0
36 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
37 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
38 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
39 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
52 ;-----------------------------------------------------------------------------
53 ; void pred16x16_vertical(uint8_t *src, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_mmx, 2,3
71 cglobal pred16x16_vertical_sse, 2,3
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
79 movaps [r0+r1*1], xmm0
80 movaps [r0+r1*2], xmm0
86 ;-----------------------------------------------------------------------------
87 ; void pred16x16_horizontal(uint8_t *src, int stride)
88 ;-----------------------------------------------------------------------------
91 cglobal pred16x16_horizontal_%1, 2,3
133 ;-----------------------------------------------------------------------------
134 ; void pred16x16_dc(uint8_t *src, int stride)
135 ;-----------------------------------------------------------------------------
137 %macro PRED16x16_DC 1
138 cglobal pred16x16_dc_%1, 2,7
146 movzx r5d, byte [r0+r1*1]
151 movzx r2d, byte [r0+r1*0]
152 movzx r3d, byte [r0+r1*1]
157 movzx r2d, byte [r0+r1*0]
204 ;-----------------------------------------------------------------------------
205 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
206 ;-----------------------------------------------------------------------------
208 %macro PRED16x16_TM_MMX 1
209 cglobal pred16x16_tm_vp8_%1, 2,5
220 movzx r3d, byte [r0-1]
223 movzx r2d, byte [r0+r1-1]
250 PRED16x16_TM_MMX mmxext
252 cglobal pred16x16_tm_vp8_sse2, 2,6,6
259 movzx r4d, byte [r0-1]
262 movzx r2d, byte [r0+r1*1-1]
263 movzx r3d, byte [r0+r1*2-1]
268 pshuflw xmm2, xmm2, 0
269 pshuflw xmm4, xmm4, 0
270 punpcklqdq xmm2, xmm2
271 punpcklqdq xmm4, xmm4
280 movdqa [r0+r1*1], xmm2
281 movdqa [r0+r1*2], xmm4
287 ;-----------------------------------------------------------------------------
288 ; void pred16x16_plane(uint8_t *src, int stride)
289 ;-----------------------------------------------------------------------------
291 %macro H264_PRED16x16_PLANE 3
292 cglobal pred16x16_plane_%3_%1, 2, 7, %2
306 pmullw m0, [pw_m8tom1 ]
307 pmullw m1, [pw_m8tom1+8]
308 pmullw m2, [pw_1to8 ]
309 pmullw m3, [pw_1to8 +8]
318 pmullw m0, [pw_m8tom1]
322 movhps m0, [r0+r1 +8]
323 pmaddubsw m0, [plane_shuf] ; H coefficients
345 paddw m0, m1 ; sum of H coefficients
361 lea r3, [r3*5] ; 5*(H/4)
365 sar r3, 4 ; (5*(H/4))/16
379 movzx e_reg, byte [r3+r2*2 ]
380 movzx r5, byte [r4+r1 ]
383 movzx e_reg, byte [r3+r2 ]
388 movzx e_reg, byte [r3+r1 ]
389 movzx r6, byte [r4+r2*2 ]
393 movzx e_reg, byte [r3 ]
395 movzx r10, byte [r4+r2 ]
398 movzx r6, byte [r4+r2 ]
407 movzx r4, byte [e_reg+r2 ]
419 movzx r4, byte [e_reg ]
421 movzx r10, byte [r3 +r2 ]
425 movzx r6, byte [r3 +r2 ]
431 movzx r4, byte [e_reg+r1 ]
432 movzx r6, byte [r3 +r2*2]
439 movzx r4, byte [e_reg+r2*2]
440 movzx r6, byte [r3 +r1 ]
443 add r5, r6 ; sum of V coefficients
460 lea r5, [r5*5] ; 5*(V/4)
464 sar r5, 4 ; (5*(V/4))/16
467 movzx r4, byte [r0+r1 +15]
468 movzx r3, byte [r3+r2*2 ]
495 punpcklqdq m0, m0 ; splat H (words)
496 punpcklqdq m1, m1 ; splat V (words)
497 punpcklqdq m3, m3 ; splat a (words)
506 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
515 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
516 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
518 paddw m5, m0 ; a + {8,9,10,11}*H
519 paddw m6, m0 ; a + {12,13,14,15}*H
524 mova m3, m0 ; b[0..7]
525 mova m4, m2 ; b[8..15]
531 mova m3, m5 ; b[8..11]
532 mova m4, m6 ; b[12..15]
545 mova m3, m0 ; b[0..7]
546 mova m4, m2 ; b[8..15]
552 mova m3, m5 ; b[8..11]
553 mova m4, m6 ; b[12..15]
573 H264_PRED16x16_PLANE mmx, 0, h264
574 H264_PRED16x16_PLANE mmx, 0, rv40
575 H264_PRED16x16_PLANE mmx, 0, svq3
576 H264_PRED16x16_PLANE mmx2, 0, h264
577 H264_PRED16x16_PLANE mmx2, 0, rv40
578 H264_PRED16x16_PLANE mmx2, 0, svq3
580 H264_PRED16x16_PLANE sse2, 8, h264
581 H264_PRED16x16_PLANE sse2, 8, rv40
582 H264_PRED16x16_PLANE sse2, 8, svq3
583 H264_PRED16x16_PLANE ssse3, 8, h264
584 H264_PRED16x16_PLANE ssse3, 8, rv40
585 H264_PRED16x16_PLANE ssse3, 8, svq3
587 ;-----------------------------------------------------------------------------
588 ; void pred8x8_plane(uint8_t *src, int stride)
589 ;-----------------------------------------------------------------------------
591 %macro H264_PRED8x8_PLANE 2
592 cglobal pred8x8_plane_%1, 2, 7, %2
602 pmullw m0, [pw_m4to4]
603 pmullw m1, [pw_m4to4+8]
610 pmullw m0, [pw_m4to4]
612 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
613 pmaddubsw m0, [plane8_shuf] ; H coefficients
639 paddw m0, m1 ; sum of H coefficients
655 movzx e_reg, byte [r3+r2*2 ]
656 movzx r5, byte [r4+r1 ]
659 movzx e_reg, byte [r3 ]
661 movzx r10, byte [r4+r2 ]
665 movzx r6, byte [r4+r2 ]
671 movzx e_reg, byte [r3+r1 ]
672 movzx r6, byte [r4+r2*2 ]
679 movzx e_reg, byte [r3+r2 ]
692 movzx r3, byte [r4+r2*2 ]
693 movzx r4, byte [r0+r1 +7]
720 punpcklqdq m0, m0 ; splat H (words)
721 punpcklqdq m1, m1 ; splat V (words)
722 punpcklqdq m3, m3 ; splat a (words)
727 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
728 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
731 paddw m2, m0 ; a + {4,5,6,7}*H
738 mova m3, m0 ; b[0..7]
741 mova m4, m0 ; V+b[0..7]
748 mova m3, m0 ; b[0..3]
749 mova m4, m2 ; b[4..7]
754 mova m5, m0 ; V+b[0..3]
755 mova m6, m2 ; V+b[4..7]
773 H264_PRED8x8_PLANE mmx, 0
774 H264_PRED8x8_PLANE mmx2, 0
776 H264_PRED8x8_PLANE sse2, 8
777 H264_PRED8x8_PLANE ssse3, 8
779 ;-----------------------------------------------------------------------------
780 ; void pred8x8_vertical(uint8_t *src, int stride)
781 ;-----------------------------------------------------------------------------
783 cglobal pred8x8_vertical_mmx, 2,2
795 ;-----------------------------------------------------------------------------
796 ; void pred8x8_horizontal(uint8_t *src, int stride)
797 ;-----------------------------------------------------------------------------
800 cglobal pred8x8_horizontal_%1, 2,3
837 ;-----------------------------------------------------------------------------
838 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
839 ;-----------------------------------------------------------------------------
841 cglobal pred8x8_top_dc_mmxext, 2,5
858 pshufw mm0, mm0, 0 ; dc0 (w)
859 packuswb mm0, mm1 ; dc0,dc1 (b)
871 ;-----------------------------------------------------------------------------
872 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
873 ;-----------------------------------------------------------------------------
876 cglobal pred8x8_dc_mmxext, 2,5
885 movzx r2d, byte [r0+r1*1-1]
886 movzx r3d, byte [r0+r1*2-1]
889 movzx r3d, byte [r0+r1*1-1]
891 movzx r3d, byte [r0+r1*2-1]
895 movzx r2d, byte [r0+r1*1-1]
896 movzx r3d, byte [r0+r1*2-1]
899 movzx r3d, byte [r0+r1*1-1]
901 movzx r3d, byte [r0+r1*2-1]
908 punpckldq m0, m2 ; s0, s1, s2, s3
909 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
911 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
915 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
933 ;-----------------------------------------------------------------------------
934 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
935 ;-----------------------------------------------------------------------------
937 cglobal pred8x8_dc_rv40_mmxext, 2,7
943 movzx r5d, byte [r0+r1*1]
947 movzx r2d, byte [r0+r1*0]
948 movzx r3d, byte [r0+r1*1]
953 movzx r2d, byte [r0+r1*0]
969 ;-----------------------------------------------------------------------------
970 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
971 ;-----------------------------------------------------------------------------
973 %macro PRED8x8_TM_MMX 1
974 cglobal pred8x8_tm_vp8_%1, 2,6
981 movzx r4d, byte [r0-1]
984 movzx r2d, byte [r0+r1*1-1]
985 movzx r3d, byte [r0+r1*2-1]
1016 PRED8x8_TM_MMX mmxext
1018 cglobal pred8x8_tm_vp8_sse2, 2,6,4
1022 punpcklbw xmm0, xmm1
1023 movzx r4d, byte [r0-1]
1026 movzx r2d, byte [r0+r1*1-1]
1027 movzx r3d, byte [r0+r1*2-1]
1032 pshuflw xmm2, xmm2, 0
1033 pshuflw xmm3, xmm3, 0
1034 punpcklqdq xmm2, xmm2
1035 punpcklqdq xmm3, xmm3
1039 movq [r0+r1*1], xmm2
1040 movhps [r0+r1*2], xmm2
1046 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
1048 movdqa xmm4, [tm_shuf]
1051 punpcklbw xmm0, xmm1
1056 movd xmm2, [r0+r1*1-4]
1057 movd xmm3, [r0+r1*2-4]
1065 movq [r0+r1*1], xmm2
1066 movhps [r0+r1*2], xmm2
1072 ; dest, left, right, src, tmp
1073 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1074 %macro PRED4x4_LOWPASS 5
1084 ;-----------------------------------------------------------------------------
1085 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1086 ;-----------------------------------------------------------------------------
1088 %macro PRED8x8L_TOP_DC 1
1089 cglobal pred8x8l_top_dc_%1, 4,4
1097 PALIGNR mm2, mm0, 7, mm0
1098 PALIGNR mm1, mm4, 1, mm4
1099 test r1, r1 ; top_left
1101 test r2, r2 ; top_right
1110 test r2, r2 ; top_right
1119 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1136 %define PALIGNR PALIGNR_MMX
1137 PRED8x8L_TOP_DC mmxext
1138 %define PALIGNR PALIGNR_SSSE3
1139 PRED8x8L_TOP_DC ssse3
1141 ;-----------------------------------------------------------------------------
1142 ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1143 ;-----------------------------------------------------------------------------
1145 %macro PRED8x8L_DC 1
1146 cglobal pred8x8l_dc_%1, 4,5
1149 movq mm0, [r0+r3*1-8]
1150 punpckhbw mm0, [r0+r3*0-8]
1151 movq mm1, [r4+r3*1-8]
1152 punpckhbw mm1, [r0+r3*2-8]
1156 movq mm2, [r0+r3*1-8]
1157 punpckhbw mm2, [r0+r3*0-8]
1159 movq mm3, [r0+r3*1-8]
1160 punpckhbw mm3, [r0+r3*0-8]
1164 movq mm0, [r0+r3*0-8]
1169 PALIGNR mm4, mm0, 7, mm0
1170 PALIGNR mm1, mm2, 1, mm2
1197 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1200 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1202 PALIGNR mm7, mm1, 7, mm3
1208 PALIGNR mm2, mm0, 7, mm0
1209 PALIGNR mm1, mm4, 1, mm4
1216 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1239 %define PALIGNR PALIGNR_MMX
1241 %define PALIGNR PALIGNR_SSSE3
1244 ;-----------------------------------------------------------------------------
1245 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1246 ;-----------------------------------------------------------------------------
1248 %macro PRED8x8L_HORIZONTAL 1
1249 cglobal pred8x8l_horizontal_%1, 4,4
1252 movq mm0, [r0+r3*1-8]
1253 punpckhbw mm0, [r0+r3*0-8]
1254 movq mm1, [r2+r3*1-8]
1255 punpckhbw mm1, [r0+r3*2-8]
1259 movq mm2, [r0+r3*1-8]
1260 punpckhbw mm2, [r0+r3*0-8]
1262 movq mm3, [r0+r3*1-8]
1263 punpckhbw mm3, [r0+r3*0-8]
1267 movq mm0, [r0+r3*0-8]
1272 PALIGNR mm4, mm0, 7, mm0
1273 PALIGNR mm1, mm2, 1, mm2
1274 test r1, r1 ; top_left
1284 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1287 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1289 PALIGNR mm7, mm1, 7, mm3
1295 pshufw mm0, mm3, 0xff
1296 pshufw mm1, mm3, 0xaa
1298 pshufw mm2, mm3, 0x55
1299 pshufw mm3, mm3, 0x00
1300 pshufw mm4, mm7, 0xff
1301 pshufw mm5, mm7, 0xaa
1302 pshufw mm6, mm7, 0x55
1303 pshufw mm7, mm7, 0x00
1317 %define PALIGNR PALIGNR_MMX
1318 PRED8x8L_HORIZONTAL mmxext
1319 %define PALIGNR PALIGNR_SSSE3
1320 PRED8x8L_HORIZONTAL ssse3
1322 ;-----------------------------------------------------------------------------
1323 ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1324 ;-----------------------------------------------------------------------------
1326 %macro PRED8x8L_VERTICAL 1
1327 cglobal pred8x8l_vertical_%1, 4,4
1334 PALIGNR mm2, mm0, 7, mm0
1335 PALIGNR mm1, mm4, 1, mm4
1336 test r1, r1 ; top_left
1338 test r2, r2 ; top_right
1347 test r2, r2 ; top_right
1356 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1368 %define PALIGNR PALIGNR_MMX
1369 PRED8x8L_VERTICAL mmxext
1370 %define PALIGNR PALIGNR_SSSE3
1371 PRED8x8L_VERTICAL ssse3
1373 ;-----------------------------------------------------------------------------
1374 ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1375 ;-----------------------------------------------------------------------------
1377 %macro PRED8x8L_DOWN_LEFT 1
1378 cglobal pred8x8l_down_left_%1, 4,4
1385 PALIGNR mm2, mm0, 7, mm0
1386 PALIGNR mm1, mm4, 1, mm4
1387 test r1, r1 ; top_left
1389 test r2, r2 ; top_right
1398 test r2, r2 ; top_right
1409 pshufw mm1, mm3, 0xFF
1412 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1414 test r2, r2 ; top_right
1421 PALIGNR mm2, mm3, 7, mm3
1422 PALIGNR mm5, mm4, 1, mm4
1423 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1439 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1441 movq [r0+r3*1], xmm0
1443 movq [r0+r3*2], xmm0
1446 movq [r1+r3*1], xmm0
1448 movq [r1+r3*2], xmm0
1450 movq [r2+r3*1], xmm0
1452 movq [r2+r3*2], xmm0
1454 movq [r0+r3*1], xmm0
1456 movq [r0+r3*2], xmm0
1461 %define PALIGNR PALIGNR_MMX
1462 PRED8x8L_DOWN_LEFT sse2
1464 %define PALIGNR PALIGNR_SSSE3
1465 PRED8x8L_DOWN_LEFT ssse3
1467 ;-----------------------------------------------------------------------------
1468 ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
1469 ;-----------------------------------------------------------------------------
1472 %define PALIGNR PALIGNR_MMX
1473 cglobal pred8x8l_down_right_mmxext, 4,5
1476 movq mm0, [r0+r3*1-8]
1477 punpckhbw mm0, [r0+r3*0-8]
1478 movq mm1, [r4+r3*1-8]
1479 punpckhbw mm1, [r0+r3*2-8]
1483 movq mm2, [r0+r3*1-8]
1484 punpckhbw mm2, [r0+r3*0-8]
1486 movq mm3, [r0+r3*1-8]
1487 punpckhbw mm3, [r0+r3*0-8]
1491 movq mm0, [r0+r3*0-8]
1496 PALIGNR mm4, mm0, 7, mm0
1497 PALIGNR mm1, mm2, 1, mm2
1498 test r1, r1 ; top_left
1502 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1506 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1508 PALIGNR mm7, mm1, 7, mm3
1514 PALIGNR mm2, mm0, 7, mm0
1515 PALIGNR mm1, mm4, 1, mm4
1516 test r1, r1 ; top_left
1518 test r2, r2 ; top_right
1521 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1537 test r2, r2 ; top_right
1553 PALIGNR mm2, mm6, 1, mm0
1555 PALIGNR mm3, mm6, 7, mm0
1559 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1560 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1604 %macro PRED8x8L_DOWN_RIGHT 1
1605 cglobal pred8x8l_down_right_%1, 4,5
1608 movq mm0, [r0+r3*1-8]
1609 punpckhbw mm0, [r0+r3*0-8]
1610 movq mm1, [r4+r3*1-8]
1611 punpckhbw mm1, [r0+r3*2-8]
1615 movq mm2, [r0+r3*1-8]
1616 punpckhbw mm2, [r0+r3*0-8]
1618 movq mm3, [r0+r3*1-8]
1619 punpckhbw mm3, [r0+r3*0-8]
1623 movq mm0, [r0+r3*0-8]
1628 PALIGNR mm4, mm0, 7, mm0
1629 PALIGNR mm1, mm2, 1, mm2
1657 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1661 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1663 PALIGNR mm7, mm1, 7, mm3
1670 PALIGNR mm2, mm0, 7, mm0
1671 PALIGNR mm1, mm4, 1, mm4
1677 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1694 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1697 movq [r0+r3*2], xmm0
1698 movq [r0+r3*1], xmm1
1701 movq [r2+r3*2], xmm0
1702 movq [r2+r3*1], xmm1
1705 movq [r1+r3*2], xmm0
1706 movq [r1+r3*1], xmm1
1709 movq [r4+r3*2], xmm0
1710 movq [r4+r3*1], xmm1
1715 %define PALIGNR PALIGNR_MMX
1716 PRED8x8L_DOWN_RIGHT sse2
1718 %define PALIGNR PALIGNR_SSSE3
1719 PRED8x8L_DOWN_RIGHT ssse3
1721 ;-----------------------------------------------------------------------------
1722 ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1723 ;-----------------------------------------------------------------------------
1726 %define PALIGNR PALIGNR_MMX
1727 cglobal pred8x8l_vertical_right_mmxext, 4,5
1730 movq mm0, [r0+r3*1-8]
1731 punpckhbw mm0, [r0+r3*0-8]
1732 movq mm1, [r4+r3*1-8]
1733 punpckhbw mm1, [r0+r3*2-8]
1737 movq mm2, [r0+r3*1-8]
1738 punpckhbw mm2, [r0+r3*0-8]
1740 movq mm3, [r0+r3*1-8]
1741 punpckhbw mm3, [r0+r3*0-8]
1745 movq mm0, [r0+r3*0-8]
1750 PALIGNR mm4, mm0, 7, mm0
1751 PALIGNR mm1, mm2, 1, mm2
1779 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1786 PALIGNR mm2, mm0, 7, mm0
1787 PALIGNR mm1, mm4, 1, mm4
1793 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1797 PALIGNR mm3, mm7, 7, mm0
1798 PALIGNR mm6, mm7, 6, mm1
1802 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1813 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1814 PALIGNR mm6, mm0, 7, mm2
1817 PALIGNR mm5, mm0, 7, mm1
1820 PALIGNR mm6, mm0, 7, mm2
1823 PALIGNR mm5, mm0, 7, mm1
1826 PALIGNR mm6, mm0, 7, mm2
1829 PALIGNR mm5, mm0, 7, mm1
1833 %macro PRED8x8L_VERTICAL_RIGHT 1
1834 cglobal pred8x8l_vertical_right_%1, 4,5,7
1837 movq mm0, [r0+r3*1-8]
1838 punpckhbw mm0, [r0+r3*0-8]
1839 movq mm1, [r4+r3*1-8]
1840 punpckhbw mm1, [r0+r3*2-8]
1844 movq mm2, [r0+r3*1-8]
1845 punpckhbw mm2, [r0+r3*0-8]
1847 movq mm3, [r0+r3*1-8]
1848 punpckhbw mm3, [r0+r3*0-8]
1852 movq mm0, [r0+r3*0-8]
1857 PALIGNR mm4, mm0, 7, mm0
1858 PALIGNR mm1, mm2, 1, mm2
1885 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1892 PALIGNR mm2, mm0, 7, mm0
1893 PALIGNR mm1, mm4, 1, mm4
1899 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1904 movdqa xmm6, [pw_ff00]
1913 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
1919 movhps [r0+r3*2], xmm5
1920 movhps [r0+r3*1], xmm2
1928 movq [r0+r3*2], xmm5
1929 movq [r0+r3*1], xmm2
1932 movq [r2+r3*2], xmm5
1933 movq [r2+r3*1], xmm2
1936 movq [r1+r3*2], xmm5
1937 movq [r1+r3*1], xmm2
1942 %define PALIGNR PALIGNR_MMX
1943 PRED8x8L_VERTICAL_RIGHT sse2
1945 %define PALIGNR PALIGNR_SSSE3
1946 PRED8x8L_VERTICAL_RIGHT ssse3
1948 ;-----------------------------------------------------------------------------
1949 ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1950 ;-----------------------------------------------------------------------------
1952 %macro PRED8x8L_VERTICAL_LEFT 1
1953 cglobal pred8x8l_vertical_left_%1, 4,4
1960 PALIGNR mm2, mm0, 7, mm0
1961 PALIGNR mm1, mm4, 1, mm4
1984 pshufw mm1, mm3, 0xFF
1987 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1996 PALIGNR mm2, mm3, 7, mm3
1997 PALIGNR mm5, mm4, 1, mm4
1998 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2012 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2014 movq [r0+r3*1], xmm3
2015 movq [r0+r3*2], xmm0
2019 movq [r1+r3*1], xmm3
2020 movq [r1+r3*2], xmm0
2023 movq [r2+r3*1], xmm3
2024 movq [r2+r3*2], xmm0
2027 movq [r0+r3*1], xmm3
2028 movq [r0+r3*2], xmm0
2033 %define PALIGNR PALIGNR_MMX
2034 PRED8x8L_VERTICAL_LEFT sse2
2035 %define PALIGNR PALIGNR_SSSE3
2037 PRED8x8L_VERTICAL_LEFT ssse3
2039 ;-----------------------------------------------------------------------------
2040 ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2041 ;-----------------------------------------------------------------------------
2043 %macro PRED8x8L_HORIZONTAL_UP 1
2044 cglobal pred8x8l_horizontal_up_%1, 4,4
2047 movq mm0, [r0+r3*1-8]
2048 punpckhbw mm0, [r0+r3*0-8]
2049 movq mm1, [r2+r3*1-8]
2050 punpckhbw mm1, [r0+r3*2-8]
2054 movq mm2, [r0+r3*1-8]
2055 punpckhbw mm2, [r0+r3*0-8]
2057 movq mm3, [r0+r3*1-8]
2058 punpckhbw mm3, [r0+r3*0-8]
2062 movq mm0, [r0+r3*0-8]
2067 PALIGNR mm4, mm0, 7, mm0
2068 PALIGNR mm1, mm2, 1, mm2
2079 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2082 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2084 PALIGNR mm7, mm1, 7, mm3
2086 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2087 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2091 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2098 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2100 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2102 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2104 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2105 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2109 PALIGNR mm5, mm4, 2, mm1
2110 pshufw mm1, mm6, 11111001b
2111 PALIGNR mm6, mm4, 4, mm2
2112 pshufw mm2, mm7, 11111110b
2113 PALIGNR mm7, mm4, 6, mm3
2114 pshufw mm3, mm0, 11111111b
2128 %define PALIGNR PALIGNR_MMX
2129 PRED8x8L_HORIZONTAL_UP mmxext
2130 %define PALIGNR PALIGNR_SSSE3
2131 PRED8x8L_HORIZONTAL_UP ssse3
2133 ;-----------------------------------------------------------------------------
2134 ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2135 ;-----------------------------------------------------------------------------
2138 %define PALIGNR PALIGNR_MMX
2139 cglobal pred8x8l_horizontal_down_mmxext, 4,5
2142 movq mm0, [r0+r3*1-8]
2143 punpckhbw mm0, [r0+r3*0-8]
2144 movq mm1, [r4+r3*1-8]
2145 punpckhbw mm1, [r0+r3*2-8]
2149 movq mm2, [r0+r3*1-8]
2150 punpckhbw mm2, [r0+r3*0-8]
2152 movq mm3, [r0+r3*1-8]
2153 punpckhbw mm3, [r0+r3*0-8]
2157 movq mm0, [r0+r3*0-8]
2162 PALIGNR mm4, mm0, 7, mm0
2163 PALIGNR mm1, mm2, 1, mm2
2190 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2194 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2196 PALIGNR mm7, mm1, 7, mm3
2202 PALIGNR mm2, mm0, 7, mm0
2203 PALIGNR mm1, mm4, 1, mm4
2209 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2216 PALIGNR mm2, mm6, 7, mm5
2217 PALIGNR mm6, mm7, 7, mm0
2219 PALIGNR mm4, mm3, 1, mm7
2222 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2228 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2236 PALIGNR mm7, mm3, 2, mm5
2238 PALIGNR mm1, mm3, 4, mm5
2240 PALIGNR mm0, mm3, 6, mm3
2245 PALIGNR mm6, mm4, 2, mm5
2247 PALIGNR mm2, mm4, 4, mm5
2249 PALIGNR mm3, mm4, 6, mm4
2253 %macro PRED8x8L_HORIZONTAL_DOWN 1
2254 cglobal pred8x8l_horizontal_down_%1, 4,5
2257 movq mm0, [r0+r3*1-8]
2258 punpckhbw mm0, [r0+r3*0-8]
2259 movq mm1, [r4+r3*1-8]
2260 punpckhbw mm1, [r0+r3*2-8]
2264 movq mm2, [r0+r3*1-8]
2265 punpckhbw mm2, [r0+r3*0-8]
2267 movq mm3, [r0+r3*1-8]
2268 punpckhbw mm3, [r0+r3*0-8]
2272 movq mm0, [r0+r3*0-8]
2277 PALIGNR mm4, mm0, 7, mm0
2278 PALIGNR mm1, mm2, 1, mm2
2305 pshufw mm1, mm3, 0xFF
2309 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2313 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2323 PALIGNR mm2, mm0, 7, mm0
2324 PALIGNR mm1, mm4, 1, mm4
2330 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2339 PALIGNR mm2, mm3, 7, mm3
2340 PALIGNR mm5, mm4, 1, mm4
2341 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2350 PALIGNR xmm1, xmm0, 7, xmm4
2351 PALIGNR xmm2, xmm0, 9, xmm5
2353 PALIGNR xmm3, xmm0, 8, xmm0
2357 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2358 punpcklbw xmm4, xmm0
2360 movq [r0+r3*2], xmm4
2361 movq [r2+r3*2], xmm0
2364 movq [r0+r3*1], xmm4
2365 movq [r2+r3*1], xmm0
2368 movq [r1+r3*2], xmm4
2369 movq [r4+r3*2], xmm0
2372 movq [r1+r3*1], xmm4
2373 movq [r4+r3*1], xmm0
2378 %define PALIGNR PALIGNR_MMX
2379 PRED8x8L_HORIZONTAL_DOWN sse2
2381 %define PALIGNR PALIGNR_SSSE3
2382 PRED8x8L_HORIZONTAL_DOWN ssse3
2385 ;-----------------------------------------------------------------------------
2386 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2387 ;-----------------------------------------------------------------------------
2389 cglobal pred4x4_dc_mmxext, 3,5
2395 movzx r1d, byte [r0+r2*1-1]
2398 movzx r1d, byte [r0+r2*2-1]
2401 movzx r1d, byte [r0+r2*1-1]
2403 movzx r1d, byte [r0+r2*2-1]
2407 imul r3d, 0x01010101
2414 ;-----------------------------------------------------------------------------
2415 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2416 ;-----------------------------------------------------------------------------
2418 %macro PRED4x4_TM_MMX 1
2419 cglobal pred4x4_tm_vp8_%1, 3,6
2424 movzx r4d, byte [r0-1]
2427 movzx r1d, byte [r0+r2*1-1]
2428 movzx r3d, byte [r0+r2*2-1]
2455 PRED4x4_TM_MMX mmxext
2457 cglobal pred4x4_tm_vp8_ssse3, 3,3
2466 movd mm2, [r0+r2*1-4]
2467 movd mm3, [r0+r2*2-4]
2468 movd mm4, [r1+r2*1-4]
2469 movd mm5, [r1+r2*2-4]
2492 ;-----------------------------------------------------------------------------
2493 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2494 ;-----------------------------------------------------------------------------
2497 cglobal pred4x4_vertical_vp8_mmxext, 3,3
2501 mova m2, m0 ;t0 t1 t2 t3
2502 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2504 psrlq m0, 8 ;t1 t2 t3 t4
2505 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2512 ;-----------------------------------------------------------------------------
2513 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2514 ;-----------------------------------------------------------------------------
2517 cglobal pred4x4_down_left_mmxext, 3,3
2528 PRED4x4_LOWPASS m0, m1, m3, m4, m5
2540 ;-----------------------------------------------------------------------------
2541 ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2542 ;-----------------------------------------------------------------------------
2545 cglobal pred4x4_vertical_left_mmxext, 3,3
2555 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2565 ;-----------------------------------------------------------------------------
2566 ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2567 ;-----------------------------------------------------------------------------
2570 cglobal pred4x4_horizontal_up_mmxext, 3,3
2573 movq m0, [r0+r2*1-8]
2574 punpckhbw m0, [r0+r2*2-8]
2575 movq m1, [r1+r2*1-8]
2576 punpckhbw m1, [r1+r2*2-8]
2588 PRED4x4_LOWPASS m4, m0, m2, m3, m5