1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Fiona Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Constant tables shared by the prediction routines in this file.
; tm_shuf: the byte pair (0x03, 0x80) repeated 8 times; loaded into xmm4 by
; the pred8x8_tm_vp8_8 variant declared with 2,3,6.
; NOTE(review): looks like a pshufb control (0x80 zeroes a byte) -- confirm.
29 tm_shuf: times 8 db 0x03, 0x80
; pw_ff00: eight words of 0xff00; loaded into xmm6 in PRED8x8L_VERTICAL_RIGHT.
30 pw_ff00: times 8 dw 0xff00
; plane_shuf: signed byte weights -8..-1 then 1..8; the pmaddubsw operand that
; yields the "H coefficients" in H264_PRED16x16_PLANE.
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
; plane8_shuf: weights -4..-1 and 1..4, each group followed by four zero
; bytes; the pmaddubsw operand in H264_PRED8x8_PLANE.
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
; pw_0to7: word ramp 0..7; pmullw by it gives 0*H, 1*H, ..., 7*H in both
; plane macros.
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
; pw_1to8 / pw_m8tom1: word weights multiplied in with pmullw in
; H264_PRED16x16_PLANE; a "+8" byte offset addresses the upper four words
; (5..8 and -4..-1 respectively).
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
; pw_m4to4: word weights multiplied in with pmullw in H264_PRED8x8_PLANE;
; "+8" addresses the upper four words (1..4).
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
47 ;-----------------------------------------------------------------------------
48 ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. Each definition loads 2 arguments and requests
; 3 general-purpose registers (x86inc cglobal convention).
; NOTE(review): two definitions share this name; presumably each one is
; assembled under a different INIT_* cpu selection -- confirm.
49 ;-----------------------------------------------------------------------------
52 cglobal pred16x16_vertical_8, 2,3
68 cglobal pred16x16_vertical_8, 2,3
; The same 16 bytes held in xmm0 are written to the rows at r0+r1 and
; r0+2*r1. movaps requires the destination to be 16-byte aligned.
73 movaps [r0+r1*1], xmm0
74 movaps [r0+r1*2], xmm0
76 movaps [r0+r1*1], xmm0
77 movaps [r0+r1*2], xmm0
83 ;-----------------------------------------------------------------------------
84 ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded and 3 general-purpose
; registers are requested (x86inc cglobal convention).
85 ;-----------------------------------------------------------------------------
88 cglobal pred16x16_horizontal_8, 2,3
124 ;-----------------------------------------------------------------------------
125 ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. The macro takes no parameters; the function loads
; 2 arguments and requests 7 general-purpose registers.
126 ;-----------------------------------------------------------------------------
128 %macro PRED16x16_DC 0
129 cglobal pred16x16_dc_8, 2,7
; Single bytes at r0 and r0+r1 are fetched zero-extended into scalar
; registers (r2d, r3d, r5d).
137 movzx r5d, byte [r0+r1*1]
142 movzx r2d, byte [r0+r1*0]
143 movzx r3d, byte [r0+r1*1]
148 movzx r2d, byte [r0+r1*0]
; SPLATB_REG comes from the included x86util.asm.
; NOTE(review): presumably broadcasts the low byte of r2 to every byte of m0
; with m1 as helper register -- confirm against x86util.asm.
155 SPLATB_REG m0, r2, m1
186 ;-----------------------------------------------------------------------------
187 ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. Three definitions follow: one inside the
; PRED16x16_TM macro (2 args, 5 GPRs), one using xmm registers directly
; (2 args, 6 GPRs, 6 vector registers) and one guarded by HAVE_AVX2_EXTERNAL.
188 ;-----------------------------------------------------------------------------
190 %macro PRED16x16_TM 0
191 cglobal pred16x16_tm_vp8_8, 2,5
; Scalar fetches of the byte at offset -1 from r0 and from r0+r1.
202 movzx r3d, byte [r0-1]
205 movzx r2d, byte [r0+r1-1]
232 cglobal pred16x16_tm_vp8_8, 2,6,6
239 movzx r4d, byte [r0-1]
242 movzx r2d, byte [r0+r1*1-1]
243 movzx r3d, byte [r0+r1*2-1]
; pshuflw with selector 0 copies word 0 into the four low words; punpcklqdq
; with itself then duplicates the low quadword, so word 0 fills all 8 words.
248 pshuflw xmm2, xmm2, 0
249 pshuflw xmm4, xmm4, 0
250 punpcklqdq xmm2, xmm2
251 punpcklqdq xmm4, xmm4
; Aligned 16-byte stores to the rows at r0+r1 and r0+2*r1.
260 movdqa [r0+r1*1], xmm2
261 movdqa [r0+r1*2], xmm4
267 %if HAVE_AVX2_EXTERNAL
; Named arguments: dst (r0), stride (r1), plus the extra registers stride3
; and iteration; 5 vector registers are requested.
269 cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
272 vpbroadcastb xm1, [r0-1]
276 lea stride3q, [strideq*3]
; Broadcast the byte at offset -1 of each of the next four rows.
278 vpbroadcastb xm1, [dstq+strideq*1-1]
279 vpbroadcastb xm2, [dstq+strideq*2-1]
280 vpbroadcastb xm3, [dstq+stride3q-1]
281 vpbroadcastb xm4, [dstq+strideq*4-1]
; Four rows are written per step: the low and high 128-bit lanes of m1 go to
; rows 1 and 2, those of m3 to rows 3 and 4; dst then advances by 4 rows.
294 movdqa [dstq+strideq*1], xm1
295 vextracti128 [dstq+strideq*2], m1, 1
296 movdqa [dstq+stride3q*1], xm3
297 vextracti128 [dstq+strideq*4], m3, 1
298 lea dstq, [dstq+strideq*4]
304 ;-----------------------------------------------------------------------------
305 ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. The macro's single parameter %1 becomes part of the
; function name; it is expanded below with h264, rv40 and svq3.
; Each function loads 2 arguments and requests 9 general-purpose and
; 7 vector registers.
306 ;-----------------------------------------------------------------------------
308 %macro H264_PRED16x16_PLANE 1
309 cglobal pred16x16_plane_%1_8, 2,9,7
; H sums: weighted with pmullw against the word tables (the "+8" forms address
; the upper four words of a table) or with pmaddubsw against plane_shuf.
323 pmullw m0, [pw_m8tom1 ]
324 pmullw m1, [pw_m8tom1+8]
325 pmullw m2, [pw_1to8 ]
326 pmullw m3, [pw_1to8 +8]
331 movhps m0, [r0+r1 +8]
332 pmaddubsw m0, [plane_shuf] ; H coefficients
338 pmullw m0, [pw_m8tom1]
358 paddw m0, m1 ; sum of H coefficients
; V sums: built in scalar registers from zero-extended single-byte loads.
370 movzx e_reg, byte [r3+r2*2 ]
371 movzx r5, byte [r4+r1 ]
374 movzx e_reg, byte [r3+r2 ]
379 movzx e_reg, byte [r3+r1 ]
380 movzx r6, byte [r4+r2*2 ]
384 movzx e_reg, byte [r3 ]
386 movzx r7, byte [r4+r2 ]
389 movzx r6, byte [r4+r2 ]
398 movzx r4, byte [e_reg+r2 ]
410 movzx r4, byte [e_reg ]
412 movzx r7, byte [r3 +r2 ]
416 movzx r6, byte [r3 +r2 ]
422 movzx r4, byte [e_reg+r1 ]
423 movzx r6, byte [r3 +r2*2]
430 movzx r4, byte [e_reg+r2*2]
431 movzx r6, byte [r3 +r1 ]
434 add r5, r6 ; sum of V coefficients
; lea reg, [reg*5] multiplies by 5 without touching the flags.
451 lea r5, [r5*5] ; 5*(V/4)
455 sar r5, 4 ; (5*(V/4))/16
458 movzx r4, byte [r0+r1 +15]
459 movzx r3, byte [r3+r2*2 ]
477 lea r1d, [r1d*5] ; 5*(H/4)
481 sar r1d, 4 ; (5*(H/4))/16
; Per-column terms a + k*H for k = 0..15, built from the pw_0to7 ramp.
502 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
511 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
512 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
514 paddw m5, m0 ; a + {8,9,10,11}*H
515 paddw m6, m0 ; a + {12,13,14,15}*H
520 mova m3, m0 ; b[0..7]
521 mova m4, m2 ; b[8..15]
527 mova m3, m5 ; b[8..11]
528 mova m4, m6 ; b[12..15]
541 mova m3, m0 ; b[0..7]
542 mova m4, m2 ; b[8..15]
548 mova m3, m5 ; b[8..11]
549 mova m4, m6 ; b[12..15]
; The macro is expanded four times for each of h264, rv40 and svq3.
; NOTE(review): presumably once per INIT_* cpu selection -- confirm.
569 H264_PRED16x16_PLANE h264
570 H264_PRED16x16_PLANE rv40
571 H264_PRED16x16_PLANE svq3
573 H264_PRED16x16_PLANE h264
574 H264_PRED16x16_PLANE rv40
575 H264_PRED16x16_PLANE svq3
577 H264_PRED16x16_PLANE h264
578 H264_PRED16x16_PLANE rv40
579 H264_PRED16x16_PLANE svq3
581 H264_PRED16x16_PLANE h264
582 H264_PRED16x16_PLANE rv40
583 H264_PRED16x16_PLANE svq3
585 ;-----------------------------------------------------------------------------
586 ; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. The macro takes no parameters; the function loads
; 2 arguments and requests 9 general-purpose and 7 vector registers.
587 ;-----------------------------------------------------------------------------
589 %macro H264_PRED8x8_PLANE 0
590 cglobal pred8x8_plane_8, 2,9,7
; H sums: pmullw against pw_m4to4 ("+8" addresses its upper four words) or
; pmaddubsw against plane8_shuf.
600 pmullw m0, [pw_m4to4]
601 pmullw m1, [pw_m4to4+8]
604 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
605 pmaddubsw m0, [plane8_shuf] ; H coefficients
611 pmullw m0, [pw_m4to4]
; Code assembled only when the selected cpu lacks SSSE3.
617 %if notcpuflag(ssse3)
633 paddw m0, m1 ; sum of H coefficients
; V sums: built in scalar registers from zero-extended single-byte loads.
645 movzx e_reg, byte [r3+r2*2 ]
646 movzx r5, byte [r4+r1 ]
649 movzx e_reg, byte [r3 ]
651 movzx r7, byte [r4+r2 ]
655 movzx r6, byte [r4+r2 ]
661 movzx e_reg, byte [r3+r1 ]
662 movzx r6, byte [r4+r2*2 ]
669 movzx e_reg, byte [r3+r2 ]
682 movzx r3, byte [r4+r2*2 ]
683 movzx r4, byte [r0+r1 +7]
; Per-column terms a + k*H for k = 0..7, built from the pw_0to7 ramp.
705 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
706 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
709 paddw m2, m0 ; a + {4,5,6,7}*H
716 mova m3, m0 ; b[0..7]
719 mova m4, m0 ; V+b[0..7]
726 mova m3, m0 ; b[0..3]
727 mova m4, m2 ; b[4..7]
732 mova m5, m0 ; V+b[0..3]
733 mova m6, m2 ; V+b[4..7]
759 ;-----------------------------------------------------------------------------
760 ; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded and no further
; general-purpose registers are requested (x86inc cglobal convention).
761 ;-----------------------------------------------------------------------------
764 cglobal pred8x8_vertical_8, 2,2
776 ;-----------------------------------------------------------------------------
777 ; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded, 3 general-purpose registers
; are requested.
778 ;-----------------------------------------------------------------------------
781 cglobal pred8x8_horizontal_8, 2,3
; SPLATB_LOAD comes from the included x86util.asm; its source operands here
; are the bytes at offset -1 from r0 and from r0+r1.
; NOTE(review): presumably broadcasts that byte across m0 / m1 using m2 as
; helper register -- confirm against x86util.asm.
787 SPLATB_LOAD m0, r0+r1*0-1, m2
788 SPLATB_LOAD m1, r0+r1*1-1, m2
804 ;-----------------------------------------------------------------------------
805 ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded, 5 general-purpose registers
; are requested.
806 ;-----------------------------------------------------------------------------
808 cglobal pred8x8_top_dc_8, 2,5
; pshufw with selector 0 copies word 0 into all four words; packuswb then
; narrows the words of mm0 (low half) and mm1 (high half) to bytes with
; unsigned saturation.
825 pshufw mm0, mm0, 0 ; dc0 (w)
826 packuswb mm0, mm1 ; dc0,dc1 (b)
838 ;-----------------------------------------------------------------------------
839 ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded, 5 general-purpose registers
; are requested.
840 ;-----------------------------------------------------------------------------
843 cglobal pred8x8_dc_8, 2,5
; Scalar fetches of the byte at offset -1 from r0+r1 and r0+2*r1; the same
; address pattern repeats for successive row pairs.
852 movzx r2d, byte [r0+r1*1-1]
853 movzx r3d, byte [r0+r1*2-1]
856 movzx r3d, byte [r0+r1*1-1]
858 movzx r3d, byte [r0+r1*2-1]
862 movzx r2d, byte [r0+r1*1-1]
863 movzx r3d, byte [r0+r1*2-1]
866 movzx r3d, byte [r0+r1*1-1]
868 movzx r3d, byte [r0+r1*2-1]
; The lane lists in the comments below run from the lowest word to the
; highest: selector 11110110b picks words 2,1,3,3 and 01110100b picks 0,1,3,1.
875 punpckldq m0, m2 ; s0, s1, s2, s3
876 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
878 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
882 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
899 ;-----------------------------------------------------------------------------
900 ; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride; 2 arguments are loaded, 7 general-purpose registers
; are requested.
901 ;-----------------------------------------------------------------------------
904 cglobal pred8x8_dc_rv40_8, 2,7
; Single bytes at r0 and r0+r1 are fetched zero-extended into scalar
; registers (r2d, r3d, r5d), the same access pattern as in PRED16x16_DC.
910 movzx r5d, byte [r0+r1*1]
914 movzx r2d, byte [r0+r1*0]
915 movzx r3d, byte [r0+r1*1]
920 movzx r2d, byte [r0+r1*0]
936 ;-----------------------------------------------------------------------------
937 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
; r0 = src, r1 = stride. Three definitions follow, declared with 2,6 then
; 2,6,4 (4 vector registers) and 2,3,6 (6 vector registers).
; NOTE(review): presumably one per INIT_* cpu selection -- confirm.
938 ;-----------------------------------------------------------------------------
941 cglobal pred8x8_tm_vp8_8, 2,6
; Scalar fetches of the byte at offset -1 from r0, r0+r1 and r0+2*r1.
948 movzx r4d, byte [r0-1]
951 movzx r2d, byte [r0+r1*1-1]
952 movzx r3d, byte [r0+r1*2-1]
981 cglobal pred8x8_tm_vp8_8, 2,6,4
986 movzx r4d, byte [r0-1]
989 movzx r2d, byte [r0+r1*1-1]
990 movzx r3d, byte [r0+r1*2-1]
; pshuflw with selector 0 plus punpcklqdq with itself fills all 8 words of
; the register with its word 0.
995 pshuflw xmm2, xmm2, 0
996 pshuflw xmm3, xmm3, 0
997 punpcklqdq xmm2, xmm2
998 punpcklqdq xmm3, xmm3
; The low 8 bytes of xmm2 go to the row at r0+r1, the high 8 bytes to the
; row at r0+2*r1.
1002 movq [r0+r1*1], xmm2
1003 movhps [r0+r1*2], xmm2
1010 cglobal pred8x8_tm_vp8_8, 2,3,6
1012 movdqa xmm4, [tm_shuf]
1015 punpcklbw xmm0, xmm1
; 4-byte loads that end at offset -1 of each row, so the byte at offset -1 is
; byte 3 of the loaded dword (the index tm_shuf names).
1020 movd xmm2, [r0+r1*1-4]
1021 movd xmm3, [r0+r1*2-4]
1029 movq [r0+r1*1], xmm2
1030 movhps [r0+r1*2], xmm2
; 3-tap (1, 2, 1) smoothing filter with rounding, applied per element.
; Parameters, in order:
1036 ; dest, left, right, src, tmp
1037 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; NOTE(review): presumably left = t[n-1], right = t[n+1], src = t[n] and tmp
; is overwritten -- confirm against the macro body.
1038 %macro PRED4x4_LOWPASS 5
1048 ;-----------------------------------------------------------------------------
1049 ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
; ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride; 4 arguments
; are loaded and no further general-purpose registers are requested.
1051 ;-----------------------------------------------------------------------------
1052 %macro PRED8x8L_TOP_DC 0
1053 cglobal pred8x8l_top_dc_8, 4,4
; PALIGNR is the x86util.asm wrapper; its 4th operand is a helper register.
1061 PALIGNR mm2, mm0, 7, mm0
1062 PALIGNR mm1, mm4, 1, mm4
; The availability flags are tested for zero; the branches they select are
; not documented here.
1063 test r1d, r1d ; top_left
1065 test r2d, r2d ; top_right
1074 test r2d, r2d ; top_right
; (1,2,1) low-pass of mm3 with neighbours mm2 / mm1; the result lands in mm0.
1083 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1104 ;-----------------------------------------------------------------------------
1105 ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
; ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride; 4 arguments
; are loaded, 5 general-purpose registers are requested (r4 is used as an
; extra row pointer).
1107 ;-----------------------------------------------------------------------------
1109 %macro PRED8x8L_DC 0
1110 cglobal pred8x8l_dc_8, 4,5
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
1113 movq mm0, [r0+r3*1-8]
1114 punpckhbw mm0, [r0+r3*0-8]
1115 movq mm1, [r4+r3*1-8]
1116 punpckhbw mm1, [r0+r3*2-8]
1120 movq mm2, [r0+r3*1-8]
1121 punpckhbw mm2, [r0+r3*0-8]
1123 movq mm3, [r0+r3*1-8]
1124 punpckhbw mm3, [r0+r3*0-8]
1128 movq mm0, [r0+r3*0-8]
; PALIGNR is the x86util.asm wrapper; its 4th operand is a helper register.
1133 PALIGNR mm4, mm0, 7, mm0
1134 PALIGNR mm1, mm2, 1, mm2
; Three (1,2,1) low-pass passes; the first operand receives the result.
1161 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1164 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1166 PALIGNR mm7, mm1, 7, mm3
1172 PALIGNR mm2, mm0, 7, mm0
1173 PALIGNR mm1, mm4, 1, mm4
1180 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1208 ;-----------------------------------------------------------------------------
1209 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1210 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry;
; 4 arguments are loaded and no further general-purpose registers are
; requested. r1 and r2 also appear as row base pointers in the loads below.
1211 ;-----------------------------------------------------------------------------
1213 %macro PRED8x8L_HORIZONTAL 0
1214 cglobal pred8x8l_horizontal_8, 4,4
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
1217 movq mm0, [r0+r3*1-8]
1221 punpckhbw mm0, [r1+r3*0-8]
1222 movq mm1, [r2+r3*1-8]
1223 punpckhbw mm1, [r0+r3*2-8]
1227 movq mm2, [r0+r3*1-8]
1228 punpckhbw mm2, [r0+r3*0-8]
1230 movq mm3, [r0+r3*1-8]
1231 punpckhbw mm3, [r0+r3*0-8]
1235 movq mm0, [r0+r3*0-8]
1236 movq mm1, [r1+r3*0-8]
1240 PALIGNR mm4, mm0, 7, mm0
1241 PALIGNR mm1, mm2, 1, mm2
; Two (1,2,1) low-pass passes; the first operand receives the result.
1243 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1246 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1248 PALIGNR mm7, mm1, 7, mm3
; pshufw selectors 0xff / 0xaa / 0x55 / 0x00 copy word 3 / 2 / 1 / 0 of the
; source into all four words, so mm0..mm7 each hold one word of mm3 or mm7
; replicated. mm3 and mm7 are overwritten last because they are the sources.
1254 pshufw mm0, mm3, 0xff
1255 pshufw mm1, mm3, 0xaa
1257 pshufw mm2, mm3, 0x55
1258 pshufw mm3, mm3, 0x00
1259 pshufw mm4, mm7, 0xff
1260 pshufw mm5, mm7, 0xaa
1261 pshufw mm6, mm7, 0x55
1262 pshufw mm7, mm7, 0x00
1280 ;-----------------------------------------------------------------------------
1281 ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
; ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride; 4 arguments
; are loaded and no further general-purpose registers are requested.
1283 ;-----------------------------------------------------------------------------
1285 %macro PRED8x8L_VERTICAL 0
1286 cglobal pred8x8l_vertical_8, 4,4
; PALIGNR is the x86util.asm wrapper; its 4th operand is a helper register.
1293 PALIGNR mm2, mm0, 7, mm0
1294 PALIGNR mm1, mm4, 1, mm4
; The availability flags are tested for zero; the branches they select are
; not documented here.
1295 test r1d, r1d ; top_left
1297 test r2d, r2d ; top_right
1306 test r2d, r2d ; top_right
; (1,2,1) low-pass of mm3 with neighbours mm2 / mm1; the result lands in mm0.
1315 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1331 ;-----------------------------------------------------------------------------
1332 ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1333 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry.
; Two definitions follow: a stand-alone one (4 args, 5 GPRs) and one inside
; the PRED8x8L_DOWN_LEFT macro (4 args, 4 GPRs) whose final stage works on
; xmm registers.
1334 ;-----------------------------------------------------------------------------
1337 cglobal pred8x8l_down_left_8, 4,5
1344 PALIGNR mm2, mm0, 7, mm0
1345 PALIGNR mm1, mm4, 1, mm4
; Selector 0xFF copies word 3 of mm3 into all four words of mm1.
1368 pshufw mm1, mm3, 0xFF
1371 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1380 PALIGNR mm2, mm3, 7, mm3
1381 PALIGNR mm5, mm4, 1, mm4
1382 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1390 PALIGNR mm2, mm7, 1, mm0
1392 PALIGNR mm3, mm7, 7, mm0
1393 PALIGNR mm4, mm6, 1, mm0
1399 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1400 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1444 %macro PRED8x8L_DOWN_LEFT 0
1445 cglobal pred8x8l_down_left_8, 4,4
1452 PALIGNR mm2, mm0, 7, mm0
1453 PALIGNR mm1, mm4, 1, mm4
1454 test r1d, r1d ; top_left
1456 test r2d, r2d ; top_right
1465 test r2d, r2d ; top_right
1476 pshufw mm1, mm3, 0xFF
1479 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1481 test r2d, r2d ; top_right
1488 PALIGNR mm2, mm3, 7, mm3
1489 PALIGNR mm5, mm4, 1, mm4
1490 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final (1,2,1) pass on 16-byte registers; the low 8 bytes of xmm0 are then
; stored to eight row addresses (bases r0, r1, r2 with offsets r3 and 2*r3).
1506 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1508 movq [r0+r3*1], xmm0
1510 movq [r0+r3*2], xmm0
1513 movq [r1+r3*1], xmm0
1515 movq [r1+r3*2], xmm0
1517 movq [r2+r3*1], xmm0
1519 movq [r2+r3*2], xmm0
1521 movq [r0+r3*1], xmm0
1523 movq [r0+r3*2], xmm0
1532 ;-----------------------------------------------------------------------------
1533 ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1534 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry.
; Two definitions follow (both 4 args, 5 GPRs): a stand-alone one and one
; inside the PRED8x8L_DOWN_RIGHT macro whose final stage works on xmm
; registers.
1535 ;-----------------------------------------------------------------------------
1538 cglobal pred8x8l_down_right_8, 4,5
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
1541 movq mm0, [r0+r3*1-8]
1542 punpckhbw mm0, [r0+r3*0-8]
1543 movq mm1, [r4+r3*1-8]
1544 punpckhbw mm1, [r0+r3*2-8]
1548 movq mm2, [r0+r3*1-8]
1549 punpckhbw mm2, [r0+r3*0-8]
1551 movq mm3, [r0+r3*1-8]
1552 punpckhbw mm3, [r0+r3*0-8]
1556 movq mm0, [r0+r3*0-8]
1561 PALIGNR mm4, mm0, 7, mm0
1562 PALIGNR mm1, mm2, 1, mm2
1563 test r1d, r1d ; top_left
1567 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1571 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1573 PALIGNR mm7, mm1, 7, mm3
1579 PALIGNR mm2, mm0, 7, mm0
1580 PALIGNR mm1, mm4, 1, mm4
1581 test r1d, r1d ; top_left
1583 test r2d, r2d ; top_right
1586 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1602 test r2d, r2d ; top_right
1618 PALIGNR mm2, mm6, 1, mm0
1620 PALIGNR mm3, mm6, 7, mm0
1624 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1625 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1669 %macro PRED8x8L_DOWN_RIGHT 0
1670 cglobal pred8x8l_down_right_8, 4,5
; Same loading pattern as in the stand-alone definition above.
1673 movq mm0, [r0+r3*1-8]
1674 punpckhbw mm0, [r0+r3*0-8]
1675 movq mm1, [r4+r3*1-8]
1676 punpckhbw mm1, [r0+r3*2-8]
1680 movq mm2, [r0+r3*1-8]
1681 punpckhbw mm2, [r0+r3*0-8]
1683 movq mm3, [r0+r3*1-8]
1684 punpckhbw mm3, [r0+r3*0-8]
1688 movq mm0, [r0+r3*0-8]
1693 PALIGNR mm4, mm0, 7, mm0
1694 PALIGNR mm1, mm2, 1, mm2
1722 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1726 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1728 PALIGNR mm7, mm1, 7, mm3
1735 PALIGNR mm2, mm0, 7, mm0
1736 PALIGNR mm1, mm4, 1, mm4
1742 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
; Final (1,2,1) pass on 16-byte registers; rows are then written in pairs,
; xmm0 to base+2*r3 and xmm1 to base+r3, for bases r0, r2, r1 and r4.
1759 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1762 movq [r0+r3*2], xmm0
1763 movq [r0+r3*1], xmm1
1766 movq [r2+r3*2], xmm0
1767 movq [r2+r3*1], xmm1
1770 movq [r1+r3*2], xmm0
1771 movq [r1+r3*1], xmm1
1774 movq [r4+r3*2], xmm0
1775 movq [r4+r3*1], xmm1
1784 ;-----------------------------------------------------------------------------
1785 ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1786 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry.
; Two definitions follow: a stand-alone one (4 args, 5 GPRs) and one inside
; the PRED8x8L_VERTICAL_RIGHT macro (4 args, 5 GPRs, 7 vector registers).
1787 ;-----------------------------------------------------------------------------
1790 cglobal pred8x8l_vertical_right_8, 4,5
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
1793 movq mm0, [r0+r3*1-8]
1794 punpckhbw mm0, [r0+r3*0-8]
1795 movq mm1, [r4+r3*1-8]
1796 punpckhbw mm1, [r0+r3*2-8]
1800 movq mm2, [r0+r3*1-8]
1801 punpckhbw mm2, [r0+r3*0-8]
1803 movq mm3, [r0+r3*1-8]
1804 punpckhbw mm3, [r0+r3*0-8]
1808 movq mm0, [r0+r3*0-8]
1813 PALIGNR mm4, mm0, 7, mm0
1814 PALIGNR mm1, mm2, 1, mm2
1842 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1849 PALIGNR mm2, mm0, 7, mm0
1850 PALIGNR mm1, mm4, 1, mm4
1856 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1860 PALIGNR mm3, mm7, 7, mm0
1861 PALIGNR mm6, mm7, 6, mm1
1865 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1876 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
; The same PALIGNR-by-7 step is applied alternately to mm6 and mm5, three
; times each, always with mm0 as the second operand.
1877 PALIGNR mm6, mm0, 7, mm2
1880 PALIGNR mm5, mm0, 7, mm1
1883 PALIGNR mm6, mm0, 7, mm2
1886 PALIGNR mm5, mm0, 7, mm1
1889 PALIGNR mm6, mm0, 7, mm2
1892 PALIGNR mm5, mm0, 7, mm1
1896 %macro PRED8x8L_VERTICAL_RIGHT 0
1897 cglobal pred8x8l_vertical_right_8, 4,5,7
1898 ; manually spill XMM registers for Win64 because
1899 ; the code here is initialized with INIT_MMX
1903 movq mm0, [r0+r3*1-8]
1904 punpckhbw mm0, [r0+r3*0-8]
1905 movq mm1, [r4+r3*1-8]
1906 punpckhbw mm1, [r0+r3*2-8]
1910 movq mm2, [r0+r3*1-8]
1911 punpckhbw mm2, [r0+r3*0-8]
1913 movq mm3, [r0+r3*1-8]
1914 punpckhbw mm3, [r0+r3*0-8]
1918 movq mm0, [r0+r3*0-8]
1923 PALIGNR mm4, mm0, 7, mm0
1924 PALIGNR mm1, mm2, 1, mm2
1951 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1958 PALIGNR mm2, mm0, 7, mm0
1959 PALIGNR mm1, mm4, 1, mm4
1965 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; xmm6 receives the pw_ff00 constant (0xff00 in every word).
1970 movdqa xmm6, [pw_ff00]
1979 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
; Rows are written in pairs from xmm5 (base+2*r3) and xmm2 (base+r3): first
; their high 8 bytes (movhps), then their low 8 bytes (movq).
1985 movhps [r0+r3*2], xmm5
1986 movhps [r0+r3*1], xmm2
1994 movq [r0+r3*2], xmm5
1995 movq [r0+r3*1], xmm2
1998 movq [r2+r3*2], xmm5
1999 movq [r2+r3*1], xmm2
2002 movq [r1+r3*2], xmm5
2003 movq [r1+r3*1], xmm2
; The macro is expanded twice.
; NOTE(review): presumably under two different INIT_* selections -- confirm.
2008 PRED8x8L_VERTICAL_RIGHT
2010 PRED8x8L_VERTICAL_RIGHT
2012 ;-----------------------------------------------------------------------------
2013 ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
2014 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry;
; 4 arguments are loaded and no further general-purpose registers are
; requested. r1 and r2 appear as row base pointers in the stores below.
2015 ;-----------------------------------------------------------------------------
2017 %macro PRED8x8L_VERTICAL_LEFT 0
2018 cglobal pred8x8l_vertical_left_8, 4,4
2025 PALIGNR mm2, mm0, 7, mm0
2026 PALIGNR mm1, mm4, 1, mm4
; Selector 0xFF copies word 3 of mm3 into all four words of mm1.
2049 pshufw mm1, mm3, 0xFF
2052 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2061 PALIGNR mm2, mm3, 7, mm3
2062 PALIGNR mm5, mm4, 1, mm4
2063 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Final (1,2,1) pass on 16-byte registers; rows are then written in pairs,
; xmm3 to base+r3 and xmm0 to base+2*r3.
2077 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2079 movq [r0+r3*1], xmm3
2080 movq [r0+r3*2], xmm0
2084 movq [r1+r3*1], xmm3
2085 movq [r1+r3*2], xmm0
2088 movq [r2+r3*1], xmm3
2089 movq [r2+r3*2], xmm0
2092 movq [r0+r3*1], xmm3
2093 movq [r0+r3*2], xmm0
; The macro is expanded twice.
; NOTE(review): presumably under two different INIT_* selections -- confirm.
2098 PRED8x8L_VERTICAL_LEFT
2100 PRED8x8L_VERTICAL_LEFT
2102 ;-----------------------------------------------------------------------------
2103 ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2104 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry;
; 4 arguments are loaded and no further general-purpose registers are
; requested. r1 and r2 also appear as row base pointers in the loads below.
2105 ;-----------------------------------------------------------------------------
2107 %macro PRED8x8L_HORIZONTAL_UP 0
2108 cglobal pred8x8l_horizontal_up_8, 4,4
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
2111 movq mm0, [r0+r3*1-8]
2115 punpckhbw mm0, [r1+r3*0-8]
2116 movq mm1, [r2+r3*1-8]
2117 punpckhbw mm1, [r0+r3*2-8]
2121 movq mm2, [r0+r3*1-8]
2122 punpckhbw mm2, [r0+r3*0-8]
2124 movq mm3, [r0+r3*1-8]
2125 punpckhbw mm3, [r0+r3*0-8]
2129 movq mm0, [r0+r3*0-8]
2130 movq mm1, [r1+r3*0-8]
2134 PALIGNR mm4, mm0, 7, mm0
2135 PALIGNR mm1, mm2, 1, mm2
2137 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2140 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2142 PALIGNR mm7, mm1, 7, mm3
; Selector 00011011b reverses the order of the four words of mm7.
2144 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2145 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2149 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2156 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2158 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2160 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2162 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2163 punpckhbw mm5, mm1 ; p8 p7 p6 p5
; PALIGNR by 2, 4 and 6 bytes against mm4, interleaved with pshufw steps.
2167 PALIGNR mm5, mm4, 2, mm1
2168 pshufw mm1, mm6, 11111001b
2169 PALIGNR mm6, mm4, 4, mm2
2170 pshufw mm2, mm7, 11111110b
2171 PALIGNR mm7, mm4, 6, mm3
2172 pshufw mm3, mm0, 11111111b
; The macro is expanded twice.
; NOTE(review): presumably under two different INIT_* selections -- confirm.
2186 PRED8x8L_HORIZONTAL_UP
2188 PRED8x8L_HORIZONTAL_UP
2190 ;-----------------------------------------------------------------------------
2191 ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2192 ; int has_topright, ptrdiff_t stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride on entry.
; Two definitions follow (both 4 args, 5 GPRs): a stand-alone one and one
; inside the PRED8x8L_HORIZONTAL_DOWN macro whose final stage works on xmm
; registers.
2193 ;-----------------------------------------------------------------------------
2196 cglobal pred8x8l_horizontal_down_8, 4,5
; Each movq reads the 8 bytes that end at offset -1 of a row; punpckhbw
; interleaves the upper 4 bytes of two such rows.
2199 movq mm0, [r0+r3*1-8]
2200 punpckhbw mm0, [r0+r3*0-8]
2201 movq mm1, [r4+r3*1-8]
2202 punpckhbw mm1, [r0+r3*2-8]
2206 movq mm2, [r0+r3*1-8]
2207 punpckhbw mm2, [r0+r3*0-8]
2209 movq mm3, [r0+r3*1-8]
2210 punpckhbw mm3, [r0+r3*0-8]
2214 movq mm0, [r0+r3*0-8]
2219 PALIGNR mm4, mm0, 7, mm0
2220 PALIGNR mm1, mm2, 1, mm2
2247 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2251 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2253 PALIGNR mm7, mm1, 7, mm3
2259 PALIGNR mm2, mm0, 7, mm0
2260 PALIGNR mm1, mm4, 1, mm4
2266 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2273 PALIGNR mm2, mm6, 7, mm5
2274 PALIGNR mm6, mm7, 7, mm0
2276 PALIGNR mm4, mm3, 1, mm7
2279 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2285 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; PALIGNR by 2, 4 and 6 bytes, first against mm3 and then against mm4.
2293 PALIGNR mm7, mm3, 2, mm5
2295 PALIGNR mm1, mm3, 4, mm5
2297 PALIGNR mm0, mm3, 6, mm3
2302 PALIGNR mm6, mm4, 2, mm5
2304 PALIGNR mm2, mm4, 4, mm5
2306 PALIGNR mm3, mm4, 6, mm4
2310 %macro PRED8x8L_HORIZONTAL_DOWN 0
2311 cglobal pred8x8l_horizontal_down_8, 4,5
; Same loading pattern as in the stand-alone definition above.
2314 movq mm0, [r0+r3*1-8]
2315 punpckhbw mm0, [r0+r3*0-8]
2316 movq mm1, [r4+r3*1-8]
2317 punpckhbw mm1, [r0+r3*2-8]
2321 movq mm2, [r0+r3*1-8]
2322 punpckhbw mm2, [r0+r3*0-8]
2324 movq mm3, [r0+r3*1-8]
2325 punpckhbw mm3, [r0+r3*0-8]
2329 movq mm0, [r0+r3*0-8]
2334 PALIGNR mm4, mm0, 7, mm0
2335 PALIGNR mm1, mm2, 1, mm2
; Selector 0xFF copies word 3 of mm3 into all four words of mm1.
2362 pshufw mm1, mm3, 0xFF
2366 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2370 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2380 PALIGNR mm2, mm0, 7, mm0
2381 PALIGNR mm1, mm4, 1, mm4
2387 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2396 PALIGNR mm2, mm3, 7, mm3
2397 PALIGNR mm5, mm4, 1, mm4
2398 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
; Byte shifts by 7, 9 and 8 against xmm0 feed the final (1,2,1) pass.
2407 PALIGNR xmm1, xmm0, 7, xmm4
2408 PALIGNR xmm2, xmm0, 9, xmm5
2410 PALIGNR xmm3, xmm0, 8, xmm0
2414 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2415 punpcklbw xmm4, xmm0
; Low 8 bytes of xmm4 go to rows based at r0 / r1, low 8 bytes of xmm0 to
; rows based at r2 / r4, with offsets 2*r3 and r3.
2417 movq [r0+r3*2], xmm4
2418 movq [r2+r3*2], xmm0
2421 movq [r0+r3*1], xmm4
2422 movq [r2+r3*1], xmm0
2425 movq [r1+r3*2], xmm4
2426 movq [r4+r3*2], xmm0
2429 movq [r1+r3*1], xmm4
2430 movq [r4+r3*1], xmm0
; The macro is expanded twice.
; NOTE(review): presumably under two different INIT_* selections -- confirm.
2435 PRED8x8L_HORIZONTAL_DOWN
2437 PRED8x8L_HORIZONTAL_DOWN
2439 ;-------------------------------------------------------------------------------
2440 ; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry; 3 arguments are loaded and
; 5 general-purpose registers are requested. r1d is reused as a scratch
; register for the byte loads below.
2442 ;-------------------------------------------------------------------------------
2445 cglobal pred4x4_dc_8, 3,5
; Scalar fetches of the byte at offset -1 from r0+r2 and r0+2*r2.
2451 movzx r1d, byte [r0+r2*1-1]
2454 movzx r1d, byte [r0+r2*2-1]
2457 movzx r1d, byte [r0+r2*1-1]
2459 movzx r1d, byte [r0+r2*2-1]
; Multiplying by 0x01010101 copies a value below 256 into all four bytes
; of r3d.
2463 imul r3d, 0x01010101
2470 ;-----------------------------------------------------------------------------
2471 ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry. Two definitions follow,
; declared with 3,6 and 3,3; both reuse r1 (as scratch or as a row base).
2473 ;-----------------------------------------------------------------------------
2476 cglobal pred4x4_tm_vp8_8, 3,6
; Scalar fetches of the byte at offset -1 from r0, r0+r2 and r0+2*r2.
2481 movzx r4d, byte [r0-1]
2484 movzx r1d, byte [r0+r2*1-1]
2485 movzx r3d, byte [r0+r2*2-1]
2517 cglobal pred4x4_tm_vp8_8, 3,3
; 4-byte loads that end at offset -1 of four rows (bases r0 and r1), so the
; byte at offset -1 is byte 3 of each loaded dword.
2526 movd mm2, [r0+r2*1-4]
2527 movd mm3, [r0+r2*2-4]
2528 movd mm4, [r1+r2*1-4]
2529 movd mm5, [r1+r2*2-4]
2549 ;-----------------------------------------------------------------------------
2550 ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride; 3 arguments are loaded and no
; further general-purpose registers are requested.
2552 ;-----------------------------------------------------------------------------
2555 cglobal pred4x4_vertical_vp8_8, 3,3
2559 mova m2, m0 ;t0 t1 t2 t3
; The 4 bytes at topright (r1) are appended above the 4 bytes already in m0.
2560 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2562 psrlq m0, 8 ;t1 t2 t3 t4
; (1,2,1) low-pass of m2 with neighbours m1 / m0; the result lands in m3.
2563 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2570 ;-----------------------------------------------------------------------------
2571 ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride; 3 arguments are loaded and no
; further general-purpose registers are requested.
2573 ;-----------------------------------------------------------------------------
2575 cglobal pred4x4_down_left_8, 3,3
; (1,2,1) low-pass of m3 with neighbours m1 / m2; the result lands in m0.
2585 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2597 ;------------------------------------------------------------------------------
2598 ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride; 3 arguments are loaded and no
; further general-purpose registers are requested.
2600 ;------------------------------------------------------------------------------
2603 cglobal pred4x4_vertical_left_8, 3,3
; (1,2,1) low-pass of m3 with neighbours m1 / m2; the result lands in m0.
2613 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2623 ;------------------------------------------------------------------------------
2624 ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry; 3 arguments are loaded and
; no further general-purpose registers are requested. r1 appears as a row
; base pointer in the loads below.
2626 ;------------------------------------------------------------------------------
2629 cglobal pred4x4_horizontal_up_8, 3,3
; 4-byte loads that end at offset -1 of each row; punpcklbw interleaves the
; bytes of two such rows.
2632 movd m0, [r0+r2*1-4]
2633 punpcklbw m0, [r0+r2*2-4]
2634 movd m1, [r1+r2*1-4]
2635 punpcklbw m1, [r1+r2*2-4]
; (1,2,1) low-pass of m3 with neighbours m0 / m2; the result lands in m4.
2647 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2657 ;------------------------------------------------------------------------------
2658 ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2659 ; const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry; 3 arguments are loaded and
; no further general-purpose registers are requested. r1 appears as a row
; base pointer in the loads below. Lane comments list bytes from the highest
; to the lowest.
2661 ;------------------------------------------------------------------------------
2664 cglobal pred4x4_horizontal_down_8, 3,3
2667 movh m0, [r0-4] ; lt ..
2668 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2669 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2670 movd m1, [r1+r2*2-4] ; l3
2671 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2672 movd m2, [r0+r2*2-4] ; l1
2673 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2674 punpckhwd m1, m2 ; l0 l1 l2 l3
2675 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2679 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2680 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; (1,2,1) low-pass of m2 with neighbours m1 / m0; the result lands in m3.
2682 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2685 PALIGNR m3, m5, 6, m4
2694 ;-----------------------------------------------------------------------------
2695 ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2696 ; const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry; 3 arguments are loaded and
; no further general-purpose registers are requested. r1 appears as a row
; base pointer in the last load below.
2698 ;-----------------------------------------------------------------------------
2701 cglobal pred4x4_vertical_right_8, 3,3
; Each PALIGNR by 7 shifts in the top byte of an 8-byte load that ends at
; offset -1 of a row, growing the vector one byte at a time.
2704 movh m0, [r0] ; ........t3t2t1t0
2706 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2708 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2710 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2712 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; (1,2,1) low-pass of m2 with neighbours m1 / m0; the result lands in m3.
2713 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2719 PALIGNR m5, m1, 7, m2
2722 PALIGNR m3, m1, 7, m1
2726 ;-----------------------------------------------------------------------------
2727 ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
; ptrdiff_t stride)
; r0 = src, r1 = topright, r2 = stride on entry; 3 arguments are loaded and
; no further general-purpose registers are requested. r1 appears as a row
; base pointer in the loads below.
2729 ;-----------------------------------------------------------------------------
2732 cglobal pred4x4_down_right_8, 3,3
; 8-byte loads that end at offset -1 of a row; punpckhbw interleaves the
; upper 4 bytes of the two rows.
2736 movq m2, [r0+r2*1-8]
2737 punpckhbw m2, [r0-8]
2740 PALIGNR m3, m1, 5, m1
; Each PALIGNR by 7 shifts in the top byte of another such 8-byte load.
2742 PALIGNR m3, [r1+r2*1-8], 7, m4
2744 PALIGNR m3, [r1+r2*2-8], 7, m4
; (1,2,1) low-pass of m2 with neighbours m3 / m1; the result lands in m0.
2745 PRED4x4_LOWPASS m0, m3, m1, m2, m4