1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Fiona Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
29 tm_shuf: times 8 db 0x03, 0x80 ; 16 bytes: the pair {0x03, 0x80} repeated 8 times; loaded whole into xmm4 by one pred8x8_tm_vp8_8 variant (presumably a byte-shuffle mask -- the shuffle itself is not visible here)
30 pw_ff00: times 8 dw 0xff00 ; eight words of 0xff00; loaded into xmm6 by PRED8x8L_VERTICAL_RIGHT
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 ; signed byte weights -8..-1: first half of the pmaddubsw operand for the 16x16 plane H coefficients
32 db 1, 2, 3, 4, 5, 6, 7, 8 ; second half of plane_shuf: weights +1..+8
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 ; 8x8 counterpart: weights -4..-1; the zero weights make the surplus loaded bytes contribute nothing
34 db 1, 2, 3, 4, 0, 0, 0, 0 ; second half of plane8_shuf: weights +1..+4, then zeros
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 ; word column indices 0..7; pmullw with this yields 0*H .. 7*H
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 ; word weights +1..+8, applied with pmullw in the 16x16 plane predictor (also addressed at +8 for its upper four words)
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 ; word weights -8..-1, applied with pmullw in the 16x16 plane predictor (also addressed at +8)
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 ; word weights -4..-1 and +1..+4 (no zero term), applied with pmullw in the 8x8 plane predictor
51 ;-----------------------------------------------------------------------------
52 ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
53 ;-----------------------------------------------------------------------------
56 cglobal pred16x16_vertical_8, 2,3 ; x86inc prologue: 2 arguments (r0 = src, r1 = stride), 3 GPRs in use
72 cglobal pred16x16_vertical_8, 2,3 ; same entry point declared a second time -- presumably for another instruction set; the selecting directive is not visible here
77 movaps [r0+r1*1], xmm0 ; aligned 16-byte store: xmm0 -> row at r0 + stride (xmm0 is loaded outside the lines shown)
78 movaps [r0+r1*2], xmm0 ; the same 16 bytes -> row at r0 + 2*stride
80 movaps [r0+r1*1], xmm0 ; second pair of row stores; r0 is presumably advanced in between (not visible here)
81 movaps [r0+r1*2], xmm0 ; row at r0 + 2*stride again receives xmm0
87 ;-----------------------------------------------------------------------------
88 ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
89 ;-----------------------------------------------------------------------------
92 cglobal pred16x16_horizontal_8, 2,3
128 ;-----------------------------------------------------------------------------
129 ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
130 ;-----------------------------------------------------------------------------
132 %macro PRED16x16_DC 0
133 cglobal pred16x16_dc_8, 2,7
141 movzx r5d, byte [r0+r1*1]
146 movzx r2d, byte [r0+r1*0]
147 movzx r3d, byte [r0+r1*1]
152 movzx r2d, byte [r0+r1*0]
159 SPLATB_REG m0, r2, m1
190 ;-----------------------------------------------------------------------------
191 ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
192 ;-----------------------------------------------------------------------------
194 %macro PRED16x16_TM 0
195 cglobal pred16x16_tm_vp8_8, 2,5
206 movzx r3d, byte [r0-1]
209 movzx r2d, byte [r0+r1-1]
236 cglobal pred16x16_tm_vp8_8, 2,6,6
243 movzx r4d, byte [r0-1]
246 movzx r2d, byte [r0+r1*1-1]
247 movzx r3d, byte [r0+r1*2-1]
252 pshuflw xmm2, xmm2, 0
253 pshuflw xmm4, xmm4, 0
254 punpcklqdq xmm2, xmm2
255 punpcklqdq xmm4, xmm4
264 movdqa [r0+r1*1], xmm2
265 movdqa [r0+r1*2], xmm4
271 %if HAVE_AVX2_EXTERNAL
273 cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration ; x86inc prologue: 2 args, 4 GPRs, 5 vector regs; the GPRs are named dst, stride, stride3, iteration
276 vpbroadcastb xm1, [r0-1] ; replicate the byte at r0-1 into every byte of xm1 (r0 is the dst register; any earlier adjustment of it is not visible here)
280 lea stride3q, [strideq*3] ; stride3 = 3 * stride, so the third row below dst can be addressed directly
282 vpbroadcastb xm1, [dstq+strideq*1-1] ; broadcast the byte just left of row dst + 1*stride
283 vpbroadcastb xm2, [dstq+strideq*2-1] ; ... of row dst + 2*stride
284 vpbroadcastb xm3, [dstq+stride3q-1] ; ... of row dst + 3*stride
285 vpbroadcastb xm4, [dstq+strideq*4-1] ; ... of row dst + 4*stride
298 movdqa [dstq+strideq*1], xm1 ; row +1 <- low 128 bits of m1 (aligned store)
299 vextracti128 [dstq+strideq*2], m1, 1 ; row +2 <- high 128 bits of m1, i.e. each 256-bit register carries two output rows
300 movdqa [dstq+stride3q*1], xm3 ; row +3 <- low 128 bits of m3
301 vextracti128 [dstq+strideq*4], m3, 1 ; row +4 <- high 128 bits of m3
302 lea dstq, [dstq+strideq*4] ; advance dst by the four rows just written
308 ;-----------------------------------------------------------------------------
309 ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
310 ;-----------------------------------------------------------------------------
312 %macro H264_PRED16x16_PLANE 1
313 cglobal pred16x16_plane_%1_8, 2,9,7
327 pmullw m0, [pw_m8tom1 ]
328 pmullw m1, [pw_m8tom1+8]
329 pmullw m2, [pw_1to8 ]
330 pmullw m3, [pw_1to8 +8]
335 movhps m0, [r0+r1 +8]
336 pmaddubsw m0, [plane_shuf] ; H coefficients
342 pmullw m0, [pw_m8tom1]
362 paddw m0, m1 ; sum of H coefficients
374 movzx e_reg, byte [r3+r2*2 ]
375 movzx r5, byte [r4+r1 ]
378 movzx e_reg, byte [r3+r2 ]
383 movzx e_reg, byte [r3+r1 ]
384 movzx r6, byte [r4+r2*2 ]
388 movzx e_reg, byte [r3 ]
390 movzx r7, byte [r4+r2 ]
393 movzx r6, byte [r4+r2 ]
402 movzx r4, byte [e_reg+r2 ]
414 movzx r4, byte [e_reg ]
416 movzx r7, byte [r3 +r2 ]
420 movzx r6, byte [r3 +r2 ]
426 movzx r4, byte [e_reg+r1 ]
427 movzx r6, byte [r3 +r2*2]
434 movzx r4, byte [e_reg+r2*2]
435 movzx r6, byte [r3 +r1 ]
438 add r5, r6 ; sum of V coefficients
455 lea r5, [r5*5] ; 5*(V/4)
459 sar r5, 4 ; (5*(V/4))/16
462 movzx r4, byte [r0+r1 +15]
463 movzx r3, byte [r3+r2*2 ]
481 lea r1d, [r1d*5] ; 5*(H/4)
485 sar r1d, 4 ; (5*(H/4))/16
506 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
515 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
516 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
518 paddw m5, m0 ; a + {8,9,10,11}*H
519 paddw m6, m0 ; a + {12,13,14,15}*H
524 mova m3, m0 ; b[0..7]
525 mova m4, m2 ; b[8..15]
531 mova m3, m5 ; b[8..11]
532 mova m4, m6 ; b[12..15]
545 mova m3, m0 ; b[0..7]
546 mova m4, m2 ; b[8..15]
552 mova m3, m5 ; b[8..11]
553 mova m4, m6 ; b[12..15]
573 H264_PRED16x16_PLANE h264 ; expand the plane-predictor macro with %1 = h264; its cglobal name becomes pred16x16_plane_h264_8
574 H264_PRED16x16_PLANE rv40 ; %1 = rv40 -> pred16x16_plane_rv40_8
575 H264_PRED16x16_PLANE svq3 ; %1 = svq3 -> pred16x16_plane_svq3_8
; The same three expansions are repeated three more times below. Presumably each
; group is assembled for a different instruction set, but the directives that
; would select it are not visible here -- confirm against the full file.
577 H264_PRED16x16_PLANE h264
578 H264_PRED16x16_PLANE rv40
579 H264_PRED16x16_PLANE svq3
581 H264_PRED16x16_PLANE h264
582 H264_PRED16x16_PLANE rv40
583 H264_PRED16x16_PLANE svq3
585 H264_PRED16x16_PLANE h264
586 H264_PRED16x16_PLANE rv40
587 H264_PRED16x16_PLANE svq3
589 ;-----------------------------------------------------------------------------
590 ; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
591 ;-----------------------------------------------------------------------------
593 %macro H264_PRED8x8_PLANE 0
594 cglobal pred8x8_plane_8, 2,9,7
604 pmullw m0, [pw_m4to4]
605 pmullw m1, [pw_m4to4+8]
608 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
609 pmaddubsw m0, [plane8_shuf] ; H coefficients
615 pmullw m0, [pw_m4to4]
621 %if notcpuflag(ssse3)
637 paddw m0, m1 ; sum of H coefficients
649 movzx e_reg, byte [r3+r2*2 ]
650 movzx r5, byte [r4+r1 ]
653 movzx e_reg, byte [r3 ]
655 movzx r7, byte [r4+r2 ]
659 movzx r6, byte [r4+r2 ]
665 movzx e_reg, byte [r3+r1 ]
666 movzx r6, byte [r4+r2*2 ]
673 movzx e_reg, byte [r3+r2 ]
686 movzx r3, byte [r4+r2*2 ]
687 movzx r4, byte [r0+r1 +7]
709 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
710 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
713 paddw m2, m0 ; a + {4,5,6,7}*H
720 mova m3, m0 ; b[0..7]
723 mova m4, m0 ; V+b[0..7]
730 mova m3, m0 ; b[0..3]
731 mova m4, m2 ; b[4..7]
736 mova m5, m0 ; V+b[0..3]
737 mova m6, m2 ; V+b[4..7]
763 ;-----------------------------------------------------------------------------
764 ; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
765 ;-----------------------------------------------------------------------------
768 cglobal pred8x8_vertical_8, 2,2
780 ;-----------------------------------------------------------------------------
781 ; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
782 ;-----------------------------------------------------------------------------
785 cglobal pred8x8_horizontal_8, 2,3
791 SPLATB_LOAD m0, r0+r1*0-1, m2
792 SPLATB_LOAD m1, r0+r1*1-1, m2
808 ;-----------------------------------------------------------------------------
809 ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
810 ;-----------------------------------------------------------------------------
812 cglobal pred8x8_top_dc_8, 2,5
829 pshufw mm0, mm0, 0 ; dc0 (w)
830 packuswb mm0, mm1 ; dc0,dc1 (b)
842 ;-----------------------------------------------------------------------------
843 ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
844 ;-----------------------------------------------------------------------------
847 cglobal pred8x8_dc_8, 2,5
856 movzx r2d, byte [r0+r1*1-1]
857 movzx r3d, byte [r0+r1*2-1]
860 movzx r3d, byte [r0+r1*1-1]
862 movzx r3d, byte [r0+r1*2-1]
866 movzx r2d, byte [r0+r1*1-1]
867 movzx r3d, byte [r0+r1*2-1]
870 movzx r3d, byte [r0+r1*1-1]
872 movzx r3d, byte [r0+r1*2-1]
879 punpckldq m0, m2 ; s0, s1, s2, s3
880 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
882 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
886 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
903 ;-----------------------------------------------------------------------------
904 ; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
905 ;-----------------------------------------------------------------------------
908 cglobal pred8x8_dc_rv40_8, 2,7
914 movzx r5d, byte [r0+r1*1]
918 movzx r2d, byte [r0+r1*0]
919 movzx r3d, byte [r0+r1*1]
924 movzx r2d, byte [r0+r1*0]
940 ;-----------------------------------------------------------------------------
941 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
942 ;-----------------------------------------------------------------------------
945 cglobal pred8x8_tm_vp8_8, 2,6
952 movzx r4d, byte [r0-1]
955 movzx r2d, byte [r0+r1*1-1]
956 movzx r3d, byte [r0+r1*2-1]
985 cglobal pred8x8_tm_vp8_8, 2,6,4
990 movzx r4d, byte [r0-1]
993 movzx r2d, byte [r0+r1*1-1]
994 movzx r3d, byte [r0+r1*2-1]
999 pshuflw xmm2, xmm2, 0
1000 pshuflw xmm3, xmm3, 0
1001 punpcklqdq xmm2, xmm2
1002 punpcklqdq xmm3, xmm3
1006 movq [r0+r1*1], xmm2
1007 movhps [r0+r1*2], xmm2
1014 cglobal pred8x8_tm_vp8_8, 2,3,6 ; x86inc prologue: 2 args (r0 = src, r1 = stride), 3 GPRs, 6 XMM registers
1016 movdqa xmm4, [tm_shuf] ; xmm4 = the {0x03, 0x80} byte-pair table (presumably a pshufb mask; the shuffle that consumes it is not visible here)
1019 punpcklbw xmm0, xmm1 ; interleave the low 8 bytes of xmm0 with the low 8 bytes of xmm1 (their contents are set outside the lines shown)
1024 movd xmm2, [r0+r1*1-4] ; load the 4 bytes ending just left of row +1, so the left-neighbour pixel is byte 3 of the dword
1025 movd xmm3, [r0+r1*2-4] ; same load for row +2
1033 movq [r0+r1*1], xmm2 ; low 8 bytes of xmm2 -> row +1
1034 movhps [r0+r1*2], xmm2 ; high 8 bytes of xmm2 -> row +2 (one register carries two 8-pixel rows)
1040 ; dest, left, right, src, tmp
1041 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1042 %macro PRED4x4_LOWPASS 5
1052 ;-----------------------------------------------------------------------------
1053 ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, ptrdiff_t stride)
1055 ;-----------------------------------------------------------------------------
1056 %macro PRED8x8L_TOP_DC 0
1057 cglobal pred8x8l_top_dc_8, 4,4
1065 PALIGNR mm2, mm0, 7, mm0
1066 PALIGNR mm1, mm4, 1, mm4
1067 test r1d, r1d ; top_left
1069 test r2d, r2d ; top_right
1078 test r2d, r2d ; top_right
1087 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1108 ;-----------------------------------------------------------------------------
1109 ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, ptrdiff_t stride)
1111 ;-----------------------------------------------------------------------------
1113 %macro PRED8x8L_DC 0
1114 cglobal pred8x8l_dc_8, 4,5
1117 movq mm0, [r0+r3*1-8]
1118 punpckhbw mm0, [r0+r3*0-8]
1119 movq mm1, [r4+r3*1-8]
1120 punpckhbw mm1, [r0+r3*2-8]
1124 movq mm2, [r0+r3*1-8]
1125 punpckhbw mm2, [r0+r3*0-8]
1127 movq mm3, [r0+r3*1-8]
1128 punpckhbw mm3, [r0+r3*0-8]
1132 movq mm0, [r0+r3*0-8]
1137 PALIGNR mm4, mm0, 7, mm0
1138 PALIGNR mm1, mm2, 1, mm2
1165 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1168 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1170 PALIGNR mm7, mm1, 7, mm3
1176 PALIGNR mm2, mm0, 7, mm0
1177 PALIGNR mm1, mm4, 1, mm4
1184 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1212 ;-----------------------------------------------------------------------------
1213 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1214 ; int has_topright, ptrdiff_t stride)
1215 ;-----------------------------------------------------------------------------
1217 %macro PRED8x8L_HORIZONTAL 0 ; parameterless macro that emits pred8x8l_horizontal_8
1218 cglobal pred8x8l_horizontal_8, 4,4 ; x86inc prologue: 4 args (src, has_topleft, has_topright, stride in r0..r3), 4 GPRs
1221 movq mm0, [r0+r3*1-8] ; load the 8 bytes ending just left of row r0 + stride; the left-neighbour pixel is the top byte
1225 punpckhbw mm0, [r1+r3*0-8] ; interleave its high 4 bytes with those of the row at r1 (r1 is used as a row pointer here; its setup is not visible)
1226 movq mm1, [r2+r3*1-8] ; same 8-byte left-edge load, relative to r2 (likewise used as a row pointer)
1227 punpckhbw mm1, [r0+r3*2-8] ; interleave with the row at r0 + 2*stride
1231 movq mm2, [r0+r3*1-8] ; next pair of rows (r0 is presumably advanced between these groups; not visible here)
1232 punpckhbw mm2, [r0+r3*0-8]
1234 movq mm3, [r0+r3*1-8] ; and the pair after that
1235 punpckhbw mm3, [r0+r3*0-8]
1239 movq mm0, [r0+r3*0-8] ; 8 bytes ending just left of the row at r0
1240 movq mm1, [r1+r3*0-8] ; 8 bytes ending just left of the row at r1
1244 PALIGNR mm4, mm0, 7, mm0 ; x86util byte-alignment macro (palignr or an emulation); last operand is a scratch register
1245 PALIGNR mm1, mm2, 1, mm2
1247 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 ; args are dest, left, right, src, tmp: mm2 = (left + 2*src + right + 2) >> 2
1250 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 ; second filtered vector, written to mm1
1252 PALIGNR mm7, mm1, 7, mm3
1258 pshufw mm0, mm3, 0xff ; mm0 = word 3 of mm3 replicated into all four words
1259 pshufw mm1, mm3, 0xaa ; mm1 = word 2 of mm3 replicated
1261 pshufw mm2, mm3, 0x55 ; mm2 = word 1 of mm3 replicated
1262 pshufw mm3, mm3, 0x00 ; mm3 = its own word 0 replicated; done last because mm3 is the source of the three shuffles above
1263 pshufw mm4, mm7, 0xff ; same four broadcasts for mm7: word 3 ...
1264 pshufw mm5, mm7, 0xaa ; ... word 2 ...
1265 pshufw mm6, mm7, 0x55 ; ... word 1 ...
1266 pshufw mm7, mm7, 0x00 ; ... and word 0, overwriting the source last
1284 ;-----------------------------------------------------------------------------
1285 ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, ptrdiff_t stride)
1287 ;-----------------------------------------------------------------------------
1289 %macro PRED8x8L_VERTICAL 0
1290 cglobal pred8x8l_vertical_8, 4,4
1297 PALIGNR mm2, mm0, 7, mm0
1298 PALIGNR mm1, mm4, 1, mm4
1299 test r1d, r1d ; top_left
1301 test r2d, r2d ; top_right
1310 test r2d, r2d ; top_right
1319 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1335 ;-----------------------------------------------------------------------------
1336 ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1337 ; int has_topright, ptrdiff_t stride)
1338 ;-----------------------------------------------------------------------------
1341 cglobal pred8x8l_down_left_8, 4,5
1348 PALIGNR mm2, mm0, 7, mm0
1349 PALIGNR mm1, mm4, 1, mm4
1372 pshufw mm1, mm3, 0xFF
1375 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1384 PALIGNR mm2, mm3, 7, mm3
1385 PALIGNR mm5, mm4, 1, mm4
1386 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1394 PALIGNR mm2, mm7, 1, mm0
1396 PALIGNR mm3, mm7, 7, mm0
1397 PALIGNR mm4, mm6, 1, mm0
1403 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1404 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1448 %macro PRED8x8L_DOWN_LEFT 0
1449 cglobal pred8x8l_down_left_8, 4,4
1456 PALIGNR mm2, mm0, 7, mm0
1457 PALIGNR mm1, mm4, 1, mm4
1458 test r1d, r1d ; top_left
1460 test r2d, r2d ; top_right
1469 test r2d, r2d ; top_right
1480 pshufw mm1, mm3, 0xFF
1483 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1485 test r2d, r2d ; top_right
1492 PALIGNR mm2, mm3, 7, mm3
1493 PALIGNR mm5, mm4, 1, mm4
1494 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1510 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1512 movq [r0+r3*1], xmm0
1514 movq [r0+r3*2], xmm0
1517 movq [r1+r3*1], xmm0
1519 movq [r1+r3*2], xmm0
1521 movq [r2+r3*1], xmm0
1523 movq [r2+r3*2], xmm0
1525 movq [r0+r3*1], xmm0
1527 movq [r0+r3*2], xmm0
1536 ;-----------------------------------------------------------------------------
1537 ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1538 ; int has_topright, ptrdiff_t stride)
1539 ;-----------------------------------------------------------------------------
1542 cglobal pred8x8l_down_right_8, 4,5
1545 movq mm0, [r0+r3*1-8]
1546 punpckhbw mm0, [r0+r3*0-8]
1547 movq mm1, [r4+r3*1-8]
1548 punpckhbw mm1, [r0+r3*2-8]
1552 movq mm2, [r0+r3*1-8]
1553 punpckhbw mm2, [r0+r3*0-8]
1555 movq mm3, [r0+r3*1-8]
1556 punpckhbw mm3, [r0+r3*0-8]
1560 movq mm0, [r0+r3*0-8]
1565 PALIGNR mm4, mm0, 7, mm0
1566 PALIGNR mm1, mm2, 1, mm2
1567 test r1d, r1d ; top_left
1571 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1575 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1577 PALIGNR mm7, mm1, 7, mm3
1583 PALIGNR mm2, mm0, 7, mm0
1584 PALIGNR mm1, mm4, 1, mm4
1585 test r1d, r1d ; top_left
1587 test r2d, r2d ; top_right
1590 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1606 test r2d, r2d ; top_right
1622 PALIGNR mm2, mm6, 1, mm0
1624 PALIGNR mm3, mm6, 7, mm0
1628 PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1629 PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1673 %macro PRED8x8L_DOWN_RIGHT 0
1674 cglobal pred8x8l_down_right_8, 4,5
1677 movq mm0, [r0+r3*1-8]
1678 punpckhbw mm0, [r0+r3*0-8]
1679 movq mm1, [r4+r3*1-8]
1680 punpckhbw mm1, [r0+r3*2-8]
1684 movq mm2, [r0+r3*1-8]
1685 punpckhbw mm2, [r0+r3*0-8]
1687 movq mm3, [r0+r3*1-8]
1688 punpckhbw mm3, [r0+r3*0-8]
1692 movq mm0, [r0+r3*0-8]
1697 PALIGNR mm4, mm0, 7, mm0
1698 PALIGNR mm1, mm2, 1, mm2
1726 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1730 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1732 PALIGNR mm7, mm1, 7, mm3
1739 PALIGNR mm2, mm0, 7, mm0
1740 PALIGNR mm1, mm4, 1, mm4
1746 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1763 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1766 movq [r0+r3*2], xmm0
1767 movq [r0+r3*1], xmm1
1770 movq [r2+r3*2], xmm0
1771 movq [r2+r3*1], xmm1
1774 movq [r1+r3*2], xmm0
1775 movq [r1+r3*1], xmm1
1778 movq [r4+r3*2], xmm0
1779 movq [r4+r3*1], xmm1
1788 ;-----------------------------------------------------------------------------
1789 ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1790 ; int has_topright, ptrdiff_t stride)
1791 ;-----------------------------------------------------------------------------
1794 cglobal pred8x8l_vertical_right_8, 4,5
1797 movq mm0, [r0+r3*1-8]
1798 punpckhbw mm0, [r0+r3*0-8]
1799 movq mm1, [r4+r3*1-8]
1800 punpckhbw mm1, [r0+r3*2-8]
1804 movq mm2, [r0+r3*1-8]
1805 punpckhbw mm2, [r0+r3*0-8]
1807 movq mm3, [r0+r3*1-8]
1808 punpckhbw mm3, [r0+r3*0-8]
1812 movq mm0, [r0+r3*0-8]
1817 PALIGNR mm4, mm0, 7, mm0
1818 PALIGNR mm1, mm2, 1, mm2
1846 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1853 PALIGNR mm2, mm0, 7, mm0
1854 PALIGNR mm1, mm4, 1, mm4
1860 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1864 PALIGNR mm3, mm7, 7, mm0
1865 PALIGNR mm6, mm7, 6, mm1
1869 PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1880 PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1881 PALIGNR mm6, mm0, 7, mm2
1884 PALIGNR mm5, mm0, 7, mm1
1887 PALIGNR mm6, mm0, 7, mm2
1890 PALIGNR mm5, mm0, 7, mm1
1893 PALIGNR mm6, mm0, 7, mm2
1896 PALIGNR mm5, mm0, 7, mm1
1900 %macro PRED8x8L_VERTICAL_RIGHT 0
1901 cglobal pred8x8l_vertical_right_8, 4,5,7
1902 ; manually spill XMM registers for Win64 because
1903 ; the code here is initialized with INIT_MMX
1907 movq mm0, [r0+r3*1-8]
1908 punpckhbw mm0, [r0+r3*0-8]
1909 movq mm1, [r4+r3*1-8]
1910 punpckhbw mm1, [r0+r3*2-8]
1914 movq mm2, [r0+r3*1-8]
1915 punpckhbw mm2, [r0+r3*0-8]
1917 movq mm3, [r0+r3*1-8]
1918 punpckhbw mm3, [r0+r3*0-8]
1922 movq mm0, [r0+r3*0-8]
1927 PALIGNR mm4, mm0, 7, mm0
1928 PALIGNR mm1, mm2, 1, mm2
1955 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1962 PALIGNR mm2, mm0, 7, mm0
1963 PALIGNR mm1, mm4, 1, mm4
1969 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1974 movdqa xmm6, [pw_ff00]
1983 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
1989 movhps [r0+r3*2], xmm5
1990 movhps [r0+r3*1], xmm2
1998 movq [r0+r3*2], xmm5
1999 movq [r0+r3*1], xmm2
2002 movq [r2+r3*2], xmm5
2003 movq [r2+r3*1], xmm2
2006 movq [r1+r3*2], xmm5
2007 movq [r1+r3*1], xmm2
2012 PRED8x8L_VERTICAL_RIGHT
2014 PRED8x8L_VERTICAL_RIGHT
2016 ;-----------------------------------------------------------------------------
2017 ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
2018 ; int has_topright, ptrdiff_t stride)
2019 ;-----------------------------------------------------------------------------
2021 %macro PRED8x8L_VERTICAL_LEFT 0
2022 cglobal pred8x8l_vertical_left_8, 4,4
2029 PALIGNR mm2, mm0, 7, mm0
2030 PALIGNR mm1, mm4, 1, mm4
2053 pshufw mm1, mm3, 0xFF
2056 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2065 PALIGNR mm2, mm3, 7, mm3
2066 PALIGNR mm5, mm4, 1, mm4
2067 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2081 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2083 movq [r0+r3*1], xmm3
2084 movq [r0+r3*2], xmm0
2088 movq [r1+r3*1], xmm3
2089 movq [r1+r3*2], xmm0
2092 movq [r2+r3*1], xmm3
2093 movq [r2+r3*2], xmm0
2096 movq [r0+r3*1], xmm3
2097 movq [r0+r3*2], xmm0
2102 PRED8x8L_VERTICAL_LEFT
2104 PRED8x8L_VERTICAL_LEFT
2106 ;-----------------------------------------------------------------------------
2107 ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2108 ; int has_topright, ptrdiff_t stride)
2109 ;-----------------------------------------------------------------------------
2111 %macro PRED8x8L_HORIZONTAL_UP 0
2112 cglobal pred8x8l_horizontal_up_8, 4,4
2115 movq mm0, [r0+r3*1-8]
2119 punpckhbw mm0, [r1+r3*0-8]
2120 movq mm1, [r2+r3*1-8]
2121 punpckhbw mm1, [r0+r3*2-8]
2125 movq mm2, [r0+r3*1-8]
2126 punpckhbw mm2, [r0+r3*0-8]
2128 movq mm3, [r0+r3*1-8]
2129 punpckhbw mm3, [r0+r3*0-8]
2133 movq mm0, [r0+r3*0-8]
2134 movq mm1, [r1+r3*0-8]
2138 PALIGNR mm4, mm0, 7, mm0
2139 PALIGNR mm1, mm2, 1, mm2
2141 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2144 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2146 PALIGNR mm7, mm1, 7, mm3
2148 pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2149 psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
2153 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
2160 por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
2162 por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
2164 PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2166 punpcklbw mm4, mm1 ; p4 p3 p2 p1
2167 punpckhbw mm5, mm1 ; p8 p7 p6 p5
2171 PALIGNR mm5, mm4, 2, mm1
2172 pshufw mm1, mm6, 11111001b
2173 PALIGNR mm6, mm4, 4, mm2
2174 pshufw mm2, mm7, 11111110b
2175 PALIGNR mm7, mm4, 6, mm3
2176 pshufw mm3, mm0, 11111111b
2190 PRED8x8L_HORIZONTAL_UP
2192 PRED8x8L_HORIZONTAL_UP
2194 ;-----------------------------------------------------------------------------
2195 ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2196 ; int has_topright, ptrdiff_t stride)
2197 ;-----------------------------------------------------------------------------
2200 cglobal pred8x8l_horizontal_down_8, 4,5
2203 movq mm0, [r0+r3*1-8]
2204 punpckhbw mm0, [r0+r3*0-8]
2205 movq mm1, [r4+r3*1-8]
2206 punpckhbw mm1, [r0+r3*2-8]
2210 movq mm2, [r0+r3*1-8]
2211 punpckhbw mm2, [r0+r3*0-8]
2213 movq mm3, [r0+r3*1-8]
2214 punpckhbw mm3, [r0+r3*0-8]
2218 movq mm0, [r0+r3*0-8]
2223 PALIGNR mm4, mm0, 7, mm0
2224 PALIGNR mm1, mm2, 1, mm2
2251 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2255 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2257 PALIGNR mm7, mm1, 7, mm3
2263 PALIGNR mm2, mm0, 7, mm0
2264 PALIGNR mm1, mm4, 1, mm4
2270 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2277 PALIGNR mm2, mm6, 7, mm5
2278 PALIGNR mm6, mm7, 7, mm0
2280 PALIGNR mm4, mm3, 1, mm7
2283 PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2289 PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2297 PALIGNR mm7, mm3, 2, mm5
2299 PALIGNR mm1, mm3, 4, mm5
2301 PALIGNR mm0, mm3, 6, mm3
2306 PALIGNR mm6, mm4, 2, mm5
2308 PALIGNR mm2, mm4, 4, mm5
2310 PALIGNR mm3, mm4, 6, mm4
2314 %macro PRED8x8L_HORIZONTAL_DOWN 0
2315 cglobal pred8x8l_horizontal_down_8, 4,5
2318 movq mm0, [r0+r3*1-8]
2319 punpckhbw mm0, [r0+r3*0-8]
2320 movq mm1, [r4+r3*1-8]
2321 punpckhbw mm1, [r0+r3*2-8]
2325 movq mm2, [r0+r3*1-8]
2326 punpckhbw mm2, [r0+r3*0-8]
2328 movq mm3, [r0+r3*1-8]
2329 punpckhbw mm3, [r0+r3*0-8]
2333 movq mm0, [r0+r3*0-8]
2338 PALIGNR mm4, mm0, 7, mm0
2339 PALIGNR mm1, mm2, 1, mm2
2366 pshufw mm1, mm3, 0xFF
2370 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2374 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2384 PALIGNR mm2, mm0, 7, mm0
2385 PALIGNR mm1, mm4, 1, mm4
2391 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2400 PALIGNR mm2, mm3, 7, mm3
2401 PALIGNR mm5, mm4, 1, mm4
2402 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2411 PALIGNR xmm1, xmm0, 7, xmm4
2412 PALIGNR xmm2, xmm0, 9, xmm5
2414 PALIGNR xmm3, xmm0, 8, xmm0
2418 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2419 punpcklbw xmm4, xmm0
2421 movq [r0+r3*2], xmm4
2422 movq [r2+r3*2], xmm0
2425 movq [r0+r3*1], xmm4
2426 movq [r2+r3*1], xmm0
2429 movq [r1+r3*2], xmm4
2430 movq [r4+r3*2], xmm0
2433 movq [r1+r3*1], xmm4
2434 movq [r4+r3*1], xmm0
2439 PRED8x8L_HORIZONTAL_DOWN
2441 PRED8x8L_HORIZONTAL_DOWN
2443 ;-------------------------------------------------------------------------------
2444 ; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2446 ;-------------------------------------------------------------------------------
2449 cglobal pred4x4_dc_8, 3,5 ; x86inc prologue: 3 args (r0 = src, r1 = topright, r2 = third argument, used below as the row stride), 5 GPRs
2455 movzx r1d, byte [r0+r2*1-1] ; r1 is reused as scratch: zero-extended byte just left of row r0 + stride
2458 movzx r1d, byte [r0+r2*2-1] ; byte just left of row r0 + 2*stride
2461 movzx r1d, byte [r0+r2*1-1] ; same two left-edge loads again (r0 is presumably advanced in between; not visible here)
2463 movzx r1d, byte [r0+r2*2-1]
2467 imul r3d, 0x01010101 ; copies the low byte of r3d into all four bytes, provided r3d <= 0xff (presumably the DC value; its computation is not visible here)
2474 ;-----------------------------------------------------------------------------
2475 ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2477 ;-----------------------------------------------------------------------------
2480 cglobal pred4x4_tm_vp8_8, 3,6
2485 movzx r4d, byte [r0-1]
2488 movzx r1d, byte [r0+r2*1-1]
2489 movzx r3d, byte [r0+r2*2-1]
2521 cglobal pred4x4_tm_vp8_8, 3,3
2530 movd mm2, [r0+r2*1-4]
2531 movd mm3, [r0+r2*2-4]
2532 movd mm4, [r1+r2*1-4]
2533 movd mm5, [r1+r2*2-4]
2553 ;-----------------------------------------------------------------------------
2554 ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2556 ;-----------------------------------------------------------------------------
2559 cglobal pred4x4_vertical_vp8_8, 3,3
2563 mova m2, m0 ;t0 t1 t2 t3
2564 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2566 psrlq m0, 8 ;t1 t2 t3 t4
2567 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2574 ;-----------------------------------------------------------------------------
2575 ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2577 ;-----------------------------------------------------------------------------
2579 cglobal pred4x4_down_left_8, 3,3
2589 PRED4x4_LOWPASS m0, m1, m2, m3, m4
2601 ;------------------------------------------------------------------------------
2602 ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2604 ;------------------------------------------------------------------------------
2607 cglobal pred4x4_vertical_left_8, 3,3
2617 PRED4x4_LOWPASS m0, m1, m2, m3, m5
2627 ;------------------------------------------------------------------------------
2628 ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2630 ;------------------------------------------------------------------------------
2633 cglobal pred4x4_horizontal_up_8, 3,3
2636 movd m0, [r0+r2*1-4]
2637 punpcklbw m0, [r0+r2*2-4]
2638 movd m1, [r1+r2*1-4]
2639 punpcklbw m1, [r1+r2*2-4]
2651 PRED4x4_LOWPASS m4, m0, m2, m3, m5
2661 ;------------------------------------------------------------------------------
2662 ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2663 ; const uint8_t *topright, ptrdiff_t stride)
2665 ;------------------------------------------------------------------------------
2668 cglobal pred4x4_horizontal_down_8, 3,3
2671 movh m0, [r0-4] ; lt ..
2672 punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
2673 psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
2674 movd m1, [r1+r2*2-4] ; l3
2675 punpcklbw m1, [r1+r2*1-4] ; l2 l3
2676 movd m2, [r0+r2*2-4] ; l1
2677 punpcklbw m2, [r0+r2*1-4] ; l0 l1
2678 punpckhwd m1, m2 ; l0 l1 l2 l3
2679 punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
2683 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
2684 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
2686 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2689 PALIGNR m3, m5, 6, m4
2698 ;-----------------------------------------------------------------------------
2699 ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2700 ; const uint8_t *topright, ptrdiff_t stride)
2702 ;-----------------------------------------------------------------------------
2705 cglobal pred4x4_vertical_right_8, 3,3
2708 movh m0, [r0] ; ........t3t2t1t0
2710 PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
2712 PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
2714 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
2716 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
2717 PRED4x4_LOWPASS m3, m1, m0, m2, m4
2723 PALIGNR m5, m1, 7, m2
2726 PALIGNR m3, m1, 7, m1
2730 ;-----------------------------------------------------------------------------
2731 ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, ptrdiff_t stride)
2733 ;-----------------------------------------------------------------------------
2736 cglobal pred4x4_down_right_8, 3,3
2740 movq m2, [r0+r2*1-8]
2741 punpckhbw m2, [r0-8]
2744 PALIGNR m3, m1, 5, m1
2746 PALIGNR m3, [r1+r2*1-8], 7, m4
2748 PALIGNR m3, [r1+r2*2-8], 7, m4
2749 PRED4x4_LOWPASS m0, m3, m1, m2, m4