1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
4 ;* Copyright (c) 2010 Holger Lubitz
5 ;* Copyright (c) 2010 Loren Merritt
6 ;* Copyright (c) 2010 Ronald S. Bultje
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
; Constant tables used by the prediction routines below.
; NOTE(review): this view of the file has line gaps; comments describe only
; what the visible definitions show.
; Loaded by pred8x8_tm_vp8_ssse3 below; 0x80 bytes presumably act as
; zeroing indices under pshufb — confirm against the full routine.
30 tm_shuf: times 8 db 0x03, 0x80
; Signed byte weights -8..-1 / 1..8 fed to pmaddubsw to form the 16-wide
; plane-prediction H coefficient sums (see "H coefficients" use below).
31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
32 db 1, 2, 3, 4, 5, 6, 7, 8
; 8-wide variant: weights -4..-1 / 1..4, upper halves zeroed out.
33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
34 db 1, 2, 3, 4, 0, 0, 0, 0
; Word ramps: per-column multipliers for building the plane gradient.
35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
; Negative word weight ramps used with pmullw in the plane H-sum paths.
37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
51 ;-----------------------------------------------------------------------------
52 ; void pred16x16_vertical(uint8_t *src, int stride)
53 ;-----------------------------------------------------------------------------
; MMX version — body not visible in this fragment.
55 cglobal pred16x16_vertical_mmx, 2,3
; SSE version: 2 args (r0 = src, r1 = stride), 3 gp regs.
; xmm0 presumably holds the replicated row above the block — the load is
; not visible here; each iteration stores it to two successive rows.
70 cglobal pred16x16_vertical_sse, 2,3
75 movaps [r0+r1*1], xmm0
76 movaps [r0+r1*2], xmm0
; second pair of rows within the same (unrolled) iteration
78 movaps [r0+r1*1], xmm0
79 movaps [r0+r1*2], xmm0
85 ;-----------------------------------------------------------------------------
86 ; void pred16x16_horizontal(uint8_t *src, int stride)
87 ;-----------------------------------------------------------------------------
; Inside a %macro: %1 = cpu suffix (e.g. mmx/mmxext/ssse3). Body not visible
; in this fragment.
90 cglobal pred16x16_horizontal_%1, 2,3
132 ;-----------------------------------------------------------------------------
133 ; void pred16x16_dc(uint8_t *src, int stride)
134 ;-----------------------------------------------------------------------------
; %1 = cpu suffix; 2 args (r0 = src, r1 = stride), up to 7 gp regs.
136 %macro PRED16x16_DC 1
137 cglobal pred16x16_dc_%1, 2,7
; Zero-extended single-byte loads at successive rows — presumably gathering
; edge pixels for the DC sum; the surrounding loop/adds are not visible here.
145 movzx r5d, byte [r0+r1*1]
150 movzx r2d, byte [r0+r1*0]
151 movzx r3d, byte [r0+r1*1]
156 movzx r2d, byte [r0+r1*0]
203 ;-----------------------------------------------------------------------------
204 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
205 ;-----------------------------------------------------------------------------
; TrueMotion prediction (VP8). %1 = cpu suffix.
207 %macro PRED16x16_TM_MMX 1
208 cglobal pred16x16_tm_vp8_%1, 2,5
; byte just left of row 0 (addressing shows [r0-1])
219 movzx r3d, byte [r0-1]
; byte just left of the next row (stride r1)
222 movzx r2d, byte [r0+r1-1]
; instantiate the mmxext variant
249 PRED16x16_TM_MMX mmxext
; SSE2 TrueMotion: 2 args, 6 gp regs, 6 xmm regs.
251 cglobal pred16x16_tm_vp8_sse2, 2,6,6
; left-column bytes for two consecutive rows (zero-extended)
258 movzx r4d, byte [r0-1]
261 movzx r2d, byte [r0+r1*1-1]
262 movzx r3d, byte [r0+r1*2-1]
; broadcast the low word across all 8 words of each register
; (pshuflw 0 splats within the low qword, punpcklqdq duplicates it high)
267 pshuflw xmm2, xmm2, 0
268 pshuflw xmm4, xmm4, 0
269 punpcklqdq xmm2, xmm2
270 punpcklqdq xmm4, xmm4
; store two predicted rows per iteration (aligned 16-byte stores)
279 movdqa [r0+r1*1], xmm2
280 movdqa [r0+r1*2], xmm4
286 ;-----------------------------------------------------------------------------
287 ; void pred16x16_plane(uint8_t *src, int stride)
288 ;-----------------------------------------------------------------------------
; Plane (gradient) prediction. Macro args: %1 = cpu suffix, %2 = xmm reg
; count for cglobal, %3 = variant name (h264/rv40/svq3 — rounding differs
; per codec in code not visible here).
; Overall structure (from the visible comments): compute horizontal (H) and
; vertical (V) gradient sums from the edge pixels, derive per-pixel values
; a + x*H + y*V, then store row by row.
290 %macro H264_PRED16x16_PLANE 3
291 cglobal pred16x16_plane_%3_%1, 2, 7, %2
; weight top-edge words by -8..-1 / 1..8 to build the H sum
305 pmullw m0, [pw_m8tom1 ]
306 pmullw m1, [pw_m8tom1+8]
307 pmullw m2, [pw_1to8 ]
308 pmullw m3, [pw_1to8 +8]
317 pmullw m0, [pw_m8tom1]
; ssse3 path: one pmaddubsw against plane_shuf replaces the pmullw chain
321 movhps m0, [r0+r1 +8]
322 pmaddubsw m0, [plane_shuf] ; H coefficients
344 paddw m0, m1 ; sum of H coefficients
360 lea r3, [r3*5] ; 5*(H/4)
364 sar r3, 4 ; (5*(H/4))/16
; gather single left-edge bytes at various row offsets for the V sum;
; exact row bookkeeping depends on lines not visible in this fragment
378 movzx e_reg, byte [r3+r2*2 ]
379 movzx r5, byte [r4+r1 ]
382 movzx e_reg, byte [r3+r2 ]
387 movzx e_reg, byte [r3+r1 ]
388 movzx r6, byte [r4+r2*2 ]
392 movzx e_reg, byte [r3 ]
394 movzx r10, byte [r4+r2 ]
397 movzx r6, byte [r4+r2 ]
406 movzx r4, byte [e_reg+r2 ]
418 movzx r4, byte [e_reg ]
420 movzx r10, byte [r3 +r2 ]
424 movzx r6, byte [r3 +r2 ]
430 movzx r4, byte [e_reg+r1 ]
431 movzx r6, byte [r3 +r2*2]
438 movzx r4, byte [e_reg+r2*2]
439 movzx r6, byte [r3 +r1 ]
442 add r5, r6 ; sum of V coefficients
459 lea r5, [r5*5] ; 5*(V/4)
463 sar r5, 4 ; (5*(V/4))/16
; bottom-right corner pixel (column 15 of the row below the first)
466 movzx r4, byte [r0+r1 +15]
467 movzx r3, byte [r3+r2*2 ]
; broadcast H, V and the base value a to all word lanes
494 punpcklqdq m0, m0 ; splat H (words)
495 punpcklqdq m1, m1 ; splat V (words)
496 punpcklqdq m3, m3 ; splat a (words)
505 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
514 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
515 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
517 paddw m5, m0 ; a + {8,9,10,11}*H
518 paddw m6, m0 ; a + {12,13,14,15}*H
; per-row running values b[] (V added per row in code not shown)
523 mova m3, m0 ; b[0..7]
524 mova m4, m2 ; b[8..15]
530 mova m3, m5 ; b[8..11]
531 mova m4, m6 ; b[12..15]
544 mova m3, m0 ; b[0..7]
545 mova m4, m2 ; b[8..15]
551 mova m3, m5 ; b[8..11]
552 mova m4, m6 ; b[12..15]
; Instantiate the 16x16 plane macro for each cpu level and codec variant.
; Args: cpu suffix, xmm register count (0 for mmx paths), variant name.
572 H264_PRED16x16_PLANE mmx, 0, h264
573 H264_PRED16x16_PLANE mmx, 0, rv40
574 H264_PRED16x16_PLANE mmx, 0, svq3
575 H264_PRED16x16_PLANE mmx2, 0, h264
576 H264_PRED16x16_PLANE mmx2, 0, rv40
577 H264_PRED16x16_PLANE mmx2, 0, svq3
579 H264_PRED16x16_PLANE sse2, 8, h264
580 H264_PRED16x16_PLANE sse2, 8, rv40
581 H264_PRED16x16_PLANE sse2, 8, svq3
582 H264_PRED16x16_PLANE ssse3, 8, h264
583 H264_PRED16x16_PLANE ssse3, 8, rv40
584 H264_PRED16x16_PLANE ssse3, 8, svq3
586 ;-----------------------------------------------------------------------------
587 ; void pred8x8_plane(uint8_t *src, int stride)
588 ;-----------------------------------------------------------------------------
; 8x8 analogue of the 16x16 plane macro above: %1 = cpu suffix,
; %2 = xmm register count. Same H/V gradient scheme with 4-wide edge sums.
590 %macro H264_PRED8x8_PLANE 2
591 cglobal pred8x8_plane_%1, 2, 7, %2
; weight top-edge words by -4..-1 / 1..4 for the H sum
601 pmullw m0, [pw_m4to4]
602 pmullw m1, [pw_m4to4+8]
609 pmullw m0, [pw_m4to4]
; ssse3 path: single pmaddubsw against plane8_shuf
611 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
612 pmaddubsw m0, [plane8_shuf] ; H coefficients
638 paddw m0, m1 ; sum of H coefficients
; gather left-edge bytes for the V sum (row bookkeeping not fully visible)
654 movzx e_reg, byte [r3+r2*2 ]
655 movzx r5, byte [r4+r1 ]
658 movzx e_reg, byte [r3 ]
660 movzx r10, byte [r4+r2 ]
664 movzx r6, byte [r4+r2 ]
670 movzx e_reg, byte [r3+r1 ]
671 movzx r6, byte [r4+r2*2 ]
678 movzx e_reg, byte [r3+r2 ]
691 movzx r3, byte [r4+r2*2 ]
; corner pixel at column 7 of the row below the first
692 movzx r4, byte [r0+r1 +7]
; broadcast H, V and base value a across word lanes
719 punpcklqdq m0, m0 ; splat H (words)
720 punpcklqdq m1, m1 ; splat V (words)
721 punpcklqdq m3, m3 ; splat a (words)
726 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
727 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
730 paddw m2, m0 ; a + {4,5,6,7}*H
; per-row running values; V added per row in code not shown
737 mova m3, m0 ; b[0..7]
740 mova m4, m0 ; V+b[0..7]
747 mova m3, m0 ; b[0..3]
748 mova m4, m2 ; b[4..7]
753 mova m5, m0 ; V+b[0..3]
754 mova m6, m2 ; V+b[4..7]
; Instantiate the 8x8 plane macro per cpu level (xmm count 0 for mmx paths).
772 H264_PRED8x8_PLANE mmx, 0
773 H264_PRED8x8_PLANE mmx2, 0
775 H264_PRED8x8_PLANE sse2, 8
776 H264_PRED8x8_PLANE ssse3, 8
778 ;-----------------------------------------------------------------------------
779 ; void pred8x8_vertical(uint8_t *src, int stride)
780 ;-----------------------------------------------------------------------------
; 2 args (r0 = src, r1 = stride), 2 gp regs. Body not visible in this fragment.
782 cglobal pred8x8_vertical_mmx, 2,2
794 ;-----------------------------------------------------------------------------
795 ; void pred8x8_horizontal(uint8_t *src, int stride)
796 ;-----------------------------------------------------------------------------
; Inside a %macro: %1 = cpu suffix. Body not visible in this fragment.
799 cglobal pred8x8_horizontal_%1, 2,3
836 ;-----------------------------------------------------------------------------
837 ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838 ;-----------------------------------------------------------------------------
; DC prediction from the top edge only (two 4-wide DC values dc0/dc1).
840 cglobal pred8x8_top_dc_mmxext, 2,5
857 pshufw mm0, mm0, 0 ; dc0 (w)
858 packuswb mm0, mm1 ; dc0,dc1 (b)
871 ;-----------------------------------------------------------------------------
872 ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
873 ;-----------------------------------------------------------------------------
876 cglobal pred8x8_dc_mmxext, 2,5
; accumulate left-column bytes (the -1 offset reads the pixel left of each
; row start) for the four 4x4 DC sums; intervening adds are not visible here
885 movzx r2d, byte [r0+r1*1-1]
886 movzx r3d, byte [r0+r1*2-1]
889 movzx r3d, byte [r0+r1*1-1]
891 movzx r3d, byte [r0+r1*2-1]
895 movzx r2d, byte [r0+r1*1-1]
896 movzx r3d, byte [r0+r1*2-1]
899 movzx r3d, byte [r0+r1*1-1]
901 movzx r3d, byte [r0+r1*2-1]
; combine the four partial sums s0..s3 into the per-quadrant DC values
908 punpckldq m0, m2 ; s0, s1, s2, s3
909 pshufw m3, m0, 11110110b ; s2, s1, s3, s3
911 pshufw m0, m0, 01110100b ; s0, s1, s3, s1
915 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
933 ;-----------------------------------------------------------------------------
934 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
935 ;-----------------------------------------------------------------------------
; RV40 variant: a single DC over the whole 8x8 block.
937 cglobal pred8x8_dc_rv40_mmxext, 2,7
; zero-extended edge-byte loads feeding the DC sum; the surrounding
; accumulation loop is not visible in this fragment
943 movzx r5d, byte [r0+r1*1]
947 movzx r2d, byte [r0+r1*0]
948 movzx r3d, byte [r0+r1*1]
953 movzx r2d, byte [r0+r1*0]
969 ;-----------------------------------------------------------------------------
970 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
971 ;-----------------------------------------------------------------------------
; 8x8 TrueMotion (VP8). %1 = cpu suffix.
973 %macro PRED8x8_TM_MMX 1
974 cglobal pred8x8_tm_vp8_%1, 2,6
; byte left of row 0
981 movzx r4d, byte [r0-1]
; left-column bytes of the next two rows
984 movzx r2d, byte [r0+r1*1-1]
985 movzx r3d, byte [r0+r1*2-1]
; instantiate the mmxext variant
1016 PRED8x8_TM_MMX mmxext
; SSE2 8x8 TrueMotion: 2 args, 6 gp regs, 4 xmm regs.
1018 cglobal pred8x8_tm_vp8_sse2, 2,6,4
; widen top-row bytes to words
1022 punpcklbw xmm0, xmm1
; top-left and per-row left-column bytes
1023 movzx r4d, byte [r0-1]
1026 movzx r2d, byte [r0+r1*1-1]
1027 movzx r3d, byte [r0+r1*2-1]
; broadcast each row's left value to all 8 word lanes
1032 pshuflw xmm2, xmm2, 0
1033 pshuflw xmm3, xmm3, 0
1034 punpcklqdq xmm2, xmm2
1035 punpcklqdq xmm3, xmm3
; write one 8-byte row from each half of xmm2
1039 movq [r0+r1*1], xmm2
1040 movhps [r0+r1*2], xmm2
; SSSE3 8x8 TrueMotion: 2 args, 3 gp regs, 6 xmm regs.
1046 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
; tm_shuf constant (see data section) — presumably a pshufb control
1048 movdqa xmm4, [tm_shuf]
; widen top-row bytes to words
1051 punpcklbw xmm0, xmm1
; load 4 bytes ending at each row's left-column pixel
1056 movd xmm2, [r0+r1*1-4]
1057 movd xmm3, [r0+r1*2-4]
; write one 8-byte row from each half of xmm2
1065 movq [r0+r1*1], xmm2
1066 movhps [r0+r1*2], xmm2
; 3-tap lowpass filter used throughout the 8x8l/4x4 predictors.
; Macro args: dest, left, right, src, tmp (body not visible in this fragment).
1072 ; dest, left, right, src, tmp
1073 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1074 %macro PRED4x4_LOWPASS 5
1084 ;-----------------------------------------------------------------------------
1085 ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1086 ;-----------------------------------------------------------------------------
; Luma 8x8 top-DC with filtered edges. %1 = cpu suffix; 4 args
; (r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride).
1088 %macro PRED8x8L_TOP_DC 1
1089 cglobal pred8x8l_top_dc_%1, 4,4
; shift neighbours into place for the lowpass filter
1097 PALIGNR mm2, mm0, 7, mm0
1098 PALIGNR mm1, mm4, 1, mm4
; branch on edge availability (fall-through targets not visible here)
1099 test r1, r1 ; top_left
1101 test r2, r2 ; top_right
1110 test r2, r2 ; top_right
; filtered top edge: (l + 2*c + r + 2) >> 2
1119 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
; instantiate with the MMX or SSSE3 palignr implementation
1136 %define PALIGNR PALIGNR_MMX
1137 PRED8x8L_TOP_DC mmxext
1138 %define PALIGNR PALIGNR_SSSE3
1139 PRED8x8L_TOP_DC ssse3
1142 ;-----------------------------------------------------------------------------
1143 ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1144 ;-----------------------------------------------------------------------------
; Luma 8x8 DC with filtered left and top edges. %1 = cpu suffix.
1146 %macro PRED8x8L_DC 1
1147 cglobal pred8x8l_dc_%1, 4,5
; interleave left-column bytes from adjacent rows (the -8 offset plus
; punpckhbw extracts the last byte of each 8-byte load, i.e. column -1)
1150 movq mm0, [r0+r3*1-8]
1151 punpckhbw mm0, [r0+r3*0-8]
1152 movq mm1, [r4+r3*1-8]
1153 punpckhbw mm1, [r0+r3*2-8]
1157 movq mm2, [r0+r3*1-8]
1158 punpckhbw mm2, [r0+r3*0-8]
1160 movq mm3, [r0+r3*1-8]
1161 punpckhbw mm3, [r0+r3*0-8]
1165 movq mm0, [r0+r3*0-8]
; align neighbours for the lowpass filter on the left edge
1170 PALIGNR mm4, mm0, 7, mm0
1171 PALIGNR mm1, mm2, 1, mm2
1198 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1201 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1203 PALIGNR mm7, mm1, 7, mm3
; same alignment/filter sequence for the top edge
1209 PALIGNR mm2, mm0, 7, mm0
1210 PALIGNR mm1, mm4, 1, mm4
1217 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
; instantiations select the palignr implementation per cpu level
1240 %define PALIGNR PALIGNR_MMX
1242 %define PALIGNR PALIGNR_SSSE3
1246 ;-----------------------------------------------------------------------------
1247 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1248 ;-----------------------------------------------------------------------------
; 3 args (r0 = src, r1 = topright, r2 = stride), 5 gp regs.
1250 cglobal pred4x4_dc_mmxext, 3,5
; accumulate the four left-column bytes into the DC sum
; (intervening add instructions not visible in this fragment)
1256 movzx r1d, byte [r0+r2*1-1]
1259 movzx r1d, byte [r0+r2*2-1]
1262 movzx r1d, byte [r0+r2*1-1]
1264 movzx r1d, byte [r0+r2*2-1]
; replicate the DC byte across all 4 bytes of a dword
1268 imul r3d, 0x01010101
1275 ;-----------------------------------------------------------------------------
1276 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1277 ;-----------------------------------------------------------------------------
; 4x4 TrueMotion (VP8). %1 = cpu suffix; r0 = src, r1 = topright, r2 = stride.
1279 %macro PRED4x4_TM_MMX 1
1280 cglobal pred4x4_tm_vp8_%1, 3,6
; top-left byte
1285 movzx r4d, byte [r0-1]
; left-column bytes of the next two rows
1288 movzx r1d, byte [r0+r2*1-1]
1289 movzx r3d, byte [r0+r2*2-1]
; instantiate the mmxext variant
1316 PRED4x4_TM_MMX mmxext
; SSSE3 4x4 TrueMotion: 3 args, 3 gp regs.
1318 cglobal pred4x4_tm_vp8_ssse3, 3,3
; load 4 bytes ending at each row's left-column pixel
; (r1 presumably re-pointed at a later row by code not visible here —
; it enters as the topright pointer; TODO confirm against the full body)
1327 movd mm2, [r0+r2*1-4]
1328 movd mm3, [r0+r2*2-4]
1329 movd mm4, [r1+r2*1-4]
1330 movd mm5, [r1+r2*2-4]
1353 ;-----------------------------------------------------------------------------
1354 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1355 ;-----------------------------------------------------------------------------
; Vertical prediction with VP8's filtered top row.
1358 cglobal pred4x4_vertical_vp8_mmxext, 3,3
1362 mova m2, m0 ;t0 t1 t2 t3
; append topright pixels to extend the top row
1363 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
1365 psrlq m0, 8 ;t1 t2 t3 t4
; filtered top row: (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2
1366 PRED4x4_LOWPASS m3, m1, m0, m2, m4
1373 ;-----------------------------------------------------------------------------
1374 ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1375 ;-----------------------------------------------------------------------------
1378 cglobal pred4x4_down_left_mmxext, 3,3
1389 PRED4x4_LOWPASS m0, m1, m3, m4, m5