1 ;******************************************************************************
2 ;* H.264 intra prediction asm optimizations
3 ;* Copyright (c) 2010 Jason Garrett-Glaser
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
; Read-only constant tables shared by the intra-prediction routines below.
26 tm_shuf: times 8 db 0x03, 0x80 ; 16-byte {0x03,0x80} pattern, loaded in pred8x8_tm_vp8_ssse3; presumably a pshufb mask (0x80 selects zero, 0x03 replicates byte 3) — TODO confirm against the unseen shuffle code
27 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 ; signed byte weights -8..-1 ...
28 db 1, 2, 3, 4, 5, 6, 7, 8 ; ... and 1..8, fed to pmaddubsw to form the H coefficient sum (pred16x16_plane ssse3 path)
29 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 ; 8x8 variant: weights -4..-1 (zero-padded) ...
30 db 1, 2, 3, 4, 0, 0, 0, 0 ; ... and 1..4, for pmaddubsw in pred8x8_plane (ssse3 path)
31 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 ; word ramp 0..7: per-column multipliers for 0*H..7*H (pmullw)
32 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 ; word ramp 1..8: right-side H weights (pmullw, plane prediction)
33 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 ; word ramp -8..-1: left-side H weights (pmullw)
34 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 ; word ramp -4..-1,1..4: H weights for the 8x8 plane case (pmullw)
45 ;-----------------------------------------------------------------------------
46 ; void pred16x16_vertical(uint8_t *src, int stride)
47 ;-----------------------------------------------------------------------------
49 cglobal pred16x16_vertical_mmx, 2,3
64 cglobal pred16x16_vertical_sse, 2,3
69 movaps [r0+r1*1], xmm0
70 movaps [r0+r1*2], xmm0
72 movaps [r0+r1*1], xmm0
73 movaps [r0+r1*2], xmm0
79 ;-----------------------------------------------------------------------------
80 ; void pred16x16_horizontal(uint8_t *src, int stride)
81 ;-----------------------------------------------------------------------------
84 cglobal pred16x16_horizontal_%1, 2,3
126 ;-----------------------------------------------------------------------------
127 ; void pred16x16_dc(uint8_t *src, int stride)
128 ;-----------------------------------------------------------------------------
130 %macro PRED16x16_DC 1
131 cglobal pred16x16_dc_%1, 2,7
139 movzx r5d, byte [r0+r1*1]
144 movzx r2d, byte [r0+r1*0]
145 movzx r3d, byte [r0+r1*1]
150 movzx r2d, byte [r0+r1*0]
197 ;-----------------------------------------------------------------------------
198 ; void pred16x16_tm_vp8(uint8_t *src, int stride)
199 ;-----------------------------------------------------------------------------
201 %macro PRED16x16_TM_MMX 1
202 cglobal pred16x16_tm_vp8_%1, 2,5
213 movzx r3d, byte [r0-1]
216 movzx r2d, byte [r0+r1-1]
243 PRED16x16_TM_MMX mmxext
245 cglobal pred16x16_tm_vp8_sse2, 2,6,6
252 movzx r4d, byte [r0-1]
255 movzx r2d, byte [r0+r1*1-1]
256 movzx r3d, byte [r0+r1*2-1]
261 pshuflw xmm2, xmm2, 0
262 pshuflw xmm4, xmm4, 0
263 punpcklqdq xmm2, xmm2
264 punpcklqdq xmm4, xmm4
273 movdqa [r0+r1*1], xmm2
274 movdqa [r0+r1*2], xmm4
280 ;-----------------------------------------------------------------------------
281 ; void pred16x16_plane(uint8_t *src, int stride)
282 ;-----------------------------------------------------------------------------
284 %macro H264_PRED16x16_PLANE 3
285 cglobal pred16x16_plane_%3_%1, 2, 7, %2
299 pmullw m0, [pw_m8tom1 ]
300 pmullw m1, [pw_m8tom1+8]
301 pmullw m2, [pw_1to8 ]
302 pmullw m3, [pw_1to8 +8]
311 pmullw m0, [pw_m8tom1]
315 movhps m0, [r0+r1 +8]
316 pmaddubsw m0, [plane_shuf] ; H coefficients
338 paddw m0, m1 ; sum of H coefficients
354 lea r3, [r3*5] ; 5*(H/4)
358 sar r3, 4 ; (5*(H/4))/16
372 movzx e_reg, byte [r3+r2*2 ]
373 movzx r5, byte [r4+r1 ]
376 movzx e_reg, byte [r3+r2 ]
381 movzx e_reg, byte [r3+r1 ]
382 movzx r6, byte [r4+r2*2 ]
386 movzx e_reg, byte [r3 ]
388 movzx r10, byte [r4+r2 ]
391 movzx r6, byte [r4+r2 ]
400 movzx r4, byte [e_reg+r2 ]
412 movzx r4, byte [e_reg ]
414 movzx r10, byte [r3 +r2 ]
418 movzx r6, byte [r3 +r2 ]
424 movzx r4, byte [e_reg+r1 ]
425 movzx r6, byte [r3 +r2*2]
432 movzx r4, byte [e_reg+r2*2]
433 movzx r6, byte [r3 +r1 ]
436 add r5, r6 ; sum of V coefficients
453 lea r5, [r5*5] ; 5*(V/4)
457 sar r5, 4 ; (5*(V/4))/16
460 movzx r4, byte [r0+r1 +15]
461 movzx r3, byte [r3+r2*2 ]
488 punpcklqdq m0, m0 ; splat H (words)
489 punpcklqdq m1, m1 ; splat V (words)
490 punpcklqdq m3, m3 ; splat a (words)
499 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
508 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
509 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
511 paddw m5, m0 ; a + {8,9,10,11}*H
512 paddw m6, m0 ; a + {12,13,14,15}*H
517 mova m3, m0 ; b[0..7]
518 mova m4, m2 ; b[8..15]
524 mova m3, m5 ; b[8..11]
525 mova m4, m6 ; b[12..15]
538 mova m3, m0 ; b[0..7]
539 mova m4, m2 ; b[8..15]
545 mova m3, m5 ; b[8..11]
546 mova m4, m6 ; b[12..15]
566 H264_PRED16x16_PLANE mmx, 0, h264
567 H264_PRED16x16_PLANE mmx, 0, rv40
568 H264_PRED16x16_PLANE mmx, 0, svq3
569 H264_PRED16x16_PLANE mmx2, 0, h264
570 H264_PRED16x16_PLANE mmx2, 0, rv40
571 H264_PRED16x16_PLANE mmx2, 0, svq3
573 H264_PRED16x16_PLANE sse2, 8, h264
574 H264_PRED16x16_PLANE sse2, 8, rv40
575 H264_PRED16x16_PLANE sse2, 8, svq3
576 H264_PRED16x16_PLANE ssse3, 8, h264
577 H264_PRED16x16_PLANE ssse3, 8, rv40
578 H264_PRED16x16_PLANE ssse3, 8, svq3
580 ;-----------------------------------------------------------------------------
581 ; void pred8x8_plane(uint8_t *src, int stride)
582 ;-----------------------------------------------------------------------------
584 %macro H264_PRED8x8_PLANE 2
585 cglobal pred8x8_plane_%1, 2, 7, %2
595 pmullw m0, [pw_m4to4]
596 pmullw m1, [pw_m4to4+8]
603 pmullw m0, [pw_m4to4]
605 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
606 pmaddubsw m0, [plane8_shuf] ; H coefficients
632 paddw m0, m1 ; sum of H coefficients
648 movzx e_reg, byte [r3+r2*2 ]
649 movzx r5, byte [r4+r1 ]
652 movzx e_reg, byte [r3 ]
654 movzx r10, byte [r4+r2 ]
658 movzx r6, byte [r4+r2 ]
664 movzx e_reg, byte [r3+r1 ]
665 movzx r6, byte [r4+r2*2 ]
672 movzx e_reg, byte [r3+r2 ]
685 movzx r3, byte [r4+r2*2 ]
686 movzx r4, byte [r0+r1 +7]
713 punpcklqdq m0, m0 ; splat H (words)
714 punpcklqdq m1, m1 ; splat V (words)
715 punpcklqdq m3, m3 ; splat a (words)
720 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
721 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
724 paddw m2, m0 ; a + {4,5,6,7}*H
731 mova m3, m0 ; b[0..7]
734 mova m4, m0 ; V+b[0..7]
741 mova m3, m0 ; b[0..3]
742 mova m4, m2 ; b[4..7]
747 mova m5, m0 ; V+b[0..3]
748 mova m6, m2 ; V+b[4..7]
766 H264_PRED8x8_PLANE mmx, 0
767 H264_PRED8x8_PLANE mmx2, 0
769 H264_PRED8x8_PLANE sse2, 8
770 H264_PRED8x8_PLANE ssse3, 8
772 ;-----------------------------------------------------------------------------
773 ; void pred8x8_vertical(uint8_t *src, int stride)
774 ;-----------------------------------------------------------------------------
776 cglobal pred8x8_vertical_mmx, 2,2
788 ;-----------------------------------------------------------------------------
789 ; void pred8x8_horizontal(uint8_t *src, int stride)
790 ;-----------------------------------------------------------------------------
793 cglobal pred8x8_horizontal_%1, 2,3
830 ;-----------------------------------------------------------------------------
831 ; void pred8x8_dc_rv40(uint8_t *src, int stride)
832 ;-----------------------------------------------------------------------------
834 cglobal pred8x8_dc_rv40_mmxext, 2,7
840 movzx r5d, byte [r0+r1*1]
844 movzx r2d, byte [r0+r1*0]
845 movzx r3d, byte [r0+r1*1]
850 movzx r2d, byte [r0+r1*0]
866 ;-----------------------------------------------------------------------------
867 ; void pred8x8_tm_vp8(uint8_t *src, int stride)
868 ;-----------------------------------------------------------------------------
870 %macro PRED8x8_TM_MMX 1
871 cglobal pred8x8_tm_vp8_%1, 2,6
878 movzx r4d, byte [r0-1]
881 movzx r2d, byte [r0+r1*1-1]
882 movzx r3d, byte [r0+r1*2-1]
913 PRED8x8_TM_MMX mmxext
915 cglobal pred8x8_tm_vp8_sse2, 2,6,4
920 movzx r4d, byte [r0-1]
923 movzx r2d, byte [r0+r1*1-1]
924 movzx r3d, byte [r0+r1*2-1]
929 pshuflw xmm2, xmm2, 0
930 pshuflw xmm3, xmm3, 0
931 punpcklqdq xmm2, xmm2
932 punpcklqdq xmm3, xmm3
937 movhps [r0+r1*2], xmm2
943 cglobal pred8x8_tm_vp8_ssse3, 2,3,6
945 movdqa xmm4, [tm_shuf]
953 movd xmm2, [r0+r1*1-4]
954 movd xmm3, [r0+r1*2-4]
963 movhps [r0+r1*2], xmm2
969 ;-----------------------------------------------------------------------------
970 ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
971 ;-----------------------------------------------------------------------------
973 cglobal pred4x4_dc_mmxext, 3,5
979 movzx r1d, byte [r0+r2*1-1]
982 movzx r1d, byte [r0+r2*2-1]
985 movzx r1d, byte [r0+r2*1-1]
987 movzx r1d, byte [r0+r2*2-1]
998 ;-----------------------------------------------------------------------------
999 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1000 ;-----------------------------------------------------------------------------
1002 %macro PRED4x4_TM_MMX 1
1003 cglobal pred4x4_tm_vp8_%1, 3,6
1008 movzx r4d, byte [r0-1]
1011 movzx r1d, byte [r0+r2*1-1]
1012 movzx r3d, byte [r0+r2*2-1]
1039 PRED4x4_TM_MMX mmxext
1041 cglobal pred4x4_tm_vp8_ssse3, 3,3
1050 movd mm2, [r0+r2*1-4]
1051 movd mm3, [r0+r2*2-4]
1052 movd mm4, [r1+r2*1-4]
1053 movd mm5, [r1+r2*2-4]
1076 ; dest, left, right, src, tmp
1077 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1078 %macro PRED4x4_LOWPASS 5
1088 ;-----------------------------------------------------------------------------
1089 ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
1090 ;-----------------------------------------------------------------------------
1093 cglobal pred4x4_vertical_vp8_mmxext, 3,3
1097 mova m2, m0 ;t0 t1 t2 t3
1098 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
1100 psrlq m0, 8 ;t1 t2 t3 t4
1101 PRED4x4_LOWPASS m3, m1, m0, m2, m4