1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Read-only word constants referenced by the predictors in this file.
36 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; eight signed words -3..4; pmaddwd/pmullw operand in pred8x8_plane_10
38 pw_pixel_max: times 8 dw ((1 << 10)-1) ; eight words of 1023, the largest value a 10-bit sample can take
44 ; PRED4x4_LOWPASS dest, left, right, src  -- 3-tap (1,2,1) rounding smoothing filter
45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; where %4 (src) supplies the centre samples t[n] and %2/%3 (left/right) the two
; neighbour vectors. NOTE(review): the macro body is not visible here, so whether
; %2 or %3 is modified as scratch cannot be stated -- check before reusing them.
46 %macro PRED4x4_LOWPASS 4
52 ;-----------------------------------------------------------------------------
53 ; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54 ;-----------------------------------------------------------------------------
; 4x4 down-right prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. The visible instructions gather
; edge samples with movhps loads of the 8 bytes that end just before a row
; address, splice them together with PALIGNR, then smooth them with
; PRED4x4_LOWPASS.
; NOTE(review): r1 is indexed with the stride (r2) exactly like r0, so it is
; presumably turned into a second row pointer by an instruction not shown.
56 cglobal pred4x4_down_right_10, 3, 3 ; x86inc: 3 arguments (r0-r2), 3 general-purpose registers
60 movhps m2, [r0+r2*1-8] ; high half of m2 = the 8 bytes ending just before [r0+r2]
65 PALIGNR m3, m1, 10, m1
66 movhps m4, [r1+r2*1-8] ; high half of m4 = the 8 bytes ending just before [r1+r2]
67 PALIGNR m0, m3, m4, 14, m4 ; byte offset 14: m3 moved up one word, top word of m4 enters at the bottom
68 movhps m4, [r1+r2*2-8] ; same fetch for the row at [r1+r2*2]
69 PALIGNR m2, m0, m4, 14, m4 ; m0 moved up one word, top word of m4 enters at the bottom
70 PRED4x4_LOWPASS m0, m2, m3, m0 ; m0 = (m2 + 2*m0 + m3 + 2) >> 2
90 ;------------------------------------------------------------------------------
91 ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
92 ;------------------------------------------------------------------------------
; 4x4 vertical-right prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. The edge vector is assembled one
; sample at a time: a movhps fetches the 8 bytes ending just before a row
; address, and the following PALIGNR (byte offset 14) moves the accumulated
; vector up one word and inserts the fetched top word at the bottom, as the
; lane diagrams on the right show (highest lane first).
94 cglobal pred4x4_vertical_right_10, 3, 3, 6 ; x86inc: 3 arguments, 3 GPRs, 6 vector registers
97 movq m5, [r0] ; ........t3t2t1t0
99 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
101 movhps m1, [r0+r2*1-8]
102 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
103 movhps m2, [r0+r2*2-8]
104 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
105 movhps m3, [r1+r2*1-8]
106 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
107 PRED4x4_LOWPASS m1, m0, m2, m1 ; m1 = (m0 + 2*m1 + m2 + 2) >> 2
112 PALIGNR m5, m0, 14, m2 ; m5 moved up one word, top word of m0 enters at the bottom (m2 is scratch)
115 PALIGNR m1, m0, 14, m0 ; same for m1; m0 itself is handed over as the scratch register
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
124 %if HAVE_AVX_EXTERNAL
129 ;-------------------------------------------------------------------------------
130 ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
131 ;-------------------------------------------------------------------------------
; 4x4 horizontal-down prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. The visible instructions pack the
; top-left, top and left neighbours into one vector (lane diagrams on the
; right, highest lane first), make copies shifted by one and two samples, and
; low-pass filter the three.
133 cglobal pred4x4_horizontal_down_10, 3, 3 ; x86inc: 3 arguments, 3 general-purpose registers
136 movq m0, [r0-8] ; lt ..
138 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
139 movq m1, [r1+r2*2-8] ; l3
141 punpcklwd m1, m3 ; l2 l3
142 movq m2, [r0+r2*2-8] ; l1
144 punpcklwd m2, m3 ; l0 l1
145 punpckhdq m1, m2 ; l0 l1 l2 l3
146 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
147 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
148 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
150 PRED4x4_LOWPASS m3, m1, m0, m3 ; m3 = (m1 + 2*m3 + m0 + 2) >> 2; m3 is the centre (one-sample shift)
153 PALIGNR m3, m5, 12, m4 ; byte offset 12: m3 moved up two words, top two words of m5 enter at the bottom
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
166 %if HAVE_AVX_EXTERNAL
171 ;-----------------------------------------------------------------------------
172 ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
173 ;-----------------------------------------------------------------------------
; 4x4 DC prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here: three word-wise additions into m2
; of memory operands that start 8 bytes before a row address.
; NOTE(review): presumably this sums the left-column neighbours (the word just
; before each row start); the initial load and the final reduction are not
; shown, so confirm against the full routine.
176 cglobal pred4x4_dc_10, 3, 3 ; x86inc: 3 arguments, 3 general-purpose registers
180 paddw m2, [r0+r2*2-8] ; m2 += words stored just before [r0+r2*2]
181 paddw m2, [r1+r2*1-8] ; m2 += words stored just before [r1+r2]
182 paddw m2, [r1+r2*2-8] ; m2 += words stored just before [r1+r2*2]
196 ;-----------------------------------------------------------------------------
197 ; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
198 ;-----------------------------------------------------------------------------
; 4x4 down-left prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here: the top lane of m2 is padded by
; replication, then the edge is low-pass filtered.
200 cglobal pred4x4_down_left_10, 3, 3 ; x86inc: 3 arguments, 3 general-purpose registers
206 pshufhw m2, m2, 10100100b ; high words become 4,5,6,6: lane 7 is overwritten with a copy of lane 6
207 PRED4x4_LOWPASS m0, m3, m2, m0 ; m0 = (m3 + 2*m0 + m2 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
221 %if HAVE_AVX_EXTERNAL
226 ;-----------------------------------------------------------------------------
227 ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
228 ;-----------------------------------------------------------------------------
; 4x4 vertical-left prediction, 10-bit samples (16-bit words).
; Only the entry point and the filtering step are visible here; the loads that
; prepare m0/m1/m2 and the stores of the result are not shown.
230 cglobal pred4x4_vertical_left_10, 3, 3 ; x86inc: 3 arguments, 3 general-purpose registers
237 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 = (m1 + 2*m0 + m2 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
250 %if HAVE_AVX_EXTERNAL
255 ;-----------------------------------------------------------------------------
256 ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
257 ;-----------------------------------------------------------------------------
; 4x4 horizontal-up prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. pshufw only exists for 64-bit MMX
; registers, so m0..m5 are mm registers holding four samples each.
259 cglobal pred4x4_horizontal_up_10, 3, 3 ; x86inc: 3 arguments, 3 general-purpose registers
263 punpckhwd m0, [r0+r2*2-8] ; interleave the top words of m0 with those stored just before [r0+r2*2]
265 punpckhwd m1, [r1+r2*2-8] ; same for m1 and the words stored just before [r1+r2*2]
270 pshufw m2, m0, 11111001b ; m2 = words 1,2,3,3 of m0 (one-sample shift, top sample repeated)
274 pshufw m5, m0, 11111110b ; m5 = words 2,3,3,3 of m0 (two-sample shift, top sample repeated)
275 PRED4x4_LOWPASS m1, m0, m5, m1 ; m1 = (m0 + 2*m1 + m5 + 2) >> 2
289 ;-----------------------------------------------------------------------------
290 ; void ff_pred8x8_vertical(pixel *src, int stride)
291 ;-----------------------------------------------------------------------------
; Entry point only; the body of this routine is not visible here.
293 cglobal pred8x8_vertical_10, 2, 2 ; x86inc: 2 arguments (r0, r1), 2 general-purpose registers
305 ;-----------------------------------------------------------------------------
306 ; void ff_pred8x8_horizontal(pixel *src, int stride)
307 ;-----------------------------------------------------------------------------
; Entry point only; the body of this routine is not visible here.
309 cglobal pred8x8_horizontal_10, 2, 3 ; x86inc: 2 arguments (r0, r1), 3 general-purpose registers (r2 is scratch)
325 ;-----------------------------------------------------------------------------
326 ; void ff_pred8x8_dc(pixel *src, int stride)
327 ;-----------------------------------------------------------------------------
329 ; sort of a hack, but it works
; 8x8 DC prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. %1 is a parameter of the enclosing
; macro (its %macro line is not shown) and is used as a shuffle mnemonic.
; The scalar movzx loads read the 16-bit word stored 2 bytes before a row
; address, i.e. the sample immediately left of that row.
; NOTE(review): the addressing pattern suggests r4 = r0 + 4*stride and
; r5 = 3*stride, set up by instructions not shown -- confirm.
339 cglobal pred8x8_dc_10, 2, 6 ; x86inc: 2 arguments, 6 general-purpose registers
349 pshufw m2, m0, 00001110b ; m2 = words 2,3,0,0 of m0 (upper pair moved down to lanes 0-1)
350 pshufw m3, m1, 00001110b ; m3 = words 2,3,0,0 of m1
360 movzx r2d, word [r0+r1*1-2] ; left neighbour of the row at r0+r1
361 movzx r3d, word [r0+r1*2-2] ; left neighbour of the row at r0+r1*2
363 movzx r3d, word [r0+r5*1-2] ; left neighbour of the row at r0+r5
365 movzx r3d, word [r4-2] ; left neighbour of the row at r4
369 movzx r2d, word [r4+r1*1-2] ; left neighbour of the row at r4+r1
370 movzx r3d, word [r4+r1*2-2] ; left neighbour of the row at r4+r1*2
372 movzx r3d, word [r4+r5*1-2] ; left neighbour of the row at r4+r5
374 movzx r3d, word [r4+r1*4-2] ; left neighbour of the row at r4+r1*4
379 punpckldq m0, m2 ; s0, s1, s2, s3
380 %1 m3, m0, 11110110b ; s2, s1, s3, s3
381 %1 m0, m0, 01110100b ; s0, s1, s3, s1
384 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
387 pshufd m3, m0, 11111010b ; m3 = dwords 2,2,3,3 of m0
412 ;-----------------------------------------------------------------------------
413 ; void ff_pred8x8_top_dc(pixel *src, int stride)
414 ;-----------------------------------------------------------------------------
; Entry point only; the body of this routine is not visible here.
416 cglobal pred8x8_top_dc_10, 2, 4 ; x86inc: 2 arguments (r0, r1), 4 general-purpose registers
439 ;-----------------------------------------------------------------------------
440 ; void ff_pred8x8_plane(pixel *src, int stride)
441 ;-----------------------------------------------------------------------------
; 8x8 plane prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. The visible instructions weight
; samples with the -3..4 table and load the left-column neighbours as scalars
; (each movzx reads the word 2 bytes before a row address).
; The address comments below are mutually consistent only if r0 = src - stride,
; r2 = 3*stride and r3 = src + 3*stride; that setup is done by instructions not
; shown here -- TODO confirm.
443 cglobal pred8x8_plane_10, 2, 7, 7 ; x86inc: 2 arguments, 7 GPRs, 7 vector registers
448 pmaddwd m2, [pw_m32101234] ; multiply words by -3..4 and add adjacent products into dwords
456 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
457 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
458 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
460 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
461 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
464 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
465 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
469 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
470 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
479 mova m3, [pw_pixel_max] ; m3 = 1023 in every word (presumably the upper clamp bound -- confirm)
484 pmullw m2, [pw_m32101234] ; b
485 pmullw m5, m4, [pw_m3] ; c
502 ;-----------------------------------------------------------------------------
503 ; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
505 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_128_dc_10. Only its first lines are visible here;
; the stores and the %endmacro are not shown.
506 %macro PRED8x8L_128_DC 0
507 cglobal pred8x8l_128_dc_10, 4, 4 ; x86inc: 4 arguments (r0-r3), 4 general-purpose registers
508 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) = 512, the mid-range value for 10-bit samples
527 ;-----------------------------------------------------------------------------
528 ; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
530 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_top_dc_10. Only part of the body is visible here.
; NOTE(review): r1 and r2 enter as has_topleft / has_topright but are used here
; as byte offsets, so they are presumably converted by instructions not shown.
531 %macro PRED8x8L_TOP_DC 0
532 cglobal pred8x8l_top_dc_10, 4, 4, 6 ; x86inc: 4 arguments, 4 GPRs, 6 vector registers
540 pinsrw m1, [r0+r1], 0 ; lane 0 of m1 = word at [r0+r1]
541 pinsrw m2, [r0+r2+14], 7 ; lane 7 of m2 = word at [r0+r2+14]
544 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
562 %if HAVE_AVX_EXTERNAL
567 ;-------------------------------------------------------------------------------
568 ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
569 ;-------------------------------------------------------------------------------
570 ;TODO: see if scalar is faster
; 8x8 luma DC prediction, 10-bit samples (16-bit words).
; Only part of the routine is visible here. Each mova reads the 16 bytes that
; end just before a row address, so its top word is the sample immediately
; left of that row; the paired punpckhwd interleaves the top words of two
; such rows.
; NOTE(review): the addressing pattern suggests r4 = r0 + 4*stride and
; r5 = 3*stride, and r1/r2 are used as byte offsets rather than as the raw
; has_topleft/has_topright flags; all of that setup is not shown -- confirm.
572 cglobal pred8x8l_dc_10, 4, 6, 6 ; x86inc: 4 arguments, 6 GPRs, 6 vector registers
576 mova m0, [r0+r3*2-16]
577 punpckhwd m0, [r0+r3*1-16]
578 mova m1, [r4+r3*0-16]
579 punpckhwd m1, [r0+r5*1-16]
581 mova m2, [r4+r3*2-16]
582 punpckhwd m2, [r4+r3*1-16]
583 mova m3, [r4+r3*4-16]
584 punpckhwd m3, [r4+r5*1-16]
593 pinsrw m1, [r0+r1], 0 ; lane 0 of m1 = word at [r0+r1]
594 pinsrw m2, [r0+r2+14], 7 ; lane 7 of m2 = word at [r0+r2+14]
599 pshuflw m4, m4, 11100101b ; low words become 1,1,2,3: lane 0 is overwritten with a copy of lane 1
600 pinsrw m5, [r0+r1-2], 7 ; lane 7 of m5 = word at [r0+r1-2]
601 PRED4x4_LOWPASS m3, m4, m5, m3 ; m3 = (m4 + 2*m3 + m5 + 2) >> 2
602 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
621 %if HAVE_AVX_EXTERNAL
626 ;-----------------------------------------------------------------------------
627 ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
629 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_vertical_10. Only part of the body is visible here.
; NOTE(review): r1 and r2 enter as has_topleft / has_topright but are used here
; as byte offsets, so they are presumably converted by instructions not shown.
630 %macro PRED8x8L_VERTICAL 0
631 cglobal pred8x8l_vertical_10, 4, 4, 6 ; x86inc: 4 arguments, 4 GPRs, 6 vector registers
639 pinsrw m1, [r0+r1], 0 ; lane 0 of m1 = word at [r0+r1]
640 pinsrw m2, [r0+r2+14], 7 ; lane 7 of m2 = word at [r0+r2+14]
643 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
657 %if HAVE_AVX_EXTERNAL
662 ;-----------------------------------------------------------------------------
663 ; void ff_pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
665 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_horizontal_10. Only part of the body is visible.
; Each mova reads the 16 bytes that end just before a row address, so its top
; word is the sample immediately left of that row; the paired punpckhwd
; interleaves the top words of two such rows.
; NOTE(review): r1 and r2 are arguments (has_topleft, has_topright) but are
; used here as an offset and a row pointer; the instructions that repurpose
; them are not shown -- confirm.
666 %macro PRED8x8L_HORIZONTAL 0
667 cglobal pred8x8l_horizontal_10, 4, 4, 5 ; x86inc: 4 arguments, 4 GPRs, 5 vector registers
673 punpckhwd m0, [r0+r1-16]
674 mova m1, [r0+r3*2-16]
675 punpckhwd m1, [r0+r3*1-16]
679 mova m2, [r2+r3*0-16]
680 punpckhwd m2, [r0+r1-16]
681 mova m3, [r2+r3*2-16]
682 punpckhwd m3, [r2+r3*1-16]
685 PALIGNR m4, m3, [r2+r1-16], 14, m0 ; m3 moved up one word; top word of the 16 bytes before [r2+r1] enters at the bottom
687 pshuflw m0, m0, 11100101b ; low words become 1,1,2,3: lane 0 is overwritten with a copy of lane 1
688 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 = (m3 + 2*m4 + m0 + 2) >> 2
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
714 %if HAVE_AVX_EXTERNAL
719 ;-----------------------------------------------------------------------------
720 ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
722 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_down_left_10. Only part of the body is visible.
; Visible flow: patch the ends of the shifted top-row copies, filter the top
; edge, branch on the top-right availability test, filter again, then run a
; chain of one-word PALIGNR shifts.
; NOTE(review): the row stores between the final PALIGNR steps and the
; .fix_tr label itself are not shown.
723 %macro PRED8x8L_DOWN_LEFT 0
724 cglobal pred8x8l_down_left_10, 4, 4, 7 ; x86inc: 4 arguments, 4 GPRs, 7 vector registers
732 pinsrw m1, [r0+r1], 0 ; lane 0 of m1 = word at [r0+r1]
733 pinsrw m2, [r0+r2+14], 7 ; lane 7 of m2 = word at [r0+r2+14]
734 PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 = (m2 + 2*m3 + m1 + 2) >> 2
735 jz .fix_tr ; flags from shr r2d (pinsrw leaves EFLAGS untouched)
738 PALIGNR m2, m1, m3, 14, m3 ; m1 moved up one word, top word of m3 enters at the bottom
739 pshufhw m5, m5, 10100100b ; high words become 4,5,6,6: lane 7 is overwritten with a copy of lane 6
740 PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 = (m2 + 2*m1 + m5 + 2) >> 2
745 PALIGNR m2, m1, m6, 2, m0 ; byte offset 2: m6 moved down one word, low word of m1 enters at the top
746 PALIGNR m3, m1, m6, 14, m0 ; byte offset 14: m1 moved up one word, top word of m6 enters at the bottom
747 PALIGNR m5, m1, 2, m0 ; m1 moved down one word, low word of m5 enters at the top
749 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 = (m4 + 2*m6 + m2 + 2) >> 2
750 PRED4x4_LOWPASS m1, m3, m5, m1 ; m1 = (m3 + 2*m1 + m5 + 2) >> 2
; Each step below moves m1 up one word and inserts the top word of m6 at the
; bottom; presumably one output row is stored per step (stores not shown).
752 PALIGNR m1, m6, 14, m2
755 PALIGNR m1, m6, 14, m2
758 PALIGNR m1, m6, 14, m2
761 PALIGNR m1, m6, 14, m2
764 PALIGNR m1, m6, 14, m2
767 PALIGNR m1, m6, 14, m2
770 PALIGNR m1, m6, 14, m6 ; final step hands m6 itself over as the scratch register
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
783 %if HAVE_AVX_EXTERNAL
788 ;-----------------------------------------------------------------------------
789 ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
791 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_down_right_10. Only part of the body is visible.
; Each mova reads the 16 bytes that end just before a row address, so its top
; word is the sample immediately left of that row; the paired punpckhwd
; interleaves the top words of two such rows.
; NOTE(review): the addressing pattern suggests r1 = 3*stride and
; r4 = r0 + 4*stride at this point, set up by instructions not shown -- confirm.
792 %macro PRED8x8L_DOWN_RIGHT 0
793 ; the standard forbids this prediction mode when has_topleft is false
795 cglobal pred8x8l_down_right_10, 4, 5, 8 ; x86inc: 4 arguments, 5 GPRs, 8 vector registers
799 mova m0, [r0+r3*1-16]
800 punpckhwd m0, [r0+r3*0-16]
801 mova m1, [r0+r1*1-16]
802 punpckhwd m1, [r0+r3*2-16]
804 mova m2, [r4+r3*1-16]
805 punpckhwd m2, [r4+r3*0-16]
806 mova m3, [r4+r1*1-16]
807 punpckhwd m3, [r4+r3*2-16]
810 mova m0, [r4+r3*4-16]
812 PALIGNR m4, m3, m0, 14, m0 ; m3 moved up one word, top word of m0 enters at the bottom
813 PALIGNR m1, m3, 2, m2 ; m3 moved down one word, low word of m1 enters at the top
815 pshuflw m0, m0, 11100101b ; low words become 1,1,2,3: lane 0 is overwritten with a copy of lane 1
816 PRED4x4_LOWPASS m6, m1, m4, m3 ; m6 = (m1 + 2*m3 + m4 + 2) >> 2
817 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 = (m3 + 2*m4 + m0 + 2) >> 2
823 pinsrw m2, [r0+r2+14], 7 ; lane 7 of m2 = word at [r0+r2+14]
824 PRED4x4_LOWPASS m3, m2, m1, m3 ; m3 = (m2 + 2*m3 + m1 + 2) >> 2
825 PALIGNR m2, m3, m6, 2, m0 ; byte offset 2: m6 moved down one word, low word of m3 enters at the top
826 PALIGNR m5, m3, m6, 14, m0 ; byte offset 14: m3 moved up one word, top word of m6 enters at the bottom
828 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 = (m4 + 2*m6 + m2 + 2) >> 2
829 PRED4x4_LOWPASS m3, m5, m7, m3 ; m3 = (m5 + 2*m3 + m7 + 2) >> 2
; Each step below moves m3 up one word and inserts the top word of m6 at the
; bottom; presumably one output row is stored per step (stores not shown).
831 PALIGNR m3, m6, 14, m2
834 PALIGNR m3, m6, 14, m2
837 PALIGNR m3, m6, 14, m2
840 PALIGNR m3, m6, 14, m2
843 PALIGNR m3, m6, 14, m2
846 PALIGNR m3, m6, 14, m2
849 PALIGNR m3, m6, 14, m6 ; final step hands m6 itself over as the scratch register
; Lines after this directive are assembled only when HAVE_AVX_EXTERNAL is non-zero.
858 %if HAVE_AVX_EXTERNAL
863 ;-----------------------------------------------------------------------------
864 ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
865 ; int has_topright, int stride)
866 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_vertical_right_10. Only part of the body is
; visible. Each mova reads the 16 bytes that end just before a row address, so
; its top word is the sample immediately left of that row; the paired
; punpckhwd interleaves the top words of two such rows.
; NOTE(review): the addressing pattern suggests r1 = 3*stride and
; r4 = r0 + 4*stride at this point, set up by instructions not shown -- confirm.
867 %macro PRED8x8L_VERTICAL_RIGHT 0
868 ; likewise with 8x8l_down_right
869 cglobal pred8x8l_vertical_right_10, 4, 5, 7 ; x86inc: 4 arguments, 5 GPRs, 7 vector registers
873 mova m0, [r0+r3*1-16]
874 punpckhwd m0, [r0+r3*0-16]
875 mova m1, [r0+r1*1-16]
876 punpckhwd m1, [r0+r3*2-16]
878 mova m2, [r4+r3*1-16]
879 punpckhwd m2, [r4+r3*0-16]
880 mova m3, [r4+r1*1-16]
881 punpckhwd m3, [r4+r3*2-16]
884 mova m0, [r4+r3*4-16]
886 PALIGNR m4, m3, m0, 14, m0 ; m3 moved up one word, top word of m0 enters at the bottom
887 PALIGNR m1, m3, 2, m2 ; m3 moved down one word, low word of m1 enters at the top
888 PRED4x4_LOWPASS m3, m1, m4, m3 ; m3 = (m1 + 2*m3 + m4 + 2) >> 2
894 pinsrw m5, [r0+r2+14], 7 ; lane 7 of m5 = word at [r0+r2+14]
895 PRED4x4_LOWPASS m2, m5, m1, m2 ; m2 = (m5 + 2*m2 + m1 + 2) >> 2
896 PALIGNR m6, m2, m3, 12, m1 ; byte offset 12: m2 moved up two words, top two words of m3 enter at the bottom
897 PALIGNR m5, m2, m3, 14, m0 ; byte offset 14: m2 moved up one word, top word of m3 enters at the bottom
898 PRED4x4_LOWPASS m0, m6, m2, m5 ; m0 = (m6 + 2*m5 + m2 + 2) >> 2
904 PRED4x4_LOWPASS m1, m3, m6, m1 ; m1 = (m3 + 2*m1 + m6 + 2) >> 2
; The steps below alternately move m2 and m0 up one word, each time inserting
; the top word of m1 at the bottom (the stores in between are not shown).
905 PALIGNR m2, m1, 14, m4
908 PALIGNR m0, m1, 14, m3
911 PALIGNR m2, m1, 14, m4
914 PALIGNR m0, m1, 14, m3
917 PALIGNR m2, m1, 14, m4
920 PALIGNR m0, m1, 14, m1 ; final step hands m1 itself over as the scratch register
; The macro is expanded several times; presumably each expansion follows an
; INIT_* line selecting a different instruction set (those lines are not
; shown) -- TODO confirm. The last expansion sits under HAVE_AVX_EXTERNAL.
926 PRED8x8L_VERTICAL_RIGHT
928 PRED8x8L_VERTICAL_RIGHT
929 %if HAVE_AVX_EXTERNAL
931 PRED8x8L_VERTICAL_RIGHT
934 ;-----------------------------------------------------------------------------
935 ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
936 ; int has_topright, int stride)
937 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_horizontal_up_10. Only part of the body is
; visible. Each mova reads the 16 bytes that end just before a row address, so
; its top word is the sample immediately left of that row; the paired
; punpckhwd interleaves the top words of two such rows.
; NOTE(review): r1 and r2 are arguments (has_topleft, has_topright) but are
; used here as an offset and a row pointer; the instructions that repurpose
; them are not shown -- confirm.
938 %macro PRED8x8L_HORIZONTAL_UP 0
939 cglobal pred8x8l_horizontal_up_10, 4, 4, 6 ; x86inc: 4 arguments, 4 GPRs, 6 vector registers
940 mova m0, [r0+r3*0-16]
941 punpckhwd m0, [r0+r3*1-16]
946 mova m4, [r0+r1*1-16]
949 mova m1, [r0+r3*2-16]
950 punpckhwd m1, [r0+r1*1-16]
952 mova m2, [r2+r3*0-16]
953 punpckhwd m2, [r2+r3*1-16]
954 mova m3, [r2+r3*2-16]
955 punpckhwd m3, [r2+r1*1-16]
958 PALIGNR m1, m0, m4, 14, m4 ; m0 moved up one word, top word of m4 enters at the bottom
960 pshufhw m2, m2, 10100100b ; high words become 4,5,6,6: lane 7 is overwritten with a copy of lane 6
961 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 = (m1 + 2*m0 + m2 + 2) >> 2
964 pshufhw m1, m1, 10100100b ; high words become 4,5,6,6
965 pshufhw m2, m2, 01010100b ; high words become 4,5,5,5
967 PRED4x4_LOWPASS m1, m2, m0, m1 ; m1 = (m2 + 2*m1 + m0 + 2) >> 2
972 pshufd m0, m5, 11111001b ; m0 = dwords 1,2,3,3 of m5
973 pshufd m1, m5, 11111110b ; m1 = dwords 2,3,3,3 of m5
974 pshufd m2, m5, 11111111b ; m2 = dword 3 of m5 in every lane
978 PALIGNR m2, m5, m4, 4, m0 ; byte offset 4: m4 moved down two words, low two words of m5 enter at the top
979 PALIGNR m3, m5, m4, 8, m1 ; byte offset 8: high half of m4 below, low half of m5 above
980 PALIGNR m5, m5, m4, 12, m4 ; byte offset 12: top two words of m4 below, low six words of m5 above
; The macro is expanded several times; presumably each expansion follows an
; INIT_* line selecting a different instruction set (those lines are not
; shown) -- TODO confirm. The last expansion sits under HAVE_AVX_EXTERNAL.
988 PRED8x8L_HORIZONTAL_UP
990 PRED8x8L_HORIZONTAL_UP
991 %if HAVE_AVX_EXTERNAL
993 PRED8x8L_HORIZONTAL_UP
997 ;-----------------------------------------------------------------------------
998 ; void ff_pred16x16_vertical(pixel *src, int stride)
999 ;-----------------------------------------------------------------------------
1002 mova [%1+mmsize], %3
; Macro that emits pred16x16_vertical_10. Only part of the body is visible.
; NOTE(review): MOV16 is a helper macro whose definition is almost entirely
; outside this view (one store line, `mova [%1+mmsize], %3`, is visible);
; presumably it writes one 16-sample row at the address given as its first
; argument from the four listed registers -- confirm.
1009 %macro PRED16x16_VERTICAL 0
1010 cglobal pred16x16_vertical_10, 2, 3 ; x86inc: 2 arguments (r0, r1), 3 general-purpose registers
1014 mova m1, [r0+mmsize] ; second register-width chunk of the row at r0
1020 MOV16 r0+r1*1, m0, m1, m2, m3 ; row at r0+r1
1021 MOV16 r0+r1*2, m0, m1, m2, m3 ; row at r0+r1*2
1033 ;-----------------------------------------------------------------------------
1034 ; void ff_pred16x16_horizontal(pixel *src, int stride)
1035 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_horizontal_10. Only part of the body is visible.
; Each movd reads the 4 bytes that end just before a row address; the upper
; word of those 4 bytes is the sample immediately left of the row.
; NOTE(review): the step that replicates that sample across the register is
; not shown, and MOV16 is defined outside this view (presumably it writes one
; 16-sample row from the four listed registers) -- confirm.
1036 %macro PRED16x16_HORIZONTAL 0
1037 cglobal pred16x16_horizontal_10, 2, 3 ; x86inc: 2 arguments (r0, r1), 3 general-purpose registers
1040 movd m0, [r0+r1*0-4] ; 4 bytes ending just before the row at r0
1041 movd m1, [r0+r1*1-4] ; 4 bytes ending just before the row at r0+r1
1044 MOV16 r0+r1*0, m0, m0, m0, m0 ; row at r0, same register for every chunk
1045 MOV16 r0+r1*1, m1, m1, m1, m1 ; row at r0+r1, same register for every chunk
; The macro is expanded twice; presumably each expansion follows an INIT_* line
; selecting a different instruction set (not shown) -- TODO confirm.
1053 PRED16x16_HORIZONTAL
1055 PRED16x16_HORIZONTAL
1057 ;-----------------------------------------------------------------------------
1058 ; void ff_pred16x16_dc(pixel *src, int stride)
1059 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_dc_10. Only part of the body is visible: one
; vector addition of row data and several scalar word loads.
; NOTE(review): the scalar loads read [r0] and [r0+r1] with no -2 offset, so
; r0 has presumably been moved to the left-neighbour column by an instruction
; not shown, with the original pointer kept in r5 for the stores -- confirm.
; MOV16 is defined outside this view (presumably it writes one 16-sample row
; from the four listed registers).
1060 %macro PRED16x16_DC 0
1061 cglobal pred16x16_dc_10, 2, 6 ; x86inc: 2 arguments, 6 general-purpose registers
1065 paddw m0, [r0+mmsize] ; m0 += second register-width chunk at r0 (word-wise)
1073 movzx r3d, word [r0] ; zero-extended 16-bit sample at r0
1074 movzx r4d, word [r0+r1] ; zero-extended 16-bit sample one stride further
1077 movzx r2d, word [r0]
1079 movzx r2d, word [r0+r1]
1090 MOV16 r5+r1*0, m0, m0, m0, m0 ; row at r5, same register for every chunk
1091 MOV16 r5+r1*1, m0, m0, m0, m0 ; row at r5+r1, same register for every chunk
1103 ;-----------------------------------------------------------------------------
1104 ; void ff_pred16x16_top_dc(pixel *src, int stride)
1105 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_top_dc_10. Only part of the body is visible.
; NOTE(review): MOV16 is defined outside this view (presumably it writes one
; 16-sample row from the four listed registers); the reduction that turns the
; summed row into a single DC value is not shown -- confirm.
1106 %macro PRED16x16_TOP_DC 0
1107 cglobal pred16x16_top_dc_10, 2, 3 ; x86inc: 2 arguments (r0, r1), 3 general-purpose registers
1110 paddw m0, [r0+mmsize] ; m0 += second register-width chunk at r0 (word-wise)
1122 MOV16 r0+r1*1, m0, m0, m0, m0 ; row at r0+r1, same register for every chunk
1123 MOV16 r0+r1*2, m0, m0, m0, m0 ; row at r0+r1*2, same register for every chunk
1135 ;-----------------------------------------------------------------------------
1136 ; void ff_pred16x16_left_dc(pixel *src, int stride)
1137 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_left_dc_10. Only part of the body is visible:
; scalar word loads followed by row stores.
; NOTE(review): the scalar loads read [r0] and [r0+r1] with no -2 offset, so
; r0 has presumably been moved to the left-neighbour column by an instruction
; not shown, with the original pointer kept in r5 for the stores -- confirm.
; MOV16 is defined outside this view (presumably it writes one 16-sample row
; from the four listed registers).
1138 %macro PRED16x16_LEFT_DC 0
1139 cglobal pred16x16_left_dc_10, 2, 6 ; x86inc: 2 arguments, 6 general-purpose registers
1143 movzx r3d, word [r0] ; zero-extended 16-bit sample at r0
1144 movzx r4d, word [r0+r1] ; zero-extended 16-bit sample one stride further
1147 movzx r2d, word [r0]
1149 movzx r2d, word [r0+r1]
1159 MOV16 r5+r1*0, m0, m0, m0, m0 ; row at r5, same register for every chunk
1160 MOV16 r5+r1*1, m0, m0, m0, m0 ; row at r5+r1, same register for every chunk
1172 ;-----------------------------------------------------------------------------
1173 ; void ff_pred16x16_128_dc(pixel *src, int stride)
1174 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_128_dc_10. Only part of the body is visible.
; NOTE(review): the load that fills m0 is not shown, and MOV16 is defined
; outside this view (presumably it writes one 16-sample row from the four
; listed registers) -- confirm.
1175 %macro PRED16x16_128_DC 0
1176 cglobal pred16x16_128_dc_10, 2,3 ; x86inc: 2 arguments (r0, r1), 3 general-purpose registers
1180 MOV16 r0+r1*0, m0, m0, m0, m0 ; row at r0, same register for every chunk
1181 MOV16 r0+r1*1, m0, m0, m0, m0 ; row at r0+r1, same register for every chunk