1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Packed-word constants referenced by the prediction routines in this file.
35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-lane weights -3..4; memory operand of pmaddwd/pmullw in pred8x8_plane_10
37 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 8 x 1023, the largest value a 10-bit sample can take
38 pw_512: times 8 dw 512 ; 8 x 512 = 1 << (10 - 1); loaded by pred8x8l_128_dc_10
44 ; dest, left, right, src
45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; i.e. a rounded 1-2-1 filter over packed words. Presumably %2/%3 carry the
; t[n-1]/t[n+1] neighbours and %4 carries t[n]; the macro body is not shown
; here, so confirm the operand roles against it.
46 %macro PRED4x4_LOWPASS 4
52 ;-----------------------------------------------------------------------------
53 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54 ;-----------------------------------------------------------------------------
; 10-bit 4x4 down-right intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments are loaded and 3 GPRs are used, so on entry
; r0 = src, r1 = topright, r2 = stride, following the prototype's order.
; NOTE(review): r0/r1 may be re-based before the loads below; the instructions
; that would do so are not visible here - confirm.
; PALIGNR is the x86util wrapper: (dst, hi, lo, bytes, tmp), or with four
; operands (dst/hi, lo, bytes, tmp); dst = low 16 bytes of (hi:lo >> bytes).
56 cglobal pred4x4_down_right_10, 3, 3
60 movhps m2, [r0+r2*1-8] ; high half of m2 <- the 8 bytes ending right before address r0+r2
65 PALIGNR m3, m1, 10, m1 ; m3 keeps its low 10 bytes (moved up) and takes the top 6 bytes of m1 below them
66 movhps m4, [r1+r2*1-8] ; high half of m4 <- the 8 bytes ending right before address r1+r2
67 PALIGNR m0, m3, m4, 14, m4 ; m0 = m3 moved up one word, lowest word = top word of m4
68 movhps m4, [r1+r2*2-8] ; high half of m4 <- the 8 bytes ending right before address r1+2*r2
69 PALIGNR m2, m0, m4, 14, m4 ; m2 = m0 moved up one word, lowest word = top word of m4
70 PRED4x4_LOWPASS m0, m2, m3, m0 ; m0 <- 1-2-1 filter of m0 with m2 and m3 as the left/right operands
90 ;-----------------------------------------------------------------------------
91 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
92 ;-----------------------------------------------------------------------------
; 10-bit 4x4 vertical-right intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs, 6 vector registers declared.
; Lane diagrams list words from most significant (left) to least significant
; (right): t* = top row, lt = top-left, l* = left column.
; The edge vector is assembled one word at a time: every 14-byte PALIGNR moves
; the previous vector up by one word and inserts the top word of the register
; that was just loaded.
94 cglobal pred4x4_vertical_right_10, 3, 3, 6
97 movq m5, [r0] ; ........t3t2t1t0
99 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
101 movhps m1, [r0+r2*1-8] ; high half of m1 <- 8 bytes ending right before r0+r2; top word is l0 (see next diagram)
102 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
103 movhps m2, [r0+r2*2-8] ; same load for the row at r0+2*r2; top word is l1
104 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
105 movhps m3, [r1+r2*1-8] ; same load for the row at r1+r2; top word is l2
106 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
107 PRED4x4_LOWPASS m1, m0, m2, m1 ; 1-2-1 filter of m1; m0 and m2 hold the same edge one word to either side (see diagrams)
112 PALIGNR m5, m0, 14, m2 ; m5 moved up one word, lowest word = top word of m0
115 PALIGNR m1, m0, 14, m0 ; m1 moved up one word, lowest word = top word of m0 (m0 also passed as scratch)
124 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
129 ;-----------------------------------------------------------------------------
130 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
131 ;-----------------------------------------------------------------------------
; 10-bit 4x4 horizontal-down intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs.
; Lane diagrams list words from most significant (left) to least significant
; (right), so a right byte shift (psrldq) moves entries to the right in the
; diagram. The code gathers top row, top-left and left column into one
; register (t2 t1 t0 lt l0 l1 l2 l3) and then filters shifted copies of it.
133 cglobal pred4x4_horizontal_down_10, 3, 3
136 movq m0, [r0-8] ; lt ..
138 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
139 movq m1, [r1+r2*2-8] ; l3
141 punpcklwd m1, m3 ; l2 l3
142 movq m2, [r0+r2*2-8] ; l1
144 punpcklwd m2, m3 ; l0 l1
145 punpckhdq m1, m2 ; l0 l1 l2 l3
146 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; three-operand psrldq: non-destructive form, m1 itself is left unchanged
147 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
148 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
150 PRED4x4_LOWPASS m3, m1, m0, m3 ; 1-2-1 filter of m3 (centre) with m1 and m0, the edge shifted one word either way
153 PALIGNR m3, m5, 12, m4 ; m3 moved up by 12 bytes, low 12 bytes filled from the top 12 bytes of m5
166 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
171 ;-----------------------------------------------------------------------------
172 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
173 ;-----------------------------------------------------------------------------
174 %macro HADDD 2 ; %1 = sum register, %2 = junk (presumably a scratch register the macro may overwrite - body not shown)
; 10-bit 4x4 DC intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs.
; The visible lines accumulate packed words read from just left of three row
; addresses into m2; presumably only the lane adjacent to the block edge is
; consumed afterwards (remaining code not shown).
192 cglobal pred4x4_dc_10, 3, 3
196 paddw m2, [r0+r2*2-8] ; add the words starting 8 bytes before address r0+2*r2
197 paddw m2, [r1+r2*1-8] ; add the words starting 8 bytes before address r1+r2
198 paddw m2, [r1+r2*2-8] ; add the words starting 8 bytes before address r1+2*r2
212 ;-----------------------------------------------------------------------------
213 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
214 ;-----------------------------------------------------------------------------
; 10-bit 4x4 down-left intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs.
216 cglobal pred4x4_down_left_10, 3, 3
222 pshufhw m2, m2, 10100100b ; high words become {4,5,6,6}: word 6 is copied over word 7, low words untouched
223 PRED4x4_LOWPASS m0, m3, m2, m0 ; m0 <- 1-2-1 filter of m0 with m3 and m2 as the left/right operands
237 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
242 ;-----------------------------------------------------------------------------
243 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
244 ;-----------------------------------------------------------------------------
; 10-bit 4x4 vertical-left intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs.
246 cglobal pred4x4_vertical_left_10, 3, 3
253 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 <- 1-2-1 filter of m0 with m1 and m2 as the left/right operands
266 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
271 ;-----------------------------------------------------------------------------
272 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
273 ;-----------------------------------------------------------------------------
; 10-bit 4x4 horizontal-up intra predictor (prototype in the comment above).
; cglobal (x86inc): 3 arguments, 3 GPRs.
; Shuffle results below are listed from lane 0 (lowest word) upwards.
275 cglobal pred4x4_horizontal_up_10, 3, 3
279 punpckhwd m0, [r0+r2*2-8] ; interleave the high words of m0 with those starting 8 bytes before r0+2*r2
281 punpckhwd m1, [r1+r2*2-8] ; interleave the high words of m1 with those starting 8 bytes before r1+2*r2
286 pshufw m2, m0, 11111001b ; m2 = words {1,2,3,3} of m0: shifted down one word, top word repeated
290 pshufw m5, m0, 11111110b ; m5 = words {2,3,3,3} of m0: shifted down two words, top word repeated
291 PRED4x4_LOWPASS m1, m0, m5, m1 ; m1 <- 1-2-1 filter of m1 with m0 and m5 as the left/right operands
305 ;-----------------------------------------------------------------------------
306 ; void pred8x8_vertical(pixel *src, int stride)
307 ;-----------------------------------------------------------------------------
; 10-bit 8x8 vertical intra predictor. cglobal (x86inc): 2 arguments, 2 GPRs;
; on entry r0 = src and r1 = stride, following the prototype above.
309 cglobal pred8x8_vertical_10, 2, 2
321 ;-----------------------------------------------------------------------------
322 ; void pred8x8_horizontal(pixel *src, int stride)
323 ;-----------------------------------------------------------------------------
; 10-bit 8x8 horizontal intra predictor. cglobal (x86inc): 2 arguments
; (r0 = src, r1 = stride on entry, per the prototype above) and 3 GPRs,
; i.e. one register beyond the arguments.
325 cglobal pred8x8_horizontal_10, 2, 3
341 ;-----------------------------------------------------------------------------
342 ; void pred8x8_dc(pixel *src, int stride)
343 ;-----------------------------------------------------------------------------
345 ; sort of a hack, but it works
; 10-bit 8x8 DC intra predictor. cglobal (x86inc): 2 arguments (r0 = src,
; r1 = stride on entry, per the prototype above), 6 GPRs.
; %1 is a parameter of the enclosing macro (its %macro line is not visible
; here); it is used below as a shuffle instruction.
; r4/r5 take part in addressing but their setup is not shown - presumably
; r5 = 3*stride and r4 = a base four rows further down; confirm.
; Lane lists in this routine run from lane 0 (lowest) upwards.
355 cglobal pred8x8_dc_10, 2, 6
365 pshufw m2, m0, 00001110b ; m2 = words {2,3,0,0} of m0: upper half moved into the low two lanes
366 pshufw m3, m1, 00001110b ; m3 = words {2,3,0,0} of m1
; zero-extended 16-bit loads taken 2 bytes before each row address; r3d is
; reloaded several times, so it is presumably accumulated in between (not shown)
376 movzx r2d, word [r0+r1*1-2]
377 movzx r3d, word [r0+r1*2-2]
379 movzx r3d, word [r0+r5*1-2]
381 movzx r3d, word [r4-2]
385 movzx r2d, word [r4+r1*1-2]
386 movzx r3d, word [r4+r1*2-2]
388 movzx r3d, word [r4+r5*1-2]
390 movzx r3d, word [r4+r1*4-2]
395 punpckldq m0, m2 ; s0, s1, s2, s3
396 %1 m3, m0, 11110110b ; s2, s1, s3, s3
397 %1 m0, m0, 01110100b ; s0, s1, s3, s1
400 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
403 pshufd m3, m0, 11111010b ; m3 = dwords {2,2,3,3} of m0
428 ;-----------------------------------------------------------------------------
429 ; void pred8x8_top_dc(pixel *src, int stride)
430 ;-----------------------------------------------------------------------------
; 10-bit 8x8 top-DC intra predictor. cglobal (x86inc): 2 arguments (r0 = src,
; r1 = stride on entry, per the prototype above), 4 GPRs.
432 cglobal pred8x8_top_dc_10, 2, 4
455 ;-----------------------------------------------------------------------------
456 ; void pred8x8_plane(pixel *src, int stride)
457 ;-----------------------------------------------------------------------------
; 10-bit 8x8 plane intra predictor. cglobal (x86inc): 2 arguments (r0 = src,
; r1 = stride on entry, per the prototype above), 7 GPRs, 7 vector registers.
; NOTE(review): the src[...] indices in the existing comments imply r0, r2 and
; r3 were re-based before these loads; that setup is not visible - confirm.
459 cglobal pred8x8_plane_10, 2, 7, 7
464 pmaddwd m2, [pw_m32101234] ; multiply words by the weights -3..4 and add adjacent products into dwords
472 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
; left-column samples, read as zero-extended 16-bit values
473 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
474 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
476 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
477 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
480 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
481 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
485 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
486 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
495 mova m3, [pw_pixel_max] ; m3 = 1023 in every word (presumably the upper clip bound for the result)
500 pmullw m2, [pw_m32101234] ; b
; pw_m3 is not among the constants visible in this file excerpt; presumably declared elsewhere
501 pmullw m5, m4, [pw_m3] ; c
518 ;-----------------------------------------------------------------------------
519 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
520 ;-----------------------------------------------------------------------------
; Emits pred8x8l_128_dc_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments, 4 GPRs; on entry r0 = src, r1 = has_topleft,
; r2 = has_topright, r3 = stride, following the prototype above.
521 %macro PRED8x8L_128_DC 0
522 cglobal pred8x8l_128_dc_10, 4, 4
523 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
542 ;-----------------------------------------------------------------------------
543 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
544 ;-----------------------------------------------------------------------------
; Emits pred8x8l_top_dc_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 4 GPRs, 6 vector registers.
; NOTE(review): r1/r2 are used as byte offsets in the pinsrw operands, so they
; were presumably converted from the has_topleft/has_topright flags by code
; that is not shown here - confirm.
545 %macro PRED8x8L_TOP_DC 0
546 cglobal pred8x8l_top_dc_10, 4, 4, 6
554 pinsrw m1, [r0+r1], 0 ; replace lane 0 of m1 with the word at r0+r1
555 pinsrw m2, [r0+r2+14], 7 ; replace lane 7 of m2 with the word at r0+r2+14 (byte 14 = eighth word from that base)
558 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 <- 1-2-1 filter of m0 with m2 and m1 as the left/right operands
576 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
581 ;-----------------------------------------------------------------------------
582 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
583 ;-----------------------------------------------------------------------------
584 ;TODO: see if scalar is faster
; 10-bit 8x8 luma DC intra predictor. cglobal (x86inc): 4 arguments (src,
; has_topleft, has_topright, stride per the prototype above), 6 GPRs, 6 vector
; registers.
; Each mova/punpckhwd pair reads the 16 bytes that end right before a row
; address and interleaves the high words of two such loads, so the top two
; words of the result are the samples immediately left of those two rows.
; mova is an aligned access: these addresses are assumed 16-byte aligned.
; NOTE(review): r4/r5 setup and the conversion of r1/r2 into offsets are not
; shown here - confirm against the full routine.
586 cglobal pred8x8l_dc_10, 4, 6, 6
590 mova m0, [r0+r3*2-16]
591 punpckhwd m0, [r0+r3*1-16]
592 mova m1, [r4+r3*0-16]
593 punpckhwd m1, [r0+r5*1-16]
595 mova m2, [r4+r3*2-16]
596 punpckhwd m2, [r4+r3*1-16]
597 mova m3, [r4+r3*4-16]
598 punpckhwd m3, [r4+r5*1-16]
607 pinsrw m1, [r0+r1], 0 ; replace lane 0 of m1 with the word at r0+r1
608 pinsrw m2, [r0+r2+14], 7 ; replace lane 7 of m2 with the word at r0+r2+14
613 pshuflw m4, m4, 11100101b ; low words become {1,1,2,3}: word 1 is copied over word 0
614 pinsrw m5, [r0+r1-2], 7 ; replace lane 7 of m5 with the word at r0+r1-2
615 PRED4x4_LOWPASS m3, m4, m5, m3 ; m3 <- 1-2-1 filter of m3 with m4 and m5 as the left/right operands
616 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 <- 1-2-1 filter of m0 with m2 and m1 as the left/right operands
635 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
640 ;-----------------------------------------------------------------------------
641 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
642 ;-----------------------------------------------------------------------------
; Emits pred8x8l_vertical_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 4 GPRs, 6 vector registers.
; NOTE(review): r1/r2 are used as byte offsets in the pinsrw operands, so they
; were presumably converted from the has_topleft/has_topright flags by code
; that is not shown here - confirm.
643 %macro PRED8x8L_VERTICAL 0
644 cglobal pred8x8l_vertical_10, 4, 4, 6
652 pinsrw m1, [r0+r1], 0 ; replace lane 0 of m1 with the word at r0+r1
653 pinsrw m2, [r0+r2+14], 7 ; replace lane 7 of m2 with the word at r0+r2+14
656 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 <- 1-2-1 filter of m0 with m2 and m1 as the left/right operands
670 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
675 ;-----------------------------------------------------------------------------
676 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
677 ;-----------------------------------------------------------------------------
; Emits pred8x8l_horizontal_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 4 GPRs, 5 vector registers.
; Each mova/punpckhwd pair interleaves the high words of two 16-byte loads
; that end right before a row address, leaving the samples immediately left
; of those rows in the top two words.
; NOTE(review): r1 and r2 appear in address expressions here, so they were
; presumably repurposed (offset / second row base) by code that is not shown.
678 %macro PRED8x8L_HORIZONTAL 0
679 cglobal pred8x8l_horizontal_10, 4, 4, 5
685 punpckhwd m0, [r0+r1-16]
686 mova m1, [r0+r3*2-16]
687 punpckhwd m1, [r0+r3*1-16]
691 mova m2, [r2+r3*0-16]
692 punpckhwd m2, [r0+r1-16]
693 mova m3, [r2+r3*2-16]
694 punpckhwd m3, [r2+r3*1-16]
697 PALIGNR m4, m3, [r2+r1-16], 14, m0 ; m4 = m3 moved up one word, lowest word = last word of the 16 bytes at r2+r1-16
699 pshuflw m0, m0, 11100101b ; low words become {1,1,2,3}: word 1 is copied over word 0
700 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 <- 1-2-1 filter of m4 with m3 and m0 as the left/right operands
726 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
731 ;-----------------------------------------------------------------------------
732 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
733 ;-----------------------------------------------------------------------------
; Emits pred8x8l_down_left_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 4 GPRs, 7 vector registers.
; PALIGNR is the x86util wrapper: (dst, hi, lo, bytes, tmp), or with four
; operands (dst/hi, lo, bytes, tmp); dst = low 16 bytes of (hi:lo >> bytes).
734 %macro PRED8x8L_DOWN_LEFT 0
735 cglobal pred8x8l_down_left_10, 4, 4, 7
743 pinsrw m1, [r0+r1], 0 ; replace lane 0 of m1 with the word at r0+r1
744 pinsrw m2, [r0+r2+14], 7 ; replace lane 7 of m2 with the word at r0+r2+14
745 PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 <- 1-2-1 filter of m3 with m2 and m1 as the left/right operands
; the branch consumes flags set by an earlier shr (not shown); the SIMD
; instructions in between leave EFLAGS untouched, so the result is still valid
746 jz .fix_tr ; flags from shr r2d
749 PALIGNR m2, m1, m3, 14, m3 ; m2 = m1 moved up one word, lowest word = top word of m3
750 pshufhw m5, m5, 10100100b ; high words become {4,5,6,6}: word 6 is copied over word 7
751 PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 <- 1-2-1 filter of m1 with m2 and m5 as the left/right operands
756 PALIGNR m2, m1, m6, 2, m0 ; m2 = m6 moved down one word, top word = lowest word of m1
757 PALIGNR m3, m1, m6, 14, m0 ; m3 = m1 moved up one word, lowest word = top word of m6
758 PALIGNR m5, m1, 2, m0 ; m5 = m1 moved down one word, top word = lowest word of the old m5
760 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 <- 1-2-1 filter of m6 with m4 and m2 as the left/right operands
761 PRED4x4_LOWPASS m1, m3, m5, m1 ; m1 <- 1-2-1 filter of m1 with m3 and m5 as the left/right operands
; each step moves m1 up one word and pulls in the top word of m6; presumably
; one step per output row, with the stores and the update of m6 not shown
763 PALIGNR m1, m6, 14, m2
766 PALIGNR m1, m6, 14, m2
769 PALIGNR m1, m6, 14, m2
772 PALIGNR m1, m6, 14, m2
775 PALIGNR m1, m6, 14, m2
778 PALIGNR m1, m6, 14, m2
781 PALIGNR m1, m6, 14, m6 ; last step passes m6 itself as the scratch operand
794 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
799 ;-----------------------------------------------------------------------------
800 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
801 ;-----------------------------------------------------------------------------
; Emits pred8x8l_down_right_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 5 GPRs, 8 vector registers.
; Each mova/punpckhwd pair interleaves the high words of two 16-byte loads
; that end right before a row address, leaving the samples immediately left
; of those rows in the top two words. mova is an aligned access.
; PALIGNR is the x86util wrapper: (dst, hi, lo, bytes, tmp), or with four
; operands (dst/hi, lo, bytes, tmp); dst = low 16 bytes of (hi:lo >> bytes).
; NOTE(review): r1, r2 and r4 are used in address expressions; their setup is
; not shown here - confirm against the full routine.
802 %macro PRED8x8L_DOWN_RIGHT 0
803 ; standard forbids this when has_topleft is false
805 cglobal pred8x8l_down_right_10, 4, 5, 8
809 mova m0, [r0+r3*1-16]
810 punpckhwd m0, [r0+r3*0-16]
811 mova m1, [r0+r1*1-16]
812 punpckhwd m1, [r0+r3*2-16]
814 mova m2, [r4+r3*1-16]
815 punpckhwd m2, [r4+r3*0-16]
816 mova m3, [r4+r1*1-16]
817 punpckhwd m3, [r4+r3*2-16]
820 mova m0, [r4+r3*4-16]
822 PALIGNR m4, m3, m0, 14, m0 ; m4 = m3 moved up one word, lowest word = top word of m0
823 PALIGNR m1, m3, 2, m2 ; m1 = m3 moved down one word, top word = lowest word of the old m1
825 pshuflw m0, m0, 11100101b ; low words become {1,1,2,3}: word 1 is copied over word 0
826 PRED4x4_LOWPASS m6, m1, m4, m3 ; m6 <- 1-2-1 filter of m3 with m1 and m4 as the left/right operands
827 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 <- 1-2-1 filter of m4 with m3 and m0 as the left/right operands
833 pinsrw m2, [r0+r2+14], 7 ; replace lane 7 of m2 with the word at r0+r2+14
834 PRED4x4_LOWPASS m3, m2, m1, m3 ; m3 <- 1-2-1 filter of m3 with m2 and m1 as the left/right operands
835 PALIGNR m2, m3, m6, 2, m0 ; m2 = m6 moved down one word, top word = lowest word of m3
836 PALIGNR m5, m3, m6, 14, m0 ; m5 = m3 moved up one word, lowest word = top word of m6
838 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 <- 1-2-1 filter of m6 with m4 and m2 as the left/right operands
839 PRED4x4_LOWPASS m3, m5, m7, m3 ; m3 <- 1-2-1 filter of m3 with m5 and m7 as the left/right operands
; each step moves m3 up one word and pulls in the top word of m6; presumably
; one step per output row, with the stores and the update of m6 not shown
841 PALIGNR m3, m6, 14, m2
844 PALIGNR m3, m6, 14, m2
847 PALIGNR m3, m6, 14, m2
850 PALIGNR m3, m6, 14, m2
853 PALIGNR m3, m6, 14, m2
856 PALIGNR m3, m6, 14, m2
859 PALIGNR m3, m6, 14, m6 ; last step passes m6 itself as the scratch operand
868 %if HAVE_AVX_EXTERNAL ; build-time switch; the lines it guards are not shown here
873 ;-----------------------------------------------------------------------------
874 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
875 ;-----------------------------------------------------------------------------
; Emits pred8x8l_vertical_right_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 5 GPRs, 7 vector registers.
; Each mova/punpckhwd pair interleaves the high words of two 16-byte loads
; that end right before a row address, leaving the samples immediately left
; of those rows in the top two words. mova is an aligned access.
; PALIGNR is the x86util wrapper: (dst, hi, lo, bytes, tmp), or with four
; operands (dst/hi, lo, bytes, tmp); dst = low 16 bytes of (hi:lo >> bytes).
; NOTE(review): r1, r2 and r4 are used in address expressions; their setup is
; not shown here - confirm against the full routine.
876 %macro PRED8x8L_VERTICAL_RIGHT 0
877 ; likewise with 8x8l_down_right
878 cglobal pred8x8l_vertical_right_10, 4, 5, 7
882 mova m0, [r0+r3*1-16]
883 punpckhwd m0, [r0+r3*0-16]
884 mova m1, [r0+r1*1-16]
885 punpckhwd m1, [r0+r3*2-16]
887 mova m2, [r4+r3*1-16]
888 punpckhwd m2, [r4+r3*0-16]
889 mova m3, [r4+r1*1-16]
890 punpckhwd m3, [r4+r3*2-16]
893 mova m0, [r4+r3*4-16]
895 PALIGNR m4, m3, m0, 14, m0 ; m4 = m3 moved up one word, lowest word = top word of m0
896 PALIGNR m1, m3, 2, m2 ; m1 = m3 moved down one word, top word = lowest word of the old m1
897 PRED4x4_LOWPASS m3, m1, m4, m3 ; m3 <- 1-2-1 filter of m3 with m1 and m4 as the left/right operands
903 pinsrw m5, [r0+r2+14], 7 ; replace lane 7 of m5 with the word at r0+r2+14
904 PRED4x4_LOWPASS m2, m5, m1, m2 ; m2 <- 1-2-1 filter of m2 with m5 and m1 as the left/right operands
905 PALIGNR m6, m2, m3, 12, m1 ; m6 = m2 moved up two words, low two words = top two words of m3
906 PALIGNR m5, m2, m3, 14, m0 ; m5 = m2 moved up one word, lowest word = top word of m3
907 PRED4x4_LOWPASS m0, m6, m2, m5 ; m0 <- 1-2-1 filter of m5 with m6 and m2 as the left/right operands
913 PRED4x4_LOWPASS m1, m3, m6, m1 ; m1 <- 1-2-1 filter of m1 with m3 and m6 as the left/right operands
; the steps below alternate between m2 and m0, each moving up one word and
; pulling in the top word of m1; stores and the update of m1 are not shown
914 PALIGNR m2, m1, 14, m4
917 PALIGNR m0, m1, 14, m3
920 PALIGNR m2, m1, 14, m4
923 PALIGNR m0, m1, 14, m3
926 PALIGNR m2, m1, 14, m4
929 PALIGNR m0, m1, 14, m1 ; last step passes m1 itself as the scratch operand
; the macro is expanded three times, the last one only when HAVE_AVX_EXTERNAL
; is set; presumably each expansion follows a different INIT_XMM (not shown)
935 PRED8x8L_VERTICAL_RIGHT
937 PRED8x8L_VERTICAL_RIGHT
938 %if HAVE_AVX_EXTERNAL
940 PRED8x8L_VERTICAL_RIGHT
943 ;-----------------------------------------------------------------------------
944 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
945 ;-----------------------------------------------------------------------------
; Emits pred8x8l_horizontal_up_10; the macro takes no parameters.
; cglobal (x86inc): 4 arguments (src, has_topleft, has_topright, stride per
; the prototype above), 4 GPRs, 6 vector registers.
; Each mova/punpckhwd pair interleaves the high words of two 16-byte loads
; that end right before a row address, leaving the samples immediately left
; of those rows in the top two words. mova is an aligned access.
; PALIGNR is the x86util wrapper: (dst, hi, lo, bytes, tmp);
; dst = low 16 bytes of (hi:lo >> bytes).
; NOTE(review): r1 and r2 are used in address expressions; their setup is not
; shown here - confirm against the full routine.
946 %macro PRED8x8L_HORIZONTAL_UP 0
947 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
948 mova m0, [r0+r3*0-16]
949 punpckhwd m0, [r0+r3*1-16]
954 mova m4, [r0+r1*1-16]
957 mova m1, [r0+r3*2-16]
958 punpckhwd m1, [r0+r1*1-16]
960 mova m2, [r2+r3*0-16]
961 punpckhwd m2, [r2+r3*1-16]
962 mova m3, [r2+r3*2-16]
963 punpckhwd m3, [r2+r1*1-16]
966 PALIGNR m1, m0, m4, 14, m4 ; m1 = m0 moved up one word, lowest word = top word of m4
968 pshufhw m2, m2, 10100100b ; high words become {4,5,6,6}: word 6 is copied over word 7
969 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 <- 1-2-1 filter of m0 with m1 and m2 as the left/right operands
972 pshufhw m1, m1, 10100100b ; high words become {4,5,6,6}: word 6 is copied over word 7
973 pshufhw m2, m2, 01010100b ; high words become {4,5,5,5}: word 5 is copied over words 6 and 7
975 PRED4x4_LOWPASS m1, m2, m0, m1 ; m1 <- 1-2-1 filter of m1 with m2 and m0 as the left/right operands
; dword selections from m5, listed from lane 0 upwards
980 pshufd m0, m5, 11111001b ; m0 = dwords {1,2,3,3} of m5
981 pshufd m1, m5, 11111110b ; m1 = dwords {2,3,3,3} of m5
982 pshufd m2, m5, 11111111b ; m2 = dword 3 of m5 in every lane
; 4/8/12-byte windows over the pair m5:m4 (m5 high, m4 low)
986 PALIGNR m2, m5, m4, 4, m0
987 PALIGNR m3, m5, m4, 8, m1
988 PALIGNR m5, m5, m4, 12, m4
; the macro is expanded three times, the last one only when HAVE_AVX_EXTERNAL
; is set; presumably each expansion follows a different INIT_XMM (not shown)
996 PRED8x8L_HORIZONTAL_UP
998 PRED8x8L_HORIZONTAL_UP
999 %if HAVE_AVX_EXTERNAL
1001 PRED8x8L_HORIZONTAL_UP
1005 ;-----------------------------------------------------------------------------
1006 ; void pred16x16_vertical(pixel *src, int stride)
1007 ;-----------------------------------------------------------------------------
; Body line of a store macro whose %macro header is not visible here
; (presumably MOV16, which the 16x16 routines call with an address followed by
; four registers): writes parameter %3 one register width (mmsize bytes) past
; the address given as %1.
1010 mova [%1+mmsize], %3
; Emits pred16x16_vertical_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 3 GPRs.
; NOTE(review): r0 may be re-based before the load below (not shown) - confirm.
1017 %macro PRED16x16_VERTICAL 0
1018 cglobal pred16x16_vertical_10, 2, 3
1022 mova m1, [r0+mmsize] ; second register-width chunk of the row at r0 (aligned load)
; write the same four registers m0..m3 to two consecutive rows
1028 MOV16 r0+r1*1, m0, m1, m2, m3
1029 MOV16 r0+r1*2, m0, m1, m2, m3
1041 ;-----------------------------------------------------------------------------
1042 ; void pred16x16_horizontal(pixel *src, int stride)
1043 ;-----------------------------------------------------------------------------
; Emits pred16x16_horizontal_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 3 GPRs.
1044 %macro PRED16x16_HORIZONTAL 0
1045 cglobal pred16x16_horizontal_10, 2, 3
; movd reads the 4 bytes (two 16-bit samples) that end right before each row
; address; presumably the sample nearest the row is broadcast across the
; register before the stores (that step is not shown)
1048 movd m0, [r0+r1*0-4]
1049 movd m1, [r0+r1*1-4]
; every slot of a row is written from one register: m0 for the row at r0,
; m1 for the row at r0+r1
1052 MOV16 r0+r1*0, m0, m0, m0, m0
1053 MOV16 r0+r1*1, m1, m1, m1, m1
; two expansions of the macro; presumably each follows a different INIT_*
; selection (not shown)
1061 PRED16x16_HORIZONTAL
1063 PRED16x16_HORIZONTAL
1065 ;-----------------------------------------------------------------------------
1066 ; void pred16x16_dc(pixel *src, int stride)
1067 ;-----------------------------------------------------------------------------
; Emits pred16x16_dc_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 6 GPRs.
; NOTE(review): r0 is evidently re-based between the visible lines (the same
; [r0] / [r0+r1] operands are read twice); the pointer updates and the
; accumulation of the loaded values are not shown - confirm.
1068 %macro PRED16x16_DC 0
1069 cglobal pred16x16_dc_10, 2, 6
1073 paddw m0, [r0+mmsize] ; add the second register-width chunk of the row at r0 into m0
; zero-extended 16-bit loads from two consecutive rows
1081 movzx r3d, word [r0]
1082 movzx r4d, word [r0+r1]
1085 movzx r2d, word [r0]
1087 movzx r2d, word [r0+r1]
; fill two consecutive rows starting at r5 with the contents of m0
1098 MOV16 r5+r1*0, m0, m0, m0, m0
1099 MOV16 r5+r1*1, m0, m0, m0, m0
1111 ;-----------------------------------------------------------------------------
1112 ; void pred16x16_top_dc(pixel *src, int stride)
1113 ;-----------------------------------------------------------------------------
; Emits pred16x16_top_dc_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 3 GPRs.
; NOTE(review): r0 may be re-based before the visible lines (not shown).
1114 %macro PRED16x16_TOP_DC 0
1115 cglobal pred16x16_top_dc_10, 2, 3
1118 paddw m0, [r0+mmsize] ; add the second register-width chunk of the row at r0 into m0
; fill two consecutive rows with the contents of m0
1130 MOV16 r0+r1*1, m0, m0, m0, m0
1131 MOV16 r0+r1*2, m0, m0, m0, m0
1143 ;-----------------------------------------------------------------------------
1144 ; void pred16x16_left_dc(pixel *src, int stride)
1145 ;-----------------------------------------------------------------------------
; Emits pred16x16_left_dc_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 6 GPRs.
; NOTE(review): r0 is evidently re-based between the visible lines (the same
; [r0] / [r0+r1] operands are read twice); the pointer updates and the
; accumulation of the loaded values are not shown - confirm.
1146 %macro PRED16x16_LEFT_DC 0
1147 cglobal pred16x16_left_dc_10, 2, 6
; zero-extended 16-bit loads from two consecutive rows
1151 movzx r3d, word [r0]
1152 movzx r4d, word [r0+r1]
1155 movzx r2d, word [r0]
1157 movzx r2d, word [r0+r1]
; fill two consecutive rows starting at r5 with the contents of m0
1167 MOV16 r5+r1*0, m0, m0, m0, m0
1168 MOV16 r5+r1*1, m0, m0, m0, m0
1180 ;-----------------------------------------------------------------------------
1181 ; void pred16x16_128_dc(pixel *src, int stride)
1182 ;-----------------------------------------------------------------------------
; Emits pred16x16_128_dc_10; the macro takes no parameters.
; cglobal (x86inc): 2 arguments (r0 = src, r1 = stride on entry, per the
; prototype above), 3 GPRs.
; The load that sets m0 is not shown here; presumably the mid-grey constant,
; as in pred8x8l_128_dc_10 - confirm.
1183 %macro PRED16x16_128_DC 0
1184 cglobal pred16x16_128_dc_10, 2,3
; fill the rows at r0 and r0+r1 with the contents of m0
1188 MOV16 r0+r1*0, m0, m0, m0, m0
1189 MOV16 r0+r1*1, m0, m0, m0, m0