1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; pw_pixel_max is an alias for pw_1023; 1023 = (1<<10)-1 is the largest 10-bit sample value.
30 %define pw_pixel_max pw_1023
; Eight signed 16-bit weights -3..4; used as the pmaddwd/pmullw multiplier in pred8x8_plane_10.
39 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
45 ; dest, left, right, src
46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47 %macro PRED4x4_LOWPASS 4
53 ;-----------------------------------------------------------------------------
54 ; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright, int stride)
55 ;-----------------------------------------------------------------------------
; 4x4 "down_right" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
; PALIGNR d, a, b, n, tmp gives d = (a:b) >> n bytes; with n = 14 that is a
; moved up by one word with the top word of b placed below it.
57 cglobal pred4x4_down_right_10, 3, 3
61 movhps m2, [r0+r2*1-8] ; high half of m2 = the 8 bytes ending just before r0+r2
66 PALIGNR m3, m1, 10, m1 ; m3 = (m3:m1) >> 10 bytes
67 movhps m4, [r1+r2*1-8]
68 PALIGNR m0, m3, m4, 14, m4 ; m0 = (m3:m4) >> 14 bytes
69 movhps m4, [r1+r2*2-8]
70 PALIGNR m2, m0, m4, 14, m4 ; m2 = (m0:m4) >> 14 bytes
71 PRED4x4_LOWPASS m0, m2, m3, m0 ; m0 = (m2 + 2*m0 + m3 + 2) >> 2, per word
91 ;------------------------------------------------------------------------------
92 ; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright, int stride)
93 ;------------------------------------------------------------------------------
; 4x4 "vertical_right" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers,
; 6 vector registers.
; The trailing comments show register contents word by word, highest word
; first (presumably t = top row, lt = top-left, l = left column).
; Each PALIGNR with a byte count of 14 shifts the edge vector up one word and
; appends the top word of the register loaded just before it.
95 cglobal pred4x4_vertical_right_10, 3, 3, 6
98 movq m5, [r0] ; ........t3t2t1t0
100 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
102 movhps m1, [r0+r2*1-8]
103 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
104 movhps m2, [r0+r2*2-8]
105 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
106 movhps m3, [r1+r2*1-8]
107 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
; m1 = (m0 + 2*m1 + m2 + 2) >> 2: each word filtered with its two neighbours
108 PRED4x4_LOWPASS m1, m0, m2, m1
113 PALIGNR m5, m0, 14, m2 ; m5 = (m5:m0) >> 14 bytes
116 PALIGNR m1, m0, 14, m0 ; m1 = (m1:m0) >> 14 bytes
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
125 %if HAVE_AVX_EXTERNAL
130 ;-------------------------------------------------------------------------------
131 ; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright, int stride)
132 ;-------------------------------------------------------------------------------
; 4x4 "horizontal_down" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
; The trailing comments list register words from highest to lowest
; (presumably t = top row, lt = top-left, l = left column).
134 cglobal pred4x4_horizontal_down_10, 3, 3
137 movq m0, [r0-8] ; lt ..
139 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
140 movq m1, [r1+r2*2-8] ; l3
142 punpcklwd m1, m3 ; l2 l3
143 movq m2, [r0+r2*2-8] ; l1
145 punpcklwd m2, m3 ; l0 l1
146 punpckhdq m1, m2 ; l0 l1 l2 l3
147 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
148 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
149 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
; m3 = (m1 + 2*m3 + m0 + 2) >> 2: the edge shifted by one word, filtered with
; the unshifted and the two-word-shifted copies
151 PRED4x4_LOWPASS m3, m1, m0, m3
154 PALIGNR m3, m5, 12, m4 ; m3 = (m3:m5) >> 12 bytes
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
167 %if HAVE_AVX_EXTERNAL
172 ;-----------------------------------------------------------------------------
173 ; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, int stride)
174 ;-----------------------------------------------------------------------------
; 4x4 DC predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
177 cglobal pred4x4_dc_10, 3, 3
; Word-wise accumulation into m2; each memory operand is the group of bytes
; ending just before a row address (r0+2*r2, r1+r2, r1+2*r2).
181 paddw m2, [r0+r2*2-8]
182 paddw m2, [r1+r2*1-8]
183 paddw m2, [r1+r2*2-8]
197 ;-----------------------------------------------------------------------------
198 ; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright, int stride)
199 ;-----------------------------------------------------------------------------
; 4x4 "down_left" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
201 cglobal pred4x4_down_left_10, 3, 3
; high words of m2 become {w4, w5, w6, w6}: the top word is replaced by a copy
; of the word below it
207 pshufhw m2, m2, 10100100b
208 PRED4x4_LOWPASS m0, m3, m2, m0 ; m0 = (m3 + 2*m0 + m2 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
222 %if HAVE_AVX_EXTERNAL
227 ;-----------------------------------------------------------------------------
228 ; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright, int stride)
229 ;-----------------------------------------------------------------------------
; 4x4 "vertical_left" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
231 cglobal pred4x4_vertical_left_10, 3, 3
238 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 = (m1 + 2*m0 + m2 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
251 %if HAVE_AVX_EXTERNAL
256 ;-----------------------------------------------------------------------------
257 ; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright, int stride)
258 ;-----------------------------------------------------------------------------
; 4x4 "horizontal_up" predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 3 arguments, 3 general-purpose registers.
260 cglobal pred4x4_horizontal_up_10, 3, 3
; punpckhwd interleaves the upper words of the register with the upper words
; of the 8 bytes ending just before the given row address
264 punpckhwd m0, [r0+r2*2-8]
266 punpckhwd m1, [r1+r2*2-8]
271 pshufw m2, m0, 11111001b ; m2 = words {1, 2, 3, 3} of m0 (lowest first)
275 pshufw m5, m0, 11111110b ; m5 = words {2, 3, 3, 3} of m0 (lowest first)
276 PRED4x4_LOWPASS m1, m0, m5, m1 ; m1 = (m0 + 2*m1 + m5 + 2) >> 2, per word
290 ;-----------------------------------------------------------------------------
291 ; void ff_pred8x8_vertical_10(pixel *src, int stride)
292 ;-----------------------------------------------------------------------------
; 8x8 vertical predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 2 arguments, 2 general-purpose registers.
294 cglobal pred8x8_vertical_10, 2, 2
306 ;-----------------------------------------------------------------------------
307 ; void ff_pred8x8_horizontal_10(pixel *src, int stride)
308 ;-----------------------------------------------------------------------------
; 8x8 horizontal predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 2 arguments, 3 general-purpose registers.
310 cglobal pred8x8_horizontal_10, 2, 3
326 ;-----------------------------------------------------------------------------
327 ; void ff_pred8x8_dc_10(pixel *src, int stride)
328 ;-----------------------------------------------------------------------------
330 ; sort of a hack, but it works
; 8x8 DC predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 2 arguments, 6 general-purpose registers.
; %1 is a macro parameter used as the word-shuffle mnemonic.
340 cglobal pred8x8_dc_10, 2, 6
350 pshufw m2, m0, 00001110b ; m2 = words {2, 3, 0, 0} of m0 (lowest first)
351 pshufw m3, m1, 00001110b ; m3 = words {2, 3, 0, 0} of m1 (lowest first)
; Scalar, zero-extended loads of the 16-bit word immediately before each of
; eight row addresses (four based on r0, four based on r4).
361 movzx r2d, word [r0+r1*1-2]
362 movzx r3d, word [r0+r1*2-2]
364 movzx r3d, word [r0+r5*1-2]
366 movzx r3d, word [r4-2]
370 movzx r2d, word [r4+r1*1-2]
371 movzx r3d, word [r4+r1*2-2]
373 movzx r3d, word [r4+r5*1-2]
375 movzx r3d, word [r4+r1*4-2]
380 punpckldq m0, m2 ; s0, s1, s2, s3
381 %1 m3, m0, 11110110b ; s2, s1, s3, s3
382 %1 m0, m0, 01110100b ; s0, s1, s3, s1
385 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
388 pshufd m3, m0, 11111010b ; m3 = dwords {2, 2, 3, 3} of m0 (lowest first)
413 ;-----------------------------------------------------------------------------
414 ; void ff_pred8x8_top_dc_10(pixel *src, int stride)
415 ;-----------------------------------------------------------------------------
; 8x8 top-DC predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 2 arguments, 4 general-purpose registers.
417 cglobal pred8x8_top_dc_10, 2, 4
440 ;-----------------------------------------------------------------------------
441 ; void ff_pred8x8_plane_10(pixel *src, int stride)
442 ;-----------------------------------------------------------------------------
; 8x8 plane predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 2 arguments, 7 general-purpose registers,
; 7 vector registers.
444 cglobal pred8x8_plane_10, 2, 7, 7
; multiply the eight words of m2 by the weights -3..4 and add adjacent
; products pairwise into dwords
449 pmaddwd m2, [pw_m32101234]
457 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
; Left-column samples are fetched as scalars, in pairs that sit symmetrically
; around the middle of the column (rows 4/2, 5/1, 6/0, 7/-1).
458 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
459 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
461 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
462 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
465 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
466 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
470 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
471 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
; pw_pixel_max aliases pw_1023; presumably the upper clamp for the output
480 mova m3, [pw_pixel_max]
485 pmullw m2, [pw_m32101234] ; b
486 pmullw m5, m4, [pw_m3] ; c
503 ;-----------------------------------------------------------------------------
504 ; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
506 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_128_dc_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers.
507 %macro PRED8x8L_128_DC 0
508 cglobal pred8x8l_128_dc_10, 4, 4
; 512 is the mid-grey value for 10-bit samples
509 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
528 ;-----------------------------------------------------------------------------
529 ; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
531 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_top_dc_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers,
; 6 vector registers.
532 %macro PRED8x8L_TOP_DC 0
533 cglobal pred8x8l_top_dc_10, 4, 4, 6
541 pinsrw m1, [r0+r1], 0 ; replace word 0 of m1 with the word at r0+r1
542 pinsrw m2, [r0+r2+14], 7 ; replace word 7 of m2 with the word at r0+r2+14
545 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
563 %if HAVE_AVX_EXTERNAL
568 ;-------------------------------------------------------------------------------
569 ; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright, int stride)
570 ;-------------------------------------------------------------------------------
571 ;TODO: see if scalar is faster
; 8x8 luma DC predictor for 10-bit samples.
; cglobal (x86inc.asm) operands: 4 arguments, 6 general-purpose registers,
; 6 vector registers.
573 cglobal pred8x8l_dc_10, 4, 6, 6
; Each mova/punpckhwd pair reads the 16 bytes ending just before two row
; addresses and interleaves their upper words, so the last word of each row's
; load (the sample just left of that row address) ends up adjacent.
577 mova m0, [r0+r3*2-16]
578 punpckhwd m0, [r0+r3*1-16]
579 mova m1, [r4+r3*0-16]
580 punpckhwd m1, [r0+r5*1-16]
582 mova m2, [r4+r3*2-16]
583 punpckhwd m2, [r4+r3*1-16]
584 mova m3, [r4+r3*4-16]
585 punpckhwd m3, [r4+r5*1-16]
594 pinsrw m1, [r0+r1], 0 ; replace word 0 of m1 with the word at r0+r1
595 pinsrw m2, [r0+r2+14], 7 ; replace word 7 of m2 with the word at r0+r2+14
; low words of m4 become {w1, w1, w2, w3}: word 0 is replaced by a copy of word 1
600 pshuflw m4, m4, 11100101b
601 pinsrw m5, [r0+r1-2], 7 ; replace word 7 of m5 with the word at r0+r1-2
602 PRED4x4_LOWPASS m3, m4, m5, m3 ; m3 = (m4 + 2*m3 + m5 + 2) >> 2, per word
603 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
622 %if HAVE_AVX_EXTERNAL
627 ;-----------------------------------------------------------------------------
628 ; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
630 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_vertical_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers,
; 6 vector registers.
631 %macro PRED8x8L_VERTICAL 0
632 cglobal pred8x8l_vertical_10, 4, 4, 6
640 pinsrw m1, [r0+r1], 0 ; replace word 0 of m1 with the word at r0+r1
641 pinsrw m2, [r0+r2+14], 7 ; replace word 7 of m2 with the word at r0+r2+14
644 PRED4x4_LOWPASS m0, m2, m1, m0 ; m0 = (m2 + 2*m0 + m1 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
658 %if HAVE_AVX_EXTERNAL
663 ;-----------------------------------------------------------------------------
664 ; void ff_pred8x8l_horizontal_10(pixel *src, int has_topleft, int has_topright,
666 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_horizontal_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers,
; 5 vector registers.
667 %macro PRED8x8L_HORIZONTAL 0
668 cglobal pred8x8l_horizontal_10, 4, 4, 5
; Each load below reads the 16 bytes ending just before a row address;
; punpckhwd interleaves the upper words of two such loads.
674 punpckhwd m0, [r0+r1-16]
675 mova m1, [r0+r3*2-16]
676 punpckhwd m1, [r0+r3*1-16]
680 mova m2, [r2+r3*0-16]
681 punpckhwd m2, [r0+r1-16]
682 mova m3, [r2+r3*2-16]
683 punpckhwd m3, [r2+r3*1-16]
; m4 = (m3 : 16 bytes ending at r2+r1) >> 14 bytes: m3 moved up one word with
; the last word of the memory operand placed below it
686 PALIGNR m4, m3, [r2+r1-16], 14, m0
; low words of m0 become {w1, w1, w2, w3}: word 0 is replaced by a copy of word 1
688 pshuflw m0, m0, 11100101b
689 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 = (m3 + 2*m4 + m0 + 2) >> 2, per word
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
715 %if HAVE_AVX_EXTERNAL
720 ;-----------------------------------------------------------------------------
721 ; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
723 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_down_left_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers,
; 7 vector registers.
; PALIGNR d, a, b, n, tmp gives d = (a:b) >> n bytes; the 4-operand form uses
; d itself as the high half.
724 %macro PRED8x8L_DOWN_LEFT 0
725 cglobal pred8x8l_down_left_10, 4, 4, 7
733 pinsrw m1, [r0+r1], 0 ; replace word 0 of m1 with the word at r0+r1
734 pinsrw m2, [r0+r2+14], 7 ; replace word 7 of m2 with the word at r0+r2+14
735 PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 = (m2 + 2*m3 + m1 + 2) >> 2, per word
736 jz .fix_tr ; flags from shr r2d
739 PALIGNR m2, m1, m3, 14, m3 ; m2 = (m1:m3) >> 14 bytes
; high words of m5 become {w4, w5, w6, w6}: the top word is replaced by a copy
; of the word below it
740 pshufhw m5, m5, 10100100b
741 PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 = (m2 + 2*m1 + m5 + 2) >> 2, per word
; Treating m1:m6 as one 16-word vector, build its neighbours shifted by one
; word in each direction for the second filter pass.
746 PALIGNR m2, m1, m6, 2, m0 ; m2 = (m1:m6) >> 2 bytes
747 PALIGNR m3, m1, m6, 14, m0 ; m3 = (m1:m6) >> 14 bytes
748 PALIGNR m5, m1, 2, m0 ; m5 = (m5:m1) >> 2 bytes
750 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 = (m4 + 2*m6 + m2 + 2) >> 2, per word
751 PRED4x4_LOWPASS m1, m3, m5, m1 ; m1 = (m3 + 2*m1 + m5 + 2) >> 2, per word
; Seven identical steps: m1 moves up one word and takes the top word of m6.
; The last step uses m6 itself as the scratch register.
753 PALIGNR m1, m6, 14, m2
756 PALIGNR m1, m6, 14, m2
759 PALIGNR m1, m6, 14, m2
762 PALIGNR m1, m6, 14, m2
765 PALIGNR m1, m6, 14, m2
768 PALIGNR m1, m6, 14, m2
771 PALIGNR m1, m6, 14, m6
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
784 %if HAVE_AVX_EXTERNAL
789 ;-----------------------------------------------------------------------------
790 ; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft, int has_topright,
792 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_down_right_10.
; cglobal (x86inc.asm) operands: 4 arguments, 5 general-purpose registers,
; 8 vector registers.
; PALIGNR d, a, b, n, tmp gives d = (a:b) >> n bytes; the 4-operand form uses
; d itself as the high half.
793 %macro PRED8x8L_DOWN_RIGHT 0
794 ; standard forbids this when has_topleft is false
796 cglobal pred8x8l_down_right_10, 4, 5, 8
; Each mova/punpckhwd pair reads the 16 bytes ending just before two row
; addresses and interleaves their upper words.
800 mova m0, [r0+r3*1-16]
801 punpckhwd m0, [r0+r3*0-16]
802 mova m1, [r0+r1*1-16]
803 punpckhwd m1, [r0+r3*2-16]
805 mova m2, [r4+r3*1-16]
806 punpckhwd m2, [r4+r3*0-16]
807 mova m3, [r4+r1*1-16]
808 punpckhwd m3, [r4+r3*2-16]
811 mova m0, [r4+r3*4-16]
813 PALIGNR m4, m3, m0, 14, m0 ; m4 = (m3:m0) >> 14 bytes
814 PALIGNR m1, m3, 2, m2 ; m1 = (m1:m3) >> 2 bytes
; low words of m0 become {w1, w1, w2, w3}: word 0 is replaced by a copy of word 1
816 pshuflw m0, m0, 11100101b
817 PRED4x4_LOWPASS m6, m1, m4, m3 ; m6 = (m1 + 2*m3 + m4 + 2) >> 2, per word
818 PRED4x4_LOWPASS m4, m3, m0, m4 ; m4 = (m3 + 2*m4 + m0 + 2) >> 2, per word
824 pinsrw m2, [r0+r2+14], 7 ; replace word 7 of m2 with the word at r0+r2+14
825 PRED4x4_LOWPASS m3, m2, m1, m3 ; m3 = (m2 + 2*m3 + m1 + 2) >> 2, per word
826 PALIGNR m2, m3, m6, 2, m0 ; m2 = (m3:m6) >> 2 bytes
827 PALIGNR m5, m3, m6, 14, m0 ; m5 = (m3:m6) >> 14 bytes
829 PRED4x4_LOWPASS m6, m4, m2, m6 ; m6 = (m4 + 2*m6 + m2 + 2) >> 2, per word
830 PRED4x4_LOWPASS m3, m5, m7, m3 ; m3 = (m5 + 2*m3 + m7 + 2) >> 2, per word
; Seven identical steps: m3 moves up one word and takes the top word of m6.
; The last step uses m6 itself as the scratch register.
832 PALIGNR m3, m6, 14, m2
835 PALIGNR m3, m6, 14, m2
838 PALIGNR m3, m6, 14, m2
841 PALIGNR m3, m6, 14, m2
844 PALIGNR m3, m6, 14, m2
847 PALIGNR m3, m6, 14, m2
850 PALIGNR m3, m6, 14, m6
; what follows is assembled only when HAVE_AVX_EXTERNAL is non-zero
859 %if HAVE_AVX_EXTERNAL
864 ;-----------------------------------------------------------------------------
865 ; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
866 ; int has_topright, int stride)
867 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_vertical_right_10.
; cglobal (x86inc.asm) operands: 4 arguments, 5 general-purpose registers,
; 7 vector registers.
; PALIGNR d, a, b, n, tmp gives d = (a:b) >> n bytes; the 4-operand form uses
; d itself as the high half.
868 %macro PRED8x8L_VERTICAL_RIGHT 0
869 ; likewise with 8x8l_down_right
870 cglobal pred8x8l_vertical_right_10, 4, 5, 7
; Each mova/punpckhwd pair reads the 16 bytes ending just before two row
; addresses and interleaves their upper words.
874 mova m0, [r0+r3*1-16]
875 punpckhwd m0, [r0+r3*0-16]
876 mova m1, [r0+r1*1-16]
877 punpckhwd m1, [r0+r3*2-16]
879 mova m2, [r4+r3*1-16]
880 punpckhwd m2, [r4+r3*0-16]
881 mova m3, [r4+r1*1-16]
882 punpckhwd m3, [r4+r3*2-16]
885 mova m0, [r4+r3*4-16]
887 PALIGNR m4, m3, m0, 14, m0 ; m4 = (m3:m0) >> 14 bytes
888 PALIGNR m1, m3, 2, m2 ; m1 = (m1:m3) >> 2 bytes
889 PRED4x4_LOWPASS m3, m1, m4, m3 ; m3 = (m1 + 2*m3 + m4 + 2) >> 2, per word
895 pinsrw m5, [r0+r2+14], 7 ; replace word 7 of m5 with the word at r0+r2+14
896 PRED4x4_LOWPASS m2, m5, m1, m2 ; m2 = (m5 + 2*m2 + m1 + 2) >> 2, per word
897 PALIGNR m6, m2, m3, 12, m1 ; m6 = (m2:m3) >> 12 bytes
898 PALIGNR m5, m2, m3, 14, m0 ; m5 = (m2:m3) >> 14 bytes
899 PRED4x4_LOWPASS m0, m6, m2, m5 ; m0 = (m6 + 2*m5 + m2 + 2) >> 2, per word
905 PRED4x4_LOWPASS m1, m3, m6, m1 ; m1 = (m3 + 2*m1 + m6 + 2) >> 2, per word
; m2 and m0 are advanced alternately: each step moves the register up one
; word and takes the top word of m1. The last step uses m1 itself as scratch.
906 PALIGNR m2, m1, 14, m4
909 PALIGNR m0, m1, 14, m3
912 PALIGNR m2, m1, 14, m4
915 PALIGNR m0, m1, 14, m3
918 PALIGNR m2, m1, 14, m4
921 PALIGNR m0, m1, 14, m1
; The macro is expanded three times, the last one only when
; HAVE_AVX_EXTERNAL is non-zero; presumably each expansion targets a
; different instruction set -- confirm against the INIT_* directives.
927 PRED8x8L_VERTICAL_RIGHT
929 PRED8x8L_VERTICAL_RIGHT
930 %if HAVE_AVX_EXTERNAL
932 PRED8x8L_VERTICAL_RIGHT
935 ;-----------------------------------------------------------------------------
936 ; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
937 ; int has_topright, int stride)
938 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred8x8l_horizontal_up_10.
; cglobal (x86inc.asm) operands: 4 arguments, 4 general-purpose registers,
; 6 vector registers.
; PALIGNR d, a, b, n, tmp gives d = (a:b) >> n bytes.
939 %macro PRED8x8L_HORIZONTAL_UP 0
940 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
; Each load below reads the 16 bytes ending just before a row address;
; punpckhwd interleaves the upper words of two such loads.
941 mova m0, [r0+r3*0-16]
942 punpckhwd m0, [r0+r3*1-16]
947 mova m4, [r0+r1*1-16]
950 mova m1, [r0+r3*2-16]
951 punpckhwd m1, [r0+r1*1-16]
953 mova m2, [r2+r3*0-16]
954 punpckhwd m2, [r2+r3*1-16]
955 mova m3, [r2+r3*2-16]
956 punpckhwd m3, [r2+r1*1-16]
959 PALIGNR m1, m0, m4, 14, m4 ; m1 = (m0:m4) >> 14 bytes
; high words of m2 become {w4, w5, w6, w6}: the top word is replaced by a copy
; of the word below it
961 pshufhw m2, m2, 10100100b
962 PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 = (m1 + 2*m0 + m2 + 2) >> 2, per word
965 pshufhw m1, m1, 10100100b ; high words of m1 become {w4, w5, w6, w6}
966 pshufhw m2, m2, 01010100b ; high words of m2 become {w4, w5, w5, w5}
968 PRED4x4_LOWPASS m1, m2, m0, m1 ; m1 = (m2 + 2*m1 + m0 + 2) >> 2, per word
973 pshufd m0, m5, 11111001b ; m0 = dwords {1, 2, 3, 3} of m5 (lowest first)
974 pshufd m1, m5, 11111110b ; m1 = dwords {2, 3, 3, 3} of m5 (lowest first)
975 pshufd m2, m5, 11111111b ; m2 = dword 3 of m5 in every lane
; successive dword-aligned windows into the concatenation m5:m4
979 PALIGNR m2, m5, m4, 4, m0 ; m2 = (m5:m4) >> 4 bytes
980 PALIGNR m3, m5, m4, 8, m1 ; m3 = (m5:m4) >> 8 bytes
981 PALIGNR m5, m5, m4, 12, m4 ; m5 = (m5:m4) >> 12 bytes
; The macro is expanded three times, the last one only when
; HAVE_AVX_EXTERNAL is non-zero; presumably each expansion targets a
; different instruction set -- confirm against the INIT_* directives.
989 PRED8x8L_HORIZONTAL_UP
991 PRED8x8L_HORIZONTAL_UP
992 %if HAVE_AVX_EXTERNAL
994 PRED8x8L_HORIZONTAL_UP
998 ;-----------------------------------------------------------------------------
999 ; void ff_pred16x16_vertical_10(pixel *src, int stride)
1000 ;-----------------------------------------------------------------------------
1003 mova [%1+mmsize], %3
; Parameterless macro that emits pred16x16_vertical_10.
; cglobal (x86inc.asm) operands: 2 arguments, 3 general-purpose registers.
1010 %macro PRED16x16_VERTICAL 0
1011 cglobal pred16x16_vertical_10, 2, 3
1015 mova m1, [r0+mmsize] ; second register-sized chunk of the row at r0
; MOV16 is given a row address followed by four source registers; here the
; same m0..m3 go to the rows at r0+r1 and r0+2*r1.
1021 MOV16 r0+r1*1, m0, m1, m2, m3
1022 MOV16 r0+r1*2, m0, m1, m2, m3
1034 ;-----------------------------------------------------------------------------
1035 ; void ff_pred16x16_horizontal_10(pixel *src, int stride)
1036 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_horizontal_10.
; cglobal (x86inc.asm) operands: 2 arguments, 3 general-purpose registers.
1037 %macro PRED16x16_HORIZONTAL 0
1038 cglobal pred16x16_horizontal_10, 2, 3
; load the 4 bytes ending just before each of two consecutive row addresses
1041 movd m0, [r0+r1*0-4]
1042 movd m1, [r0+r1*1-4]
; MOV16 is given a row address followed by four source registers; each row
; gets a single register repeated four times.
1045 MOV16 r0+r1*0, m0, m0, m0, m0
1046 MOV16 r0+r1*1, m1, m1, m1, m1
; The macro is expanded twice; presumably once per instruction set --
; confirm against the INIT_* directives.
1054 PRED16x16_HORIZONTAL
1056 PRED16x16_HORIZONTAL
1058 ;-----------------------------------------------------------------------------
1059 ; void ff_pred16x16_dc_10(pixel *src, int stride)
1060 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_dc_10.
; cglobal (x86inc.asm) operands: 2 arguments, 6 general-purpose registers.
1061 %macro PRED16x16_DC 0
1062 cglobal pred16x16_dc_10, 2, 6
1066 paddw m0, [r0+mmsize] ; word-wise add of the second register-sized chunk at r0
; scalar, zero-extended loads of the 16-bit word at r0 and at r0+r1
1074 movzx r3d, word [r0]
1075 movzx r4d, word [r0+r1]
1078 movzx r2d, word [r0]
1080 movzx r2d, word [r0+r1]
; MOV16 is given a row address followed by four source registers; m0 is
; written to every position of the rows at r5 and r5+r1.
1091 MOV16 r5+r1*0, m0, m0, m0, m0
1092 MOV16 r5+r1*1, m0, m0, m0, m0
1104 ;-----------------------------------------------------------------------------
1105 ; void ff_pred16x16_top_dc_10(pixel *src, int stride)
1106 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_top_dc_10.
; cglobal (x86inc.asm) operands: 2 arguments, 3 general-purpose registers.
1107 %macro PRED16x16_TOP_DC 0
1108 cglobal pred16x16_top_dc_10, 2, 3
1111 paddw m0, [r0+mmsize] ; word-wise add of the second register-sized chunk at r0
; MOV16 is given a row address followed by four source registers; m0 is
; written to every position of the rows at r0+r1 and r0+2*r1.
1123 MOV16 r0+r1*1, m0, m0, m0, m0
1124 MOV16 r0+r1*2, m0, m0, m0, m0
1136 ;-----------------------------------------------------------------------------
1137 ; void ff_pred16x16_left_dc_10(pixel *src, int stride)
1138 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_left_dc_10.
; cglobal (x86inc.asm) operands: 2 arguments, 6 general-purpose registers.
1139 %macro PRED16x16_LEFT_DC 0
1140 cglobal pred16x16_left_dc_10, 2, 6
; scalar, zero-extended loads of the 16-bit word at r0 and at r0+r1
1144 movzx r3d, word [r0]
1145 movzx r4d, word [r0+r1]
1148 movzx r2d, word [r0]
1150 movzx r2d, word [r0+r1]
; MOV16 is given a row address followed by four source registers; m0 is
; written to every position of the rows at r5 and r5+r1.
1160 MOV16 r5+r1*0, m0, m0, m0, m0
1161 MOV16 r5+r1*1, m0, m0, m0, m0
1173 ;-----------------------------------------------------------------------------
1174 ; void ff_pred16x16_128_dc_10(pixel *src, int stride)
1175 ;-----------------------------------------------------------------------------
; Parameterless macro that emits pred16x16_128_dc_10.
; cglobal (x86inc.asm) operands: 2 arguments, 3 general-purpose registers.
1176 %macro PRED16x16_128_DC 0
1177 cglobal pred16x16_128_dc_10, 2,3
; MOV16 is given a row address followed by four source registers; m0 is
; written to every position of the rows at r0 and r0+r1.
1181 MOV16 r0+r1*0, m0, m0, m0, m0
1182 MOV16 r0+r1*1, m0, m0, m0, m0