1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86inc.asm"
26 %include "libavutil/x86/x86util.asm"
36 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-word weights -3..4; pmaddwd/pmullw operand in pred8x8_plane
38 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023 in each of 8 words: largest value representable in 10 bits
39 pw_512: times 8 dw 512 ; 512 in each of 8 words = 1 << (BIT_DEPTH-1); loaded by pred8x8l_128_dc
45 ; dest, left, right, src
46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47 %macro PRED4x4_LOWPASS 4
53 ;-----------------------------------------------------------------------------
54 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
55 ;-----------------------------------------------------------------------------
57 cglobal pred4x4_down_right_10_%1, 3,3
61 movhps m2, [r0+r2*1-8]
66 PALIGNR m3, m1, 10, m1
67 movhps m4, [r1+r2*1-8]
68 PALIGNR m0, m3, m4, 14, m4
69 movhps m4, [r1+r2*2-8]
70 PALIGNR m2, m0, m4, 14, m4
71 PRED4x4_LOWPASS m0, m2, m3, m0
83 %define PALIGNR PALIGNR_MMX
85 %define PALIGNR PALIGNR_SSSE3
92 ;-----------------------------------------------------------------------------
93 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
94 ;-----------------------------------------------------------------------------
96 cglobal pred4x4_vertical_right_10_%1, 3,3,6
99 movq m5, [r0] ; ........t3t2t1t0
101 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
103 movhps m1, [r0+r2*1-8]
104 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
105 movhps m2, [r0+r2*2-8]
106 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
107 movhps m3, [r1+r2*1-8]
108 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
109 PRED4x4_LOWPASS m1, m0, m2, m1
114 PALIGNR m5, m0, 14, m2
117 PALIGNR m1, m0, 14, m0
123 %define PALIGNR PALIGNR_MMX
125 %define PALIGNR PALIGNR_SSSE3
132 ;-----------------------------------------------------------------------------
133 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
134 ;-----------------------------------------------------------------------------
136 cglobal pred4x4_horizontal_down_10_%1, 3,3
139 movq m0, [r0-8] ; lt ..
141 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
142 movq m1, [r1+r2*2-8] ; l3
144 punpcklwd m1, m3 ; l2 l3
145 movq m2, [r0+r2*2-8] ; l1
147 punpcklwd m2, m3 ; l0 l1
148 punpckhdq m1, m2 ; l0 l1 l2 l3
149 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
150 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
151 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
153 PRED4x4_LOWPASS m3, m1, m0, m3
156 PALIGNR m3, m5, 12, m4
166 %define PALIGNR PALIGNR_MMX
168 %define PALIGNR PALIGNR_SSSE3
175 ;-----------------------------------------------------------------------------
176 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
177 ;-----------------------------------------------------------------------------
178 %macro HADDD 2 ; sum junk
196 cglobal pred4x4_dc_10_mmxext, 3,3
200 paddw m2, [r0+r2*2-8]
201 paddw m2, [r1+r2*1-8]
202 paddw m2, [r1+r2*2-8]
216 ;-----------------------------------------------------------------------------
217 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
218 ;-----------------------------------------------------------------------------
220 cglobal pred4x4_down_left_10_%1, 3,3
226 pshufhw m2, m2, 10100100b
227 PRED4x4_LOWPASS m0, m3, m2, m0
246 ;-----------------------------------------------------------------------------
247 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
248 ;-----------------------------------------------------------------------------
250 cglobal pred4x4_vertical_left_10_%1, 3,3
257 PRED4x4_LOWPASS m0, m1, m2, m0
275 ;-----------------------------------------------------------------------------
276 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
277 ;-----------------------------------------------------------------------------
279 cglobal pred4x4_horizontal_up_10_mmxext, 3,3
283 punpckhwd m0, [r0+r2*2-8]
285 punpckhwd m1, [r1+r2*2-8]
290 pshufw m2, m0, 11111001b
294 pshufw m5, m0, 11111110b
295 PRED4x4_LOWPASS m1, m0, m5, m1
309 ;-----------------------------------------------------------------------------
310 ; void pred8x8_vertical(pixel *src, int stride)
311 ;-----------------------------------------------------------------------------
313 cglobal pred8x8_vertical_10_sse2, 2,2
325 ;-----------------------------------------------------------------------------
326 ; void pred8x8_horizontal(pixel *src, int stride)
327 ;-----------------------------------------------------------------------------
329 cglobal pred8x8_horizontal_10_sse2, 2,3
345 ;-----------------------------------------------------------------------------
346 ; void pred8x8_dc(pixel *src, int stride)
347 ;-----------------------------------------------------------------------------
349 ; sort of a hack, but it works
359 cglobal pred8x8_dc_10_%1, 2,6
369 pshufw m2, m0, 00001110b
370 pshufw m3, m1, 00001110b
380 movzx r2d, word [r0+r1*1-2]
381 movzx r3d, word [r0+r1*2-2]
383 movzx r3d, word [r0+r5*1-2]
385 movzx r3d, word [r4-2]
389 movzx r2d, word [r4+r1*1-2]
390 movzx r3d, word [r4+r1*2-2]
392 movzx r3d, word [r4+r5*1-2]
394 movzx r3d, word [r4+r1*4-2]
399 punpckldq m0, m2 ; s0, s1, s2, s3
400 %2 m3, m0, 11110110b ; s2, s1, s3, s3
401 %2 m0, m0, 01110100b ; s0, s1, s3, s1
404 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
407 pshufd m3, m0, 11111010b
428 PRED8x8_DC mmxext, pshufw
430 PRED8x8_DC sse2 , pshuflw
432 ;-----------------------------------------------------------------------------
433 ; void pred8x8_top_dc(pixel *src, int stride)
434 ;-----------------------------------------------------------------------------
436 cglobal pred8x8_top_dc_10_sse2, 2,4
459 ;-----------------------------------------------------------------------------
460 ; void pred8x8_plane(pixel *src, int stride)
461 ;-----------------------------------------------------------------------------
463 cglobal pred8x8_plane_10_sse2, 2,7,7
468 pmaddwd m2, [pw_m32101234]
476 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
477 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
478 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
480 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
481 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
484 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
485 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
489 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
490 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
499 mova m3, [pw_pixel_max]
504 pmullw m2, [pw_m32101234] ; b
505 pmullw m5, m4, [pw_m3] ; c
522 ;-----------------------------------------------------------------------------
523 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
524 ;-----------------------------------------------------------------------------
525 %macro PRED8x8L_128_DC 1
526 cglobal pred8x8l_128_dc_10_%1, 4,4
527 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
542 PRED8x8L_128_DC mmxext
546 ;-----------------------------------------------------------------------------
547 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
548 ;-----------------------------------------------------------------------------
549 %macro PRED8x8L_TOP_DC 1
550 cglobal pred8x8l_top_dc_10_%1, 4,4,6
558 pinsrw m1, [r0+r1], 0
559 pinsrw m2, [r0+r2+14], 7
562 PRED4x4_LOWPASS m0, m2, m1, m0
585 ;-----------------------------------------------------------------------------
586 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
587 ;-----------------------------------------------------------------------------
588 ;TODO: see if scalar is faster
590 cglobal pred8x8l_dc_10_%1, 4,6,6
594 mova m0, [r0+r3*2-16]
595 punpckhwd m0, [r0+r3*1-16]
596 mova m1, [r4+r3*0-16]
597 punpckhwd m1, [r0+r5*1-16]
599 mova m2, [r4+r3*2-16]
600 punpckhwd m2, [r4+r3*1-16]
601 mova m3, [r4+r3*4-16]
602 punpckhwd m3, [r4+r5*1-16]
611 pinsrw m1, [r0+r1], 0
612 pinsrw m2, [r0+r2+14], 7
617 pshuflw m4, m4, 11100101b
618 pinsrw m5, [r0+r1-2], 7
619 PRED4x4_LOWPASS m3, m4, m5, m3
620 PRED4x4_LOWPASS m0, m2, m1, m0
644 ;-----------------------------------------------------------------------------
645 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
646 ;-----------------------------------------------------------------------------
647 %macro PRED8x8L_VERTICAL 1
648 cglobal pred8x8l_vertical_10_%1, 4,4,6
656 pinsrw m1, [r0+r1], 0
657 pinsrw m2, [r0+r2+14], 7
660 PRED4x4_LOWPASS m0, m2, m1, m0
673 PRED8x8L_VERTICAL sse2
676 PRED8x8L_VERTICAL avx
679 ;-----------------------------------------------------------------------------
680 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
681 ;-----------------------------------------------------------------------------
682 %macro PRED8x8L_HORIZONTAL 1
683 cglobal pred8x8l_horizontal_10_%1, 4,4,5
689 punpckhwd m0, [r0+r1-16]
690 mova m1, [r0+r3*2-16]
691 punpckhwd m1, [r0+r3*1-16]
695 mova m2, [r2+r3*0-16]
696 punpckhwd m2, [r0+r1-16]
697 mova m3, [r2+r3*2-16]
698 punpckhwd m3, [r2+r3*1-16]
701 PALIGNR m4, m3, [r2+r1-16], 14, m0
703 pshuflw m0, m0, 11100101b
704 PRED4x4_LOWPASS m4, m3, m0, m4
727 %define PALIGNR PALIGNR_MMX
728 PRED8x8L_HORIZONTAL sse2
729 %define PALIGNR PALIGNR_SSSE3
730 PRED8x8L_HORIZONTAL ssse3
733 PRED8x8L_HORIZONTAL avx
736 ;-----------------------------------------------------------------------------
737 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
738 ;-----------------------------------------------------------------------------
739 %macro PRED8x8L_DOWN_LEFT 1
740 cglobal pred8x8l_down_left_10_%1, 4,4,7
748 pinsrw m1, [r0+r1], 0
749 pinsrw m2, [r0+r2+14], 7
750 PRED4x4_LOWPASS m6, m2, m1, m3
751 jz .fix_tr ; flags from shr r2d
754 PALIGNR m2, m1, m3, 14, m3
755 pshufhw m5, m5, 10100100b
756 PRED4x4_LOWPASS m1, m2, m5, m1
761 PALIGNR m2, m1, m6, 2, m0
762 PALIGNR m3, m1, m6, 14, m0
763 PALIGNR m5, m1, 2, m0
765 PRED4x4_LOWPASS m6, m4, m2, m6
766 PRED4x4_LOWPASS m1, m3, m5, m1
768 PALIGNR m1, m6, 14, m2
771 PALIGNR m1, m6, 14, m2
774 PALIGNR m1, m6, 14, m2
777 PALIGNR m1, m6, 14, m2
780 PALIGNR m1, m6, 14, m2
783 PALIGNR m1, m6, 14, m2
786 PALIGNR m1, m6, 14, m6
796 %define PALIGNR PALIGNR_MMX
797 PRED8x8L_DOWN_LEFT sse2
798 %define PALIGNR PALIGNR_SSSE3
799 PRED8x8L_DOWN_LEFT ssse3
802 PRED8x8L_DOWN_LEFT avx
805 ;-----------------------------------------------------------------------------
806 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
807 ;-----------------------------------------------------------------------------
808 %macro PRED8x8L_DOWN_RIGHT 1
809 ; standard forbids this when has_topleft is false
811 cglobal pred8x8l_down_right_10_%1, 4,5,8
815 mova m0, [r0+r3*1-16]
816 punpckhwd m0, [r0+r3*0-16]
817 mova m1, [r0+r1*1-16]
818 punpckhwd m1, [r0+r3*2-16]
820 mova m2, [r4+r3*1-16]
821 punpckhwd m2, [r4+r3*0-16]
822 mova m3, [r4+r1*1-16]
823 punpckhwd m3, [r4+r3*2-16]
826 mova m0, [r4+r3*4-16]
828 PALIGNR m4, m3, m0, 14, m0
829 PALIGNR m1, m3, 2, m2
831 pshuflw m0, m0, 11100101b
832 PRED4x4_LOWPASS m6, m1, m4, m3
833 PRED4x4_LOWPASS m4, m3, m0, m4
839 pinsrw m2, [r0+r2+14], 7
840 PRED4x4_LOWPASS m3, m2, m1, m3
841 PALIGNR m2, m3, m6, 2, m0
842 PALIGNR m5, m3, m6, 14, m0
844 PRED4x4_LOWPASS m6, m4, m2, m6
845 PRED4x4_LOWPASS m3, m5, m7, m3
847 PALIGNR m3, m6, 14, m2
850 PALIGNR m3, m6, 14, m2
853 PALIGNR m3, m6, 14, m2
856 PALIGNR m3, m6, 14, m2
859 PALIGNR m3, m6, 14, m2
862 PALIGNR m3, m6, 14, m2
865 PALIGNR m3, m6, 14, m6
871 %define PALIGNR PALIGNR_MMX
872 PRED8x8L_DOWN_RIGHT sse2
873 %define PALIGNR PALIGNR_SSSE3
874 PRED8x8L_DOWN_RIGHT ssse3
877 PRED8x8L_DOWN_RIGHT avx
880 ;-----------------------------------------------------------------------------
881 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
882 ;-----------------------------------------------------------------------------
883 %macro PRED8x8L_VERTICAL_RIGHT 1
884 ; likewise with 8x8l_down_right
885 cglobal pred8x8l_vertical_right_10_%1, 4,5,7
889 mova m0, [r0+r3*1-16]
890 punpckhwd m0, [r0+r3*0-16]
891 mova m1, [r0+r1*1-16]
892 punpckhwd m1, [r0+r3*2-16]
894 mova m2, [r4+r3*1-16]
895 punpckhwd m2, [r4+r3*0-16]
896 mova m3, [r4+r1*1-16]
897 punpckhwd m3, [r4+r3*2-16]
900 mova m0, [r4+r3*4-16]
902 PALIGNR m4, m3, m0, 14, m0
903 PALIGNR m1, m3, 2, m2
904 PRED4x4_LOWPASS m3, m1, m4, m3
910 pinsrw m5, [r0+r2+14], 7
911 PRED4x4_LOWPASS m2, m5, m1, m2
912 PALIGNR m6, m2, m3, 12, m1
913 PALIGNR m5, m2, m3, 14, m0
914 PRED4x4_LOWPASS m0, m6, m2, m5
920 PRED4x4_LOWPASS m1, m3, m6, m1
921 PALIGNR m2, m1, 14, m4
924 PALIGNR m0, m1, 14, m3
927 PALIGNR m2, m1, 14, m4
930 PALIGNR m0, m1, 14, m3
933 PALIGNR m2, m1, 14, m4
936 PALIGNR m0, m1, 14, m1
942 %define PALIGNR PALIGNR_MMX
943 PRED8x8L_VERTICAL_RIGHT sse2
944 %define PALIGNR PALIGNR_SSSE3
945 PRED8x8L_VERTICAL_RIGHT ssse3
948 PRED8x8L_VERTICAL_RIGHT avx
951 ;-----------------------------------------------------------------------------
952 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
953 ;-----------------------------------------------------------------------------
954 %macro PRED8x8L_HORIZONTAL_UP 1
955 cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
956 mova m0, [r0+r3*0-16]
957 punpckhwd m0, [r0+r3*1-16]
962 mova m4, [r0+r1*1-16]
965 mova m1, [r0+r3*2-16]
966 punpckhwd m1, [r0+r1*1-16]
968 mova m2, [r2+r3*0-16]
969 punpckhwd m2, [r2+r3*1-16]
970 mova m3, [r2+r3*2-16]
971 punpckhwd m3, [r2+r1*1-16]
974 PALIGNR m1, m0, m4, 14, m4
976 pshufhw m2, m2, 10100100b
977 PRED4x4_LOWPASS m0, m1, m2, m0
980 pshufhw m1, m1, 10100100b
981 pshufhw m2, m2, 01010100b
983 PRED4x4_LOWPASS m1, m2, m0, m1
988 pshufd m0, m5, 11111001b
989 pshufd m1, m5, 11111110b
990 pshufd m2, m5, 11111111b
994 PALIGNR m2, m5, m4, 4, m0
995 PALIGNR m3, m5, m4, 8, m1
996 PALIGNR m5, m5, m4, 12, m4
1004 %define PALIGNR PALIGNR_MMX
1005 PRED8x8L_HORIZONTAL_UP sse2
1006 %define PALIGNR PALIGNR_SSSE3
1007 PRED8x8L_HORIZONTAL_UP ssse3
1010 PRED8x8L_HORIZONTAL_UP avx
1014 ;-----------------------------------------------------------------------------
1015 ; void pred16x16_vertical(pixel *src, int stride)
1016 ;-----------------------------------------------------------------------------
1019 mova [%1+mmsize], %3
1026 %macro PRED16x16_VERTICAL 1
1027 cglobal pred16x16_vertical_10_%1, 2,3
1031 mova m1, [r0+mmsize]
1037 MOV16 r0+r1*1, m0, m1, m2, m3
1038 MOV16 r0+r1*2, m0, m1, m2, m3
1046 PRED16x16_VERTICAL mmxext
1048 PRED16x16_VERTICAL sse2
1050 ;-----------------------------------------------------------------------------
1051 ; void pred16x16_horizontal(pixel *src, int stride)
1052 ;-----------------------------------------------------------------------------
1053 %macro PRED16x16_HORIZONTAL 1
1054 cglobal pred16x16_horizontal_10_%1, 2,3
1057 movd m0, [r0+r1*0-4]
1058 movd m1, [r0+r1*1-4]
1061 MOV16 r0+r1*0, m0, m0, m0, m0
1062 MOV16 r0+r1*1, m1, m1, m1, m1
1070 PRED16x16_HORIZONTAL mmxext
1072 PRED16x16_HORIZONTAL sse2
1074 ;-----------------------------------------------------------------------------
1075 ; void pred16x16_dc(pixel *src, int stride)
1076 ;-----------------------------------------------------------------------------
1077 %macro PRED16x16_DC 1
1078 cglobal pred16x16_dc_10_%1, 2,6
1082 paddw m0, [r0+mmsize]
1090 movzx r3d, word [r0]
1091 movzx r4d, word [r0+r1]
1094 movzx r2d, word [r0]
1096 movzx r2d, word [r0+r1]
1107 MOV16 r5+r1*0, m0, m0, m0, m0
1108 MOV16 r5+r1*1, m0, m0, m0, m0
1120 ;-----------------------------------------------------------------------------
1121 ; void pred16x16_top_dc(pixel *src, int stride)
1122 ;-----------------------------------------------------------------------------
1123 %macro PRED16x16_TOP_DC 1
1124 cglobal pred16x16_top_dc_10_%1, 2,3
1127 paddw m0, [r0+mmsize]
1139 MOV16 r0+r1*1, m0, m0, m0, m0
1140 MOV16 r0+r1*2, m0, m0, m0, m0
1148 PRED16x16_TOP_DC mmxext
1150 PRED16x16_TOP_DC sse2
1152 ;-----------------------------------------------------------------------------
1153 ; void pred16x16_left_dc(pixel *src, int stride)
1154 ;-----------------------------------------------------------------------------
1155 %macro PRED16x16_LEFT_DC 1
1156 cglobal pred16x16_left_dc_10_%1, 2,6
1160 movzx r3d, word [r0]
1161 movzx r4d, word [r0+r1]
1164 movzx r2d, word [r0]
1166 movzx r2d, word [r0+r1]
1176 MOV16 r5+r1*0, m0, m0, m0, m0
1177 MOV16 r5+r1*1, m0, m0, m0, m0
1185 PRED16x16_LEFT_DC mmxext
1187 PRED16x16_LEFT_DC sse2
1189 ;-----------------------------------------------------------------------------
1190 ; void pred16x16_128_dc(pixel *src, int stride)
1191 ;-----------------------------------------------------------------------------
1192 %macro PRED16x16_128_DC 1
1193 cglobal pred16x16_128_dc_10_%1, 2,3
1197 MOV16 r0+r1*0, m0, m0, m0, m0
1198 MOV16 r0+r1*1, m0, m0, m0, m0
1206 PRED16x16_128_DC mmxext
1208 PRED16x16_128_DC sse2