1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
37 ; dest, left, right, src
38 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
39 %macro PRED4x4_LOWPASS 4
45 ;-----------------------------------------------------------------------------
46 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
47 ;-----------------------------------------------------------------------------
49 cglobal pred4x4_down_right_10_%1, 3,3
53 movhps m2, [r0+r2*1-8]
58 PALIGNR m3, m1, 10, m1
60 movhps m4, [r1+r2*1-8]
61 PALIGNR m3, m4, 14, m4
63 movhps m4, [r1+r2*2-8]
64 PALIGNR m3, m4, 14, m4
65 PRED4x4_LOWPASS m0, m3, m1, m2
77 %define PALIGNR PALIGNR_MMX
79 %define PALIGNR PALIGNR_SSSE3
86 ;-----------------------------------------------------------------------------
87 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
88 ;-----------------------------------------------------------------------------
90 cglobal pred4x4_vertical_right_10_%1, 3,3,6
93 movq m5, [r0] ; ........t3t2t1t0
95 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
97 movhps m1, [r0+r2*1-8]
98 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
100 movhps m2, [r0+r2*2-8]
101 PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
103 movhps m3, [r1+r2*1-8]
104 PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2
105 PRED4x4_LOWPASS m3, m1, m0, m2
110 PALIGNR m5, m1, 14, m2
113 PALIGNR m3, m1, 14, m1
119 %define PALIGNR PALIGNR_MMX
121 %define PALIGNR PALIGNR_SSSE3
128 ;-----------------------------------------------------------------------------
129 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
130 ;-----------------------------------------------------------------------------
132 cglobal pred4x4_horizontal_down_10_%1, 3,3
135 movq m0, [r0-8] ; lt ..
137 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
138 movq m1, [r1+r2*2-8] ; l3
140 punpcklwd m1, m3 ; l2 l3
141 movq m2, [r0+r2*2-8] ; l1
143 punpcklwd m2, m3 ; l0 l1
144 punpckhdq m1, m2 ; l0 l1 l2 l3
145 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
146 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
147 psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
149 PRED4x4_LOWPASS m3, m1, m0, m2
152 PALIGNR m3, m5, 12, m4
162 %define PALIGNR PALIGNR_MMX
164 %define PALIGNR PALIGNR_SSSE3
171 ;-----------------------------------------------------------------------------
172 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
173 ;-----------------------------------------------------------------------------
174 %macro HADDD 2 ; sum junk
192 cglobal pred4x4_dc_10_mmxext, 3,3
196 paddw m2, [r0+r2*2-8]
197 paddw m2, [r1+r2*1-8]
198 paddw m2, [r1+r2*2-8]
212 ;-----------------------------------------------------------------------------
213 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
214 ;-----------------------------------------------------------------------------
217 cglobal pred4x4_down_left_10_%1, 3,3
225 PRED4x4_LOWPASS m0, m5, m3, m1
244 ;-----------------------------------------------------------------------------
245 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
246 ;-----------------------------------------------------------------------------
248 cglobal pred4x4_vertical_left_10_%1, 3,3
255 PRED4x4_LOWPASS m0, m1, m2, m3
273 ;-----------------------------------------------------------------------------
274 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
275 ;-----------------------------------------------------------------------------
277 cglobal pred4x4_horizontal_up_10_mmxext, 3,3
281 punpckhwd m0, [r0+r2*2-8]
283 punpckhwd m1, [r1+r2*2-8]
288 pshufw m2, m0, 11111001b
292 pshufw m5, m0, 11111110b
293 PRED4x4_LOWPASS m3, m0, m5, m1
307 ;-----------------------------------------------------------------------------
308 ; void pred8x8_vertical(pixel *src, int stride)
309 ;-----------------------------------------------------------------------------
311 cglobal pred8x8_vertical_10_sse2, 2,2
323 ;-----------------------------------------------------------------------------
324 ; void pred8x8_horizontal(pixel *src, int stride)
325 ;-----------------------------------------------------------------------------
327 cglobal pred8x8_horizontal_10_sse2, 2,3
343 ;-----------------------------------------------------------------------------
345 ; void pred8x8_dc(pixel *src, int stride)
345 ;-----------------------------------------------------------------------------
347 ; sort of a hack, but it works
357 cglobal pred8x8_dc_10_%1, 2,4
371 movzx r2d, word [r0+r1*1-2]
372 movzx r3d, word [r0+r1*2-2]
375 movzx r3d, word [r0+r1*1-2]
377 movzx r3d, word [r0+r1*2-2]
382 movzx r2d, word [r0+r1*1-2]
383 movzx r3d, word [r0+r1*2-2]
386 movzx r3d, word [r0+r1*1-2]
388 movzx r3d, word [r0+r1*2-2]
395 punpckldq m0, m2 ; s0, s1, s2, s3
396 %2 m3, m0, 11110110b ; s2, s1, s3, s3
398 %2 m0, m0, 01110100b ; s0, s1, s3, s1
402 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
405 pshufd m3, m0, 11111010b
426 PRED8x8_DC mmxext, pshufw
428 PRED8x8_DC sse2 , pshuflw
430 ;-----------------------------------------------------------------------------
431 ; void pred8x8_top_dc(pixel *src, int stride)
432 ;-----------------------------------------------------------------------------
433 %macro PRED8x8_TOP_DC 2
434 cglobal pred8x8_top_dc_10_%1, 2,4
463 PRED8x8_TOP_DC mmxext, pshufw
465 PRED8x8_TOP_DC sse2 , pshuflw
469 ;-----------------------------------------------------------------------------
470 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
471 ;-----------------------------------------------------------------------------
472 %macro PRED8x8L_TOP_DC 1
473 cglobal pred8x8l_top_dc_10_%1, 4,4,6
481 PALIGNR m2, m0, 14, m0
482 PALIGNR m1, m4, 2, m4
483 test r1, r1 ; top_left
485 test r2, r2 ; top_right
494 test r2, r2 ; top_right
505 PRED4x4_LOWPASS m0, m2, m1, m3
522 %define PALIGNR PALIGNR_MMX
524 %define PALIGNR PALIGNR_SSSE3
525 PRED8x8L_TOP_DC ssse3
527 ;-----------------------------------------------------------------------------
528 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
529 ;-----------------------------------------------------------------------------
530 ;TODO: see if scalar is faster
532 cglobal pred8x8l_dc_10_%1, 4,5,8
535 mova m0, [r0+r3*1-16]
536 punpckhwd m0, [r0+r3*0-16]
537 mova m1, [r4+r3*1-16]
538 punpckhwd m1, [r0+r3*2-16]
542 mova m2, [r0+r3*1-16]
543 punpckhwd m2, [r0+r3*0-16]
545 mova m3, [r0+r3*1-16]
546 punpckhwd m3, [r0+r3*0-16]
550 mova m0, [r0+r3*0-16]
555 PALIGNR m4, m0, 14, m0
556 PALIGNR m1, m2, 2, m2
583 PRED4x4_LOWPASS m2, m1, m4, m3
586 PRED4x4_LOWPASS m1, m3, m0, m4
588 PALIGNR m7, m1, 14, m3
594 PALIGNR m2, m0, 14, m0
595 PALIGNR m1, m4, 2, m4
602 PRED4x4_LOWPASS m6, m2, m1, m3
622 %define PALIGNR PALIGNR_MMX
624 %define PALIGNR PALIGNR_SSSE3
627 ;-----------------------------------------------------------------------------
628 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
629 ;-----------------------------------------------------------------------------
630 %macro PRED8x8L_VERTICAL 1
631 cglobal pred8x8l_vertical_10_%1, 4,4,6
638 PALIGNR m2, m0, 14, m0
639 PALIGNR m1, m4, 2, m4
640 test r1, r1 ; top_left
642 test r2, r2 ; top_right
651 test r2, r2 ; top_right
662 PRED4x4_LOWPASS m0, m2, m1, m3
675 %define PALIGNR PALIGNR_MMX
676 PRED8x8L_VERTICAL sse2
677 %define PALIGNR PALIGNR_SSSE3
678 PRED8x8L_VERTICAL ssse3
680 ;-----------------------------------------------------------------------------
681 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
682 ;-----------------------------------------------------------------------------
683 %macro PRED8x8L_HORIZONTAL 1
684 cglobal pred8x8l_horizontal_10_%1, 4,4,8
687 mova m0, [r0+r3*1-16]
691 punpckhwd m0, [r1+r3*0-16]
692 mova m1, [r2+r3*1-16]
693 punpckhwd m1, [r0+r3*2-16]
697 mova m2, [r0+r3*1-16]
698 punpckhwd m2, [r0+r3*0-16]
700 mova m3, [r0+r3*1-16]
701 punpckhwd m3, [r0+r3*0-16]
705 mova m0, [r0+r3*0-16]
706 mova m1, [r1+r3*0-16]
710 PALIGNR m4, m0, 14, m0
711 PALIGNR m1, m2, 2, m2
713 PRED4x4_LOWPASS m2, m1, m4, m3
716 PRED4x4_LOWPASS m1, m3, m0, m4
718 PALIGNR m7, m1, 14, m3
743 %define PALIGNR PALIGNR_MMX
744 PRED8x8L_HORIZONTAL sse2
745 %define PALIGNR PALIGNR_SSSE3
746 PRED8x8L_HORIZONTAL ssse3
748 ;-----------------------------------------------------------------------------
749 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
750 ;-----------------------------------------------------------------------------
751 %macro PRED8x8L_DOWN_LEFT 1
752 cglobal pred8x8l_down_left_10_%1, 4,4,8
759 PALIGNR m2, m0, 14, m0
760 PALIGNR m1, m4, 2, m4
786 PRED4x4_LOWPASS m4, m2, m1, m3
795 PALIGNR m2, m3, 14, m3
796 PALIGNR m5, m4, 2, m4
797 PRED4x4_LOWPASS m1, m2, m5, m0
805 PALIGNR m2, m7, 2, m0
807 PALIGNR m3, m7, 14, m0
808 PALIGNR m4, m6, 2, m0
813 PRED4x4_LOWPASS m0, m1, m2, m5
814 PRED4x4_LOWPASS m1, m3, m4, m7
860 %define PALIGNR PALIGNR_MMX
861 PRED8x8L_DOWN_LEFT sse2
862 %define PALIGNR PALIGNR_SSSE3
863 PRED8x8L_DOWN_LEFT ssse3
865 ;-----------------------------------------------------------------------------
866 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
867 ;-----------------------------------------------------------------------------
868 %macro PRED8x8L_DOWN_RIGHT 1
869 cglobal pred8x8l_down_right_10_%1, 4,5,8
872 mova m0, [r0+r3*1-16]
873 punpckhwd m0, [r0+r3*0-16]
874 mova m1, [r4+r3*1-16]
875 punpckhwd m1, [r0+r3*2-16]
879 mova m2, [r0+r3*1-16]
880 punpckhwd m2, [r0+r3*0-16]
882 mova m3, [r0+r3*1-16]
883 punpckhwd m3, [r0+r3*0-16]
887 mova m0, [r0+r3*0-16]
892 PALIGNR m4, m0, 14, m0
893 PALIGNR m1, m2, 2, m2
894 test r1, r1 ; top_left
898 PRED4x4_LOWPASS m2, m1, m4, m3
902 PRED4x4_LOWPASS m1, m3, m0, m4
904 PALIGNR m7, m1, 14, m3
910 PALIGNR m2, m0, 14, m0
911 PALIGNR m1, m4, 2, m4
912 test r1, r1 ; top_left
914 test r2, r2 ; top_right
917 PRED4x4_LOWPASS m4, m2, m1, m3
933 test r2, r2 ; top_right
949 PALIGNR m2, m6, 2, m0
951 PALIGNR m3, m6, 14, m0
954 PRED4x4_LOWPASS m0, m1, m2, m5
955 PRED4x4_LOWPASS m1, m3, m4, m7
1001 %define PALIGNR PALIGNR_MMX
1002 PRED8x8L_DOWN_RIGHT sse2
1003 %define PALIGNR PALIGNR_SSSE3
1004 PRED8x8L_DOWN_RIGHT ssse3
1006 ;-----------------------------------------------------------------------------
1007 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
1008 ;-----------------------------------------------------------------------------
1009 %macro PRED8x8L_VERTICAL_RIGHT 1
1010 cglobal pred8x8l_vertical_right_10_%1, 4,5,8
1013 mova m0, [r0+r3*1-16]
1014 punpckhwd m0, [r0+r3*0-16]
1015 mova m1, [r4+r3*1-16]
1016 punpckhwd m1, [r0+r3*2-16]
1020 mova m2, [r0+r3*1-16]
1021 punpckhwd m2, [r0+r3*0-16]
1023 mova m3, [r0+r3*1-16]
1024 punpckhwd m3, [r0+r3*0-16]
1028 mova m0, [r0+r3*0-16]
1033 PALIGNR m4, m0, 14, m0
1034 PALIGNR m1, m2, 2, m2
1062 PRED4x4_LOWPASS m2, m1, m4, m3
1069 PALIGNR m2, m0, 14, m0
1070 PALIGNR m1, m4, 2, m4
1076 PRED4x4_LOWPASS m6, m2, m1, m3
1080 PALIGNR m3, m7, 14, m0
1081 PALIGNR m6, m7, 12, m1
1085 PRED4x4_LOWPASS m0, m6, m2, m4
1095 PRED4x4_LOWPASS m0, m1, m3, m2
1096 PALIGNR m6, m0, 14, m2
1099 PALIGNR m5, m0, 14, m1
1102 PALIGNR m6, m0, 14, m2
1105 PALIGNR m5, m0, 14, m1
1108 PALIGNR m6, m0, 14, m2
1111 PALIGNR m5, m0, 14, m1
1117 %define PALIGNR PALIGNR_MMX
1118 PRED8x8L_VERTICAL_RIGHT sse2
1119 %define PALIGNR PALIGNR_SSSE3
1120 PRED8x8L_VERTICAL_RIGHT ssse3
1122 ;-----------------------------------------------------------------------------
1123 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
1124 ;-----------------------------------------------------------------------------
1125 %macro PRED8x8L_HORIZONTAL_UP 1
1126 cglobal pred8x8l_horizontal_up_10_%1, 4,4,8
1129 mova m0, [r0+r3*1-16]
1133 punpckhwd m0, [r1+r3*0-16]
1134 mova m1, [r2+r3*1-16]
1135 punpckhwd m1, [r0+r3*2-16]
1139 mova m2, [r0+r3*1-16]
1140 punpckhwd m2, [r0+r3*0-16]
1142 mova m3, [r0+r3*1-16]
1143 punpckhwd m3, [r0+r3*0-16]
1147 mova m0, [r0+r3*0-16]
1148 mova m1, [r1+r3*0-16]
1152 PALIGNR m4, m0, 14, m0
1153 PALIGNR m1, m2, 2, m2
1155 PRED4x4_LOWPASS m2, m1, m4, m3
1158 PRED4x4_LOWPASS m1, m3, m0, m4
1160 PALIGNR m7, m1, 14, m3
1162 pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1163 pslldq m7, 14 ; l7 .. .. .. .. .. .. ..
1167 por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
1174 por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1
1176 por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2
1178 PRED4x4_LOWPASS m1, m3, m5, m2
1180 punpcklwd m4, m1 ; p4 p3 p2 p1
1181 punpckhwd m5, m1 ; p8 p7 p6 p5
1185 PALIGNR m5, m4, 4, m1
1186 pshufd m1, m6, 11111001b
1187 PALIGNR m6, m4, 8, m2
1188 pshufd m2, m7, 11111110b
1189 PALIGNR m7, m4, 12, m3
1190 pshufd m3, m0, 11111111b
1203 %define PALIGNR PALIGNR_MMX
1204 PRED8x8L_HORIZONTAL_UP sse2
1205 %define PALIGNR PALIGNR_SSSE3
1206 PRED8x8L_HORIZONTAL_UP ssse3
1210 ;-----------------------------------------------------------------------------
1211 ; void pred16x16_vertical(pixel *src, int stride)
1212 ;-----------------------------------------------------------------------------
1215 mova [%1+mmsize], %3
1222 %macro PRED16x16_VERTICAL 1
1223 cglobal pred16x16_vertical_10_%1, 2,3
1227 mova m1, [r0+mmsize]
1233 MOV16 r0+r1*1, m0, m1, m2, m3
1234 MOV16 r0+r1*2, m0, m1, m2, m3
1242 PRED16x16_VERTICAL mmxext
1244 PRED16x16_VERTICAL sse2
1246 ;-----------------------------------------------------------------------------
1247 ; void pred16x16_horizontal(pixel *src, int stride)
1248 ;-----------------------------------------------------------------------------
1249 %macro PRED16x16_HORIZONTAL 1
1250 cglobal pred16x16_horizontal_10_%1, 2,3
1253 movd m0, [r0+r1*0-4]
1254 movd m1, [r0+r1*1-4]
1257 MOV16 r0+r1*0, m0, m0, m0, m0
1258 MOV16 r0+r1*1, m1, m1, m1, m1
1266 PRED16x16_HORIZONTAL mmxext
1268 PRED16x16_HORIZONTAL sse2