1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86inc.asm"
26 %include "libavutil/x86/x86util.asm"
36 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
38 pw_pixel_max: times 8 dw ((1 << 10)-1)
39 pw_512: times 8 dw 512
45 ; dest, left, right, src
46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47 %macro PRED4x4_LOWPASS 4
53 ;-----------------------------------------------------------------------------
54 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
55 ;-----------------------------------------------------------------------------
57 cglobal pred4x4_down_right_10, 3, 3
61 movhps m2, [r0+r2*1-8]
66 PALIGNR m3, m1, 10, m1
67 movhps m4, [r1+r2*1-8]
68 PALIGNR m0, m3, m4, 14, m4
69 movhps m4, [r1+r2*2-8]
70 PALIGNR m2, m0, m4, 14, m4
71 PRED4x4_LOWPASS m0, m2, m3, m0
83 %define PALIGNR PALIGNR_MMX
86 %define PALIGNR PALIGNR_SSSE3
93 ;-----------------------------------------------------------------------------
94 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
95 ;-----------------------------------------------------------------------------
97 cglobal pred4x4_vertical_right_10, 3, 3, 6
100 movq m5, [r0] ; ........t3t2t1t0
102 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
104 movhps m1, [r0+r2*1-8]
105 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
106 movhps m2, [r0+r2*2-8]
107 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
108 movhps m3, [r1+r2*1-8]
109 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
110 PRED4x4_LOWPASS m1, m0, m2, m1
115 PALIGNR m5, m0, 14, m2
118 PALIGNR m1, m0, 14, m0
124 %define PALIGNR PALIGNR_MMX
127 %define PALIGNR PALIGNR_SSSE3
129 %if HAVE_AVX_EXTERNAL
134 ;-----------------------------------------------------------------------------
135 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
136 ;-----------------------------------------------------------------------------
138 cglobal pred4x4_horizontal_down_10, 3, 3
141 movq m0, [r0-8] ; lt ..
143 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
144 movq m1, [r1+r2*2-8] ; l3
146 punpcklwd m1, m3 ; l2 l3
147 movq m2, [r0+r2*2-8] ; l1
149 punpcklwd m2, m3 ; l0 l1
150 punpckhdq m1, m2 ; l0 l1 l2 l3
151 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
152 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
153 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
155 PRED4x4_LOWPASS m3, m1, m0, m3
158 PALIGNR m3, m5, 12, m4
168 %define PALIGNR PALIGNR_MMX
171 %define PALIGNR PALIGNR_SSSE3
173 %if HAVE_AVX_EXTERNAL
178 ;-----------------------------------------------------------------------------
179 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
180 ;-----------------------------------------------------------------------------
181 %macro HADDD 2 ; sum junk
199 cglobal pred4x4_dc_10, 3, 3
203 paddw m2, [r0+r2*2-8]
204 paddw m2, [r1+r2*1-8]
205 paddw m2, [r1+r2*2-8]
219 ;-----------------------------------------------------------------------------
220 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
221 ;-----------------------------------------------------------------------------
223 cglobal pred4x4_down_left_10, 3, 3
229 pshufhw m2, m2, 10100100b
230 PRED4x4_LOWPASS m0, m3, m2, m0
244 %if HAVE_AVX_EXTERNAL
249 ;-----------------------------------------------------------------------------
250 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
251 ;-----------------------------------------------------------------------------
253 cglobal pred4x4_vertical_left_10, 3, 3
260 PRED4x4_LOWPASS m0, m1, m2, m0
273 %if HAVE_AVX_EXTERNAL
278 ;-----------------------------------------------------------------------------
279 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
280 ;-----------------------------------------------------------------------------
282 cglobal pred4x4_horizontal_up_10, 3, 3
286 punpckhwd m0, [r0+r2*2-8]
288 punpckhwd m1, [r1+r2*2-8]
293 pshufw m2, m0, 11111001b
297 pshufw m5, m0, 11111110b
298 PRED4x4_LOWPASS m1, m0, m5, m1
312 ;-----------------------------------------------------------------------------
313 ; void pred8x8_vertical(pixel *src, int stride)
314 ;-----------------------------------------------------------------------------
316 cglobal pred8x8_vertical_10, 2, 2
328 ;-----------------------------------------------------------------------------
329 ; void pred8x8_horizontal(pixel *src, int stride)
330 ;-----------------------------------------------------------------------------
332 cglobal pred8x8_horizontal_10, 2, 3
348 ;-----------------------------------------------------------------------------
349 ; void pred8x8_dc(pixel *src, int stride)
350 ;-----------------------------------------------------------------------------
352 ; sort of a hack, but it works
362 cglobal pred8x8_dc_10, 2, 6
372 pshufw m2, m0, 00001110b
373 pshufw m3, m1, 00001110b
383 movzx r2d, word [r0+r1*1-2]
384 movzx r3d, word [r0+r1*2-2]
386 movzx r3d, word [r0+r5*1-2]
388 movzx r3d, word [r4-2]
392 movzx r2d, word [r4+r1*1-2]
393 movzx r3d, word [r4+r1*2-2]
395 movzx r3d, word [r4+r5*1-2]
397 movzx r3d, word [r4+r1*4-2]
402 punpckldq m0, m2 ; s0, s1, s2, s3
403 %1 m3, m0, 11110110b ; s2, s1, s3, s3
404 %1 m0, m0, 01110100b ; s0, s1, s3, s1
407 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
410 pshufd m3, m0, 11111010b
435 ;-----------------------------------------------------------------------------
436 ; void pred8x8_top_dc(pixel *src, int stride)
437 ;-----------------------------------------------------------------------------
439 cglobal pred8x8_top_dc_10, 2, 4
462 ;-----------------------------------------------------------------------------
463 ; void pred8x8_plane(pixel *src, int stride)
464 ;-----------------------------------------------------------------------------
466 cglobal pred8x8_plane_10, 2, 7, 7
471 pmaddwd m2, [pw_m32101234]
479 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
480 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
481 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
483 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
484 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
487 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
488 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
492 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
493 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
502 mova m3, [pw_pixel_max]
507 pmullw m2, [pw_m32101234] ; b
508 pmullw m5, m4, [pw_m3] ; c
525 ;-----------------------------------------------------------------------------
526 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
527 ;-----------------------------------------------------------------------------
528 %macro PRED8x8L_128_DC 0
529 cglobal pred8x8l_128_dc_10, 4, 4
530 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
549 ;-----------------------------------------------------------------------------
550 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
551 ;-----------------------------------------------------------------------------
552 %macro PRED8x8L_TOP_DC 0
553 cglobal pred8x8l_top_dc_10, 4, 4, 6
561 pinsrw m1, [r0+r1], 0
562 pinsrw m2, [r0+r2+14], 7
565 PRED4x4_LOWPASS m0, m2, m1, m0
583 %if HAVE_AVX_EXTERNAL
588 ;-----------------------------------------------------------------------------
589 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
590 ;-----------------------------------------------------------------------------
591 ;TODO: see if scalar is faster
593 cglobal pred8x8l_dc_10, 4, 6, 6
597 mova m0, [r0+r3*2-16]
598 punpckhwd m0, [r0+r3*1-16]
599 mova m1, [r4+r3*0-16]
600 punpckhwd m1, [r0+r5*1-16]
602 mova m2, [r4+r3*2-16]
603 punpckhwd m2, [r4+r3*1-16]
604 mova m3, [r4+r3*4-16]
605 punpckhwd m3, [r4+r5*1-16]
614 pinsrw m1, [r0+r1], 0
615 pinsrw m2, [r0+r2+14], 7
620 pshuflw m4, m4, 11100101b
621 pinsrw m5, [r0+r1-2], 7
622 PRED4x4_LOWPASS m3, m4, m5, m3
623 PRED4x4_LOWPASS m0, m2, m1, m0
642 %if HAVE_AVX_EXTERNAL
647 ;-----------------------------------------------------------------------------
648 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
649 ;-----------------------------------------------------------------------------
650 %macro PRED8x8L_VERTICAL 0
651 cglobal pred8x8l_vertical_10, 4, 4, 6
659 pinsrw m1, [r0+r1], 0
660 pinsrw m2, [r0+r2+14], 7
663 PRED4x4_LOWPASS m0, m2, m1, m0
677 %if HAVE_AVX_EXTERNAL
682 ;-----------------------------------------------------------------------------
683 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
684 ;-----------------------------------------------------------------------------
685 %macro PRED8x8L_HORIZONTAL 0
686 cglobal pred8x8l_horizontal_10, 4, 4, 5
692 punpckhwd m0, [r0+r1-16]
693 mova m1, [r0+r3*2-16]
694 punpckhwd m1, [r0+r3*1-16]
698 mova m2, [r2+r3*0-16]
699 punpckhwd m2, [r0+r1-16]
700 mova m3, [r2+r3*2-16]
701 punpckhwd m3, [r2+r3*1-16]
704 PALIGNR m4, m3, [r2+r1-16], 14, m0
706 pshuflw m0, m0, 11100101b
707 PRED4x4_LOWPASS m4, m3, m0, m4
730 %define PALIGNR PALIGNR_MMX
733 %define PALIGNR PALIGNR_SSSE3
735 %if HAVE_AVX_EXTERNAL
740 ;-----------------------------------------------------------------------------
741 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
742 ;-----------------------------------------------------------------------------
743 %macro PRED8x8L_DOWN_LEFT 0
744 cglobal pred8x8l_down_left_10, 4, 4, 7
752 pinsrw m1, [r0+r1], 0
753 pinsrw m2, [r0+r2+14], 7
754 PRED4x4_LOWPASS m6, m2, m1, m3
755 jz .fix_tr ; flags from shr r2d
758 PALIGNR m2, m1, m3, 14, m3
759 pshufhw m5, m5, 10100100b
760 PRED4x4_LOWPASS m1, m2, m5, m1
765 PALIGNR m2, m1, m6, 2, m0
766 PALIGNR m3, m1, m6, 14, m0
767 PALIGNR m5, m1, 2, m0
769 PRED4x4_LOWPASS m6, m4, m2, m6
770 PRED4x4_LOWPASS m1, m3, m5, m1
772 PALIGNR m1, m6, 14, m2
775 PALIGNR m1, m6, 14, m2
778 PALIGNR m1, m6, 14, m2
781 PALIGNR m1, m6, 14, m2
784 PALIGNR m1, m6, 14, m2
787 PALIGNR m1, m6, 14, m2
790 PALIGNR m1, m6, 14, m6
800 %define PALIGNR PALIGNR_MMX
803 %define PALIGNR PALIGNR_SSSE3
805 %if HAVE_AVX_EXTERNAL
810 ;-----------------------------------------------------------------------------
811 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
812 ;-----------------------------------------------------------------------------
813 %macro PRED8x8L_DOWN_RIGHT 0
814 ; the H.264 standard forbids this mode when has_topleft is false
816 cglobal pred8x8l_down_right_10, 4, 5, 8
820 mova m0, [r0+r3*1-16]
821 punpckhwd m0, [r0+r3*0-16]
822 mova m1, [r0+r1*1-16]
823 punpckhwd m1, [r0+r3*2-16]
825 mova m2, [r4+r3*1-16]
826 punpckhwd m2, [r4+r3*0-16]
827 mova m3, [r4+r1*1-16]
828 punpckhwd m3, [r4+r3*2-16]
831 mova m0, [r4+r3*4-16]
833 PALIGNR m4, m3, m0, 14, m0
834 PALIGNR m1, m3, 2, m2
836 pshuflw m0, m0, 11100101b
837 PRED4x4_LOWPASS m6, m1, m4, m3
838 PRED4x4_LOWPASS m4, m3, m0, m4
844 pinsrw m2, [r0+r2+14], 7
845 PRED4x4_LOWPASS m3, m2, m1, m3
846 PALIGNR m2, m3, m6, 2, m0
847 PALIGNR m5, m3, m6, 14, m0
849 PRED4x4_LOWPASS m6, m4, m2, m6
850 PRED4x4_LOWPASS m3, m5, m7, m3
852 PALIGNR m3, m6, 14, m2
855 PALIGNR m3, m6, 14, m2
858 PALIGNR m3, m6, 14, m2
861 PALIGNR m3, m6, 14, m2
864 PALIGNR m3, m6, 14, m2
867 PALIGNR m3, m6, 14, m2
870 PALIGNR m3, m6, 14, m6
876 %define PALIGNR PALIGNR_MMX
879 %define PALIGNR PALIGNR_SSSE3
881 %if HAVE_AVX_EXTERNAL
886 ;-----------------------------------------------------------------------------
887 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
888 ;-----------------------------------------------------------------------------
889 %macro PRED8x8L_VERTICAL_RIGHT 0
890 ; as with pred8x8l_down_right, the H.264 standard forbids this mode when has_topleft is false
891 cglobal pred8x8l_vertical_right_10, 4, 5, 7
895 mova m0, [r0+r3*1-16]
896 punpckhwd m0, [r0+r3*0-16]
897 mova m1, [r0+r1*1-16]
898 punpckhwd m1, [r0+r3*2-16]
900 mova m2, [r4+r3*1-16]
901 punpckhwd m2, [r4+r3*0-16]
902 mova m3, [r4+r1*1-16]
903 punpckhwd m3, [r4+r3*2-16]
906 mova m0, [r4+r3*4-16]
908 PALIGNR m4, m3, m0, 14, m0
909 PALIGNR m1, m3, 2, m2
910 PRED4x4_LOWPASS m3, m1, m4, m3
916 pinsrw m5, [r0+r2+14], 7
917 PRED4x4_LOWPASS m2, m5, m1, m2
918 PALIGNR m6, m2, m3, 12, m1
919 PALIGNR m5, m2, m3, 14, m0
920 PRED4x4_LOWPASS m0, m6, m2, m5
926 PRED4x4_LOWPASS m1, m3, m6, m1
927 PALIGNR m2, m1, 14, m4
930 PALIGNR m0, m1, 14, m3
933 PALIGNR m2, m1, 14, m4
936 PALIGNR m0, m1, 14, m3
939 PALIGNR m2, m1, 14, m4
942 PALIGNR m0, m1, 14, m1
948 %define PALIGNR PALIGNR_MMX
949 PRED8x8L_VERTICAL_RIGHT
951 %define PALIGNR PALIGNR_SSSE3
952 PRED8x8L_VERTICAL_RIGHT
953 %if HAVE_AVX_EXTERNAL
955 PRED8x8L_VERTICAL_RIGHT
958 ;-----------------------------------------------------------------------------
959 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
960 ;-----------------------------------------------------------------------------
961 %macro PRED8x8L_HORIZONTAL_UP 0
962 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
963 mova m0, [r0+r3*0-16]
964 punpckhwd m0, [r0+r3*1-16]
969 mova m4, [r0+r1*1-16]
972 mova m1, [r0+r3*2-16]
973 punpckhwd m1, [r0+r1*1-16]
975 mova m2, [r2+r3*0-16]
976 punpckhwd m2, [r2+r3*1-16]
977 mova m3, [r2+r3*2-16]
978 punpckhwd m3, [r2+r1*1-16]
981 PALIGNR m1, m0, m4, 14, m4
983 pshufhw m2, m2, 10100100b
984 PRED4x4_LOWPASS m0, m1, m2, m0
987 pshufhw m1, m1, 10100100b
988 pshufhw m2, m2, 01010100b
990 PRED4x4_LOWPASS m1, m2, m0, m1
995 pshufd m0, m5, 11111001b
996 pshufd m1, m5, 11111110b
997 pshufd m2, m5, 11111111b
1001 PALIGNR m2, m5, m4, 4, m0
1002 PALIGNR m3, m5, m4, 8, m1
1003 PALIGNR m5, m5, m4, 12, m4
1011 %define PALIGNR PALIGNR_MMX
1012 PRED8x8L_HORIZONTAL_UP
1014 %define PALIGNR PALIGNR_SSSE3
1015 PRED8x8L_HORIZONTAL_UP
1016 %if HAVE_AVX_EXTERNAL
1018 PRED8x8L_HORIZONTAL_UP
1022 ;-----------------------------------------------------------------------------
1023 ; void pred16x16_vertical(pixel *src, int stride)
1024 ;-----------------------------------------------------------------------------
1027 mova [%1+mmsize], %3
1034 %macro PRED16x16_VERTICAL 0
1035 cglobal pred16x16_vertical_10, 2, 3
1039 mova m1, [r0+mmsize]
1045 MOV16 r0+r1*1, m0, m1, m2, m3
1046 MOV16 r0+r1*2, m0, m1, m2, m3
1058 ;-----------------------------------------------------------------------------
1059 ; void pred16x16_horizontal(pixel *src, int stride)
1060 ;-----------------------------------------------------------------------------
1061 %macro PRED16x16_HORIZONTAL 0
1062 cglobal pred16x16_horizontal_10, 2, 3
1065 movd m0, [r0+r1*0-4]
1066 movd m1, [r0+r1*1-4]
1069 MOV16 r0+r1*0, m0, m0, m0, m0
1070 MOV16 r0+r1*1, m1, m1, m1, m1
1078 PRED16x16_HORIZONTAL
1080 PRED16x16_HORIZONTAL
1082 ;-----------------------------------------------------------------------------
1083 ; void pred16x16_dc(pixel *src, int stride)
1084 ;-----------------------------------------------------------------------------
1085 %macro PRED16x16_DC 0
1086 cglobal pred16x16_dc_10, 2, 6
1090 paddw m0, [r0+mmsize]
1098 movzx r3d, word [r0]
1099 movzx r4d, word [r0+r1]
1102 movzx r2d, word [r0]
1104 movzx r2d, word [r0+r1]
1115 MOV16 r5+r1*0, m0, m0, m0, m0
1116 MOV16 r5+r1*1, m0, m0, m0, m0
1128 ;-----------------------------------------------------------------------------
1129 ; void pred16x16_top_dc(pixel *src, int stride)
1130 ;-----------------------------------------------------------------------------
1131 %macro PRED16x16_TOP_DC 0
1132 cglobal pred16x16_top_dc_10, 2, 3
1135 paddw m0, [r0+mmsize]
1147 MOV16 r0+r1*1, m0, m0, m0, m0
1148 MOV16 r0+r1*2, m0, m0, m0, m0
1160 ;-----------------------------------------------------------------------------
1161 ; void pred16x16_left_dc(pixel *src, int stride)
1162 ;-----------------------------------------------------------------------------
1163 %macro PRED16x16_LEFT_DC 0
1164 cglobal pred16x16_left_dc_10, 2, 6
1168 movzx r3d, word [r0]
1169 movzx r4d, word [r0+r1]
1172 movzx r2d, word [r0]
1174 movzx r2d, word [r0+r1]
1184 MOV16 r5+r1*0, m0, m0, m0, m0
1185 MOV16 r5+r1*1, m0, m0, m0, m0
1197 ;-----------------------------------------------------------------------------
1198 ; void pred16x16_128_dc(pixel *src, int stride)
1199 ;-----------------------------------------------------------------------------
1200 %macro PRED16x16_128_DC 0
1201 cglobal pred16x16_128_dc_10, 2,3
1205 MOV16 r0+r1*0, m0, m0, m0, m0
1206 MOV16 r0+r1*1, m0, m0, m0, m0