1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-lane weights -3..4; multiplied against in pred8x8_plane_10
37 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023 in all 8 word lanes (largest 10-bit value)
38 pw_512: times 8 dw 512 ; 512 in all 8 word lanes; loaded by pred8x8l_128_dc_10
44 ; PRED4x4_LOWPASS dest, left, right, src
45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; i.e. a rounded 1-2-1 filter, where %2 = left, %3 = right and %4 = src.
; NOTE(review): presumably left = t[n-1], src = t[n], right = t[n+1]; the macro
; body is not visible here, so confirm the mapping against it.
46 %macro PRED4x4_LOWPASS 4
52 ;-----------------------------------------------------------------------------
53 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
; The visible lines gather neighbouring pixels into registers with
; movhps/PALIGNR and then smooth them with PRED4x4_LOWPASS.
; NOTE(review): pointer setup, stores and the return are not visible here.
54 ;-----------------------------------------------------------------------------
56 cglobal pred4x4_down_right_10, 3, 3
60 movhps m2, [r0+r2*1-8] ; high qword of m2 = the 8 bytes ending just before [r0+r2]
65 PALIGNR m3, m1, 10, m1
66 movhps m4, [r1+r2*1-8] ; same kind of load, relative to [r1+r2]
67 PALIGNR m0, m3, m4, 14, m4
68 movhps m4, [r1+r2*2-8] ; ... and relative to [r1+r2*2]
69 PALIGNR m2, m0, m4, 14, m4
70 PRED4x4_LOWPASS m0, m2, m3, m0 ; dest = src = m0, left = m2, right = m3
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
; NOTE(review): presumably one instantiation follows each %define; not visible here.
82 %define PALIGNR PALIGNR_MMX
85 %define PALIGNR PALIGNR_SSSE3
92 ;-----------------------------------------------------------------------------
93 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
; Builds the edge vector t3..t0, lt, l0..l2 one pixel at a time (see the lane
; diagrams on the PALIGNR lines) and low-pass filters it.
; NOTE(review): the loads into m1 for "lt", the stores and the return are not
; visible here.
94 ;-----------------------------------------------------------------------------
96 cglobal pred4x4_vertical_right_10, 3, 3, 6
99 movq m5, [r0] ; ........t3t2t1t0
101 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
103 movhps m1, [r0+r2*1-8]
104 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
105 movhps m2, [r0+r2*2-8]
106 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
107 movhps m3, [r1+r2*1-8]
108 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
109 PRED4x4_LOWPASS m1, m0, m2, m1 ; dest = src = m1, left = m0, right = m2
114 PALIGNR m5, m0, 14, m2
117 PALIGNR m1, m0, 14, m0
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
123 %define PALIGNR PALIGNR_MMX
126 %define PALIGNR PALIGNR_SSSE3
128 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
133 ;-----------------------------------------------------------------------------
134 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
; Packs t2 t1 t0 lt l0 l1 l2 l3 into m1 (see lane diagrams), derives two
; byte-shifted copies, and low-pass filters them.
; NOTE(review): the loads into m3, the write of m5, the stores and the return
; are not visible here.
135 ;-----------------------------------------------------------------------------
137 cglobal pred4x4_horizontal_down_10, 3, 3
140 movq m0, [r0-8] ; lt ..
142 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
143 movq m1, [r1+r2*2-8] ; l3
145 punpcklwd m1, m3 ; l2 l3
146 movq m2, [r0+r2*2-8] ; l1
148 punpcklwd m2, m3 ; l0 l1
149 punpckhdq m1, m2 ; l0 l1 l2 l3
150 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
151 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
152 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
154 PRED4x4_LOWPASS m3, m1, m0, m3 ; dest = src = m3, left = m1, right = m0
157 PALIGNR m3, m5, 12, m4
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
167 %define PALIGNR PALIGNR_MMX
170 %define PALIGNR PALIGNR_SSSE3
172 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
177 ;-----------------------------------------------------------------------------
178 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
; The visible lines accumulate words from three more rows into m2.
; NOTE(review): the initial load of m2, the reduction, the stores and the
; return are not visible here.
179 ;-----------------------------------------------------------------------------
; HADDD takes two arguments, named "sum" and "junk" by the original comment.
; NOTE(review): presumably %1 receives a horizontal dword sum and %2 is a
; scratch register; the macro body is not visible here.
180 %macro HADDD 2 ; sum junk
198 cglobal pred4x4_dc_10, 3, 3
202 paddw m2, [r0+r2*2-8] ; add the words stored just before [r0+r2*2]
203 paddw m2, [r1+r2*1-8] ; ... just before [r1+r2]
204 paddw m2, [r1+r2*2-8] ; ... just before [r1+r2*2]
218 ;-----------------------------------------------------------------------------
219 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
; NOTE(review): only the final shuffle and the low-pass filter are visible here.
220 ;-----------------------------------------------------------------------------
222 cglobal pred4x4_down_left_10, 3, 3
228 pshufhw m2, m2, 10100100b ; high words become {w4,w5,w6,w6}: the top lane repeats w6
229 PRED4x4_LOWPASS m0, m3, m2, m0 ; dest = src = m0, left = m3, right = m2
243 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
248 ;-----------------------------------------------------------------------------
249 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
; NOTE(review): only the low-pass filter step is visible here.
250 ;-----------------------------------------------------------------------------
252 cglobal pred4x4_vertical_left_10, 3, 3
259 PRED4x4_LOWPASS m0, m1, m2, m0 ; dest = src = m0, left = m1, right = m2
272 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
277 ;-----------------------------------------------------------------------------
278 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
; Interleaves words from the 8 bytes preceding several rows, then filters
; shuffled copies of the result.
; NOTE(review): the initial loads of m0/m1, the stores and the return are not
; visible here.
279 ;-----------------------------------------------------------------------------
281 cglobal pred4x4_horizontal_up_10, 3, 3
285 punpckhwd m0, [r0+r2*2-8] ; interleave m0's upper words with those before [r0+r2*2]
287 punpckhwd m1, [r1+r2*2-8] ; same for m1, with the words before [r1+r2*2]
292 pshufw m2, m0, 11111001b ; m2 = {w1,w2,w3,w3} of m0
296 pshufw m5, m0, 11111110b ; m5 = {w2,w3,w3,w3} of m0
297 PRED4x4_LOWPASS m1, m0, m5, m1 ; dest = src = m1, left = m0, right = m5
311 ;-----------------------------------------------------------------------------
312 ; void pred8x8_vertical(pixel *src, int stride)
; NOTE(review): only the entry point is visible here; the body is not shown.
313 ;-----------------------------------------------------------------------------
315 cglobal pred8x8_vertical_10, 2, 2
327 ;-----------------------------------------------------------------------------
328 ; void pred8x8_horizontal(pixel *src, int stride)
; NOTE(review): only the entry point is visible here; the body is not shown.
329 ;-----------------------------------------------------------------------------
331 cglobal pred8x8_horizontal_10, 2, 3
347 ;-----------------------------------------------------------------------------
348 ; void pred8x8_dc(pixel *src, int stride)
; Reads the word just before each of eight row addresses with scalar loads,
; then combines the partial sums s0..s3 (see the lane comments below).
; NOTE(review): the summing instructions between the loads, the stores and the
; return are not visible here.
349 ;-----------------------------------------------------------------------------
351 ; sort of a hack, but it works
361 cglobal pred8x8_dc_10, 2, 6
371 pshufw m2, m0, 00001110b ; m2 = {w2,w3,w0,w0} of m0
372 pshufw m3, m1, 00001110b ; m3 = {w2,w3,w0,w0} of m1
382 movzx r2d, word [r0+r1*1-2] ; zero-extended 16-bit load of the word just before [r0+r1]
383 movzx r3d, word [r0+r1*2-2]
385 movzx r3d, word [r0+r5*1-2]
387 movzx r3d, word [r4-2]
391 movzx r2d, word [r4+r1*1-2]
392 movzx r3d, word [r4+r1*2-2]
394 movzx r3d, word [r4+r5*1-2]
396 movzx r3d, word [r4+r1*4-2]
401 punpckldq m0, m2 ; s0, s1, s2, s3
; %1 is a macro parameter used as the shuffle mnemonic, so these lines sit
; inside a macro whose header is not visible here.
402 %1 m3, m0, 11110110b ; s2, s1, s3, s3
403 %1 m0, m0, 01110100b ; s0, s1, s3, s1
406 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
409 pshufd m3, m0, 11111010b ; m3 = dwords {d2,d2,d3,d3} of m0
434 ;-----------------------------------------------------------------------------
435 ; void pred8x8_top_dc(pixel *src, int stride)
; NOTE(review): only the entry point is visible here; the body is not shown.
436 ;-----------------------------------------------------------------------------
438 cglobal pred8x8_top_dc_10, 2, 4
461 ;-----------------------------------------------------------------------------
462 ; void pred8x8_plane(pixel *src, int stride)
; Weights the top row with pw_m32101234, and forms differences of left-column
; pixels using the scalar loads below (the src[...] comments give each
; address relative to src).
; NOTE(review): pointer setup, the scalar subtract/accumulate steps, the
; clipping, the stores and the return are not visible here.
463 ;-----------------------------------------------------------------------------
465 cglobal pred8x8_plane_10, 2, 7, 7
470 pmaddwd m2, [pw_m32101234] ; multiply by the weights -3..4 and add adjacent products
478 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
479 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
480 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
482 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
483 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
486 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
487 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
491 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
492 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
501 mova m3, [pw_pixel_max]
506 pmullw m2, [pw_m32101234] ; b
; NOTE(review): pw_m3 is not defined in the visible lines; presumably it is
; declared alongside the other pw_* constants.
507 pmullw m5, m4, [pw_m3] ; c
524 ;-----------------------------------------------------------------------------
525 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
; Loads the constant 512 (1 << (BIT_DEPTH-1) per the comment below) into m0.
; NOTE(review): the stores, the return and %endmacro are not visible here.
526 ;-----------------------------------------------------------------------------
527 %macro PRED8x8L_128_DC 0
528 cglobal pred8x8l_128_dc_10, 4, 4
529 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
548 ;-----------------------------------------------------------------------------
549 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
; Patches the two end lanes of the neighbour vectors from memory, then
; low-pass filters the row held in m0.
; NOTE(review): the loads of m0/m1/m2, the computation of r1/r2 as offsets,
; the summation, the stores and the return are not visible here.
550 ;-----------------------------------------------------------------------------
551 %macro PRED8x8L_TOP_DC 0
552 cglobal pred8x8l_top_dc_10, 4, 4, 6
560 pinsrw m1, [r0+r1], 0 ; replace word lane 0 of m1 with the word at [r0+r1]
561 pinsrw m2, [r0+r2+14], 7 ; replace word lane 7 of m2 with the word at [r0+r2+14]
564 PRED4x4_LOWPASS m0, m2, m1, m0 ; dest = src = m0, left = m2, right = m1
582 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
587 ;-----------------------------------------------------------------------------
588 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
; Gathers words from the 16 bytes preceding eight row addresses by pairwise
; interleaving, patches end lanes of the neighbour vectors, and low-pass
; filters two vectors (m3 and m0).
; NOTE(review): the setup of r4/r5, the merging of the interleaved pairs, the
; summation, the stores and the return are not visible here.
589 ;-----------------------------------------------------------------------------
590 ;TODO: see if scalar is faster
592 cglobal pred8x8l_dc_10, 4, 6, 6
596 mova m0, [r0+r3*2-16] ; 16 bytes ending just before [r0+r3*2]
597 punpckhwd m0, [r0+r3*1-16] ; interleave upper four words with those before [r0+r3]
598 mova m1, [r4+r3*0-16]
599 punpckhwd m1, [r0+r5*1-16]
601 mova m2, [r4+r3*2-16]
602 punpckhwd m2, [r4+r3*1-16]
603 mova m3, [r4+r3*4-16]
604 punpckhwd m3, [r4+r5*1-16]
613 pinsrw m1, [r0+r1], 0 ; replace word lane 0 of m1 with the word at [r0+r1]
614 pinsrw m2, [r0+r2+14], 7 ; replace word lane 7 of m2 with the word at [r0+r2+14]
619 pshuflw m4, m4, 11100101b ; low words become {w1,w1,w2,w3}: lane 0 repeats w1
620 pinsrw m5, [r0+r1-2], 7 ; replace word lane 7 of m5 with the word at [r0+r1-2]
621 PRED4x4_LOWPASS m3, m4, m5, m3 ; dest = src = m3, left = m4, right = m5
622 PRED4x4_LOWPASS m0, m2, m1, m0 ; dest = src = m0, left = m2, right = m1
641 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
646 ;-----------------------------------------------------------------------------
647 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
; Patches the two end lanes of the neighbour vectors from memory, then
; low-pass filters the row held in m0.
; NOTE(review): the loads of m0/m1/m2, the computation of r1/r2 as offsets,
; the stores and the return are not visible here.
648 ;-----------------------------------------------------------------------------
649 %macro PRED8x8L_VERTICAL 0
650 cglobal pred8x8l_vertical_10, 4, 4, 6
658 pinsrw m1, [r0+r1], 0 ; replace word lane 0 of m1 with the word at [r0+r1]
659 pinsrw m2, [r0+r2+14], 7 ; replace word lane 7 of m2 with the word at [r0+r2+14]
662 PRED4x4_LOWPASS m0, m2, m1, m0 ; dest = src = m0, left = m2, right = m1
676 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
681 ;-----------------------------------------------------------------------------
682 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
; Gathers words from the 16 bytes preceding several row addresses by pairwise
; interleaving, then low-pass filters the gathered vector.
; NOTE(review): the setup of r1/r2, the first load of m0, the merging of the
; interleaved pairs, the stores and the return are not visible here.
683 ;-----------------------------------------------------------------------------
684 %macro PRED8x8L_HORIZONTAL 0
685 cglobal pred8x8l_horizontal_10, 4, 4, 5
691 punpckhwd m0, [r0+r1-16] ; interleave m0's upper words with those before [r0+r1]
692 mova m1, [r0+r3*2-16] ; 16 bytes ending just before [r0+r3*2]
693 punpckhwd m1, [r0+r3*1-16]
697 mova m2, [r2+r3*0-16]
698 punpckhwd m2, [r0+r1-16]
699 mova m3, [r2+r3*2-16]
700 punpckhwd m3, [r2+r3*1-16]
703 PALIGNR m4, m3, [r2+r1-16], 14, m0
705 pshuflw m0, m0, 11100101b ; low words become {w1,w1,w2,w3}: lane 0 repeats w1
706 PRED4x4_LOWPASS m4, m3, m0, m4 ; dest = src = m4, left = m3, right = m0
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
729 %define PALIGNR PALIGNR_MMX
732 %define PALIGNR PALIGNR_SSSE3
734 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
739 ;-----------------------------------------------------------------------------
740 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
; Filters the top edge, branches to .fix_tr on the zero flag, filters the
; extended edge, and then repeatedly shifts the m1:m6 pair with PALIGNR.
; NOTE(review): the edge loads, the .fix_tr path, the instructions between the
; repeated PALIGNR steps (presumably row stores) and the return are not
; visible here.
741 ;-----------------------------------------------------------------------------
742 %macro PRED8x8L_DOWN_LEFT 0
743 cglobal pred8x8l_down_left_10, 4, 4, 7
751 pinsrw m1, [r0+r1], 0 ; replace word lane 0 of m1 with the word at [r0+r1]
752 pinsrw m2, [r0+r2+14], 7 ; replace word lane 7 of m2 with the word at [r0+r2+14]
753 PRED4x4_LOWPASS m6, m2, m1, m3 ; dest = m6, left = m2, right = m1, src = m3
754 jz .fix_tr ; flags from shr r2d
757 PALIGNR m2, m1, m3, 14, m3
758 pshufhw m5, m5, 10100100b ; high words become {w4,w5,w6,w6}: the top lane repeats w6
759 PRED4x4_LOWPASS m1, m2, m5, m1 ; dest = src = m1, left = m2, right = m5
764 PALIGNR m2, m1, m6, 2, m0
765 PALIGNR m3, m1, m6, 14, m0
766 PALIGNR m5, m1, 2, m0
768 PRED4x4_LOWPASS m6, m4, m2, m6 ; dest = src = m6, left = m4, right = m2
769 PRED4x4_LOWPASS m1, m3, m5, m1 ; dest = src = m1, left = m3, right = m5
; Seven identical shift steps follow; the last one uses m6 as its temporary.
771 PALIGNR m1, m6, 14, m2
774 PALIGNR m1, m6, 14, m2
777 PALIGNR m1, m6, 14, m2
780 PALIGNR m1, m6, 14, m2
783 PALIGNR m1, m6, 14, m2
786 PALIGNR m1, m6, 14, m2
789 PALIGNR m1, m6, 14, m6
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
799 %define PALIGNR PALIGNR_MMX
802 %define PALIGNR PALIGNR_SSSE3
804 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
809 ;-----------------------------------------------------------------------------
810 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
; Gathers words from the 16 bytes preceding several row addresses, filters
; the gathered and top edges, and then repeatedly shifts the m3:m6 pair with
; PALIGNR.
; NOTE(review): the setup of r1/r4, the merging of interleaved pairs, the
; loads of m7 and the top row, the instructions between the repeated PALIGNR
; steps (presumably row stores) and the return are not visible here.
811 ;-----------------------------------------------------------------------------
812 %macro PRED8x8L_DOWN_RIGHT 0
813 ; standard forbids this when has_topleft is false
815 cglobal pred8x8l_down_right_10, 4, 5, 8
819 mova m0, [r0+r3*1-16] ; 16 bytes ending just before [r0+r3]
820 punpckhwd m0, [r0+r3*0-16] ; interleave upper four words with those before [r0]
821 mova m1, [r0+r1*1-16]
822 punpckhwd m1, [r0+r3*2-16]
824 mova m2, [r4+r3*1-16]
825 punpckhwd m2, [r4+r3*0-16]
826 mova m3, [r4+r1*1-16]
827 punpckhwd m3, [r4+r3*2-16]
830 mova m0, [r4+r3*4-16]
832 PALIGNR m4, m3, m0, 14, m0
833 PALIGNR m1, m3, 2, m2
835 pshuflw m0, m0, 11100101b ; low words become {w1,w1,w2,w3}: lane 0 repeats w1
836 PRED4x4_LOWPASS m6, m1, m4, m3 ; dest = m6, left = m1, right = m4, src = m3
837 PRED4x4_LOWPASS m4, m3, m0, m4 ; dest = src = m4, left = m3, right = m0
843 pinsrw m2, [r0+r2+14], 7 ; replace word lane 7 of m2 with the word at [r0+r2+14]
844 PRED4x4_LOWPASS m3, m2, m1, m3 ; dest = src = m3, left = m2, right = m1
845 PALIGNR m2, m3, m6, 2, m0
846 PALIGNR m5, m3, m6, 14, m0
848 PRED4x4_LOWPASS m6, m4, m2, m6 ; dest = src = m6, left = m4, right = m2
849 PRED4x4_LOWPASS m3, m5, m7, m3 ; dest = src = m3, left = m5, right = m7
; Seven identical shift steps follow; the last one uses m6 as its temporary.
851 PALIGNR m3, m6, 14, m2
854 PALIGNR m3, m6, 14, m2
857 PALIGNR m3, m6, 14, m2
860 PALIGNR m3, m6, 14, m2
863 PALIGNR m3, m6, 14, m2
866 PALIGNR m3, m6, 14, m2
869 PALIGNR m3, m6, 14, m6
; PALIGNR is rebound to the MMX and then the SSSE3 implementation.
875 %define PALIGNR PALIGNR_MMX
878 %define PALIGNR PALIGNR_SSSE3
880 %if HAVE_AVX_EXTERNAL ; build-configuration guard; guarded lines and %endif not visible here
885 ;-----------------------------------------------------------------------------
886 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
; Gathers words from the 16 bytes preceding several row addresses, filters
; the gathered and top edges, and then alternates PALIGNR shift steps on m2
; and m0, both fed from m1.
; NOTE(review): the setup of r1/r4, the merging of interleaved pairs, the top
; row loads, the instructions between the PALIGNR steps (presumably row
; stores) and the return are not visible here.
887 ;-----------------------------------------------------------------------------
888 %macro PRED8x8L_VERTICAL_RIGHT 0
889 ; as with 8x8l_down_right, the standard forbids this when has_topleft is false
890 cglobal pred8x8l_vertical_right_10, 4, 5, 7
894 mova m0, [r0+r3*1-16] ; 16 bytes ending just before [r0+r3]
895 punpckhwd m0, [r0+r3*0-16] ; interleave upper four words with those before [r0]
896 mova m1, [r0+r1*1-16]
897 punpckhwd m1, [r0+r3*2-16]
899 mova m2, [r4+r3*1-16]
900 punpckhwd m2, [r4+r3*0-16]
901 mova m3, [r4+r1*1-16]
902 punpckhwd m3, [r4+r3*2-16]
905 mova m0, [r4+r3*4-16]
907 PALIGNR m4, m3, m0, 14, m0
908 PALIGNR m1, m3, 2, m2
909 PRED4x4_LOWPASS m3, m1, m4, m3 ; dest = src = m3, left = m1, right = m4
915 pinsrw m5, [r0+r2+14], 7 ; replace word lane 7 of m5 with the word at [r0+r2+14]
916 PRED4x4_LOWPASS m2, m5, m1, m2 ; dest = src = m2, left = m5, right = m1
917 PALIGNR m6, m2, m3, 12, m1
918 PALIGNR m5, m2, m3, 14, m0
919 PRED4x4_LOWPASS m0, m6, m2, m5 ; dest = m0, left = m6, right = m2, src = m5
925 PRED4x4_LOWPASS m1, m3, m6, m1 ; dest = src = m1, left = m3, right = m6
; Alternating shift steps on m2 and m0; the last one uses m1 as its temporary.
926 PALIGNR m2, m1, 14, m4
929 PALIGNR m0, m1, 14, m3
932 PALIGNR m2, m1, 14, m4
935 PALIGNR m0, m1, 14, m3
938 PALIGNR m2, m1, 14, m4
941 PALIGNR m0, m1, 14, m1
; The macro is expanded once per PALIGNR binding, and once more under the
; HAVE_AVX_EXTERNAL guard.
; NOTE(review): the INIT_* lines selecting each target and the %endif are not
; visible here.
947 %define PALIGNR PALIGNR_MMX
948 PRED8x8L_VERTICAL_RIGHT
950 %define PALIGNR PALIGNR_SSSE3
951 PRED8x8L_VERTICAL_RIGHT
952 %if HAVE_AVX_EXTERNAL
954 PRED8x8L_VERTICAL_RIGHT
957 ;-----------------------------------------------------------------------------
958 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
; Gathers words from the 16 bytes preceding several row addresses, filters
; the gathered vector twice against shuffled copies, and derives the output
; rows via dword shuffles and PALIGNR shifts of the m5:m4 pair.
; NOTE(review): the setup of r1/r2, the merging of interleaved pairs, the
; writes of m4/m5, the stores and the return are not visible here.
959 ;-----------------------------------------------------------------------------
960 %macro PRED8x8L_HORIZONTAL_UP 0
961 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
962 mova m0, [r0+r3*0-16] ; 16 bytes ending just before [r0]
963 punpckhwd m0, [r0+r3*1-16] ; interleave upper four words with those before [r0+r3]
968 mova m4, [r0+r1*1-16]
971 mova m1, [r0+r3*2-16]
972 punpckhwd m1, [r0+r1*1-16]
974 mova m2, [r2+r3*0-16]
975 punpckhwd m2, [r2+r3*1-16]
976 mova m3, [r2+r3*2-16]
977 punpckhwd m3, [r2+r1*1-16]
980 PALIGNR m1, m0, m4, 14, m4
982 pshufhw m2, m2, 10100100b ; high words become {w4,w5,w6,w6}: the top lane repeats w6
983 PRED4x4_LOWPASS m0, m1, m2, m0 ; dest = src = m0, left = m1, right = m2
986 pshufhw m1, m1, 10100100b ; high words become {w4,w5,w6,w6}
987 pshufhw m2, m2, 01010100b ; high words become {w4,w5,w5,w5}
989 PRED4x4_LOWPASS m1, m2, m0, m1 ; dest = src = m1, left = m2, right = m0
994 pshufd m0, m5, 11111001b ; m0 = dwords {d1,d2,d3,d3} of m5
995 pshufd m1, m5, 11111110b ; m1 = dwords {d2,d3,d3,d3} of m5
996 pshufd m2, m5, 11111111b ; m2 = dword d3 of m5 in all four lanes
1000 PALIGNR m2, m5, m4, 4, m0
1001 PALIGNR m3, m5, m4, 8, m1
1002 PALIGNR m5, m5, m4, 12, m4
; The macro is expanded once per PALIGNR binding, and once more under the
; HAVE_AVX_EXTERNAL guard.
; NOTE(review): the INIT_* lines selecting each target and the %endif are not
; visible here.
1010 %define PALIGNR PALIGNR_MMX
1011 PRED8x8L_HORIZONTAL_UP
1013 %define PALIGNR PALIGNR_SSSE3
1014 PRED8x8L_HORIZONTAL_UP
1015 %if HAVE_AVX_EXTERNAL
1017 PRED8x8L_HORIZONTAL_UP
1021 ;-----------------------------------------------------------------------------
1022 ; void pred16x16_vertical(pixel *src, int stride)
; Loads registers from [r0] onward and writes them to successive rows through
; MOV16.
; NOTE(review): the other loads, the loop and the return are not visible here.
1023 ;-----------------------------------------------------------------------------
; This store sits inside a macro whose header is not visible here: it writes
; argument %3 at [%1+mmsize].
; NOTE(review): presumably this is the MOV16 macro used below.
1026 mova [%1+mmsize], %3
1033 %macro PRED16x16_VERTICAL 0
1034 cglobal pred16x16_vertical_10, 2, 3
1038 mova m1, [r0+mmsize] ; second register-width chunk at [r0]
1044 MOV16 r0+r1*1, m0, m1, m2, m3 ; destination r0+r1, four source registers
1045 MOV16 r0+r1*2, m0, m1, m2, m3 ; destination r0+r1*2, same four registers
1057 ;-----------------------------------------------------------------------------
1058 ; void pred16x16_horizontal(pixel *src, int stride)
; For two rows at a time, loads the 4 bytes preceding the row address and
; writes one register to the whole row through MOV16.
; NOTE(review): the broadcast between load and store, the loop and the return
; are not visible here.
1059 ;-----------------------------------------------------------------------------
1060 %macro PRED16x16_HORIZONTAL 0
1061 cglobal pred16x16_horizontal_10, 2, 3
1064 movd m0, [r0+r1*0-4] ; the 4 bytes just before [r0]
1065 movd m1, [r0+r1*1-4] ; the 4 bytes just before [r0+r1]
1068 MOV16 r0+r1*0, m0, m0, m0, m0 ; same register passed for all four MOV16 sources
1069 MOV16 r0+r1*1, m1, m1, m1, m1
; The macro is expanded twice.
; NOTE(review): the INIT_* lines selecting each target are not visible here.
1077 PRED16x16_HORIZONTAL
1079 PRED16x16_HORIZONTAL
1081 ;-----------------------------------------------------------------------------
1082 ; void pred16x16_dc(pixel *src, int stride)
; Adds register-width chunks at [r0], reads single words with scalar loads,
; and writes m0 to two rows at a time through MOV16.
; NOTE(review): pointer adjustment, the accumulation and averaging, the loops
; and the return are not visible here.
1083 ;-----------------------------------------------------------------------------
1084 %macro PRED16x16_DC 0
1085 cglobal pred16x16_dc_10, 2, 6
1089 paddw m0, [r0+mmsize] ; add the second register-width chunk at [r0]
1097 movzx r3d, word [r0] ; zero-extended 16-bit load at [r0]
1098 movzx r4d, word [r0+r1] ; ... and at [r0+r1]
1101 movzx r2d, word [r0]
1103 movzx r2d, word [r0+r1]
1114 MOV16 r5+r1*0, m0, m0, m0, m0 ; m0 passed for all four MOV16 sources
1115 MOV16 r5+r1*1, m0, m0, m0, m0
1127 ;-----------------------------------------------------------------------------
1128 ; void pred16x16_top_dc(pixel *src, int stride)
; Adds register-width chunks at [r0] and writes m0 to two rows at a time
; through MOV16.
; NOTE(review): pointer adjustment, the reduction and averaging, the loop and
; the return are not visible here.
1129 ;-----------------------------------------------------------------------------
1130 %macro PRED16x16_TOP_DC 0
1131 cglobal pred16x16_top_dc_10, 2, 3
1134 paddw m0, [r0+mmsize] ; add the second register-width chunk at [r0]
1146 MOV16 r0+r1*1, m0, m0, m0, m0 ; m0 passed for all four MOV16 sources
1147 MOV16 r0+r1*2, m0, m0, m0, m0
1159 ;-----------------------------------------------------------------------------
1160 ; void pred16x16_left_dc(pixel *src, int stride)
; Reads single words with scalar loads and writes m0 to two rows at a time
; through MOV16.
; NOTE(review): pointer adjustment, the accumulation and averaging, the loops
; and the return are not visible here.
1161 ;-----------------------------------------------------------------------------
1162 %macro PRED16x16_LEFT_DC 0
1163 cglobal pred16x16_left_dc_10, 2, 6
1167 movzx r3d, word [r0] ; zero-extended 16-bit load at [r0]
1168 movzx r4d, word [r0+r1] ; ... and at [r0+r1]
1171 movzx r2d, word [r0]
1173 movzx r2d, word [r0+r1]
1183 MOV16 r5+r1*0, m0, m0, m0, m0 ; m0 passed for all four MOV16 sources
1184 MOV16 r5+r1*1, m0, m0, m0, m0
1196 ;-----------------------------------------------------------------------------
1197 ; void pred16x16_128_dc(pixel *src, int stride)
; Writes m0 to two rows at a time through MOV16.
; NOTE(review): the load of m0, the loop and the return are not visible here.
1198 ;-----------------------------------------------------------------------------
1199 %macro PRED16x16_128_DC 0
1200 cglobal pred16x16_128_dc_10, 2,3
1204 MOV16 r0+r1*0, m0, m0, m0, m0 ; m0 passed for all four MOV16 sources
1205 MOV16 r0+r1*1, m0, m0, m0, m0