1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Word weights -3..4; pred8x8_plane_10 uses them as the pmaddwd / pmullw multiplier.
35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
; Eight words of 1023, the largest value representable in 10 bits.
37 pw_pixel_max: times 8 dw ((1 << 10)-1)
; Eight words of 512; pred8x8l_128_dc_10 loads this as (1<<(BIT_DEPTH-1)).
38 pw_512: times 8 dw 512
44 ; PRED4x4_LOWPASS dest, left, right, src -- (1,2,1) smoothing filter with rounding
45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; presumably %2 = t[n-1] (left), %3 = t[n+1] (right), %4 = t[n] (src) -- confirm against the macro body
46 %macro PRED4x4_LOWPASS 4
52 ;-----------------------------------------------------------------------------
53 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
; The words immediately preceding several row addresses are shifted into one
; vector with movhps + PALIGNR, which is then smoothed by PRED4x4_LOWPASS.
; NOTE(review): r0/r1 act as row base pointers with r2 as the row step; confirm
; how r0 and r1 are derived from src before mapping addresses to row numbers.
54 ;-----------------------------------------------------------------------------
56 cglobal pred4x4_down_right_10, 3, 3
60 movhps m2, [r0+r2*1-8] ; high half of m2 = 8 bytes ending at r0+r2
65 PALIGNR m3, m1, 10, m1 ; m3 = (m3:m1) >> 10 bytes
66 movhps m4, [r1+r2*1-8] ; high half of m4 = 8 bytes ending at r1+r2
67 PALIGNR m0, m3, m4, 14, m4 ; m0 = (m3:m4) >> 14 bytes
68 movhps m4, [r1+r2*2-8] ; high half of m4 = 8 bytes ending at r1+r2*2
69 PALIGNR m2, m0, m4, 14, m4 ; m2 = (m0:m4) >> 14 bytes
70 PRED4x4_LOWPASS m0, m2, m3, m0
88 ;-----------------------------------------------------------------------------
89 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments, 3
; general-purpose registers and 6 vector registers (x86inc convention).
; The neighbour vector t3 t2 t1 t0 lt l0 l1 l2 is grown one word at a time
; (see the layout comments) and then smoothed with PRED4x4_LOWPASS.
90 ;-----------------------------------------------------------------------------
92 cglobal pred4x4_vertical_right_10, 3, 3, 6
95 movq m5, [r0] ; ........t3t2t1t0
97 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
99 movhps m1, [r0+r2*1-8] ; top word of the load supplies l0
100 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
101 movhps m2, [r0+r2*2-8] ; top word of the load supplies l1
102 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
103 movhps m3, [r1+r2*1-8] ; top word of the load supplies l2
104 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
105 PRED4x4_LOWPASS m1, m0, m2, m1
110 PALIGNR m5, m0, 14, m2 ; m5 = (m5:m0) >> 14 bytes
113 PALIGNR m1, m0, 14, m0 ; m1 = (m1:m0) >> 14 bytes
125 ;-----------------------------------------------------------------------------
126 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
; Assembles t2 t1 t0 lt l0 l1 l2 l3 in m1 (see the layout comments), then feeds
; m1 and two byte-shifted copies of it to PRED4x4_LOWPASS.
127 ;-----------------------------------------------------------------------------
129 cglobal pred4x4_horizontal_down_10, 3, 3
132 movq m0, [r0-8] ; lt ..
134 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
135 movq m1, [r1+r2*2-8] ; l3
137 punpcklwd m1, m3 ; l2 l3
138 movq m2, [r0+r2*2-8] ; l1
140 punpcklwd m2, m3 ; l0 l1
141 punpckhdq m1, m2 ; l0 l1 l2 l3
142 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
143 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
144 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
146 PRED4x4_LOWPASS m3, m1, m0, m3
149 PALIGNR m3, m5, 12, m4 ; m3 = (m3:m5) >> 12 bytes
165 ;-----------------------------------------------------------------------------
166 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
167 ;-----------------------------------------------------------------------------
; HADDD takes two operands: the register holding the sum and a scratch ("junk") register.
168 %macro HADDD 2 ; sum junk
186 cglobal pred4x4_dc_10, 3, 3
; Word-wise accumulation into m2 of the packed words that end at three row addresses.
190 paddw m2, [r0+r2*2-8]
191 paddw m2, [r1+r2*1-8]
192 paddw m2, [r1+r2*2-8]
206 ;-----------------------------------------------------------------------------
207 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
208 ;-----------------------------------------------------------------------------
210 cglobal pred4x4_down_left_10, 3, 3
216 pshufhw m2, m2, 10100100b ; high words become 4,5,6,6: top word replaced by a copy of word 6
217 PRED4x4_LOWPASS m0, m3, m2, m0
234 ;-----------------------------------------------------------------------------
235 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
236 ;-----------------------------------------------------------------------------
238 cglobal pred4x4_vertical_left_10, 3, 3
245 PRED4x4_LOWPASS m0, m1, m2, m0 ; filtered result in m0; m0 is also the centre-tap input
261 ;-----------------------------------------------------------------------------
262 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 3 arguments and 3
; general-purpose registers (x86inc convention).
263 ;-----------------------------------------------------------------------------
265 cglobal pred4x4_horizontal_up_10, 3, 3
269 punpckhwd m0, [r0+r2*2-8] ; interleave high words of m0 with those of the memory operand
271 punpckhwd m1, [r1+r2*2-8] ; same interleave for m1
276 pshufw m2, m0, 11111001b ; m2 words = m0[1], m0[2], m0[3], m0[3]
280 pshufw m5, m0, 11111110b ; m5 words = m0[2], m0[3], m0[3], m0[3]
281 PRED4x4_LOWPASS m1, m0, m5, m1
295 ;-----------------------------------------------------------------------------
296 ; void pred8x8_vertical(pixel *src, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 2 arguments and 2
; general-purpose registers (x86inc convention).
297 ;-----------------------------------------------------------------------------
299 cglobal pred8x8_vertical_10, 2, 2
311 ;-----------------------------------------------------------------------------
312 ; void pred8x8_horizontal(pixel *src, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 2 arguments and 3
; general-purpose registers (x86inc convention), i.e. one scratch GPR beyond
; the arguments.
313 ;-----------------------------------------------------------------------------
315 cglobal pred8x8_horizontal_10, 2, 3
331 ;-----------------------------------------------------------------------------
332 ; void pred8x8_dc(pixel *src, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 2 arguments and 6
; general-purpose registers (x86inc convention).
; The s0..s3 names in the comments below are the four partial sums that get
; combined into the per-quadrant DC values.
333 ;-----------------------------------------------------------------------------
335 ; sort of a hack, but it works
345 cglobal pred8x8_dc_10, 2, 6
355 pshufw m2, m0, 00001110b ; m2 words = m0[2], m0[3], m0[0], m0[0]
356 pshufw m3, m1, 00001110b ; m3 words = m1[2], m1[3], m1[0], m1[0]
; Scalar loads: the 16-bit word located 2 bytes before each row address,
; zero-extended into r2d / r3d.
; NOTE(review): r4 and r5 are presumably a second row base and a stride
; multiple; confirm their setup.
366 movzx r2d, word [r0+r1*1-2]
367 movzx r3d, word [r0+r1*2-2]
369 movzx r3d, word [r0+r5*1-2]
371 movzx r3d, word [r4-2]
375 movzx r2d, word [r4+r1*1-2]
376 movzx r3d, word [r4+r1*2-2]
378 movzx r3d, word [r4+r5*1-2]
380 movzx r3d, word [r4+r1*4-2]
385 punpckldq m0, m2 ; s0, s1, s2, s3
; %1 is a macro argument used as the word-shuffle mnemonic.
386 %1 m3, m0, 11110110b ; s2, s1, s3, s3
387 %1 m0, m0, 01110100b ; s0, s1, s3, s1
390 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
393 pshufd m3, m0, 11111010b ; m3 dwords = m0[2], m0[2], m0[3], m0[3]
418 ;-----------------------------------------------------------------------------
419 ; void pred8x8_top_dc(pixel *src, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 2 arguments and 4
; general-purpose registers (x86inc convention).
420 ;-----------------------------------------------------------------------------
422 cglobal pred8x8_top_dc_10, 2, 4
445 ;-----------------------------------------------------------------------------
446 ; void pred8x8_plane(pixel *src, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 2 arguments, 7
; general-purpose registers and 7 vector registers (x86inc convention).
; The scalar section reads the left-column words named in the per-line comments
; (src[k*stride-1]); the vector section uses the -3..4 weight table.
447 ;-----------------------------------------------------------------------------
449 cglobal pred8x8_plane_10, 2, 7, 7
454 pmaddwd m2, [pw_m32101234] ; multiply words by the weights -3..4, adding adjacent products into dwords
462 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
463 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
464 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
466 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
467 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
470 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
471 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
475 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
476 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
485 mova m3, [pw_pixel_max] ; eight words of (1 << 10)-1; presumably the upper clamp bound -- confirm
490 pmullw m2, [pw_m32101234] ; b
; NOTE(review): pw_m3 is not among the constants visible at the top of the file; confirm its definition.
491 pmullw m5, m4, [pw_m3] ; c
508 ;-----------------------------------------------------------------------------
509 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments and 4 general-purpose registers.
510 ;-----------------------------------------------------------------------------
511 %macro PRED8x8L_128_DC 0
512 cglobal pred8x8l_128_dc_10, 4, 4
513 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
532 ;-----------------------------------------------------------------------------
533 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 4 general-purpose registers and 6
; vector registers (x86inc convention).
; NOTE(review): r1 / r2 (has_topleft / has_topright in the prototype) are used
; as byte offsets below; presumably earlier instructions convert the flags --
; confirm.
534 ;-----------------------------------------------------------------------------
535 %macro PRED8x8L_TOP_DC 0
536 cglobal pred8x8l_top_dc_10, 4, 4, 6
544 pinsrw m1, [r0+r1], 0 ; word 0 of m1 = word at r0+r1
545 pinsrw m2, [r0+r2+14], 7 ; word 7 of m2 = word at r0+r2+14
548 PRED4x4_LOWPASS m0, m2, m1, m0
569 ;-----------------------------------------------------------------------------
570 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
; 10-bit build (symbol suffix _10). cglobal declares 4 arguments, 6
; general-purpose registers and 6 vector registers (x86inc convention).
571 ;-----------------------------------------------------------------------------
572 ;TODO: see if scalar is faster
574 cglobal pred8x8l_dc_10, 4, 6, 6
; Load the 16 bytes ending at each of eight row addresses and interleave the
; loads pairwise with punpckhwd, so the last word of each load lands in the top
; dword of the result.
; NOTE(review): r4 / r5 are presumably a second row base and a stride multiple;
; confirm their setup.
578 mova m0, [r0+r3*2-16]
579 punpckhwd m0, [r0+r3*1-16]
580 mova m1, [r4+r3*0-16]
581 punpckhwd m1, [r0+r5*1-16]
583 mova m2, [r4+r3*2-16]
584 punpckhwd m2, [r4+r3*1-16]
585 mova m3, [r4+r3*4-16]
586 punpckhwd m3, [r4+r5*1-16]
595 pinsrw m1, [r0+r1], 0 ; word 0 of m1 = word at r0+r1
596 pinsrw m2, [r0+r2+14], 7 ; word 7 of m2 = word at r0+r2+14
601 pshuflw m4, m4, 11100101b ; low words become 1,1,2,3: word 0 replaced by a copy of word 1
602 pinsrw m5, [r0+r1-2], 7 ; word 7 of m5 = word at r0+r1-2
603 PRED4x4_LOWPASS m3, m4, m5, m3
604 PRED4x4_LOWPASS m0, m2, m1, m0
626 ;-----------------------------------------------------------------------------
627 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 4 general-purpose registers and 6
; vector registers (x86inc convention).
; NOTE(review): r1 / r2 (has_topleft / has_topright in the prototype) are used
; as byte offsets below; presumably earlier instructions convert the flags --
; confirm.
628 ;-----------------------------------------------------------------------------
629 %macro PRED8x8L_VERTICAL 0
630 cglobal pred8x8l_vertical_10, 4, 4, 6
638 pinsrw m1, [r0+r1], 0 ; word 0 of m1 = word at r0+r1
639 pinsrw m2, [r0+r2+14], 7 ; word 7 of m2 = word at r0+r2+14
642 PRED4x4_LOWPASS m0, m2, m1, m0
659 ;-----------------------------------------------------------------------------
660 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 4 general-purpose registers and 5
; vector registers (x86inc convention).
; NOTE(review): r1 and r2 are used as an address offset and a row base here
; rather than as the flag arguments of the prototype; confirm their setup.
661 ;-----------------------------------------------------------------------------
662 %macro PRED8x8L_HORIZONTAL 0
663 cglobal pred8x8l_horizontal_10, 4, 4, 5
; Each mova / punpckhwd pair interleaves the high words of the 16 bytes ending
; at two row addresses.
669 punpckhwd m0, [r0+r1-16]
670 mova m1, [r0+r3*2-16]
671 punpckhwd m1, [r0+r3*1-16]
675 mova m2, [r2+r3*0-16]
676 punpckhwd m2, [r0+r1-16]
677 mova m3, [r2+r3*2-16]
678 punpckhwd m3, [r2+r3*1-16]
681 PALIGNR m4, m3, [r2+r1-16], 14, m0 ; m4 = (m3:[r2+r1-16]) >> 14 bytes
683 pshuflw m0, m0, 11100101b ; low words become 1,1,2,3: word 0 replaced by a copy of word 1
684 PRED4x4_LOWPASS m4, m3, m0, m4
713 ;-----------------------------------------------------------------------------
714 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 4 general-purpose registers and 7
; vector registers (x86inc convention).
; NOTE(review): r1 / r2 (has_topleft / has_topright in the prototype) are used
; as byte offsets below; the jz comment indicates r2d has been shifted with
; shr -- confirm the conversion.
715 ;-----------------------------------------------------------------------------
716 %macro PRED8x8L_DOWN_LEFT 0
717 cglobal pred8x8l_down_left_10, 4, 4, 7
725 pinsrw m1, [r0+r1], 0 ; word 0 of m1 = word at r0+r1
726 pinsrw m2, [r0+r2+14], 7 ; word 7 of m2 = word at r0+r2+14
727 PRED4x4_LOWPASS m6, m2, m1, m3
728 jz .fix_tr ; flags from shr r2d
731 PALIGNR m2, m1, m3, 14, m3 ; m2 = (m1:m3) >> 14 bytes
732 pshufhw m5, m5, 10100100b ; high words become 4,5,6,6: top word replaced by a copy of word 6
733 PRED4x4_LOWPASS m1, m2, m5, m1
738 PALIGNR m2, m1, m6, 2, m0 ; m2 = (m1:m6) >> 2 bytes
739 PALIGNR m3, m1, m6, 14, m0 ; m3 = (m1:m6) >> 14 bytes
740 PALIGNR m5, m1, 2, m0 ; m5 = (m5:m1) >> 2 bytes
742 PRED4x4_LOWPASS m6, m4, m2, m6
743 PRED4x4_LOWPASS m1, m3, m5, m1
; Repeated step: m1 = (m1:m6) >> 14 bytes, i.e. m1 moves up one word and takes
; the top word of m6 as its new word 0. The final repetition uses m6 itself as
; the scratch operand.
745 PALIGNR m1, m6, 14, m2
748 PALIGNR m1, m6, 14, m2
751 PALIGNR m1, m6, 14, m2
754 PALIGNR m1, m6, 14, m2
757 PALIGNR m1, m6, 14, m2
760 PALIGNR m1, m6, 14, m2
763 PALIGNR m1, m6, 14, m6
779 ;-----------------------------------------------------------------------------
780 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 5 general-purpose registers and 8
; vector registers (x86inc convention).
781 ;-----------------------------------------------------------------------------
782 %macro PRED8x8L_DOWN_RIGHT 0
783 ; standard forbids this when has_topleft is false
785 cglobal pred8x8l_down_right_10, 4, 5, 8
; Each mova / punpckhwd pair interleaves the high words of the 16 bytes ending
; at two row addresses.
; NOTE(review): r1 and r4 are used as an address offset and a second row base;
; confirm their setup.
789 mova m0, [r0+r3*1-16]
790 punpckhwd m0, [r0+r3*0-16]
791 mova m1, [r0+r1*1-16]
792 punpckhwd m1, [r0+r3*2-16]
794 mova m2, [r4+r3*1-16]
795 punpckhwd m2, [r4+r3*0-16]
796 mova m3, [r4+r1*1-16]
797 punpckhwd m3, [r4+r3*2-16]
800 mova m0, [r4+r3*4-16]
802 PALIGNR m4, m3, m0, 14, m0 ; m4 = (m3:m0) >> 14 bytes
803 PALIGNR m1, m3, 2, m2 ; m1 = (m1:m3) >> 2 bytes
805 pshuflw m0, m0, 11100101b ; low words become 1,1,2,3: word 0 replaced by a copy of word 1
806 PRED4x4_LOWPASS m6, m1, m4, m3
807 PRED4x4_LOWPASS m4, m3, m0, m4
813 pinsrw m2, [r0+r2+14], 7 ; word 7 of m2 = word at r0+r2+14
814 PRED4x4_LOWPASS m3, m2, m1, m3
815 PALIGNR m2, m3, m6, 2, m0 ; m2 = (m3:m6) >> 2 bytes
816 PALIGNR m5, m3, m6, 14, m0 ; m5 = (m3:m6) >> 14 bytes
818 PRED4x4_LOWPASS m6, m4, m2, m6
819 PRED4x4_LOWPASS m3, m5, m7, m3
; Repeated step: m3 = (m3:m6) >> 14 bytes, i.e. m3 moves up one word and takes
; the top word of m6 as its new word 0. The final repetition uses m6 itself as
; the scratch operand.
821 PALIGNR m3, m6, 14, m2
824 PALIGNR m3, m6, 14, m2
827 PALIGNR m3, m6, 14, m2
830 PALIGNR m3, m6, 14, m2
833 PALIGNR m3, m6, 14, m2
836 PALIGNR m3, m6, 14, m2
839 PALIGNR m3, m6, 14, m6
851 ;-----------------------------------------------------------------------------
852 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 5 general-purpose registers and 7
; vector registers (x86inc convention).
853 ;-----------------------------------------------------------------------------
854 %macro PRED8x8L_VERTICAL_RIGHT 0
855 ; likewise with 8x8l_down_right
856 cglobal pred8x8l_vertical_right_10, 4, 5, 7
; Each mova / punpckhwd pair interleaves the high words of the 16 bytes ending
; at two row addresses.
; NOTE(review): r1 and r4 are used as an address offset and a second row base;
; confirm their setup.
860 mova m0, [r0+r3*1-16]
861 punpckhwd m0, [r0+r3*0-16]
862 mova m1, [r0+r1*1-16]
863 punpckhwd m1, [r0+r3*2-16]
865 mova m2, [r4+r3*1-16]
866 punpckhwd m2, [r4+r3*0-16]
867 mova m3, [r4+r1*1-16]
868 punpckhwd m3, [r4+r3*2-16]
871 mova m0, [r4+r3*4-16]
873 PALIGNR m4, m3, m0, 14, m0 ; m4 = (m3:m0) >> 14 bytes
874 PALIGNR m1, m3, 2, m2 ; m1 = (m1:m3) >> 2 bytes
875 PRED4x4_LOWPASS m3, m1, m4, m3
881 pinsrw m5, [r0+r2+14], 7 ; word 7 of m5 = word at r0+r2+14
882 PRED4x4_LOWPASS m2, m5, m1, m2
883 PALIGNR m6, m2, m3, 12, m1 ; m6 = (m2:m3) >> 12 bytes
884 PALIGNR m5, m2, m3, 14, m0 ; m5 = (m2:m3) >> 14 bytes
885 PRED4x4_LOWPASS m0, m6, m2, m5
891 PRED4x4_LOWPASS m1, m3, m6, m1
; Alternating steps: m2 and m0 are each moved up one word in turn, taking the
; top word of m1 as the new word 0 ((mX:m1) >> 14 bytes).
892 PALIGNR m2, m1, 14, m4
895 PALIGNR m0, m1, 14, m3
898 PALIGNR m2, m1, 14, m4
901 PALIGNR m0, m1, 14, m3
904 PALIGNR m2, m1, 14, m4
907 PALIGNR m0, m1, 14, m1
; The macro is expanded three times.
; NOTE(review): presumably once per instruction-set configuration; confirm the
; selection lines that precede each expansion.
913 PRED8x8L_VERTICAL_RIGHT
915 PRED8x8L_VERTICAL_RIGHT
917 PRED8x8L_VERTICAL_RIGHT
919 ;-----------------------------------------------------------------------------
920 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 4 arguments, 4 general-purpose registers and 6
; vector registers (x86inc convention).
; NOTE(review): r1 and r2 are used as an address offset and a row base here
; rather than as the flag arguments of the prototype; confirm their setup.
921 ;-----------------------------------------------------------------------------
922 %macro PRED8x8L_HORIZONTAL_UP 0
923 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
; Each mova / punpckhwd pair interleaves the high words of the 16 bytes ending
; at two row addresses.
924 mova m0, [r0+r3*0-16]
925 punpckhwd m0, [r0+r3*1-16]
930 mova m4, [r0+r1*1-16]
933 mova m1, [r0+r3*2-16]
934 punpckhwd m1, [r0+r1*1-16]
936 mova m2, [r2+r3*0-16]
937 punpckhwd m2, [r2+r3*1-16]
938 mova m3, [r2+r3*2-16]
939 punpckhwd m3, [r2+r1*1-16]
942 PALIGNR m1, m0, m4, 14, m4 ; m1 = (m0:m4) >> 14 bytes
944 pshufhw m2, m2, 10100100b ; high words become 4,5,6,6
945 PRED4x4_LOWPASS m0, m1, m2, m0
948 pshufhw m1, m1, 10100100b ; high words become 4,5,6,6
949 pshufhw m2, m2, 01010100b ; high words become 4,5,5,5
951 PRED4x4_LOWPASS m1, m2, m0, m1
956 pshufd m0, m5, 11111001b ; m0 dwords = m5[1], m5[2], m5[3], m5[3]
957 pshufd m1, m5, 11111110b ; m1 dwords = m5[2], m5[3], m5[3], m5[3]
958 pshufd m2, m5, 11111111b ; m2 dwords = m5[3] in every lane
962 PALIGNR m2, m5, m4, 4, m0 ; m2 = (m5:m4) >> 4 bytes
963 PALIGNR m3, m5, m4, 8, m1 ; m3 = (m5:m4) >> 8 bytes
964 PALIGNR m5, m5, m4, 12, m4 ; m5 = (m5:m4) >> 12 bytes
; The macro is expanded three times.
; NOTE(review): presumably once per instruction-set configuration; confirm the
; selection lines that precede each expansion.
972 PRED8x8L_HORIZONTAL_UP
974 PRED8x8L_HORIZONTAL_UP
976 PRED8x8L_HORIZONTAL_UP
979 ;-----------------------------------------------------------------------------
980 ; void pred16x16_vertical(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 3 general-purpose registers.
981 ;-----------------------------------------------------------------------------
991 %macro PRED16x16_VERTICAL 0
992 cglobal pred16x16_vertical_10, 2, 3
; MOV16 is invoked with a row address followed by four registers.
; NOTE(review): presumably it stores the four registers to consecutive parts of
; that row; confirm against the MOV16 definition.
1002 MOV16 r0+r1*1, m0, m1, m2, m3
1003 MOV16 r0+r1*2, m0, m1, m2, m3
1015 ;-----------------------------------------------------------------------------
1016 ; void pred16x16_horizontal(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 3 general-purpose registers.
1017 ;-----------------------------------------------------------------------------
1018 %macro PRED16x16_HORIZONTAL 0
1019 cglobal pred16x16_horizontal_10, 2, 3
1022 movd m0, [r0+r1*0-4] ; 4 bytes ending at r0 (the two words just before the row address)
1023 movd m1, [r0+r1*1-4] ; 4 bytes ending at r0+r1
; MOV16 is given the same register four times, so one register value covers the
; whole row.
1026 MOV16 r0+r1*0, m0, m0, m0, m0
1027 MOV16 r0+r1*1, m1, m1, m1, m1
; The macro is expanded twice.
; NOTE(review): presumably once per instruction-set configuration; confirm.
1035 PRED16x16_HORIZONTAL
1037 PRED16x16_HORIZONTAL
1039 ;-----------------------------------------------------------------------------
1040 ; void pred16x16_dc(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 6 general-purpose registers.
1041 ;-----------------------------------------------------------------------------
1042 %macro PRED16x16_DC 0
1043 cglobal pred16x16_dc_10, 2, 6
1047 paddw m0, [r0+mmsize] ; add the next register-width chunk of the row at r0, word-wise
; Scalar loads of the 16-bit word at r0 and at r0+r1, zero-extended.
; NOTE(review): r0 has presumably been moved to a left-column position by this
; point; confirm.
1055 movzx r3d, word [r0]
1056 movzx r4d, word [r0+r1]
1059 movzx r2d, word [r0]
1061 movzx r2d, word [r0+r1]
; m0 is passed for all four register operands, so a single value fills each row.
1072 MOV16 r5+r1*0, m0, m0, m0, m0
1073 MOV16 r5+r1*1, m0, m0, m0, m0
1085 ;-----------------------------------------------------------------------------
1086 ; void pred16x16_top_dc(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 3 general-purpose registers.
1087 ;-----------------------------------------------------------------------------
1088 %macro PRED16x16_TOP_DC 0
1089 cglobal pred16x16_top_dc_10, 2, 3
1092 paddw m0, [r0+mmsize] ; add the next register-width chunk of the row at r0, word-wise
; m0 is passed for all four register operands, so a single value fills each row.
1104 MOV16 r0+r1*1, m0, m0, m0, m0
1105 MOV16 r0+r1*2, m0, m0, m0, m0
1117 ;-----------------------------------------------------------------------------
1118 ; void pred16x16_left_dc(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 6 general-purpose registers.
1119 ;-----------------------------------------------------------------------------
1120 %macro PRED16x16_LEFT_DC 0
1121 cglobal pred16x16_left_dc_10, 2, 6
; Scalar loads of the 16-bit word at r0 and at r0+r1, zero-extended.
; NOTE(review): r0 has presumably been moved to a left-column position by this
; point; confirm.
1125 movzx r3d, word [r0]
1126 movzx r4d, word [r0+r1]
1129 movzx r2d, word [r0]
1131 movzx r2d, word [r0+r1]
; m0 is passed for all four register operands, so a single value fills each row.
1141 MOV16 r5+r1*0, m0, m0, m0, m0
1142 MOV16 r5+r1*1, m0, m0, m0, m0
1154 ;-----------------------------------------------------------------------------
1155 ; void pred16x16_128_dc(pixel *src, int stride)
; Parameterless macro that emits the 10-bit routine (symbol suffix _10) when
; invoked. cglobal declares 2 arguments and 3 general-purpose registers.
1156 ;-----------------------------------------------------------------------------
1157 %macro PRED16x16_128_DC 0
1158 cglobal pred16x16_128_dc_10, 2,3
; m0 is passed for all four register operands, so a single value fills each row.
1162 MOV16 r0+r1*0, m0, m0, m0, m0
1163 MOV16 r0+r1*1, m0, m0, m0, m0