1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; word weights -3..4; multiplied against row data via pmaddwd/pmullw in pred8x8_plane_10
37 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 8 words of 1023, the largest 10-bit sample value; loaded by pred8x8_plane_10
38 pw_512: times 8 dw 512 ; 8 words of 512 = 1 << (BIT_DEPTH-1); loaded by pred8x8l_128_dc_10
44 ; dest, left, right, src
45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
46 %macro PRED4x4_LOWPASS 4
52 ;-----------------------------------------------------------------------------
53 ; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred4x4_down_right_10, 3, 3
60 movhps m2, [r0+r2*1-8]
65 PALIGNR m3, m1, 10, m1
66 movhps m4, [r1+r2*1-8]
67 PALIGNR m0, m3, m4, 14, m4
68 movhps m4, [r1+r2*2-8]
69 PALIGNR m2, m0, m4, 14, m4
70 PRED4x4_LOWPASS m0, m2, m3, m0
88 ;------------------------------------------------------------------------------
89 ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
90 ;------------------------------------------------------------------------------
92 cglobal pred4x4_vertical_right_10, 3, 3, 6
95 movq m5, [r0] ; ........t3t2t1t0
97 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
99 movhps m1, [r0+r2*1-8]
100 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
101 movhps m2, [r0+r2*2-8]
102 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
103 movhps m3, [r1+r2*1-8]
104 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
105 PRED4x4_LOWPASS m1, m0, m2, m1
110 PALIGNR m5, m0, 14, m2
113 PALIGNR m1, m0, 14, m0
125 ;-------------------------------------------------------------------------------
126 ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
127 ;-------------------------------------------------------------------------------
129 cglobal pred4x4_horizontal_down_10, 3, 3
132 movq m0, [r0-8] ; lt ..
134 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
135 movq m1, [r1+r2*2-8] ; l3
137 punpcklwd m1, m3 ; l2 l3
138 movq m2, [r0+r2*2-8] ; l1
140 punpcklwd m2, m3 ; l0 l1
141 punpckhdq m1, m2 ; l0 l1 l2 l3
142 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
143 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
144 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
146 PRED4x4_LOWPASS m3, m1, m0, m3
149 PALIGNR m3, m5, 12, m4
165 ;-----------------------------------------------------------------------------
166 ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
167 ;-----------------------------------------------------------------------------
168 %macro HADDD 2 ; sum junk
186 cglobal pred4x4_dc_10, 3, 3
190 paddw m2, [r0+r2*2-8]
191 paddw m2, [r1+r2*1-8]
192 paddw m2, [r1+r2*2-8]
206 ;-----------------------------------------------------------------------------
207 ; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
208 ;-----------------------------------------------------------------------------
210 cglobal pred4x4_down_left_10, 3, 3
216 pshufhw m2, m2, 10100100b
217 PRED4x4_LOWPASS m0, m3, m2, m0
234 ;-----------------------------------------------------------------------------
235 ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
236 ;-----------------------------------------------------------------------------
238 cglobal pred4x4_vertical_left_10, 3, 3
245 PRED4x4_LOWPASS m0, m1, m2, m0
261 ;-----------------------------------------------------------------------------
262 ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
263 ;-----------------------------------------------------------------------------
265 cglobal pred4x4_horizontal_up_10, 3, 3
269 punpckhwd m0, [r0+r2*2-8]
271 punpckhwd m1, [r1+r2*2-8]
276 pshufw m2, m0, 11111001b
280 pshufw m5, m0, 11111110b
281 PRED4x4_LOWPASS m1, m0, m5, m1
295 ;-----------------------------------------------------------------------------
296 ; void ff_pred8x8_vertical(pixel *src, int stride)
297 ;-----------------------------------------------------------------------------
299 cglobal pred8x8_vertical_10, 2, 2
311 ;-----------------------------------------------------------------------------
312 ; void ff_pred8x8_horizontal(pixel *src, int stride)
313 ;-----------------------------------------------------------------------------
315 cglobal pred8x8_horizontal_10, 2, 3
331 ;-----------------------------------------------------------------------------
332 ; void ff_pred8x8_dc(pixel *src, int stride)
333 ;-----------------------------------------------------------------------------
335 ; sort of a hack, but it works
345 cglobal pred8x8_dc_10, 2, 6
355 pshufw m2, m0, 00001110b
356 pshufw m3, m1, 00001110b
366 movzx r2d, word [r0+r1*1-2]
367 movzx r3d, word [r0+r1*2-2]
369 movzx r3d, word [r0+r5*1-2]
371 movzx r3d, word [r4-2]
375 movzx r2d, word [r4+r1*1-2]
376 movzx r3d, word [r4+r1*2-2]
378 movzx r3d, word [r4+r5*1-2]
380 movzx r3d, word [r4+r1*4-2]
385 punpckldq m0, m2 ; s0, s1, s2, s3
386 %1 m3, m0, 11110110b ; s2, s1, s3, s3
387 %1 m0, m0, 01110100b ; s0, s1, s3, s1
390 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
393 pshufd m3, m0, 11111010b
418 ;-----------------------------------------------------------------------------
419 ; void ff_pred8x8_top_dc(pixel *src, int stride)
420 ;-----------------------------------------------------------------------------
422 cglobal pred8x8_top_dc_10, 2, 4
445 ;-----------------------------------------------------------------------------
446 ; void ff_pred8x8_plane(pixel *src, int stride)
447 ;-----------------------------------------------------------------------------
449 cglobal pred8x8_plane_10, 2, 7, 7
454 pmaddwd m2, [pw_m32101234]
462 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
463 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
464 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
466 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
467 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
470 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
471 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
475 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
476 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
485 mova m3, [pw_pixel_max]
490 pmullw m2, [pw_m32101234] ; b
491 pmullw m5, m4, [pw_m3] ; c
508 ;-----------------------------------------------------------------------------
509 ; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
511 ;-----------------------------------------------------------------------------
512 %macro PRED8x8L_128_DC 0
513 cglobal pred8x8l_128_dc_10, 4, 4
514 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
533 ;-----------------------------------------------------------------------------
534 ; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
536 ;-----------------------------------------------------------------------------
537 %macro PRED8x8L_TOP_DC 0
538 cglobal pred8x8l_top_dc_10, 4, 4, 6
546 pinsrw m1, [r0+r1], 0
547 pinsrw m2, [r0+r2+14], 7
550 PRED4x4_LOWPASS m0, m2, m1, m0
571 ;-------------------------------------------------------------------------------
572 ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
573 ;-------------------------------------------------------------------------------
574 ; TODO: check whether a scalar implementation is faster
576 cglobal pred8x8l_dc_10, 4, 6, 6
580 mova m0, [r0+r3*2-16]
581 punpckhwd m0, [r0+r3*1-16]
582 mova m1, [r4+r3*0-16]
583 punpckhwd m1, [r0+r5*1-16]
585 mova m2, [r4+r3*2-16]
586 punpckhwd m2, [r4+r3*1-16]
587 mova m3, [r4+r3*4-16]
588 punpckhwd m3, [r4+r5*1-16]
597 pinsrw m1, [r0+r1], 0
598 pinsrw m2, [r0+r2+14], 7
603 pshuflw m4, m4, 11100101b
604 pinsrw m5, [r0+r1-2], 7
605 PRED4x4_LOWPASS m3, m4, m5, m3
606 PRED4x4_LOWPASS m0, m2, m1, m0
628 ;-----------------------------------------------------------------------------
629 ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
631 ;-----------------------------------------------------------------------------
632 %macro PRED8x8L_VERTICAL 0
633 cglobal pred8x8l_vertical_10, 4, 4, 6
641 pinsrw m1, [r0+r1], 0
642 pinsrw m2, [r0+r2+14], 7
645 PRED4x4_LOWPASS m0, m2, m1, m0
662 ;-----------------------------------------------------------------------------
663 ; void ff_pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright,
665 ;-----------------------------------------------------------------------------
666 %macro PRED8x8L_HORIZONTAL 0
667 cglobal pred8x8l_horizontal_10, 4, 4, 5
673 punpckhwd m0, [r0+r1-16]
674 mova m1, [r0+r3*2-16]
675 punpckhwd m1, [r0+r3*1-16]
679 mova m2, [r2+r3*0-16]
680 punpckhwd m2, [r0+r1-16]
681 mova m3, [r2+r3*2-16]
682 punpckhwd m3, [r2+r3*1-16]
685 PALIGNR m4, m3, [r2+r1-16], 14, m0
687 pshuflw m0, m0, 11100101b
688 PRED4x4_LOWPASS m4, m3, m0, m4
717 ;-----------------------------------------------------------------------------
718 ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
720 ;-----------------------------------------------------------------------------
721 %macro PRED8x8L_DOWN_LEFT 0
722 cglobal pred8x8l_down_left_10, 4, 4, 7
730 pinsrw m1, [r0+r1], 0
731 pinsrw m2, [r0+r2+14], 7
732 PRED4x4_LOWPASS m6, m2, m1, m3
733 jz .fix_tr ; flags from shr r2d
736 PALIGNR m2, m1, m3, 14, m3
737 pshufhw m5, m5, 10100100b
738 PRED4x4_LOWPASS m1, m2, m5, m1
743 PALIGNR m2, m1, m6, 2, m0
744 PALIGNR m3, m1, m6, 14, m0
745 PALIGNR m5, m1, 2, m0
747 PRED4x4_LOWPASS m6, m4, m2, m6
748 PRED4x4_LOWPASS m1, m3, m5, m1
750 PALIGNR m1, m6, 14, m2
753 PALIGNR m1, m6, 14, m2
756 PALIGNR m1, m6, 14, m2
759 PALIGNR m1, m6, 14, m2
762 PALIGNR m1, m6, 14, m2
765 PALIGNR m1, m6, 14, m2
768 PALIGNR m1, m6, 14, m6
784 ;-----------------------------------------------------------------------------
785 ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
787 ;-----------------------------------------------------------------------------
788 %macro PRED8x8L_DOWN_RIGHT 0
789 ; the H.264 standard forbids this prediction mode when has_topleft is false
791 cglobal pred8x8l_down_right_10, 4, 5, 8
795 mova m0, [r0+r3*1-16]
796 punpckhwd m0, [r0+r3*0-16]
797 mova m1, [r0+r1*1-16]
798 punpckhwd m1, [r0+r3*2-16]
800 mova m2, [r4+r3*1-16]
801 punpckhwd m2, [r4+r3*0-16]
802 mova m3, [r4+r1*1-16]
803 punpckhwd m3, [r4+r3*2-16]
806 mova m0, [r4+r3*4-16]
808 PALIGNR m4, m3, m0, 14, m0
809 PALIGNR m1, m3, 2, m2
811 pshuflw m0, m0, 11100101b
812 PRED4x4_LOWPASS m6, m1, m4, m3
813 PRED4x4_LOWPASS m4, m3, m0, m4
819 pinsrw m2, [r0+r2+14], 7
820 PRED4x4_LOWPASS m3, m2, m1, m3
821 PALIGNR m2, m3, m6, 2, m0
822 PALIGNR m5, m3, m6, 14, m0
824 PRED4x4_LOWPASS m6, m4, m2, m6
825 PRED4x4_LOWPASS m3, m5, m7, m3
827 PALIGNR m3, m6, 14, m2
830 PALIGNR m3, m6, 14, m2
833 PALIGNR m3, m6, 14, m2
836 PALIGNR m3, m6, 14, m2
839 PALIGNR m3, m6, 14, m2
842 PALIGNR m3, m6, 14, m2
845 PALIGNR m3, m6, 14, m6
857 ;-----------------------------------------------------------------------------
858 ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
859 ; int has_topright, int stride)
860 ;-----------------------------------------------------------------------------
861 %macro PRED8x8L_VERTICAL_RIGHT 0
862 ; as with pred8x8l_down_right, the standard forbids this mode when has_topleft is false
863 cglobal pred8x8l_vertical_right_10, 4, 5, 7
867 mova m0, [r0+r3*1-16]
868 punpckhwd m0, [r0+r3*0-16]
869 mova m1, [r0+r1*1-16]
870 punpckhwd m1, [r0+r3*2-16]
872 mova m2, [r4+r3*1-16]
873 punpckhwd m2, [r4+r3*0-16]
874 mova m3, [r4+r1*1-16]
875 punpckhwd m3, [r4+r3*2-16]
878 mova m0, [r4+r3*4-16]
880 PALIGNR m4, m3, m0, 14, m0
881 PALIGNR m1, m3, 2, m2
882 PRED4x4_LOWPASS m3, m1, m4, m3
888 pinsrw m5, [r0+r2+14], 7
889 PRED4x4_LOWPASS m2, m5, m1, m2
890 PALIGNR m6, m2, m3, 12, m1
891 PALIGNR m5, m2, m3, 14, m0
892 PRED4x4_LOWPASS m0, m6, m2, m5
898 PRED4x4_LOWPASS m1, m3, m6, m1
899 PALIGNR m2, m1, 14, m4
902 PALIGNR m0, m1, 14, m3
905 PALIGNR m2, m1, 14, m4
908 PALIGNR m0, m1, 14, m3
911 PALIGNR m2, m1, 14, m4
914 PALIGNR m0, m1, 14, m1
920 PRED8x8L_VERTICAL_RIGHT
922 PRED8x8L_VERTICAL_RIGHT
924 PRED8x8L_VERTICAL_RIGHT
926 ;-----------------------------------------------------------------------------
927 ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
928 ; int has_topright, int stride)
929 ;-----------------------------------------------------------------------------
930 %macro PRED8x8L_HORIZONTAL_UP 0
931 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
932 mova m0, [r0+r3*0-16]
933 punpckhwd m0, [r0+r3*1-16]
938 mova m4, [r0+r1*1-16]
941 mova m1, [r0+r3*2-16]
942 punpckhwd m1, [r0+r1*1-16]
944 mova m2, [r2+r3*0-16]
945 punpckhwd m2, [r2+r3*1-16]
946 mova m3, [r2+r3*2-16]
947 punpckhwd m3, [r2+r1*1-16]
950 PALIGNR m1, m0, m4, 14, m4
952 pshufhw m2, m2, 10100100b
953 PRED4x4_LOWPASS m0, m1, m2, m0
956 pshufhw m1, m1, 10100100b
957 pshufhw m2, m2, 01010100b
959 PRED4x4_LOWPASS m1, m2, m0, m1
964 pshufd m0, m5, 11111001b
965 pshufd m1, m5, 11111110b
966 pshufd m2, m5, 11111111b
970 PALIGNR m2, m5, m4, 4, m0
971 PALIGNR m3, m5, m4, 8, m1
972 PALIGNR m5, m5, m4, 12, m4
980 PRED8x8L_HORIZONTAL_UP
982 PRED8x8L_HORIZONTAL_UP
984 PRED8x8L_HORIZONTAL_UP
987 ;-----------------------------------------------------------------------------
988 ; void ff_pred16x16_vertical(pixel *src, int stride)
989 ;-----------------------------------------------------------------------------
999 %macro PRED16x16_VERTICAL 0
1000 cglobal pred16x16_vertical_10, 2, 3
1004 mova m1, [r0+mmsize]
1010 MOV16 r0+r1*1, m0, m1, m2, m3
1011 MOV16 r0+r1*2, m0, m1, m2, m3
1023 ;-----------------------------------------------------------------------------
1024 ; void ff_pred16x16_horizontal(pixel *src, int stride)
1025 ;-----------------------------------------------------------------------------
1026 %macro PRED16x16_HORIZONTAL 0
1027 cglobal pred16x16_horizontal_10, 2, 3
1030 movd m0, [r0+r1*0-4]
1031 movd m1, [r0+r1*1-4]
1034 MOV16 r0+r1*0, m0, m0, m0, m0
1035 MOV16 r0+r1*1, m1, m1, m1, m1
1043 PRED16x16_HORIZONTAL
1045 PRED16x16_HORIZONTAL
1047 ;-----------------------------------------------------------------------------
1048 ; void ff_pred16x16_dc(pixel *src, int stride)
1049 ;-----------------------------------------------------------------------------
1050 %macro PRED16x16_DC 0
1051 cglobal pred16x16_dc_10, 2, 6
1055 paddw m0, [r0+mmsize]
1063 movzx r3d, word [r0]
1064 movzx r4d, word [r0+r1]
1067 movzx r2d, word [r0]
1069 movzx r2d, word [r0+r1]
1080 MOV16 r5+r1*0, m0, m0, m0, m0
1081 MOV16 r5+r1*1, m0, m0, m0, m0
1093 ;-----------------------------------------------------------------------------
1094 ; void ff_pred16x16_top_dc(pixel *src, int stride)
1095 ;-----------------------------------------------------------------------------
1096 %macro PRED16x16_TOP_DC 0
1097 cglobal pred16x16_top_dc_10, 2, 3
1100 paddw m0, [r0+mmsize]
1112 MOV16 r0+r1*1, m0, m0, m0, m0
1113 MOV16 r0+r1*2, m0, m0, m0, m0
1125 ;-----------------------------------------------------------------------------
1126 ; void ff_pred16x16_left_dc(pixel *src, int stride)
1127 ;-----------------------------------------------------------------------------
1128 %macro PRED16x16_LEFT_DC 0
1129 cglobal pred16x16_left_dc_10, 2, 6
1133 movzx r3d, word [r0]
1134 movzx r4d, word [r0+r1]
1137 movzx r2d, word [r0]
1139 movzx r2d, word [r0+r1]
1149 MOV16 r5+r1*0, m0, m0, m0, m0
1150 MOV16 r5+r1*1, m0, m0, m0, m0
1162 ;-----------------------------------------------------------------------------
1163 ; void ff_pred16x16_128_dc(pixel *src, int stride)
1164 ;-----------------------------------------------------------------------------
1165 %macro PRED16x16_128_DC 0
1166 cglobal pred16x16_128_dc_10, 2,3
1170 MOV16 r0+r1*0, m0, m0, m0, m0
1171 MOV16 r0+r1*1, m0, m0, m0, m0