1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Alias for the per-word sample ceiling used by this file: 1023 == (1 << 10) - 1,
; the largest 10-bit value. pw_1023 itself is defined outside this excerpt.
30 %define pw_pixel_max pw_1023
; Eight signed word weights -3..4, used below as the pmaddwd/pmullw memory
; operand in pred8x8_plane_10.
39 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
45 ; dest, left, right, src
46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Four operands, %1..%4, in the order listed above. Reading the formula against
; that list, %2/%3 would supply t[n-1]/t[n+1] and %4 the centre t[n] -- confirm
; against the macro body, which is not visible in this excerpt.
; Many call sites in this file pass the same register as %1 and %4.
47 %macro PRED4x4_LOWPASS 4
53 ;-----------------------------------------------------------------------------
54 ; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
56 ;-----------------------------------------------------------------------------
; Partial view of the 4x4 down-right predictor for 10-bit samples: the embedded
; line numbers jump (58 -> 62 -> 67), so pointer setup and the output stores
; are not shown here.
58 cglobal pred4x4_down_right_10, 3, 3
; movhps fills the upper 8 bytes of the register from the 8 bytes that end at
; the row address, i.e. the samples immediately left of that row (2-byte
; samples assumed, as elsewhere in this file).
62 movhps m2, [r0+r2*1-8]
67 PALIGNR m3, m1, 10, m1
68 movhps m4, [r1+r2*1-8]
; Assuming PALIGNR (from x86util.asm) mirrors the palignr instruction, a
; 14-byte shift moves the first source up by one word and brings in the top
; word of the second source as the new lowest word.
69 PALIGNR m0, m3, m4, 14, m4
70 movhps m4, [r1+r2*2-8]
71 PALIGNR m2, m0, m4, 14, m4
; 3-tap filter of m0, with m2 and m3 passed as the two neighbour operands.
72 PRED4x4_LOWPASS m0, m2, m3, m0
92 ;------------------------------------------------------------------------------
93 ; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
95 ;------------------------------------------------------------------------------
; Partial view of the 4x4 vertical-right predictor for 10-bit samples.
; The trailing comments track the edge vector as it is assembled one word at a
; time; t*/lt/l* read as top row, top-left and left column -- confirm against
; the full file, since the loads that define r1 are not shown here.
97 cglobal pred4x4_vertical_right_10, 3, 3, 6
100 movq m5, [r0] ; ........t3t2t1t0
102 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
; Each movhps below loads the 8 bytes ending at a row address into the upper
; half of a register, so the sample just left of that row sits in the top word
; for the following 14-byte PALIGNR to pick up.
104 movhps m1, [r0+r2*1-8]
105 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
106 movhps m2, [r0+r2*2-8]
107 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
108 movhps m3, [r1+r2*1-8]
109 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
; Filter m1 using m0 and m2, which per the comments above are the same edge
; offset by one word in either direction.
110 PRED4x4_LOWPASS m1, m0, m2, m1
115 PALIGNR m5, m0, 14, m2
118 PALIGNR m1, m0, 14, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
127 %if HAVE_AVX_EXTERNAL
132 ;-------------------------------------------------------------------------------
133 ; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
135 ;-------------------------------------------------------------------------------
; Partial view of the 4x4 horizontal-down predictor for 10-bit samples.
; The trailing comments show how the top and left neighbours are packed into a
; single 8-word edge vector in m1 before filtering. Several intermediate lines
; (including the loads into m3) are not visible in this excerpt.
137 cglobal pred4x4_horizontal_down_10, 3, 3
140 movq m0, [r0-8] ; lt ..
142 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
143 movq m1, [r1+r2*2-8] ; l3
145 punpcklwd m1, m3 ; l2 l3
146 movq m2, [r0+r2*2-8] ; l1
148 punpcklwd m2, m3 ; l0 l1
149 punpckhdq m1, m2 ; l0 l1 l2 l3
150 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
; m0 and m3 are the edge shifted down by two and one words respectively, so
; m3 is the centre operand and m1/m0 its two neighbours in the filter below.
151 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
152 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
154 PRED4x4_LOWPASS m3, m1, m0, m3
157 PALIGNR m3, m5, 12, m4
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
170 %if HAVE_AVX_EXTERNAL
175 ;-----------------------------------------------------------------------------
176 ; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
177 ;-----------------------------------------------------------------------------
; Partial view of the 4x4 DC predictor for 10-bit samples.
180 cglobal pred4x4_dc_10, 3, 3
; Accumulate into m2, word-wise, the data stored just before three further
; rows (row address - 8 bytes). The initial load of m2 and the setup of r1 are
; not visible in this excerpt.
184 paddw m2, [r0+r2*2-8]
185 paddw m2, [r1+r2*1-8]
186 paddw m2, [r1+r2*2-8]
200 ;-----------------------------------------------------------------------------
201 ; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
203 ;-----------------------------------------------------------------------------
; Partial view of the 4x4 down-left predictor for 10-bit samples; the loads
; that fill m0/m2/m3 are not visible in this excerpt.
205 cglobal pred4x4_down_left_10, 3, 3
; Selector 10100100b maps the high four words to {w4, w5, w6, w6}: word 7 is
; overwritten with a copy of word 6, the low half is untouched.
211 pshufhw m2, m2, 10100100b
; 3-tap filter of m0 with m3 and m2 as the neighbour operands.
212 PRED4x4_LOWPASS m0, m3, m2, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
226 %if HAVE_AVX_EXTERNAL
231 ;-----------------------------------------------------------------------------
232 ; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
234 ;-----------------------------------------------------------------------------
; Partial view of the 4x4 vertical-left predictor for 10-bit samples; only the
; declaration and the filter step are visible in this excerpt.
236 cglobal pred4x4_vertical_left_10, 3, 3
; 3-tap filter of m0 with m1 and m2 as the neighbour operands.
243 PRED4x4_LOWPASS m0, m1, m2, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
256 %if HAVE_AVX_EXTERNAL
261 ;-----------------------------------------------------------------------------
262 ; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
264 ;-----------------------------------------------------------------------------
; Partial view of the 4x4 horizontal-up predictor for 10-bit samples; the
; initial loads of m0/m1 and the stores are not visible in this excerpt.
266 cglobal pred4x4_horizontal_up_10, 3, 3
; punpckhwd interleaves the upper words of the register with the upper words
; of the 8 bytes that end at the addressed row.
270 punpckhwd m0, [r0+r2*2-8]
272 punpckhwd m1, [r1+r2*2-8]
; Selector 11111001b -> {w1, w2, w3, w3}: shift down one word, repeat the last.
277 pshufw m2, m0, 11111001b
; Selector 11111110b -> {w2, w3, w3, w3}: shift down two words, repeat the last.
281 pshufw m5, m0, 11111110b
282 PRED4x4_LOWPASS m1, m0, m5, m1
296 ;-----------------------------------------------------------------------------
297 ; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
298 ;-----------------------------------------------------------------------------
; Declaration of the 8x8 vertical predictor for 10-bit samples (prototype in
; the comment above); the body is not visible in this excerpt.
300 cglobal pred8x8_vertical_10, 2, 2
312 ;-----------------------------------------------------------------------------
313 ; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
314 ;-----------------------------------------------------------------------------
; Declaration of the 8x8 horizontal predictor for 10-bit samples (prototype in
; the comment above); the body is not visible in this excerpt.
316 cglobal pred8x8_horizontal_10, 2, 3
332 ;-----------------------------------------------------------------------------
333 ; void ff_pred8x8_dc_10(pixel *src, ptrdiff_t stride)
334 ;-----------------------------------------------------------------------------
336 ; sort of a hack, but it works
; Partial view of the 8x8 DC predictor for 10-bit samples. Arithmetic between
; the visible lines (and the setup of r4/r5) is not shown in this excerpt.
346 cglobal pred8x8_dc_10, 2, 6
; Selector 00001110b -> {w2, w3, w0, w0}: brings the upper word pair down to
; the low positions.
356 pshufw m2, m0, 00001110b
357 pshufw m3, m1, 00001110b
; Scalar loads of one word at byte offset -2 from successive rows, i.e. the
; sample in the column just left of each row (2-byte samples). r3d is reloaded
; repeatedly, so presumably it is consumed by lines not shown in between.
367 movzx r2d, word [r0+r1*1-2]
368 movzx r3d, word [r0+r1*2-2]
370 movzx r3d, word [r0+r5*1-2]
372 movzx r3d, word [r4-2]
376 movzx r2d, word [r4+r1*1-2]
377 movzx r3d, word [r4+r1*2-2]
379 movzx r3d, word [r4+r5*1-2]
381 movzx r3d, word [r4+r1*4-2]
386 punpckldq m0, m2 ; s0, s1, s2, s3
; %1 is a macro parameter used in the instruction position, so these lines sit
; inside a %macro whose header is not visible here; the caller supplies the
; word-shuffle mnemonic.
387 %1 m3, m0, 11110110b ; s2, s1, s3, s3
388 %1 m0, m0, 01110100b ; s0, s1, s3, s1
391 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
; Selector 11111010b -> dwords {d2, d2, d3, d3}.
394 pshufd m3, m0, 11111010b
419 ;-----------------------------------------------------------------------------
420 ; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
421 ;-----------------------------------------------------------------------------
; Declaration of the 8x8 top-DC predictor for 10-bit samples (prototype in the
; comment above); the body is not visible in this excerpt.
423 cglobal pred8x8_top_dc_10, 2, 4
446 ;-----------------------------------------------------------------------------
447 ; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
448 ;-----------------------------------------------------------------------------
; Partial view of the 8x8 plane predictor for 10-bit samples. The loads that
; fill m0/m2/m4, the setup of r2/r3 and the output loop are not visible here.
450 cglobal pred8x8_plane_10, 2, 7, 7
; pmaddwd multiplies m2 word-wise by the -3..4 weights and sums adjacent
; products into dwords.
455 pmaddwd m2, [pw_m32101234]
463 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
; Scalar loads of single left-column samples (byte offset -2 from the row);
; the existing comments name the row each one addresses. r5d/r6d are reloaded
; several times, so presumably they are consumed by lines not shown in between.
464 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
465 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
467 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
468 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
471 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
472 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
476 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
477 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
; Loads the 10-bit sample ceiling (pw_pixel_max aliases pw_1023); presumably
; the upper clamp bound -- the clamp itself is not visible in this excerpt.
486 mova m3, [pw_pixel_max]
491 pmullw m2, [pw_m32101234] ; b
492 pmullw m5, m4, [pw_m3] ; c
509 ;-----------------------------------------------------------------------------
510 ; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
512 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_128_dc_10; only its first lines are visible in
; this excerpt (the stores and %endmacro are not shown).
513 %macro PRED8x8L_128_DC 0
514 cglobal pred8x8l_128_dc_10, 4, 4
; 512 == 1 << 9, the mid-range value for 10-bit samples.
515 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
534 ;-----------------------------------------------------------------------------
535 ; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
537 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_top_dc_10; only part of its body is visible in
; this excerpt.
538 %macro PRED8x8L_TOP_DC 0
539 cglobal pred8x8l_top_dc_10, 4, 4, 6
; Patch the end words of the two neighbour vectors from memory: word 0 of m1
; from [r0+r1] and word 7 of m2 from [r0+r2+14]. r1/r2 are presumably byte
; offsets derived from has_topleft/has_topright by lines not shown here.
547 pinsrw m1, [r0+r1], 0
548 pinsrw m2, [r0+r2+14], 7
; 3-tap filter of m0 with m2 and m1 as the neighbour operands.
551 PRED4x4_LOWPASS m0, m2, m1, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
569 %if HAVE_AVX_EXTERNAL
574 ;-------------------------------------------------------------------------------
575 ; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
577 ;-------------------------------------------------------------------------------
578 ;TODO: see if scalar is faster
; Partial view of the 8x8 luma DC predictor for 10-bit samples; the setup of
; r4/r5 and the summation/stores are not visible in this excerpt.
580 cglobal pred8x8l_dc_10, 4, 6, 6
; Gather the left column two rows at a time: each mova reads the 16 bytes that
; end at a row address, and punpckhwd interleaves the upper words so the two
; highest words of the result are the samples immediately left of the two rows.
584 mova m0, [r0+r3*2-16]
585 punpckhwd m0, [r0+r3*1-16]
586 mova m1, [r4+r3*0-16]
587 punpckhwd m1, [r0+r5*1-16]
589 mova m2, [r4+r3*2-16]
590 punpckhwd m2, [r4+r3*1-16]
591 mova m3, [r4+r3*4-16]
592 punpckhwd m3, [r4+r5*1-16]
; Patch word 0 of m1 and word 7 of m2 from memory; r1/r2 are presumably byte
; offsets derived from has_topleft/has_topright by lines not shown here.
601 pinsrw m1, [r0+r1], 0
602 pinsrw m2, [r0+r2+14], 7
; Selector 11100101b -> low words {w1, w1, w2, w3}: word 0 becomes a copy of
; word 1, the high half is untouched.
607 pshuflw m4, m4, 11100101b
608 pinsrw m5, [r0+r1-2], 7
; Two filter passes: m3 with neighbours m4/m5, then m0 with neighbours m2/m1.
609 PRED4x4_LOWPASS m3, m4, m5, m3
610 PRED4x4_LOWPASS m0, m2, m1, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
629 %if HAVE_AVX_EXTERNAL
634 ;-----------------------------------------------------------------------------
635 ; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
637 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_vertical_10; only part of its body is visible in
; this excerpt.
638 %macro PRED8x8L_VERTICAL 0
639 cglobal pred8x8l_vertical_10, 4, 4, 6
; Patch word 0 of m1 and word 7 of m2 from memory; r1/r2 are presumably byte
; offsets derived from has_topleft/has_topright by lines not shown here.
647 pinsrw m1, [r0+r1], 0
648 pinsrw m2, [r0+r2+14], 7
; 3-tap filter of m0 with m2 and m1 as the neighbour operands.
651 PRED4x4_LOWPASS m0, m2, m1, m0
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
665 %if HAVE_AVX_EXTERNAL
670 ;-----------------------------------------------------------------------------
671 ; void ff_pred8x8l_horizontal_10(pixel *src, int has_topleft,
672 ; int has_topright, ptrdiff_t stride)
673 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_horizontal_10; only part of its body is visible in
; this excerpt (the initial load of m0 and the setup of r1/r2 are not shown).
674 %macro PRED8x8L_HORIZONTAL 0
675 cglobal pred8x8l_horizontal_10, 4, 4, 5
; Gather the left column: each load reads the 16 bytes ending at a row address
; and punpckhwd interleaves the upper words, leaving the samples immediately
; left of the two rows in the two highest words.
681 punpckhwd m0, [r0+r1-16]
682 mova m1, [r0+r3*2-16]
683 punpckhwd m1, [r0+r3*1-16]
687 mova m2, [r2+r3*0-16]
688 punpckhwd m2, [r0+r1-16]
689 mova m3, [r2+r3*2-16]
690 punpckhwd m3, [r2+r3*1-16]
693 PALIGNR m4, m3, [r2+r1-16], 14, m0
; Selector 11100101b -> low words {w1, w1, w2, w3}: word 0 becomes a copy of
; word 1, the high half is untouched.
695 pshuflw m0, m0, 11100101b
; 3-tap filter of m4 with m3 and m0 as the neighbour operands.
696 PRED4x4_LOWPASS m4, m3, m0, m4
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
722 %if HAVE_AVX_EXTERNAL
727 ;-----------------------------------------------------------------------------
728 ; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
730 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_down_left_10; only part of its body is visible in
; this excerpt (edge loads, the .fix_tr path and the row stores are not shown).
731 %macro PRED8x8L_DOWN_LEFT 0
732 cglobal pred8x8l_down_left_10, 4, 4, 7
; Patch word 0 of m1 and word 7 of m2 from memory; r1/r2 are presumably byte
; offsets derived from has_topleft/has_topright by lines not shown here.
740 pinsrw m1, [r0+r1], 0
741 pinsrw m2, [r0+r2+14], 7
742 PRED4x4_LOWPASS m6, m2, m1, m3
; The branch consumes flags set by an earlier shr (not visible here), so it
; relies on nothing in between modifying the flags. NOTE(review): the visible
; intervening lines are SIMD ops, but the macro body is not shown -- confirm.
743 jz .fix_tr ; flags from shr r2d
746 PALIGNR m2, m1, m3, 14, m3
; Selector 10100100b -> high words {w4, w5, w6, w6}: word 7 becomes a copy of
; word 6.
747 pshufhw m5, m5, 10100100b
748 PRED4x4_LOWPASS m1, m2, m5, m1
; Build one-word-shifted copies across the m1:m6 pair (shifts of 2 and 14
; bytes) to serve as neighbours for the second filter pass below.
753 PALIGNR m2, m1, m6, 2, m0
754 PALIGNR m3, m1, m6, 14, m0
755 PALIGNR m5, m1, 2, m0
757 PRED4x4_LOWPASS m6, m4, m2, m6
758 PRED4x4_LOWPASS m1, m3, m5, m1
; Seven identical 14-byte steps of m1 against m6; presumably one per output
; row, with the stores and updates of m6 on lines not shown. The last step
; passes m6 itself as the final (scratch) operand.
760 PALIGNR m1, m6, 14, m2
763 PALIGNR m1, m6, 14, m2
766 PALIGNR m1, m6, 14, m2
769 PALIGNR m1, m6, 14, m2
772 PALIGNR m1, m6, 14, m2
775 PALIGNR m1, m6, 14, m2
778 PALIGNR m1, m6, 14, m6
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
791 %if HAVE_AVX_EXTERNAL
796 ;-----------------------------------------------------------------------------
797 ; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
798 ; int has_topright, ptrdiff_t stride)
799 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_down_right_10; only part of its body is visible in
; this excerpt (setup of r1/r4, the top-row loads and the stores are not shown).
800 %macro PRED8x8L_DOWN_RIGHT 0
801 ; standard forbids this when has_topleft is false
803 cglobal pred8x8l_down_right_10, 4, 5, 8
; Gather the left column two rows at a time: each mova reads the 16 bytes that
; end at a row address, and punpckhwd interleaves the upper words so the two
; highest words are the samples immediately left of the two rows.
807 mova m0, [r0+r3*1-16]
808 punpckhwd m0, [r0+r3*0-16]
809 mova m1, [r0+r1*1-16]
810 punpckhwd m1, [r0+r3*2-16]
812 mova m2, [r4+r3*1-16]
813 punpckhwd m2, [r4+r3*0-16]
814 mova m3, [r4+r1*1-16]
815 punpckhwd m3, [r4+r3*2-16]
818 mova m0, [r4+r3*4-16]
820 PALIGNR m4, m3, m0, 14, m0
821 PALIGNR m1, m3, 2, m2
; Selector 11100101b -> low words {w1, w1, w2, w3}: word 0 becomes a copy of
; word 1.
823 pshuflw m0, m0, 11100101b
824 PRED4x4_LOWPASS m6, m1, m4, m3
825 PRED4x4_LOWPASS m4, m3, m0, m4
; Patch word 7 of m2 from memory; r2 is presumably a byte offset derived from
; has_topright by lines not shown here.
831 pinsrw m2, [r0+r2+14], 7
832 PRED4x4_LOWPASS m3, m2, m1, m3
; One-word-shifted copies across the m3:m6 pair feed the second filter pass.
833 PALIGNR m2, m3, m6, 2, m0
834 PALIGNR m5, m3, m6, 14, m0
836 PRED4x4_LOWPASS m6, m4, m2, m6
837 PRED4x4_LOWPASS m3, m5, m7, m3
; Seven identical 14-byte steps of m3 against m6; presumably one per output
; row, with the stores and updates of m6 on lines not shown. The last step
; passes m6 itself as the final (scratch) operand.
839 PALIGNR m3, m6, 14, m2
842 PALIGNR m3, m6, 14, m2
845 PALIGNR m3, m6, 14, m2
848 PALIGNR m3, m6, 14, m2
851 PALIGNR m3, m6, 14, m2
854 PALIGNR m3, m6, 14, m2
857 PALIGNR m3, m6, 14, m6
; Opens a section conditional on HAVE_AVX_EXTERNAL; its contents are not
; visible in this excerpt.
866 %if HAVE_AVX_EXTERNAL
871 ;-----------------------------------------------------------------------------
872 ; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
873 ; int has_topright, ptrdiff_t stride)
874 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_vertical_right_10; only part of its body is visible
; in this excerpt (setup of r1/r4, the top-row loads and the stores are not
; shown).
875 %macro PRED8x8L_VERTICAL_RIGHT 0
876 ; likewise with 8x8l_down_right
877 cglobal pred8x8l_vertical_right_10, 4, 5, 7
; Gather the left column two rows at a time: each mova reads the 16 bytes that
; end at a row address, and punpckhwd interleaves the upper words so the two
; highest words are the samples immediately left of the two rows.
881 mova m0, [r0+r3*1-16]
882 punpckhwd m0, [r0+r3*0-16]
883 mova m1, [r0+r1*1-16]
884 punpckhwd m1, [r0+r3*2-16]
886 mova m2, [r4+r3*1-16]
887 punpckhwd m2, [r4+r3*0-16]
888 mova m3, [r4+r1*1-16]
889 punpckhwd m3, [r4+r3*2-16]
892 mova m0, [r4+r3*4-16]
894 PALIGNR m4, m3, m0, 14, m0
895 PALIGNR m1, m3, 2, m2
896 PRED4x4_LOWPASS m3, m1, m4, m3
; Patch word 7 of m5 from memory; r2 is presumably a byte offset derived from
; has_topright by lines not shown here.
902 pinsrw m5, [r0+r2+14], 7
903 PRED4x4_LOWPASS m2, m5, m1, m2
; Copies of the m2:m3 pair shifted by 12 and 14 bytes (two words / one word).
904 PALIGNR m6, m2, m3, 12, m1
905 PALIGNR m5, m2, m3, 14, m0
906 PRED4x4_LOWPASS m0, m6, m2, m5
912 PRED4x4_LOWPASS m1, m3, m6, m1
; Alternating 14-byte steps of m2 and m0 against m1; presumably interleaved
; with row stores and updates of m1 on lines not shown. The last step passes
; m1 itself as the final (scratch) operand.
913 PALIGNR m2, m1, 14, m4
916 PALIGNR m0, m1, 14, m3
919 PALIGNR m2, m1, 14, m4
922 PALIGNR m0, m1, 14, m3
925 PALIGNR m2, m1, 14, m4
928 PALIGNR m0, m1, 14, m1
; The macro is expanded three times, the third inside a HAVE_AVX_EXTERNAL
; conditional; the lines selecting the instruction set for each expansion are
; not visible in this excerpt.
934 PRED8x8L_VERTICAL_RIGHT
936 PRED8x8L_VERTICAL_RIGHT
937 %if HAVE_AVX_EXTERNAL
939 PRED8x8L_VERTICAL_RIGHT
942 ;-----------------------------------------------------------------------------
943 ; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
944 ; int has_topright, ptrdiff_t stride)
945 ;-----------------------------------------------------------------------------
; Macro that emits pred8x8l_horizontal_up_10; only part of its body is visible
; in this excerpt (setup of r1/r2 and the stores are not shown).
946 %macro PRED8x8L_HORIZONTAL_UP 0
947 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
; Gather the left column two rows at a time: each mova reads the 16 bytes that
; end at a row address, and punpckhwd interleaves the upper words so the two
; highest words are the samples immediately left of the two rows.
948 mova m0, [r0+r3*0-16]
949 punpckhwd m0, [r0+r3*1-16]
954 mova m4, [r0+r1*1-16]
957 mova m1, [r0+r3*2-16]
958 punpckhwd m1, [r0+r1*1-16]
960 mova m2, [r2+r3*0-16]
961 punpckhwd m2, [r2+r3*1-16]
962 mova m3, [r2+r3*2-16]
963 punpckhwd m3, [r2+r1*1-16]
966 PALIGNR m1, m0, m4, 14, m4
; Selector 10100100b -> high words {w4, w5, w6, w6}: word 7 becomes a copy of
; word 6.
968 pshufhw m2, m2, 10100100b
969 PRED4x4_LOWPASS m0, m1, m2, m0
972 pshufhw m1, m1, 10100100b
; Selector 01010100b -> high words {w4, w5, w5, w5}.
973 pshufhw m2, m2, 01010100b
975 PRED4x4_LOWPASS m1, m2, m0, m1
; Dword selectors: 11111001b -> {d1, d2, d3, d3}, 11111110b -> {d2, d3, d3, d3},
; 11111111b -> {d3, d3, d3, d3}; each repeats the last dword of m5.
980 pshufd m0, m5, 11111001b
981 pshufd m1, m5, 11111110b
982 pshufd m2, m5, 11111111b
; Copies of the m5:m4 pair shifted by 4, 8 and 12 bytes.
986 PALIGNR m2, m5, m4, 4, m0
987 PALIGNR m3, m5, m4, 8, m1
988 PALIGNR m5, m5, m4, 12, m4
; The macro is expanded three times, the third inside a HAVE_AVX_EXTERNAL
; conditional; the lines selecting the instruction set for each expansion are
; not visible in this excerpt.
996 PRED8x8L_HORIZONTAL_UP
998 PRED8x8L_HORIZONTAL_UP
999 %if HAVE_AVX_EXTERNAL
1001 PRED8x8L_HORIZONTAL_UP
1005 ;-----------------------------------------------------------------------------
1006 ; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
1007 ;-----------------------------------------------------------------------------
; Fragment of a store macro whose %macro line is not visible in this excerpt
; (presumably MOV16, which is invoked below with an address and four
; registers): stores the third argument one register-width past address %1.
1010 mova [%1+mmsize], %3
; Macro that emits pred16x16_vertical_10; only part of its body is visible in
; this excerpt (the other source loads and the row loop are not shown).
1017 %macro PRED16x16_VERTICAL 0
1018 cglobal pred16x16_vertical_10, 2, 3
; Second register-width chunk of the source row at r0.
1022 mova m1, [r0+mmsize]
; Write the same four registers to the rows at r0+stride and r0+2*stride
; (r1 is the stride argument per the prototype above, unless modified by
; lines not shown).
1028 MOV16 r0+r1*1, m0, m1, m2, m3
1029 MOV16 r0+r1*2, m0, m1, m2, m3
1041 ;-----------------------------------------------------------------------------
1042 ; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
1043 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_horizontal_10; only part of its body is visible
; in this excerpt (the broadcast step and the row loop are not shown).
1044 %macro PRED16x16_HORIZONTAL 0
1045 cglobal pred16x16_horizontal_10, 2, 3
; Load the 4 bytes just before each of two consecutive rows; with 2-byte
; samples the upper word is the sample immediately left of the row.
1048 movd m0, [r0+r1*0-4]
1049 movd m1, [r0+r1*1-4]
; Each row is written from a single register passed four times.
1052 MOV16 r0+r1*0, m0, m0, m0, m0
1053 MOV16 r0+r1*1, m1, m1, m1, m1
; The macro is expanded twice; the lines selecting the instruction set for
; each expansion are not visible in this excerpt.
1061 PRED16x16_HORIZONTAL
1063 PRED16x16_HORIZONTAL
1065 ;-----------------------------------------------------------------------------
1066 ; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
1067 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_dc_10; only part of its body is visible in this
; excerpt (pointer adjustments, the summation and the row loop are not shown).
1068 %macro PRED16x16_DC 0
1069 cglobal pred16x16_dc_10, 2, 6
; Add the second register-width chunk of the row at r0 into m0, word-wise.
1073 paddw m0, [r0+mmsize]
; Scalar word loads from two consecutive rows at r0 and r0+r1. The same
; addresses are read twice into different registers, so r0 is presumably
; advanced by lines not shown in between.
1081 movzx r3d, word [r0]
1082 movzx r4d, word [r0+r1]
1085 movzx r2d, word [r0]
1087 movzx r2d, word [r0+r1]
; Fill two consecutive rows at r5 with m0 passed four times.
1098 MOV16 r5+r1*0, m0, m0, m0, m0
1099 MOV16 r5+r1*1, m0, m0, m0, m0
1111 ;-----------------------------------------------------------------------------
1112 ; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
1113 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_top_dc_10; only part of its body is visible in
; this excerpt (the remaining summation and the row loop are not shown).
1114 %macro PRED16x16_TOP_DC 0
1115 cglobal pred16x16_top_dc_10, 2, 3
; Add the second register-width chunk of the row at r0 into m0, word-wise.
1118 paddw m0, [r0+mmsize]
; Fill the rows at r0+stride and r0+2*stride with m0 passed four times.
1130 MOV16 r0+r1*1, m0, m0, m0, m0
1131 MOV16 r0+r1*2, m0, m0, m0, m0
1143 ;-----------------------------------------------------------------------------
1144 ; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
1145 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_left_dc_10; only part of its body is visible in
; this excerpt (pointer adjustments, the summation and the row loop are not
; shown).
1146 %macro PRED16x16_LEFT_DC 0
1147 cglobal pred16x16_left_dc_10, 2, 6
; Scalar word loads from two consecutive rows at r0 and r0+r1. The same
; addresses are read twice into different registers, so r0 is presumably
; advanced by lines not shown in between.
1151 movzx r3d, word [r0]
1152 movzx r4d, word [r0+r1]
1155 movzx r2d, word [r0]
1157 movzx r2d, word [r0+r1]
; Fill two consecutive rows at r5 with m0 passed four times.
1167 MOV16 r5+r1*0, m0, m0, m0, m0
1168 MOV16 r5+r1*1, m0, m0, m0, m0
1180 ;-----------------------------------------------------------------------------
1181 ; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
1182 ;-----------------------------------------------------------------------------
; Macro that emits pred16x16_128_dc_10; only part of its body is visible in
; this excerpt (the load of m0 and the row loop are not shown).
1183 %macro PRED16x16_128_DC 0
1184 cglobal pred16x16_128_dc_10, 2,3
; Fill two consecutive rows at r0 with m0 passed four times.
1188 MOV16 r0+r1*0, m0, m0, m0, m0
1189 MOV16 r0+r1*1, m0, m0, m0, m0