1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
38 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
40 pw_pixel_max: times 8 dw ((1 << 10)-1)
41 pw_512: times 8 dw 512
45 ; dest, left, right, src
46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47 %macro PRED4x4_LOWPASS 4
53 ;-----------------------------------------------------------------------------
54 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
55 ;-----------------------------------------------------------------------------
57 cglobal pred4x4_down_right_10_%1, 3,3
61 movhps m2, [r0+r2*1-8]
66 PALIGNR m3, m1, 10, m1
68 movhps m4, [r1+r2*1-8]
69 PALIGNR m3, m4, 14, m4
71 movhps m4, [r1+r2*2-8]
72 PALIGNR m3, m4, 14, m4
73 PRED4x4_LOWPASS m0, m3, m1, m2
85 %define PALIGNR PALIGNR_MMX
87 %define PALIGNR PALIGNR_SSSE3
94 ;-----------------------------------------------------------------------------
95 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
96 ;-----------------------------------------------------------------------------
98 cglobal pred4x4_vertical_right_10_%1, 3,3,6
101 movq m5, [r0] ; ........t3t2t1t0
103 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
105 movhps m1, [r0+r2*1-8]
106 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
108 movhps m2, [r0+r2*2-8]
109 PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
111 movhps m3, [r1+r2*1-8]
112 PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2
113 PRED4x4_LOWPASS m3, m1, m0, m2
118 PALIGNR m5, m1, 14, m2
121 PALIGNR m3, m1, 14, m1
127 %define PALIGNR PALIGNR_MMX
129 %define PALIGNR PALIGNR_SSSE3
136 ;-----------------------------------------------------------------------------
137 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
138 ;-----------------------------------------------------------------------------
140 cglobal pred4x4_horizontal_down_10_%1, 3,3
143 movq m0, [r0-8] ; lt ..
145 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
146 movq m1, [r1+r2*2-8] ; l3
148 punpcklwd m1, m3 ; l2 l3
149 movq m2, [r0+r2*2-8] ; l1
151 punpcklwd m2, m3 ; l0 l1
152 punpckhdq m1, m2 ; l0 l1 l2 l3
153 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
154 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
155 psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
157 PRED4x4_LOWPASS m3, m1, m0, m2
160 PALIGNR m3, m5, 12, m4
170 %define PALIGNR PALIGNR_MMX
172 %define PALIGNR PALIGNR_SSSE3
179 ;-----------------------------------------------------------------------------
180 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
181 ;-----------------------------------------------------------------------------
182 %macro HADDD 2 ; sum junk
200 cglobal pred4x4_dc_10_mmxext, 3,3
204 paddw m2, [r0+r2*2-8]
205 paddw m2, [r1+r2*1-8]
206 paddw m2, [r1+r2*2-8]
220 ;-----------------------------------------------------------------------------
221 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
222 ;-----------------------------------------------------------------------------
225 cglobal pred4x4_down_left_10_%1, 3,3
233 PRED4x4_LOWPASS m0, m5, m3, m1
252 ;-----------------------------------------------------------------------------
253 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
254 ;-----------------------------------------------------------------------------
256 cglobal pred4x4_vertical_left_10_%1, 3,3
263 PRED4x4_LOWPASS m0, m1, m2, m3
281 ;-----------------------------------------------------------------------------
282 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
283 ;-----------------------------------------------------------------------------
285 cglobal pred4x4_horizontal_up_10_mmxext, 3,3
289 punpckhwd m0, [r0+r2*2-8]
291 punpckhwd m1, [r1+r2*2-8]
296 pshufw m2, m0, 11111001b
300 pshufw m5, m0, 11111110b
301 PRED4x4_LOWPASS m3, m0, m5, m1
315 ;-----------------------------------------------------------------------------
316 ; void pred8x8_vertical(pixel *src, int stride)
317 ;-----------------------------------------------------------------------------
319 cglobal pred8x8_vertical_10_sse2, 2,2
331 ;-----------------------------------------------------------------------------
332 ; void pred8x8_horizontal(pixel *src, int stride)
333 ;-----------------------------------------------------------------------------
335 cglobal pred8x8_horizontal_10_sse2, 2,3
351 ;-----------------------------------------------------------------------------
352 ; void pred8x8_dc(pixel *src, int stride)
353 ;-----------------------------------------------------------------------------
355 ; sort of a hack, but it works
365 cglobal pred8x8_dc_10_%1, 2,4
379 movzx r2d, word [r0+r1*1-2]
380 movzx r3d, word [r0+r1*2-2]
383 movzx r3d, word [r0+r1*1-2]
385 movzx r3d, word [r0+r1*2-2]
390 movzx r2d, word [r0+r1*1-2]
391 movzx r3d, word [r0+r1*2-2]
394 movzx r3d, word [r0+r1*1-2]
396 movzx r3d, word [r0+r1*2-2]
403 punpckldq m0, m2 ; s0, s1, s2, s3
404 %2 m3, m0, 11110110b ; s2, s1, s3, s3
406 %2 m0, m0, 01110100b ; s0, s1, s3, s1
410 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
413 pshufd m3, m0, 11111010b
434 PRED8x8_DC mmxext, pshufw
436 PRED8x8_DC sse2 , pshuflw
438 ;-----------------------------------------------------------------------------
439 ; void pred8x8_top_dc(pixel *src, int stride)
440 ;-----------------------------------------------------------------------------
441 %macro PRED8x8_TOP_DC 2
442 cglobal pred8x8_top_dc_10_%1, 2,4
471 PRED8x8_TOP_DC mmxext, pshufw
473 PRED8x8_TOP_DC sse2 , pshuflw
475 ;-----------------------------------------------------------------------------
476 ; void pred8x8_plane(pixel *src, int stride)
477 ;-----------------------------------------------------------------------------
479 cglobal pred8x8_plane_10_sse2, 2,7,7
484 pmaddwd m2, [pw_m32101234]
492 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
493 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
494 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
496 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
497 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
500 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
501 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
505 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
506 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
515 mova m3, [pw_pixel_max]
520 pmullw m2, [pw_m32101234] ; b
521 pmullw m5, m4, [pw_m3] ; c
538 ;-----------------------------------------------------------------------------
539 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
540 ;-----------------------------------------------------------------------------
541 %macro PRED8x8L_128_DC 1
542 cglobal pred8x8l_128_dc_10_%1, 4,4
558 PRED8x8L_128_DC mmxext
562 ;-----------------------------------------------------------------------------
563 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
564 ;-----------------------------------------------------------------------------
565 %macro PRED8x8L_TOP_DC 1
566 cglobal pred8x8l_top_dc_10_%1, 4,4,6
574 PALIGNR m2, m0, 14, m0
575 PALIGNR m1, m4, 2, m4
576 test r1, r1 ; top_left
578 test r2, r2 ; top_right
587 test r2, r2 ; top_right
598 PRED4x4_LOWPASS m0, m2, m1, m3
615 %define PALIGNR PALIGNR_MMX
617 %define PALIGNR PALIGNR_SSSE3
618 PRED8x8L_TOP_DC ssse3
620 ;-----------------------------------------------------------------------------
621 ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
622 ;-----------------------------------------------------------------------------
623 ;TODO: see if scalar is faster
625 cglobal pred8x8l_dc_10_%1, 4,5,8
628 mova m0, [r0+r3*1-16]
629 punpckhwd m0, [r0+r3*0-16]
630 mova m1, [r4+r3*1-16]
631 punpckhwd m1, [r0+r3*2-16]
635 mova m2, [r0+r3*1-16]
636 punpckhwd m2, [r0+r3*0-16]
638 mova m3, [r0+r3*1-16]
639 punpckhwd m3, [r0+r3*0-16]
643 mova m0, [r0+r3*0-16]
648 PALIGNR m4, m0, 14, m0
649 PALIGNR m1, m2, 2, m2
676 PRED4x4_LOWPASS m2, m1, m4, m3
679 PRED4x4_LOWPASS m1, m3, m0, m4
681 PALIGNR m7, m1, 14, m3
687 PALIGNR m2, m0, 14, m0
688 PALIGNR m1, m4, 2, m4
695 PRED4x4_LOWPASS m6, m2, m1, m3
715 %define PALIGNR PALIGNR_MMX
717 %define PALIGNR PALIGNR_SSSE3
720 ;-----------------------------------------------------------------------------
721 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
722 ;-----------------------------------------------------------------------------
723 %macro PRED8x8L_VERTICAL 1
724 cglobal pred8x8l_vertical_10_%1, 4,4,6
731 PALIGNR m2, m0, 14, m0
732 PALIGNR m1, m4, 2, m4
733 test r1, r1 ; top_left
735 test r2, r2 ; top_right
744 test r2, r2 ; top_right
755 PRED4x4_LOWPASS m0, m2, m1, m3
; Emit two variants of PRED8x8L_VERTICAL. The macro body invokes PALIGNR, so
; PALIGNR is rebound before each expansion: to PALIGNR_MMX for the sse2
; variant and to PALIGNR_SSSE3 for the ssse3 variant.
768 %define PALIGNR PALIGNR_MMX
769 PRED8x8L_VERTICAL sse2
770 %define PALIGNR PALIGNR_SSSE3
771 PRED8x8L_VERTICAL ssse3
773 ;-----------------------------------------------------------------------------
774 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
775 ;-----------------------------------------------------------------------------
776 %macro PRED8x8L_HORIZONTAL 1
777 cglobal pred8x8l_horizontal_10_%1, 4,4,8
780 mova m0, [r0+r3*1-16]
784 punpckhwd m0, [r1+r3*0-16]
785 mova m1, [r2+r3*1-16]
786 punpckhwd m1, [r0+r3*2-16]
790 mova m2, [r0+r3*1-16]
791 punpckhwd m2, [r0+r3*0-16]
793 mova m3, [r0+r3*1-16]
794 punpckhwd m3, [r0+r3*0-16]
798 mova m0, [r0+r3*0-16]
799 mova m1, [r1+r3*0-16]
803 PALIGNR m4, m0, 14, m0
804 PALIGNR m1, m2, 2, m2
806 PRED4x4_LOWPASS m2, m1, m4, m3
809 PRED4x4_LOWPASS m1, m3, m0, m4
811 PALIGNR m7, m1, 14, m3
; Emit two variants of PRED8x8L_HORIZONTAL. The macro body invokes PALIGNR,
; so PALIGNR is rebound before each expansion: to PALIGNR_MMX for the sse2
; variant and to PALIGNR_SSSE3 for the ssse3 variant.
836 %define PALIGNR PALIGNR_MMX
837 PRED8x8L_HORIZONTAL sse2
838 %define PALIGNR PALIGNR_SSSE3
839 PRED8x8L_HORIZONTAL ssse3
841 ;-----------------------------------------------------------------------------
842 ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
843 ;-----------------------------------------------------------------------------
844 %macro PRED8x8L_DOWN_LEFT 1
845 cglobal pred8x8l_down_left_10_%1, 4,4,8
852 PALIGNR m2, m0, 14, m0
853 PALIGNR m1, m4, 2, m4
879 PRED4x4_LOWPASS m4, m2, m1, m3
888 PALIGNR m2, m3, 14, m3
889 PALIGNR m5, m4, 2, m4
890 PRED4x4_LOWPASS m1, m2, m5, m0
898 PALIGNR m2, m7, 2, m0
900 PALIGNR m3, m7, 14, m0
901 PALIGNR m4, m6, 2, m0
906 PRED4x4_LOWPASS m0, m1, m2, m5
907 PRED4x4_LOWPASS m1, m3, m4, m7
; Emit two variants of PRED8x8L_DOWN_LEFT. The macro body invokes PALIGNR,
; so PALIGNR is rebound before each expansion: to PALIGNR_MMX for the sse2
; variant and to PALIGNR_SSSE3 for the ssse3 variant.
953 %define PALIGNR PALIGNR_MMX
954 PRED8x8L_DOWN_LEFT sse2
955 %define PALIGNR PALIGNR_SSSE3
956 PRED8x8L_DOWN_LEFT ssse3
958 ;-----------------------------------------------------------------------------
959 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
960 ;-----------------------------------------------------------------------------
961 %macro PRED8x8L_DOWN_RIGHT 1
962 cglobal pred8x8l_down_right_10_%1, 4,5,8
965 mova m0, [r0+r3*1-16]
966 punpckhwd m0, [r0+r3*0-16]
967 mova m1, [r4+r3*1-16]
968 punpckhwd m1, [r0+r3*2-16]
972 mova m2, [r0+r3*1-16]
973 punpckhwd m2, [r0+r3*0-16]
975 mova m3, [r0+r3*1-16]
976 punpckhwd m3, [r0+r3*0-16]
980 mova m0, [r0+r3*0-16]
985 PALIGNR m4, m0, 14, m0
986 PALIGNR m1, m2, 2, m2
987 test r1, r1 ; top_left
991 PRED4x4_LOWPASS m2, m1, m4, m3
995 PRED4x4_LOWPASS m1, m3, m0, m4
997 PALIGNR m7, m1, 14, m3
1003 PALIGNR m2, m0, 14, m0
1004 PALIGNR m1, m4, 2, m4
1005 test r1, r1 ; top_left
1007 test r2, r2 ; top_right
1010 PRED4x4_LOWPASS m4, m2, m1, m3
1026 test r2, r2 ; top_right
1042 PALIGNR m2, m6, 2, m0
1044 PALIGNR m3, m6, 14, m0
1047 PRED4x4_LOWPASS m0, m1, m2, m5
1048 PRED4x4_LOWPASS m1, m3, m4, m7
; Emit two variants of PRED8x8L_DOWN_RIGHT. The macro body invokes PALIGNR,
; so PALIGNR is rebound before each expansion: to PALIGNR_MMX for the sse2
; variant and to PALIGNR_SSSE3 for the ssse3 variant.
1094 %define PALIGNR PALIGNR_MMX
1095 PRED8x8L_DOWN_RIGHT sse2
1096 %define PALIGNR PALIGNR_SSSE3
1097 PRED8x8L_DOWN_RIGHT ssse3
1099 ;-----------------------------------------------------------------------------
1100 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
1101 ;-----------------------------------------------------------------------------
1102 %macro PRED8x8L_VERTICAL_RIGHT 1
1103 cglobal pred8x8l_vertical_right_10_%1, 4,5,8
1106 mova m0, [r0+r3*1-16]
1107 punpckhwd m0, [r0+r3*0-16]
1108 mova m1, [r4+r3*1-16]
1109 punpckhwd m1, [r0+r3*2-16]
1113 mova m2, [r0+r3*1-16]
1114 punpckhwd m2, [r0+r3*0-16]
1116 mova m3, [r0+r3*1-16]
1117 punpckhwd m3, [r0+r3*0-16]
1121 mova m0, [r0+r3*0-16]
1126 PALIGNR m4, m0, 14, m0
1127 PALIGNR m1, m2, 2, m2
1155 PRED4x4_LOWPASS m2, m1, m4, m3
1162 PALIGNR m2, m0, 14, m0
1163 PALIGNR m1, m4, 2, m4
1169 PRED4x4_LOWPASS m6, m2, m1, m3
1173 PALIGNR m3, m7, 14, m0
1174 PALIGNR m6, m7, 12, m1
1178 PRED4x4_LOWPASS m0, m6, m2, m4
1188 PRED4x4_LOWPASS m0, m1, m3, m2
1189 PALIGNR m6, m0, 14, m2
1192 PALIGNR m5, m0, 14, m1
1195 PALIGNR m6, m0, 14, m2
1198 PALIGNR m5, m0, 14, m1
1201 PALIGNR m6, m0, 14, m2
1204 PALIGNR m5, m0, 14, m1
; Emit two variants of PRED8x8L_VERTICAL_RIGHT. The macro body invokes
; PALIGNR, so PALIGNR is rebound before each expansion: to PALIGNR_MMX for
; the sse2 variant and to PALIGNR_SSSE3 for the ssse3 variant.
1210 %define PALIGNR PALIGNR_MMX
1211 PRED8x8L_VERTICAL_RIGHT sse2
1212 %define PALIGNR PALIGNR_SSSE3
1213 PRED8x8L_VERTICAL_RIGHT ssse3
1215 ;-----------------------------------------------------------------------------
1216 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
1217 ;-----------------------------------------------------------------------------
1218 %macro PRED8x8L_HORIZONTAL_UP 1
1219 cglobal pred8x8l_horizontal_up_10_%1, 4,4,8
1222 mova m0, [r0+r3*1-16]
1226 punpckhwd m0, [r1+r3*0-16]
1227 mova m1, [r2+r3*1-16]
1228 punpckhwd m1, [r0+r3*2-16]
1232 mova m2, [r0+r3*1-16]
1233 punpckhwd m2, [r0+r3*0-16]
1235 mova m3, [r0+r3*1-16]
1236 punpckhwd m3, [r0+r3*0-16]
1240 mova m0, [r0+r3*0-16]
1241 mova m1, [r1+r3*0-16]
1245 PALIGNR m4, m0, 14, m0
1246 PALIGNR m1, m2, 2, m2
1248 PRED4x4_LOWPASS m2, m1, m4, m3
1251 PRED4x4_LOWPASS m1, m3, m0, m4
1253 PALIGNR m7, m1, 14, m3
1255 pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1256 pslldq m7, 14 ; l7 .. .. .. .. .. .. ..
1260 por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
1267 por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1
1269 por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2
1271 PRED4x4_LOWPASS m1, m3, m5, m2
1273 punpcklwd m4, m1 ; p4 p3 p2 p1
1274 punpckhwd m5, m1 ; p8 p7 p6 p5
1278 PALIGNR m5, m4, 4, m1
1279 pshufd m1, m6, 11111001b
1280 PALIGNR m6, m4, 8, m2
1281 pshufd m2, m7, 11111110b
1282 PALIGNR m7, m4, 12, m3
1283 pshufd m3, m0, 11111111b
; Emit two variants of PRED8x8L_HORIZONTAL_UP. The macro body invokes
; PALIGNR, so PALIGNR is rebound before each expansion: to PALIGNR_MMX for
; the sse2 variant and to PALIGNR_SSSE3 for the ssse3 variant.
1296 %define PALIGNR PALIGNR_MMX
1297 PRED8x8L_HORIZONTAL_UP sse2
1298 %define PALIGNR PALIGNR_SSSE3
1299 PRED8x8L_HORIZONTAL_UP ssse3
1303 ;-----------------------------------------------------------------------------
1304 ; void pred16x16_vertical(pixel *src, int stride)
1305 ;-----------------------------------------------------------------------------
1308 mova [%1+mmsize], %3
1315 %macro PRED16x16_VERTICAL 1
1316 cglobal pred16x16_vertical_10_%1, 2,3
1320 mova m1, [r0+mmsize]
1326 MOV16 r0+r1*1, m0, m1, m2, m3
1327 MOV16 r0+r1*2, m0, m1, m2, m3
1335 PRED16x16_VERTICAL mmxext
1337 PRED16x16_VERTICAL sse2
1339 ;-----------------------------------------------------------------------------
1340 ; void pred16x16_horizontal(pixel *src, int stride)
1341 ;-----------------------------------------------------------------------------
1342 %macro PRED16x16_HORIZONTAL 1
1343 cglobal pred16x16_horizontal_10_%1, 2,3
1346 movd m0, [r0+r1*0-4]
1347 movd m1, [r0+r1*1-4]
1350 MOV16 r0+r1*0, m0, m0, m0, m0
1351 MOV16 r0+r1*1, m1, m1, m1, m1
1359 PRED16x16_HORIZONTAL mmxext
1361 PRED16x16_HORIZONTAL sse2
1363 ;-----------------------------------------------------------------------------
1364 ; void pred16x16_dc(pixel *src, int stride)
1365 ;-----------------------------------------------------------------------------
1366 %macro PRED16x16_DC 1
1367 cglobal pred16x16_dc_10_%1, 2,7
1371 paddw m0, [r0+mmsize]
1379 movzx r3d, word [r0+r1*1]
1380 movzx r5d, word [r0+r1*2]
1383 movzx r2d, word [r0+r1*1]
1385 movzx r2d, word [r0+r1*2]
1396 MOV16 r4+r1*0, m0, m0, m0, m0
1397 MOV16 r4+r1*1, m0, m0, m0, m0
1409 ;-----------------------------------------------------------------------------
1410 ; void pred16x16_top_dc(pixel *src, int stride)
1411 ;-----------------------------------------------------------------------------
1412 %macro PRED16x16_TOP_DC 1
1413 cglobal pred16x16_top_dc_10_%1, 2,3
1416 paddw m0, [r0+mmsize]
1428 MOV16 r0+r1*1, m0, m0, m0, m0
1429 MOV16 r0+r1*2, m0, m0, m0, m0
1437 PRED16x16_TOP_DC mmxext
1439 PRED16x16_TOP_DC sse2
1441 ;-----------------------------------------------------------------------------
1442 ; void pred16x16_left_dc(pixel *src, int stride)
1443 ;-----------------------------------------------------------------------------
1444 %macro PRED16x16_LEFT_DC 1
1445 cglobal pred16x16_left_dc_10_%1, 2,7
1449 movzx r5d, word [r0+r1*0]
1450 movzx r6d, word [r0+r1*1]
1453 movzx r2d, word [r0+r1*0]
1454 movzx r3d, word [r0+r1*1]
1465 MOV16 r4+r1*0, m0, m0, m0, m0
1466 MOV16 r4+r1*1, m0, m0, m0, m0
1474 PRED16x16_LEFT_DC mmxext
1476 PRED16x16_LEFT_DC sse2
1478 ;-----------------------------------------------------------------------------
1479 ; void pred16x16_128_dc(pixel *src, int stride)
1480 ;-----------------------------------------------------------------------------
1481 %macro PRED16x16_128_DC 1
1482 cglobal pred16x16_128_dc_10_%1, 2,3
1486 MOV16 r0+r1*0, m0, m0, m0, m0
1487 MOV16 r0+r1*1, m0, m0, m0, m0
1495 PRED16x16_128_DC mmxext
1497 PRED16x16_128_DC sse2