;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1:      times 8 dw 1
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072
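
; the conversion macros below pick the right rounding/offset constant from
; these tables by token-pasting a computed value into the label, e.g.
; [pd_ %+ %%rnd] expands to [pd_8192] when %%sh is 14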
; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])
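;
; As a rough scalar sketch (mirroring the C template), with
; sh = 14 + in_depth - out_depth, rnd = 1 << (sh - 1), and chroma offsets of
; 128 scaled to the respective depth, each pixel becomes:
;   y' = yoff_out  + (((y - yoff_in) * cyy + (u - uvoff_in) * cyu
;                                          + (v - uvoff_in) * cyv + rnd) >> sh)
;   u' = uvoff_out + (((u - uvoff_in) * cuu + (v - uvoff_in) * cuv + rnd) >> sh)
;   v' = uvoff_out + (((u - uvoff_in) * cvu + (v - uvoff_in) * cvv + rnd) >> sh)
; all clipped to [0, (1 << out_depth) - 1]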
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))

%assign %%maxval ((1 << %2) - 1)

%assign %%ypsh %%sh - 1

%assign %%yoffsh %%ypsh - 13

%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)
cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                     yo, yos, yi, yis, w, h, c, yoff
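    ; stack layout: three interleaved coefficient vectors are cached at
    ; rsp+0..2*mmsize, and w/h are spilled to rsp+3*mmsize since nearly
    ; every GPR is occupied by plane pointers and strides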
    mov       [rsp+3*mmsize+0], wd
    mov       [rsp+3*mmsize+4], hd

    mova      m12, [pd_ %+ %%uvoutoff]

    paddd     m12, [pd_ %+ %%rnd]
    mova      m13, [pw_ %+ %%uvinoff]
    mova      m14, [yoffq+ 0]              ; y_off_in
    mova      m15, [yoffq+16]              ; y_off_out

    paddw     m15, [pw_ %+ %%yprnd]

    mova      m15, [pw_ %+ %%ypmul]
    movh      m0, [cq+1*16]                ; cyu
    movh      m1, [cq+2*16]                ; cyv
    movh      m2, [cq+4*16]                ; cuu
    movh      m3, [cq+5*16]                ; cuv
    movh      m4, [cq+7*16]                ; cvu
    movh      m5, [cq+8*16]                ; cvv

    mova      [rsp+0*mmsize], m0
    mova      [rsp+1*mmsize], m2
    mova      [rsp+2*mmsize], m4
    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
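    ; the yuv_in/yuv_out arguments and their stride arguments are arrays of
    ; three plane pointers/strides; dereference them into separate registers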
    mov       uiq, [yiq+gprsize*1]
    mov       viq, [yiq+gprsize*2]
    mov       yiq, [yiq+gprsize*0]
    mov       uoq, [yoq+gprsize*1]
    mov       voq, [yoq+gprsize*2]
    mov       yoq, [yoq+gprsize*0]
    mov       uisq, [yisq+gprsize*1]
    mov       visq, [yisq+gprsize*2]
    mov       yisq, [yisq+gprsize*0]
    mov       uosq, [yosq+gprsize*1]
    mov       vosq, [yosq+gprsize*2]
    mov       yosq, [yosq+gprsize*0]
    movu      m0, [yiq+xq*(1<<%3)]         ; y00/01

    movu      m2, [tmpq+xq*2]              ; y10/11

    movh      m4, [uiq+xq]                 ; u
    movh      m5, [viq+xq]                 ; v

    movu      m4, [uiq+xq]                 ; u
    movu      m5, [viq+xq]                 ; v

    punpckhbw m1, m0, m11

    punpckhbw m3, m2, m11

    punpckhbw m2, m4, m11
    punpckhbw m3, m5, m11

    movu      m0, [yiq+xq*(2<<%3)]         ; y00/01
    movu      m1, [yiq+xq*(2<<%3)+mmsize]  ; y00/01

    movu      m2, [tmpq+xq*4]              ; y10/11
    movu      m3, [tmpq+xq*4+mmsize]       ; y10/11

    movu      m4, [uiq+xq*2]               ; u
    movu      m5, [viq+xq*2]               ; v

    movu      m2, [uiq+xq*2+mmsize]
    movu      m3, [viq+xq*2+mmsize]

    SBUTTERFLY wd, 4, 5, 6
    pmaddwd   m6, m4, [rsp+1*mmsize]
    pmaddwd   m7, m5, [rsp+1*mmsize]

    SBUTTERFLY wd, 2, 3, 8
    pmaddwd   m8, m2, [rsp+1*mmsize]
    pmaddwd   m9, m3, [rsp+1*mmsize]

    pmaddwd   m8, m4, [rsp+2*mmsize]
    pmaddwd   m9, m5, [rsp+2*mmsize]

    CLIPW     m6, m11, [pw_ %+ %%maxval]
    CLIPW     m8, m11, [pw_ %+ %%maxval]

    movu      [uoq+xq*2+mmsize], m8

    pmaddwd   m6, m4, [rsp+2*mmsize]
    pmaddwd   m7, m5, [rsp+2*mmsize]
    pmaddwd   m8, m2, [rsp+2*mmsize]
    pmaddwd   m9, m3, [rsp+2*mmsize]

    CLIPW     m6, m11, [pw_ %+ %%maxval]
    CLIPW     m8, m11, [pw_ %+ %%maxval]

    movu      [voq+xq*2+mmsize], m8

    pmaddwd   m4, [rsp+0*mmsize]
    pmaddwd   m5, [rsp+0*mmsize]           ; uv_val

    pmaddwd   m2, [rsp+0*mmsize]
    pmaddwd   m3, [rsp+0*mmsize]

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!

    punpckhwd m6, m2, m15

    punpckhwd m7, m3, m15

    CLIPW     m2, m11, [pw_ %+ %%maxval]
    CLIPW     m3, m11, [pw_ %+ %%maxval]

    movu      [tmpq+xq*4+mmsize], m3

    punpckhwd m6, m0, m15

    punpckhwd m7, m1, m15

    movu      [yoq+xq*(1<<%3)], m0

    CLIPW     m0, m11, [pw_ %+ %%maxval]
    CLIPW     m1, m11, [pw_ %+ %%maxval]
    movu      [yoq+xq*(2<<%3)], m0
    movu      [yoq+xq*(2<<%3)+mmsize], m1

    cmp       xd, dword [rsp+3*mmsize+0]

    lea       yiq, [yiq+yisq*2]
    lea       yoq, [yoq+yosq*2]

    dec       dword [rsp+3*mmsize+4]
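
; instantiate the converter for all nine in/out bitdepth pairings at one
; chroma subsampling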
%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
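;
; As a rough scalar sketch (mirroring the C template), with sh = depth - 1,
; rnd = 1 << (sh - 1) and uvoff = 1 << (depth - 1), the output planes hold
; signed 16-bit intermediates:
;   r = clip_int16((cy * (y - yoff) + crv * (v - uvoff)                     + rnd) >> sh)
;   g = clip_int16((cy * (y - yoff) + cgu * (u - uvoff) + cgv * (v - uvoff) + rnd) >> sh)
;   b = clip_int16((cy * (y - yoff) + cbu * (u - uvoff)                     + rnd) >> sh)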
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
                                 rgb, rgbs, yuv, yuvs, ww, h, c, yoff
    mova      m15, [yoffq]                 ; yoff
    movh      m14, [cq+  0]                ; cy
    movh      m10, [cq+ 32]                ; crv
    movh      m13, [cq+112]                ; cbu
    movh      m12, [cq+ 64]                ; cgu
    movh      m9,  [cq+ 80]                ; cgv
    punpcklwd m14, [pw_ %+ %%rnd]          ; cy, rnd
    punpcklwd m13, m11                     ; cbu, 0
    punpcklwd m11, m10                     ; 0, crv
    punpcklwd m12, m9                      ; cgu, cgv
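    ; each register now holds interleaved word pairs, so a single pmaddwd
    ; per dword lane later computes e.g. cgu*u + cgv*v, or y*cy + 1*rnd once
    ; the y pixels have been interleaved with pw_1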
    mova      [rsp+0*mmsize], m11
    mova      [rsp+1*mmsize], m12
    mova      [rsp+2*mmsize], m13
    mova      [rsp+3*mmsize], m14

    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp

    mov       gq, [rq+1*gprsize]
    mov       bq, [rq+2*gprsize]
    mov       rq, [rq+0*gprsize]
    mov       uq, [yq+1*gprsize]
    mov       vq, [yq+2*gprsize]
    mov       yq, [yq+0*gprsize]
    mov       usq, [ysq+1*gprsize]
    mov       vsq, [ysq+2*gprsize]
    mov       ysq, [ysq+0*gprsize]
    movu      m0, [yq+xq*(1<<%2)]

    punpckhbw m1, m0, m14

    punpckhbw m3, m2, m14

    punpckhbw m2, m4, m14
    punpckhbw m3, m5, m14

    movu      m0, [yq+xq*(2<<%2)]
    movu      m1, [yq+xq*(2<<%2)+mmsize]

    movu      m3, [tmpq+xq*4+mmsize]

    movu      m2, [uq+xq*2+mmsize]
    movu      m3, [vq+xq*2+mmsize]

    psubw     m4, [pw_ %+ %%uvoff]
    psubw     m5, [pw_ %+ %%uvoff]
    SBUTTERFLY wd, 4, 5, 6

    psubw     m2, [pw_ %+ %%uvoff]
    psubw     m3, [pw_ %+ %%uvoff]
    SBUTTERFLY wd, 2, 3, 6

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd m6, m0, [pw_1]               ; y, 1
    punpcklwd m0, [pw_1]                   ; y, 1
    punpckhwd m7, m1, [pw_1]               ; y, 1
    punpcklwd m1, [pw_1]                   ; y, 1
    pmaddwd   m0, [rsp+3*mmsize]
    pmaddwd   m6, [rsp+3*mmsize]
    pmaddwd   m1, [rsp+3*mmsize]
    pmaddwd   m7, [rsp+3*mmsize]

    punpckhwd m8, m2, [pw_1]               ; y, 1
    punpcklwd m2, [pw_1]                   ; y, 1
    punpckhwd m9, m3, [pw_1]               ; y, 1
    punpcklwd m3, [pw_1]                   ; y, 1
    pmaddwd   m2, [rsp+3*mmsize]
    pmaddwd   m8, [rsp+3*mmsize]
    pmaddwd   m3, [rsp+3*mmsize]
    pmaddwd   m9, [rsp+3*mmsize]
    mova      [rsp+4*mmsize], m2
    mova      [rsp+5*mmsize], m8
    mova      [rsp+6*mmsize], m3
    mova      [rsp+7*mmsize], m9

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd   m10, m4, [rsp+0*mmsize]

    pmaddwd   m12, m5, [rsp+0*mmsize]
    punpckhdq m11, m10, m10

    punpckhdq m13, m12, m12
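    ; with horizontal chroma subsampling, each chroma-derived offset covers
    ; two luma pixels, so the dword results are duplicated
    ; (punpckldq/punpckhdq) back up to full resolution before being added to
    ; the per-pixel y terms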
    pmaddwd   m11, m5, [rsp+0*mmsize]
    pmaddwd   m12, m2, [rsp+0*mmsize]
    pmaddwd   m13, m3, [rsp+0*mmsize]

    paddd     m2, m10, [rsp+4*mmsize]
    paddd     m3, m11, [rsp+5*mmsize]
    paddd     m8, m12, [rsp+6*mmsize]
    paddd     m9, m13, [rsp+7*mmsize]

    lea       tmpq, [rq+rgbsq*2]

    mova      [tmpq+xq*4+mmsize], m8

    mova      [rq+xq*(2 << %2)], m10
    mova      [rq+xq*(2 << %2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd   m10, m4, [rsp+1*mmsize]

    pmaddwd   m12, m5, [rsp+1*mmsize]
    punpckhdq m11, m10, m10

    punpckhdq m13, m12, m12

    pmaddwd   m11, m5, [rsp+1*mmsize]
    pmaddwd   m12, m2, [rsp+1*mmsize]
    pmaddwd   m13, m3, [rsp+1*mmsize]

    paddd     m2, m10, [rsp+4*mmsize]
    paddd     m3, m11, [rsp+5*mmsize]
    paddd     m8, m12, [rsp+6*mmsize]
    paddd     m9, m13, [rsp+7*mmsize]

    lea       tmpq, [gq+rgbsq*2]

    mova      [tmpq+xq*4+mmsize], m8

    mova      [gq+xq*(2 << %2)], m10
    mova      [gq+xq*(2 << %2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd   m4, [rsp+2*mmsize]
    pmaddwd   m5, [rsp+2*mmsize]

    pmaddwd   m2, [rsp+2*mmsize]
    pmaddwd   m3, [rsp+2*mmsize]

    paddd     m4, [rsp+4*mmsize]
    paddd     m2, [rsp+5*mmsize]
    paddd     m5, [rsp+6*mmsize]
    paddd     m3, [rsp+7*mmsize]

    movu      [bq+xq*(2 << %2)], m0
    movu      [bq+xq*(2 << %2)+mmsize], m1

    lea       tmpq, [bq+rgbsq*2]

    movu      [tmpq+xq*4+mmsize], m5

    lea       rq, [rq+rgbsq*(2 << %3)]
    lea       gq, [gq+rgbsq*(2 << %3)]
    lea       bq, [bq+rgbsq*(2 << %3)]

YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (29 - %1) ; signed 15-bit rgb in, 14-bit coeffs, depth-bit yuv out
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))

%assign %%maxval ((1 << %1) - 1)
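
; %%rnd and %%uvrnd are the 32-bit rounding/offset terms pre-shifted right
; by 14: they are added as words and later multiplied by a pw_16384 lane
; inside pmaddwd, which restores the 1 << 14 scale. Roughly, per pixel:
;   y = yoff  + ((cry * r + cgy * g + cby * b + rnd) >> sh)
;   u = uvoff + ((cru * r + cgu * g + cbu * b + rnd) >> sh)
;   v = uvoff + ((crv * r + cgv * g + cbv * b + rnd) >> sh)
; with u/v computed from the subsampling-averaged r/g/b values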
cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
                                 yuv, yuvs, rgb, rgbs, ww, h, c, off

    movh      m9, [pw_ %+ %%uvrnd]

    paddw     m9, [pw_ %+ %%rnd]
    paddw     m8, [pw_ %+ %%rnd]

    mova      [rsp+0*mmsize], m0           ; cry, cgy
    mova      [rsp+1*mmsize], m2           ; cby, off + rnd
    mova      [rsp+2*mmsize], m3           ; cru, cgu
    mova      [rsp+3*mmsize], m4           ; cburv, uvoff + rnd
    mova      [rsp+4*mmsize], m5           ; cburv, cgv
    mova      [rsp+5*mmsize], m7           ; cbv, uvoff + rnd

    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov       gq, [rq+gprsize*1]
    mov       bq, [rq+gprsize*2]
    mov       rq, [rq+gprsize*0]
    mov       uq, [yq+gprsize*1]
    mov       vq, [yq+gprsize*2]
    mov       yq, [yq+gprsize*0]
    mov       usq, [ysq+gprsize*1]
    mov       vsq, [ysq+gprsize*2]
    mov       ysq, [ysq+gprsize*0]
    mova      m0, [rq+xq*(2<<%2)]
    mova      m3, [rq+xq*(2<<%2)+mmsize]
    mova      m1, [gq+xq*(2<<%2)]
    mova      m4, [gq+xq*(2<<%2)+mmsize]
    mova      m2, [bq+xq*(2<<%2)]
    mova      m5, [bq+xq*(2<<%2)+mmsize]

    punpcklwd m10, m2, [pw_16384]
    punpckhwd m11, m2, [pw_16384]
    punpcklwd m12, m5, [pw_16384]
    punpckhwd m13, m5, [pw_16384]

    pmaddwd   m6, [rsp+0*mmsize]
    pmaddwd   m7, [rsp+0*mmsize]
    pmaddwd   m8, [rsp+0*mmsize]
    pmaddwd   m9, [rsp+0*mmsize]
    pmaddwd   m10, [rsp+1*mmsize]
    pmaddwd   m11, [rsp+1*mmsize]
    pmaddwd   m12, [rsp+1*mmsize]
    pmaddwd   m13, [rsp+1*mmsize]

    movu      [yq+xq*(1<<%2)], m6

    CLIPW     m6, m15, [pw_ %+ %%maxval]
    CLIPW     m8, m15, [pw_ %+ %%maxval]
    movu      [yq+xq*(2<<%2)], m6
    movu      [yq+xq*(2<<%2)+mmsize], m8

    ; subsampling cached data

    ; bottom line y, r/g portion only
    lea       tmpq, [rgbsq+xq*2]

    mova      m9, [rq+tmpq*2+mmsize]

    mova      m10, [gq+tmpq*2+mmsize]

    mova      m11, [bq+tmpq*2+mmsize]

    punpcklwd m12, m6, m7
    punpckhwd m13, m6, m7
    punpcklwd m14, m9, m10
    punpckhwd m15, m9, m10

    ; release two more registers

    ; bottom line y, b/rnd portion only
    punpcklwd m6, m8, [pw_16384]
    punpckhwd m7, m8, [pw_16384]
    punpcklwd m9, m11, [pw_16384]
    punpckhwd m10, m11, [pw_16384]

    pmaddwd   m12, [rsp+0*mmsize]
    pmaddwd   m13, [rsp+0*mmsize]
    pmaddwd   m14, [rsp+0*mmsize]
    pmaddwd   m15, [rsp+0*mmsize]
    pmaddwd   m6, [rsp+1*mmsize]
    pmaddwd   m7, [rsp+1*mmsize]
    pmaddwd   m9, [rsp+1*mmsize]
    pmaddwd   m10, [rsp+1*mmsize]

    movu      [tmpq+xq*2], m12

    CLIPW     m12, m15, [pw_ %+ %%maxval]
    CLIPW     m14, m15, [pw_ %+ %%maxval]
    movu      [tmpq+xq*4], m12
    movu      [tmpq+xq*4+mmsize], m14
    ; complete subsampling of r/g/b pixels for u/v

    SBUTTERFLY wd, 0, 1, 6
    punpckhwd m6, m2, [pw_16384]
    punpcklwd m2, [pw_16384]

    pmaddwd   m7, m0, [rsp+2*mmsize]
    pmaddwd   m8, m1, [rsp+2*mmsize]
    pmaddwd   m9, m2, [rsp+3*mmsize]
    pmaddwd   m10, m6, [rsp+3*mmsize]
    pmaddwd   m0, [rsp+4*mmsize]
    pmaddwd   m1, [rsp+4*mmsize]
    pmaddwd   m2, [rsp+5*mmsize]
    pmaddwd   m6, [rsp+5*mmsize]

    CLIPW     m7, m15, [pw_ %+ %%maxval]
    CLIPW     m0, m15, [pw_ %+ %%maxval]

    ; second set of u/v pixels
    SBUTTERFLY wd, 3, 4, 6
    punpckhwd m6, m5, [pw_16384]
    punpcklwd m5, [pw_16384]

    pmaddwd   m8, m3, [rsp+2*mmsize]
    pmaddwd   m9, m4, [rsp+2*mmsize]
    pmaddwd   m10, m5, [rsp+3*mmsize]
    pmaddwd   m11, m6, [rsp+3*mmsize]
    pmaddwd   m3, [rsp+4*mmsize]
    pmaddwd   m4, [rsp+4*mmsize]
    pmaddwd   m5, [rsp+5*mmsize]
    pmaddwd   m6, [rsp+5*mmsize]

    CLIPW     m7, m15, [pw_ %+ %%maxval]
    CLIPW     m0, m15, [pw_ %+ %%maxval]
    CLIPW     m8, m15, [pw_ %+ %%maxval]
    CLIPW     m3, m15, [pw_ %+ %%maxval]

    movu      [uq+xq*2+mmsize], m8

    movu      [vq+xq*2+mmsize], m3

    lea       rq, [rq+rgbsq*(2<<%3)]
    lea       gq, [gq+rgbsq*(2<<%3)]
    lea       bq, [bq+rgbsq*(2<<%3)]

%macro RGB2YUV_FNS 2 ; ss_w, ss_h
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
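;
; Rough scalar equivalent of the in-place transform (coefficients are
; 14-bit fixed-point; pairing with pw_1/pw_8192 below folds the rounding
; term into the pmaddwd):
;   out[p][x] = clip_int16((in[0][x] * coeff[p][0] + in[1][x] * coeff[p][1]
;                         + in[2][x] * coeff[p][2] + 8192) >> 14)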
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c

    punpcklwd m0, [cq+ 16]
    punpcklwd m1, [pw_8192]
    punpcklwd m2, [cq+ 64]
    punpcklwd m3, [pw_8192]
    punpcklwd m4, [cq+112]
    punpcklwd m5, [pw_8192]

    DEFINE_ARGS data0, stride, ww, h, data1, data2, x

    mov       data1q, [data0q+gprsize*1]
    mov       data2q, [data0q+gprsize*2]
    mov       data0q, [data0q+gprsize*0]

    mova      m6, [data0q+xq*2]
    mova      m7, [data1q+xq*2]
    mova      m8, [data2q+xq*2]
    SBUTTERFLY wd, 6, 7, 9
    punpckhwd m9, m8, [pw_1]
    punpcklwd m8, [pw_1]

    mova      [data0q+xq*2], m10
    mova      [data1q+xq*2], m12
    mova      [data2q+xq*2], m6