2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Per-bit-depth constants. Use sites pick them by token pasting on the bit
; depth (e.g. [pw_bi_%2], [max_pixels_%2]), so every supported depth needs a name.
; pw_1024, pw_4096, pw_255 and pw_1023 are not defined in this part of the file.
34 %define pw_bi_10 pw_1024
35 %define pw_bi_12 pw_4096
36 %define max_pixels_8 pw_255
37 %define max_pixels_10 pw_1023
38 pw_bi_8: times 16 dw (1 << 8) ; 16 words, each 256
39 max_pixels_12: times 16 dw ((1 << 12)-1) ; 16 words, each 4095 (largest 12-bit value)
; EPEL coefficient tables. From the label/data line below, the EPEL_TABLE
; arguments are: %1 = bit-depth tag used in the label, %2 = repeat count for
; each coefficient pair, %3 = data size suffix (b = bytes, w = words),
; %4 = cpu tag. Labels have the form hevc_epel_filters_CPU_BITDEPTH.
45 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
; avx2 tables use twice the repeat count of the sse4 tables.
62 EPEL_TABLE 8,16, b, avx2
63 EPEL_TABLE 10, 8, w, avx2
65 EPEL_TABLE 8, 8, b, sse4
66 EPEL_TABLE 10, 4, w, sse4
67 EPEL_TABLE 12, 4, w, sse4
; QPEL coefficient tables. From the label/data line below, the QPEL_TABLE
; arguments are: %1 = bit-depth tag used in the label, %2 = repeat count for
; each coefficient pair, %3 = data size suffix (b = bytes, w = words),
; %4 = cpu tag. Labels have the form hevc_qpel_filters_CPU_BITDEPTH.
70 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
84 QPEL_TABLE 8, 8, b, sse4
85 QPEL_TABLE 10, 4, w, sse4
86 QPEL_TABLE 12, 4, w, sse4
; avx2 tables use twice the repeat count of the sse4 tables.
88 QPEL_TABLE 8,16, b, avx2
89 QPEL_TABLE 10, 8, w, avx2
; The row loops in this file step the int16 intermediate buffers (dst of the
; plain functions, src2 of the bi functions) by 2*MAX_PB_SIZE bytes per row.
91 %define MAX_PB_SIZE 64
; Bit depth tag 14 (passed to QPEL_HV_COMPUTE for the my-filtered pass of the
; hv functions) resolves to the 10-bit, word-sized QPEL tables.
93 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
95 %define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
; SIMPLE_BILOAD width, tab, r1, r2
; Loads one row of the second prediction source from [tab] into r1 and, in the
; wider variants, the data at [tab+16] into r2.
; NOTE(review): the width conditions that select among the variants below are
; not visible in this part of the file.
99 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
101 movq %3, [%2] ; load 8 bytes from source2
103 movdqa %3, [%2] ; aligned load from source2
108 movdqa %3, [%2] ; aligned load of the first part of the row
109 movq %4, [%2+16] ; load 8 more bytes from source2 at offset 16
115 movdqa %3, [%2] ; aligned load of the first part of the row
116 movdqa %4, [%2+16] ; aligned load from source2 at offset 16
; SIMPLE_LOAD width, bitd, tab, r1
; Loads one row of source samples from [tab] into r1, picking the load size
; from the block width and the bit depth.
124 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
125 %if %1 == 2 || (%2 == 8 && %1 <= 4)
126 movd %4, [%3] ; 4 bytes: 2 samples of up to 16 bits, or up to 4 samples at 8 bits
127 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
128 movq %4, [%3] ; 8 bytes: 4 samples of up to 16 bits, or up to 8 samples at 8 bits
129 %elif notcpuflag(avx)
130 movu %4, [%3] ; unaligned full-register load
131 %elif %1 <= 8 || (%2 == 8 && %1 <= 16)
; EPEL_FILTER bitdepth, filter index, xmma, xmmb, gprtmp
; Loads the two coefficient vectors of the selected EPEL filter into xmma and
; xmmb. The filter index register is scaled in place, so its original value is
; lost. In the variants that use lea, gprtmp receives the table address.
139 %macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
143 lea %5q, [hevc_epel_filters_avx2_%1] ; gprtmp = address of the avx2 table
146 %define FILTER hevc_epel_filters_avx2_%1
151 lea %5q, [hevc_epel_filters_sse4_%1] ; gprtmp = address of the sse4 table
154 %define FILTER hevc_epel_filters_sse4_%1
156 %endif ;cpuflag(avx2)
; Turn the filter index into a byte offset into the table.
159 shl %2q, 6 ; multiply by 64
161 shl %2q, 5 ; multiply by 32
163 mova %3, [FILTER + %2q] ; get 2 first values of filters
164 mova %4, [FILTER + %2q+%%offset] ; get 2 last values of filters
; EPEL_HV_FILTER bitdepth
; Loads both EPEL filters used by the hv functions:
;   m14/m15 - coefficient vectors from the bitdepth table, indexed by mx
;   m12/m13 - coefficient vectors from the 10-bit table, indexed by my
; mxq and myq are scaled in place. r3srcq may be used as a temporary table base
; and is finally set to 3*srcstride.
167 %macro EPEL_HV_FILTER 1
171 %define %%table hevc_epel_filters_avx2_%1
175 %define %%table hevc_epel_filters_sse4_%1
179 lea r3srcq, [%%table] ; r3srcq temporarily holds the table address
180 %define FILTER r3srcq
182 %define FILTER %%table
186 shl mxq, %%shift ; mx *= 1 << %%shift (byte offset into the table)
187 shl myq, %%shift ; my *= 1 << %%shift (byte offset into the table)
188 mova m14, [FILTER + mxq] ; get 2 first values of filters
189 mova m15, [FILTER + mxq+%%offset] ; get 2 last values of filters
; The my filter always comes from the 10-bit (word) table.
192 %define %%table hevc_epel_filters_avx2_10
194 %define %%table hevc_epel_filters_sse4_10
197 lea r3srcq, [%%table] ; r3srcq temporarily holds the table address
198 %define FILTER r3srcq
200 %define FILTER %%table
202 mova m12, [FILTER + myq] ; get 2 first values of filters
203 mova m13, [FILTER + myq+%%offset] ; get 2 last values of filters
204 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
; Loads the four coefficient vectors of the selected QPEL filter into m12..m15.
; %2 names the filter-index register; it is scaled in place into a byte offset.
; QPEL_COMPUTE multiplies m0/m2/m4/m6 by m12/m13/m14/m15 respectively.
212 %define %%table hevc_qpel_filters_avx2_%1
216 %define %%table hevc_qpel_filters_sse4_%1
220 lea rfilterq, [%%table] ; rfilter = address of the table
222 %define rfilterq %%table
225 shl %2q, %%shift ; index *= 1 << %%shift (byte offset into the table)
226 mova m12, [rfilterq + %2q] ; 1st coefficient vector
227 mova m13, [rfilterq + %2q + %%offset] ; 2nd coefficient vector
228 mova m14, [rfilterq + %2q + 2*%%offset] ; 3rd coefficient vector
229 mova m15, [rfilterq + %2q + 3*%%offset] ; 4th coefficient vector
; Source loads for the EPEL filter into m0..m3. The load instruction is chosen
; from the bit depth (%1) and the block width (%4).
; NOTE(review): only some arms of the conditionals are visible here.
233 %if (%1 == 8 && %4 <= 4)
235 %elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
238 %define %%load movdqu
; Variant where %3 is used as an immediate step.
244 %%load m2, [%2q+2*%3]
245 %%load m3, [%2q+3*%3]
; Variant where %3 names a stride register.
247 %%load m1, [%2q+ %3q]
248 %%load m2, [%2q+2*%3q]
249 %%load m3, [%2q+r3srcq] ; the v functions set r3srcq = 3*srcstride beforehand
; SBUTTERFLY comes from x86util.asm; presumably it interleaves the two named
; registers using the last argument as scratch - TODO confirm.
253 SBUTTERFLY bw, 0, 1, 7
254 SBUTTERFLY bw, 2, 3, 7
261 SBUTTERFLY wd, 0, 1, 7
262 SBUTTERFLY wd, 2, 3, 7
; Horizontal QPEL source loads. %%stride is the sample size in bytes derived
; from the bit depth %1: 1 for 8 bits, 2 for 9..16 bits.
272 %assign %%stride (%1+7)/8
; m0..m2 are loaded 3, 2 and 1 samples before %2; m4..m7 are loaded 1 to 4
; samples after it. The m3 load is not visible in this part of the file.
290 %%load m0, [%2-3*%%stride] ;load data from source
291 %%load m1, [%2-2*%%stride]
292 %%load m2, [%2-%%stride ]
294 %%load m4, [%2+%%stride ]
295 %%load m5, [%2+2*%%stride]
296 %%load m6, [%2+3*%%stride]
297 %%load m7, [%2+4*%%stride]
; %4 is handed to SBUTTERFLY (x86util.asm) as its last argument, presumably a
; scratch register number - TODO confirm.
301 SBUTTERFLY wd, 0, 1, %4
302 SBUTTERFLY wd, 2, 3, %4
303 SBUTTERFLY wd, 4, 5, %4
304 SBUTTERFLY wd, 6, 7, %4
313 SBUTTERFLY dq, 0, 1, %4
314 SBUTTERFLY dq, 2, 3, %4
315 SBUTTERFLY dq, 4, 5, %4
316 SBUTTERFLY dq, 6, 7, %4
; Vertical QPEL source loads: eight rows into m0..m7.
; %2 = address of the current row, %3 = stride register name, %5 = register
; that presumably points three rows above %2 (its setup is not visible here).
; r3srcq is expected to hold 3*stride; the v functions set it before the loop.
329 movu m0, [%5q ] ;load x- 3*srcstride
330 movu m1, [%5q+ %3q ] ;load x- 2*srcstride
331 movu m2, [%5q+ 2*%3q ] ;load x-srcstride
332 movu m3, [%2 ] ;load x
333 movu m4, [%2+ %3q] ;load x+stride
334 movu m5, [%2+ 2*%3q] ;load x+2*stride
335 movu m6, [%2+r3srcq] ;load x+3*stride
336 movu m7, [%2+ 4*%3q] ;load x+4*stride
; m8 is handed to SBUTTERFLY (x86util.asm) as its last argument, presumably as
; scratch - TODO confirm.
339 SBUTTERFLY bw, 0, 1, 8
340 SBUTTERFLY bw, 2, 3, 8
341 SBUTTERFLY bw, 4, 5, 8
342 SBUTTERFLY bw, 6, 7, 8
351 SBUTTERFLY wd, 0, 1, 8
352 SBUTTERFLY wd, 2, 3, 8
353 SBUTTERFLY wd, 4, 5, 8
354 SBUTTERFLY wd, 6, 7, 8
; Row store helpers, one per output depth and block width. Callers select one
; by token pasting, e.g. "PEL_%2STORE%1 dstq, m0, m1" or "PEL_10STORE%1 ...".
; At the call sites the three arguments are the destination pointer followed by
; two vector registers. Some wider variants delegate to a narrower one.
; NOTE(review): most macro bodies are not visible in this part of the file.
364 %macro PEL_12STORE2 3
367 %macro PEL_12STORE4 3
370 %macro PEL_12STORE6 3
375 %macro PEL_12STORE8 3
378 %macro PEL_12STORE12 3
382 %macro PEL_12STORE16 3
383 PEL_12STORE8 %1, %2, %3
387 %macro PEL_10STORE2 3
390 %macro PEL_10STORE4 3
393 %macro PEL_10STORE6 3
398 %macro PEL_10STORE8 3
401 %macro PEL_10STORE12 3
405 %macro PEL_10STORE16 3
409 PEL_10STORE8 %1, %2, %3
414 %macro PEL_10STORE32 3
415 PEL_10STORE16 %1, %2, %3
432 %macro PEL_8STORE12 3
437 %macro PEL_8STORE16 3
444 %macro PEL_8STORE32 3
; Row-loop tail used by the plain (int16 output) functions via
; "LOOP_END dst, src, srcstride": %1 = destination, %2 = source, %3 = source stride.
449 add %1q, 2*MAX_PB_SIZE ; dst += 2*MAX_PB_SIZE bytes (fixed row step, not a stride argument)
450 add %2q, %3q ; src += srcstride
451 dec heightd ; one row done; ZF is set when no rows remain
452 jnz .loop ; height loop
; MC_PIXEL_COMPUTE width, bitdepth[, flag]
; Passing any third argument enables the avx2-only step below.
; NOTE(review): the remainder of the macro is not visible in this part of the file.
456 %macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
458 %if cpuflag(avx2) && %0 ==3
460 vextracti128 xm1, m0, 1 ; xm1 = upper 128 bits of m0
; EPEL_COMPUTE bitdepth, width, filter1, filter2[, flag | m0, m2, m1, m3]
; Applies the two coefficient vectors to the loaded source registers.
; Called with exactly 5 arguments on avx2, the 128-bit lanes of m0/m1 and m2/m3
; are regrouped first.
; NOTE(review): the definitions of %%reg0..%%reg2 are not visible here.
476 %macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
489 %if cpuflag(avx2) && (%0 == 5)
491 vperm2i128 m10, m0, m1, q0301 ; m10 = { upper lane of m0, upper lane of m1 }
493 vinserti128 m0, m0, xm1, 1 ; m0 = { lower lane of m0, lower lane of m1 }
496 vperm2i128 m10, m2, m3, q0301 ; m10 = { upper lane of m2, upper lane of m3 }
498 vinserti128 m2, m2, xm3, 1 ; m2 = { lower lane of m2, lower lane of m3 }
501 pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
502 pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
524 packssdw %%reg0, %%reg1 ; dwords -> words with signed saturation
; QPEL_HV_COMPUTE width, bitdepth, filter idx, pack suffix
; Multiplies m0/m2/m4/m6 (and m1/m3/m5/m7 in the word paths) by the four
; coefficient vectors of the selected QPEL filter, read directly from the table.
; Call sites pass "ackssdw" or "ackusdw" as the 4th argument; presumably it is
; pasted after "p" to form packssdw/packusdw - TODO confirm, the use is not
; visible in this part of the file.
528 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx, pack suffix
532 %define %%table hevc_qpel_filters_avx2_%2
535 %define %%table hevc_qpel_filters_sse4_%2
539 lea rfilterq, [%%table] ; rfilter = address of the table
541 %define rfilterq %%table
; Byte-sample path: unsigned bytes times signed byte coefficients.
545 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
546 pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
547 pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
548 pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
; Word-sample path: even registers.
553 pmaddwd m0, [rfilterq + %3q*8 ]
554 pmaddwd m2, [rfilterq + %3q*8+%%offset]
555 pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
556 pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
; Word-sample path: odd registers, same coefficients.
564 pmaddwd m1, [rfilterq + %3q*8 ]
565 pmaddwd m3, [rfilterq + %3q*8+%%offset]
566 pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
567 pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
; QPEL_COMPUTE width, bitdepth[, flag]
; Multiplies the loaded source registers m0/m2/m4/m6 by the coefficient vectors
; preloaded in m12..m15. Called with a third argument on avx2, the 128-bit
; lanes of each register pair (m0/m1, m2/m3, m4/m5, m6/m7) are regrouped first.
579 %macro QPEL_COMPUTE 2-3 ; width, bitdepth
581 %if cpuflag(avx2) && (%0 == 3)
583 vperm2i128 m10, m0, m1, q0301 ; m10 = { upper lane of m0, upper lane of m1 }
584 vinserti128 m0, m0, xm1, 1 ; m0 = { lower lane of m0, lower lane of m1 }
587 vperm2i128 m10, m2, m3, q0301 ; m10 = { upper lane of m2, upper lane of m3 }
588 vinserti128 m2, m2, xm3, 1 ; m2 = { lower lane of m2, lower lane of m3 }
592 vperm2i128 m10, m4, m5, q0301 ; m10 = { upper lane of m4, upper lane of m5 }
593 vinserti128 m4, m4, xm5, 1 ; m4 = { lower lane of m4, lower lane of m5 }
596 vperm2i128 m10, m6, m7, q0301 ; m10 = { upper lane of m6, upper lane of m7 }
597 vinserti128 m6, m6, xm7, 1 ; m6 = { lower lane of m6, lower lane of m7 }
601 pmaddubsw m0, m12 ;x1*c1+x2*c2
602 pmaddubsw m2, m13 ;x3*c3+x4*c4
603 pmaddubsw m4, m14 ;x5*c5+x6*c6
604 pmaddubsw m6, m15 ;x7*c7+x8*c8
; BI_COMPUTE width, bitd, src1l, src1h, src2l, src2h, pw[, flag]
; Combines the first prediction (src1l/src1h) with the second (src2l/src2h),
; hands the result to UNI_COMPUTE with the pw constant, then clamps it to
; [pb_0, max_pixels_BITD]. The optional 8th argument enables an avx2-only step
; for 8-bit input.
; NOTE(review): the combining instructions are not visible in this part of the file.
643 %macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, src2l, src2h, pw
648 UNI_COMPUTE %1, %2, %3, %4, %7
649 %if %0 == 8 && cpuflag(avx2) && (%2 == 8)
657 %if %1 > 8 || (%2 > 8 && %1 > 4)
663 CLIPW %3, [pb_0], [max_pixels_%2] ; clamp the low register
664 %if (%1 > 8 && notcpuflag(avx)) || %1 > 16
665 CLIPW %4, [pb_0], [max_pixels_%2] ; clamp the high register only when it carries data
671 ; ******************************
672 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
673 ; uint8_t *_src, ptrdiff_t _srcstride,
674 ; int height, int mx, int my)
675 ; ******************************
; HEVC_PUT_HEVC_PEL_PIXELS width, bitdepth
; Emits the plain, uni and bi pel_pixels functions for one width / bit depth.
677 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
678 HEVC_PEL_PIXELS %1, %2
679 HEVC_UNI_PEL_PIXELS %1, %2
680 HEVC_BI_PEL_PIXELS %1, %2
; HEVC_PEL_PIXELS width, bitdepth
; Emits hevc_put_hevc_pel_pixelsW_B: per row, load the source samples, run
; MC_PIXEL_COMPUTE and store through PEL_10STORE regardless of the source bit
; depth. LOOP_END steps dst by 2*MAX_PB_SIZE bytes and src by srcstride.
683 %macro HEVC_PEL_PIXELS 2
684 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
687 SIMPLE_LOAD %1, %2, srcq, m0
688 MC_PIXEL_COMPUTE %1, %2, 1
689 PEL_10STORE%1 dstq, m0, m1
690 LOOP_END dst, src, srcstride
; HEVC_UNI_PEL_PIXELS width, bitdepth
; Emits hevc_put_hevc_uni_pel_pixelsW_B: a row-by-row copy, loading with
; SIMPLE_LOAD and storing at the same bit depth through PEL_BSTOREW.
694 %macro HEVC_UNI_PEL_PIXELS 2
695 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
697 SIMPLE_LOAD %1, %2, srcq, m0
698 PEL_%2STORE%1 dstq, m0, m1
699 add dstq, dststrideq ; dst += dststride
700 add srcq, srcstrideq ; src += srcstride
701 dec heightd ; one row done; ZF is set when no rows remain
702 jnz .loop ; height loop
; HEVC_BI_PEL_PIXELS width, bitdepth
; Emits hevc_put_hevc_bi_pel_pixelsW_B: per row, load the source samples and the
; matching row of src2, combine them with BI_COMPUTE and store at the source
; bit depth. m5 holds the pw_bi constant for this bit depth.
706 %macro HEVC_BI_PEL_PIXELS 2
707 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
709 movdqa m5, [pw_bi_%2]
711 SIMPLE_LOAD %1, %2, srcq, m0
712 SIMPLE_BILOAD %1, src2q, m3, m4
713 MC_PIXEL_COMPUTE %1, %2, 1
714 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
715 PEL_%2STORE%1 dstq, m0, m1
716 add dstq, dststrideq ; dst += dststride
717 add srcq, srcstrideq ; src += srcstride
718 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
719 dec heightd ; one row done; ZF is set when no rows remain
720 jnz .loop ; height loop
725 ; ******************************
726 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
727 ; uint8_t *_src, ptrdiff_t _srcstride,
728 ; int height, int mx, int my, int width);
729 ; ******************************
; HEVC_PUT_HEVC_EPEL width, bitdepth
; Emits the horizontal (h) and vertical (v) EPEL functions for one width / bit
; depth, each in three flavours:
;   plain - stores through PEL_10STORE, rows stepped by LOOP_END
;   uni   - UNI_COMPUTE, then store at the source bit depth
;   bi    - combined with the src2 row through BI_COMPUTE
; m4/m5 hold the filter coefficients; m6 holds a constant in the uni/bi flavours.
732 %macro HEVC_PUT_HEVC_EPEL 2
739 cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
; %%stride = bytes per sample: 1 for 8 bits, 2 for 9..16 bits.
740 %assign %%stride ((%2 + 7)/8)
741 EPEL_FILTER %2, mx, m4, m5, rfilter
; Horizontal loads start one sample before src.
743 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
744 EPEL_COMPUTE %2, %1, m4, m5, 1
745 PEL_10STORE%1 dstq, m0, m1
746 LOOP_END dst, src, srcstride
749 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
750 %assign %%stride ((%2 + 7)/8)
752 EPEL_FILTER %2, mx, m4, m5, rfilter
754 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
755 EPEL_COMPUTE %2, %1, m4, m5
756 UNI_COMPUTE %1, %2, m0, m1, m6
757 PEL_%2STORE%1 dstq, m0, m1
758 add dstq, dststrideq ; dst += dststride
759 add srcq, srcstrideq ; src += srcstride
760 dec heightd ; one row done; ZF is set when no rows remain
761 jnz .loop ; height loop
764 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
765 movdqa m6, [pw_bi_%2]
766 EPEL_FILTER %2, mx, m4, m5, rfilter
768 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
769 EPEL_COMPUTE %2, %1, m4, m5, 1
770 SIMPLE_BILOAD %1, src2q, m2, m3
771 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
772 PEL_%2STORE%1 dstq, m0, m1
773 add dstq, dststrideq ; dst += dststride
774 add srcq, srcstrideq ; src += srcstride
775 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
776 dec heightd ; one row done; ZF is set when no rows remain
777 jnz .loop ; height loop
780 ; ******************************
781 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
782 ; uint8_t *_src, ptrdiff_t _srcstride,
783 ; int height, int mx, int my, int width)
784 ; ******************************
786 cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
; r3src first serves as EPEL_FILTER's temporary, then holds 3*srcstride.
789 EPEL_FILTER %2, my, m4, m5, r3src
790 lea r3srcq, [srcstrideq*3]
792 EPEL_LOAD %2, srcq, srcstride, %1
793 EPEL_COMPUTE %2, %1, m4, m5, 1
794 PEL_10STORE%1 dstq, m0, m1
795 LOOP_END dst, src, srcstride
798 cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
802 EPEL_FILTER %2, my, m4, m5, r3src
803 lea r3srcq, [srcstrideq*3]
805 EPEL_LOAD %2, srcq, srcstride, %1
806 EPEL_COMPUTE %2, %1, m4, m5
807 UNI_COMPUTE %1, %2, m0, m1, m6
808 PEL_%2STORE%1 dstq, m0, m1
809 add dstq, dststrideq ; dst += dststride
810 add srcq, srcstrideq ; src += srcstride
811 dec heightd ; one row done; ZF is set when no rows remain
812 jnz .loop ; height loop
816 cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
818 movdqa m6, [pw_bi_%2]
820 EPEL_FILTER %2, my, m4, m5, r3src
821 lea r3srcq, [srcstrideq*3]
823 EPEL_LOAD %2, srcq, srcstride, %1
824 EPEL_COMPUTE %2, %1, m4, m5, 1
825 SIMPLE_BILOAD %1, src2q, m2, m3
826 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
827 PEL_%2STORE%1 dstq, m0, m1
828 add dstq, dststrideq ; dst += dststride
829 add srcq, srcstrideq ; src += srcstride
830 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
831 dec heightd ; one row done; ZF is set when no rows remain
832 jnz .loop ; height loop
837 ; ******************************
838 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
839 ; uint8_t *_src, ptrdiff_t _srcstride,
840 ; int height, int mx, int my, int width)
841 ; ******************************
; HEVC_PUT_HEVC_EPEL_HV width, bitdepth
; Emits the plain, uni and bi EPEL hv functions for one width / bit depth.
; Each function filters rows with the mx coefficients (m14/m15), then filters
; the results with the my coefficients (m12/m13) at bit depth tag 14.
; Blocks wider than 8 at 8 bits take the extra "%1 > 8 && %2 == 8" paths, which
; use a second register set (m8..m11).
; NOTE(review): the instructions between the visible macro calls are not
; shown in this part of the file.
843 %macro HEVC_PUT_HEVC_EPEL_HV 2
844 cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
; %%stride = bytes per sample: 1 for 8 bits, 2 for 9..16 bits.
845 %assign %%stride ((%2 + 7)/8)
; The mx-filtered row computation appears four times below.
848 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
849 EPEL_COMPUTE %2, %1, m14, m15
850 %if (%1 > 8 && (%2 == 8))
855 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
856 EPEL_COMPUTE %2, %1, m14, m15
857 %if (%1 > 8 && (%2 == 8))
862 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
863 EPEL_COMPUTE %2, %1, m14, m15
864 %if (%1 > 8 && (%2 == 8))
870 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
871 EPEL_COMPUTE %2, %1, m14, m15
872 %if (%1 > 8 && (%2 == 8))
; my-filtered pass over the mx-filtered rows.
882 EPEL_COMPUTE 14, %1, m12, m13
883 %if (%1 > 8 && (%2 == 8))
885 punpcklwd m2, m10, m11
887 punpckhwd m3, m10, m11
888 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
890 vinserti128 m2, m0, xm4, 1 ; m2 = { lower lane of m0, lower lane of m4 }
891 vperm2i128 m3, m0, m4, q0301 ; m3 = { upper lane of m0, upper lane of m4 }
892 PEL_10STORE%1 dstq, m2, m3
894 PEL_10STORE%1 dstq, m0, m4
897 PEL_10STORE%1 dstq, m0, m1
902 %if (%1 > 8 && (%2 == 8))
907 LOOP_END dst, src, srcstride
910 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
911 %assign %%stride ((%2 + 7)/8)
914 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
915 EPEL_COMPUTE %2, %1, m14, m15
916 %if (%1 > 8 && (%2 == 8))
921 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
922 EPEL_COMPUTE %2, %1, m14, m15
923 %if (%1 > 8 && (%2 == 8))
928 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
929 EPEL_COMPUTE %2, %1, m14, m15
930 %if (%1 > 8 && (%2 == 8))
936 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
937 EPEL_COMPUTE %2, %1, m14, m15
938 %if (%1 > 8 && (%2 == 8))
948 EPEL_COMPUTE 14, %1, m12, m13
949 %if (%1 > 8 && (%2 == 8))
951 punpcklwd m2, m10, m11
953 punpckhwd m3, m10, m11
954 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
955 UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
957 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
959 PEL_%2STORE%1 dstq, m0, m1
963 %if (%1 > 8 && (%2 == 8))
968 add dstq, dststrideq ; dst += dststride
969 add srcq, srcstrideq ; src += srcstride
970 dec heightd ; one row done; ZF is set when no rows remain
971 jnz .loop ; height loop
974 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
975 %assign %%stride ((%2 + 7)/8)
978 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
979 EPEL_COMPUTE %2, %1, m14, m15
980 %if (%1 > 8 && (%2 == 8))
985 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
986 EPEL_COMPUTE %2, %1, m14, m15
987 %if (%1 > 8 && (%2 == 8))
992 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
993 EPEL_COMPUTE %2, %1, m14, m15
994 %if (%1 > 8 && (%2 == 8))
1000 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
1001 EPEL_COMPUTE %2, %1, m14, m15
1002 %if (%1 > 8 && (%2 == 8))
; Interleave the four saved rows (m4..m7) word-wise for the my-filtered pass.
1006 punpcklwd m0, m4, m5
1007 punpcklwd m2, m6, m7
1009 punpckhwd m1, m4, m5
1010 punpckhwd m3, m6, m7
1012 EPEL_COMPUTE 14, %1, m12, m13
1013 %if (%1 > 8 && (%2 == 8))
; Same interleave for the second register set (m8..m11).
1014 punpcklwd m4, m8, m9
1015 punpcklwd m2, m10, m11
1016 punpckhwd m8, m8, m9
1017 punpckhwd m3, m10, m11
1018 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
1019 SIMPLE_BILOAD %1, src2q, m8, m3
1021 vinserti128 m1, m8, xm3, 1 ; m1 = { lower lane of m8, lower lane of m3 }
1022 vperm2i128 m2, m8, m3, q0301 ; m2 = { upper lane of m8, upper lane of m3 }
1023 BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
1025 BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
1028 SIMPLE_BILOAD %1, src2q, m8, m9
1029 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1031 PEL_%2STORE%1 dstq, m0, m4
1035 %if (%1 > 8 && (%2 == 8))
1040 add dstq, dststrideq ; dst += dststride
1041 add srcq, srcstrideq ; src += srcstride
1042 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
1043 dec heightd ; one row done; ZF is set when no rows remain
1044 jnz .loop ; height loop
1048 ; ******************************
1049 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
1050 ; uint8_t *_src, ptrdiff_t _srcstride,
1051 ; int height, int mx, int my, int width)
1052 ; ******************************
; HEVC_PUT_HEVC_QPEL width, bitdepth
; Emits the horizontal (h) and vertical (v) QPEL functions for one width / bit
; depth, each in plain, uni and bi flavours. The plain flavour stores through
; PEL_10STORE and steps rows with LOOP_END. The uni/bi flavours store at the
; source bit depth, with m9 passed to UNI_COMPUTE / BI_COMPUTE as the constant.
; NOTE(review): the filter setup and some compute steps are not visible in
; this part of the file.
1054 %macro HEVC_PUT_HEVC_QPEL 2
1055 cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
1058 QPEL_H_LOAD %2, srcq, %1, 10
1059 QPEL_COMPUTE %1, %2, 1
1063 PEL_10STORE%1 dstq, m0, m1
1064 LOOP_END dst, src, srcstride
1067 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
1071 QPEL_H_LOAD %2, srcq, %1, 10
1076 UNI_COMPUTE %1, %2, m0, m1, m9
1077 PEL_%2STORE%1 dstq, m0, m1
1078 add dstq, dststrideq ; dst += dststride
1079 add srcq, srcstrideq ; src += srcstride
1080 dec heightd ; one row done; ZF is set when no rows remain
1081 jnz .loop ; height loop
1084 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
1085 movdqa m9, [pw_bi_%2]
1088 QPEL_H_LOAD %2, srcq, %1, 10
1089 QPEL_COMPUTE %1, %2, 1
1093 SIMPLE_BILOAD %1, src2q, m10, m11
1094 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
1095 PEL_%2STORE%1 dstq, m0, m1
1096 add dstq, dststrideq ; dst += dststride
1097 add srcq, srcstrideq ; src += srcstride
1098 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
1099 dec heightd ; one row done; ZF is set when no rows remain
1100 jnz .loop ; height loop
1104 ; ******************************
1105 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
1106 ; uint8_t *_src, ptrdiff_t _srcstride,
1107 ; int height, int mx, int my, int width)
1108 ; ******************************
1110 cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
1112 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
; r7 is the one allocated GPR beyond the seven named ones; it is passed to
; QPEL_V_LOAD as its 5th argument.
1115 QPEL_V_LOAD %2, srcq, srcstride, %1, r7
1116 QPEL_COMPUTE %1, %2, 1
1120 PEL_10STORE%1 dstq, m0, m1
1121 LOOP_END dst, src, srcstride
1124 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
1127 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
; r8 is the one allocated GPR beyond the eight named ones.
1130 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
1135 UNI_COMPUTE %1, %2, m0, m1, m9
1136 PEL_%2STORE%1 dstq, m0, m1
1137 add dstq, dststrideq ; dst += dststride
1138 add srcq, srcstrideq ; src += srcstride
1139 dec heightd ; one row done; ZF is set when no rows remain
1140 jnz .loop ; height loop
1143 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
1145 movdqa m9, [pw_bi_%2]
1146 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
; r9 is the one allocated GPR beyond the nine named ones.
1149 QPEL_V_LOAD %2, srcq, srcstride, %1, r9
1150 QPEL_COMPUTE %1, %2, 1
1154 SIMPLE_BILOAD %1, src2q, m10, m11
1155 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
1156 PEL_%2STORE%1 dstq, m0, m1
1157 add dstq, dststrideq ; dst += dststride
1158 add srcq, srcstrideq ; src += srcstride
1159 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
1160 dec heightd ; one row done; ZF is set when no rows remain
1161 jnz .loop ; height loop
1166 ; ******************************
1167 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
1168 ; uint8_t *_src, ptrdiff_t _srcstride,
1169 ; int height, int mx, int my)
1170 ; ******************************
; HEVC_PUT_HEVC_QPEL_HV width, bitdepth
; Emits the plain, uni and bi QPEL hv functions for one width / bit depth.
; Each function computes mx-filtered rows with QPEL_H_LOAD + QPEL_HV_COMPUTE,
; advancing src by one row between them. The saved rows m8..m15 are then
; interleaved word-wise into m0..m7 and filtered with the my filter at bit
; depth tag 14.
; NOTE(review): the instructions that save each row result and define %%shift
; are not visible in this part of the file.
1171 %macro HEVC_PUT_HEVC_QPEL_HV 2
1172 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
1180 shl mxq, %%shift ; mx *= 1 << %%shift
1181 shl myq, %%shift ; my *= 1 << %%shift
1182 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
1184 QPEL_H_LOAD %2, srcq, %1, 15
1185 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1187 add srcq, srcstrideq
1188 QPEL_H_LOAD %2, srcq, %1, 15
1189 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1191 add srcq, srcstrideq
1192 QPEL_H_LOAD %2, srcq, %1, 15
1193 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1195 add srcq, srcstrideq
1196 QPEL_H_LOAD %2, srcq, %1, 15
1197 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1199 add srcq, srcstrideq
1200 QPEL_H_LOAD %2, srcq, %1, 15
1201 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1203 add srcq, srcstrideq
1204 QPEL_H_LOAD %2, srcq, %1, 15
1205 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1207 add srcq, srcstrideq
1208 QPEL_H_LOAD %2, srcq, %1, 15
1209 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1211 add srcq, srcstrideq
1213 QPEL_H_LOAD %2, srcq, %1, 15
1214 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the eight saved rows word-wise: low halves, then high halves.
1216 punpcklwd m0, m8, m9
1217 punpcklwd m2, m10, m11
1218 punpcklwd m4, m12, m13
1219 punpcklwd m6, m14, m15
1221 punpckhwd m1, m8, m9
1222 punpckhwd m3, m10, m11
1223 punpckhwd m5, m12, m13
1224 punpckhwd m7, m14, m15
1226 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1227 PEL_10STORE%1 dstq, m0, m1
1245 LOOP_END dst, src, srcstride
1248 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1256 shl mxq, %%shift ; mx *= 1 << %%shift
1257 shl myq, %%shift ; my *= 1 << %%shift
1258 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
1260 QPEL_H_LOAD %2, srcq, %1, 15
1261 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1263 add srcq, srcstrideq
1264 QPEL_H_LOAD %2, srcq, %1, 15
1265 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1267 add srcq, srcstrideq
1268 QPEL_H_LOAD %2, srcq, %1, 15
1269 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1271 add srcq, srcstrideq
1272 QPEL_H_LOAD %2, srcq, %1, 15
1273 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1275 add srcq, srcstrideq
1276 QPEL_H_LOAD %2, srcq, %1, 15
1277 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1279 add srcq, srcstrideq
1280 QPEL_H_LOAD %2, srcq, %1, 15
1281 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1283 add srcq, srcstrideq
1284 QPEL_H_LOAD %2, srcq, %1, 15
1285 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1287 add srcq, srcstrideq
1289 QPEL_H_LOAD %2, srcq, %1, 15
1290 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the eight saved rows word-wise: low halves, then high halves.
1292 punpcklwd m0, m8, m9
1293 punpcklwd m2, m10, m11
1294 punpcklwd m4, m12, m13
1295 punpcklwd m6, m14, m15
1297 punpckhwd m1, m8, m9
1298 punpckhwd m3, m10, m11
1299 punpckhwd m5, m12, m13
1300 punpckhwd m7, m14, m15
; NOTE(review): this flavour passes ackusdw here while the plain and bi
; flavours pass ackssdw - confirm this is intentional.
1302 QPEL_HV_COMPUTE %1, 14, my, ackusdw
1303 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1304 PEL_%2STORE%1 dstq, m0, m1
1323 add dstq, dststrideq ; dst += dststride
1324 add srcq, srcstrideq ; src += srcstride
1325 dec heightd ; one row done; ZF is set when no rows remain
1326 jnz .loop ; height loop
1329 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1337 shl mxq, %%shift ; mx *= 1 << %%shift
1338 shl myq, %%shift ; my *= 1 << %%shift
1339 lea r3srcq, [srcstrideq*3] ; r3src = 3 * srcstride
1341 QPEL_H_LOAD %2, srcq, %1, 15
1342 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1344 add srcq, srcstrideq
1345 QPEL_H_LOAD %2, srcq, %1, 15
1346 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1348 add srcq, srcstrideq
1349 QPEL_H_LOAD %2, srcq, %1, 15
1350 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1352 add srcq, srcstrideq
1353 QPEL_H_LOAD %2, srcq, %1, 15
1354 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1356 add srcq, srcstrideq
1357 QPEL_H_LOAD %2, srcq, %1, 15
1358 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1360 add srcq, srcstrideq
1361 QPEL_H_LOAD %2, srcq, %1, 15
1362 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1364 add srcq, srcstrideq
1365 QPEL_H_LOAD %2, srcq, %1, 15
1366 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1368 add srcq, srcstrideq
1370 QPEL_H_LOAD %2, srcq, %1, 15
1371 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the eight saved rows word-wise: low halves, then high halves.
1373 punpcklwd m0, m8, m9
1374 punpcklwd m2, m10, m11
1375 punpcklwd m4, m12, m13
1376 punpcklwd m6, m14, m15
1378 punpckhwd m1, m8, m9
1379 punpckhwd m3, m10, m11
1380 punpckhwd m5, m12, m13
1381 punpckhwd m7, m14, m15
1383 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1384 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1385 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1386 PEL_%2STORE%1 dstq, m0, m1
1405 add dstq, dststrideq ; dst += dststride
1406 add srcq, srcstrideq ; src += srcstride
1407 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
1408 dec heightd ; one row done; ZF is set when no rows remain
1409 jnz .loop ; height loop
; WEIGHTING_FUNCS width, bitdepth
; Emits hevc_put_hevc_uni_wW_B and hevc_put_hevc_bi_wW_B. Both read rows that
; are 2*MAX_PB_SIZE bytes apart, clamp the result to [pb_0, max_pixels_BITD]
; and store it at the output bit depth.
; NOTE(review): the weighting arithmetic between the visible lines is not
; shown in this part of the file.
1413 %macro WEIGHTING_FUNCS 2
; WIN64 and x86-32 builds load only 4 arguments into registers; other builds load 6.
1414 %if WIN64 || ARCH_X86_32
1415 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
1419 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
1420 %define SHIFT denomd
1422 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1427 movd m4, SHIFT ; shift
1440 shl SHIFT, %2-8 ; ox << (bitd - 8)
1444 %if WIN64 || ARCH_X86_32
; The source is loaded with the bit depth argument fixed at 10, i.e. as
; samples wider than 8 bits.
1448 SIMPLE_LOAD %1, 10, srcq, m0
1458 punpckhwd m1, m0, m6
1471 CLIPW m0, [pb_0], [max_pixels_%2]
1473 PEL_%2STORE%1 dstq, m0, m1
1474 add dstq, dststrideq ; dst += dststride
1475 add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE bytes (fixed row step)
1476 dec heightd ; one row done; ZF is set when no rows remain
1477 jnz .loop ; height loop
1480 cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
1481 movifnidn r5d, denomm
1486 lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom
1488 movd m0, r5d ; shift
1497 movd m5, r5d ; shift+1
1503 shl r5d, %2-8 ; ox << (bitd - 8)
1506 movd m4, r5d ; offset
1517 SIMPLE_LOAD %1, 10, srcq, m0
1518 SIMPLE_LOAD %1, 10, src2q, m8
1532 punpckhwd m1, m0, m6
1534 punpckhwd m9, m8, m7
1547 CLIPW m0, [pb_0], [max_pixels_%2]
1549 PEL_%2STORE%1 dstq, m0, m1
1550 add dstq, dststrideq ; dst += dststride
1551 add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE bytes (fixed row step)
1552 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (fixed row step)
1554 jnz .loop ; height loop
; ---------------------------------------------------------------------------
; SSE4 instantiations. Each line is (width, bit depth).
; ---------------------------------------------------------------------------
1558 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
; Weighted prediction: widths 2..8 at 8, 10 and 12 bits.
1560 WEIGHTING_FUNCS 2, 8
1561 WEIGHTING_FUNCS 4, 8
1562 WEIGHTING_FUNCS 6, 8
1563 WEIGHTING_FUNCS 8, 8
1565 WEIGHTING_FUNCS 2, 10
1566 WEIGHTING_FUNCS 4, 10
1567 WEIGHTING_FUNCS 6, 10
1568 WEIGHTING_FUNCS 8, 10
1570 WEIGHTING_FUNCS 2, 12
1571 WEIGHTING_FUNCS 4, 12
1572 WEIGHTING_FUNCS 6, 12
1573 WEIGHTING_FUNCS 8, 12
; Full-pel copies: widths up to 16 at 8 bits, up to 8 at 10 and 12 bits.
1575 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1576 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1577 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1578 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1579 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1580 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1582 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1583 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1584 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1585 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1587 HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1588 HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1589 HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1590 HEVC_PUT_HEVC_PEL_PIXELS 8, 12
; EPEL h/v.
1592 HEVC_PUT_HEVC_EPEL 2, 8
1593 HEVC_PUT_HEVC_EPEL 4, 8
1594 HEVC_PUT_HEVC_EPEL 6, 8
1595 HEVC_PUT_HEVC_EPEL 8, 8
1596 HEVC_PUT_HEVC_EPEL 12, 8
1597 HEVC_PUT_HEVC_EPEL 16, 8
1600 HEVC_PUT_HEVC_EPEL 2, 10
1601 HEVC_PUT_HEVC_EPEL 4, 10
1602 HEVC_PUT_HEVC_EPEL 6, 10
1603 HEVC_PUT_HEVC_EPEL 8, 10
1605 HEVC_PUT_HEVC_EPEL 2, 12
1606 HEVC_PUT_HEVC_EPEL 4, 12
1607 HEVC_PUT_HEVC_EPEL 6, 12
1608 HEVC_PUT_HEVC_EPEL 8, 12
; EPEL hv (no width-12 variant is instantiated at 8 bits).
1610 HEVC_PUT_HEVC_EPEL_HV 2, 8
1611 HEVC_PUT_HEVC_EPEL_HV 4, 8
1612 HEVC_PUT_HEVC_EPEL_HV 6, 8
1613 HEVC_PUT_HEVC_EPEL_HV 8, 8
1614 HEVC_PUT_HEVC_EPEL_HV 16, 8
1616 HEVC_PUT_HEVC_EPEL_HV 2, 10
1617 HEVC_PUT_HEVC_EPEL_HV 4, 10
1618 HEVC_PUT_HEVC_EPEL_HV 6, 10
1619 HEVC_PUT_HEVC_EPEL_HV 8, 10
1621 HEVC_PUT_HEVC_EPEL_HV 2, 12
1622 HEVC_PUT_HEVC_EPEL_HV 4, 12
1623 HEVC_PUT_HEVC_EPEL_HV 6, 12
1624 HEVC_PUT_HEVC_EPEL_HV 8, 12
; QPEL h/v.
1626 HEVC_PUT_HEVC_QPEL 4, 8
1627 HEVC_PUT_HEVC_QPEL 8, 8
1628 HEVC_PUT_HEVC_QPEL 12, 8
1629 HEVC_PUT_HEVC_QPEL 16, 8
1631 HEVC_PUT_HEVC_QPEL 4, 10
1632 HEVC_PUT_HEVC_QPEL 8, 10
1634 HEVC_PUT_HEVC_QPEL 4, 12
1635 HEVC_PUT_HEVC_QPEL 8, 12
; QPEL hv.
1637 HEVC_PUT_HEVC_QPEL_HV 2, 8
1638 HEVC_PUT_HEVC_QPEL_HV 4, 8
1639 HEVC_PUT_HEVC_QPEL_HV 6, 8
1640 HEVC_PUT_HEVC_QPEL_HV 8, 8
1642 HEVC_PUT_HEVC_QPEL_HV 2, 10
1643 HEVC_PUT_HEVC_QPEL_HV 4, 10
1644 HEVC_PUT_HEVC_QPEL_HV 6, 10
1645 HEVC_PUT_HEVC_QPEL_HV 8, 10
1647 HEVC_PUT_HEVC_QPEL_HV 2, 12
1648 HEVC_PUT_HEVC_QPEL_HV 4, 12
1649 HEVC_PUT_HEVC_QPEL_HV 6, 12
1650 HEVC_PUT_HEVC_QPEL_HV 8, 12
; ---------------------------------------------------------------------------
; AVX2 instantiations: the wide variants (32 at 8 bits, 16 at 10 bits).
; ---------------------------------------------------------------------------
1652 %if HAVE_AVX2_EXTERNAL
1653 INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
1655 HEVC_PUT_HEVC_PEL_PIXELS 32, 8
1656 HEVC_PUT_HEVC_PEL_PIXELS 16, 10
1658 HEVC_PUT_HEVC_EPEL 32, 8
1659 HEVC_PUT_HEVC_EPEL 16, 10
1661 HEVC_PUT_HEVC_EPEL_HV 16, 10
1662 HEVC_PUT_HEVC_EPEL_HV 32, 8
1664 HEVC_PUT_HEVC_QPEL 32, 8
1666 HEVC_PUT_HEVC_QPEL 16, 10
1668 HEVC_PUT_HEVC_QPEL_HV 16, 10
1671 %endif ; ARCH_X86_64