2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Bi-prediction rounding constants (loaded as [pw_bi_%2] by BI_COMPUTE callers)
; and per-bit-depth clipping limits (used as [max_pixels_%2] by CLIPW).
34 %define pw_bi_10 pw_1024
35 %define pw_bi_12 pw_4096
36 %define max_pixels_8 pw_255
37 %define max_pixels_10 pw_1023
38 pw_bi_8: times 16 dw (1 << 8)
39 max_pixels_12: times 16 dw ((1 << 12)-1)
; EPEL_TABLE emits the 4-tap chroma filter coefficient table
; hevc_epel_filters_<isa>_<bitdepth>; this is the first coefficient row.
; %1 = bit depth, %2 = repeat count, %3 = element size (b/w), %4 = isa tag.
44 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
61 EPEL_TABLE 8,16, b, avx2
62 EPEL_TABLE 10, 8, w, avx2
64 EPEL_TABLE 8, 8, b, sse4
65 EPEL_TABLE 10, 4, w, sse4
66 EPEL_TABLE 12, 4, w, sse4
; QPEL_TABLE emits the 8-tap luma filter coefficient table
; hevc_qpel_filters_<isa>_<bitdepth>; this is the first coefficient row.
69 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
83 QPEL_TABLE 8, 8, b, sse4
84 QPEL_TABLE 10, 4, w, sse4
85 QPEL_TABLE 12, 4, w, sse4
87 QPEL_TABLE 8,16, b, avx2
88 QPEL_TABLE 10, 8, w, avx2
; Intermediate (int16_t) buffer pitch in elements; used as 2*MAX_PB_SIZE bytes.
92 %define MAX_PB_SIZE 64
; The 14-bit intermediate pass reuses the 10-bit word filter tables.
94 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
96 %define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
; SIMPLE_BILOAD: load %1 int16 elements of bi-prediction intermediate data
; from %2 into registers %3 (and %4 for widths needing a second register).
; Load width is selected by %1 (conditionals partially elided in this view).
100 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
102 movq %3, [%2] ; load data from source2
104 movdqa %3, [%2] ; load data from source2
109 movdqa %3, [%2] ; load data from source2
110 movq %4, [%2+16] ; load data from source2
116 movdqa %3, [%2] ; load data from source2
117 movdqa %4, [%2+16] ; load data from source2
; SIMPLE_LOAD: load %1 pixels of bit depth %2 from %3 into %4, choosing the
; narrowest mov that covers width*element-size (movd/movq/movu/movdqu).
125 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
126 %if %1 == 2 || (%2 == 8 && %1 <= 4)
127 movd %4, [%3] ; load data from source
128 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
129 movq %4, [%3] ; load data from source
130 %elif notcpuflag(avx)
131 movu %4, [%3] ; load data from source
132 %elif %1 <= 8 || (%2 == 8 && %1 <= 16)
; EPEL_FILTER: load the two halves of the 4-tap chroma filter selected by
; index %2 into %3/%4. The index is scaled to the table row pitch
; (64 bytes for avx2, 32 for sse4); %5 is a scratch GPR for PIC addressing.
140 %macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
144 lea %5q, [hevc_epel_filters_avx2_%1]
147 %define FILTER hevc_epel_filters_avx2_%1
152 lea %5q, [hevc_epel_filters_sse4_%1]
155 %define FILTER hevc_epel_filters_sse4_%1
157 %endif ;cpuflag(avx2)
160 shl %2q, 6 ; multiply by 64
162 shl %2q, 5 ; multiply by 32
164 mova %3, [FILTER + %2q] ; get 2 first values of filters
165 mova %4, [FILTER + %2q+%%offset] ; get 2 last values of filters
; EPEL_HV_FILTER: load horizontal filter (by mx) into m14/m15 and vertical
; filter (by my, always from the 10-bit word table for the second pass)
; into m12/m13; finally precompute r3srcq = 3*srcstride for the loaders.
168 %macro EPEL_HV_FILTER 1
172 %define %%table hevc_epel_filters_avx2_%1
176 %define %%table hevc_epel_filters_sse4_%1
180 lea r3srcq, [%%table]
181 %define FILTER r3srcq
183 %define FILTER %%table
187 shl mxq, %%shift ; multiply by 32
188 shl myq, %%shift ; multiply by 32
189 mova m14, [FILTER + mxq] ; get 2 first values of filters
190 mova m15, [FILTER + mxq+%%offset] ; get 2 last values of filters
193 %define %%table hevc_epel_filters_avx2_10
195 %define %%table hevc_epel_filters_sse4_10
198 lea r3srcq, [%%table]
199 %define FILTER r3srcq
201 %define FILTER %%table
203 mova m12, [FILTER + myq] ; get 2 first values of filters
204 mova m13, [FILTER + myq+%%offset] ; get 2 last values of filters
205 lea r3srcq, [srcstrideq*3]
; QPEL_FILTER: load the four halves of the 8-tap luma filter selected by
; index %2 into m12..m15 (one register per coefficient pair).
213 %define %%table hevc_qpel_filters_avx2_%1
217 %define %%table hevc_qpel_filters_sse4_%1
221 lea rfilterq, [%%table]
223 %define rfilterq %%table
226 shl %2q, %%shift ; multiply by 32
227 mova m12, [rfilterq + %2q] ; get 4 first values of filters
228 mova m13, [rfilterq + %2q + %%offset] ; get 4 next values of filters
229 mova m14, [rfilterq + %2q + 2*%%offset] ; get 4 next values of filters
230 mova m15, [rfilterq + %2q + 3*%%offset] ; get 4 last values of filters
; EPEL_LOAD: load the 4 taps' worth of source rows/columns into m0..m3 and
; interleave them (SBUTTERFLY) so pmadd can apply coefficient pairs.
; %2 = base pointer expression, %3 = element stride (immediate or register).
234 %if (%1 == 8 && %4 <= 4)
236 %elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
239 %define %%load movdqu
; stride given as an immediate (horizontal case):
245 %%load m2, [%2q+2*%3]
246 %%load m3, [%2q+3*%3]
; stride given as a register, with r3srcq = 3*stride (vertical case):
248 %%load m1, [%2q+ %3q]
249 %%load m2, [%2q+2*%3q]
250 %%load m3, [%2q+r3srcq]
254 SBUTTERFLY bw, 0, 1, 7
255 SBUTTERFLY bw, 2, 3, 7
262 SBUTTERFLY wd, 0, 1, 7
263 SBUTTERFLY wd, 2, 3, 7
; QPEL_H_LOAD: load 8 horizontally neighbouring samples (taps -3..+4 around
; the current position) into m0..m7 and interleave for the 8-tap pmadd.
; %%stride is the element size in bytes derived from the bit depth.
273 %assign %%stride (%1+7)/8
291 %%load m0, [%2-3*%%stride] ;load data from source
292 %%load m1, [%2-2*%%stride]
293 %%load m2, [%2-%%stride ]
295 %%load m4, [%2+%%stride ]
296 %%load m5, [%2+2*%%stride]
297 %%load m6, [%2+3*%%stride]
298 %%load m7, [%2+4*%%stride]
302 SBUTTERFLY wd, 0, 1, %4
303 SBUTTERFLY wd, 2, 3, %4
304 SBUTTERFLY wd, 4, 5, %4
305 SBUTTERFLY wd, 6, 7, %4
314 SBUTTERFLY dq, 0, 1, %4
315 SBUTTERFLY dq, 2, 3, %4
316 SBUTTERFLY dq, 4, 5, %4
317 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD: load 8 vertically neighbouring rows (x-3*stride .. x+4*stride)
; into m0..m7; %5 points 3 rows above the current position, r3srcq = 3*stride.
330 movu m0, [%5q ] ;load x- 3*srcstride
331 movu m1, [%5q+ %3q ] ;load x- 2*srcstride
332 movu m2, [%5q+ 2*%3q ] ;load x-srcstride
333 movu m3, [%2 ] ;load x
334 movu m4, [%2+ %3q] ;load x+stride
335 movu m5, [%2+ 2*%3q] ;load x+2*stride
336 movu m6, [%2+r3srcq] ;load x+3*stride
337 movu m7, [%2+ 4*%3q] ;load x+4*stride
340 SBUTTERFLY bw, 0, 1, 8
341 SBUTTERFLY bw, 2, 3, 8
342 SBUTTERFLY bw, 4, 5, 8
343 SBUTTERFLY bw, 6, 7, 8
352 SBUTTERFLY wd, 0, 1, 8
353 SBUTTERFLY wd, 2, 3, 8
354 SBUTTERFLY wd, 4, 5, 8
355 SBUTTERFLY wd, 6, 7, 8
; PEL_<bitd>STORE<width> family: store <width> output elements at bit depth
; <bitd> to %1 from %2 (and %3 for the high half where needed). Wider
; variants are composed from the narrower ones (bodies partly elided here).
365 %macro PEL_12STORE2 3
368 %macro PEL_12STORE4 3
371 %macro PEL_12STORE6 3
376 %macro PEL_12STORE8 3
379 %macro PEL_12STORE12 3
383 %macro PEL_12STORE16 3
384 PEL_12STORE8 %1, %2, %3
388 %macro PEL_10STORE2 3
391 %macro PEL_10STORE4 3
394 %macro PEL_10STORE6 3
399 %macro PEL_10STORE8 3
402 %macro PEL_10STORE12 3
406 %macro PEL_10STORE16 3
410 PEL_10STORE8 %1, %2, %3
415 %macro PEL_10STORE32 3
416 PEL_10STORE16 %1, %2, %3
433 %macro PEL_8STORE12 3
438 %macro PEL_8STORE16 3
445 %macro PEL_8STORE32 3
; LOOP_END: common loop tail for the int16 "put" functions — advance dst by
; the fixed intermediate pitch, advance src by its stride, count down height.
450 add %1q, 2*MAX_PB_SIZE ; dst += 2*MAX_PB_SIZE (int16 intermediate pitch)
451 add %2q, %3q ; src += srcstride
452 dec heightd ; one row done
453 jnz .loop ; height loop
; MC_PIXEL_COMPUTE: widen loaded pixels to the 14-bit intermediate format
; (shift only, no filtering). Optional 3rd arg marks the AVX2 lane-split path.
457 %macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
459 %if cpuflag(avx2) && %0 ==3
461 vextracti128 xm1, m0, 1
; EPEL_COMPUTE: apply the 4-tap filter (coefficients in %3/%4) to the
; interleaved rows produced by EPEL_LOAD. Extra args select the HV second
; pass / alternate register set. AVX2 path first regroups 128-bit lanes.
477 %macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
490 %if cpuflag(avx2) && (%0 == 5)
492 vperm2i128 m10, m0, m1, q0301
494 vinserti128 m0, m0, xm1, 1
497 vperm2i128 m10, m2, m3, q0301
499 vinserti128 m2, m2, xm3, 1
502 pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
503 pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
525 packssdw %%reg0, %%reg1
; QPEL_HV_COMPUTE: apply the 8-tap filter to m0..m7 with coefficients taken
; directly from memory (table row selected by index %3); %4 is the pack-op
; suffix (p%4, e.g. ackssdw -> packssdw) used to narrow dwords back to words.
529 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
533 %define %%table hevc_qpel_filters_avx2_%2
536 %define %%table hevc_qpel_filters_sse4_%2
540 lea rfilterq, [%%table]
542 %define rfilterq %%table
546 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
547 pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
548 pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
549 pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
554 pmaddwd m0, [rfilterq + %3q*8 ]
555 pmaddwd m2, [rfilterq + %3q*8+%%offset]
556 pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
557 pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
565 pmaddwd m1, [rfilterq + %3q*8 ]
566 pmaddwd m3, [rfilterq + %3q*8+%%offset]
567 pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
568 pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
; QPEL_COMPUTE: apply the 8-tap filter with coefficients pre-loaded in
; m12..m15 (see QPEL_FILTER). AVX2 path first regroups 128-bit lanes.
580 %macro QPEL_COMPUTE 2-3 ; width, bitdepth
582 %if cpuflag(avx2) && (%0 == 3)
584 vperm2i128 m10, m0, m1, q0301
585 vinserti128 m0, m0, xm1, 1
588 vperm2i128 m10, m2, m3, q0301
589 vinserti128 m2, m2, xm3, 1
593 vperm2i128 m10, m4, m5, q0301
594 vinserti128 m4, m4, xm5, 1
597 vperm2i128 m10, m6, m7, q0301
598 vinserti128 m6, m6, xm7, 1
602 pmaddubsw m0, m12 ;x1*c1+x2*c2
603 pmaddubsw m2, m13 ;x3*c3+x4*c4
604 pmaddubsw m4, m14 ;x5*c5+x6*c6
605 pmaddubsw m6, m15 ;x7*c7+x8*c8
; BI_COMPUTE: average the filtered result (%3/%4) with the second-source
; intermediate (%5/%6) using rounding constant %7, then finish via the
; UNI path; UNI_COMPUTE rounds, shifts and clips to the output bit depth.
644 %macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
649 UNI_COMPUTE %1, %2, %3, %4, %7
650 %if %0 == 8 && cpuflag(avx2) && (%2 == 8)
658 %if %1 > 8 || (%2 > 8 && %1 > 4)
664 CLIPW %3, [pb_0], [max_pixels_%2]
665 %if (%1 > 8 && notcpuflag(avx)) || %1 > 16
666 CLIPW %4, [pb_0], [max_pixels_%2]
672 ; ******************************
673 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
674 ; uint8_t *_src, ptrdiff_t _srcstride,
675 ; int height, int mx, int my)
676 ; ******************************
; Instantiate the three pel_pixels variants (put/uni/bi) for one
; width (%1) / bit depth (%2) combination.
678 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
679 HEVC_PEL_PIXELS %1, %2
680 HEVC_UNI_PEL_PIXELS %1, %2
681 HEVC_BI_PEL_PIXELS %1, %2
; put: copy pixels widened to the 14-bit int16 intermediate buffer.
684 %macro HEVC_PEL_PIXELS 2
685 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
688 SIMPLE_LOAD %1, %2, srcq, m0
689 MC_PIXEL_COMPUTE %1, %2, 1
690 PEL_10STORE%1 dstq, m0, m1
691 LOOP_END dst, src, srcstride
; uni: plain pixel copy at the output bit depth (no intermediate buffer).
695 %macro HEVC_UNI_PEL_PIXELS 2
696 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
698 SIMPLE_LOAD %1, %2, srcq, m0
699 PEL_%2STORE%1 dstq, m0, m1
700 add dstq, dststrideq ; dst += dststride
701 add srcq, srcstrideq ; src += srcstride
702 dec heightd ; one row done
703 jnz .loop ; height loop
; bi: average widened pixels with the src2 intermediate, round and store.
707 %macro HEVC_BI_PEL_PIXELS 2
708 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
710 movdqa m5, [pw_bi_%2]
712 SIMPLE_LOAD %1, %2, srcq, m0
713 SIMPLE_BILOAD %1, src2q, m3, m4
714 MC_PIXEL_COMPUTE %1, %2, 1
715 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
716 PEL_%2STORE%1 dstq, m0, m1
717 add dstq, dststrideq ; dst += dststride
718 add srcq, srcstrideq ; src += srcstride
719 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
720 dec heightd ; one row done
721 jnz .loop ; height loop
726 ; ******************************
727 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
728 ; uint8_t *_src, ptrdiff_t _srcstride,
729 ; int height, int mx, int my, int width);
730 ; ******************************
; Instantiate the 4-tap chroma (epel) horizontal and vertical filters,
; each in put / uni / bi flavours, for width %1 and bit depth %2.
733 %macro HEVC_PUT_HEVC_EPEL 2
; --- horizontal, put: filter selected by mx, output to int16 intermediate.
740 cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
741 %assign %%stride ((%2 + 7)/8)
742 EPEL_FILTER %2, mx, m4, m5, rfilter
744 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
745 EPEL_COMPUTE %2, %1, m4, m5, 1
746 PEL_10STORE%1 dstq, m0, m1
747 LOOP_END dst, src, srcstride
; --- horizontal, uni: same filter, rounded/clipped to output bit depth.
750 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
751 %assign %%stride ((%2 + 7)/8)
753 EPEL_FILTER %2, mx, m4, m5, rfilter
755 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
756 EPEL_COMPUTE %2, %1, m4, m5
757 UNI_COMPUTE %1, %2, m0, m1, m6
758 PEL_%2STORE%1 dstq, m0, m1
759 add dstq, dststrideq ; dst += dststride
760 add srcq, srcstrideq ; src += srcstride
761 dec heightd ; one row done
762 jnz .loop ; height loop
; --- horizontal, bi: filter, then average with src2 intermediate.
765 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
766 movdqa m6, [pw_bi_%2]
767 EPEL_FILTER %2, mx, m4, m5, rfilter
769 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
770 EPEL_COMPUTE %2, %1, m4, m5, 1
771 SIMPLE_BILOAD %1, src2q, m2, m3
772 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
773 PEL_%2STORE%1 dstq, m0, m1
774 add dstq, dststrideq ; dst += dststride
775 add srcq, srcstrideq ; src += srcstride
776 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
777 dec heightd ; one row done
778 jnz .loop ; height loop
781 ; ******************************
782 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
783 ; uint8_t *_src, ptrdiff_t _srcstride,
784 ; int height, int mx, int my, int width)
785 ; ******************************
; --- vertical, put: filter selected by my; r3srcq caches 3*srcstride.
787 cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
790 EPEL_FILTER %2, my, m4, m5, r3src
791 lea r3srcq, [srcstrideq*3]
793 EPEL_LOAD %2, srcq, srcstride, %1
794 EPEL_COMPUTE %2, %1, m4, m5, 1
795 PEL_10STORE%1 dstq, m0, m1
796 LOOP_END dst, src, srcstride
; --- vertical, uni.
799 cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
803 EPEL_FILTER %2, my, m4, m5, r3src
804 lea r3srcq, [srcstrideq*3]
806 EPEL_LOAD %2, srcq, srcstride, %1
807 EPEL_COMPUTE %2, %1, m4, m5
808 UNI_COMPUTE %1, %2, m0, m1, m6
809 PEL_%2STORE%1 dstq, m0, m1
810 add dstq, dststrideq ; dst += dststride
811 add srcq, srcstrideq ; src += srcstride
812 dec heightd ; one row done
813 jnz .loop ; height loop
; --- vertical, bi.
817 cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
819 movdqa m6, [pw_bi_%2]
821 EPEL_FILTER %2, my, m4, m5, r3src
822 lea r3srcq, [srcstrideq*3]
824 EPEL_LOAD %2, srcq, srcstride, %1
825 EPEL_COMPUTE %2, %1, m4, m5, 1
826 SIMPLE_BILOAD %1, src2q, m2, m3
827 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
828 PEL_%2STORE%1 dstq, m0, m1
829 add dstq, dststrideq ; dst += dststride
830 add srcq, srcstrideq ; src += srcstride
831 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
832 dec heightd ; one row done
833 jnz .loop ; height loop
838 ; ******************************
839 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
840 ; uint8_t *_src, ptrdiff_t _srcstride,
841 ; int height, int mx, int my, int width)
842 ; ******************************
; 2-D chroma filter: horizontal 4-tap pass on 4 rows (m14/m15 = h filter),
; then a vertical 4-tap pass on the 14-bit intermediates (m12/m13 = v
; filter). The (%1 > 8 && %2 == 8) branches handle the wide-8-bit case
; where each row occupies two registers.
844 %macro HEVC_PUT_HEVC_EPEL_HV 2
845 cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
846 %assign %%stride ((%2 + 7)/8)
; prologue: horizontal pass on the first 3 rows (one load/compute per row).
849 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
850 EPEL_COMPUTE %2, %1, m14, m15
851 %if (%1 > 8 && (%2 == 8))
856 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
857 EPEL_COMPUTE %2, %1, m14, m15
858 %if (%1 > 8 && (%2 == 8))
863 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
864 EPEL_COMPUTE %2, %1, m14, m15
865 %if (%1 > 8 && (%2 == 8))
; loop body: horizontal pass on the newest row, then vertical pass
; (bit depth 14 = intermediate precision) across the 4 buffered rows.
871 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
872 EPEL_COMPUTE %2, %1, m14, m15
873 %if (%1 > 8 && (%2 == 8))
883 EPEL_COMPUTE 14, %1, m12, m13
884 %if (%1 > 8 && (%2 == 8))
886 punpcklwd m2, m10, m11
888 punpckhwd m3, m10, m11
889 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
; AVX2: regroup the two half-results into lane order before storing.
891 vinserti128 m2, m0, xm4, 1
892 vperm2i128 m3, m0, m4, q0301
893 PEL_10STORE%1 dstq, m2, m3
895 PEL_10STORE%1 dstq, m0, m4
898 PEL_10STORE%1 dstq, m0, m1
; rotate the row pipeline: oldest row registers are recycled.
903 %if (%1 > 8 && (%2 == 8))
908 LOOP_END dst, src, srcstride
; --- uni variant: same pipeline, final UNI_COMPUTE rounds/clips to bitd.
911 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
912 %assign %%stride ((%2 + 7)/8)
915 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
916 EPEL_COMPUTE %2, %1, m14, m15
917 %if (%1 > 8 && (%2 == 8))
922 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
923 EPEL_COMPUTE %2, %1, m14, m15
924 %if (%1 > 8 && (%2 == 8))
929 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
930 EPEL_COMPUTE %2, %1, m14, m15
931 %if (%1 > 8 && (%2 == 8))
937 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
938 EPEL_COMPUTE %2, %1, m14, m15
939 %if (%1 > 8 && (%2 == 8))
949 EPEL_COMPUTE 14, %1, m12, m13
950 %if (%1 > 8 && (%2 == 8))
952 punpcklwd m2, m10, m11
954 punpckhwd m3, m10, m11
955 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
956 UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
958 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
960 PEL_%2STORE%1 dstq, m0, m1
964 %if (%1 > 8 && (%2 == 8))
969 add dstq, dststrideq ; dst += dststride
970 add srcq, srcstrideq ; src += srcstride
971 dec heightd ; one row done
972 jnz .loop ; height loop
; --- bi variant: average with src2 intermediate before store.
975 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
976 %assign %%stride ((%2 + 7)/8)
979 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
980 EPEL_COMPUTE %2, %1, m14, m15
981 %if (%1 > 8 && (%2 == 8))
986 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
987 EPEL_COMPUTE %2, %1, m14, m15
988 %if (%1 > 8 && (%2 == 8))
993 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
994 EPEL_COMPUTE %2, %1, m14, m15
995 %if (%1 > 8 && (%2 == 8))
1001 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
1002 EPEL_COMPUTE %2, %1, m14, m15
1003 %if (%1 > 8 && (%2 == 8))
; interleave the 4 buffered rows for the vertical pmaddwd pass.
1007 punpcklwd m0, m4, m5
1008 punpcklwd m2, m6, m7
1010 punpckhwd m1, m4, m5
1011 punpckhwd m3, m6, m7
1013 EPEL_COMPUTE 14, %1, m12, m13
1014 %if (%1 > 8 && (%2 == 8))
1015 punpcklwd m4, m8, m9
1016 punpcklwd m2, m10, m11
1017 punpckhwd m8, m8, m9
1018 punpckhwd m3, m10, m11
1019 EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
1020 SIMPLE_BILOAD %1, src2q, m8, m3
; AVX2: regroup src2 halves into lane order to match the filtered result.
1022 vinserti128 m1, m8, xm3, 1
1023 vperm2i128 m2, m8, m3, q0301
1024 BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
1026 BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
1029 SIMPLE_BILOAD %1, src2q, m8, m9
1030 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1032 PEL_%2STORE%1 dstq, m0, m4
1036 %if (%1 > 8 && (%2 == 8))
1041 add dstq, dststrideq ; dst += dststride
1042 add srcq, srcstrideq ; src += srcstride
1043 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
1044 dec heightd ; one row done
1045 jnz .loop ; height loop
1049 ; ******************************
1050 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
1051 ; uint8_t *_src, ptrdiff_t _srcstride,
1052 ; int height, int mx, int my, int width)
1053 ; ******************************
; Instantiate the 8-tap luma (qpel) horizontal and vertical filters,
; each in put / uni / bi flavours, for width %1 and bit depth %2.
1055 %macro HEVC_PUT_HEVC_QPEL 2
; --- horizontal, put: coefficients selected by mx (QPEL_FILTER elided
; from this view loads them into m12..m15 for QPEL_COMPUTE).
1056 cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
1059 QPEL_H_LOAD %2, srcq, %1, 10
1060 QPEL_COMPUTE %1, %2, 1
1064 PEL_10STORE%1 dstq, m0, m1
1065 LOOP_END dst, src, srcstride
; --- horizontal, uni.
1068 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
1072 QPEL_H_LOAD %2, srcq, %1, 10
1077 UNI_COMPUTE %1, %2, m0, m1, m9
1078 PEL_%2STORE%1 dstq, m0, m1
1079 add dstq, dststrideq ; dst += dststride
1080 add srcq, srcstrideq ; src += srcstride
1081 dec heightd ; one row done
1082 jnz .loop ; height loop
; --- horizontal, bi.
1085 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
1086 movdqa m9, [pw_bi_%2]
1089 QPEL_H_LOAD %2, srcq, %1, 10
1090 QPEL_COMPUTE %1, %2, 1
1094 SIMPLE_BILOAD %1, src2q, m10, m11
1095 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
1096 PEL_%2STORE%1 dstq, m0, m1
1097 add dstq, dststrideq ; dst += dststride
1098 add srcq, srcstrideq ; src += srcstride
1099 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
1100 dec heightd ; one row done
1101 jnz .loop ; height loop
1105 ; ******************************
1106 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
1107 ; uint8_t *_src, ptrdiff_t _srcstride,
1108 ; int height, int mx, int my, int width)
1109 ; ******************************
; --- vertical, put: coefficients selected by my; r3srcq = 3*srcstride.
1111 cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
1113 lea r3srcq, [srcstrideq*3]
1116 QPEL_V_LOAD %2, srcq, srcstride, %1, r7
1117 QPEL_COMPUTE %1, %2, 1
1121 PEL_10STORE%1 dstq, m0, m1
1122 LOOP_END dst, src, srcstride
; --- vertical, uni.
1125 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
1128 lea r3srcq, [srcstrideq*3]
1131 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
1136 UNI_COMPUTE %1, %2, m0, m1, m9
1137 PEL_%2STORE%1 dstq, m0, m1
1138 add dstq, dststrideq ; dst += dststride
1139 add srcq, srcstrideq ; src += srcstride
1140 dec heightd ; one row done
1141 jnz .loop ; height loop
; --- vertical, bi.
1144 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
1146 movdqa m9, [pw_bi_%2]
1147 lea r3srcq, [srcstrideq*3]
1150 QPEL_V_LOAD %2, srcq, srcstride, %1, r9
1151 QPEL_COMPUTE %1, %2, 1
1155 SIMPLE_BILOAD %1, src2q, m10, m11
1156 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
1157 PEL_%2STORE%1 dstq, m0, m1
1158 add dstq, dststrideq ; dst += dststride
1159 add srcq, srcstrideq ; src += srcstride
1160 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
1161 dec heightd ; one row done
1162 jnz .loop ; height loop
1167 ; ******************************
1168 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
1169 ; uint8_t *_src, ptrdiff_t _srcstride,
1170 ; int height, int mx, int my)
1171 ; ******************************
; 2-D luma filter: an 8-tap horizontal pass on each of 8 buffered rows
; (results accumulate in m8..m15), then an 8-tap vertical pass at the
; 14-bit intermediate precision. Pack suffix "ackssdw" -> packssdw:
; signed saturation, required because filter taps are negative.
1172 %macro HEVC_PUT_HEVC_QPEL_HV 2
1173 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
1181 shl mxq, %%shift ; multiply by 32
1182 shl myq, %%shift ; multiply by 32
1183 lea r3srcq, [srcstrideq*3]
; prologue: horizontal pass on the first 7 rows.
1185 QPEL_H_LOAD %2, srcq, %1, 15
1186 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1188 add srcq, srcstrideq
1189 QPEL_H_LOAD %2, srcq, %1, 15
1190 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1192 add srcq, srcstrideq
1193 QPEL_H_LOAD %2, srcq, %1, 15
1194 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1196 add srcq, srcstrideq
1197 QPEL_H_LOAD %2, srcq, %1, 15
1198 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1200 add srcq, srcstrideq
1201 QPEL_H_LOAD %2, srcq, %1, 15
1202 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1204 add srcq, srcstrideq
1205 QPEL_H_LOAD %2, srcq, %1, 15
1206 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1208 add srcq, srcstrideq
1209 QPEL_H_LOAD %2, srcq, %1, 15
1210 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1212 add srcq, srcstrideq
; loop body: horizontal pass on the newest row, interleave the 8 buffered
; rows, then vertical pass (indexed by my) and store.
1214 QPEL_H_LOAD %2, srcq, %1, 15
1215 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1217 punpcklwd m0, m8, m9
1218 punpcklwd m2, m10, m11
1219 punpcklwd m4, m12, m13
1220 punpcklwd m6, m14, m15
1222 punpckhwd m1, m8, m9
1223 punpckhwd m3, m10, m11
1224 punpckhwd m5, m12, m13
1225 punpckhwd m7, m14, m15
1227 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1228 PEL_10STORE%1 dstq, m0, m1
1246 LOOP_END dst, src, srcstride
; uni variant of the 2-D luma filter: 8-tap horizontal pass on 8 buffered
; rows, 8-tap vertical pass at 14-bit precision, then UNI_COMPUTE
; (round/shift/clip to the output bit depth) and store.
1249 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1257 shl mxq, %%shift ; multiply by 32
1258 shl myq, %%shift ; multiply by 32
1259 lea r3srcq, [srcstrideq*3]
; prologue: horizontal pass on the first 7 rows.
1261 QPEL_H_LOAD %2, srcq, %1, 15
1262 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1264 add srcq, srcstrideq
1265 QPEL_H_LOAD %2, srcq, %1, 15
1266 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1268 add srcq, srcstrideq
1269 QPEL_H_LOAD %2, srcq, %1, 15
1270 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1272 add srcq, srcstrideq
1273 QPEL_H_LOAD %2, srcq, %1, 15
1274 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1276 add srcq, srcstrideq
1277 QPEL_H_LOAD %2, srcq, %1, 15
1278 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1280 add srcq, srcstrideq
1281 QPEL_H_LOAD %2, srcq, %1, 15
1282 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1284 add srcq, srcstrideq
1285 QPEL_H_LOAD %2, srcq, %1, 15
1286 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1288 add srcq, srcstrideq
; loop body: horizontal pass on the newest row, interleave the 8 buffered
; rows, vertical pass (indexed by my), round/clip, store.
1290 QPEL_H_LOAD %2, srcq, %1, 15
1291 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1293 punpcklwd m0, m8, m9
1294 punpcklwd m2, m10, m11
1295 punpcklwd m4, m12, m13
1296 punpcklwd m6, m14, m15
1298 punpckhwd m1, m8, m9
1299 punpckhwd m3, m10, m11
1300 punpckhwd m5, m12, m13
1301 punpckhwd m7, m14, m15
; FIX: was "ackusdw" (packusdw, unsigned saturation). The 14-bit
; intermediates are signed (the 8-tap filter has negative taps), so the
; unsigned pack would clamp negative values to 0 before UNI_COMPUTE's
; rounding/shift. The put (packssdw) and bi (packssdw) variants of this
; same pipeline both pack with signed saturation; match them.
1303 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1304 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1305 PEL_%2STORE%1 dstq, m0, m1
1324 add dstq, dststrideq ; dst += dststride
1325 add srcq, srcstrideq ; src += srcstride
1326 dec heightd ; one row done
1327 jnz .loop ; height loop
; bi variant of the 2-D luma filter: same horizontal + vertical pipeline as
; put, then average with the src2 intermediate via BI_COMPUTE and store.
1330 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1338 shl mxq, %%shift ; multiply by 32
1339 shl myq, %%shift ; multiply by 32
1340 lea r3srcq, [srcstrideq*3]
; prologue: horizontal pass on the first 7 rows.
1342 QPEL_H_LOAD %2, srcq, %1, 15
1343 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1345 add srcq, srcstrideq
1346 QPEL_H_LOAD %2, srcq, %1, 15
1347 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1349 add srcq, srcstrideq
1350 QPEL_H_LOAD %2, srcq, %1, 15
1351 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1353 add srcq, srcstrideq
1354 QPEL_H_LOAD %2, srcq, %1, 15
1355 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1357 add srcq, srcstrideq
1358 QPEL_H_LOAD %2, srcq, %1, 15
1359 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1361 add srcq, srcstrideq
1362 QPEL_H_LOAD %2, srcq, %1, 15
1363 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1365 add srcq, srcstrideq
1366 QPEL_H_LOAD %2, srcq, %1, 15
1367 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1369 add srcq, srcstrideq
; loop body: newest row, interleave, vertical pass, bi-average, store.
1371 QPEL_H_LOAD %2, srcq, %1, 15
1372 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1374 punpcklwd m0, m8, m9
1375 punpcklwd m2, m10, m11
1376 punpcklwd m4, m12, m13
1377 punpcklwd m6, m14, m15
1379 punpckhwd m1, m8, m9
1380 punpckhwd m3, m10, m11
1381 punpckhwd m5, m12, m13
1382 punpckhwd m7, m14, m15
1384 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1385 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1386 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1387 PEL_%2STORE%1 dstq, m0, m1
1406 add dstq, dststrideq ; dst += dststride
1407 add srcq, srcstrideq ; src += srcstride
1408 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
1409 dec heightd ; one row done
1410 jnz .loop ; height loop
; Weighted-prediction functions for width %1 / bit depth %2.
; uni_w: out = clip(((src * wx) >> shift) + ox); bi_w combines two sources
; with per-source weights/offsets. Inputs are 14-bit int16 intermediates
; read with the fixed 2*MAX_PB_SIZE pitch.
1414 %macro WEIGHTING_FUNCS 2
; On WIN64/x86-32 some arguments arrive on the stack, hence the different
; register counts in the two cglobal declarations.
1415 %if WIN64 || ARCH_X86_32
1416 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
1420 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
1421 %define SHIFT denomd
1423 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1428 movd m4, SHIFT ; shift
1441 shl SHIFT, %2-8 ; ox << (bitd - 8)
1445 %if WIN64 || ARCH_X86_32
1449 SIMPLE_LOAD %1, 10, srcq, m0
1459 punpckhwd m1, m0, m6
1472 CLIPW m0, [pb_0], [max_pixels_%2]
1474 PEL_%2STORE%1 dstq, m0, m1
1475 add dstq, dststrideq ; dst += dststride
1476 add srcq, 2*MAX_PB_SIZE ; src += intermediate-buffer pitch
1477 dec heightd ; one row done
1478 jnz .loop ; height loop
; bi-directional weighted prediction: two intermediate sources, two
; weights (wx0/wx1) and two offsets (ox0/ox1), shift+1 for the average.
1481 cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
1482 movifnidn r5d, denomm
1487 lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom
1489 movd m0, r5d ; shift
1498 movd m5, r5d ; shift+1
1504 shl r5d, %2-8 ; ox << (bitd - 8)
1507 movd m4, r5d ; offset
1518 SIMPLE_LOAD %1, 10, srcq, m0
1519 SIMPLE_LOAD %1, 10, src2q, m8
1533 punpckhwd m1, m0, m6
1535 punpckhwd m9, m8, m7
1548 CLIPW m0, [pb_0], [max_pixels_%2]
1550 PEL_%2STORE%1 dstq, m0, m1
1551 add dstq, dststrideq ; dst += dststride
1552 add srcq, 2*MAX_PB_SIZE ; src += intermediate-buffer pitch
1553 add src2q, 2*MAX_PB_SIZE ; src2 += intermediate-buffer pitch
1555 jnz .loop ; height loop
; SSE4 instantiations of the weighting functions for all supported
; width / bit depth combinations.
1559 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
1561 WEIGHTING_FUNCS 2, 8
1562 WEIGHTING_FUNCS 4, 8
1563 WEIGHTING_FUNCS 6, 8
1564 WEIGHTING_FUNCS 8, 8
1566 WEIGHTING_FUNCS 2, 10
1567 WEIGHTING_FUNCS 4, 10
1568 WEIGHTING_FUNCS 6, 10
1569 WEIGHTING_FUNCS 8, 10
1571 WEIGHTING_FUNCS 2, 12
1572 WEIGHTING_FUNCS 4, 12
1573 WEIGHTING_FUNCS 6, 12
1574 WEIGHTING_FUNCS 8, 12
; SSE4 instantiations: one (width, bitdepth) pair per line; each macro
; expands to the put / uni / bi variants of that function family.
1576 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1577 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1578 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1579 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1580 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1581 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1583 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1584 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1585 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1586 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1588 HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1589 HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1590 HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1591 HEVC_PUT_HEVC_PEL_PIXELS 8, 12
1593 HEVC_PUT_HEVC_EPEL 2, 8
1594 HEVC_PUT_HEVC_EPEL 4, 8
1595 HEVC_PUT_HEVC_EPEL 6, 8
1596 HEVC_PUT_HEVC_EPEL 8, 8
1597 HEVC_PUT_HEVC_EPEL 12, 8
1598 HEVC_PUT_HEVC_EPEL 16, 8
1601 HEVC_PUT_HEVC_EPEL 2, 10
1602 HEVC_PUT_HEVC_EPEL 4, 10
1603 HEVC_PUT_HEVC_EPEL 6, 10
1604 HEVC_PUT_HEVC_EPEL 8, 10
1606 HEVC_PUT_HEVC_EPEL 2, 12
1607 HEVC_PUT_HEVC_EPEL 4, 12
1608 HEVC_PUT_HEVC_EPEL 6, 12
1609 HEVC_PUT_HEVC_EPEL 8, 12
1611 HEVC_PUT_HEVC_EPEL_HV 2, 8
1612 HEVC_PUT_HEVC_EPEL_HV 4, 8
1613 HEVC_PUT_HEVC_EPEL_HV 6, 8
1614 HEVC_PUT_HEVC_EPEL_HV 8, 8
1615 HEVC_PUT_HEVC_EPEL_HV 16, 8
1617 HEVC_PUT_HEVC_EPEL_HV 2, 10
1618 HEVC_PUT_HEVC_EPEL_HV 4, 10
1619 HEVC_PUT_HEVC_EPEL_HV 6, 10
1620 HEVC_PUT_HEVC_EPEL_HV 8, 10
1622 HEVC_PUT_HEVC_EPEL_HV 2, 12
1623 HEVC_PUT_HEVC_EPEL_HV 4, 12
1624 HEVC_PUT_HEVC_EPEL_HV 6, 12
1625 HEVC_PUT_HEVC_EPEL_HV 8, 12
1627 HEVC_PUT_HEVC_QPEL 4, 8
1628 HEVC_PUT_HEVC_QPEL 8, 8
1629 HEVC_PUT_HEVC_QPEL 12, 8
1630 HEVC_PUT_HEVC_QPEL 16, 8
1632 HEVC_PUT_HEVC_QPEL 4, 10
1633 HEVC_PUT_HEVC_QPEL 8, 10
1635 HEVC_PUT_HEVC_QPEL 4, 12
1636 HEVC_PUT_HEVC_QPEL 8, 12
1638 HEVC_PUT_HEVC_QPEL_HV 2, 8
1639 HEVC_PUT_HEVC_QPEL_HV 4, 8
1640 HEVC_PUT_HEVC_QPEL_HV 6, 8
1641 HEVC_PUT_HEVC_QPEL_HV 8, 8
1643 HEVC_PUT_HEVC_QPEL_HV 2, 10
1644 HEVC_PUT_HEVC_QPEL_HV 4, 10
1645 HEVC_PUT_HEVC_QPEL_HV 6, 10
1646 HEVC_PUT_HEVC_QPEL_HV 8, 10
1648 HEVC_PUT_HEVC_QPEL_HV 2, 12
1649 HEVC_PUT_HEVC_QPEL_HV 4, 12
1650 HEVC_PUT_HEVC_QPEL_HV 6, 12
1651 HEVC_PUT_HEVC_QPEL_HV 8, 12
; AVX2 instantiations (ymm registers): only the widths that benefit from
; 256-bit operation are built.
1653 %if HAVE_AVX2_EXTERNAL
1654 INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
1656 HEVC_PUT_HEVC_PEL_PIXELS 32, 8
1657 HEVC_PUT_HEVC_PEL_PIXELS 16, 10
1659 HEVC_PUT_HEVC_EPEL 32, 8
1660 HEVC_PUT_HEVC_EPEL 16, 10
1662 HEVC_PUT_HEVC_EPEL_HV 16, 10
1663 HEVC_PUT_HEVC_EPEL_HV 32, 8
1665 HEVC_PUT_HEVC_QPEL 32, 8
1667 HEVC_PUT_HEVC_QPEL 16, 10
1669 HEVC_PUT_HEVC_QPEL_HV 16, 10
1672 %endif ; ARCH_X86_64