2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Per-bit-depth rounding/scaling constants for the final stage of the
; interpolation.  pw_* are used for uni-directional output, pw_bi_* for
; bi-prediction (which averages two sources and thus shifts by one bit less).
; NOTE(review): values look like pmulhrsw multipliers, 1 << (15 - shift) --
; confirm against the (elided) UNI_COMPUTE body.
24 pw_8: times 8 dw (1 << 9)
25 pw_10: times 8 dw (1 << 11)
26 pw_12: times 8 dw (1 << 13)
27 pw_bi_8: times 8 dw (1 << 8)
28 pw_bi_10: times 8 dw (1 << 10)
29 pw_bi_12: times 8 dw (1 << 12)
; Saturation limits: clamp >8-bit output samples with pminsw (see BI/UNI_COMPUTE
; and the weighting functions below).
30 max_pixels_10: times 8 dw ((1 << 10)-1)
31 max_pixels_12: times 8 dw ((1 << 12)-1)
; Vector of dword 1s; presumably a rounding term for the weighted-prediction
; path (loaded into m5 in WEIGHTING_FUNCS) -- TODO confirm.
33 one_per_32: times 4 dd 1
; EPEL (4-tap chroma) filter coefficient tables, emitted by the EPEL_TABLE
; macro: %1 = bit depth, %2 = repeat count per tap pair, %3 = element size
; (b = bytes for 8-bit, w = words for 10/12-bit), %4 = ISA tag.
; -2, 58 are the leading taps of the first HEVC epel filter.
37 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
55 EPEL_TABLE 8, 8, b, sse4
56 EPEL_TABLE 10, 4, w, sse4
57 EPEL_TABLE 12, 4, w, sse4
; QPEL (8-tap luma) filter coefficient tables, same parameter layout;
; -1, 4 are the leading taps of the first HEVC qpel filter.
60 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
74 QPEL_TABLE 8, 8, b, sse4
75 QPEL_TABLE 10, 4, w, sse4
76 QPEL_TABLE 12, 4, w, sse4
; Row pitch (in int16_t elements) of the intermediate prediction buffers;
; all code below advances such buffers by 2*MAX_PB_SIZE bytes per row.
78 %define MAX_PB_SIZE 64
; The 14-bit second (vertical) pass of the hv functions reuses the 10-bit
; word-sized qpel filters.
80 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
; Load one row of the 16-bit intermediate (source2) buffer; the branch taken
; (one or two registers, movq vs movdqa) is selected by width %1 via
; %if/%elif arms.
84 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
86 movq %3, [%2] ; load data from source2
88 movdqa %3, [%2] ; load data from source2
90 movdqa %3, [%2] ; load data from source2
91 movq %4, [%2+16] ; load data from source2
93 movdqa %3, [%2] ; load data from source2
94 movdqa %4, [%2+16] ; load data from source2
; Load one row of source pixels into a single register, choosing the
; narrowest load (movd/movq/movdqu) that covers width %1 at bit depth %2.
98 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
99 %if %1 == 2 || (%2 == 8 && %1 <= 4)
100 movd %4, [%3] ; load data from source
101 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
102 movq %4, [%3] ; load data from source
104 movdqu %4, [%3] ; load data from source
; As SIMPLE_LOAD but for 16-bit source2 rows, spilling into a second
; register (%5) for the widest cases.
108 %macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
109 %if %1 == 2 || (%2 == 8 && %1 <= 4)
110 movq %4, [%3] ; load data from source2
111 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
112 movdqa %4, [%3] ; load data from source2
114 movdqa %4, [%3] ; load data from source2
115 movq %5, [%3+16] ; load data from source2
117 movdqa %4, [%3] ; load data from source2
118 movdqa %5, [%3+16] ; load data from source2
; Fetch the 4-tap epel filter selected by index %2 into registers %3/%4.
; Each filter entry is 32 bytes (hence the shl by 5): two 16-byte halves
; holding the first and last coefficient pairs.
122 %macro EPEL_FILTER 2-4 ; bit depth, filter index
124 lea rfilterq, [hevc_epel_filters_sse4_%1]
126 %define rfilterq hevc_epel_filters_sse4_%1
129 shl %2q, 5 ; multiply by 32
130 movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
131 movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
; Set up both passes of the epel hv filter: horizontal taps (mx) in m14/m15,
; vertical taps (my, word-sized 10-bit table for the 14-bit intermediate)
; in m12/m13; also precompute 3*srcstride for the vertical loads.
134 %macro EPEL_HV_FILTER 1
136 lea rfilterq, [hevc_epel_filters_sse4_%1]
138 %define rfilterq hevc_epel_filters_sse4_%1
142 shl mxq, 5 ; multiply by 32
143 shl myq, 5 ; multiply by 32
144 movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
145 movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
146 lea r3srcq, [srcstrideq*3]
149 lea rfilterq, [hevc_epel_filters_sse4_10]
151 %define rfilterq hevc_epel_filters_sse4_10
153 movdqa m12, [rfilterq + myq] ; get 2 first values of filters
154 movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
; QPEL filter fetch (inside the QPEL_FILTER macro): the four 16-byte
; coefficient groups of the 8-tap filter selected by index %2 go to m12-m15.
159 lea rfilterq, [hevc_qpel_filters_sse4_%1]
161 %define rfilterq hevc_qpel_filters_sse4_%1
164 movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
165 movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 second values of filters
166 movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 third values of filters
167 movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 last values of filters
; EPEL_LOAD body: pick a load width from bit depth (%1) and block width (%4),
; fetch the 4 taps' worth of samples spaced %3 apart (register stride %3q and
; r3srcq for the vertical case), then interleave neighbouring rows with
; SBUTTERFLY so pmaddubsw/pmaddwd can multiply-accumulate coefficient pairs.
176 %if (%1 == 8 && %4 <= 4)
178 %elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
181 %define %%load movdqu
184 %%load m0, [rfilterq ]
186 %%load m1, [rfilterq+ %3]
187 %%load m2, [rfilterq+2*%3]
188 %%load m3, [rfilterq+3*%3]
190 %%load m1, [rfilterq+ %3q]
191 %%load m2, [rfilterq+2*%3q]
192 %%load m3, [rfilterq+r3srcq]
; 8-bit path: interleave at byte granularity (m10 as scratch)...
197 SBUTTERFLY bw, 0, 1, 10
198 SBUTTERFLY bw, 2, 3, 10
; ...>8-bit path: interleave at word granularity.
205 SBUTTERFLY wd, 0, 1, 10
206 SBUTTERFLY wd, 2, 3, 10
; QPEL_H_LOAD: horizontal 8-tap load.  %%stride is the bytes per sample
; ((bitdepth+7)/8); loads cover x-3 .. x+4 around the current position.
216 %assign %%stride (%1+7)/8
223 %define %%load movdqu
231 %define %%load movdqu
234 %%load m0, [%2-3*%%stride] ;load data from source
235 %%load m1, [%2-2*%%stride]
236 %%load m2, [%2-%%stride ]
238 %%load m4, [%2+%%stride ]
239 %%load m5, [%2+2*%%stride]
240 %%load m6, [%2+3*%%stride]
241 %%load m7, [%2+4*%%stride]
; Interleave the 8 shifted rows pairwise (scratch register index %4) so each
; destination register holds coefficient-pair operands for pmadd.
245 SBUTTERFLY wd, 0, 1, %4
246 SBUTTERFLY wd, 2, 3, %4
247 SBUTTERFLY wd, 4, 5, %4
248 SBUTTERFLY wd, 6, 7, %4
257 SBUTTERFLY dq, 0, 1, %4
258 SBUTTERFLY dq, 2, 3, %4
259 SBUTTERFLY dq, 4, 5, %4
260 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD: vertical 8-tap load of rows x-3*stride .. x+4*stride; %5 holds
; the x-3*srcstride base pointer, r3srcq holds 3*srcstride.
273 movdqu m0, [%5q ] ;load x- 3*srcstride
274 movdqu m1, [%5q+ %3q ] ;load x- 2*srcstride
275 movdqu m2, [%5q+ 2*%3q ] ;load x-srcstride
276 movdqu m3, [%2 ] ;load x
277 movdqu m4, [%2+ %3q] ;load x+stride
278 movdqu m5, [%2+ 2*%3q] ;load x+2*stride
279 movdqu m6, [%2+r3srcq] ;load x+3*stride
280 movdqu m7, [%2+ 4*%3q] ;load x+4*stride
; 8-bit path: byte interleave (m8 scratch)...
283 SBUTTERFLY bw, 0, 1, 8
284 SBUTTERFLY bw, 2, 3, 8
285 SBUTTERFLY bw, 4, 5, 8
286 SBUTTERFLY bw, 6, 7, 8
; ...>8-bit path: word interleave.
295 SBUTTERFLY wd, 0, 1, 8
296 SBUTTERFLY wd, 2, 3, 8
297 SBUTTERFLY wd, 4, 5, 8
298 SBUTTERFLY wd, 6, 7, 8
; PEL_<depth>STORE<width> family: store <width> output samples of the given
; bit depth from %2/%3 to [%1].  The 16-wide variants store two 8-wide halves.
308 %macro PEL_12STORE2 3
311 %macro PEL_12STORE4 3
314 %macro PEL_12STORE6 3
319 %macro PEL_12STORE8 3
322 %macro PEL_12STORE12 3
326 %macro PEL_12STORE16 3
327 PEL_12STORE8 %1, %2, %3
331 %macro PEL_10STORE2 3
334 %macro PEL_10STORE4 3
337 %macro PEL_10STORE6 3
342 %macro PEL_10STORE8 3
345 %macro PEL_10STORE12 3
349 %macro PEL_10STORE16 3
350 PEL_10STORE8 %1, %2, %3
367 %macro PEL_8STORE12 3
372 %macro PEL_8STORE16 3
; LOOP_END tail shared by the "put" (int16_t dst) functions: the destination
; row pitch is fixed at 2*MAX_PB_SIZE bytes, the source advances by srcstride.
377 add %1q, 2*MAX_PB_SIZE ; dst += dststride (MAX_PB_SIZE int16_t elements)
378 add %2q, %3q ; src += srcstride
379 dec heightd ; height--
380 jnz .loop ; height loop
; Scale raw pixels up to the 14-bit intermediate representation (body elided
; to width/bit-depth specific shifts/unpacks).
384 %macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
; 4-tap filtering: multiply-accumulate interleaved sample pairs against the
; coefficient-pair registers %3/%4 (8-bit path shown; wider depths use the
; pmaddwd arms elided here).
396 %macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
398 pmaddubsw m0, %3 ;x1*c1+x2*c2
399 pmaddubsw m2, %4 ;x3*c3+x4*c4
; 8-tap filtering with the filter selected by index %3 taken directly from
; memory; %4 names the pack instruction (p%4, e.g. ackssdw -> packssdw) used
; to narrow the dword accumulators back to words.
423 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
425 lea rfilterq, [hevc_qpel_filters_sse4_%2]
427 %define rfilterq hevc_qpel_filters_sse4_%2
431 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
432 pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
433 pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
434 pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
439 pmaddwd m0, [rfilterq + %3q*8 ]
440 pmaddwd m2, [rfilterq + %3q*8+16]
441 pmaddwd m4, [rfilterq + %3q*8+32]
442 pmaddwd m6, [rfilterq + %3q*8+48]
450 pmaddwd m1, [rfilterq + %3q*8 ]
451 pmaddwd m3, [rfilterq + %3q*8+16]
452 pmaddwd m5, [rfilterq + %3q*8+32]
453 pmaddwd m7, [rfilterq + %3q*8+48]
; As above but with the filter taps preloaded in m12-m15 by QPEL_FILTER.
465 %macro QPEL_COMPUTE 2 ; width, bitdepth
467 pmaddubsw m0, m12 ;x1*c1+x2*c2
468 pmaddubsw m2, m13 ;x3*c3+x4*c4
469 pmaddubsw m4, m14 ;x5*c5+x6*c6
470 pmaddubsw m6, m15 ;x7*c7+x8*c8
; Combine filtered data (%3/%4) with the second source (%5/%6) for
; bi-prediction, then scale/round via UNI_COMPUTE with the pw_bi_* constant %7.
509 %macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
514 UNI_COMPUTE %1, %2, %3, %4, %7
; UNI_COMPUTE tail: clamp >8-bit output to the valid sample range.
519 %if %1 > 8 || (%2 > 8 && %1 > 4)
525 pminsw %3, [max_pixels_%2]
528 pminsw %4, [max_pixels_%2]
534 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
535 ; ******************************
536 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
537 ; uint8_t *_src, ptrdiff_t _srcstride,
538 ; int height, int mx, int my)
539 ; ******************************
; Copy-only MC (integer-pel motion): emits three functions per width %1 /
; bit depth %2 -- put (int16_t intermediate out), uni (direct pixel out) and
; bi (averaged with a second int16_t source).
541 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
; put: dst is the 14-bit intermediate buffer (row pitch 2*MAX_PB_SIZE bytes).
542 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
545 SIMPLE_LOAD %1, %2, srcq, m0
546 MC_PIXEL_COMPUTE %1, %2
547 PEL_10STORE%1 dstq, m0, m1
548 LOOP_END dst, src, srcstride
; uni: plain copy to the destination picture.
551 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
553 SIMPLE_LOAD %1, %2, srcq, m0
554 PEL_%2STORE%1 dstq, m0, m1
555 add dstq, dststrideq ; dst += dststride
556 add srcq, srcstrideq ; src += srcstride
557 dec heightd ; height--
558 jnz .loop ; height loop
; bi: average the copied pixels with the int16_t source2 buffer.
561 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
563 movdqa m5, [pw_bi_%2]
565 SIMPLE_LOAD %1, %2, srcq, m0
566 SIMPLE_BILOAD %1, src2q, m3, m4
567 MC_PIXEL_COMPUTE %1, %2
568 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
569 PEL_%2STORE%1 dstq, m0, m1
570 add dstq, dststrideq ; dst += dststride
571 add srcq, srcstrideq ; src += srcstride
572 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
573 dec heightd ; height--
574 jnz .loop ; height loop
580 ; ******************************
581 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
582 ; uint8_t *_src, ptrdiff_t _srcstride,
583 ; int width, int height, int mx, int my,
585 ; ******************************
; 4-tap (chroma) horizontal and vertical filters; put/uni/bi variants per
; width %1 and bit depth %2.  %%stride is the bytes per sample, so
; srcq-%%stride starts the horizontal load one sample left of x.
588 %macro HEVC_PUT_HEVC_EPEL 2
589 cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rfilter
590 %assign %%stride ((%2 + 7)/8)
591 EPEL_FILTER %2, mx, m4, m5
593 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
594 EPEL_COMPUTE %2, %1, m4, m5
595 PEL_10STORE%1 dstq, m0, m1
596 LOOP_END dst, src, srcstride
; uni: filter, scale back to pixels (UNI_COMPUTE) and store directly.
599 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 11, dst, dststride, src, srcstride, height, mx, rfilter
600 %assign %%stride ((%2 + 7)/8)
602 EPEL_FILTER %2, mx, m4, m5
604 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
605 EPEL_COMPUTE %2, %1, m4, m5
606 UNI_COMPUTE %1, %2, m0, m1, m6
607 PEL_%2STORE%1 dstq, m0, m1
608 add dstq, dststrideq ; dst += dststride
609 add srcq, srcstrideq ; src += srcstride
610 dec heightd ; height--
611 jnz .loop ; height loop
; bi: filter and average with the int16_t source2 buffer.
614 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride, src2, height, mx, rfilter
615 movdqa m6, [pw_bi_%2]
616 EPEL_FILTER %2, mx, m4, m5
618 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
619 EPEL_COMPUTE %2, %1, m4, m5
620 SIMPLE_BILOAD %1, src2q, m2, m3
621 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
622 PEL_%2STORE%1 dstq, m0, m1
623 add dstq, dststrideq ; dst += dststride
624 add srcq, srcstrideq ; src += srcstride
625 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
626 dec heightd ; height--
627 jnz .loop ; height loop
630 ; ******************************
631 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
632 ; uint8_t *_src, ptrdiff_t _srcstride,
633 ; int width, int height, int mx, int my,
635 ; ******************************
; Vertical variants: taps are rows spaced srcstride apart (r3srcq caches
; 3*srcstride for EPEL_LOAD); filter index comes from my instead of mx.
637 cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src, my, rfilter
638 lea r3srcq, [srcstrideq*3]
640 EPEL_FILTER %2, my, m4, m5
642 EPEL_LOAD %2, srcq, srcstride, %1
643 EPEL_COMPUTE %2, %1, m4, m5
644 PEL_10STORE%1 dstq, m0, m1
645 LOOP_END dst, src, srcstride
648 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 11, dst, dststride, src, srcstride, height, r3src, my, rfilter
649 lea r3srcq, [srcstrideq*3]
652 EPEL_FILTER %2, my, m4, m5
654 EPEL_LOAD %2, srcq, srcstride, %1
655 EPEL_COMPUTE %2, %1, m4, m5
656 UNI_COMPUTE %1, %2, m0, m1, m6
657 PEL_%2STORE%1 dstq, m0, m1
658 add dstq, dststrideq ; dst += dststride
659 add srcq, srcstrideq ; src += srcstride
660 dec heightd ; height--
661 jnz .loop ; height loop
665 cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
666 lea r3srcq, [srcstrideq*3]
667 movdqa m6, [pw_bi_%2]
669 EPEL_FILTER %2, my, m4, m5
671 EPEL_LOAD %2, srcq, srcstride, %1
672 EPEL_COMPUTE %2, %1, m4, m5
673 SIMPLE_BILOAD %1, src2q, m2, m3
674 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
675 PEL_%2STORE%1 dstq, m0, m1
676 add dstq, dststrideq ; dst += dststride
677 add srcq, srcstrideq ; src += srcstride
678 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
679 dec heightd ; height--
680 jnz .loop ; height loop
685 ; ******************************
686 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
687 ; uint8_t *_src, ptrdiff_t _srcstride,
688 ; int width, int height, int mx, int my)
689 ; ******************************
; Two-pass 4-tap filter: four horizontal passes (taps m14/m15 set up by
; EPEL_HV_FILTER) produce the rows of the 14-bit intermediate, which are then
; filtered vertically with the word taps in m12/m13 (EPEL_COMPUTE 14).
691 %macro HEVC_PUT_HEVC_EPEL_HV 2
692 cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx, my, r3src, rfilter
693 %assign %%stride ((%2 + 7)/8)
696 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
697 EPEL_COMPUTE %2, %1, m14, m15
700 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
701 EPEL_COMPUTE %2, %1, m14, m15
704 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
705 EPEL_COMPUTE %2, %1, m14, m15
709 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
710 EPEL_COMPUTE %2, %1, m14, m15
; second pass: vertical filter over the four intermediate rows
718 EPEL_COMPUTE 14, %1, m12, m13
719 PEL_10STORE%1 dstq, m0, m1
723 LOOP_END dst, src, srcstride
726 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
727 %assign %%stride ((%2 + 7)/8)
730 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
731 EPEL_COMPUTE %2, %1, m14, m15
734 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
735 EPEL_COMPUTE %2, %1, m14, m15
738 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
739 EPEL_COMPUTE %2, %1, m14, m15
743 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
744 EPEL_COMPUTE %2, %1, m14, m15
752 EPEL_COMPUTE 14, %1, m12, m13
753 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
754 PEL_%2STORE%1 dstq, m0, m1
758 add dstq, dststrideq ; dst += dststride
759 add srcq, srcstrideq ; src += srcstride
760 dec heightd ; height--
761 jnz .loop ; height loop
765 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
766 %assign %%stride ((%2 + 7)/8)
769 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
770 EPEL_COMPUTE %2, %1, m14, m15
773 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
774 EPEL_COMPUTE %2, %1, m14, m15
777 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
778 EPEL_COMPUTE %2, %1, m14, m15
782 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
783 EPEL_COMPUTE %2, %1, m14, m15
791 EPEL_COMPUTE 14, %1, m12, m13
792 SIMPLE_BILOAD %1, src2q, m8, m9
793 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
794 PEL_%2STORE%1 dstq, m0, m1
798 add dstq, dststrideq ; dst += dststride
799 add srcq, srcstrideq ; src += srcstride
800 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
801 dec heightd ; height--
802 jnz .loop ; height loop
806 ; ******************************
807 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
808 ; uint8_t *_src, ptrdiff_t _srcstride,
809 ; int width, int height, int mx, int my)
810 ; ******************************
; 8-tap (luma) horizontal and vertical filters; put/uni/bi variants per
; width %1 and bit depth %2.  The last QPEL_H_LOAD argument is the scratch
; register index used by the SBUTTERFLY interleaves.
812 %macro HEVC_PUT_HEVC_QPEL 2
813 cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
816 QPEL_H_LOAD %2, srcq, %1, 10
821 PEL_10STORE%1 dstq, m0, m1
822 LOOP_END dst, src, srcstride
825 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
829 QPEL_H_LOAD %2, srcq, %1, 10
834 UNI_COMPUTE %1, %2, m0, m1, m9
835 PEL_%2STORE%1 dstq, m0, m1
836 add dstq, dststrideq ; dst += dststride
837 add srcq, srcstrideq ; src += srcstride
838 dec heightd ; height--
839 jnz .loop ; height loop
842 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
843 movdqa m9, [pw_bi_%2]
846 QPEL_H_LOAD %2, srcq, %1, 10
851 SIMPLE_BILOAD %1, src2q, m10, m11
852 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
853 PEL_%2STORE%1 dstq, m0, m1
854 add dstq, dststrideq ; dst += dststride
855 add srcq, srcstrideq ; src += srcstride
856 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
857 dec heightd ; height--
858 jnz .loop ; height loop
862 ; ******************************
863 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
864 ; uint8_t *_src, ptrdiff_t _srcstride,
865 ; int width, int height, int mx, int my)
866 ; ******************************
; Vertical variants: QPEL_V_LOAD's last argument is the register holding the
; x-3*srcstride base pointer; r3srcq caches 3*srcstride.
868 cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
869 lea r3srcq, [srcstrideq*3]
872 QPEL_V_LOAD %2, srcq, srcstride, %1, r7
877 PEL_10STORE%1 dstq, m0, m1
878 LOOP_END dst, src, srcstride
881 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
883 lea r3srcq, [srcstrideq*3]
886 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
891 UNI_COMPUTE %1, %2, m0, m1, m9
892 PEL_%2STORE%1 dstq, m0, m1
893 add dstq, dststrideq ; dst += dststride
894 add srcq, srcstrideq ; src += srcstride
895 dec heightd ; height--
896 jnz .loop ; height loop
899 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
900 movdqa m9, [pw_bi_%2]
901 lea r3srcq, [srcstrideq*3]
904 SIMPLE_BILOAD %1, src2q, m10, m11
905 QPEL_V_LOAD %2, srcq, srcstride, %1, r9
910 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
911 PEL_%2STORE%1 dstq, m0, m1
912 add dstq, dststrideq ; dst += dststride
913 add srcq, srcstrideq ; src += srcstride
914 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
915 dec heightd ; height--
916 jnz .loop ; height loop
921 ; ******************************
922 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
923 ; uint8_t *_src, ptrdiff_t _srcstride,
924 ; int height, int mx, int my)
925 ; ******************************
; Two-pass 8-tap filter: eight horizontal passes (one per tap row, advancing
; srcq each time) fill eight intermediate row registers, whose words are then
; re-interleaved (punpck*wd) and filtered vertically at 14-bit precision with
; the my filter.  The 4th QPEL_HV_COMPUTE argument names the pack instruction.
926 %macro HEVC_PUT_HEVC_QPEL_HV 2
927 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
930 lea r3srcq, [srcstrideq*3]
932 QPEL_H_LOAD %2, srcq, %1, 15
933 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
936 QPEL_H_LOAD %2, srcq, %1, 15
937 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
940 QPEL_H_LOAD %2, srcq, %1, 15
941 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
944 QPEL_H_LOAD %2, srcq, %1, 15
945 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
948 QPEL_H_LOAD %2, srcq, %1, 15
949 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
952 QPEL_H_LOAD %2, srcq, %1, 15
953 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
956 QPEL_H_LOAD %2, srcq, %1, 15
957 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
961 QPEL_H_LOAD %2, srcq, %1, 15
962 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave the intermediate rows pairwise for the vertical pmaddwd pass
965 punpcklwd m2, m10, m11
966 punpcklwd m4, m12, m13
967 punpcklwd m6, m14, m15
970 punpckhwd m3, m10, m11
971 punpckhwd m5, m12, m13
972 punpckhwd m7, m14, m15
974 QPEL_HV_COMPUTE %1, 14, my, ackssdw
975 PEL_10STORE%1 dstq, m0, m1
993 LOOP_END dst, src, srcstride
996 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
999 lea r3srcq, [srcstrideq*3]
1001 QPEL_H_LOAD %2, srcq, %1, 15
1002 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1004 add srcq, srcstrideq
1005 QPEL_H_LOAD %2, srcq, %1, 15
1006 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1008 add srcq, srcstrideq
1009 QPEL_H_LOAD %2, srcq, %1, 15
1010 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1012 add srcq, srcstrideq
1013 QPEL_H_LOAD %2, srcq, %1, 15
1014 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1016 add srcq, srcstrideq
1017 QPEL_H_LOAD %2, srcq, %1, 15
1018 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1020 add srcq, srcstrideq
1021 QPEL_H_LOAD %2, srcq, %1, 15
1022 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1024 add srcq, srcstrideq
1025 QPEL_H_LOAD %2, srcq, %1, 15
1026 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1028 add srcq, srcstrideq
1030 QPEL_H_LOAD %2, srcq, %1, 15
1031 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1033 punpcklwd m0, m8, m9
1034 punpcklwd m2, m10, m11
1035 punpcklwd m4, m12, m13
1036 punpcklwd m6, m14, m15
1038 punpckhwd m1, m8, m9
1039 punpckhwd m3, m10, m11
1040 punpckhwd m5, m12, m13
1041 punpckhwd m7, m14, m15
; NOTE(review): this uni path packs with p'ackusdw' (unsigned saturation)
; while the put/bi paths use 'ackssdw' -- confirm this asymmetry is intended.
1043 QPEL_HV_COMPUTE %1, 14, my, ackusdw
1044 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1045 PEL_%2STORE%1 dstq, m0, m1
1064 add dstq, dststrideq ; dst += dststride
1065 add srcq, srcstrideq ; src += srcstride
1066 dec heightd ; height--
1067 jnz .loop ; height loop
1070 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1073 lea r3srcq, [srcstrideq*3]
1075 QPEL_H_LOAD %2, srcq, %1, 15
1076 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1078 add srcq, srcstrideq
1079 QPEL_H_LOAD %2, srcq, %1, 15
1080 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1082 add srcq, srcstrideq
1083 QPEL_H_LOAD %2, srcq, %1, 15
1084 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1086 add srcq, srcstrideq
1087 QPEL_H_LOAD %2, srcq, %1, 15
1088 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1090 add srcq, srcstrideq
1091 QPEL_H_LOAD %2, srcq, %1, 15
1092 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1094 add srcq, srcstrideq
1095 QPEL_H_LOAD %2, srcq, %1, 15
1096 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1098 add srcq, srcstrideq
1099 QPEL_H_LOAD %2, srcq, %1, 15
1100 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1102 add srcq, srcstrideq
1104 QPEL_H_LOAD %2, srcq, %1, 15
1105 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1107 punpcklwd m0, m8, m9
1108 punpcklwd m2, m10, m11
1109 punpcklwd m4, m12, m13
1110 punpcklwd m6, m14, m15
1112 punpckhwd m1, m8, m9
1113 punpckhwd m3, m10, m11
1114 punpckhwd m5, m12, m13
1115 punpckhwd m7, m14, m15
1117 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1118 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1119 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1120 PEL_%2STORE%1 dstq, m0, m1
1139 add dstq, dststrideq ; dst += dststride
1140 add srcq, srcstrideq ; src += srcstride
1141 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride (fixed MAX_PB_SIZE int16_t pitch)
1142 dec heightd ; height--
1143 jnz .loop ; height loop
; Explicit weighted prediction: uni_w applies (x*wx + round) >> shift + ox to
; one int16_t source, bi_w combines two sources with their own weights and
; offsets.  shift = 14 - bitdepth + denom; ox is pre-scaled by (bitdepth - 8).
; On WIN64/x86-32 not all parameters fit in registers, hence the alternate
; cglobal prototypes.
1147 %macro WEIGHTING_FUNCS 2
1148 %if WIN64 || ARCH_X86_32
1149 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1153 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1154 %define SHIFT denomd
1156 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1161 movd m4, SHIFT ; shift
1168 movdqu m5, [one_per_32]
1174 shl SHIFT, %2-8 ; ox << (bitd - 8)
1178 %if WIN64 || ARCH_X86_32
; source is the 14-bit intermediate buffer, so load as 10(+)-bit words
1182 SIMPLE_LOAD %1, 10, srcq, m0
1192 punpckhwd m1, m0, m6
; clamp the weighted result to the valid sample range
1205 pminsw m0, [max_pixels_%2]
1208 PEL_%2STORE%1 dstq, m0, m1
1209 add dstq, dststrideq ; dst += dststride
1210 add srcq, 2*MAX_PB_SIZE ; src += srcstride (fixed MAX_PB_SIZE int16_t pitch)
1211 dec heightd ; height--
1212 jnz .loop ; height loop
; bi_w: weighted bi-prediction, wx0/ox0 applied to src2 and wx1/ox1 to src,
; combined with shift+1 (the extra bit from summing two predictions).
1215 cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
1221 lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
1223 movd m0, r6d ; shift
1232 movd m5, r6d ; shift+1
1238 shl r6d, %2-8 ; ox << (bitd - 8)
1241 movd m4, r6d ; offset
1247 SIMPLE_LOAD %1, 10, srcq, m0
1248 SIMPLE_LOAD %1, 10, src2q, m8
1262 punpckhwd m1, m0, m6
1264 punpckhwd m9, m8, m7
1277 pminsw m0, [max_pixels_%2]
1280 PEL_%2STORE%1 dstq, m0, m1
1281 add dstq, dststrideq ; dst += dststride
1282 add srcq, 2*MAX_PB_SIZE ; src += srcstride (fixed MAX_PB_SIZE int16_t pitch)
1283 add src2q, 2*MAX_PB_SIZE ; src2 += src2stride
1284 dec r6d ; height-- (height kept in r6d here)
1285 jnz .loop ; height loop
; ---- Instantiate all function variants: one expansion per (width, bitdepth).
; Weighted-prediction functions, widths 2-8 at 8/10/12 bit.
1289 WEIGHTING_FUNCS 2, 8
1290 WEIGHTING_FUNCS 4, 8
1291 WEIGHTING_FUNCS 6, 8
1292 WEIGHTING_FUNCS 8, 8
1294 WEIGHTING_FUNCS 2, 10
1295 WEIGHTING_FUNCS 4, 10
1296 WEIGHTING_FUNCS 6, 10
1297 WEIGHTING_FUNCS 8, 10
1299 WEIGHTING_FUNCS 2, 12
1300 WEIGHTING_FUNCS 4, 12
1301 WEIGHTING_FUNCS 6, 12
1302 WEIGHTING_FUNCS 8, 12
; Integer-pel copy functions (wider widths only exist for 8-bit).
1304 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1305 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1306 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1307 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1308 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1309 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1311 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1312 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1313 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1314 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1316 HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1317 HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1318 HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1319 HEVC_PUT_HEVC_PEL_PIXELS 8, 12
; 4-tap chroma h/v filters.
1321 HEVC_PUT_HEVC_EPEL 2, 8
1322 HEVC_PUT_HEVC_EPEL 4, 8
1323 HEVC_PUT_HEVC_EPEL 6, 8
1324 HEVC_PUT_HEVC_EPEL 8, 8
1325 HEVC_PUT_HEVC_EPEL 12, 8
1326 HEVC_PUT_HEVC_EPEL 16, 8
1329 HEVC_PUT_HEVC_EPEL 2, 10
1330 HEVC_PUT_HEVC_EPEL 4, 10
1331 HEVC_PUT_HEVC_EPEL 6, 10
1332 HEVC_PUT_HEVC_EPEL 8, 10
1334 HEVC_PUT_HEVC_EPEL 2, 12
1335 HEVC_PUT_HEVC_EPEL 4, 12
1336 HEVC_PUT_HEVC_EPEL 6, 12
1337 HEVC_PUT_HEVC_EPEL 8, 12
; 4-tap chroma 2-D (hv) filters.
1339 HEVC_PUT_HEVC_EPEL_HV 2, 8
1340 HEVC_PUT_HEVC_EPEL_HV 4, 8
1341 HEVC_PUT_HEVC_EPEL_HV 6, 8
1342 HEVC_PUT_HEVC_EPEL_HV 8, 8
1344 HEVC_PUT_HEVC_EPEL_HV 2, 10
1345 HEVC_PUT_HEVC_EPEL_HV 4, 10
1346 HEVC_PUT_HEVC_EPEL_HV 6, 10
1347 HEVC_PUT_HEVC_EPEL_HV 8, 10
1349 HEVC_PUT_HEVC_EPEL_HV 2, 12
1350 HEVC_PUT_HEVC_EPEL_HV 4, 12
1351 HEVC_PUT_HEVC_EPEL_HV 6, 12
1352 HEVC_PUT_HEVC_EPEL_HV 8, 12
; 8-tap luma h/v filters.
1354 HEVC_PUT_HEVC_QPEL 4, 8
1355 HEVC_PUT_HEVC_QPEL 8, 8
1356 HEVC_PUT_HEVC_QPEL 12, 8
1357 HEVC_PUT_HEVC_QPEL 16, 8
1359 HEVC_PUT_HEVC_QPEL 4, 10
1360 HEVC_PUT_HEVC_QPEL 8, 10
1362 HEVC_PUT_HEVC_QPEL 4, 12
1363 HEVC_PUT_HEVC_QPEL 8, 12
; 8-tap luma 2-D (hv) filters.
1365 HEVC_PUT_HEVC_QPEL_HV 2, 8
1366 HEVC_PUT_HEVC_QPEL_HV 4, 8
1367 HEVC_PUT_HEVC_QPEL_HV 6, 8
1368 HEVC_PUT_HEVC_QPEL_HV 8, 8
1370 HEVC_PUT_HEVC_QPEL_HV 2, 10
1371 HEVC_PUT_HEVC_QPEL_HV 4, 10
1372 HEVC_PUT_HEVC_QPEL_HV 6, 10
1373 HEVC_PUT_HEVC_QPEL_HV 8, 10
1375 HEVC_PUT_HEVC_QPEL_HV 2, 12
1376 HEVC_PUT_HEVC_QPEL_HV 4, 12
1377 HEVC_PUT_HEVC_QPEL_HV 6, 12
1378 HEVC_PUT_HEVC_QPEL_HV 8, 12
1380 %endif ; ARCH_X86_64