2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Rounding / bias / clip constants for the MC kernels below.
; NOTE(review): interior lines are elided in this view (gaps in original
; numbering) — the pw_8/pw_12 rows and section directives are not visible.
25 pw_10: times 8 dw 2048
26 pw_bi_8: times 8 dw 256
27 pw_bi_10: times 8 dw 1024
; max_pixels_10 = (1 << 10) - 1; used by UNI_COMPUTE/weighting pminsw clamps.
28 max_pixels_10: times 8 dw 1023
; presumably a rounding bias for dword accumulators — use not visible here; verify.
30 one_per_32: times 4 dd 1
; EPEL/QPEL coefficient tables. The %macro EPEL_TABLE/QPEL_TABLE headers and
; the remaining filter rows are elided in this view; only the first row of
; taps is visible (-2,58 for EPEL; -1,4 for QPEL), matching the leading taps
; of the HEVC interpolation filters — TODO confirm against full file.
34 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
52 EPEL_TABLE 8, 8, b, sse4
53 EPEL_TABLE 10, 4, w, sse4
56 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
70 QPEL_TABLE 8, 8, b, sse4
71 QPEL_TABLE 10, 4, w, sse4
; 14-bit intermediates reuse the 10-bit (word) coefficient table.
73 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
; Load 16-bit src2 (bi-prediction reference) samples into %3 (and %4 for
; widths needing >16 bytes). The %if/%elif width dispatch between these
; alternative loads is elided in this view.
77 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
79 movq %3, [%2] ; load data from source2
81 movdqa %3, [%2] ; load data from source2
83 movdqa %3, [%2] ; load data from source2
84 movq %4, [%2+16] ; load data from source2
86 movdqa %3, [%2] ; load data from source2
87 movdqa %4, [%2+16] ; load data from source2
; Load source pixels sized by width*bytes-per-pixel: movd (4B) / movq (8B) /
; movdqu (16B, unaligned). The %else arm and %endif are elided in this view.
91 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
92 %if %1 == 2 || (%2 == 8 && %1 <= 4)
93 movd %4, [%3] ; load data from source
94 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
95 movq %4, [%3] ; load data from source
97 movdqu %4, [%3] ; load data from source
; Like SIMPLE_BILOAD but with explicit bit-depth dispatch; src2 rows are
; 16-bit, aligned (movdqa). Later %elif/%else arms are elided in this view.
101 %macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
102 %if %1 == 2 || (%2 == 8 && %1 <= 4)
103 movq %4, [%3] ; load data from source2
104 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
105 movdqa %4, [%3] ; load data from source2
107 movdqa %4, [%3] ; load data from source2
108 movq %5, [%3+16] ; load data from source2
110 movdqa %4, [%3] ; load data from source2
111 movdqa %5, [%3+16] ; load data from source2
; Fetch the 4-tap EPEL filter row selected by %2 (mx or my) into m14/m15,
; or into caller-supplied regs %3/%4 when given. Each row is 32 bytes, hence
; the shl by 5. PIC vs non-PIC address setup branches are elided in this view.
115 %macro EPEL_FILTER 2-4 ; bit depth, filter index
117 lea rfilterq, [hevc_epel_filters_sse4_%1]
119 %define rfilterq hevc_epel_filters_sse4_%1
122 shl %2q, 5 ; multiply by 32
124 movdqa m14, [rfilterq + %2q] ; get 2 first values of filters
125 movdqa m15, [rfilterq + %2q+16] ; get 2 last values of filters
127 movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
128 movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
; Load both EPEL filter rows for an HV pass: horizontal (mx) taps -> m14/m15,
; vertical (my) taps -> m12/m13 (vertical always uses the word-coefficient
; 10-bit table since it filters 16-bit intermediates). Also precomputes
; r3src = 3*srcstride. %if/%else PIC branches are elided in this view.
132 %macro EPEL_HV_FILTER 1
134 lea rfilterq, [hevc_epel_filters_sse4_%1]
136 %define rfilterq hevc_epel_filters_sse4_%1
140 shl mxq, 5 ; multiply by 32
141 shl myq, 5 ; multiply by 32
142 movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
143 movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
144 lea r3srcq, [srcstrideq*3]
147 lea rfilterq, [hevc_epel_filters_sse4_10]
149 %define rfilterq hevc_epel_filters_sse4_10
151 movdqa m12, [rfilterq + myq] ; get 2 first values of filters
152 movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
; QPEL_FILTER body fragment (its %macro header is elided in this view):
; loads the four 16-byte chunks of the 64-byte 8-tap filter row selected by
; %2 into m12..m15.
157 lea rfilterq, [hevc_qpel_filters_sse4_%1]
159 %define rfilterq hevc_qpel_filters_sse4_%1
162 movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
163 movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 first values of filters
164 movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 first values of filters
165 movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
; EPEL_LOAD body fragment (%macro header elided in this view): loads 4 rows
; (x .. x+3*stride) for vertical filtering — constant-stride arm vs
; register-stride arm — then interleaves pairs with SBUTTERFLY so pmaddubsw/
; pmaddwd can apply 2 taps per instruction.
; NOTE(review): "rfilterq" here appears to alias the source pointer via a
; %define not visible in this view — confirm against full file.
174 movdqu m0, [rfilterq ] ;load 128bit of x
176 movdqu m1, [rfilterq+ %3] ;load 128bit of x+stride
177 movdqu m2, [rfilterq+2*%3] ;load 128bit of x+2*stride
178 movdqu m3, [rfilterq+3*%3] ;load 128bit of x+3*stride
180 movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
181 movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
182 movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+3*stride
187 SBUTTERFLY bw, 0, 1, 10
188 SBUTTERFLY bw, 2, 3, 10
195 SBUTTERFLY wd, 0, 1, 10
196 SBUTTERFLY wd, 2, 3, 10
; QPEL_H_LOAD body fragment (%macro header elided in this view): loads the
; 8 horizontal taps' worth of data around the current pixel (x-3*stride ..
; x+4*stride, stride = bytes per pixel) into m0..m7, then interleaves for
; pmadd. NOTE(review): the load of m3 ([%2], the center row) falls in an
; elided line — it is not missing from the real file; do not "fix" here.
206 %assign %%stride (%1+7)/8
213 %define %%load movdqu
221 %define %%load movdqu
224 %%load m0, [%2-3*%%stride] ;load data from source
225 %%load m1, [%2-2*%%stride]
226 %%load m2, [%2-%%stride ]
228 %%load m4, [%2+%%stride ]
229 %%load m5, [%2+2*%%stride]
230 %%load m6, [%2+3*%%stride]
231 %%load m7, [%2+4*%%stride]
; 8-bit path: interleave bytes->words (elided); 10-bit path below
; interleaves words->dwords for pmaddwd.
235 SBUTTERFLY wd, 0, 1, %4
236 SBUTTERFLY wd, 2, 3, %4
237 SBUTTERFLY wd, 4, 5, %4
238 SBUTTERFLY wd, 6, 7, %4
247 SBUTTERFLY dq, 0, 1, %4
248 SBUTTERFLY dq, 2, 3, %4
249 SBUTTERFLY dq, 4, 5, %4
250 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD body fragment (%macro header and r12 setup elided in this
; view): loads 8 vertically adjacent rows (x-3*stride .. x+4*stride) into
; m0..m7, then interleaves pairs for the 8-tap vertical pmadd.
263 movdqu m0, [r12 ] ;load x- 3*srcstride
264 movdqu m1, [r12+ %3q ] ;load x- 2*srcstride
265 movdqu m2, [r12+ 2*%3q ] ;load x-srcstride
266 movdqu m3, [%2 ] ;load x
267 movdqu m4, [%2+ %3q] ;load x+stride
268 movdqu m5, [%2+ 2*%3q] ;load x+2*stride
269 movdqu m6, [%2+r3srcq] ;load x+3*stride
270 movdqu m7, [%2+ 4*%3q] ;load x+4*stride
273 SBUTTERFLY bw, 0, 1, 8
274 SBUTTERFLY bw, 2, 3, 8
275 SBUTTERFLY bw, 4, 5, 8
276 SBUTTERFLY bw, 6, 7, 8
285 SBUTTERFLY wd, 0, 1, 8
286 SBUTTERFLY wd, 2, 3, 8
287 SBUTTERFLY wd, 4, 5, 8
288 SBUTTERFLY wd, 6, 7, 8
; Per-width store macros for 16-bit (10STORE) and 8-bit (8STORE) output;
; bodies largely elided in this view. PEL_10STORE16 stores the low 8 then
; (elided) the high 8. The trailing four lines are the LOOP_END macro body
; (header elided): dst advances by 2*dststride because dst is int16_t.
298 %macro PEL_10STORE2 3
301 %macro PEL_10STORE4 3
304 %macro PEL_10STORE6 3
309 %macro PEL_10STORE8 3
312 %macro PEL_10STORE12 3
316 %macro PEL_10STORE16 3
317 PEL_10STORE8 %1, %2, %3
334 %macro PEL_8STORE12 3
339 %macro PEL_8STORE16 3
344 lea %1q, [%1q+2*%2q] ; dst += dststride
345 lea %3q, [%3q+ %4q] ; src += srcstride
346 dec heightd ; cmp height
347 jnz .loop ; height loop
; Filtering/combining macros (bodies heavily elided in this view).
; EPEL_COMPUTE: 4-tap multiply-accumulate, 2 taps per pmaddubsw.
351 %macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
363 %macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
365 pmaddubsw m0, %3 ;x1*c1+x2*c2
366 pmaddubsw m2, %4 ;x3*c3+x4*c4
; QPEL_HV_COMPUTE: 8-tap MAC reading coefficients straight from memory;
; %4 names the pack op suffix (p%4 -> packssdw/packusdw) in elided lines.
390 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
392 lea rfilterq, [hevc_qpel_filters_sse4_%2]
394 %define rfilterq hevc_qpel_filters_sse4_%2
398 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
399 pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
400 pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
401 pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
406 pmaddwd m0, [rfilterq + %3q*8 ]
407 pmaddwd m2, [rfilterq + %3q*8+16]
408 pmaddwd m4, [rfilterq + %3q*8+32]
409 pmaddwd m6, [rfilterq + %3q*8+48]
417 pmaddwd m1, [rfilterq + %3q*8 ]
418 pmaddwd m3, [rfilterq + %3q*8+16]
419 pmaddwd m5, [rfilterq + %3q*8+32]
420 pmaddwd m7, [rfilterq + %3q*8+48]
; QPEL_COMPUTE: same 8-tap MAC but with coefficients preloaded in m12-m15.
432 %macro QPEL_COMPUTE 2 ; width, bitdepth
434 pmaddubsw m0, m12 ;x1*c1+x2*c2
435 pmaddubsw m2, m13 ;x3*c3+x4*c4
436 pmaddubsw m4, m14 ;x5*c5+x6*c6
437 pmaddubsw m6, m15 ;x7*c7+x8*c8
; BI_COMPUTE: average filtered samples with src2, then round/shift/clamp via
; UNI_COMPUTE; the pminsw lines clamp to max_pixels_%2 for >8-bit output.
476 %macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
481 UNI_COMPUTE %1, %2, %3, %4, %7
486 %if %1 > 8 || (%2 > 8 && %1 > 4)
492 pminsw %3, [max_pixels_%2]
495 pminsw %4, [max_pixels_%2]
501 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
502 ; ******************************
503 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
504 ; uint8_t *_src, ptrdiff_t _srcstride,
505 ; int height, int mx, int my)
506 ; ******************************
; Full-pel copies: put (-> int16_t dst), uni (-> pixel dst), bi (average
; with src2 -> pixel dst). Loop labels and %endmacro are elided in this view.
508 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
509 cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
512 SIMPLE_LOAD %1, %2, srcq, m0
513 MC_PIXEL_COMPUTE %1, %2
514 PEL_10STORE%1 dstq, m0, m1
515 LOOP_END dst, dststride, src, srcstride
518 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
521 SIMPLE_LOAD %1, %2, srcq, m0
522 PEL_%2STORE%1 dstq, m0, m1
523 lea dstq, [dstq+dststrideq] ; dst += dststride
524 lea srcq, [srcq+srcstrideq] ; src += srcstride
525 dec heightd ; cmp height
526 jnz .loop ; height loop
529 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
531 movdqa m5, [pw_bi_%2]
533 SIMPLE_LOAD %1, %2, srcq, m0
534 SIMPLE_BILOAD %1, src2q, m3, m4
535 MC_PIXEL_COMPUTE %1, %2
536 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
537 PEL_%2STORE%1 dstq, m0, m1
538 lea dstq, [dstq+dststrideq] ; dst += dststride
539 lea srcq, [srcq+srcstrideq] ; src += srcstride
540 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
541 dec heightd ; cmp height
542 jnz .loop ; height loop
548 ; ******************************
549 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
550 ; uint8_t *_src, ptrdiff_t _srcstride,
551 ; int width, int height, int mx, int my,
553 ; ******************************
; 4-tap horizontal chroma filter, put/uni/bi variants. The source pointer is
; backed up by %%stride (bytes per pixel) so the taps straddle the pixel.
; Loop labels and %endmacro are elided in this view.
556 %macro HEVC_PUT_HEVC_EPEL 2
557 cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
558 %assign %%stride ((%2 + 7)/8)
559 EPEL_FILTER %2, mx, m4, m5
561 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
562 EPEL_COMPUTE %2, %1, m4, m5
563 PEL_10STORE%1 dstq, m0, m1
564 LOOP_END dst, dststride, src, srcstride
567 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
568 %assign %%stride ((%2 + 7)/8)
570 EPEL_FILTER %2, mx, m4, m5
572 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
573 EPEL_COMPUTE %2, %1, m4, m5
574 UNI_COMPUTE %1, %2, m0, m1, m6
575 PEL_%2STORE%1 dstq, m0, m1
576 lea dstq, [dstq+dststrideq] ; dst += dststride
577 lea srcq, [srcq+srcstrideq] ; src += srcstride
578 dec heightd ; cmp height
579 jnz .loop ; height loop
582 cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx, rfilter
583 movdqa m6, [pw_bi_%2]
584 EPEL_FILTER %2, mx, m4, m5
586 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
587 EPEL_COMPUTE %2, %1, m4, m5
588 SIMPLE_BILOAD %1, src2q, m2, m3
589 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
590 PEL_%2STORE%1 dstq, m0, m1
591 lea dstq, [dstq+dststrideq] ; dst += dststride
592 lea srcq, [srcq+srcstrideq] ; src += srcstride
593 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
594 dec heightd ; cmp height
595 jnz .loop ; height loop
598 ; ******************************
599 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
600 ; uint8_t *_src, ptrdiff_t _srcstride,
601 ; int width, int height, int mx, int my,
603 ; ******************************
; 4-tap vertical chroma filter. r3src = 3*srcstride lets EPEL_LOAD reach the
; fourth row in one addressing mode. Loop labels elided in this view.
605 cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
606 lea r3srcq, [srcstrideq*3]
608 EPEL_FILTER %2, my, m4, m5
610 EPEL_LOAD %2, srcq, srcstride, %1
611 EPEL_COMPUTE %2, %1, m4, m5
612 PEL_10STORE%1 dstq, m0, m1
613 LOOP_END dst, dststride, src, srcstride
616 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
617 lea r3srcq, [srcstrideq*3]
620 EPEL_FILTER %2, my, m4, m5
622 EPEL_LOAD %2, srcq, srcstride, %1
623 EPEL_COMPUTE %2, %1, m4, m5
624 UNI_COMPUTE %1, %2, m0, m1, m6
625 PEL_%2STORE%1 dstq, m0, m1
626 lea dstq, [dstq+dststrideq] ; dst += dststride
627 lea srcq, [srcq+srcstrideq] ; src += srcstride
628 dec heightd ; cmp height
629 jnz .loop ; height loop
633 cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my, rfilter
634 lea r3srcq, [srcstrideq*3]
635 movdqa m6, [pw_bi_%2]
637 EPEL_FILTER %2, my, m4, m5
639 EPEL_LOAD %2, srcq, srcstride, %1
640 EPEL_COMPUTE %2, %1, m4, m5
641 SIMPLE_BILOAD %1, src2q, m2, m3
642 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
643 PEL_%2STORE%1 dstq, m0, m1
644 lea dstq, [dstq+dststrideq] ; dst += dststride
645 lea srcq, [srcq+srcstrideq] ; src += srcstride
646 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
647 dec heightd ; cmp height
648 jnz .loop ; height loop
653 ; ******************************
654 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
655 ; uint8_t *_src, ptrdiff_t _srcstride,
656 ; int width, int height, int mx, int my)
657 ; ******************************
; Separable 4-tap HV chroma filter: 4 horizontal passes (m14/m15 taps) prime
; the vertical window, then the loop (label elided) filters vertically with
; m12/m13 at 14-bit intermediate depth. Intermediate-register shuffling and
; %endmacro lines are elided in this view.
659 %macro HEVC_PUT_HEVC_EPEL_HV 2
660 cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
661 %assign %%stride ((%2 + 7)/8)
664 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
665 EPEL_COMPUTE %2, %1, m14, m15
667 lea srcq, [srcq + srcstrideq]
668 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
669 EPEL_COMPUTE %2, %1, m14, m15
671 lea srcq, [srcq + srcstrideq]
672 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
673 EPEL_COMPUTE %2, %1, m14, m15
675 lea srcq, [srcq + srcstrideq]
677 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
678 EPEL_COMPUTE %2, %1, m14, m15
; vertical pass over the 14-bit horizontal results
686 EPEL_COMPUTE 14, %1, m12, m13
687 PEL_10STORE%1 dstq, m0, m1
691 LOOP_END dst, dststride, src, srcstride
694 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
695 %assign %%stride ((%2 + 7)/8)
698 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
699 EPEL_COMPUTE %2, %1, m14, m15
701 lea srcq, [srcq + srcstrideq]
702 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
703 EPEL_COMPUTE %2, %1, m14, m15
705 lea srcq, [srcq + srcstrideq]
706 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
707 EPEL_COMPUTE %2, %1, m14, m15
709 lea srcq, [srcq + srcstrideq]
711 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
712 EPEL_COMPUTE %2, %1, m14, m15
720 EPEL_COMPUTE 14, %1, m12, m13
721 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
722 PEL_%2STORE%1 dstq, m0, m1
726 lea dstq, [dstq+dststrideq] ; dst += dststride
727 lea srcq, [srcq+srcstrideq] ; src += srcstride
728 dec heightd ; cmp height
729 jnz .loop ; height loop
733 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
734 %assign %%stride ((%2 + 7)/8)
737 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
738 EPEL_COMPUTE %2, %1, m14, m15
740 lea srcq, [srcq + srcstrideq]
741 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
742 EPEL_COMPUTE %2, %1, m14, m15
744 lea srcq, [srcq + srcstrideq]
745 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
746 EPEL_COMPUTE %2, %1, m14, m15
748 lea srcq, [srcq + srcstrideq]
750 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
751 EPEL_COMPUTE %2, %1, m14, m15
759 EPEL_COMPUTE 14, %1, m12, m13
760 SIMPLE_BILOAD %1, src2q, m8, m9
761 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
762 PEL_%2STORE%1 dstq, m0, m1
766 lea dstq, [dstq+dststrideq] ; dst += dststride
767 lea srcq, [srcq+srcstrideq] ; src += srcstride
768 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
769 dec heightd ; cmp height
770 jnz .loop ; height loop
774 ; ******************************
775 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
776 ; uint8_t *_src, ptrdiff_t _srcstride,
777 ; int width, int height, int mx, int my)
778 ; ******************************
; 8-tap horizontal luma filter, put/uni/bi variants. QPEL_FILTER/QPEL_COMPUTE
; invocations and loop labels are elided in this view.
780 %macro HEVC_PUT_HEVC_QPEL 2
781 cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
784 QPEL_H_LOAD %2, srcq, %1, 10
789 PEL_10STORE%1 dstq, m0, m1
790 LOOP_END dst, dststride, src, srcstride
793 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
797 QPEL_H_LOAD %2, srcq, %1, 10
802 UNI_COMPUTE %1, %2, m0, m1, m9
803 PEL_%2STORE%1 dstq, m0, m1
804 lea dstq, [dstq+dststrideq] ; dst += dststride
805 lea srcq, [srcq+srcstrideq] ; src += srcstride
806 dec heightd ; cmp height
807 jnz .loop ; height loop
810 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
811 movdqa m9, [pw_bi_%2]
814 QPEL_H_LOAD %2, srcq, %1, 10
819 SIMPLE_BILOAD %1, src2q, m10, m11
820 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
821 PEL_%2STORE%1 dstq, m0, m1
822 lea dstq, [dstq+dststrideq] ; dst += dststride
823 lea srcq, [srcq+srcstrideq] ; src += srcstride
824 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
825 dec heightd ; cmp height
826 jnz .loop ; height loop
830 ; ******************************
831 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
832 ; uint8_t *_src, ptrdiff_t _srcstride,
833 ; int width, int height, int mx, int my)
834 ; ******************************
; 8-tap vertical luma filter. r3src = 3*srcstride for QPEL_V_LOAD addressing.
; QPEL_FILTER/QPEL_COMPUTE invocations and loop labels are elided in this view.
836 cglobal hevc_put_hevc_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
837 lea r3srcq, [srcstrideq*3]
840 QPEL_V_LOAD %2, srcq, srcstride, %1
845 PEL_10STORE%1 dstq, m0, m1
846 LOOP_END dst, dststride, src, srcstride
849 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
851 lea r3srcq, [srcstrideq*3]
854 QPEL_V_LOAD %2, srcq, srcstride, %1
859 UNI_COMPUTE %1, %2, m0, m1, m9
860 PEL_%2STORE%1 dstq, m0, m1
861 lea dstq, [dstq+dststrideq] ; dst += dststride
862 lea srcq, [srcq+srcstrideq] ; src += srcstride
863 dec heightd ; cmp height
864 jnz .loop ; height loop
867 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16 , dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
868 movdqa m9, [pw_bi_%2]
869 lea r3srcq, [srcstrideq*3]
872 SIMPLE_BILOAD %1, src2q, m10, m11
873 QPEL_V_LOAD %2, srcq, srcstride, %1
878 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
879 PEL_%2STORE%1 dstq, m0, m1
880 lea dstq, [dstq+dststrideq] ; dst += dststride
881 lea srcq, [srcq+srcstrideq] ; src += srcstride
882 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
883 dec heightd ; cmp height
884 jnz .loop ; height loop
889 ; ******************************
890 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
891 ; uint8_t *_src, ptrdiff_t _srcstride,
892 ; int height, int mx, int my)
893 ; ******************************
; Separable 8-tap HV luma filter (put variant): 8 horizontal passes fill the
; vertical window (results kept as signed words, hence packssdw via the
; 'ackssdw' suffix), then the loop filters vertically at 14-bit depth.
; Register rotation / m8,m9 interleaves fall in elided lines.
894 %macro HEVC_PUT_HEVC_QPEL_HV 2
895 cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
898 lea r3srcq, [srcstrideq*3]
900 QPEL_H_LOAD %2, srcq, %1, 15
901 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
903 lea srcq, [srcq + srcstrideq]
904 QPEL_H_LOAD %2, srcq, %1, 15
905 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
907 lea srcq, [srcq + srcstrideq]
908 QPEL_H_LOAD %2, srcq, %1, 15
909 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
911 lea srcq, [srcq + srcstrideq]
912 QPEL_H_LOAD %2, srcq, %1, 15
913 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
915 lea srcq, [srcq + srcstrideq]
916 QPEL_H_LOAD %2, srcq, %1, 15
917 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
919 lea srcq, [srcq + srcstrideq]
920 QPEL_H_LOAD %2, srcq, %1, 15
921 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
923 lea srcq, [srcq + srcstrideq]
924 QPEL_H_LOAD %2, srcq, %1, 15
925 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
927 lea srcq, [srcq + srcstrideq]
929 QPEL_H_LOAD %2, srcq, %1, 15
930 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave row pairs for the vertical 8-tap pmaddwd
933 punpcklwd m2, m10, m11
934 punpcklwd m4, m12, m13
935 punpcklwd m6, m14, m15
938 punpckhwd m3, m10, m11
939 punpckhwd m5, m12, m13
940 punpckhwd m7, m14, m15
942 QPEL_HV_COMPUTE %1, 14, my, ackssdw
943 PEL_10STORE%1 dstq, m0, m1
961 LOOP_END dst, dststride, src, srcstride
; uni variant of the separable 8-tap HV luma filter: identical H/V pipeline
; to the put variant above, followed by UNI_COMPUTE rounding to pixel output.
; BUGFIX: the vertical-stage pack suffix was 'ackusdw' (-> packusdw, UNSIGNED
; dword->word saturation), clamping negative 14-bit signed intermediates to 0.
; Every other QPEL_HV_COMPUTE call site in this file (put and bi variants)
; uses 'ackssdw' (-> packssdw, signed saturation); fixed to match.
964 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
967 lea r3srcq, [srcstrideq*3]
969 QPEL_H_LOAD %2, srcq, %1, 15
970 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
972 lea srcq, [srcq + srcstrideq]
973 QPEL_H_LOAD %2, srcq, %1, 15
974 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
976 lea srcq, [srcq + srcstrideq]
977 QPEL_H_LOAD %2, srcq, %1, 15
978 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
980 lea srcq, [srcq + srcstrideq]
981 QPEL_H_LOAD %2, srcq, %1, 15
982 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
984 lea srcq, [srcq + srcstrideq]
985 QPEL_H_LOAD %2, srcq, %1, 15
986 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
988 lea srcq, [srcq + srcstrideq]
989 QPEL_H_LOAD %2, srcq, %1, 15
990 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
992 lea srcq, [srcq + srcstrideq]
993 QPEL_H_LOAD %2, srcq, %1, 15
994 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
996 lea srcq, [srcq + srcstrideq]
998 QPEL_H_LOAD %2, srcq, %1, 15
999 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave row pairs for the vertical 8-tap pmaddwd
1001 punpcklwd m0, m8, m9
1002 punpcklwd m2, m10, m11
1003 punpcklwd m4, m12, m13
1004 punpcklwd m6, m14, m15
1006 punpckhwd m1, m8, m9
1007 punpckhwd m3, m10, m11
1008 punpckhwd m5, m12, m13
1009 punpckhwd m7, m14, m15
; vertical pass on signed 14-bit intermediates -> signed pack (was ackusdw)
1011 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1012 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1013 PEL_%2STORE%1 dstq, m0, m1
1032 lea dstq, [dstq+dststrideq] ; dst += dststride
1033 lea srcq, [srcq+srcstrideq] ; src += srcstride
1034 dec heightd ; cmp height
1035 jnz .loop ; height loop
; bi variant of the separable 8-tap HV luma filter: same H/V pipeline, then
; averages with src2 via BI_COMPUTE before storing pixels.
1038 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
1041 lea r3srcq, [srcstrideq*3]
1043 QPEL_H_LOAD %2, srcq, %1, 15
1044 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1046 lea srcq, [srcq + srcstrideq]
1047 QPEL_H_LOAD %2, srcq, %1, 15
1048 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1050 lea srcq, [srcq + srcstrideq]
1051 QPEL_H_LOAD %2, srcq, %1, 15
1052 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1054 lea srcq, [srcq + srcstrideq]
1055 QPEL_H_LOAD %2, srcq, %1, 15
1056 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1058 lea srcq, [srcq + srcstrideq]
1059 QPEL_H_LOAD %2, srcq, %1, 15
1060 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1062 lea srcq, [srcq + srcstrideq]
1063 QPEL_H_LOAD %2, srcq, %1, 15
1064 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1066 lea srcq, [srcq + srcstrideq]
1067 QPEL_H_LOAD %2, srcq, %1, 15
1068 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1070 lea srcq, [srcq + srcstrideq]
1072 QPEL_H_LOAD %2, srcq, %1, 15
1073 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave row pairs for the vertical 8-tap pmaddwd
1075 punpcklwd m0, m8, m9
1076 punpcklwd m2, m10, m11
1077 punpcklwd m4, m12, m13
1078 punpcklwd m6, m14, m15
1080 punpckhwd m1, m8, m9
1081 punpckhwd m3, m10, m11
1082 punpckhwd m5, m12, m13
1083 punpckhwd m7, m14, m15
1085 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1086 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1087 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1088 PEL_%2STORE%1 dstq, m0, m1
1107 lea dstq, [dstq+dststrideq] ; dst += dststride
1108 lea srcq, [srcq+srcstrideq] ; src += srcstride
1109 lea src2q, [src2q+2*src2strideq] ; src2 += 2*src2stride (src2 is int16_t)
1110 dec heightd ; cmp height
1111 jnz .loop ; height loop
; Explicit weighted prediction (uni_w / bi_w): apply weight, add offset,
; shift by (14 - bitd + denom), clamp to max_pixels_%2. On WIN64/x86_32 there
; are too few argument registers, so a reduced-register prototype reloads the
; remaining args from the stack (those reload lines are elided in this view).
1115 %macro WEIGHTING_FUNCS 2
1116 %if WIN64 || ARCH_X86_32
1117 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1121 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1122 %define SHIFT denomd
1124 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1126 movd m4, SHIFT ; shift
1129 movdqu m5, [one_per_32]
1135 shl SHIFT, %2-8 ; ox << (bitd - 8)
1139 %if WIN64 || ARCH_X86_32
; src rows are int16_t regardless of output depth, hence SIMPLE_LOAD with 10
1143 SIMPLE_LOAD %1, 10, srcq, m0
1146 punpckhwd m1, m0, m6
1158 pminsw m0, [max_pixels_%2]
1160 PEL_%2STORE%1 dstq, m0, m1
1161 lea dstq, [dstq+dststrideq] ; dst += dststride
1162 lea srcq, [srcq+2*srcstrideq] ; src += srcstride (int16_t rows)
1163 dec heightd ; cmp height
1164 jnz .loop ; height loop
1167 cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
1170 lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
1172 movd m0, r6d ; shift
1176 movd m5, r6d ; shift+1
1182 shl r6d, %2-8 ; ox << (bitd - 8)
1185 movd m4, r6d ; offset
1191 SIMPLE_LOAD %1, 10, srcq, m0
1192 SIMPLE_LOAD %1, 10, src2q, m8
1197 punpckhwd m1, m0, m6
1199 punpckhwd m9, m8, m7
1211 pminsw m0, [max_pixels_%2]
1213 PEL_%2STORE%1 dstq, m0, m1
1214 lea dstq, [dstq+dststrideq] ; dst += dststride
1215 lea srcq, [srcq+2*srcstrideq] ; src += srcstride
1216 lea src2q, [src2q+2*src2strideq] ; src2 += srcstride
; height was reloaded into r6d in an elided line — verify against full file
1217 dec r6d ; cmp height
1218 jnz .loop ; height loop
; Instantiate all width/bit-depth variants. Widths >8 for 10-bit are handled
; by narrower kernels (rows don't fit one XMM register at 16-bit samples).
1222 WEIGHTING_FUNCS 2, 8
1223 WEIGHTING_FUNCS 4, 8
1224 WEIGHTING_FUNCS 6, 8
1225 WEIGHTING_FUNCS 8, 8
1227 WEIGHTING_FUNCS 2, 10
1228 WEIGHTING_FUNCS 4, 10
1229 WEIGHTING_FUNCS 6, 10
1230 WEIGHTING_FUNCS 8, 10
1232 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1233 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1234 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1235 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1236 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1237 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1239 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1240 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1241 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1242 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1245 HEVC_PUT_HEVC_EPEL 2, 8
1246 HEVC_PUT_HEVC_EPEL 4, 8
1247 HEVC_PUT_HEVC_EPEL 6, 8
1248 HEVC_PUT_HEVC_EPEL 8, 8
1249 HEVC_PUT_HEVC_EPEL 12, 8
1250 HEVC_PUT_HEVC_EPEL 16, 8
1253 HEVC_PUT_HEVC_EPEL 2, 10
1254 HEVC_PUT_HEVC_EPEL 4, 10
1255 HEVC_PUT_HEVC_EPEL 6, 10
1256 HEVC_PUT_HEVC_EPEL 8, 10
1259 HEVC_PUT_HEVC_EPEL_HV 2, 8
1260 HEVC_PUT_HEVC_EPEL_HV 4, 8
1261 HEVC_PUT_HEVC_EPEL_HV 6, 8
1262 HEVC_PUT_HEVC_EPEL_HV 8, 8
1264 HEVC_PUT_HEVC_EPEL_HV 2, 10
1265 HEVC_PUT_HEVC_EPEL_HV 4, 10
1266 HEVC_PUT_HEVC_EPEL_HV 6, 10
1267 HEVC_PUT_HEVC_EPEL_HV 8, 10
1270 HEVC_PUT_HEVC_QPEL 4, 8
1271 HEVC_PUT_HEVC_QPEL 8, 8
1272 HEVC_PUT_HEVC_QPEL 12, 8
1273 HEVC_PUT_HEVC_QPEL 16, 8
1275 HEVC_PUT_HEVC_QPEL 4, 10
1276 HEVC_PUT_HEVC_QPEL 8, 10
1278 HEVC_PUT_HEVC_QPEL_HV 2, 8
1279 HEVC_PUT_HEVC_QPEL_HV 4, 8
1280 HEVC_PUT_HEVC_QPEL_HV 6, 8
1281 HEVC_PUT_HEVC_QPEL_HV 8, 8
1283 HEVC_PUT_HEVC_QPEL_HV 2, 10
1284 HEVC_PUT_HEVC_QPEL_HV 4, 10
1285 HEVC_PUT_HEVC_QPEL_HV 6, 10
1286 HEVC_PUT_HEVC_QPEL_HV 8, 10
1288 %endif ; ARCH_X86_64