2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Rounding-bias / clamp constants for the MC stores.
; NOTE(review): interior rows of the original constant table are missing from
; this view; only these survive.
25 pw_10: times 8 dw 2048
26 pw_bi_8: times 8 dw 256
27 pw_bi_10: times 8 dw 1024
; max value of a 10-bit pixel, used to clamp after weighted prediction
28 max_pixels_10: times 8 dw 1023
30 one_per_32: times 4 dd 1
; EPEL (4-tap chroma) filter coefficient table macro — only the first
; coefficient row is visible here; the remaining fractional positions and the
; %macro/%endmacro lines are elided in this view.
34 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
; instantiate packed-byte (8-bit) and packed-word (10-bit) variants
52 EPEL_TABLE 8, 8, b, sse4
53 EPEL_TABLE 10, 4, w, sse4
; QPEL (8-tap luma) filter coefficient table macro — likewise truncated here.
56 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
70 QPEL_TABLE 8, 8, b, sse4
71 QPEL_TABLE 10, 4, w, sse4
; the 14-bit intermediate (hv second pass) reuses the 10-bit word filters
73 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
; Load one row of the int16_t source2 buffer for bi-prediction.
; Picks movq/movdqa combinations by block width; the %if width-dispatch lines
; (and %endmacro) are missing from this view — branches below are width cases.
77 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
79 movq %3, [%2] ; load data from source2
81 movdqa %3, [%2] ; load data from source2
; wider case: low 8 words in %3, remaining in %4
83 movdqa %3, [%2] ; load data from source2
84 movq %4, [%2+16] ; load data from source2
86 movdqa %3, [%2] ; load data from source2
87 movdqa %4, [%2+16] ; load data from source2
; Load one row of source pixels sized by width (%1) and bit depth (%2):
; 4/8/16 bytes. The trailing %else/%endif/%endmacro are elided in this view.
91 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
92 %if %1 == 2 || (%2 == 8 && %1 <= 4)
93 movd %4, [%3] ; load data from source
94 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
95 movq %4, [%3] ; load data from source
97 movdqu %4, [%3] ; load data from source
; Load one row of 16-bit data (widths are in pixels, so byte counts double
; vs SIMPLE_LOAD). Width-dispatch %if lines and %endmacro elided in this view.
101 %macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
102 %if %1 == 2 || (%2 == 8 && %1 <= 4)
103 movq %4, [%3] ; load data from source2
104 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
105 movdqa %4, [%3] ; load data from source2
; wider cases need a second register for the upper pixels
107 movdqa %4, [%3] ; load data from source2
108 movq %5, [%3+16] ; load data from source2
110 movdqa %4, [%3] ; load data from source2
111 movdqa %5, [%3+16] ; load data from source2
; Fetch the two 16-byte EPEL filter halves for fractional position %2
; (mx or my) into either m14/m15 or the caller-supplied %3/%4 registers.
; PIC vs non-PIC address setup; the %if/%else/%endif lines are elided here.
115 %macro EPEL_FILTER 2-4 ; bit depth, filter index
117 lea rfilterq, [hevc_epel_filters_sse4_%1]
119 %define rfilterq hevc_epel_filters_sse4_%1
; each filter set is 32 bytes (2 x 16B), hence index << 5
122 shl %2q, 5 ; multiply by 32
124 movdqa m14, [rfilterq + %2q] ; get 2 first values of filters
125 movdqa m15, [rfilterq + %2q+16] ; get 2 last values of filters
127 movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
128 movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
; Fetch both horizontal (mx -> m14/m15) and vertical (my -> m12/m13) EPEL
; filters for the hv path; also precomputes r3src = 3*srcstride for loads.
; The vertical pass always uses the 16-bit (10/14-bit data) filter table.
132 %macro EPEL_HV_FILTER 1
134 lea rfilterq, [hevc_epel_filters_sse4_%1]
136 %define rfilterq hevc_epel_filters_sse4_%1
; 32 bytes per filter set
140 shl mxq, 5 ; multiply by 32
141 shl myq, 5 ; multiply by 32
142 movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
143 movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
144 lea r3srcq, [srcstrideq*3]
147 lea rfilterq, [hevc_epel_filters_sse4_10]
149 %define rfilterq hevc_epel_filters_sse4_10
151 movdqa m12, [rfilterq + myq] ; get 2 first values of filters
152 movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
; QPEL filter fetch: loads the four 16-byte filter quarters for fractional
; position %2 into m12..m15. Each filter set is 64 bytes, addressed as
; index*8 (NOTE(review): the shift of %2q establishing that scaling is in
; elided lines; the %macro header is also not visible in this view).
157 lea rfilterq, [hevc_qpel_filters_sse4_%1]
159 %define rfilterq hevc_qpel_filters_sse4_%1
162 movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
163 movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 next values of filters
164 movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 next values of filters
165 movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 last values of filters
; EPEL_LOAD body fragment: loads 4 consecutive rows (or 4 horizontally
; shifted vectors when %3 is a constant stride) and interleaves them so
; pmadd can combine tap pairs. %if/%endmacro lines elided in this view.
174 movdqu m0, [rfilterq ] ;load 128bit of x
; constant-stride (horizontal) variant
176 movdqu m1, [rfilterq+ %3] ;load 128bit of x+stride
177 movdqu m2, [rfilterq+2*%3] ;load 128bit of x+2*stride
178 movdqu m3, [rfilterq+3*%3] ;load 128bit of x+3*stride
; register-stride (vertical) variant, r3srcq = 3*srcstride
180 movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
181 movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
182 movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+3*stride
; interleave row pairs: bytes for 8-bit input, words for 16-bit input
187 SBUTTERFLY bw, 0, 1, 10
188 SBUTTERFLY bw, 2, 3, 10
195 SBUTTERFLY wd, 0, 1, 10
196 SBUTTERFLY wd, 2, 3, 10
; QPEL_H_LOAD body fragment: loads 8 horizontally shifted vectors around x
; (from -3*stride to +4*stride, stride = bytes per pixel) and interleaves
; them for the 8-tap pmadd. %macro header and %if lines elided in this view.
206 %assign %%stride (%1+7)/8
213 %define %%load movdqu
221 %define %%load movdqu
224 %%load m0, [%2-3*%%stride] ;load data from source
225 %%load m1, [%2-2*%%stride]
226 %%load m2, [%2-%%stride ]
; NOTE(review): the load of m3 at [%2] itself is in an elided line
228 %%load m4, [%2+%%stride ]
229 %%load m5, [%2+2*%%stride]
230 %%load m6, [%2+3*%%stride]
231 %%load m7, [%2+4*%%stride]
; word interleave for 10-bit input...
235 SBUTTERFLY wd, 0, 1, %4
236 SBUTTERFLY wd, 2, 3, %4
237 SBUTTERFLY wd, 4, 5, %4
238 SBUTTERFLY wd, 6, 7, %4
; ...then dword interleave for the 14-bit intermediate pass
247 SBUTTERFLY dq, 0, 1, %4
248 SBUTTERFLY dq, 2, 3, %4
249 SBUTTERFLY dq, 4, 5, %4
250 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD body fragment: loads 8 rows from x-3*srcstride to
; x+4*srcstride (r12 presumably holds x-3*srcstride — set up in an elided
; line; r3srcq = 3*srcstride) and interleaves them for the vertical 8-tap.
263 movdqu m0, [r12 ] ;load x- 3*srcstride
264 movdqu m1, [r12+ %3q ] ;load x- 2*srcstride
265 movdqu m2, [r12+ 2*%3q ] ;load x-srcstride
266 movdqu m3, [%2 ] ;load x
267 movdqu m4, [%2+ %3q] ;load x+stride
268 movdqu m5, [%2+ 2*%3q] ;load x+2*stride
269 movdqu m6, [%2+r3srcq] ;load x+3*stride
270 movdqu m7, [%2+ 4*%3q] ;load x+4*stride
; byte interleave for 8-bit input
273 SBUTTERFLY bw, 0, 1, 8
274 SBUTTERFLY bw, 2, 3, 8
275 SBUTTERFLY bw, 4, 5, 8
276 SBUTTERFLY bw, 6, 7, 8
; word interleave for 16-bit input
285 SBUTTERFLY wd, 0, 1, 8
286 SBUTTERFLY wd, 2, 3, 8
287 SBUTTERFLY wd, 4, 5, 8
288 SBUTTERFLY wd, 6, 7, 8
; Per-width store macros for the 16-bit intermediate (10STORE) and 8-bit
; final (8STORE) outputs; bodies are almost entirely elided in this view.
298 %macro PEL_10STORE2 3
301 %macro PEL_10STORE4 3
304 %macro PEL_10STORE6 3
309 %macro PEL_10STORE8 3
312 %macro PEL_10STORE12 3
316 %macro PEL_10STORE16 3
; width 16 stores the low 8 pixels via the width-8 macro, rest elided
317 PEL_10STORE8 %1, %2, %3
334 %macro PEL_8STORE12 3
339 %macro PEL_8STORE16 3
; LOOP_END fragment: advance pointers and iterate over height.
; dst is int16_t, hence the 2* scaling on dststride.
344 lea %1q, [%1q+2*%2q] ; dst += dststride
345 lea %3q, [%3q+ %4q] ; src += srcstride
346 dec heightd ; cmp height
347 jnz .loop ; height loop
; Arithmetic kernels (bodies heavily elided in this view):
; MC_PIXEL_COMPUTE — plain pixel copy/shift to the 14-bit intermediate.
351 %macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
; EPEL_COMPUTE — 4-tap filter: two pmadds per half, combined in elided lines
363 %macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
365 pmaddubsw m0, %3 ;x1*c1+x2*c2
366 pmaddubsw m2, %4 ;x3*c3+x4*c4
; QPEL_HV_COMPUTE — 8-tap on 8/10/14-bit input; %4 selects the final
; p%4 pack instruction (packssdw/packusdw) in elided lines
388 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
390 lea rfilterq, [hevc_qpel_filters_sse4_%2]
392 %define rfilterq hevc_qpel_filters_sse4_%2
; 8-bit input: byte-pair madds against the 4 filter quarters
396 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
397 pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
398 pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
399 pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
; 16-bit input: word madds, low then high halves
404 pmaddwd m0, [rfilterq + %3q*8 ]
405 pmaddwd m2, [rfilterq + %3q*8+16]
406 pmaddwd m4, [rfilterq + %3q*8+32]
407 pmaddwd m6, [rfilterq + %3q*8+48]
413 pmaddwd m1, [rfilterq + %3q*8 ]
414 pmaddwd m3, [rfilterq + %3q*8+16]
415 pmaddwd m5, [rfilterq + %3q*8+32]
416 pmaddwd m7, [rfilterq + %3q*8+48]
; QPEL_COMPUTE — 8-tap with filters preloaded in m12..m15
426 %macro QPEL_COMPUTE 2 ; width, bitdepth
428 pmaddubsw m0, m12 ;x1*c1+x2*c2
429 pmaddubsw m2, m13 ;x3*c3+x4*c4
430 pmaddubsw m4, m14 ;x5*c5+x6*c6
431 pmaddubsw m6, m15 ;x7*c7+x8*c8
; BI_COMPUTE — average with the src2 bi-prediction buffer, then the same
; rounding/clamping as UNI_COMPUTE ("scr2" in the original comment is a
; typo for src2)
466 %macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, src2l, src2h, pw
471 UNI_COMPUTE %1, %2, %3, %4, %7
; UNI_COMPUTE fragment — clamp >8-bit output to max_pixels
476 %if %1 > 8 || (%2 > 8 && %1 > 4)
482 pminsw %3, [max_pixels_%2]
485 pminsw %4, [max_pixels_%2]
491 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
492 ; ******************************
493 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
494 ; uint8_t *_src, ptrdiff_t _srcstride,
495 ; int height, int mx, int my)
496 ; ******************************
; Pixel-copy MC: plain (-> int16_t intermediate), uni (-> pixels), and
; bi (averaged with src2 -> pixels) variants. %1 = width, %2 = bit depth.
; .loop label definitions and %endmacro are elided in this view.
498 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
499 cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
502 SIMPLE_LOAD %1, %2, srcq, m0
503 MC_PIXEL_COMPUTE %1, %2
504 PEL_10STORE%1 dstq, m0, m1
505 LOOP_END dst, dststride, src, srcstride
508 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
511 SIMPLE_LOAD %1, %2, srcq, m0
512 PEL_%2STORE%1 dstq, m0, m1
513 lea dstq, [dstq+dststrideq] ; dst += dststride
514 lea srcq, [srcq+srcstrideq] ; src += srcstride
515 dec heightd ; cmp height
516 jnz .loop ; height loop
519 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
; bi rounding constant for this bit depth
521 movdqa m5, [pw_bi_%2]
523 SIMPLE_LOAD %1, %2, srcq, m0
524 SIMPLE_BILOAD %1, src2q, m3, m4
525 MC_PIXEL_COMPUTE %1, %2
526 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
527 PEL_%2STORE%1 dstq, m0, m1
528 lea dstq, [dstq+dststrideq] ; dst += dststride
529 lea srcq, [srcq+srcstrideq] ; src += srcstride
530 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
531 dec heightd ; cmp height
532 jnz .loop ; height loop
538 ; ******************************
539 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
540 ; uint8_t *_src, ptrdiff_t _srcstride,
541 ; int width, int height, int mx, int my,
543 ; ******************************
; 4-tap chroma horizontal filter: plain, uni and bi variants.
; %1 = width, %2 = bit depth. Loads start at src - 1 pixel (%%stride bytes).
; .loop labels and %endmacro are elided in this view.
546 %macro HEVC_PUT_HEVC_EPEL 2
547 cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
548 %assign %%stride ((%2 + 7)/8)
549 EPEL_FILTER %2, mx, m4, m5
551 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
552 EPEL_COMPUTE %2, %1, m4, m5
553 PEL_10STORE%1 dstq, m0, m1
554 LOOP_END dst, dststride, src, srcstride
557 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
558 %assign %%stride ((%2 + 7)/8)
560 EPEL_FILTER %2, mx, m4, m5
562 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
563 EPEL_COMPUTE %2, %1, m4, m5
564 UNI_COMPUTE %1, %2, m0, m1, m6
565 PEL_%2STORE%1 dstq, m0, m1
566 lea dstq, [dstq+dststrideq] ; dst += dststride
567 lea srcq, [srcq+srcstrideq] ; src += srcstride
568 dec heightd ; cmp height
569 jnz .loop ; height loop
572 cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx, rfilter
573 movdqa m6, [pw_bi_%2]
574 EPEL_FILTER %2, mx, m4, m5
576 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
577 EPEL_COMPUTE %2, %1, m4, m5
578 SIMPLE_BILOAD %1, src2q, m2, m3
579 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
580 PEL_%2STORE%1 dstq, m0, m1
581 lea dstq, [dstq+dststrideq] ; dst += dststride
582 lea srcq, [srcq+srcstrideq] ; src += srcstride
583 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
584 dec heightd ; cmp height
585 jnz .loop ; height loop
588 ; ******************************
589 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
590 ; uint8_t *_src, ptrdiff_t _srcstride,
591 ; int width, int height, int mx, int my,
593 ; ******************************
; 4-tap chroma vertical filter: plain, uni and bi variants (same macro as
; the h variants — the enclosing %macro spans both; .loop labels and
; %endmacro elided in this view). r3src = 3*srcstride for EPEL_LOAD.
595 cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
596 lea r3srcq, [srcstrideq*3]
598 EPEL_FILTER %2, my, m4, m5
600 EPEL_LOAD %2, srcq, srcstride, %1
601 EPEL_COMPUTE %2, %1, m4, m5
602 PEL_10STORE%1 dstq, m0, m1
603 LOOP_END dst, dststride, src, srcstride
606 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
607 lea r3srcq, [srcstrideq*3]
610 EPEL_FILTER %2, my, m4, m5
612 EPEL_LOAD %2, srcq, srcstride, %1
613 EPEL_COMPUTE %2, %1, m4, m5
614 UNI_COMPUTE %1, %2, m0, m1, m6
615 PEL_%2STORE%1 dstq, m0, m1
616 lea dstq, [dstq+dststrideq] ; dst += dststride
617 lea srcq, [srcq+srcstrideq] ; src += srcstride
618 dec heightd ; cmp height
619 jnz .loop ; height loop
623 cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my, rfilter
624 lea r3srcq, [srcstrideq*3]
625 movdqa m6, [pw_bi_%2]
627 EPEL_FILTER %2, my, m4, m5
629 EPEL_LOAD %2, srcq, srcstride, %1
630 EPEL_COMPUTE %2, %1, m4, m5
631 SIMPLE_BILOAD %1, src2q, m2, m3
632 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
633 PEL_%2STORE%1 dstq, m0, m1
634 lea dstq, [dstq+dststrideq] ; dst += dststride
635 lea srcq, [srcq+srcstrideq] ; src += srcstride
636 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
637 dec heightd ; cmp height
638 jnz .loop ; height loop
643 ; ******************************
644 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
645 ; uint8_t *_src, ptrdiff_t _srcstride,
646 ; int width, int height, int mx, int my)
647 ; ******************************
; 2-D 4-tap filter: horizontal pass (m14/m15 filters) over 4 rows feeds a
; vertical pass on the 14-bit intermediate (m12/m13 filters). The
; EPEL_HV_FILTER call, per-row register shuffling, .loop labels and
; %endmacro are elided in this view.
649 %macro HEVC_PUT_HEVC_EPEL_HV 2
650 cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
651 %assign %%stride ((%2 + 7)/8)
; horizontal pass over the 4 rows needed for one output row
654 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
655 EPEL_COMPUTE %2, %1, m14, m15
657 lea srcq, [srcq + srcstrideq]
658 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
659 EPEL_COMPUTE %2, %1, m14, m15
661 lea srcq, [srcq + srcstrideq]
662 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
663 EPEL_COMPUTE %2, %1, m14, m15
665 lea srcq, [srcq + srcstrideq]
667 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
668 EPEL_COMPUTE %2, %1, m14, m15
; vertical pass on the 14-bit intermediate
676 EPEL_COMPUTE 14, %1, m12, m13
677 PEL_10STORE%1 dstq, m0, m1
681 LOOP_END dst, dststride, src, srcstride
684 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
685 %assign %%stride ((%2 + 7)/8)
688 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
689 EPEL_COMPUTE %2, %1, m14, m15
691 lea srcq, [srcq + srcstrideq]
692 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
693 EPEL_COMPUTE %2, %1, m14, m15
695 lea srcq, [srcq + srcstrideq]
696 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
697 EPEL_COMPUTE %2, %1, m14, m15
699 lea srcq, [srcq + srcstrideq]
701 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
702 EPEL_COMPUTE %2, %1, m14, m15
710 EPEL_COMPUTE 14, %1, m12, m13
711 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
712 PEL_%2STORE%1 dstq, m0, m1
716 lea dstq, [dstq+dststrideq] ; dst += dststride
717 lea srcq, [srcq+srcstrideq] ; src += srcstride
718 dec heightd ; cmp height
719 jnz .loop ; height loop
723 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
724 %assign %%stride ((%2 + 7)/8)
727 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
728 EPEL_COMPUTE %2, %1, m14, m15
730 lea srcq, [srcq + srcstrideq]
731 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
732 EPEL_COMPUTE %2, %1, m14, m15
734 lea srcq, [srcq + srcstrideq]
735 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
736 EPEL_COMPUTE %2, %1, m14, m15
738 lea srcq, [srcq + srcstrideq]
740 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
741 EPEL_COMPUTE %2, %1, m14, m15
749 EPEL_COMPUTE 14, %1, m12, m13
750 SIMPLE_BILOAD %1, src2q, m8, m9
751 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
752 PEL_%2STORE%1 dstq, m0, m1
756 lea dstq, [dstq+dststrideq] ; dst += dststride
757 lea srcq, [srcq+srcstrideq] ; src += srcstride
758 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
759 dec heightd ; cmp height
760 jnz .loop ; height loop
764 ; ******************************
765 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
766 ; uint8_t *_src, ptrdiff_t _srcstride,
767 ; int width, int height, int mx, int my)
768 ; ******************************
; 8-tap luma horizontal filter: plain, uni and bi variants. The QPEL_FILTER
; and QPEL_COMPUTE calls, .loop labels and %endmacro are elided in this view.
770 %macro HEVC_PUT_HEVC_QPEL 2
771 cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
774 QPEL_H_LOAD %2, srcq, %1, 10
779 PEL_10STORE%1 dstq, m0, m1
780 LOOP_END dst, dststride, src, srcstride
783 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
787 QPEL_H_LOAD %2, srcq, %1, 10
792 UNI_COMPUTE %1, %2, m0, m1, m9
793 PEL_%2STORE%1 dstq, m0, m1
794 lea dstq, [dstq+dststrideq] ; dst += dststride
795 lea srcq, [srcq+srcstrideq] ; src += srcstride
796 dec heightd ; cmp height
797 jnz .loop ; height loop
800 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
801 movdqa m9, [pw_bi_%2]
804 QPEL_H_LOAD %2, srcq, %1, 10
809 SIMPLE_BILOAD %1, src2q, m10, m11
810 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
811 PEL_%2STORE%1 dstq, m0, m1
812 lea dstq, [dstq+dststrideq] ; dst += dststride
813 lea srcq, [srcq+srcstrideq] ; src += srcstride
814 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
815 dec heightd ; cmp height
816 jnz .loop ; height loop
820 ; ******************************
821 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
822 ; uint8_t *_src, ptrdiff_t _srcstride,
823 ; int width, int height, int mx, int my)
824 ; ******************************
; 8-tap luma vertical filter (continuation of the same HEVC_PUT_HEVC_QPEL
; macro as the h variants). r3src = 3*srcstride for the 8-row load.
; QPEL_FILTER/QPEL_COMPUTE calls, .loop labels and %endmacro elided here.
826 cglobal hevc_put_hevc_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
827 lea r3srcq, [srcstrideq*3]
830 QPEL_V_LOAD %2, srcq, srcstride, %1
835 PEL_10STORE%1 dstq, m0, m1
836 LOOP_END dst, dststride, src, srcstride
839 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
841 lea r3srcq, [srcstrideq*3]
844 QPEL_V_LOAD %2, srcq, srcstride, %1
849 UNI_COMPUTE %1, %2, m0, m1, m9
850 PEL_%2STORE%1 dstq, m0, m1
851 lea dstq, [dstq+dststrideq] ; dst += dststride
852 lea srcq, [srcq+srcstrideq] ; src += srcstride
853 dec heightd ; cmp height
854 jnz .loop ; height loop
857 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16 , dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
858 movdqa m9, [pw_bi_%2]
859 lea r3srcq, [srcstrideq*3]
862 SIMPLE_BILOAD %1, src2q, m10, m11
863 QPEL_V_LOAD %2, srcq, srcstride, %1
868 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
869 PEL_%2STORE%1 dstq, m0, m1
870 lea dstq, [dstq+dststrideq] ; dst += dststride
871 lea srcq, [srcq+srcstrideq] ; src += srcstride
872 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
873 dec heightd ; cmp height
874 jnz .loop ; height loop
879 ; ******************************
880 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
881 ; uint8_t *_src, ptrdiff_t _srcstride,
882 ; int height, int mx, int my)
883 ; ******************************
; 2-D 8-tap filter: 8 rows are horizontally filtered (result packed with
; packssdw via the "ackssdw" suffix argument, composed as p%4), the 8
; results are word-interleaved, then vertically filtered at 14-bit.
; Register shuffling between rows, .loop labels and %endmacro are elided
; in this view.
884 %macro HEVC_PUT_HEVC_QPEL_HV 2
885 cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
888 lea r3srcq, [srcstrideq*3]
890 QPEL_H_LOAD %2, srcq, %1, 15
891 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
893 lea srcq, [srcq + srcstrideq]
894 QPEL_H_LOAD %2, srcq, %1, 15
895 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
897 lea srcq, [srcq + srcstrideq]
898 QPEL_H_LOAD %2, srcq, %1, 15
899 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
901 lea srcq, [srcq + srcstrideq]
902 QPEL_H_LOAD %2, srcq, %1, 15
903 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
905 lea srcq, [srcq + srcstrideq]
906 QPEL_H_LOAD %2, srcq, %1, 15
907 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
909 lea srcq, [srcq + srcstrideq]
910 QPEL_H_LOAD %2, srcq, %1, 15
911 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
913 lea srcq, [srcq + srcstrideq]
914 QPEL_H_LOAD %2, srcq, %1, 15
915 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
917 lea srcq, [srcq + srcstrideq]
919 QPEL_H_LOAD %2, srcq, %1, 15
920 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave the 8 intermediate rows for the vertical pmaddwd
923 punpcklwd m2, m10, m11
924 punpcklwd m4, m12, m13
925 punpcklwd m6, m14, m15
928 punpckhwd m3, m10, m11
929 punpckhwd m5, m12, m13
930 punpckhwd m7, m14, m15
932 QPEL_HV_COMPUTE %1, 14, my, ackssdw
933 PEL_10STORE%1 dstq, m0, m1
951 LOOP_END dst, dststride, src, srcstride
954 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
957 lea r3srcq, [srcstrideq*3]
959 QPEL_H_LOAD %2, srcq, %1, 15
960 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
962 lea srcq, [srcq + srcstrideq]
963 QPEL_H_LOAD %2, srcq, %1, 15
964 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
966 lea srcq, [srcq + srcstrideq]
967 QPEL_H_LOAD %2, srcq, %1, 15
968 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
970 lea srcq, [srcq + srcstrideq]
971 QPEL_H_LOAD %2, srcq, %1, 15
972 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
974 lea srcq, [srcq + srcstrideq]
975 QPEL_H_LOAD %2, srcq, %1, 15
976 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
978 lea srcq, [srcq + srcstrideq]
979 QPEL_H_LOAD %2, srcq, %1, 15
980 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
982 lea srcq, [srcq + srcstrideq]
983 QPEL_H_LOAD %2, srcq, %1, 15
984 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
986 lea srcq, [srcq + srcstrideq]
988 QPEL_H_LOAD %2, srcq, %1, 15
989 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
992 punpcklwd m2, m10, m11
993 punpcklwd m4, m12, m13
994 punpcklwd m6, m14, m15
997 punpckhwd m3, m10, m11
998 punpckhwd m5, m12, m13
999 punpckhwd m7, m14, m15
; NOTE(review): this is the only vertical pass using "ackusdw" (packusdw,
; unsigned saturation) where the plain and bi variants use "ackssdw" —
; looks inconsistent; confirm against the C reference before changing.
1001 QPEL_HV_COMPUTE %1, 14, my, ackusdw
1002 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1003 PEL_%2STORE%1 dstq, m0, m1
1022 lea dstq, [dstq+dststrideq] ; dst += dststride
1023 lea srcq, [srcq+srcstrideq] ; src += srcstride
1024 dec heightd ; cmp height
1025 jnz .loop ; height loop
1028 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
1031 lea r3srcq, [srcstrideq*3]
1033 QPEL_H_LOAD %2, srcq, %1, 15
1034 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1036 lea srcq, [srcq + srcstrideq]
1037 QPEL_H_LOAD %2, srcq, %1, 15
1038 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1040 lea srcq, [srcq + srcstrideq]
1041 QPEL_H_LOAD %2, srcq, %1, 15
1042 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1044 lea srcq, [srcq + srcstrideq]
1045 QPEL_H_LOAD %2, srcq, %1, 15
1046 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1048 lea srcq, [srcq + srcstrideq]
1049 QPEL_H_LOAD %2, srcq, %1, 15
1050 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1052 lea srcq, [srcq + srcstrideq]
1053 QPEL_H_LOAD %2, srcq, %1, 15
1054 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1056 lea srcq, [srcq + srcstrideq]
1057 QPEL_H_LOAD %2, srcq, %1, 15
1058 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1060 lea srcq, [srcq + srcstrideq]
1062 QPEL_H_LOAD %2, srcq, %1, 15
1063 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1065 punpcklwd m0, m8, m9
1066 punpcklwd m2, m10, m11
1067 punpcklwd m4, m12, m13
1068 punpcklwd m6, m14, m15
1070 punpckhwd m1, m8, m9
1071 punpckhwd m3, m10, m11
1072 punpckhwd m5, m12, m13
1073 punpckhwd m7, m14, m15
1075 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1076 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1077 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1078 PEL_%2STORE%1 dstq, m0, m1
1097 lea dstq, [dstq+dststrideq] ; dst += dststride
1098 lea srcq, [srcq+srcstrideq] ; src += srcstride
1099 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
1100 dec heightd ; cmp height
1101 jnz .loop ; height loop
; Explicit weighted prediction (uni and bi): applies wx/ox/denom scaling to
; the int16_t intermediate and stores clamped pixels. %1 = width, %2 = bit
; depth. Note src here is int16_t, hence the 2* stride scaling. The bulk of
; the arithmetic, .loop labels and %endmacro are elided in this view.
1105 %macro WEIGHTING_FUNCS 2
1106 cglobal hevc_put_hevc_uni_w%1_%2, 8, 10, 11, dst, dststride, src, srcstride, height, denom, wx, ox, shift
1107 lea shiftd, [denomd+14-%2] ; shift = 14 - bitd + denom
1108 shl oxd, %2-8 ; ox << (bitd - 8)
1111 movd m4, shiftd ; shift
1117 movdqu m5, [one_per_32]
1120 SIMPLE_LOAD %1, 10, srcq, m0
1123 punpckhwd m1, m0, m6
1135 pminsw m0, [max_pixels_%2]
1137 PEL_%2STORE%1 dstq, m0, m1
1138 lea dstq, [dstq+dststrideq] ; dst += dststride
1139 lea srcq, [srcq+2*srcstrideq] ; src += srcstride (int16_t units)
1140 dec heightd ; cmp height
1141 jnz .loop ; height loop
1144 cglobal hevc_put_hevc_bi_w%1_%2, 12, 14, 14, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1, shift, temp
1145 shl ox0d, %2-8 ; ox0 << (bitd - 8)
1146 shl ox1d, %2-8 ; ox1 << (bitd - 8)
1147 lea shiftd, [denomd+14-%2] ; shift = 14 - bitd + denom
1156 movd m4, ox0d ; offset
1158 movd m5, shiftd ; shift
1161 movd m5, shiftd ; shift
1164 SIMPLE_LOAD %1, 10, srcq, m0
1165 SIMPLE_LOAD %1, 10, src2q, m10
1170 punpckhwd m1, m0, m6
1172 punpckhwd m11, m10, m7
1184 pminsw m0, [max_pixels_%2]
1186 PEL_%2STORE%1 dstq, m0, m1
1187 lea dstq, [dstq+dststrideq] ; dst += dststride
1188 lea srcq, [srcq+2*srcstrideq] ; src += srcstride (int16_t units)
1189 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t units)
1190 dec heightd ; cmp height
1191 jnz .loop ; height loop
; Instantiate all width/bit-depth combinations. 12/16-wide and >8-wide
; variants exist only for configurations whose vector width can carry them;
; the guarding %if lines (and the ARCH_X86_64 %if this %endif closes) are
; elided in this view.
1195 WEIGHTING_FUNCS 2, 8
1196 WEIGHTING_FUNCS 4, 8
1197 WEIGHTING_FUNCS 6, 8
1198 WEIGHTING_FUNCS 8, 8
1200 WEIGHTING_FUNCS 2, 10
1201 WEIGHTING_FUNCS 4, 10
1202 WEIGHTING_FUNCS 6, 10
1203 WEIGHTING_FUNCS 8, 10
1205 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1206 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1207 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1208 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1209 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1210 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1212 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1213 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1214 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1215 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1218 HEVC_PUT_HEVC_EPEL 2, 8
1219 HEVC_PUT_HEVC_EPEL 4, 8
1220 HEVC_PUT_HEVC_EPEL 6, 8
1221 HEVC_PUT_HEVC_EPEL 8, 8
1222 HEVC_PUT_HEVC_EPEL 12, 8
1223 HEVC_PUT_HEVC_EPEL 16, 8
1226 HEVC_PUT_HEVC_EPEL 2, 10
1227 HEVC_PUT_HEVC_EPEL 4, 10
1228 HEVC_PUT_HEVC_EPEL 6, 10
1229 HEVC_PUT_HEVC_EPEL 8, 10
1232 HEVC_PUT_HEVC_EPEL_HV 2, 8
1233 HEVC_PUT_HEVC_EPEL_HV 4, 8
1234 HEVC_PUT_HEVC_EPEL_HV 6, 8
1235 HEVC_PUT_HEVC_EPEL_HV 8, 8
1237 HEVC_PUT_HEVC_EPEL_HV 2, 10
1238 HEVC_PUT_HEVC_EPEL_HV 4, 10
1239 HEVC_PUT_HEVC_EPEL_HV 6, 10
1240 HEVC_PUT_HEVC_EPEL_HV 8, 10
1243 HEVC_PUT_HEVC_QPEL 4, 8
1244 HEVC_PUT_HEVC_QPEL 8, 8
1245 HEVC_PUT_HEVC_QPEL 12, 8
1246 HEVC_PUT_HEVC_QPEL 16, 8
1248 HEVC_PUT_HEVC_QPEL 4, 10
1249 HEVC_PUT_HEVC_QPEL 8, 10
1251 HEVC_PUT_HEVC_QPEL_HV 2, 8
1252 HEVC_PUT_HEVC_QPEL_HV 4, 8
1253 HEVC_PUT_HEVC_QPEL_HV 6, 8
1254 HEVC_PUT_HEVC_QPEL_HV 8, 8
1256 HEVC_PUT_HEVC_QPEL_HV 2, 10
1257 HEVC_PUT_HEVC_QPEL_HV 4, 10
1258 HEVC_PUT_HEVC_QPEL_HV 6, 10
1259 HEVC_PUT_HEVC_QPEL_HV 8, 10
1261 %endif ; ARCH_X86_64