2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Per-bit-depth rounding/clamp constants.
; pw_N    : multiplier used for the final uni-prediction rounding shift
;           (value 1 << (15 - (14 - N)) — consistent with a pmulhrsw-style
;           round-shift of 14-bitdepth; NOTE(review): the consuming code is
;           only partially visible here, confirm against the full file).
; pw_bi_N : bi-prediction variant, one bit smaller (shift = 15 - bitdepth).
; max_pixels_N : (1 << N) - 1, saturation limit applied with pminsw.
; one_per_32   : dword 1 broadcast — presumably the +1 rounding term of the
;                weighted-prediction path (TODO confirm, use site not visible).
24 pw_8: times 8 dw (1 << 9)
25 pw_10: times 8 dw (1 << 11)
26 pw_12: times 8 dw (1 << 13)
27 pw_bi_8: times 8 dw (1 << 8)
28 pw_bi_10: times 8 dw (1 << 10)
29 pw_bi_12: times 8 dw (1 << 12)
30 max_pixels_10: times 8 dw ((1 << 10)-1)
31 max_pixels_12: times 8 dw ((1 << 12)-1)
33 one_per_32: times 4 dd 1
; EPEL (4-tap chroma) filter tables. The %macro header of EPEL_TABLE is not
; visible in this view; the next line is its first data row (taps -2, 58, ...
; of the HEVC chroma interpolation filter). %1=bitdepth, %2=repeat count,
; %3=element size (b/w), %4=cpu suffix.
37 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
55 EPEL_TABLE 8, 8, b, sse4
56 EPEL_TABLE 10, 4, w, sse4
57 EPEL_TABLE 12, 4, w, sse4
; QPEL (8-tap luma) filter tables; QPEL_TABLE macro header likewise missing
; from this view (first taps -1, 4 of the HEVC luma filter).
60 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
74 QPEL_TABLE 8, 8, b, sse4
75 QPEL_TABLE 10, 4, w, sse4
76 QPEL_TABLE 12, 4, w, sse4
; 14-bit intermediate data (output of a first filtering pass) reuses the
; 10-bit word-sized filter coefficients.
78 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
; ---------------------------------------------------------------------------
; Width-dispatched loads of 16-bit intermediate data (the "src2" operand of
; bi-prediction) and of source pixels.
; NOTE(review): this view is a fragment — the %if/%elif/%else/%endif width
; dispatch and the %endmacro lines between the loads are missing, so each
; movq/movdqa below is one branch of a width switch, not straight-line code.
; ---------------------------------------------------------------------------
82 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
; narrow widths: 8 bytes (4 int16) is enough
84 movq %3, [%2] ; load data from source2
86 movdqa %3, [%2] ; load data from source2
; widths needing 12 int16: full register + 8 extra bytes
88 movdqa %3, [%2] ; load data from source2
89 movq %4, [%2+16] ; load data from source2
; widest case: two full aligned registers
91 movdqa %3, [%2] ; load data from source2
92 movdqa %4, [%2+16] ; load data from source2
; Load %1 pixels of bit depth %2 from [%3] into %4, sized to width*pixel size.
96 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
97 %if %1 == 2 || (%2 == 8 && %1 <= 4)
98 movd %4, [%3] ; load data from source
99 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
100 movq %4, [%3] ; load data from source
; fall-through (wider) case; unaligned source, hence movdqu
102 movdqu %4, [%3] ; load data from source
; Like SIMPLE_LOAD but from 16-bit intermediate data, possibly two registers.
106 %macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
107 %if %1 == 2 || (%2 == 8 && %1 <= 4)
108 movq %4, [%3] ; load data from source2
109 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
110 movdqa %4, [%3] ; load data from source2
; 12-element case: 16 + 8 bytes
112 movdqa %4, [%3] ; load data from source2
113 movq %5, [%3+16] ; load data from source2
; widest case: two aligned 16-byte loads
115 movdqa %4, [%3] ; load data from source2
116 movdqa %5, [%3+16] ; load data from source2
; ---------------------------------------------------------------------------
; Filter-coefficient setup. On x86-64 the table address is materialized with
; lea; the %define fallback (32-bit path, address usable directly) and the
; surrounding %if PIC/arch conditionals are missing from this view.
; Each EPEL table row is 32 bytes, hence the shl by 5.
; ---------------------------------------------------------------------------
120 %macro EPEL_FILTER 2-4 ; bit depth, filter index
122 lea rfilterq, [hevc_epel_filters_sse4_%1]
124 %define rfilterq hevc_epel_filters_sse4_%1
127 shl %2q, 5 ; multiply by 32 (one 4-tap filter set per 32-byte row)
; default destination registers (x86-64 path)
129 movdqa m14, [rfilterq + %2q] ; get 2 first values of filters
130 movdqa m15, [rfilterq + %2q+16] ; get 2 last values of filters
; caller-specified destination registers (optional args %3/%4)
132 movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
133 movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
; Load both the horizontal (mx -> m14/m15) and vertical (my -> m12/m13)
; chroma filters for the HV path; vertical pass always uses the word-sized
; 10-bit table because it filters 14-bit intermediate data.
137 %macro EPEL_HV_FILTER 1
139 lea rfilterq, [hevc_epel_filters_sse4_%1]
141 %define rfilterq hevc_epel_filters_sse4_%1
145 shl mxq, 5 ; multiply by 32
146 shl myq, 5 ; multiply by 32
147 movdqa m14, [rfilterq + mxq] ; horizontal taps 1-2
148 movdqa m15, [rfilterq + mxq+16] ; horizontal taps 3-4
149 lea r3srcq, [srcstrideq*3] ; precompute 3*stride for vertical loads
152 lea rfilterq, [hevc_epel_filters_sse4_10]
154 %define rfilterq hevc_epel_filters_sse4_10
156 movdqa m12, [rfilterq + myq] ; vertical taps 1-2
157 movdqa m13, [rfilterq + myq+16] ; vertical taps 3-4
; QPEL filter setup (the %macro QPEL_FILTER header itself is missing from
; this view). Each 8-tap filter set spans 64 bytes = 4 rows of tap pairs.
162 lea rfilterq, [hevc_qpel_filters_sse4_%1]
164 %define rfilterq hevc_qpel_filters_sse4_%1
167 movdqa m12, [rfilterq + %2q*8] ; taps 1-2 of the 8-tap filter
168 movdqa m13, [rfilterq + %2q*8 + 16] ; taps 3-4
169 movdqa m14, [rfilterq + %2q*8 + 32] ; taps 5-6
170 movdqa m15, [rfilterq + %2q*8 + 48] ; taps 7-8
; ---------------------------------------------------------------------------
; EPEL_LOAD fragment (macro header missing from this view): gathers 4 rows
; (or 4 horizontally shifted copies, %3 = stride of 1/2 for the h pass) and
; interleaves neighbouring rows so pmaddubsw/pmaddwd can apply tap pairs.
; The width/bitdepth %if ladder selecting movd/movq/movdqu is partly missing.
; ---------------------------------------------------------------------------
179 %if (%1 == 8 && %4 <= 4)
181 %elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
184 %define %%load movdqu
; %3 a compile-time constant (h pass): scaled addressing
187 %%load m0, [rfilterq ]
189 %%load m1, [rfilterq+ %3]
190 %%load m2, [rfilterq+2*%3]
191 %%load m3, [rfilterq+3*%3]
; %3 a register (v pass): 3*stride comes from r3srcq
193 %%load m1, [rfilterq+ %3q]
194 %%load m2, [rfilterq+2*%3q]
195 %%load m3, [rfilterq+r3srcq]
; 8-bit input: interleave rows at byte granularity (pairs for pmaddubsw)
200 SBUTTERFLY bw, 0, 1, 10
201 SBUTTERFLY bw, 2, 3, 10
; >8-bit input: interleave at word granularity (pairs for pmaddwd)
208 SBUTTERFLY wd, 0, 1, 10
209 SBUTTERFLY wd, 2, 3, 10
; QPEL_H_LOAD fragment: loads the 8 horizontally shifted source vectors
; x-3 .. x+4 around [%2] and interleaves them for the 8-tap filter.
; %%stride = bytes per pixel (1 for 8-bit, 2 for 10/12-bit).
219 %assign %%stride (%1+7)/8
226 %define %%load movdqu
234 %define %%load movdqu
237 %%load m0, [%2-3*%%stride] ;load data from source
238 %%load m1, [%2-2*%%stride]
239 %%load m2, [%2-%%stride ]
241 %%load m4, [%2+%%stride ]
242 %%load m5, [%2+2*%%stride]
243 %%load m6, [%2+3*%%stride]
244 %%load m7, [%2+4*%%stride]
; pair up shifted vectors for multiply-accumulate (%4 = scratch register)
248 SBUTTERFLY wd, 0, 1, %4
249 SBUTTERFLY wd, 2, 3, %4
250 SBUTTERFLY wd, 4, 5, %4
251 SBUTTERFLY wd, 6, 7, %4
260 SBUTTERFLY dq, 0, 1, %4
261 SBUTTERFLY dq, 2, 3, %4
262 SBUTTERFLY dq, 4, 5, %4
263 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD fragment: loads 8 consecutive rows x-3*stride .. x+4*stride
; (%5 points 3 rows above %2) and interleaves adjacent rows.
276 movdqu m0, [%5q ] ;load x- 3*srcstride
277 movdqu m1, [%5q+ %3q ] ;load x- 2*srcstride
278 movdqu m2, [%5q+ 2*%3q ] ;load x-srcstride
279 movdqu m3, [%2 ] ;load x
280 movdqu m4, [%2+ %3q] ;load x+stride
281 movdqu m5, [%2+ 2*%3q] ;load x+2*stride
282 movdqu m6, [%2+r3srcq] ;load x+3*stride
283 movdqu m7, [%2+ 4*%3q] ;load x+4*stride
; 8-bit rows: byte interleave for pmaddubsw
286 SBUTTERFLY bw, 0, 1, 8
287 SBUTTERFLY bw, 2, 3, 8
288 SBUTTERFLY bw, 4, 5, 8
289 SBUTTERFLY bw, 6, 7, 8
; >8-bit rows: word interleave for pmaddwd
298 SBUTTERFLY wd, 0, 1, 8
299 SBUTTERFLY wd, 2, 3, 8
300 SBUTTERFLY wd, 4, 5, 8
301 SBUTTERFLY wd, 6, 7, 8
; ---------------------------------------------------------------------------
; Width-specialized store macros: PEL_<bitdepth>STORE<width> dst, reg0, reg1.
; Only the %macro header lines survive in this view — all bodies and
; %endmacro lines are missing. The 16-wide variants visibly delegate their
; low half to the 8-wide variant.
; ---------------------------------------------------------------------------
311 %macro PEL_12STORE2 3
314 %macro PEL_12STORE4 3
317 %macro PEL_12STORE6 3
322 %macro PEL_12STORE8 3
325 %macro PEL_12STORE12 3
329 %macro PEL_12STORE16 3
330 PEL_12STORE8 %1, %2, %3
334 %macro PEL_10STORE2 3
337 %macro PEL_10STORE4 3
340 %macro PEL_10STORE6 3
345 %macro PEL_10STORE8 3
348 %macro PEL_10STORE12 3
352 %macro PEL_10STORE16 3
353 PEL_10STORE8 %1, %2, %3
370 %macro PEL_8STORE12 3
375 %macro PEL_8STORE16 3
; LOOP_END fragment (macro header missing): shared per-row loop tail for the
; int16_t-destination ("put") functions.
380 lea %1q, [%1q+2*%2q] ; dst += dststride (int16_t elements, hence *2)
381 add %3q, %4q ; src += srcstride
382 dec heightd ; height--
383 jnz .loop ; height loop
; ---------------------------------------------------------------------------
; Arithmetic cores. Bodies are heavily elided in this view (bitdepth %if
; ladders, widening, shifts and %endmacro lines are missing); what remains
; are the multiply-accumulate steps applying interleaved tap pairs.
; ---------------------------------------------------------------------------
; Widen/shift plain pixel copies to the 14-bit intermediate format.
387 %macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
399 %macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
; 8-bit path: unsigned*signed byte multiply-add of interleaved pairs
401 pmaddubsw m0, %3 ;x1*c1+x2*c2
402 pmaddubsw m2, %4 ;x3*c3+x4*c4
; Vertical half of the QPEL HV path; %4 selects the final pack
; instruction suffix (p%4 -> packssdw/packusdw).
426 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
428 lea rfilterq, [hevc_qpel_filters_sse4_%2]
430 %define rfilterq hevc_qpel_filters_sse4_%2
; 8-bit: 4 byte-wise multiply-adds, one per tap pair
434 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
435 pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
436 pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
437 pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
; >8-bit: word multiply-adds producing dwords, low halves...
442 pmaddwd m0, [rfilterq + %3q*8 ]
443 pmaddwd m2, [rfilterq + %3q*8+16]
444 pmaddwd m4, [rfilterq + %3q*8+32]
445 pmaddwd m6, [rfilterq + %3q*8+48]
; ...and high halves for widths > 4
453 pmaddwd m1, [rfilterq + %3q*8 ]
454 pmaddwd m3, [rfilterq + %3q*8+16]
455 pmaddwd m5, [rfilterq + %3q*8+32]
456 pmaddwd m7, [rfilterq + %3q*8+48]
; Horizontal/vertical 8-tap core; filters pre-loaded into m12-m15.
468 %macro QPEL_COMPUTE 2 ; width, bitdepth
470 pmaddubsw m0, m12 ;x1*c1+x2*c2
471 pmaddubsw m2, m13 ;x3*c3+x4*c4
472 pmaddubsw m4, m14 ;x5*c5+x6*c6
473 pmaddubsw m6, m15 ;x7*c7+x8*c8
; Bi-prediction: average with src2 then reuse the uni rounding path.
512 %macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
517 UNI_COMPUTE %1, %2, %3, %4, %7
; UNI_COMPUTE fragment: round-shift to output depth, then clamp >8-bit
; results to the valid pixel range.
522 %if %1 > 8 || (%2 > 8 && %1 > 4)
528 pminsw %3, [max_pixels_%2]
531 pminsw %4, [max_pixels_%2]
537 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
538 ; ******************************
539 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
540 ; uint8_t *_src, ptrdiff_t _srcstride,
541 ; int height, int mx, int my)
542 ; ******************************
; Generates put / uni / bi copy functions for width %1, bit depth %2.
; NOTE(review): fragment — the .loop labels, %endmacro and several setup
; lines between the cglobal headers are missing from this view.
544 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
; put: widen source pixels to the 14-bit intermediate format in dst
545 cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
548 SIMPLE_LOAD %1, %2, srcq, m0
549 MC_PIXEL_COMPUTE %1, %2
550 PEL_10STORE%1 dstq, m0, m1
551 LOOP_END dst, dststride, src, srcstride
; uni: plain pixel copy to a pixel-format destination
554 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
557 SIMPLE_LOAD %1, %2, srcq, m0
558 PEL_%2STORE%1 dstq, m0, m1
559 add dstq, dststrideq ; dst += dststride
560 add srcq, srcstrideq ; src += srcstride
561 dec heightd ; height--
562 jnz .loop ; height loop
; bi: average source with 16-bit src2, round with pw_bi table
565 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
567 movdqa m5, [pw_bi_%2]
569 SIMPLE_LOAD %1, %2, srcq, m0
570 SIMPLE_BILOAD %1, src2q, m3, m4
571 MC_PIXEL_COMPUTE %1, %2
572 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
573 PEL_%2STORE%1 dstq, m0, m1
574 add dstq, dststrideq ; dst += dststride
575 add srcq, srcstrideq ; src += srcstride
576 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
577 dec heightd ; height--
578 jnz .loop ; height loop
584 ; ******************************
585 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
586 ; uint8_t *_src, ptrdiff_t _srcstride,
587 ; int width, int height, int mx, int my,
589 ; ******************************
; Horizontal 4-tap chroma filters (put / uni / bi) for width %1, depth %2.
; %%stride = bytes per pixel; srcq-%%stride centres the 4-tap window.
; NOTE(review): fragment — .loop labels and %endmacro missing from this view.
592 %macro HEVC_PUT_HEVC_EPEL 2
593 cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
594 %assign %%stride ((%2 + 7)/8)
595 EPEL_FILTER %2, mx, m4, m5
597 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
598 EPEL_COMPUTE %2, %1, m4, m5
599 PEL_10STORE%1 dstq, m0, m1
600 LOOP_END dst, dststride, src, srcstride
; uni variant: round-shift/clamp to pixels before storing
603 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
604 %assign %%stride ((%2 + 7)/8)
606 EPEL_FILTER %2, mx, m4, m5
608 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
609 EPEL_COMPUTE %2, %1, m4, m5
610 UNI_COMPUTE %1, %2, m0, m1, m6
611 PEL_%2STORE%1 dstq, m0, m1
612 add dstq, dststrideq ; dst += dststride
613 add srcq, srcstrideq ; src += srcstride
614 dec heightd ; height--
615 jnz .loop ; height loop
; bi variant: average with 16-bit src2 before rounding
618 cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx, rfilter
619 movdqa m6, [pw_bi_%2]
620 EPEL_FILTER %2, mx, m4, m5
622 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
623 EPEL_COMPUTE %2, %1, m4, m5
624 SIMPLE_BILOAD %1, src2q, m2, m3
625 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
626 PEL_%2STORE%1 dstq, m0, m1
627 add dstq, dststrideq ; dst += dststride
628 add srcq, srcstrideq ; src += srcstride
629 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
630 dec heightd ; height--
631 jnz .loop ; height loop
634 ; ******************************
635 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
636 ; uint8_t *_src, ptrdiff_t _srcstride,
637 ; int width, int height, int mx, int my,
639 ; ******************************
; Vertical 4-tap chroma filters; r3src caches 3*srcstride for EPEL_LOAD.
; NOTE(review): fragment — still inside HEVC_PUT_HEVC_EPEL (the %macro
; header at old line 592); .loop labels and %endmacro are not visible.
641 cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
642 lea r3srcq, [srcstrideq*3]
644 EPEL_FILTER %2, my, m4, m5
646 EPEL_LOAD %2, srcq, srcstride, %1
647 EPEL_COMPUTE %2, %1, m4, m5
648 PEL_10STORE%1 dstq, m0, m1
649 LOOP_END dst, dststride, src, srcstride
; uni variant
652 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
653 lea r3srcq, [srcstrideq*3]
656 EPEL_FILTER %2, my, m4, m5
658 EPEL_LOAD %2, srcq, srcstride, %1
659 EPEL_COMPUTE %2, %1, m4, m5
660 UNI_COMPUTE %1, %2, m0, m1, m6
661 PEL_%2STORE%1 dstq, m0, m1
662 add dstq, dststrideq ; dst += dststride
663 add srcq, srcstrideq ; src += srcstride
664 dec heightd ; height--
665 jnz .loop ; height loop
; bi variant
669 cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my, rfilter
670 lea r3srcq, [srcstrideq*3]
671 movdqa m6, [pw_bi_%2]
673 EPEL_FILTER %2, my, m4, m5
675 EPEL_LOAD %2, srcq, srcstride, %1
676 EPEL_COMPUTE %2, %1, m4, m5
677 SIMPLE_BILOAD %1, src2q, m2, m3
678 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
679 PEL_%2STORE%1 dstq, m0, m1
680 add dstq, dststrideq ; dst += dststride
681 add srcq, srcstrideq ; src += srcstride
682 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
683 dec heightd ; height--
684 jnz .loop ; height loop
689 ; ******************************
690 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
691 ; uint8_t *_src, ptrdiff_t _srcstride,
692 ; int width, int height, int mx, int my)
693 ; ******************************
; Two-pass 2-D chroma filter: 4 horizontal passes (m14/m15 = mx filter) feed
; a vertical pass over 14-bit intermediates (m12/m13 = my filter, word
; coefficients via the bitdepth-14 alias). The src-stepping, row-rotation
; and interleave lines between the repeated load/compute pairs are missing
; from this view, as are .loop labels and %endmacro.
695 %macro HEVC_PUT_HEVC_EPEL_HV 2
696 cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
697 %assign %%stride ((%2 + 7)/8)
; horizontal pass on 4 consecutive rows
700 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
701 EPEL_COMPUTE %2, %1, m14, m15
704 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
705 EPEL_COMPUTE %2, %1, m14, m15
708 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
709 EPEL_COMPUTE %2, %1, m14, m15
713 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
714 EPEL_COMPUTE %2, %1, m14, m15
; vertical pass over the 14-bit intermediates
722 EPEL_COMPUTE 14, %1, m12, m13
723 PEL_10STORE%1 dstq, m0, m1
727 LOOP_END dst, dststride, src, srcstride
; uni variant
730 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
731 %assign %%stride ((%2 + 7)/8)
734 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
735 EPEL_COMPUTE %2, %1, m14, m15
738 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
739 EPEL_COMPUTE %2, %1, m14, m15
742 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
743 EPEL_COMPUTE %2, %1, m14, m15
747 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
748 EPEL_COMPUTE %2, %1, m14, m15
756 EPEL_COMPUTE 14, %1, m12, m13
757 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
758 PEL_%2STORE%1 dstq, m0, m1
762 add dstq, dststrideq ; dst += dststride
763 add srcq, srcstrideq ; src += srcstride
764 dec heightd ; height--
765 jnz .loop ; height loop
; bi variant (needs all 16 xmm registers)
769 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
770 %assign %%stride ((%2 + 7)/8)
773 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
774 EPEL_COMPUTE %2, %1, m14, m15
777 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
778 EPEL_COMPUTE %2, %1, m14, m15
781 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
782 EPEL_COMPUTE %2, %1, m14, m15
786 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
787 EPEL_COMPUTE %2, %1, m14, m15
795 EPEL_COMPUTE 14, %1, m12, m13
796 SIMPLE_BILOAD %1, src2q, m8, m9
797 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
798 PEL_%2STORE%1 dstq, m0, m1
802 add dstq, dststrideq ; dst += dststride
803 add srcq, srcstrideq ; src += srcstride
804 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
805 dec heightd ; height--
806 jnz .loop ; height loop
810 ; ******************************
811 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
812 ; uint8_t *_src, ptrdiff_t _srcstride,
813 ; int width, int height, int mx, int my)
814 ; ******************************
; Horizontal 8-tap luma filters for width %1, bit depth %2.
; NOTE(review): fragment — QPEL_FILTER setup, QPEL_COMPUTE calls, .loop
; labels and %endmacro between these lines are missing from this view.
816 %macro HEVC_PUT_HEVC_QPEL 2
817 cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
820 QPEL_H_LOAD %2, srcq, %1, 10
825 PEL_10STORE%1 dstq, m0, m1
826 LOOP_END dst, dststride, src, srcstride
; uni variant
829 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
833 QPEL_H_LOAD %2, srcq, %1, 10
838 UNI_COMPUTE %1, %2, m0, m1, m9
839 PEL_%2STORE%1 dstq, m0, m1
840 add dstq, dststrideq ; dst += dststride
841 add srcq, srcstrideq ; src += srcstride
842 dec heightd ; height--
843 jnz .loop ; height loop
; bi variant
846 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
847 movdqa m9, [pw_bi_%2]
850 QPEL_H_LOAD %2, srcq, %1, 10
855 SIMPLE_BILOAD %1, src2q, m10, m11
856 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
857 PEL_%2STORE%1 dstq, m0, m1
858 add dstq, dststrideq ; dst += dststride
859 add srcq, srcstrideq ; src += srcstride
860 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
861 dec heightd ; height--
862 jnz .loop ; height loop
866 ; ******************************
867 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
868 ; uint8_t *_src, ptrdiff_t _srcstride,
869 ; int width, int height, int mx, int my)
870 ; ******************************
; Vertical 8-tap luma filters; the last QPEL_V_LOAD argument names the
; register holding src - 3*srcstride (r8 here, r10 in the bi variant whose
; prototype uses two more GPRs).
; NOTE(review): fragment — still inside HEVC_PUT_HEVC_QPEL; filter setup,
; QPEL_COMPUTE, .loop labels and %endmacro are not visible.
872 cglobal hevc_put_hevc_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
873 lea r3srcq, [srcstrideq*3]
876 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
881 PEL_10STORE%1 dstq, m0, m1
882 LOOP_END dst, dststride, src, srcstride
; uni variant
885 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
887 lea r3srcq, [srcstrideq*3]
890 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
895 UNI_COMPUTE %1, %2, m0, m1, m9
896 PEL_%2STORE%1 dstq, m0, m1
897 add dstq, dststrideq ; dst += dststride
898 add srcq, srcstrideq ; src += srcstride
899 dec heightd ; height--
900 jnz .loop ; height loop
; bi variant
903 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
904 movdqa m9, [pw_bi_%2]
905 lea r3srcq, [srcstrideq*3]
908 SIMPLE_BILOAD %1, src2q, m10, m11
909 QPEL_V_LOAD %2, srcq, srcstride, %1, r10
914 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
915 PEL_%2STORE%1 dstq, m0, m1
916 add dstq, dststrideq ; dst += dststride
917 add srcq, srcstrideq ; src += srcstride
918 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
919 dec heightd ; height--
920 jnz .loop ; height loop
925 ; ******************************
926 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
927 ; uint8_t *_src, ptrdiff_t _srcstride,
928 ; int height, int mx, int my)
929 ; ******************************
; Two-pass 2-D luma filter: 8 horizontal 8-tap passes (mx) are kept in
; m8-m15, re-interleaved with punpck{l,h}wd, then filtered vertically (my)
; over 14-bit intermediates. The src stepping between the first rows, the
; register-rotation block after the store, .loop labels and %endmacro are
; missing from this view.
930 %macro HEVC_PUT_HEVC_QPEL_HV 2
931 cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
934 lea r3srcq, [srcstrideq*3]
; prime the 8-row window with horizontal passes
936 QPEL_H_LOAD %2, srcq, %1, 15
937 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
940 QPEL_H_LOAD %2, srcq, %1, 15
941 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
944 QPEL_H_LOAD %2, srcq, %1, 15
945 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
948 QPEL_H_LOAD %2, srcq, %1, 15
949 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
952 QPEL_H_LOAD %2, srcq, %1, 15
953 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
956 QPEL_H_LOAD %2, srcq, %1, 15
957 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
960 QPEL_H_LOAD %2, srcq, %1, 15
961 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; per-row: one new horizontal pass, then the vertical pass
965 QPEL_H_LOAD %2, srcq, %1, 15
966 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; interleave adjacent intermediate rows for pmaddwd tap pairs
969 punpcklwd m2, m10, m11
970 punpcklwd m4, m12, m13
971 punpcklwd m6, m14, m15
974 punpckhwd m3, m10, m11
975 punpckhwd m5, m12, m13
976 punpckhwd m7, m14, m15
978 QPEL_HV_COMPUTE %1, 14, my, ackssdw
979 PEL_10STORE%1 dstq, m0, m1
997 LOOP_END dst, dststride, src, srcstride
; uni variant
1000 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1003 lea r3srcq, [srcstrideq*3]
1005 QPEL_H_LOAD %2, srcq, %1, 15
1006 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1008 add srcq, srcstrideq
1009 QPEL_H_LOAD %2, srcq, %1, 15
1010 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1012 add srcq, srcstrideq
1013 QPEL_H_LOAD %2, srcq, %1, 15
1014 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1016 add srcq, srcstrideq
1017 QPEL_H_LOAD %2, srcq, %1, 15
1018 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1020 add srcq, srcstrideq
1021 QPEL_H_LOAD %2, srcq, %1, 15
1022 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1024 add srcq, srcstrideq
1025 QPEL_H_LOAD %2, srcq, %1, 15
1026 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1028 add srcq, srcstrideq
1029 QPEL_H_LOAD %2, srcq, %1, 15
1030 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1032 add srcq, srcstrideq
1034 QPEL_H_LOAD %2, srcq, %1, 15
1035 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1037 punpcklwd m0, m8, m9
1038 punpcklwd m2, m10, m11
1039 punpcklwd m4, m12, m13
1040 punpcklwd m6, m14, m15
1042 punpckhwd m1, m8, m9
1043 punpckhwd m3, m10, m11
1044 punpckhwd m5, m12, m13
1045 punpckhwd m7, m14, m15
; NOTE(review): this variant packs with ackusdw (unsigned saturation) while
; the put/bi variants use ackssdw — verify this asymmetry is intentional.
1047 QPEL_HV_COMPUTE %1, 14, my, ackusdw
1048 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1049 PEL_%2STORE%1 dstq, m0, m1
1068 add dstq, dststrideq ; dst += dststride
1069 add srcq, srcstrideq ; src += srcstride
1070 dec heightd ; height--
1071 jnz .loop ; height loop
; bi variant (all 16 xmm registers)
1074 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
1077 lea r3srcq, [srcstrideq*3]
1079 QPEL_H_LOAD %2, srcq, %1, 15
1080 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1082 add srcq, srcstrideq
1083 QPEL_H_LOAD %2, srcq, %1, 15
1084 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1086 add srcq, srcstrideq
1087 QPEL_H_LOAD %2, srcq, %1, 15
1088 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1090 add srcq, srcstrideq
1091 QPEL_H_LOAD %2, srcq, %1, 15
1092 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1094 add srcq, srcstrideq
1095 QPEL_H_LOAD %2, srcq, %1, 15
1096 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1098 add srcq, srcstrideq
1099 QPEL_H_LOAD %2, srcq, %1, 15
1100 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1102 add srcq, srcstrideq
1103 QPEL_H_LOAD %2, srcq, %1, 15
1104 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1106 add srcq, srcstrideq
1108 QPEL_H_LOAD %2, srcq, %1, 15
1109 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1111 punpcklwd m0, m8, m9
1112 punpcklwd m2, m10, m11
1113 punpcklwd m4, m12, m13
1114 punpcklwd m6, m14, m15
1116 punpckhwd m1, m8, m9
1117 punpckhwd m3, m10, m11
1118 punpckhwd m5, m12, m13
1119 punpckhwd m7, m14, m15
1121 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1122 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1123 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1124 PEL_%2STORE%1 dstq, m0, m1
1143 add dstq, dststrideq ; dst += dststride
1144 add srcq, srcstrideq ; src += srcstride
1145 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
1146 dec heightd ; height--
1147 jnz .loop ; height loop
; ---------------------------------------------------------------------------
; Explicit weighted prediction, uni and bi, for width %1 / bit depth %2:
;   uni: dst = clip(((src * wx) >> shift) + ox)
;   bi : dst = clip((src*wx0 + src2*wx1 + rounding) >> (shift+1) + offsets)
; On WIN64/x86-32 not all scalar args arrive in registers, hence the two
; prototypes. NOTE(review): fragment — the %else/%endif of that dispatch,
; the multiply/shift core, .loop labels and %endmacro are missing here.
; ---------------------------------------------------------------------------
1151 %macro WEIGHTING_FUNCS 2
1152 %if WIN64 || ARCH_X86_32
1153 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1157 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1158 %define SHIFT denomd
1160 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1162 movd m4, SHIFT ; shift
1165 movdqu m5, [one_per_32]
1171 shl SHIFT, %2-8 ; ox << (bitd - 8)
1175 %if WIN64 || ARCH_X86_32
; source is always the 16-bit intermediate buffer, so load as bitdepth 10+
1179 SIMPLE_LOAD %1, 10, srcq, m0
1182 punpckhwd m1, m0, m6 ; widen high half to dwords for the multiply
1194 pminsw m0, [max_pixels_%2] ; clamp to valid pixel range
1196 PEL_%2STORE%1 dstq, m0, m1
1197 add dstq, dststrideq ; dst += dststride
1198 lea srcq, [srcq+2*srcstrideq] ; src += srcstride (int16_t elements)
1199 dec heightd ; height--
1200 jnz .loop ; height loop
; bi-directional weighting; height lives in r6d on this path
1203 cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
1206 lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
1208 movd m0, r6d ; shift
1212 movd m5, r6d ; shift+1
1218 shl r6d, %2-8 ; ox << (bitd - 8)
1221 movd m4, r6d ; offset
1227 SIMPLE_LOAD %1, 10, srcq, m0
1228 SIMPLE_LOAD %1, 10, src2q, m8
1233 punpckhwd m1, m0, m6 ; widen src high half
1235 punpckhwd m9, m8, m7 ; widen src2 high half
1247 pminsw m0, [max_pixels_%2] ; clamp to valid pixel range
1249 PEL_%2STORE%1 dstq, m0, m1
1250 add dstq, dststrideq ; dst += dststride
1251 lea srcq, [srcq+2*srcstrideq] ; src += srcstride (int16_t elements)
1252 lea src2q, [src2q+2*src2strideq] ; src2 += src2stride (int16_t elements)
1253 dec r6d ; height--
1254 jnz .loop ; height loop
; ---------------------------------------------------------------------------
; Instantiate all function variants: (width, bitdepth) pairs. 10/12-bit
; variants stop at width 8 because one xmm register holds only 8 words;
; wider 8-bit widths are handled directly (12/16) or, for qpel_hv, capped
; at 8. The matching %if ARCH_X86_64 for the trailing %endif is above the
; visible region.
; ---------------------------------------------------------------------------
1258 WEIGHTING_FUNCS 2, 8
1259 WEIGHTING_FUNCS 4, 8
1260 WEIGHTING_FUNCS 6, 8
1261 WEIGHTING_FUNCS 8, 8
1263 WEIGHTING_FUNCS 2, 10
1264 WEIGHTING_FUNCS 4, 10
1265 WEIGHTING_FUNCS 6, 10
1266 WEIGHTING_FUNCS 8, 10
1268 WEIGHTING_FUNCS 2, 12
1269 WEIGHTING_FUNCS 4, 12
1270 WEIGHTING_FUNCS 6, 12
1271 WEIGHTING_FUNCS 8, 12
1273 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1274 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1275 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1276 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1277 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1278 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1280 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1281 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1282 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1283 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1285 HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1286 HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1287 HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1288 HEVC_PUT_HEVC_PEL_PIXELS 8, 12
1290 HEVC_PUT_HEVC_EPEL 2, 8
1291 HEVC_PUT_HEVC_EPEL 4, 8
1292 HEVC_PUT_HEVC_EPEL 6, 8
1293 HEVC_PUT_HEVC_EPEL 8, 8
1294 HEVC_PUT_HEVC_EPEL 12, 8
1295 HEVC_PUT_HEVC_EPEL 16, 8
1298 HEVC_PUT_HEVC_EPEL 2, 10
1299 HEVC_PUT_HEVC_EPEL 4, 10
1300 HEVC_PUT_HEVC_EPEL 6, 10
1301 HEVC_PUT_HEVC_EPEL 8, 10
1303 HEVC_PUT_HEVC_EPEL 2, 12
1304 HEVC_PUT_HEVC_EPEL 4, 12
1305 HEVC_PUT_HEVC_EPEL 6, 12
1306 HEVC_PUT_HEVC_EPEL 8, 12
1308 HEVC_PUT_HEVC_EPEL_HV 2, 8
1309 HEVC_PUT_HEVC_EPEL_HV 4, 8
1310 HEVC_PUT_HEVC_EPEL_HV 6, 8
1311 HEVC_PUT_HEVC_EPEL_HV 8, 8
1313 HEVC_PUT_HEVC_EPEL_HV 2, 10
1314 HEVC_PUT_HEVC_EPEL_HV 4, 10
1315 HEVC_PUT_HEVC_EPEL_HV 6, 10
1316 HEVC_PUT_HEVC_EPEL_HV 8, 10
1318 HEVC_PUT_HEVC_EPEL_HV 2, 12
1319 HEVC_PUT_HEVC_EPEL_HV 4, 12
1320 HEVC_PUT_HEVC_EPEL_HV 6, 12
1321 HEVC_PUT_HEVC_EPEL_HV 8, 12
1323 HEVC_PUT_HEVC_QPEL 4, 8
1324 HEVC_PUT_HEVC_QPEL 8, 8
1325 HEVC_PUT_HEVC_QPEL 12, 8
1326 HEVC_PUT_HEVC_QPEL 16, 8
1328 HEVC_PUT_HEVC_QPEL 4, 10
1329 HEVC_PUT_HEVC_QPEL 8, 10
1331 HEVC_PUT_HEVC_QPEL 4, 12
1332 HEVC_PUT_HEVC_QPEL 8, 12
1334 HEVC_PUT_HEVC_QPEL_HV 2, 8
1335 HEVC_PUT_HEVC_QPEL_HV 4, 8
1336 HEVC_PUT_HEVC_QPEL_HV 6, 8
1337 HEVC_PUT_HEVC_QPEL_HV 8, 8
1339 HEVC_PUT_HEVC_QPEL_HV 2, 10
1340 HEVC_PUT_HEVC_QPEL_HV 4, 10
1341 HEVC_PUT_HEVC_QPEL_HV 6, 10
1342 HEVC_PUT_HEVC_QPEL_HV 8, 10
1344 HEVC_PUT_HEVC_QPEL_HV 2, 12
1345 HEVC_PUT_HEVC_QPEL_HV 4, 12
1346 HEVC_PUT_HEVC_QPEL_HV 6, 12
1347 HEVC_PUT_HEVC_QPEL_HV 8, 12
1349 %endif ; ARCH_X86_64