;*****************************************************************************
;* x86-optimized HEVC MC
;* Copyright 2015 Anton Khirnov
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1023: times 8 dw 1023
cextern hevc_qpel_coeffs
cextern hevc_qpel_coeffs8

cextern hevc_epel_coeffs
cextern hevc_epel_coeffs8
%assign nb_blocks ((%1 + blocksize - 1) / blocksize)
%define last_block_truncated (blocksize * nb_blocks > %1)
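; E.g. %1 == 12 with blocksize == 8 gives nb_blocks == 2 with the second
; block truncated (only 4 of its 8 columns are valid), so that block is
; handled with the half-width LOAD/STORE variants selected below.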
%define LOAD_BLOCK      movu
%define LOAD_HALFBLOCK  movq
%define LOAD_BLOCK      movq
%define LOAD_HALFBLOCK  movd
%define STORE_BLOCK     mova
%define STORE_HALFBLOCK movq
%if last_block_truncated && %1 == nb_blocks - 1
%define block_truncated 1
%define LOAD  LOAD_HALFBLOCK
%define STORE STORE_HALFBLOCK
%else
%define block_truncated 0
%define LOAD  LOAD_BLOCK
%define STORE STORE_BLOCK
%endif
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
;                         pixel *src, ptrdiff_t srcstride,
;                         int height, int mx, int my, int *mcbuffer)
;
; %3: log2 of height unroll
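;
; This only converts the source pixels to the 16-bit intermediate format
; used between the MC stages. As an illustrative C sketch (assuming the
; shift is the spec's 14 - bit_depth for the intermediate format):
;   for (y = 0; y < height; y++)
;       for (x = 0; x < width; x++)
;           dst[x] = src[x] << (14 - bit_depth);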
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused

    LOAD        m1, [srcq + j * pixelsize * blocksize]
    STORE       [dstq + j * 2 * blocksize], m1
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
;
; 8-bit qpel interpolation
;
; %2: 0 - horizontal; 1 - vertical
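;
; Both orientations evaluate the same 8-tap FIR; the taps step through
; memory by one pixel (horizontal) or by one line (vertical). As a rough
; C sketch, with f[] standing for the hevc_qpel_coeffs8 row selected by
; the fractional mv component:
;   for (x = 0; x < width; x++)
;       dst[x] = f[0] * src[x - 3 * step] + f[1] * src[x - 2 * step]
;              + f[2] * src[x -     step] + f[3] * src[x            ]
;              + f[4] * src[x +     step] + f[5] * src[x + 2 * step]
;              + f[6] * src[x + 3 * step] + f[7] * src[x + 4 * step];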
%define coeffsaddr r5q
%define pixstride  srcstrideq
%define pixstride3 r5q
%define coeffsaddr r6q
%define src_m3     (srcq - 3)
cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my

    lea         coeffsaddr, [hevc_qpel_coeffs8]
    mova        m0, [coeffsaddr + mvfrac]
    lea         pixstride3, [srcstrideq + 2 * srcstrideq]
    sub         src_m3, pixstride3
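    ; src_m3 now points three lines above srcq: rows -3/-2/-1 are
    ; addressed through it, rows 0..+4 through srcq itself.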
    LOAD        m4, [src_m3 + i * blocksize]
    LOAD        m5, [src_m3 + i * blocksize + 1 * pixstride]
    LOAD        m5, [src_m3 + i * blocksize + 2 * pixstride]
    LOAD        m6, [srcq + i * blocksize]
    LOAD        m5, [srcq + i * blocksize + 1 * pixstride]
    LOAD        m6, [srcq + i * blocksize + 2 * pixstride]
    LOAD        m5, [srcq + i * blocksize + pixstride3]
    LOAD        m6, [srcq + i * blocksize + 4 * pixstride]

    STORE       [dstq + i * 2 * blocksize], m4

    add         src_m3, srcstrideq
; 16-bit qpel interpolation
;
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
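;
; With 16-bit input, byte-wise multiply-accumulate is not available, so
; pairs of taps are interleaved with punpcklwd/punpckhwd and fed to
; pmaddwd, giving a 32-bit sum of two products per lane; the partial
; sums are then added, shifted right by %2 and packed back to words.
; Schematically (a sketch, not the exact register allocation; coeffs01
; is a hypothetical name for the first interleaved coefficient pair):
;   punpcklwd  m12, m4, m5      ; [a0 b0 a1 b1 ...]
;   pmaddwd    m12, coeffs01    ; [a0*f0 + b0*f1, a1*f0 + b1*f1, ...]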
%define pixstride  srcstrideq
%define pixstride3 sstride3q
%define src_m3     srcm3q
%define src_m3     (srcq - 6)

    lea         coeffsregq, [hevc_qpel_coeffs]
    mova        m0, [coeffsregq + mvfrac]
    lea         sstride3q, [srcstrideq + 2 * srcstrideq]
    sub         srcm3q, sstride3q
    LOAD        m4,  [src_m3 + i * 2 * blocksize]
    LOAD        m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
    LOAD        m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
    LOAD        m7,  [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD        m8,  [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD        m9,  [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD        m10, [srcq + i * 2 * blocksize + pixstride3]
    LOAD        m11, [srcq + i * 2 * blocksize + 4 * pixstride]

    punpcklwd   m12, m4, m5
    punpcklwd   m13, m6, m7
    punpcklwd   m13, m8, m9
    punpcklwd   m13, m10, m11
%if block_truncated == 0
    STORE       [dstq + i * 2 * blocksize], m12

    add         srcm3q, srcstrideq
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg

cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
;                  int16_t *src, ptrdiff_t srcstride,
;                  int height, int mx, int my, int *mcbuffer)
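;
; The hv variants run as the second (vertical) pass over the 16-bit
; intermediates produced by the horizontal pass, which is why they take
; int16_t *src and carry no bit-depth suffix: the register layout below
; matches the 16-bit vertical filter above, only the final shift differs.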
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
;
; 8-bit epel interpolation
;
; %2: 0 - horizontal; 1 - vertical
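;
; Same scheme as qpel, but with the 4-tap chroma filter (taps -1..+2,
; the source pointer having been adjusted back by one pixel or line
; beforehand). Rough C sketch, f[] being the hevc_epel_coeffs8 row
; selected by the fractional mv component:
;   for (x = 0; x < width; x++)
;       dst[x] = f[0] * src[x - step] + f[1] * src[x]
;              + f[2] * src[x + step] + f[3] * src[x + 2 * step];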
%define coeffsaddr r5q
%define pixstride  srcstrideq
%define pixstride3 r5q
%define coeffsaddr r6q

cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my

    lea         coeffsaddr, [hevc_epel_coeffs8]
    movq        m0, [coeffsaddr + mvfrac]
    lea         pixstride3, [srcstrideq + 2 * srcstrideq]
    LOAD        m2, [srcq + i * blocksize + 0 * pixstride]
    LOAD        m3, [srcq + i * blocksize + 1 * pixstride]
    LOAD        m4, [srcq + i * blocksize + 2 * pixstride]
    LOAD        m5, [srcq + i * blocksize + pixstride3]

    STORE       [dstq + i * 2 * blocksize], m2
%define pixstride  srcstrideq
%define pixstride3 sstride3q

    lea         coeffsregq, [hevc_epel_coeffs]
    mova        m0, [coeffsregq + mvfrac]
    lea         sstride3q, [srcstrideq + 2 * srcstrideq]
    LOAD        m2, [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD        m3, [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD        m4, [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD        m5, [srcq + i * 2 * blocksize + pixstride3]

%if block_truncated == 0
    STORE       [dstq + i * 2 * blocksize], m6
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg

cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
; hevc_epel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
;                  int16_t *src, ptrdiff_t srcstride,
;                  int height, int mx, int my, int *mcbuffer)

cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
;                                  int16_t *src, ptrdiff_t srcstride,
;                                  int height)
;
; %1: 0 - one source; 1 - two sources
%if %1
cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
%else
cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
%endif

%assign shift  14 + %1 - %3
%assign offset (1 << (shift - 1))
%define offset_data pw_ %+ offset

    mova        m0, [offset_data]
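    ; E.g. for 8-bit output from a single source: shift = 14 - 8 = 6 and
    ; offset = 32, i.e. each output pixel becomes roughly
    ; clip_pixel((src[x] + 32) >> 6); with two sources the shift grows
    ; by one and the inputs are summed first.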
%define STORE_BLOCK movu
%define STORE_HALF  movq

%assign pixel_max ((1 << %3) - 1)
%define pw_pixel_max pw_ %+ pixel_max

    mova        m2, [pw_pixel_max]

%define STORE_BLOCK movq
%define STORE_HALF  movd

%define STORE STORE_HALF
%define STORE STORE_BLOCK
    LOAD        m3, [srcq + 16 * i]
    AVG         m3, [src2q + 16 * i], %1, %3 - i * 8, m4

    STORE       [dstq + 8 * i], m3
    STORE       [dstq + 16 * i], m3

    add         src2q, srcstrideq
%macro PUT_WEIGHTED_PRED 3
%if %1
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
%else
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
%endif
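
    ; The scalar weights and offsets arrive as 16-bit values, so they are
    ; sign-extended before use; weight1/offset1 exist only when averaging
    ; two sources.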
    movsx       weight0d, weight0w
    movsx       offset0d, offset0w
%if %1
    movsx       weight1d, weight1w
    movsx       offset1d, offset1w
%endif
    add         denomd, 14 + %1 - %3

%assign pixel_max ((1 << %3) - 1)
%define pw_pixel_max pw_ %+ pixel_max

    mova        m5, [pw_pixel_max]
%if %1
    lea         offset0d, [offset0d + offset1d + 1]
%else
    lea         offset0d, [2 * offset0d + 1]
%endif
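    ; Both branches fold the rounding term into the offset up front:
    ; (2 * o0 + 1) (or (o0 + o1 + 1) for bi-prediction) shifted left by
    ; shift - 1 equals (o0 << shift) + (1 << (shift - 1)), so a single
    ; add covers both the offset and the rounding before the final
    ; right shift.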
    pmovsxwd    m6, [src0q + 8 * i]
    pmovsxwd    m7, [src1q + 8 * i]

    movq        [dstq + 8 * i], m6
    movd        [dstq + 4 * i], m6

    add         src0q, srcstrideq
%if %1
    add         src1q, srcstrideq
%endif
PUT_WEIGHTED_PRED 0,  4,  8
PUT_WEIGHTED_PRED 1,  4,  8
PUT_WEIGHTED_PRED 0,  8,  8
PUT_WEIGHTED_PRED 1,  8,  8
PUT_WEIGHTED_PRED 0, 12,  8
PUT_WEIGHTED_PRED 1, 12,  8
PUT_WEIGHTED_PRED 0, 16,  8
PUT_WEIGHTED_PRED 1, 16,  8
PUT_WEIGHTED_PRED 0, 24,  8
PUT_WEIGHTED_PRED 1, 24,  8
PUT_WEIGHTED_PRED 0, 32,  8
PUT_WEIGHTED_PRED 1, 32,  8
PUT_WEIGHTED_PRED 0, 48,  8
PUT_WEIGHTED_PRED 1, 48,  8
PUT_WEIGHTED_PRED 0, 64,  8
PUT_WEIGHTED_PRED 1, 64,  8

PUT_WEIGHTED_PRED 0,  4, 10
PUT_WEIGHTED_PRED 1,  4, 10
PUT_WEIGHTED_PRED 0,  8, 10
PUT_WEIGHTED_PRED 1,  8, 10
PUT_WEIGHTED_PRED 0, 12, 10
PUT_WEIGHTED_PRED 1, 12, 10
PUT_WEIGHTED_PRED 0, 16, 10
PUT_WEIGHTED_PRED 1, 16, 10
PUT_WEIGHTED_PRED 0, 24, 10
PUT_WEIGHTED_PRED 1, 24, 10
PUT_WEIGHTED_PRED 0, 32, 10
PUT_WEIGHTED_PRED 1, 32, 10
PUT_WEIGHTED_PRED 0, 48, 10
PUT_WEIGHTED_PRED 1, 48, 10
PUT_WEIGHTED_PRED 0, 64, 10
PUT_WEIGHTED_PRED 1, 64, 10