1 ;******************************************************************************
2 ;* x86-optimized functions for the CFHD decoder
3 ;* Copyright (c) 2020 Paul B Mahol
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Coefficient tables: each holds eight 16-bit words (16 bytes) laid out as four
; repeated (c0, c1) pairs. Used as a pmaddwd operand, every pair contributes
; c0*w0 + c1*w1 to one 32-bit lane of the result:
;   factor_p1_n1  ->     w0 -   w1
;   factor_n1_p1  ->    -w0 +   w1
;   factor_p11_n4 ->  11*w0 - 4*w1
;   factor_p5_p4  ->   5*w0 + 4*w1
; NOTE(review): these are also loaded with mova below, which needs 16-byte
; alignment -- confirm the enclosing section guarantees it.
26 factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
27 factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
28 factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4,
29 factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4,
; Upper clamp bounds, selected by name as pw_%1 inside CFHD_HORIZ_FILTER:
; eight words each of 1023 (2^10 - 1) and 4095 (2^12 - 1).
33 pw_1023: times 8 dw 1023
34 pw_4095: times 8 dw 4095
;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER %1
;
; Emits a horizontal filter entry point. %1 is pasted into the clamp constant
; name (pw_%1), so an invocation with 1023 clamps against pw_1023 and one with
; 4095 against pw_4095.
;
; cglobal arguments follow the x86inc convention as used here:
;   name, argument count, GPR count, vector register count, register names
; (NOTE(review): confirm against x86inc.asm, which is not part of this file).
;------------------------------------------------------------------------------
38 %macro CFHD_HORIZ_FILTER 1
; 10-bit clamp variant: 5 arguments in 6 GPRs (temp is the extra scratch one).
; Vector register count is 8, plus 4 more when ARCH_X86_64 is 1.
40 cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
; This variant has no stride arguments, so the stride names used by the
; shared code are aliased to the single width register.
42 %define ostrideq widthq
43 %define lwidthq widthq
44 %define hwidthq widthq
; 12-bit clamp variant: same register layout and aliases as clip10.
46 cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
48 %define ostrideq widthq
49 %define lwidthq widthq
50 %define hwidthq widthq
; Full variant, 8 arguments in 11 GPRs with 12 vector registers: every stride
; (ostride, lwidth, hwidth) has its own register, plus x, y, temp as scratch.
53 cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
; Reduced variant, 7 GPRs and 8 vector registers: the slots that carry
; ostride, lwidth and hwidth above are named x, y and temp here.
; NOTE(review): presumably the non-x86-64 build of the same function -- confirm
; against the surrounding conditional.
62 cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
; Keep the two +/-1 pmaddwd patterns in m8/m9 (registers above m7 only exist
; on x86-64).
82 mova m8, [factor_p1_n1]
83 mova m9, [factor_n1_p1]
; Scalar code for the output word at byte offset 0: 16-bit samples from low
; and high are sign-extended into GPRs.
94 movsx tempq, word [lowq + 2]
98 movsx xq, word [lowq + 4]
103 movsx xq, word [highq]
; Clamp m0 between pw_0 and pw_%1 (CLIPW is defined outside this file).
109 CLIPW m0, [pw_0], [pw_%1]
112 mov word [outputq], tempw
; Scalar code for the output word at byte offset 2, reading the same low/high
; sample positions as above.
114 movsx xq, word [lowq]
117 movsx tempq, word [lowq + 2]
121 movsx xq, word [lowq + 4]
126 movsx xq, word [highq]
132 CLIPW m0, [pw_0], [pw_%1]
135 mov word [outputq + 2], tempw
; Vector section: unaligned loads of low/high at byte offset xq plus a small
; displacement.
141 movu m1, [lowq + xq + 4]
; Adjacent-word differences via pmaddwd with the +1/-1 and -1/+1 patterns,
; here taken straight from memory rather than from m8/m9.
161 pmaddwd m4, [factor_p1_n1]
162 pmaddwd m5, [factor_p1_n1]
163 pmaddwd m6, [factor_n1_p1]
164 pmaddwd m7, [factor_n1_p1]
177 movu m2, [lowq + xq + 2]
178 movu m3, [highq + xq + 2]
195 pmaddwd m1, [factor_p1_n1]
196 pmaddwd m3, [factor_p1_n1]
; Clamp both result vectors to [pw_0, pw_%1].
217 CLIPW m2, [pw_0], [pw_%1]
218 CLIPW m0, [pw_0], [pw_%1]
; Two vectors (2 * mmsize bytes) are stored per step. The output byte offset
; is xq * 2 and starts 4 bytes in, i.e. after the two scalar-written words.
221 movu [outputq + xq * 2 + 4], m2
222 movu [outputq + xq * 2 + mmsize + 4], m0
; Scalar tail: samples are addressed with negative displacements and the two
; results land at outputq - 4 and outputq - 2.
; NOTE(review): presumably lowq/highq/outputq point just past the row end at
; this point -- confirm against the pointer updates.
233 movsx xq, word [lowq - 2]
236 movsx tempq, word [lowq - 4]
240 movsx xq, word [lowq - 6]
245 movsx xq, word [highq - 2]
251 CLIPW m0, [pw_0], [pw_%1]
254 mov word [outputq - 4], tempw
256 movsx xq, word [lowq - 2]
259 movsx tempq, word [lowq - 4]
263 movsx xq, word [lowq - 6]
268 movsx xq, word [highq - 2]
274 CLIPW m0, [pw_0], [pw_%1]
277 mov word [outputq - 2], tempw
; outputq advances by ostrideq twice.
287 add outputq, ostrideq
288 add outputq, ostrideq
; Instantiate the horizontal filter with clamp bound pw_1023.
300 CFHD_HORIZ_FILTER 1023
; Instantiate the horizontal filter with clamp bound pw_4095.
303 CFHD_HORIZ_FILTER 4095
;------------------------------------------------------------------------------
; cfhd_vert_filter
;
; Vertical filter: combines vectors loaded from low and high at byte offset
; posq and stores the results to output at the same offset.
;
; cglobal arguments follow the x86inc convention as used here:
;   name, argument count, GPR count, vector register count, register names
; (NOTE(review): confirm against x86inc.asm, which is not part of this file).
;------------------------------------------------------------------------------
; Full variant: 8 arguments in 11 GPRs (x, y, pos are scratch) and 14 vector
; registers.
307 cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
; Keep all four pmaddwd coefficient patterns in registers (m8 and above only
; exist on x86-64).
315 mova m8, [factor_p1_n1]
316 mova m9, [factor_n1_p1]
319 mova m12, [factor_p11_n4]
320 mova m13, [factor_p5_p4]
; Reduced variant, 7 GPRs and 8 vector registers: the slots that carry
; ostride, lwidth and hwidth above are named x, y and pos here.
; NOTE(review): presumably the non-x86-64 build of the same function -- confirm
; against the surrounding conditional.
322 cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
; In this variant heightq is an alias for widthm rather than a register of
; its own.
; NOTE(review): widthm should be x86inc's memory-operand form of the width
; argument, used here as storage for the row counter -- confirm.
339 %define heightq widthm
; First output group: low samples weighted with the (11, -4) pattern.
; NOTE(review): the repeated [lowq + posq] loads only differ if lowq is
; stepped between them -- confirm against the pointer updates.
348 movu m0, [lowq + posq]
350 movu m1, [lowq + posq]
359 pmaddwd m0, [factor_p11_n4]
360 pmaddwd m2, [factor_p11_n4]
365 movu m1, [lowq + posq]
384 movu m1, [highq + posq]
400 movu [outputq + posq], m0
; Second output group: low samples weighted with the (5, 4) pattern.
402 movu m0, [lowq + posq]
404 movu m1, [lowq + posq]
413 pmaddwd m0, [factor_p5_p4]
414 pmaddwd m2, [factor_p5_p4]
419 movu m1, [lowq + posq]
438 movu m1, [highq + posq]
455 movu [outputq + posq], m0
; Group using only the +1/-1 and -1/+1 difference patterns; it produces two
; result vectors (m0 and m1).
464 movu m4, [lowq + posq]
468 movu m1, [lowq + posq]
488 pmaddwd m4, [factor_p1_n1]
489 pmaddwd m5, [factor_p1_n1]
490 pmaddwd m6, [factor_n1_p1]
491 pmaddwd m7, [factor_n1_p1]
505 movu m0, [lowq + posq]
510 movu m1, [highq + posq]
527 pmaddwd m1, [factor_p1_n1]
528 pmaddwd m3, [factor_p1_n1]
; NOTE(review): both stores use the same address expression; presumably
; outputq is stepped between them -- confirm.
549 movu [outputq + posq], m0
551 movu [outputq + posq], m1
; Closing groups mirror the opening ones in reverse order: the (5, 4) pattern
; first, then (11, -4).
560 movu m0, [lowq + posq]
562 movu m1, [lowq + posq]
571 pmaddwd m0, [factor_p5_p4]
572 pmaddwd m2, [factor_p5_p4]
577 movu m1, [lowq + posq]
603 movu m1, [highq + posq]
623 movu [outputq + posq], m0
628 movu m0, [lowq + posq]
630 movu m1, [lowq + posq]
639 pmaddwd m0, [factor_p11_n4]
640 pmaddwd m2, [factor_p11_n4]
645 movu m1, [lowq + posq]
671 movu m1, [highq + posq]
692 movu [outputq + posq], m0