1 ;******************************************************************************
2 ;* x86-optimized functions for the CFHD decoder
3 ;* Copyright (c) 2020 Paul B Mahol
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Interleaved 16-bit coefficient pairs. The filter code in this file uses them
; as pmaddwd multipliers, so every adjacent word pair of the other operand is
; scaled by (a, b) and summed into one dword lane.
; Each table is exactly 8 words (16 bytes). The lists are written without a
; trailing comma so that no empty final operand is handed to the assembler.
26 factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1
27 factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1
28 factor_p11_n4: dw 11, -4, 11, -4, 11, -4, 11, -4
29 factor_p5_p4: dw 5, 4, 5, 4, 5, 4, 5, 4
; Word-broadcast upper bounds passed to CLIPW as pw_%1 by CFHD_HORIZ_FILTER
; (1023 and 4095 are the two values the macro is instantiated with below).
33 pw_1023: times 8 dw 1023
34 pw_4095: times 8 dw 4095
; CFHD_HORIZ_FILTER <max>
; Emits a horizontal filter routine that reads 16-bit samples from `low` and
; `high` and writes 16-bit samples to `output`. The single macro parameter is
; pasted into the constant name pw_%1, which is the upper bound handed to CLIPW.
; NOTE(review): this view of the file is sparse - the %if/%else lines that
; choose between the cglobal declarations below, the arithmetic between the
; loads and stores, the loop labels/branches and the closing %endmacro are not
; visible here. The leading decimal number on every line looks like extraction
; residue rather than assembler input - confirm before assembling.
38 %macro CFHD_HORIZ_FILTER 1
; Entry point cfhd_horiz_filter_clip10: 5 named arguments, 6 GPRs, and 8 vector
; registers plus 4 more when ARCH_X86_64 is 1.
40 cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, bpc
; Re-label the registers: the slot that arrived as `bpc` is reused as `x`, and
; the sixth GPR is named `temp`.
41 DEFINE_ARGS output, low, high, width, x, temp
; The clip entry points have no stride arguments, so the stride names used
; further down are all aliased to the width register.
43 %define ostrideq widthq
44 %define lwidthq widthq
45 %define hwidthq widthq
; Entry point cfhd_horiz_filter_clip12: same register layout as the clip10 one.
47 cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, bpc
48 DEFINE_ARGS output, low, high, width, x, temp
50 %define ostrideq widthq
51 %define lwidthq widthq
52 %define hwidthq widthq
; Generic entry point, 11-GPR / 12-vector-register form: strides and height are
; separate arguments, and x, y, temp are extra scratch registers.
55 cglobal cfhd_horiz_filter, 11, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height
56 DEFINE_ARGS output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
; Generic entry point, 7-GPR / 8-vector-register form: the positional slots that
; hold ostride, lwidth and hwidth above are named x, y and temp here instead.
; NOTE(review): presumably the 32-bit build - the selecting %if is not visible.
65 cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
; Keep the +1/-1 and -1/+1 factor tables resident in m8/m9.
; NOTE(review): m8/m9 only exist in the forms declaring more than 8 vector
; registers; the guard around these two loads is not visible here.
85 mova m8, [factor_p1_n1]
86 mova m9, [factor_n1_p1]
; --- scalar handling of the first output sample ---
; Sign-extending word loads of low[1], low[2] and high[0] (byte offsets 2, 4, 0).
97 movsx tempq, word [lowq + 2]
101 movsx xq, word [lowq + 4]
106 movsx xq, word [highq]
; Clamp the words of m0 between pw_0 and pw_%1.
112 CLIPW m0, [pw_0], [pw_%1]
; Store the 16-bit result as output[0].
115 mov word [outputq], tempw
; --- scalar handling of the second output sample ---
; Same inputs again: low[0], low[1], low[2] and high[0].
117 movsx xq, word [lowq]
120 movsx tempq, word [lowq + 2]
124 movsx xq, word [lowq + 4]
129 movsx xq, word [highq]
135 CLIPW m0, [pw_0], [pw_%1]
; Store the 16-bit result as output[1] (byte offset 2).
138 mov word [outputq + 2], tempw
; --- vector section ---
; Unaligned load from low at byte offset x + 4.
144 movu m1, [lowq + xq + 4]
; Multiply adjacent word pairs by (+1,-1) in m4/m5 and by (-1,+1) in m6/m7,
; summing each pair into a dword.
164 pmaddwd m4, [factor_p1_n1]
165 pmaddwd m5, [factor_p1_n1]
166 pmaddwd m6, [factor_n1_p1]
167 pmaddwd m7, [factor_n1_p1]
; Unaligned loads from low and high at byte offset x + 2.
180 movu m2, [lowq + xq + 2]
181 movu m3, [highq + xq + 2]
198 pmaddwd m1, [factor_p1_n1]
199 pmaddwd m3, [factor_p1_n1]
; Clamp both result vectors between pw_0 and pw_%1 before storing.
220 CLIPW m2, [pw_0], [pw_%1]
221 CLIPW m0, [pw_0], [pw_%1]
; Two full vectors are written per step, starting 4 bytes into the row (the
; first two samples were stored by the scalar code above). The output index is
; x * 2, i.e. output advances twice as fast as the low/high index.
224 movu [outputq + xq * 2 + 4], m2
225 movu [outputq + xq * 2 + mmsize + 4], m0
; --- scalar handling of the second-to-last output sample ---
; Negative offsets: reads low[-1], low[-2], low[-3] and high[-1].
; NOTE(review): presumably the pointers have been moved to the end of the row by
; code not shown here - confirm.
236 movsx xq, word [lowq - 2]
239 movsx tempq, word [lowq - 4]
243 movsx xq, word [lowq - 6]
248 movsx xq, word [highq - 2]
254 CLIPW m0, [pw_0], [pw_%1]
; Store the 16-bit result as output[-2] (byte offset -4).
257 mov word [outputq - 4], tempw
; --- scalar handling of the last output sample ---
259 movsx xq, word [lowq - 2]
262 movsx tempq, word [lowq - 4]
266 movsx xq, word [lowq - 6]
271 movsx xq, word [highq - 2]
277 CLIPW m0, [pw_0], [pw_%1]
; Store the 16-bit result as output[-1] (byte offset -2).
280 mov word [outputq - 2], tempw
; Advance output by ostride twice, i.e. by 2 * ostride bytes.
; NOTE(review): presumably ostride counts 16-bit samples - confirm.
290 add outputq, ostrideq
291 add outputq, ostrideq
; Instantiate the horizontal filter with an upper clip bound of 1023; inside the
; macro the argument is pasted into pw_%1, selecting the pw_1023 constant.
303 CFHD_HORIZ_FILTER 1023
; Instantiate it again with an upper clip bound of 4095 (pw_4095).
306 CFHD_HORIZ_FILTER 4095
; cfhd_vert_filter
; Vertical filter: combines 16-bit samples read from `low` and `high` and writes
; the results to `output`, always addressing memory at byte offset `pos`.
; NOTE(review): this view of the file is sparse - the %if/%else lines choosing
; between the two cglobal declarations, the pointer updates between consecutive
; loads/stores, the loop labels/branches and the return are not visible here.
; The leading decimal number on every line looks like extraction residue rather
; than assembler input - confirm before assembling.
;
; 11-GPR / 14-vector-register form: x, y and pos are extra scratch registers.
310 cglobal cfhd_vert_filter, 11, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height
311 DEFINE_ARGS output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
; Keep all four factor tables resident in m8, m9, m12 and m13.
319 mova m8, [factor_p1_n1]
320 mova m9, [factor_n1_p1]
323 mova m12, [factor_p11_n4]
324 mova m13, [factor_p5_p4]
; 7-GPR / 8-vector-register form: the positional slots that hold ostride, lwidth
; and hwidth above are named x, y and pos here instead.
; NOTE(review): presumably the 32-bit build - the selecting %if is not visible.
326 cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
; In this form heightq is not a register: it resolves to widthm, the memory
; operand of the `width` argument.
343 %define heightq widthm
; --- first edge output: taps (11, -4) ---
; NOTE(review): the repeated loads from [lowq + posq] presumably read different
; rows, with lowq stepped in between by lines not shown - confirm.
352 movu m0, [lowq + posq]
354 movu m1, [lowq + posq]
; Multiply adjacent word pairs by (11, -4) and sum each pair into a dword.
363 pmaddwd m0, [factor_p11_n4]
364 pmaddwd m2, [factor_p11_n4]
369 movu m1, [lowq + posq]
; Bring in the matching samples from the high band.
388 movu m1, [highq + posq]
404 movu [outputq + posq], m0
; --- second edge output: taps (5, 4) ---
406 movu m0, [lowq + posq]
408 movu m1, [lowq + posq]
; Multiply adjacent word pairs by (5, 4) and sum each pair into a dword.
417 pmaddwd m0, [factor_p5_p4]
418 pmaddwd m2, [factor_p5_p4]
423 movu m1, [lowq + posq]
442 movu m1, [highq + posq]
459 movu [outputq + posq], m0
; --- interior section: taps (+1, -1) and (-1, +1) ---
468 movu m4, [lowq + posq]
472 movu m1, [lowq + posq]
492 pmaddwd m4, [factor_p1_n1]
493 pmaddwd m5, [factor_p1_n1]
494 pmaddwd m6, [factor_n1_p1]
495 pmaddwd m7, [factor_n1_p1]
509 movu m0, [lowq + posq]
514 movu m1, [highq + posq]
531 pmaddwd m1, [factor_p1_n1]
532 pmaddwd m3, [factor_p1_n1]
; Two result vectors (m0, then m1) are stored per pass.
; NOTE(review): both stores use [outputq + posq]; outputq is presumably advanced
; between them by a line not shown - confirm.
553 movu [outputq + posq], m0
555 movu [outputq + posq], m1
; --- closing edge, first output: taps (5, 4) ---
; The two closing outputs use the same factor tables as the opening edge, in
; the opposite order.
564 movu m0, [lowq + posq]
566 movu m1, [lowq + posq]
575 pmaddwd m0, [factor_p5_p4]
576 pmaddwd m2, [factor_p5_p4]
581 movu m1, [lowq + posq]
607 movu m1, [highq + posq]
627 movu [outputq + posq], m0
; --- closing edge, second output: taps (11, -4) ---
632 movu m0, [lowq + posq]
634 movu m1, [lowq + posq]
643 pmaddwd m0, [factor_p11_n4]
644 pmaddwd m2, [factor_p11_n4]
649 movu m1, [lowq + posq]
675 movu m1, [highq + posq]
696 movu [outputq + posq], m0