1 ;*****************************************************************************
2 ;* x86-optimized functions for w3fdif filter
4 ;* Copyright (c) 2015 Paul B Mahol
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; w3fdif_scale(out_pixel, work_pixel, linesize)
; Declared via x86inc's cglobal as: 3 arguments, 3 GPRs, 2 vector registers,
; 0 bytes of stack. Reads work data from work_pixel and advances out_pixel.
; NOTE(review): the embedded source line numbers jump (31 -> 37) and no loop
; label, branch or return is visible in this view, so the conversion of
; m0/m1, the store to out_pixel and the loop control are not shown here.
28 cglobal w3fdif_scale, 3, 3, 2, 0, out_pixel, work_pixel, linesize
; Load two full vectors (2*mmsize bytes) of work data. mova is the aligned
; move, so work_pixel is presumably mmsize-aligned -- confirm with the caller.
30 mova m0, [work_pixelq]
31 mova m1, [work_pixelq+mmsize]
; Advance the pointers: mmsize/2 bytes of output for 2*mmsize bytes of work
; data, i.e. four work bytes per output byte.
37 add out_pixelq, mmsize/2
38 add work_pixelq, mmsize*2
; linesize is used as a 32-bit down-counter, reduced by the same mmsize/2 step
; as out_pixel (presumably the number of outputs produced per pass).
39 sub linesized, mmsize/2
; w3fdif_simple_low(work_line, in_lines_cur0, coef, linesize)
; Declared via x86inc's cglobal as: 4 arguments, 5 GPRs, 6 vector registers,
; 0 bytes of stack. The fifth name (offset) is a scratch GPR, not an argument.
; Reads two input lines and writes 2*mmsize bytes to work_line per step.
; NOTE(review): the embedded source line numbers jump, so instructions are
; missing from this view (anything using coef, the setup/advance of offset,
; the arithmetic between load and store, and the loop control).
43 cglobal w3fdif_simple_low, 4, 5, 6, 0, work_line, in_lines_cur0, coef, linesize, offset
; Rename the registers: the third GPR (coef until now) becomes in_lines_cur1,
; so coef is no longer addressable by name past this point.
45 DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset
; in_lines_cur0 arrives as a pointer to an array of line pointers. Entry 1 is
; fetched first because the second load overwrites in_lines_cur0 with entry 0.
50 mov in_lines_cur1q, [in_lines_cur0q + gprsize]
51 mov in_lines_cur0q, [in_lines_cur0q]
; Load half a vector (mmsize/2 bytes) from each line at the current offset.
54 movh m2, [in_lines_cur0q+offsetq]
55 movh m3, [in_lines_cur1q+offsetq]
; Interleave the 16-bit words of m2 and m3 (m5 is the scratch register), so
; that each 32-bit lane holds one word from each source.
58 SBUTTERFLY wd, 2, 3, 5
; Store two full vectors to work_line. offset counts input bytes and is scaled
; by 4 here, matching 2*mmsize output bytes per mmsize/2 input bytes.
61 mova [work_lineq+offsetq*4], m2
62 mova [work_lineq+offsetq*4+mmsize], m3
; linesize is a 32-bit down-counter stepped by the mmsize/2 bytes consumed.
64 sub linesized, mmsize/2
; w3fdif_complex_low(work_line, in_lines_cur0, coef, linesize)
; Declared via x86inc's cglobal as: 4 arguments, 7 GPRs, 8 vector registers,
; 0 bytes of stack. Reads four input lines and writes 2*mmsize bytes to
; work_line per step.
; NOTE(review): the embedded source line numbers jump, so instructions are
; missing from this view (anything using coef, the setup/advance of offset,
; the arithmetic between load and store, and the loop control).
68 cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
; Rename the registers: the third GPR (coef until now) becomes in_lines_cur1,
; and three extra GPRs are named offset, in_lines_cur2 and in_lines_cur3.
70 DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3
; in_lines_cur0 arrives as a pointer to an array of four line pointers.
; Entries 3..1 are fetched first; entry 0 is loaded last because that load
; overwrites the array base held in in_lines_cur0.
75 mov in_lines_cur3q, [in_lines_cur0q+gprsize*3]
76 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
77 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
78 mov in_lines_cur0q, [in_lines_cur0q]
; First pair: half a vector from lines 0 and 1 at the current offset.
81 movh m4, [in_lines_cur0q+offsetq]
82 movh m5, [in_lines_cur1q+offsetq]
; Interleave the 16-bit words of m4 and m5 (m7 is the scratch register).
85 SBUTTERFLY wd, 4, 5, 7
; Second pair: half a vector from lines 2 and 3 at the same offset.
88 movh m6, [in_lines_cur2q+offsetq]
89 movh m3, [in_lines_cur3q+offsetq]
; Interleave the 16-bit words of m6 and m3 (m7 is reused as scratch).
92 SBUTTERFLY wd, 6, 3, 7
; Only m4/m5 are stored, so the m6/m3 pair is presumably folded into them by
; instructions not visible here -- confirm against the full file.
; offset counts input bytes and is scaled by 4 for the work_line index.
97 mova [work_lineq+offsetq*4], m4
98 mova [work_lineq+offsetq*4+mmsize], m5
; linesize is a 32-bit down-counter stepped by the mmsize/2 bytes consumed.
100 sub linesized, mmsize/2
; w3fdif_simple_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Declared via x86inc's cglobal as: 5 arguments, 9 GPRs, 8 vector registers,
; 0 bytes of stack. Reads three lines from each of two line-pointer arrays
; (cur and adj) and adds the result into the existing contents of work_line.
; NOTE(review): the embedded source line numbers jump, so instructions are
; missing from this view (anything using coef, the setup of offset, the
; arithmetic between the loads and the accumulate, and the loop control).
106 cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
; Rename the registers: the fourth GPR (coef until now) becomes in_lines_cur1,
; and four extra GPRs are named offset, in_lines_cur2, in_lines_adj1 and
; in_lines_adj2.
108 DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
; in_lines_cur0 and in_lines_adj0 arrive as pointers to arrays of line
; pointers. In each array the higher entries are fetched first; entry 0 is
; loaded last because that load overwrites the array base register.
113 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
114 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
115 mov in_lines_cur0q, [in_lines_cur0q]
116 mov in_lines_adj2q, [in_lines_adj0q+gprsize*2]
117 mov in_lines_adj1q, [in_lines_adj0q+gprsize]
118 mov in_lines_adj0q, [in_lines_adj0q]
; Pair 1: half a vector from cur lines 0 and 1 at the current offset.
121 movh m3, [in_lines_cur0q+offsetq]
122 movh m4, [in_lines_cur1q+offsetq]
; Interleave the 16-bit words of m3 and m4 (m1 is the scratch register).
125 SBUTTERFLY wd, 3, 4, 1
; Pair 2: half a vector from adj lines 0 and 1.
128 movh m5, [in_lines_adj0q+offsetq]
129 movh m6, [in_lines_adj1q+offsetq]
; Interleave the 16-bit words of m5 and m6 (m1 is reused as scratch).
132 SBUTTERFLY wd, 5, 6, 1
; Pair 3 reuses m5/m6 (cur line 2 with adj line 2), so the pair-2 values must
; already have been consumed by instructions not visible in this view.
137 movh m5, [in_lines_cur2q+offsetq]
138 movh m6, [in_lines_adj2q+offsetq]
141 SBUTTERFLY wd, 5, 6, 1
; Accumulate: add the 32-bit lanes already stored in work_line to m3/m4 and
; write the sums back to the same location (read-modify-write).
146 paddd m3, [work_lineq+offsetq*4]
147 paddd m4, [work_lineq+offsetq*4+mmsize]
148 mova [work_lineq+offsetq*4], m3
149 mova [work_lineq+offsetq*4+mmsize], m4
; Step to the next mmsize/2 input bytes; offset counts input bytes and is
; scaled by 4 when indexing work_line. linesize is a 32-bit down-counter.
150 add offsetq, mmsize/2
151 sub linesized, mmsize/2
; w3fdif_complex_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Declared via x86inc's cglobal as: 5 arguments, 13 GPRs, 10 vector registers,
; 0 bytes of stack. Reads five lines from each of two line-pointer arrays
; (cur and adj) and adds the result into the existing contents of work_line.
; It uses m8/m9 and 13 GPRs, which are only available on x86-64.
; NOTE(review): any architecture guard around this function is not visible in
; this view. The embedded source line numbers also jump, so instructions are
; missing (anything using coef, the setup of offset, the arithmetic between
; the loads and the accumulate, and the loop control / return).
155 cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
; Rename the registers: the fourth GPR (coef until now) becomes in_lines_cur1,
; and eight extra GPRs are named offset, in_lines_cur2..4 and in_lines_adj1..4.
158 DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3, in_lines_cur4, in_lines_adj1, in_lines_adj2, in_lines_adj3, in_lines_adj4
; in_lines_cur0 and in_lines_adj0 arrive as pointers to arrays of five line
; pointers. In each array entries 4..1 are fetched first; entry 0 is loaded
; last because that load overwrites the array base register.
164 mov in_lines_cur4q, [in_lines_cur0q+gprsize*4]
165 mov in_lines_cur3q, [in_lines_cur0q+gprsize*3]
166 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
167 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
168 mov in_lines_cur0q, [in_lines_cur0q]
169 mov in_lines_adj4q, [in_lines_adj0q+gprsize*4]
170 mov in_lines_adj3q, [in_lines_adj0q+gprsize*3]
171 mov in_lines_adj2q, [in_lines_adj0q+gprsize*2]
172 mov in_lines_adj1q, [in_lines_adj0q+gprsize]
173 mov in_lines_adj0q, [in_lines_adj0q]
; Pair 1: half a vector from cur lines 0 and 1; m5/m6 are the registers that
; are accumulated into work_line at the end of the step.
176 movh m5, [in_lines_cur0q+offsetq]
177 movh m6, [in_lines_cur1q+offsetq]
; Interleave the 16-bit words of m5 and m6 (m2 is the scratch register).
180 SBUTTERFLY wd, 5, 6, 2
; Pairs 2-5 all reuse m8/m9 (with m2 as scratch), so each pair must be
; consumed by instructions not visible here before the next pair is loaded.
; Pair 2: cur lines 2 and 3.
183 movh m8, [in_lines_cur2q+offsetq]
184 movh m9, [in_lines_cur3q+offsetq]
187 SBUTTERFLY wd, 8, 9, 2
; Pair 3: adj lines 0 and 1.
192 movh m8, [in_lines_adj0q+offsetq]
193 movh m9, [in_lines_adj1q+offsetq]
196 SBUTTERFLY wd, 8, 9, 2
; Pair 4: adj lines 2 and 3.
201 movh m8, [in_lines_adj2q+offsetq]
202 movh m9, [in_lines_adj3q+offsetq]
205 SBUTTERFLY wd, 8, 9, 2
; Pair 5: cur line 4 with adj line 4.
210 movh m8, [in_lines_cur4q+offsetq]
211 movh m9, [in_lines_adj4q+offsetq]
214 SBUTTERFLY wd, 8, 9, 2
; Accumulate: add the 32-bit lanes already stored in work_line to m5/m6 and
; write the sums back to the same location (read-modify-write).
219 paddd m5, [work_lineq+offsetq*4]
220 paddd m6, [work_lineq+offsetq*4+mmsize]
221 mova [work_lineq+offsetq*4], m5
222 mova [work_lineq+offsetq*4+mmsize], m6
; Step to the next mmsize/2 input bytes; offset counts input bytes and is
; scaled by 4 when indexing work_line. linesize is a 32-bit down-counter.
223 add offsetq, mmsize/2
224 sub linesized, mmsize/2