1 ;*****************************************************************************
2 ;* x86-optimized functions for w3fdif filter
4 ;* Copyright (c) 2015 Paul B Mahol
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; w3fdif_scale(out_pixel, work_pixel, linesize)
; Declared via x86inc's cglobal with 3 arguments, 3 general-purpose registers,
; 2 vector registers and a stack size of 0.
; The visible pointer steps consume 2*mmsize bytes of work_pixel for every
; mmsize/2 bytes of out_pixel, i.e. four work bytes per output byte.
; NOTE(review): the embedded listing numbers skip (31 -> 37), so the
; instructions between the loads and the pointer updates, as well as the loop
; label, branch and return, are not visible in this excerpt.
28 cglobal w3fdif_scale, 3, 3, 2, 0, out_pixel, work_pixel, linesize
; Load two consecutive full-width vectors from the work buffer.
30 mova m0, [work_pixelq]
31 mova m1, [work_pixelq+mmsize]
; Step the output pointer by half a vector and the work pointer by two vectors.
37 add out_pixelq, mmsize/2
38 add work_pixelq, mmsize*2
; linesize is decremented by the same amount out_pixel was advanced.
39 sub linesized, mmsize/2
; w3fdif_simple_low(work_line, in_lines_cur0, coef, linesize)
; Declared with 4 arguments, 5 general-purpose registers (the fifth is named
; offset) and 6 vector registers.
; On entry in_lines_cur0 points at a table of line pointers; the code below
; replaces it with the first table entry and loads the second entry as well.
; NOTE(review): the embedded listing numbers skip, so the coefficient load,
; the offset initialisation, the arithmetic between interleave and store, and
; the loop control are not visible in this excerpt.
43 cglobal w3fdif_simple_low, 4, 5, 6, 0, work_line, in_lines_cur0, coef, linesize, offset
; Rename the registers: the third register (previously coef) is now called
; in_lines_cur1.
45 DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset
; Fetch table entry 1 first, because the next instruction overwrites the
; table base register with table entry 0.
50 mov in_lines_cur1q, [in_lines_cur0q + gprsize]
51 mov in_lines_cur0q, [in_lines_cur0q]
; Load half a vector from each of the two input lines at the current offset.
54 movh m2, [in_lines_cur0q+offsetq]
55 movh m3, [in_lines_cur1q+offsetq]
; x86util SBUTTERFLY on words: interleaves m2 with m3, using m5 as scratch.
58 SBUTTERFLY wd, 2, 3, 5
; Store two full vectors; the work_line index is offset*4, so each input byte
; position maps to four bytes in work_line.
61 mova [work_lineq+offsetq*4], m2
62 mova [work_lineq+offsetq*4+mmsize], m3
; linesize is decremented by mmsize/2 per pass.
64 sub linesized, mmsize/2
; w3fdif_complex_low(work_line, in_lines_cur0, coef, linesize)
; Declared with 4 arguments, 7 general-purpose registers and 8 vector
; registers. Same visible structure as the two-line low-pass routine, but
; reading four input lines instead of two.
; NOTE(review): the embedded listing numbers skip, so the coefficient load,
; the offset initialisation, the arithmetic that combines the two interleaved
; pairs, and the loop control are not visible in this excerpt.
68 cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
; Rename the registers: the coef register becomes in_lines_cur1 and three
; extra registers are named offset, in_lines_cur2 and in_lines_cur3.
70 DEFINE_ARGS work_line, in_lines_cur0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3
; Read the four line pointers from the table addressed by in_lines_cur0.
; Entry 0 is loaded last because it overwrites the table base register.
75 mov in_lines_cur3q, [in_lines_cur0q+gprsize*3]
76 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
77 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
78 mov in_lines_cur0q, [in_lines_cur0q]
; First pair: half-vector loads from lines 0 and 1 at the current offset.
81 movh m4, [in_lines_cur0q+offsetq]
82 movh m5, [in_lines_cur1q+offsetq]
; x86util SBUTTERFLY on words: interleaves m4 with m5, using m7 as scratch.
85 SBUTTERFLY wd, 4, 5, 7
; Second pair: half-vector loads from lines 2 and 3 at the current offset.
88 movh m6, [in_lines_cur2q+offsetq]
89 movh m3, [in_lines_cur3q+offsetq]
; Interleave m6 with m3, again using m7 as scratch.
92 SBUTTERFLY wd, 6, 3, 7
; Store two full vectors (m4, m5) at work_line + offset*4.
97 mova [work_lineq+offsetq*4], m4
98 mova [work_lineq+offsetq*4+mmsize], m5
; linesize is decremented by mmsize/2 per pass.
100 sub linesized, mmsize/2
; w3fdif_simple_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Reads three lines from each of two pointer tables (in_lines_cur0 and
; in_lines_adj0) and adds the result into the values already in work_line.
;
; Two alternative declarations and code paths appear back to back below:
;   * a 9-register form that keeps linesize and offset in registers, and
;   * a 7-register form that loads only 4 arguments, keeps linesize in memory
;     and reuses in_lines_cur0q as the running offset.
; NOTE(review): the embedded listing numbers skip exactly where the
; alternatives meet (105 -> 107, 111 -> 114, 139 -> 141, 171 -> 173), so these
; are presumably selected by preprocessor conditionals that are not visible
; in this excerpt - confirm against the full file. The coefficient handling,
; the arithmetic between the interleaves, and the loop control are likewise
; not visible.
;
; Form A: 5 arguments, 9 general-purpose registers, 8 vector registers.
105 cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
; Form B: 4 arguments loaded, 7 general-purpose registers, 8 vector registers.
107 cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
; Form A register names: coef's register becomes in_lines_cur1; linesize keeps
; its register and offset gets a register of its own.
111 DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
; Form B register names: no linesize or offset register; the fifth register is
; used for in_lines_cur2 instead.
114 DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
; Form B: linesized refers to x86inc's memory operand for argument 4 rather
; than to a register.
115 %define linesized r4mp
; Read three line pointers from each table. Entry 0 of each table is loaded
; last because it overwrites that table's base register.
121 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
122 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
123 mov in_lines_cur0q, [in_lines_cur0q]
124 mov in_lines_adj2q, [in_lines_adj0q+gprsize*2]
125 mov in_lines_adj1q, [in_lines_adj0q+gprsize]
126 mov in_lines_adj0q, [in_lines_adj0q]
; Form B: turn every other line pointer into a distance from in_lines_cur0,
; then alias offsetq to in_lines_cur0q. With that alias, [ptr+offsetq]
; addresses the original line at the position in_lines_cur0q has reached.
129 sub in_lines_cur1q, in_lines_cur0q
130 sub in_lines_cur2q, in_lines_cur0q
131 sub in_lines_adj0q, in_lines_cur0q
132 sub in_lines_adj1q, in_lines_cur0q
133 sub in_lines_adj2q, in_lines_cur0q
134 %define offsetq in_lines_cur0q
; Load half a vector from line cur0: indexed by offset in form A ...
139 movh m3, [in_lines_cur0q+offsetq]
; ... and read directly in form B, where in_lines_cur0q is itself the
; running pointer.
141 movh m3, [in_lines_cur0q]
143 movh m4, [in_lines_cur1q+offsetq]
; x86util SBUTTERFLY on words: interleaves m3 with m4, using m1 as scratch.
146 SBUTTERFLY wd, 3, 4, 1
; Pair from the adjacent table: lines adj0 and adj1.
149 movh m5, [in_lines_adj0q+offsetq]
150 movh m6, [in_lines_adj1q+offsetq]
153 SBUTTERFLY wd, 5, 6, 1
; Third pair reuses m5/m6: line cur2 with line adj2.
158 movh m5, [in_lines_cur2q+offsetq]
159 movh m6, [in_lines_adj2q+offsetq]
162 SBUTTERFLY wd, 5, 6, 1
; Form A: add the existing work_line contents at offset*4 to m3/m4 and write
; the sums back (read-modify-write accumulation).
168 paddd m3, [work_lineq+offsetq*4]
169 paddd m4, [work_lineq+offsetq*4+mmsize]
170 mova [work_lineq+offsetq*4], m3
171 mova [work_lineq+offsetq*4+mmsize], m4
; Form B: same accumulation, but work_lineq is used unindexed and advanced by
; two vectors per pass because there is no separate offset register.
173 paddd m3, [work_lineq]
174 paddd m4, [work_lineq+mmsize]
175 mova [work_lineq], m3
176 mova [work_lineq+mmsize], m4
177 add work_lineq, mmsize*2
; Advance the input position by half a vector (in form B this advances
; in_lines_cur0q through the alias) and count linesize down by the same step.
179 add offsetq, mmsize/2
180 sub linesized, mmsize/2
; w3fdif_complex_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Declared with 5 arguments, 13 general-purpose registers and 10 vector
; registers. Reads five lines from each of two pointer tables (in_lines_cur0
; and in_lines_adj0) and adds the result into the values already in work_line.
; NOTE(review): the embedded listing numbers skip and the excerpt ends right
; after the counter update, so the coefficient handling, the offset
; initialisation, the arithmetic that folds the m8/m9 pairs into m5/m6, the
; loop branch and the return are not visible here.
186 cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
; Rename the registers: coef's register becomes in_lines_cur1, and eight
; further registers are named for offset and the remaining line pointers.
189 DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_cur3, in_lines_cur4, in_lines_adj1, in_lines_adj2, in_lines_adj3, in_lines_adj4
; Read five line pointers from the in_lines_cur0 table. Entry 0 is loaded last
; because it overwrites the table base register.
195 mov in_lines_cur4q, [in_lines_cur0q+gprsize*4]
196 mov in_lines_cur3q, [in_lines_cur0q+gprsize*3]
197 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
198 mov in_lines_cur1q, [in_lines_cur0q+gprsize]
199 mov in_lines_cur0q, [in_lines_cur0q]
; Same for the in_lines_adj0 table.
200 mov in_lines_adj4q, [in_lines_adj0q+gprsize*4]
201 mov in_lines_adj3q, [in_lines_adj0q+gprsize*3]
202 mov in_lines_adj2q, [in_lines_adj0q+gprsize*2]
203 mov in_lines_adj1q, [in_lines_adj0q+gprsize]
204 mov in_lines_adj0q, [in_lines_adj0q]
; Pair 1: lines cur0 and cur1, half-vector loads at the current offset.
207 movh m5, [in_lines_cur0q+offsetq]
208 movh m6, [in_lines_cur1q+offsetq]
; x86util SBUTTERFLY on words: interleaves m5 with m6, using m2 as scratch.
211 SBUTTERFLY wd, 5, 6, 2
; Pair 2: lines cur2 and cur3. m8/m9 are reloaded for every following pair.
214 movh m8, [in_lines_cur2q+offsetq]
215 movh m9, [in_lines_cur3q+offsetq]
218 SBUTTERFLY wd, 8, 9, 2
; Pair 3: lines adj0 and adj1.
223 movh m8, [in_lines_adj0q+offsetq]
224 movh m9, [in_lines_adj1q+offsetq]
227 SBUTTERFLY wd, 8, 9, 2
; Pair 4: lines adj2 and adj3.
232 movh m8, [in_lines_adj2q+offsetq]
233 movh m9, [in_lines_adj3q+offsetq]
236 SBUTTERFLY wd, 8, 9, 2
; Pair 5: line cur4 with line adj4.
241 movh m8, [in_lines_cur4q+offsetq]
242 movh m9, [in_lines_adj4q+offsetq]
245 SBUTTERFLY wd, 8, 9, 2
; Add the existing work_line contents at offset*4 to m5/m6 and write the sums
; back (read-modify-write accumulation).
250 paddd m5, [work_lineq+offsetq*4]
251 paddd m6, [work_lineq+offsetq*4+mmsize]
252 mova [work_lineq+offsetq*4], m5
253 mova [work_lineq+offsetq*4+mmsize], m6
; Advance the input offset by half a vector and count linesize down by the
; same step.
254 add offsetq, mmsize/2
255 sub linesized, mmsize/2