1 ;******************************************************************************
2 ;* VP9 MC SIMD optimizations
4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
;------------------------------------------------------------------------------
; filter_h4_fn %1[, %2=12]
; Emits the 4-pixel-wide horizontal 8-tap subpel filter for variant %1
; (e.g. put/avg); %2 = XMM register count passed to cglobal.
; Two entry points are generated: the _10 (10-bit) version with the real
; body, and a _12 (12-bit) version that tail-jumps into the shared .body
; label of the _10 version (bit depth presumably only changes setup done
; before .body — TODO confirm against the full file).
; NOTE(review): this excerpt elides many lines of the macro body; the
; comments below cover only the instructions that are visible here.
;------------------------------------------------------------------------------
34 %macro filter_h4_fn 1-2 12
35 cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
; x86-64 pre-SSE4 path (contents elided in this excerpt)
38 %if notcpuflag(sse4) && ARCH_X86_64
; Load the four pairs of 8-tap coefficients (32 bytes apart in filteryq).
; m7 always caches the first pair; m8-m10 cache the rest only when enough
; XMM registers exist (64-bit, vector regs wider than MMX).
42 mova m7, [filteryq+ 0]
43 %if ARCH_X86_64 && mmsize > 8
44 mova m8, [filteryq+32]
45 mova m9, [filteryq+64]
46 mova m10, [filteryq+96]
; Coefficient application (surrounding lines elided): the %if branches
; select between cached coefficient registers and memory operands; the
; visible pmaddwd forms with memory operands presumably sit on the
; no-cache side — TODO confirm against the full file.
57 %if ARCH_X86_64 && mmsize > 8
60 pmaddwd m2, [filteryq+32]
69 %if ARCH_X86_64 && mmsize > 8
73 pmaddwd m4, [filteryq+64]
74 pmaddwd m3, [filteryq+96]
; 12-bit entry: identical argument layout, shares the 10-bit body via a
; mangled-symbol tail jump (no call/ret overhead).
106 cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
108 jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
;------------------------------------------------------------------------------
; filter_h_fn %1[, %2=12]
; Wider variant of filter_h4_fn: block width is mmsize/2 pixels (8 for
; XMM, 16 for YMM — 16-bit samples, so half a vector of pixels per row).
; Generates _10 and _12 entry points for variant %1; the _12 entry
; tail-jumps into the _10 body.
; NOTE(review): this excerpt elides many lines of the macro body; the
; comments below cover only the instructions that are visible here.
;------------------------------------------------------------------------------
115 %macro filter_h_fn 1-2 12
; %%px = output width in pixels, derived from the active vector size
116 %assign %%px mmsize/2
117 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
; x86-64 pre-SSE4 path (contents elided in this excerpt)
120 %if notcpuflag(sse4) && ARCH_X86_64
; Cache the four 8-tap coefficient pairs in m7-m10 when registers allow;
; otherwise only m7 is cached and coefficients are re-read from memory.
124 mova m7, [filteryq+ 0]
125 %if ARCH_X86_64 && mmsize > 8
126 mova m8, [filteryq+32]
127 mova m9, [filteryq+64]
128 mova m10, [filteryq+96]
; Coefficient application (surrounding lines elided): per-branch choice of
; cached registers vs memory operands, as in filter_h4_fn — the visible
; memory-operand pmaddwd lines presumably belong to the no-cache side;
; TODO confirm against the full file.
138 %if ARCH_X86_64 && mmsize > 8
143 pmaddwd m2, [filteryq+32]
144 pmaddwd m3, [filteryq+32]
145 pmaddwd m4, [filteryq+64]
154 %if ARCH_X86_64 && mmsize > 8
159 pmaddwd m2, [filteryq+64]
160 pmaddwd m3, [filteryq+96]
161 pmaddwd m4, [filteryq+96]
; 12-bit entry shares the 10-bit body via a mangled-symbol tail jump
196 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
198 jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
204 %if HAVE_AVX2_EXTERNAL
;------------------------------------------------------------------------------
; filter_v4_fn %1[, %2=12]
; Emits the 4-pixel-wide vertical 8-tap subpel filter for variant %1.
; Two cglobal prototypes appear for the same symbol: 6-arg/8-reg and
; 4-arg/7-reg — presumably selected by an elided %if ARCH_X86_64 (the
; 32-bit build keeps h on the stack); TODO confirm against the full file.
; src4/sstride3 are derived pointers for addressing rows 4-7 of the tap
; window without extra multiplies.
; NOTE(review): this excerpt elides many lines of the macro body; the
; comments below cover only the instructions that are visible here.
;------------------------------------------------------------------------------
210 %macro filter_v4_fn 1-2 12
212 cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
214 cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
; x86-64 pre-SSE4 path (contents elided in this excerpt)
220 %if notcpuflag(sse4) && ARCH_X86_64
; sstride3q = 3*stride (lea avoids a multiply and preserves flags);
; src4q starts one row below srcq so rows 4-7 are reachable as
; src4q + {sstrideq, 2*sstrideq, sstride3q} offsets
224 lea sstride3q, [sstrideq*3]
225 lea src4q, [srcq+sstrideq]
; Cache coefficient pairs in m7-m10 when registers allow (see filter_h4_fn)
227 mova m7, [filteryq+ 0]
228 %if ARCH_X86_64 && mmsize > 8
229 mova m8, [filteryq+ 32]
230 mova m9, [filteryq+ 64]
231 mova m10, [filteryq+ 96]
234 ; FIXME maybe reuse loads from previous rows, or just
235 ; more generally unroll this to prevent multiple loads of
; Load the first four source rows of the 8-row tap window (movh: 4 16-bit
; pixels = 8 bytes per row)
238 movh m1, [srcq+sstrideq]
239 movh m2, [srcq+sstrideq*2]
240 movh m3, [srcq+sstride3q]
; Multiply-accumulate rows against taps; branches pick cached regs vs
; memory operands (intervening lines elided)
246 %if ARCH_X86_64 && mmsize > 8
249 pmaddwd m2, [filteryq+ 32]
; Load the lower four rows of the window via src4q
251 movh m1, [src4q+sstrideq]
252 movh m3, [src4q+sstrideq*2]
254 movh m2, [src4q+sstride3q]
258 %if ARCH_X86_64 && mmsize > 8
262 pmaddwd m4, [filteryq+ 64]
263 pmaddwd m3, [filteryq+ 96]
; 12-bit entries (same 64/32-bit prototype pair) share the 10-bit body
; via a mangled-symbol tail jump
296 cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
298 cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
302 jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
;------------------------------------------------------------------------------
; filter_v_fn %1[, %2=13]
; Wider variant of filter_v4_fn: block width is mmsize/2 pixels (8 for
; XMM, 16 for YMM — 16-bit samples). Unaligned full-vector loads (movu)
; replace movh, and SBUTTERFLY interleaves adjacent rows word-wise so one
; pmaddwd processes two taps per lane. Two cglobal prototypes appear for
; each symbol (6-arg/8-reg and 4-arg/7-reg) — presumably split by an
; elided %if ARCH_X86_64; TODO confirm against the full file.
; NOTE(review): this excerpt elides many lines of the macro body; the
; comments below cover only the instructions that are visible here.
;------------------------------------------------------------------------------
309 %macro filter_v_fn 1-2 13
; %%px = output width in pixels, derived from the active vector size
310 %assign %%px mmsize/2
312 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
314 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
; x86-64 pre-SSE4 path (contents elided in this excerpt)
320 %if notcpuflag(sse4) && ARCH_X86_64
; sstride3q = 3*stride; src4q = one row below srcq, for rows 4-7 of the
; 8-row tap window (same scheme as filter_v4_fn)
326 lea sstride3q, [sstrideq*3]
327 lea src4q, [srcq+sstrideq]
; Cache coefficient pairs in m7-m10 when registers allow
329 mova m7, [filteryq+ 0]
330 %if ARCH_X86_64 && mmsize > 8
331 mova m8, [filteryq+ 32]
332 mova m9, [filteryq+ 64]
333 mova m10, [filteryq+ 96]
336 ; FIXME maybe reuse loads from previous rows, or just
337 ; more generally unroll this to prevent multiple loads of
; Load rows 1-3 of the window (row 0 load elided); movu because source
; rows are not guaranteed vector-aligned
340 movu m1, [srcq+sstrideq]
341 movu m2, [srcq+sstrideq*2]
342 movu m3, [srcq+sstride3q]
; Interleave row pairs word-wise (lo/hi halves) so pmaddwd sums two taps
; per 32-bit lane; m6 is the scratch register
345 SBUTTERFLY wd, 0, 1, 6
346 SBUTTERFLY wd, 2, 3, 6
; Tap application: branches pick cached coefficient regs vs memory
; operands (intervening lines elided)
349 %if ARCH_X86_64 && mmsize > 8
353 pmaddwd m2, [filteryq+ 32]
354 pmaddwd m3, [filteryq+ 32]
; Rows 5-6 via src4q, interleaved and multiplied against taps 4-5
358 movu m2, [src4q+sstrideq]
359 movu m3, [src4q+sstrideq*2]
360 SBUTTERFLY wd, 4, 2, 6
361 %if ARCH_X86_64 && mmsize > 8
365 pmaddwd m4, [filteryq+ 64]
366 pmaddwd m2, [filteryq+ 64]
; Last row of the window, then taps 6-7
370 movu m4, [src4q+sstride3q]
372 SBUTTERFLY wd, 3, 4, 6
373 %if ARCH_X86_64 && mmsize > 8
377 pmaddwd m3, [filteryq+ 96]
378 pmaddwd m4, [filteryq+ 96]
; 12-bit entries share the 10-bit body via a mangled-symbol tail jump
415 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
417 cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
421 jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
427 %if HAVE_AVX2_EXTERNAL