1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the VP6 decoder
3 ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
4 ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86inc.asm"
24 %include "libavutil/x86/x86util.asm"
; NOTE(review): the enclosing %macro header is not visible in this excerpt;
; these lines are presumably part of the DIAG4_MMX body -- confirm.
;
; Four-tap weighting with the weights held in memory: each tap is multiplied
; (pmullw, low 16 bits of each word product) by a weight read from the stack
; slots [rsp+8*11] .. [rsp+8*14]. Every weight is applied to a pair of
; registers (m0/m3, m1/m4, m2/m5) -- presumably the two halves of one
; 8-pixel row; confirm against the unpack code that is not shown.
;
; The tap labels in the comments below (x-8, x, x+8, x+16) match the DIAG4
; invocation that passes offsets -8, 0, 8, 16. The other invocation passes
; -1, 0, 1, 2, for which the same lines weight neighbouring bytes instead.
39 pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
40 pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]
41 pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
42 pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]
; m1/m4 are multiplied again below with a different weight, so they are
; presumably accumulated and reloaded with the next tap in lines not shown
; here -- confirm.
53 pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
54 pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]
55 pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
56 pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]
; m6 carries the constant 64 in this variant; it is added with signed
; saturation to both accumulators (presumably a rounding bias applied
; before a shift that is not visible here).
61 paddsw m0, m6 ; Add 64
62 paddsw m3, m6 ; Add 64
; NOTE(review): the enclosing %macro header is not visible in this excerpt;
; these lines are presumably part of the DIAG4_SSE2 body -- confirm.
;
; Same four-tap weighting as the stack-based variant, but here the four
; weights are held in registers (m4, m5, m6, m3 for taps 0..3) and only one
; register per tap is multiplied.
74 pmullw m0, m4 ; src[x-8 ] * biweight [0]
75 pmullw m1, m5 ; src[x ] * biweight [1]
; m1 is multiplied again below with a different weight, so it is presumably
; accumulated and reloaded with the next tap in lines not shown -- confirm.
81 pmullw m1, m6 ; src[x+8 ] * biweight [2]
82 pmullw m2, m3 ; src[x+16] * biweight [3]
; m6 holds a weight in this variant, so the constant 64 is read from the
; pw_64 memory operand instead of a register.
85 paddsw m0, [pw_64] ; Add 64
91 %macro SPLAT4REGS_MMX 0
107 %macro SPLAT4REGS_SSE2 0
; vp6_filter_diag4 <opt>, <n>
;   %1 = suffix appended to the exported symbol name (vp6_filter_diag4_%1)
;   %2 = forwarded as the third numeric argument of cglobal (by x86inc
;        convention the number of XMM registers used -- confirm)
; The DIAG4 macro name must be %define'd before this macro is expanded;
; the instantiations in this file also bind SPLAT4REGS, whose use is in
; lines not shown here.
; NOTE(review): only part of the body is visible in this excerpt; the loops
; around the two DIAG4 invocations and the %endmacro are not shown.
118 %macro vp6_filter_diag4 2
119 ; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
120 ; const int16_t h_weight[4], const int16_t v_weights[4])
121 cglobal vp6_filter_diag4_%1, 5, 7, %2
; The original rsp is kept in r5 for the whole function, so r5 must not be
; overwritten until the restore at the end.
122 mov r5, rsp ; backup stack pointer
; Clearing the low bits rounds rsp down to a multiple of mmsize.
123 and rsp, ~(mmsize-1) ; align stack
; First pass: taps at byte offsets -1, 0, 1, 2 relative to r1, i.e.
; horizontally adjacent bytes. Presumably the first DIAG4 argument is the
; source pointer and the last is the destination -- confirm against the
; macro definition.
143 DIAG4 r1, -1, 0, 1, 2, r3
; Second pass: taps at byte offsets -8, 0, 8, 16 relative to r3, with r0 as
; the last argument. Presumably this walks 8-byte rows of the first pass's
; output and writes to dst -- confirm.
155 DIAG4 r3, -8, 0, 8, 16, r0
; Undo the alignment by restoring the stack pointer saved at entry.
161 mov rsp, r5 ; restore stack pointer
; MMX instantiation: bind the generic DIAG4 / SPLAT4REGS names to their MMX
; variants, then expand the function template with suffix "mmx" and 0 as its
; second parameter (forwarded to cglobal).
166 %define DIAG4 DIAG4_MMX
167 %define SPLAT4REGS SPLAT4REGS_MMX
168 vp6_filter_diag4 mmx, 0
; SSE2 instantiation: rebind DIAG4 / SPLAT4REGS to their SSE2 variants, then
; expand the function template with suffix "sse2" and 8 as its second
; parameter (forwarded to cglobal).
171 %define DIAG4 DIAG4_SSE2
172 %define SPLAT4REGS SPLAT4REGS_SSE2
173 vp6_filter_diag4 sse2, 8