;******************************************************************************
;* VC1 motion compensation optimizations
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_9
cextern pw_128

section .text

%if HAVE_MMX_INLINE

; XXX some of these macros are not used right now, but they will be in the
;     future when more functions are ported.

%macro OP_PUT 2 ; dst, src
%endmacro

%macro OP_AVG 2 ; dst, src
    pavgb       %1, %2
%endmacro

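; The store helpers below take one of the two ops above as %1: with OP_PUT the
; filtered result is written out unchanged, while OP_AVG first averages it
; (pavgb, mmxext) against the pixels already present at the destination.
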
%macro NORMALIZE_MMX 1 ; shift
    paddw       m3, m7 ; +bias-r
    paddw       m4, m7 ; +bias-r
    psraw       m3, %1
    psraw       m4, %1
%endmacro

%macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb    m3, m4
    %1          m3, [%2]
    mova      [%2], m3
%endmacro

%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1          m3, [%2]
    %1          m4, [%2 + mmsize]
    mova          [%2], m3
    mova [mmsize + %2], m4
%endmacro

; see MSPEL_FILTER13_CORE for use as UNPACK macro
%macro DO_UNPACK 1 ; reg
    punpcklbw   %1, m0
%endmacro

%macro DONT_UNPACK 1 ; reg
%endmacro

; Compute the rounder 32-r or 8-r and unpack it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
    movd        m7, %1
    punpcklwd   m7, m7
    punpckldq   m7, m7
%endmacro

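; For reference: the bicubic taps (-1, 9, 9, -1) sum to 16, so a single
; filtering pass normalizes with >> 4 and a rounder of 8 - r, i.e.
; dst = (sum + 8 - r) >> 4; two chained passes normalize with >> 6 and use
; 32 - r instead. The rounder word is broadcast to all four lanes of m7.
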
%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw          m%3, m%4
    movh           m%2, [srcq + stride_neg2]
    pmullw         m%3, m6
    punpcklbw      m%2, m0
    movh           m%5, [srcq + strideq]
    psubw          m%3, m%2
    punpcklbw      m%5, m0
    paddw          m%3, m7
    psubw          m%3, m%5
    psraw          m%3, shift
    movq  [dstq + %1], m%3
    add           srcq, strideq
%endmacro

INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Sacrificing m6 makes it possible to pipeline loads from src
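; As a rough C-style sketch of one output row (illustrative names only):
;     dst[x] = (-src[x - stride] + 9 * src[x] + 9 * src[x + stride]
;               - src[x + 2 * stride] + rounder) >> shift
; with the result kept as int16_t so the horizontal pass can consume it
; without re-unpacking.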
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP     3, 4, 5
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP     4, 5, 6
    ; We need shift either in memory or in an mm reg, as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    movq shift, r4q
%endif ; WIN64
%endif ; ARCH_X86_32
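    ; (psraw accepts only an immediate or an mm/m64 shift count, which is why
    ; a variable shift has to live in memory or in an mm register here.)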
    %define stride_neg2 t0q
    %define stride_9minus4 t1q
    mov stride_neg2, strideq
    neg stride_neg2
    add stride_neg2, stride_neg2
    lea stride_9minus4, [strideq * 9 - 4]
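    ; A note on the two derived strides: stride_neg2 = -2 * stride lets
    ; [srcq + stride_neg2] reach the row two above the one srcq points at,
    ; and stride_9minus4 = 9 * stride - 4 presumably rewinds src past the
    ; nine rows consumed below while stepping 4 bytes right for the next
    ; column group. Each SHIFT2_LINE below advances the dst offset by
    ; 24 bytes, one row of the int16_t intermediate buffer.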
    SHIFT2_LINE   0, 1, 2, 3, 4
    SHIFT2_LINE  24, 2, 3, 4, 1
    SHIFT2_LINE  48, 3, 4, 1, 2
    SHIFT2_LINE  72, 4, 1, 2, 3
    SHIFT2_LINE  96, 1, 2, 3, 4
    SHIFT2_LINE 120, 2, 3, 4, 1
    SHIFT2_LINE 144, 3, 4, 1, 2
    SHIFT2_LINE 168, 4, 1, 2, 3
    sub srcq, stride_9minus4
%undef stride_neg2
%undef stride_9minus4

; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
;                                  const int16_t *src, int rnd);
; Data is already unpacked, so some operations can directly be made from
; memory.
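; Per output pixel this computes, roughly (C-style sketch, illustrative
; names):
;     dst[x] = clip_uint8((9 * (s[1] + s[2]) - (s[0] + s[3]) + rounder) >> 7)
; where s[] are four neighbouring int16_t values from the vertical pass.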
%macro HOR_16B_SHIFT2 2 ; op, opname
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    mov   hq, 8
    sub srcq, 2
    sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    LOAD_ROUNDER_MMX rndq
    mova  m5, [pw_9]
    mova  m6, [pw_128]
    pxor  m0, m0

.loop:
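    ; On the constant above: (-1+9+9-1) * 1024 = 16 * 1024 = 16384, i.e. each
    ; of the four taps appears to see inputs carrying a +1024 bias from the
    ; vertical pass, which the subtraction folds into the rounder; the paddw
    ; with pw_128 near the end of the loop ("remove bias") then restores the
    ; pixel range before packing.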
    mova   m1, [srcq + 2 * 0]
    mova   m2, [srcq + 2 * 0 + mmsize]
    mova   m3, [srcq + 2 * 1]
    mova   m4, [srcq + 2 * 1 + mmsize]
    paddw  m3, [srcq + 2 * 2]
    paddw  m4, [srcq + 2 * 2 + mmsize]
    paddw  m1, [srcq + 2 * 3]
    paddw  m2, [srcq + 2 * 3 + mmsize]
    pmullw m3, m5 ; 9 * (s1 + s2)
    pmullw m4, m5
    psubw  m3, m1 ; - (s0 + s3)
    psubw  m4, m2
    NORMALIZE_MMX 7
    ; remove bias
    paddw  m3, m6
    paddw  m4, m6
    TRANSFER_DO_PACK %1, dstq
    add srcq, 24
    add dstq, strideq
    dec   hq
    jnz .loop
    RET
%endmacro

INIT_MMX mmx
HOR_16B_SHIFT2 OP_PUT, put

INIT_MMX mmxext
HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE

%macro INV_TRANS_INIT 0
    movsxdifnidn linesizeq, linesized
    movd     m0, blockd
    SPLATW   m0, m0
    pxor     m1, m1
    psubw    m1, m0
    packuswb m0, m0
    packuswb m1, m1

    DEFINE_ARGS dest, linesize, linesize3
    lea linesize3q, [linesizeq*3]
%endmacro

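; The idea: m0 holds the DC value saturated to bytes and m1 its negation, so
; the "paddusb m0" then "psubusb m1" pair below adds a signed DC to unsigned
; pixels with clamping; one of the two operands is all zeroes depending on
; the sign of DC.
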
%macro INV_TRANS_PROCESS 1
    mov%1   m2, [destq+linesizeq*0]
    mov%1   m3, [destq+linesizeq*1]
    mov%1   m4, [destq+linesizeq*2]
    mov%1   m5, [destq+linesize3q]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    mov%1 [linesizeq*0+destq], m2
    mov%1 [linesizeq*1+destq], m3
    mov%1 [linesizeq*2+destq], m4
    mov%1 [linesize3q +destq], m5
%endmacro

; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
INIT_MMX mmxext
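; The DC-only transforms scale the single coefficient through both passes and
; splat the result. For the 4x4 case below that is dc = (17 * dc + 4) >> 3
; followed by dc = (17 * dc + 64) >> 7; e.g. dc = 64 gives (1088+4)>>3 = 136,
; then (2312+64)>>7 = 18, which is added to all 16 pixels with clamping.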
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    movsx    r3d, WORD [blockq]
    mov   blockd, r3d             ; dc
    shl   blockd, 4               ; 16 * dc
    lea   blockd, [blockq+r3+4]   ; 17 * dc + 4
    sar   blockd, 3               ; >> 3
    mov      r3d, blockd          ; dc
    shl   blockd, 4               ; 16 * dc
    lea   blockd, [blockq+r3+64]  ; 17 * dc + 64
    sar   blockd, 7               ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS h
    RET

cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    movsx    r3d, WORD [blockq]
    mov   blockd, r3d             ; dc
    shl   blockd, 4               ; 16 * dc
    lea   blockd, [blockq+r3+4]   ; 17 * dc + 4
    sar   blockd, 3               ; >> 3
    shl   blockd, 2               ; 4 * dc
    lea   blockd, [blockq*3+64]   ; 12 * dc + 64
    sar   blockd, 7               ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS h
    lea    destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET

cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    movsx blockd, WORD [blockq]   ; dc
    lea   blockd, [blockq*3+1]    ; 3 * dc + 1
    sar   blockd, 1               ; >> 1
    mov      r3d, blockd          ; dc
    shl   blockd, 4               ; 16 * dc
    lea   blockd, [blockq+r3+64]  ; 17 * dc + 64
    sar   blockd, 7               ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS a
    RET

cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    movsx blockd, WORD [blockq]   ; dc
    lea   blockd, [blockq*3+1]    ; 3 * dc + 1
    sar   blockd, 1               ; >> 1
    lea   blockd, [blockq*3+16]   ; 3 * dc + 16
    sar   blockd, 5               ; >> 5
    INV_TRANS_INIT
    INV_TRANS_PROCESS a
    lea destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS a
    RET