1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV40 decoder
3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
; Eight 16-bit words, each equal to 1 << (16 - 6) = 1024.
; Loaded into m1 at the start of rv40_weight_func_%1_%2.
; NOTE(review): presumably the multiplier that turns a rounding high-half
; multiply into a rounding right shift; the consuming instruction is not
; visible in this chunk -- confirm.
28 shift_round: times 8 dw 1 << (16 - 6)
33 ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
; Load from both sources (%3 and %4) at offset r6 into m4 / m5.
; NOTE(review): movh is assumed to be the x86inc half-register load
; (mmsize / 2 bytes) -- confirm against x86inc.asm.
35 movh m4, [%3 + r6 + 0]
36 movh m5, [%4 + r6 + 0]
; OFFSET addresses the next load: half a vector register past r6.
; NOTE(review): the comment below suggests a stride-based alternative
; definition of OFFSET exists on lines not visible here -- confirm.
38 %define OFFSET r6 + mmsize / 2
40 ; 8x8 block and sse2, stride was provided
; Second load from both sources, at OFFSET, into m6 / m7.
44 movh m6, [%3 + OFFSET]
45 movh m7, [%4 + OFFSET]
102 ; Only called for 8x8 blocks and sse2
; NOTE(review): the RV40_WCORE invocations below presumably sit in separate
; conditional-assembly branches whose %if / %else lines are not visible in
; this chunk -- confirm before reading them as one sequence.
; Arguments are: weights flag (caller's %2), dst = r0, src1 = r1, src2 = r2.
115 RV40_WCORE %2, r0, r1, r2
; Same operation, 8 bytes further into dst and both sources.
117 RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
120 ; Prepare for next loop
; Five-argument form: r5 is forwarded as the optional stride operand (%5).
124 RV40_WCORE %2, r0, r1, r2, r5
125 ; Prepare 2 next lines
; Four-argument form: no stride operand is passed.
128 RV40_WCORE %2, r0, r1, r2
129 ; Prepare single next line
136 ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
137 ; %1=rounding mode (rnd/nornd) %2=block size (8/16) %3=log2 of block size
138 ; The weights are FP0.14 notation of fractions depending on pts.
139 ; For timebases without rounding error (e.g. PAL), the fractions
140 ; can be simplified, and several operations can be avoided.
141 ; Therefore, we check here whether they are multiples of 2^9 for
142 ; those simplifications to occur.
; Entry point; the symbol name is built from macro parameters %1 and %2.
; NOTE(review): under the x86inc cglobal convention the trailing numbers
; would mean 6 arguments, 7 general-purpose registers and 8 vector
; registers -- confirm against x86inc.asm.
144 cglobal rv40_weight_func_%1_%2, 6, 7, 8
; m1 = shift_round constant (eight words of 1 << (16 - 6)).
; NOTE(review): mova is presumably an aligned load, so shift_round must be
; suitably aligned; the alignment directive is not visible here -- confirm.
146 mova m1, [shift_round]
151 ; Set loop counter and increments
; Instantiate RV40_WEIGHT for each rounding mode (rnd, nornd) and each
; block size (8, 16); the third argument is log2 of the block size.
; NOTE(review): the three groups below are identical as seen here. Each is
; presumably preceded by a different instruction-set selection line that is
; not visible in this chunk; otherwise the generated symbols would clash --
; confirm.
181 RV40_WEIGHT rnd, 8, 3
182 RV40_WEIGHT rnd, 16, 4
183 RV40_WEIGHT nornd, 8, 3
184 RV40_WEIGHT nornd, 16, 4
187 RV40_WEIGHT rnd, 8, 3
188 RV40_WEIGHT rnd, 16, 4
189 RV40_WEIGHT nornd, 8, 3
190 RV40_WEIGHT nornd, 16, 4
193 RV40_WEIGHT rnd, 8, 3
194 RV40_WEIGHT rnd, 16, 4
195 RV40_WEIGHT nornd, 8, 3
196 RV40_WEIGHT nornd, 16, 4