1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV40 decoder
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
7 ;* This file is part of Libav.
9 ;* Libav is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* Libav is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with Libav; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
29 pw_1024: times 8 dw 1 << (16 - 6) ; eight words of 1024 (= 1 << 10)
31 sixtap_filter_hb_m: times 8 db 1, -5 ; byte coefficient table: the pair (1, -5) repeated 8 times = 16 bytes
33 ; multiplied by 2 to have the same shift
40 sixtap_filter_v_m: times 8 dw 1 ; word coefficient table: eight words of 1 = 16 bytes
44 ; multiplied by 2 to have the same shift
; Two alternative definitions of the same three aliases follow. As shown back
; to back the second set would redefine the first, so they are presumably
; selected by a conditional that is not visible in this view - TODO confirm.
; First set: table accesses go through the picreg register.
56 %define sixtap_filter_hw picregq
57 %define sixtap_filter_hb picregq
58 %define sixtap_filter_v picregq
; Second set: table accesses use the table symbols directly.
61 %define sixtap_filter_hw sixtap_filter_hw_m
62 %define sixtap_filter_hb sixtap_filter_hb_m
63 %define sixtap_filter_v sixtap_filter_v_m
; Byte-index tables loaded into vector registers by the SSSE3 horizontal
; filter. For i = 0..7 each table holds the index pair listed below; note that
; shuf3 stores the higher index first.
67 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 ; pairs (i, i+1)
68 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 ; pairs (i+2, i+3)
69 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 ; pairs (i+5, i+4)
77 ;-----------------------------------------------------------------------------
78 ; subpel MC functions:
80 ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
81 ; uint8_t *src, int srcstride,
83 ;----------------------------------------------------------------------
; Vertical qpel filter, word-coefficient variant. %1 is a macro argument that
; prefixes the function name. cglobal (x86inc) declares 6 arguments,
; 6+npicregs general-purpose registers and 12 vector registers, and names the
; argument registers dst, dststride, src, srcstride, height, my (+ picreg).
; NOTE(review): only part of the body is visible in this view.
111 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
113 lea picregq, [sixtap_filter_v_m] ; picreg = address of the word coefficient table
116 LOAD my, sixtap_filter_v ; LOAD is defined elsewhere; presumably turns my into a pointer into the table, since [myq+N] is dereferenced below
122 movh m1, [srcq+srcstrideq] ; row at src + 1*stride
123 movh m2, [srcq+srcstrideq*2] ; row at src + 2*stride
124 lea srcq, [srcq+srcstrideq*2] ; advance src by two rows
127 movh m4, [srcq+srcstrideq] ; row at (advanced) src + 1*stride
; Four coefficient vectors addressed through myq, 16 bytes apart.
; NOTE(review): the names suggest taps 0/5 and 1/4 share a vector - confirm.
144 %define COEFF05 [myq+ 0]
145 %define COEFF14 [myq+16]
146 %define COEFF2 [myq+32]
147 %define COEFF3 [myq+48]
151 movh m5, [srcq+2*srcstrideq] ; read new row
176 dec heightd ; next row
; Horizontal qpel filter, word-coefficient variant. %1 is a macro argument that
; prefixes the function name. cglobal (x86inc) declares 6 arguments,
; 6+npicregs general-purpose registers and 12 vector registers, and names the
; argument registers dst, dststride, src, srcstride, height, mx (+ picreg).
; NOTE(review): only part of the body is visible in this view.
182 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
; NOTE(review): the horizontal filter addresses the sixtap_filter_v table even
; though a sixtap_filter_hw alias is also defined in this file - confirm that
; this is intended.
184 lea picregq, [sixtap_filter_v_m] ; picreg = address of the word coefficient table
187 LOAD mx, sixtap_filter_v ; LOAD is defined elsewhere; presumably turns mx into a pointer into the table, since [mxq+N] is dereferenced below
; Four coefficient vectors addressed through mxq, 16 bytes apart.
199 %define COEFF05 [mxq+ 0]
200 %define COEFF14 [mxq+16]
201 %define COEFF2 [mxq+32]
202 %define COEFF3 [mxq+48]
233 dec heightd ; next row
; FILTER_SSSE3 takes one argument (%1), used as the function-name prefix.
; NOTE(review): only part of the macro body is visible in this view.
258 %macro FILTER_SSSE3 1
; Vertical qpel filter, byte-coefficient variant: 6 arguments, 6+npicregs
; general-purpose registers, 8 vector registers.
259 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
261 lea picregq, [sixtap_filter_hb_m] ; picreg = address of the byte coefficient table
266 LOAD my, sixtap_filter_hb ; LOAD is defined elsewhere; presumably turns my into a pointer into the table, since [myq+16] is dereferenced below
269 movh m1, [srcq+srcstrideq] ; row at src + 1*stride
270 movh m2, [srcq+srcstrideq*2] ; row at src + 2*stride
271 lea srcq, [srcq+srcstrideq*2] ; advance src by two rows
275 movh m4, [srcq+srcstrideq] ; row at (advanced) src + 1*stride
276 lea srcq, [srcq+2*srcstrideq] ; advance src by two more rows
283 pmaddubsw m6, [myq+16] ; unsigned bytes of m6 times signed coefficient bytes, adjacent products summed into words
284 movh m7, [srcq] ; read new row
294 pmulhrsw m6, [pw_512] ; rounded high multiply; equals (x + 32) >> 6 assuming pw_512 (defined elsewhere) holds words of 512
300 dec heightd ; next row
; Horizontal qpel filter, byte-coefficient variant: 6 arguments, 6+npicregs
; general-purpose registers, 8 vector registers.
; NOTE(review): only part of the body is visible in this view.
304 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
306 lea picregq, [sixtap_filter_hb_m] ; picreg = address of the byte coefficient table
308 mova m3, [filter_h6_shuf2] ; m3 = byte-index pairs (i+2, i+3)
309 mova m4, [filter_h6_shuf3] ; m4 = byte-index pairs (i+5, i+4)
310 LOAD mx, sixtap_filter_hb ; LOAD is defined elsewhere; presumably turns mx into a pointer into the table, since [mxq] is read next
311 mova m5, [mxq] ; set up 6tap filter in bytes
313 mova m7, [filter_h6_shuf1] ; m7 = byte-index pairs (i, i+1)
327 pmulhrsw m0, [pw_512] ; rounded high multiply; equals (x + 32) >> 6 assuming pw_512 (defined elsewhere) holds words of 512
333 dec heightd ; next row
342 ; %1=5-bit weights?, %2=dst, %3=src1, %4=src2, %5=stride (optional; only passed for 8x8 blocks with sse2)
; The macro accepts 4 or 5 arguments. r6 is the running byte offset applied to
; both source pointers.
; NOTE(review): only part of the macro body is visible in this view.
343 %macro RV40_WCORE 4-5
344 movh m4, [%3 + r6 + 0] ; first half-register load from src1
345 movh m5, [%4 + r6 + 0] ; first half-register load from src2
; OFFSET addresses the second load, half a vector register (mmsize / 2 bytes)
; past the first.
347 %define OFFSET r6 + mmsize / 2
349 ; 8x8 block and sse2, stride was provided
353 movh m6, [%3 + OFFSET] ; second load from src1
354 movh m7, [%4 + OFFSET] ; second load from src2
398 ; bias and shift down
411 ; Only called for 8x8 blocks and sse2
; Call sites of RV40_WCORE. The enclosing macro header and the conditionals
; that choose between these forms are not visible in this view. Each call
; forwards this macro's %2 as the RV40_WCORE weight flag and passes r0, r1, r2
; as dst, src1, src2.
424 RV40_WCORE %2, r0, r1, r2
426 RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8 ; same operation 8 bytes further along the row
429 ; Prepare for next loop
433 RV40_WCORE %2, r0, r1, r2, r5 ; five-argument form: r5 is passed as the stride
434 ; Prepare 2 next lines
437 RV40_WCORE %2, r0, r1, r2
438 ; Prepare single next line
445 ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
446 ; %1=rounding variant (rnd/nornd) %2=size %3=log2 of the size (3 for 8, 4 for 16)
447 ; The weights are FP0.14 notation of fractions depending on pts.
448 ; For timebases without rounding error (e.g. PAL), the fractions
449 ; can be simplified, and several operations can be avoided.
450 ; Therefore, we check here whether they are multiples of 2^9 for
451 ; those simplifications to occur.
453 cglobal rv40_weight_func_%1_%2, 6, 7, 8 ; 6 arguments, 7 general-purpose registers, 8 vector registers
460 ; Set loop counter and increments
; Instantiations of RV40_WEIGHT: (rnd | nornd) x (8 | 16), with a third
; argument of 3 for size 8 and 4 for size 16. The same four instantiations
; appear three times; each group is presumably preceded by a different
; instruction-set selection that is not visible in this view - TODO confirm.
490 RV40_WEIGHT rnd, 8, 3
491 RV40_WEIGHT rnd, 16, 4
492 RV40_WEIGHT nornd, 8, 3
493 RV40_WEIGHT nornd, 16, 4
496 RV40_WEIGHT rnd, 8, 3
497 RV40_WEIGHT rnd, 16, 4
498 RV40_WEIGHT nornd, 8, 3
499 RV40_WEIGHT nornd, 16, 4
502 RV40_WEIGHT rnd, 8, 3
503 RV40_WEIGHT rnd, 16, 4
504 RV40_WEIGHT nornd, 8, 3
505 RV40_WEIGHT nornd, 16, 4