1 ;******************************************************************************
;* MMX/SSE2/SSE4-optimized functions for the RV30 and RV40 decoders
3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
; Coefficient tables for the RV30/40 4x4 integer IDCT (13/17/7 scheme).
; NOTE(review): lines between these constants are elided in this excerpt;
; pw_row_coeffs very likely continues with the 17 and 7 rows that the row
; pass below reads at +8 and +16 — confirm against the full file.
pw_row_coeffs: times 4 dw 13
; Rounder folded in before the final shift of the column pass
; (loaded into mm5 by ROW_TRANSFORM below); 0x200 = 512.
pd_512: times 2 dd 0x200
; pmaddwd pairs for the even columns of the column pass:
; 13*c0 + 13*c2 and 13*c0 - 13*c2 in one instruction.
pw_col_coeffs: dw 13, 13, 13, -13
; DC-only IDCT helpers. Both macro bodies are elided in this excerpt;
; judging by the names they presumably differ only in whether the DC value
; is rounded before being replicated — TODO confirm against the full file.
%macro IDCT_DC_NOROUND 1
; (body elided in this excerpt)
%macro IDCT_DC_ROUND 1
; (body elided in this excerpt)
; Templated entry point (name completed by %1): 1 argument, 2 GPRs,
; no XMM registers (cglobal signature from x86inc/x86util).
cglobal rv34_idct_%1, 1, 2, 0
; Variant selectors: bind IDCT_DC to the rounding or non-rounding helper
; before each expansion of the template above (the surrounding conditional
; / INIT lines are elided here).
%define IDCT_DC IDCT_DC_ROUND
%define IDCT_DC IDCT_DC_NOROUND
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
; Adds a DC-only inverse-transform result to a 4x4 block at dst.
; 3 arguments, 3 GPRs, MMX registers (body elided in this excerpt).
cglobal rv34_idct_dc_add, 3, 3
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
;
; Row pass of the RV30/40 4x4 IDCT. From the visible instructions:
;   z0 = 13*(b0 + b2)        z1 = 13*(b0 - b2)
;   z3 = 17*b1 +  7*b3       z2 =  7*b1 - 17*b3
; and the four row outputs are z0+z3, z1+z2, z1-z2, z0-z3.
; NOTE(review): the initial loads of mm0-mm3/mm5/mm7 and some register
; shuffles are elided in this excerpt; comments on psubsw mm7/mm6 below
; assume copies of z0/z1 were made on the elided lines — confirm.
%macro ROW_TRANSFORM 1
mova mm6, [pw_row_coeffs+ 0]
paddsw mm0, mm2 ; b0 + b2
psubsw mm4, mm2 ; b0 - b2
pmullw mm0, mm6 ; *13 = z0
pmullw mm4, mm6 ; *13 = z1
pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
pmullw mm5, [pw_row_coeffs+16] ; b1* 7
pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
pmullw mm7, [pw_row_coeffs+16] ; b3* 7
paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
; Butterfly: combine the even (z0/z1) and odd (z3/z2) halves.
paddsw mm0, mm1 ; z0 + z3
psubsw mm7, mm1 ; z0 - z3
paddsw mm4, mm5 ; z1 + z2
psubsw mm6, mm5 ; z1 - z2
; Keep the column-pass rounder resident in mm5 for the caller.
mova mm5, [pd_512] ; 0x200
; ff_rv34_idct_add_mmx2(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
;
; Column pass of the 4x4 IDCT for one register of row-pass output.
; %1 = dst memory operand, %2 = MMX reg holding two rows' coefficients,
; %3 = 13/13,13/-13 pmaddwd pair, %4 = 17/7,7/-17 pmaddwd pair.
; De-interleaves even/odd columns with pshufw so a single pmaddwd per
; half produces both z sums. The macro tail (rounding with mm5, shift,
; pack and add to %1) is elided in this excerpt.
%macro COL_TRANSFORM 4
pshufw mm3, %2, 0xDD ; col. 1,3,1,3
pshufw %2, %2, 0x88 ; col. 0,2,0,2
pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
; Swap lane order so adds/subs below yield all four combinations.
pshufw mm1, %2, 01001110b ; z1 | z0
pshufw mm2, mm3, 01001110b ; z2 | z3
paddd %2, mm3 ; z0+z3 | z1+z2
psubd mm1, mm2 ; z1-z2 | z0-z3
; Full 4x4 IDCT + add: d = dst, s = stride, b = block (named cglobal args).
; The row pass (ROW_TRANSFORM) and the dst pointer advance between row
; pairs are elided in this excerpt.
cglobal rv34_idct_add, 3,3,0, d, s, b
; First row pair: coefficient tables read straight from memory, then
; hoisted into mm0/mm4 so the remaining calls avoid repeated loads.
COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
mova mm0, [pw_col_coeffs+ 0]
COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
mova mm4, [pw_col_coeffs+ 8]
; Second row pair (dq presumably advanced by 2*stride on an elided line).
COL_TRANSFORM [dq], mm6, mm0, mm4
COL_TRANSFORM [dq+sq], mm7, mm0, mm4
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; SSE4.1 DC-only variant: 3 arguments, 3 GPRs, 6 XMM registers.
; Body largely elided in this excerpt; the visible tail stores one 4-pixel
; row per pextrd from the packed result in m2. NOTE(review): r2 (the dc
; argument) appears repurposed as a second row pointer here — confirm.
cglobal rv34_idct_dc_add, 3, 3, 6
pextrd [r0+r1], m2, 1
pextrd [r2+r1], m2, 3