1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Constant tables for the RV30/40 4x4 inverse transform.
; Row pass: packed-word multipliers.  The code below also reads
; [pw_row_coeffs+8] (17) and [pw_row_coeffs+16] (7), so continuation rows
; exist but are elided from this view -- TODO confirm against the full file.
25 pw_row_coeffs: times 4 dw 13
; 0x200 = 512: rounding bias loaded at the end of the row pass (added into
; the column accumulation before the final shift, presumably -- confirm the
; shift amount in the elided code).
28 pd_512: times 2 dd 0x200
; First quad of the column-pass pmaddwd coefficient table
; (13*c0+13*c2 | 13*c0-13*c2 pairing); COL_TRANSFORM also reads
; [pw_col_coeffs+8], so further quads follow outside this view.
29 pw_col_coeffs: dw 13, 13, 13, -13
; DC-only inverse transform helpers: %1 is the register/operand holding the
; DC coefficient.  NOROUND computes the scaled DC without the rounding bias;
; ROUND includes it.  Macro bodies are elided from this view -- TODO confirm.
36 %macro IDCT_DC_NOROUND 1
41 %macro IDCT_DC_ROUND 1
; rv34_idct_<variant>: x86inc cglobal with 1 arg, 2 GPRs, 0 XMM regs.
; Body elided from this view.
48 cglobal rv34_idct_%1, 1, 2, 0
; Instantiate the DC transform twice: once with rounding, once without
; (the surrounding %macro/INIT context is elided from this view).
61 %define IDCT_DC IDCT_DC_ROUND
63 %define IDCT_DC IDCT_DC_NOROUND
66 ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
; Adds the DC-only transform result to dst; 3 args, 3 GPRs.  Body elided.
69 cglobal rv34_idct_dc_add, 3, 3
103 ; Load coeffs and perform row transform
104 ; Output: coeffs in mm[0467], rounder in mm5
; NOTE(review): the loads that place the input rows in mm0-mm3 (and the
; copies into mm4/mm5) are elided from this view -- on entry to the visible
; code, mm0/mm4 = b0, mm2 = b2, mm1/mm5 = b1, mm3/mm7 = b3, presumably.
105 %macro ROW_TRANSFORM 1
; Even-coefficient butterfly: z0 = 13*(b0+b2), z1 = 13*(b0-b2).
116 mova mm6, [pw_row_coeffs+ 0]
117 paddsw mm0, mm2 ; b0 + b2
118 psubsw mm4, mm2 ; b0 - b2
119 pmullw mm0, mm6 ; *13 = z0
120 pmullw mm4, mm6 ; *13 = z1
; Odd-coefficient products: 17 and 7 come from the coefficient rows at
; +8 and +16 (defined outside this view).
122 pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
123 pmullw mm5, [pw_row_coeffs+16] ; b1* 7
125 pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
126 pmullw mm7, [pw_row_coeffs+16] ; b3* 7
127 paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
128 psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
; Final butterfly producing the four row outputs in mm0/mm7/mm4/mm6.
; NOTE(review): mm7 and mm6 must hold copies of z0 and z1 here, so the
; moves that duplicate them are among the elided lines -- TODO confirm.
131 paddsw mm0, mm1 ; z0 + z3
132 psubsw mm7, mm1 ; z0 - z3
133 paddsw mm4, mm5 ; z1 + z2
134 psubsw mm6, mm5 ; z1 - z2
; Leave the dword rounder (512) in mm5 for the column pass.
135 mova mm5, [pd_512] ; 0x200
138 ; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
; Column transform for one pair of output rows.
;   %1 = dst memory operand (used by the elided rounding/pack/store tail),
;   %2 = mmx reg holding one transposed column pair (words c0..c3),
;   %3 = even pmaddwd coefficients, %4 = odd pmaddwd coefficients.
; Clobbers mm1-mm3; the add-to-dst and store are elided from this view.
139 %macro COL_TRANSFORM 4
; Split even/odd coefficients via word shuffles.
140 pshufw mm3, %2, 0xDD ; col. 1,3,1,3
141 pshufw %2, %2, 0x88 ; col. 0,2,0,2
; Multiply-accumulate into dword pairs: evens give z0|z1, odds give z3|z2.
142 pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
143 pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
; Swap halves so both butterfly sums can be formed with one add and one sub.
145 pshufw mm1, %2, 01001110b ; z1 | z0
146 pshufw mm2, mm3, 01001110b ; z2 | z3
147 paddd %2, mm3 ; z0+z3 | z1+z2
148 psubd mm1, mm2 ; z1-z2 | z0-z3
; Full 4x4 IDCT + add to dst: d = dst pointer, s = stride, b = block.
; The row pass (ROW_TRANSFORM) and the stride advances between row pairs
; are elided from this view -- coefficients arrive in mm0/mm4/mm6/mm7.
160 cglobal rv34_idct_add, 3,3,0, d, s, b
; First call loads the coefficient quads from memory; subsequent calls
; reuse them from mm0/mm4 to avoid redundant loads.
162 COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
163 mova mm0, [pw_col_coeffs+ 0]
164 COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
165 mova mm4, [pw_col_coeffs+ 8]
; Rows 2 and 3 (dst advanced by 2*stride in elided code, presumably).
167 COL_TRANSFORM [dq], mm6, mm0, mm4
168 COL_TRANSFORM [dq+sq], mm7, mm0, mm4
171 ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; SSE2/SSE4 DC-add: broadcasts the scaled DC, adds it to all four 4-byte
; rows of dst with saturation, then scatters the result.  Most of the body
; (DC scaling, row loads, packing into m2) is elided from this view.
172 %macro RV34_IDCT_DC_ADD 0
173 cglobal rv34_idct_dc_add, 3, 3, 6
; Store rows via dword extracts; r2 presumably holds dst+2*stride here
; (set up in elided code -- TODO confirm), so these cover rows 1 and 3.
196 pextrd [r0+r1], m2, 1
198 pextrd [r2+r1], m2, 3