1 /*****************************************************************************
2 * ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
3 *****************************************************************************
4 * Copyright (C) 1999, 2000, 2001 VideoLAN
5 * $Id: ac3_downmix_sse.c,v 1.6 2001/11/28 15:08:05 massiot Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
8 * Aaron Holtzman <aholtzma@engr.uvic.ca>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define MODULE_NAME downmixsse
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
36 #include "ac3_downmix.h"
/* Scale factor that stream_sample_1ch_to_s16 multiplies every sample by.
 * Despite the name, the value is approximately 1/sqrt(2) (0.7071068), not
 * sqrt(2). The __asm__ label pins the assembler-level symbol name so that the
 * inline assembly can refer to it literally as $sqrt2_sse. */
38 static const float sqrt2_sse __asm__ ("sqrt2_sse") __attribute__ ((aligned (16))) = 0.7071068;
/*****************************************************************************
 * downmix_3f_2r_to_2ch: mix left, center, right and two surrounds down to 2ch
 *****************************************************************************
 * samples is passed in %eax and dm_par in %ecx (see the operand list).
 * Channel blocks are read at byte offsets 0 (left), 1024 (center),
 * 2048 (right), 3072 (left surround) and 4096 (right surround) from samples.
 * Three floats are read from dm_par at byte offsets 0 (unit), 4 (clev) and
 * 8 (slev). For four packed floats at a time the code computes
 *     left'  = unit*left  + clev*center + slev*leftsur
 *     right' = unit*right + clev*center + slev*rightsur
 * and stores left' at offset 0 and right' at offset 1024, i.e. in place; the
 * right output overwrites the center block after it has been read.
 * NOTE(review): the loop label, the pointer advance and the branch that use
 * the counter in %ebx are not visible in this excerpt - presumably 64 steps
 * of 4 floats each; confirm against the full source.
 * NOTE(review): no clobber list is visible although XMM registers, %ebx and
 * memory are written - confirm how the full source handles this.
 *****************************************************************************/
40 void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
42 __asm__ __volatile__ (
45 "movl $64, %%ebx\n" /* loop counter */
47 "movss (%%ecx), %%xmm5\n" /* unit (dm_par + 0) */
48 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
50 "movss 4(%%ecx), %%xmm6\n" /* clev (dm_par + 4) */
51 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
53 "movss 8(%%ecx), %%xmm7\n" /* slev (dm_par + 8) */
54 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
58 "movaps (%%eax), %%xmm0\n" /* left */
59 "movaps 2048(%%eax), %%xmm1\n" /* right */
60 "movaps 1024(%%eax), %%xmm2\n" /* center */
61 "movaps 3072(%%eax), %%xmm3\n" /* leftsur */
62 "movaps 4096(%%eax), %%xmm4\n" /* rightsur */
63 "mulps %%xmm5, %%xmm0\n" /* left *= unit */
64 "mulps %%xmm5, %%xmm1\n" /* right *= unit */
65 "mulps %%xmm6, %%xmm2\n" /* center *= clev */
66 "addps %%xmm2, %%xmm0\n" /* left += center */
67 "addps %%xmm2, %%xmm1\n" /* right += center */
68 "mulps %%xmm7, %%xmm3\n" /* leftsur *= slev */
69 "mulps %%xmm7, %%xmm4\n" /* rightsur *= slev */
70 "addps %%xmm3, %%xmm0\n" /* left += leftsur */
71 "addps %%xmm4, %%xmm1\n" /* right += rightsur */
73 "movaps %%xmm0, (%%eax)\n" /* store mixed left at offset 0 */
74 "movaps %%xmm1, 1024(%%eax)\n" /* store mixed right at offset 1024 */
82 : "a" (samples), "c" (dm_par)); /* samples in %eax, dm_par in %ecx */
/*****************************************************************************
 * downmix_2f_2r_to_2ch: mix left, right and two surrounds down to 2ch
 *****************************************************************************
 * samples is passed in %eax and dm_par in %ecx (see the operand list).
 * Channel blocks are read at byte offsets 0 (left), 1024 (right),
 * 2048 (left surround) and 3072 (right surround) from samples. Two floats are
 * read from dm_par at byte offsets 0 (unit) and 8 (slev). For four packed
 * floats at a time the code computes
 *     left'  = unit*left  + slev*leftsur
 *     right' = unit*right + slev*rightsur
 * and stores left' at offset 0 and right' at offset 1024, i.e. in place.
 * NOTE(review): the loop label, the pointer advance and the branch that use
 * the counter in %ebx are not visible in this excerpt - presumably 64 steps
 * of 4 floats each; confirm against the full source.
 * NOTE(review): no clobber list is visible although XMM registers, %ebx and
 * memory are written - confirm how the full source handles this.
 *****************************************************************************/
85 void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
87 __asm__ __volatile__ (
90 "movl $64, %%ebx\n" /* loop counter */
92 "movss (%%ecx), %%xmm5\n" /* unit (dm_par + 0) */
93 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
95 "movss 8(%%ecx), %%xmm7\n" /* slev (dm_par + 8) */
96 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
100 "movaps (%%eax), %%xmm0\n" /* left */
101 "movaps 1024(%%eax), %%xmm1\n" /* right */
102 "movaps 2048(%%eax), %%xmm3\n" /* leftsur */
103 "movaps 3072(%%eax), %%xmm4\n" /* rightsur */
104 "mulps %%xmm5, %%xmm0\n" /* left *= unit */
105 "mulps %%xmm5, %%xmm1\n" /* right *= unit */
106 "mulps %%xmm7, %%xmm3\n" /* leftsur *= slev */
107 "mulps %%xmm7, %%xmm4\n" /* rightsur *= slev */
108 "addps %%xmm3, %%xmm0\n" /* left += leftsur */
109 "addps %%xmm4, %%xmm1\n" /* right += rightsur */
111 "movaps %%xmm0, (%%eax)\n" /* store mixed left at offset 0 */
112 "movaps %%xmm1, 1024(%%eax)\n" /* store mixed right at offset 1024 */
120 : "a" (samples), "c" (dm_par)); /* samples in %eax, dm_par in %ecx */
/*****************************************************************************
 * downmix_3f_1r_to_2ch: mix left, center, right and one surround down to 2ch
 *****************************************************************************
 * samples is passed in %eax and dm_par in %ecx (see the operand list).
 * Channel blocks are read at byte offsets 0 (left), 1024 (center),
 * 2048 (right) and 3072 (surround) from samples. Three floats are read from
 * dm_par at byte offsets 0 (unit), 4 (clev) and 8 (slev). For four packed
 * floats at a time the code computes
 *     left'  = unit*left  + clev*center - slev*sur
 *     right' = unit*right + clev*center + slev*sur
 * (the surround term is subtracted on the left and added on the right) and
 * stores left' at offset 0 and right' at offset 1024, i.e. in place; the
 * right output overwrites the center block after it has been read.
 * NOTE(review): the loop label, the pointer advance and the branch that use
 * the counter in %ebx are not visible in this excerpt - presumably 64 steps
 * of 4 floats each; confirm against the full source.
 * NOTE(review): no clobber list is visible although XMM registers, %ebx and
 * memory are written - confirm how the full source handles this.
 *****************************************************************************/
123 void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
125 __asm__ __volatile__ (
128 "movl $64, %%ebx\n" /* loop counter */
130 "movss (%%ecx), %%xmm5\n" /* unit (dm_par + 0) */
131 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
133 "movss 4(%%ecx), %%xmm6\n" /* clev (dm_par + 4) */
134 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
136 "movss 8(%%ecx), %%xmm7\n" /* slev (dm_par + 8) */
137 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
141 "movaps (%%eax), %%xmm0\n" /* left */
142 "movaps 2048(%%eax), %%xmm1\n" /* right */
143 "movaps 1024(%%eax), %%xmm2\n" /* center */
144 "movaps 3072(%%eax), %%xmm3\n" /* sur */
145 "mulps %%xmm5, %%xmm0\n" /* left *= unit */
146 "mulps %%xmm5, %%xmm1\n" /* right *= unit */
147 "mulps %%xmm6, %%xmm2\n" /* center *= clev */
148 "addps %%xmm2, %%xmm0\n" /* left += center */
149 "mulps %%xmm7, %%xmm3\n" /* sur *= slev */
150 "addps %%xmm2, %%xmm1\n" /* right += center */
151 "subps %%xmm3, %%xmm0\n" /* left -= sur */
152 "addps %%xmm3, %%xmm1\n" /* right += sur */
154 "movaps %%xmm0, (%%eax)\n" /* store mixed left at offset 0 */
155 "movaps %%xmm1, 1024(%%eax)\n" /* store mixed right at offset 1024 */
163 : "a" (samples), "c" (dm_par)); /* samples in %eax, dm_par in %ecx */
/*****************************************************************************
 * downmix_2f_1r_to_2ch: mix left, right and one surround down to 2ch
 *****************************************************************************
 * samples is passed in %eax and dm_par in %ecx (see the operand list).
 * Channel blocks are read at byte offsets 0 (left), 1024 (right) and
 * 2048 (surround) from samples. Two floats are read from dm_par at byte
 * offsets 0 (unit) and 8 (slev). For four packed floats at a time the code
 * computes
 *     left'  = unit*left  - slev*sur
 *     right' = unit*right + slev*sur
 * (the surround term is subtracted on the left and added on the right) and
 * stores left' at offset 0 and right' at offset 1024, i.e. in place.
 * NOTE(review): the loop label, the pointer advance and the branch that use
 * the counter in %ebx are not visible in this excerpt - presumably 64 steps
 * of 4 floats each; confirm against the full source.
 * NOTE(review): no clobber list is visible although XMM registers, %ebx and
 * memory are written - confirm how the full source handles this.
 *****************************************************************************/
166 void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
168 __asm__ __volatile__ (
171 "movl $64, %%ebx\n" /* loop counter */
173 "movss (%%ecx), %%xmm5\n" /* unit (dm_par + 0) */
174 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
176 "movss 8(%%ecx), %%xmm7\n" /* slev (dm_par + 8) */
177 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
181 "movaps (%%eax), %%xmm0\n" /* left */
182 "movaps 1024(%%eax), %%xmm1\n" /* right */
183 "movaps 2048(%%eax), %%xmm3\n" /* sur */
184 "mulps %%xmm5, %%xmm0\n" /* left *= unit */
185 "mulps %%xmm5, %%xmm1\n" /* right *= unit */
186 "mulps %%xmm7, %%xmm3\n" /* sur *= slev */
187 "subps %%xmm3, %%xmm0\n" /* left -= sur */
188 "addps %%xmm3, %%xmm1\n" /* right += sur */
190 "movaps %%xmm0, (%%eax)\n" /* store mixed left at offset 0 */
191 "movaps %%xmm1, 1024(%%eax)\n" /* store mixed right at offset 1024 */
199 : "a" (samples), "c" (dm_par)); /* samples in %eax, dm_par in %ecx */
/*****************************************************************************
 * downmix_3f_0r_to_2ch: mix left, center and right down to 2ch
 *****************************************************************************
 * samples is passed in %eax and dm_par in %ecx (see the operand list).
 * Channel blocks are read at byte offsets 0 (left), 1024 (center) and
 * 2048 (right) from samples. Two floats are read from dm_par at byte offsets
 * 0 (unit) and 4 (clev). For four packed floats at a time the code computes
 *     left'  = unit*left  + clev*center
 *     right' = unit*right + clev*center
 * and stores left' at offset 0 and right' at offset 1024, i.e. in place; the
 * right output overwrites the center block after it has been read.
 * NOTE(review): the loop label, the pointer advance and the branch that use
 * the counter in %ebx are not visible in this excerpt - presumably 64 steps
 * of 4 floats each; confirm against the full source.
 * NOTE(review): no clobber list is visible although XMM registers, %ebx and
 * memory are written - confirm how the full source handles this.
 *****************************************************************************/
202 void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
204 __asm__ __volatile__ (
207 "movl $64, %%ebx\n" /* loop counter */
209 "movss (%%ecx), %%xmm5\n" /* unit (dm_par + 0) */
210 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
212 "movss 4(%%ecx), %%xmm6\n" /* clev (dm_par + 4) */
213 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
217 "movaps (%%eax), %%xmm0\n" /* left */
218 "movaps 2048(%%eax), %%xmm1\n" /* right */
219 "movaps 1024(%%eax), %%xmm2\n" /* center */
220 "mulps %%xmm5, %%xmm0\n" /* left *= unit */
221 "mulps %%xmm5, %%xmm1\n" /* right *= unit */
222 "mulps %%xmm6, %%xmm2\n" /* center *= clev */
223 "addps %%xmm2, %%xmm0\n" /* left += center */
224 "addps %%xmm2, %%xmm1\n" /* right += center */
226 "movaps %%xmm0, (%%eax)\n" /* store mixed left at offset 0 */
227 "movaps %%xmm1, 1024(%%eax)\n" /* store mixed right at offset 1024 */
235 : "a" (samples), "c" (dm_par)); /* samples in %eax, dm_par in %ecx */
/*****************************************************************************
 * stream_sample_1ch_to_s16: scale one float channel and convert it to s16
 *****************************************************************************
 * s16_samples is passed in %eax and left in %ecx; both are also listed as
 * outputs in the same registers. Each visible step loads four floats from
 * left, multiplies them by the broadcast sqrt2_sse constant, converts them to
 * 32-bit integers with cvtps2pi, packs them to 16 bits with signed saturation
 * and stores 16 bytes (eight s16 values) at s16_samples.
 * Because each packssdw uses the same register as source and destination,
 * every converted pair appears twice: the memory order of the eight values is
 * c0 c1 c0 c1 c2 c3 c2 c3.
 * NOTE(review): if the intent is to duplicate a mono sample into an
 * interleaved left/right pair, the order would have to be c0 c0 c1 c1 -
 * confirm whether the current ordering is intended.
 * NOTE(review): %edx is overwritten by the address load below but does not
 * appear among the visible operands, and no clobber list is visible.
 * NOTE(review): MMX registers are written; an emms instruction, the loop
 * control and the pointer advances are not visible in this excerpt.
 *****************************************************************************/
238 void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
240 __asm__ __volatile__ (
245 "movl $sqrt2_sse, %%edx\n" /* address of the sqrt2_sse constant */
246 "movss (%%edx), %%xmm7\n" /* load the scale factor */
247 "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
252 "movaps (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
253 "mulps %%xmm7, %%xmm0\n" /* scale all four samples */
254 "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 in the low half of xmm2 */
256 "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
257 "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
259 "packssdw %%mm0, %%mm0\n" /* c1 c0 c1 c0 --> mm0, int_16 (same src and dst) */
260 "packssdw %%mm1, %%mm1\n" /* c3 c2 c3 c2 --> mm1, int_16 (same src and dst) */
262 "movq %%mm0, (%%eax)\n" /* store s16 values 0..3 */
263 "movq %%mm1, 8(%%eax)\n" /* store s16 values 4..7 */
273 : "=a" (s16_samples), "=c" (left) /* outputs: %eax, %ecx */
274 : "a" (s16_samples), "c" (left)); /* inputs: the same variables in the same registers */
/*****************************************************************************
 * stream_sample_2ch_to_s16: interleave two float channels and convert to s16
 *****************************************************************************
 * s16_samples is passed in %eax, left in %ecx and right in %edx; all three
 * are also listed as outputs in the same registers. Each visible step loads
 * four floats from left and four from right, interleaves them, converts them
 * to 32-bit integers with cvtps2pi, packs them to 16 bits with signed
 * saturation and stores 16 bytes at s16_samples. The memory order of the
 * eight s16 values is l0 r0 l1 r1 l2 r2 l3 r3. No scaling is applied here.
 * The lane comments below list elements from the highest to the lowest.
 * NOTE(review): MMX registers are written; an emms instruction, the loop
 * control and the pointer advances are not visible in this excerpt, and no
 * clobber list is visible - confirm against the full source.
 *****************************************************************************/
277 void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
279 __asm__ __volatile__ (
286 "movaps (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
287 "movaps (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
288 "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 (saved before xmm0 is overwritten) */
289 "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
290 "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
291 "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
293 "cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
294 "movhlps %%xmm0, %%xmm0\n" /* bring r1 | l1 down to the low half */
295 "cvtps2pi %%xmm0, %%mm1\n" /* r1 l1 --> mm1, int_32 */
296 "cvtps2pi %%xmm2, %%mm2\n" /* r2 l2 --> mm2, int_32 */
297 "movhlps %%xmm2, %%xmm2\n" /* bring r3 | l3 down to the low half */
298 "cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
300 "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
301 "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
303 "movq %%mm0, (%%eax)\n" /* store l0 r0 l1 r1 */
304 "movq %%mm2, 8(%%eax)\n" /* store l2 r2 l3 r3 */
314 : "=a" (s16_samples), "=c" (left), "=d" (right) /* outputs: %eax, %ecx, %edx */
315 : "a" (s16_samples), "c" (left), "d" (right)); /* inputs: the same variables in the same registers */