1 /*****************************************************************************
2 * ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
3 *****************************************************************************
4 * Copyright (C) 1999, 2000, 2001 VideoLAN
5 * $Id: ac3_downmix_sse.c,v 1.9 2001/12/30 07:09:54 sam Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
8 * Aaron Holtzman <aholtzma@engr.uvic.ca>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
28 #include <videolan/vlc.h>
30 #include "ac3_downmix.h"
/* Scale factor that stream_sample_1ch_to_s16 loads with "movl $sqrt2_sse".
 * Despite the name, 0.7071068 is approximately 1/sqrt(2) (i.e. sqrt(2)/2).
 * The __asm__ label fixes the assembler-level symbol name to "sqrt2_sse" so
 * the inline assembly can reference it literally; the object is declared
 * 16-byte aligned. */
32 const float sqrt2_sse __asm__ ("sqrt2_sse") __attribute__ ((aligned (16))) = 0.7071068;
/*****************************************************************************
 * downmix_3f_2r_to_2ch: SSE downmix of left/center/right + two surrounds
 *****************************************************************************
 * For each group of four floats the visible instructions compute, in place:
 *   left  = unit * left  + clev * center + slev * leftsur
 *   right = unit * right + clev * center + slev * rightsur
 * Inputs are read at byte offsets 0 (left), 1024 (center), 2048 (right),
 * 3072 (leftsur) and 4096 (rightsur) from samples; the results are stored at
 * offsets 0 and 1024. unit, clev and slev are the floats found at byte
 * offsets 0, 4 and 8 of *dm_par. movaps requires 16-byte aligned addresses.
 * NOTE(review): the numbers embedded in this listing skip (36 -> 39,
 * 48 -> 52, 68 -> 76), so the braces, the loop label and branch, the pointer
 * increment and any save/restore of ebx are not visible here -- confirm
 * against the complete file before modifying this block.
 *****************************************************************************/
34 void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
36 __asm__ __volatile__ (
39 "movl $64, %%ebx\n" /* loop counter */
41 "movss (%%ecx), %%xmm5\n" /* unit */
42 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
44 "movss 4(%%ecx), %%xmm6\n" /* clev */
45 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
47 "movss 8(%%ecx), %%xmm7\n" /* slev */
48 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
52 "movaps (%%eax), %%xmm0\n" /* left */
53 "movaps 2048(%%eax), %%xmm1\n" /* right */
54 "movaps 1024(%%eax), %%xmm2\n" /* center */
55 "movaps 3072(%%eax), %%xmm3\n" /* leftsur */
56 "movaps 4096(%%eax), %%xmm4\n" /* rightsur */
57 "mulps %%xmm5, %%xmm0\n" /* left * unit */
58 "mulps %%xmm5, %%xmm1\n" /* right * unit */
59 "mulps %%xmm6, %%xmm2\n" /* center * clev */
60 "addps %%xmm2, %%xmm0\n" /* left += scaled center */
61 "addps %%xmm2, %%xmm1\n" /* right += scaled center */
62 "mulps %%xmm7, %%xmm3\n" /* leftsur * slev */
63 "mulps %%xmm7, %%xmm4\n" /* rightsur * slev */
64 "addps %%xmm3, %%xmm0\n" /* left += scaled leftsur */
65 "addps %%xmm4, %%xmm1\n" /* right += scaled rightsur */
/* The right result lands at offset 1024, over the center samples that were
 * already loaded into xmm2 above. */
67 "movaps %%xmm0, (%%eax)\n"
68 "movaps %%xmm1, 1024(%%eax)\n"
/* samples is bound to eax and dm_par to ecx. */
76 : "a" (samples), "c" (dm_par));
/*****************************************************************************
 * downmix_2f_2r_to_2ch: SSE downmix of left/right + two surrounds
 *****************************************************************************
 * For each group of four floats the visible instructions compute, in place:
 *   left  = unit * left  + slev * leftsur
 *   right = unit * right + slev * rightsur
 * Inputs are read at byte offsets 0 (left), 1024 (right), 2048 (leftsur) and
 * 3072 (rightsur) from samples; the results are stored at offsets 0 and 1024.
 * unit and slev are the floats found at byte offsets 0 and 8 of *dm_par.
 * movaps requires 16-byte aligned addresses.
 * NOTE(review): the numbers embedded in this listing skip (81 -> 84,
 * 90 -> 94, 106 -> 114), so the braces, the loop label and branch, the
 * pointer increment and any save/restore of ebx are not visible here --
 * confirm against the complete file before modifying this block.
 *****************************************************************************/
79 void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
81 __asm__ __volatile__ (
84 "movl $64, %%ebx\n" /* loop counter */
86 "movss (%%ecx), %%xmm5\n" /* unit */
87 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
89 "movss 8(%%ecx), %%xmm7\n" /* slev */
90 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
94 "movaps (%%eax), %%xmm0\n" /* left */
95 "movaps 1024(%%eax), %%xmm1\n" /* right */
96 "movaps 2048(%%eax), %%xmm3\n" /* leftsur */
97 "movaps 3072(%%eax), %%xmm4\n" /* rightsur */
98 "mulps %%xmm5, %%xmm0\n" /* left * unit */
99 "mulps %%xmm5, %%xmm1\n" /* right * unit */
100 "mulps %%xmm7, %%xmm3\n" /* leftsur * slev */
101 "mulps %%xmm7, %%xmm4\n" /* rightsur * slev */
102 "addps %%xmm3, %%xmm0\n" /* left += scaled leftsur */
103 "addps %%xmm4, %%xmm1\n" /* right += scaled rightsur */
105 "movaps %%xmm0, (%%eax)\n"
106 "movaps %%xmm1, 1024(%%eax)\n"
/* samples is bound to eax and dm_par to ecx. */
114 : "a" (samples), "c" (dm_par));
/*****************************************************************************
 * downmix_3f_1r_to_2ch: SSE downmix of left/center/right + one surround
 *****************************************************************************
 * For each group of four floats the visible instructions compute, in place:
 *   left  = unit * left  + clev * center - slev * sur
 *   right = unit * right + clev * center + slev * sur
 * Note the surround is subtracted from left and added to right.
 * Inputs are read at byte offsets 0 (left), 1024 (center), 2048 (right) and
 * 3072 (sur) from samples; the results are stored at offsets 0 and 1024.
 * unit, clev and slev are the floats found at byte offsets 0, 4 and 8 of
 * *dm_par. movaps requires 16-byte aligned addresses.
 * NOTE(review): the numbers embedded in this listing skip (119 -> 122,
 * 131 -> 135, 149 -> 157), so the braces, the loop label and branch, the
 * pointer increment and any save/restore of ebx are not visible here --
 * confirm against the complete file before modifying this block.
 *****************************************************************************/
117 void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
119 __asm__ __volatile__ (
122 "movl $64, %%ebx\n" /* loop counter */
124 "movss (%%ecx), %%xmm5\n" /* unit */
125 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
127 "movss 4(%%ecx), %%xmm6\n" /* clev */
128 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
130 "movss 8(%%ecx), %%xmm7\n" /* slev */
131 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
135 "movaps (%%eax), %%xmm0\n" /* left */
136 "movaps 2048(%%eax), %%xmm1\n" /* right */
137 "movaps 1024(%%eax), %%xmm2\n" /* center */
138 "movaps 3072(%%eax), %%xmm3\n" /* sur */
139 "mulps %%xmm5, %%xmm0\n" /* left * unit */
140 "mulps %%xmm5, %%xmm1\n" /* right * unit */
141 "mulps %%xmm6, %%xmm2\n" /* center * clev */
142 "addps %%xmm2, %%xmm0\n" /* left += scaled center */
143 "mulps %%xmm7, %%xmm3\n" /* sur * slev */
144 "addps %%xmm2, %%xmm1\n" /* right += scaled center */
145 "subps %%xmm3, %%xmm0\n" /* left -= scaled sur */
146 "addps %%xmm3, %%xmm1\n" /* right += scaled sur */
/* The right result lands at offset 1024, over the center samples that were
 * already loaded into xmm2 above. */
148 "movaps %%xmm0, (%%eax)\n"
149 "movaps %%xmm1, 1024(%%eax)\n"
/* samples is bound to eax and dm_par to ecx. */
157 : "a" (samples), "c" (dm_par));
/*****************************************************************************
 * downmix_2f_1r_to_2ch: SSE downmix of left/right + one surround
 *****************************************************************************
 * For each group of four floats the visible instructions compute, in place:
 *   left  = unit * left  - slev * sur
 *   right = unit * right + slev * sur
 * Note the surround is subtracted from left and added to right.
 * Inputs are read at byte offsets 0 (left), 1024 (right) and 2048 (sur) from
 * samples; the results are stored at offsets 0 and 1024. unit and slev are
 * the floats found at byte offsets 0 and 8 of *dm_par. movaps requires
 * 16-byte aligned addresses.
 * NOTE(review): the numbers embedded in this listing skip (162 -> 165,
 * 171 -> 175, 185 -> 193), so the braces, the loop label and branch, the
 * pointer increment and any save/restore of ebx are not visible here --
 * confirm against the complete file before modifying this block.
 *****************************************************************************/
160 void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
162 __asm__ __volatile__ (
165 "movl $64, %%ebx\n" /* loop counter */
167 "movss (%%ecx), %%xmm5\n" /* unit */
168 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
170 "movss 8(%%ecx), %%xmm7\n" /* slev */
171 "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
175 "movaps (%%eax), %%xmm0\n" /* left */
176 "movaps 1024(%%eax), %%xmm1\n" /* right */
177 "movaps 2048(%%eax), %%xmm3\n" /* sur */
178 "mulps %%xmm5, %%xmm0\n" /* left * unit */
179 "mulps %%xmm5, %%xmm1\n" /* right * unit */
180 "mulps %%xmm7, %%xmm3\n" /* sur * slev */
181 "subps %%xmm3, %%xmm0\n" /* left -= scaled sur */
182 "addps %%xmm3, %%xmm1\n" /* right += scaled sur */
184 "movaps %%xmm0, (%%eax)\n"
185 "movaps %%xmm1, 1024(%%eax)\n"
/* samples is bound to eax and dm_par to ecx. */
193 : "a" (samples), "c" (dm_par));
/*****************************************************************************
 * downmix_3f_0r_to_2ch: SSE downmix of left/center/right, no surround
 *****************************************************************************
 * For each group of four floats the visible instructions compute, in place:
 *   left  = unit * left  + clev * center
 *   right = unit * right + clev * center
 * Inputs are read at byte offsets 0 (left), 1024 (center) and 2048 (right)
 * from samples; the results are stored at offsets 0 and 1024. unit and clev
 * are the floats found at byte offsets 0 and 4 of *dm_par. movaps requires
 * 16-byte aligned addresses.
 * NOTE(review): the numbers embedded in this listing skip (198 -> 201,
 * 207 -> 211, 221 -> 229), so the braces, the loop label and branch, the
 * pointer increment and any save/restore of ebx are not visible here --
 * confirm against the complete file before modifying this block.
 *****************************************************************************/
196 void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
198 __asm__ __volatile__ (
201 "movl $64, %%ebx\n" /* loop counter */
203 "movss (%%ecx), %%xmm5\n" /* unit */
204 "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
206 "movss 4(%%ecx), %%xmm6\n" /* clev */
207 "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
211 "movaps (%%eax), %%xmm0\n" /* left */
212 "movaps 2048(%%eax), %%xmm1\n" /* right */
213 "movaps 1024(%%eax), %%xmm2\n" /* center */
214 "mulps %%xmm5, %%xmm0\n" /* left * unit */
215 "mulps %%xmm5, %%xmm1\n" /* right * unit */
216 "mulps %%xmm6, %%xmm2\n" /* center * clev */
217 "addps %%xmm2, %%xmm0\n" /* left += scaled center */
218 "addps %%xmm2, %%xmm1\n" /* right += scaled center */
/* The right result lands at offset 1024, over the center samples that were
 * already loaded into xmm2 above. */
220 "movaps %%xmm0, (%%eax)\n"
221 "movaps %%xmm1, 1024(%%eax)\n"
/* samples is bound to eax and dm_par to ecx. */
229 : "a" (samples), "c" (dm_par));
/*****************************************************************************
 * stream_sample_1ch_to_s16: convert one float channel to 16-bit samples
 *****************************************************************************
 * The visible instructions load four floats from left (ecx), multiply them
 * by the broadcast constant sqrt2_sse, convert them to 32-bit integers with
 * cvtps2pi, narrow them to 16 bits with signed saturation (packssdw) and
 * store 16 bytes at s16_samples (eax). movaps requires left to be 16-byte
 * aligned.
 * "movl $sqrt2_sse" takes the symbol's absolute address and overwrites edx.
 * NOTE(review): the numbers embedded in this listing skip (234 -> 239,
 * 241 -> 246, 257 -> 267), so the braces, the loop counter, label and branch,
 * the pointer increments, any push/pop and any emms after the MMX code are
 * not visible here -- confirm against the complete file.
 * NOTE(review): packssdw with the same register as source and destination
 * produces the word order c1 c0 c1 c0 (high to low), not c1 c1 c0 c0 as the
 * comments below state -- confirm which layout the caller expects.
 *****************************************************************************/
232 void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
234 __asm__ __volatile__ (
239 "movl $sqrt2_sse, %%edx\n"
240 "movss (%%edx), %%xmm7\n"
241 "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
246 "movaps (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
247 "mulps %%xmm7, %%xmm0\n" /* scale by the sqrt2_sse constant */
248 "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
250 "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
251 "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
253 "packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
254 "packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
256 "movq %%mm0, (%%eax)\n"
257 "movq %%mm1, 8(%%eax)\n"
/* eax and ecx are listed both as outputs and as inputs, so the assembly is
 * allowed to modify them. */
267 : "=a" (s16_samples), "=c" (left)
268 : "a" (s16_samples), "c" (left));
/*****************************************************************************
 * stream_sample_2ch_to_s16: interleave two float channels as 16-bit samples
 *****************************************************************************
 * The visible instructions load four floats from left (ecx) and four from
 * right (edx), interleave them as l0 r0 l1 r1 l2 r2 l3 r3, convert each pair
 * to 32-bit integers with cvtps2pi, narrow them to 16 bits with signed
 * saturation (packssdw) and store 16 bytes at s16_samples (eax). No scaling
 * is applied here. movaps requires left and right to be 16-byte aligned.
 * NOTE(review): the numbers embedded in this listing skip (273 -> 280,
 * 298 -> 308), so the braces, the loop counter, label and branch, the pointer
 * increments, any push/pop and any emms after the MMX code are not visible
 * here -- confirm against the complete file.
 *****************************************************************************/
271 void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
273 __asm__ __volatile__ (
280 "movaps (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
281 "movaps (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
282 "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
283 "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
284 "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
285 "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
287 "cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
288 "movhlps %%xmm0, %%xmm0\n" /* bring r1 | l1 down to the low half */
289 "cvtps2pi %%xmm0, %%mm1\n" /* r1 l1 --> mm1, int_32 */
290 "cvtps2pi %%xmm2, %%mm2\n" /* r2 l2 --> mm2, int_32 */
291 "movhlps %%xmm2, %%xmm2\n" /* bring r3 | l3 down to the low half */
292 "cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
294 "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
295 "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
297 "movq %%mm0, (%%eax)\n"
298 "movq %%mm2, 8(%%eax)\n"
/* eax, ecx and edx are listed both as outputs and as inputs, so the assembly
 * is allowed to modify them. */
308 : "=a" (s16_samples), "=c" (left), "=d" (right)
309 : "a" (s16_samples), "c" (left), "d" (right));