1 ;******************************************************************************
2 ;* x86 optimized channel mixing
3 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
27 ;-----------------------------------------------------------------------------
28 ; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
29 ; int out_ch, int in_ch);
30 ;-----------------------------------------------------------------------------
32 %macro MIX_2_TO_1_FLTP_FLT 0
; Mix two planar-float channels into one: out = coef0*src[0] + coef1*src[1].
; NOTE(review): the integer prefix on each line is an extraction artifact, and
; the jumps in those prefixes show this is a non-contiguous excerpt -- the
; pointer rebasing, the loop label, the first store and %endmacro are not
; visible here.
33 cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
; src1q = src[1], the second channel pointer (gprsize = sizeof(void *))
34 mov src1q, [srcq+gprsize]
; matrixq = matrix[0]; splat the two scalar coefficients across m4/m5
; (matrix[0][0] at +0, matrix[0][1] at +4 == sizeof(float))
37 mov matrixq, [matrixq ]
38 VBROADCASTSS m4, [matrixq ]
39 VBROADCASTSS m5, [matrixq+4]
; two vectors per iteration; src1q is used as a byte offset from srcq below,
; so it is presumably rebased (src1q -= srcq) in lines not shown -- confirm
; against the full file
43 mulps m1, m5, [srcq+src1q ]
44 mulps m2, m4, [srcq+ mmsize]
45 mulps m3, m5, [srcq+src1q+mmsize]
; store the second mixed vector back into channel 0 (in-place mix)
49 mova [srcq+mmsize], m2
63 ;-----------------------------------------------------------------------------
64 ; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
65 ; int out_ch, int in_ch);
66 ;-----------------------------------------------------------------------------
68 %macro MIX_2_TO_1_S16P_FLT 0
; Mix two planar-s16 channels into one using float matrix coefficients.
; NOTE(review): non-contiguous excerpt (see jumping line-number prefixes);
; the s16<->float conversion, main loop and %endmacro are not visible.
69 cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
; src1q = src[1], second channel pointer (gprsize = sizeof(void *))
70 mov src1q, [srcq+gprsize]
; matrixq = matrix[0]; splat matrix[0][0] into m4 and matrix[0][1] into m5
73 mov matrixq, [matrixq ]
74 VBROADCASTSS m4, [matrixq ]
75 VBROADCASTSS m5, [matrixq+4]
107 ;-----------------------------------------------------------------------------
108 ; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
109 ; int out_ch, int in_ch);
110 ;-----------------------------------------------------------------------------
; Mix two planar-s16 channels using Q8 fixed-point matrix coefficients
; (int16_t **matrix per the prototype above).
; NOTE(review): non-contiguous excerpt -- the coefficient load/splat, the
; loop and the function tail are not visible here.
113 cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
; src1q = src[1], second channel pointer
114 mov src1q, [srcq+gprsize]
; matrixq = matrix[0] (row of Q8 int16 coefficients)
117 mov matrixq, [matrixq]
; load a vector of the second channel; src1q used as a byte offset from srcq,
; presumably rebased in lines not shown -- confirm against the full file
128 mova m2, [srcq+src1q]
148 ;-----------------------------------------------------------------------------
149 ; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
150 ; int out_ch, int in_ch);
151 ;-----------------------------------------------------------------------------
153 %macro MIX_1_TO_2_FLTP_FLT 0
; Upmix one planar-float channel to two, each with its own coefficient:
; out0 = matrix[0][0]*in, out1 = matrix[1][0]*in (in-place over src[0]/src[1]).
; NOTE(review): non-contiguous excerpt -- loop body, store of m0 and
; %endmacro are not visible.
154 cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
; src1q = src[1], destination for the second output channel
155 mov src1q, [src0q+gprsize]
; matrix1q = matrix[1], matrix0q = matrix[0]; splat each row's first
; coefficient into m2/m3
158 mov matrix1q, [matrix0q+gprsize]
159 mov matrix0q, [matrix0q]
160 VBROADCASTSS m2, [matrix0q]
161 VBROADCASTSS m3, [matrix1q]
; store second-channel result; src1q used as byte offset from src0q,
; presumably rebased in lines not shown -- confirm against the full file
168 mova [src0q+src1q], m1
; instantiation guard for the AVX build of this function (body not visible)
177 %if HAVE_AVX_EXTERNAL
182 ;-----------------------------------------------------------------------------
183 ; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
184 ; int out_ch, int in_ch);
185 ;-----------------------------------------------------------------------------
187 %macro MIX_1_TO_2_S16P_FLT 0
; Upmix one planar-s16 channel to two using float matrix coefficients.
; NOTE(review): non-contiguous excerpt -- the s16<->float conversion, the
; loop, the store of m0 and %endmacro are not visible.
188 cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
; src1q = src[1], destination for the second output channel
189 mov src1q, [src0q+gprsize]
; matrix1q = matrix[1], matrix0q = matrix[0]; splat each row's first
; coefficient into m4/m5
192 mov matrix1q, [matrix0q+gprsize]
193 mov matrix0q, [matrix0q]
194 VBROADCASTSS m4, [matrix0q]
195 VBROADCASTSS m5, [matrix1q]
; store second-channel result; src1q used as byte offset from src0q,
; presumably rebased in lines not shown -- confirm against the full file
213 mova [src0q+src1q], m1
; instantiation guard for the AVX build of this function (body not visible)
224 %if HAVE_AVX_EXTERNAL
229 ;-----------------------------------------------------------------------------
230 ; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
231 ; int len, int out_ch, int in_ch);
232 ;-----------------------------------------------------------------------------
234 %macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; Generic N-to-1/2 channel mixer generator for 3..8 input channels, emitting
; one function per (in_channels, out_channels, sample format) combination.
; NOTE(review): this is a non-contiguous excerpt (the integer line prefixes
; jump); many %else/%endif/%endrep lines, the loop labels and %endmacro are
; not visible. Comments below describe only what the visible lines establish.
235 ; define some names to make the code clearer
236 %assign in_channels %1
237 %assign out_channels %2
238 %assign stereo out_channels - 1
245 ; determine how many matrix elements must go on the stack vs. mmregs
246 %assign matrix_elements in_channels * out_channels
; needed_mmregs: number of mm registers reserved for loop temporaries;
; the selecting conditions for 7/5/4/3 are in lines not shown
249 %assign needed_mmregs 7
251 %assign needed_mmregs 5
255 %assign needed_mmregs 4
257 %assign needed_mmregs 3
; whatever mm registers remain can hold splatted coefficients; the rest of
; the coefficients spill to the stack
260 %assign matrix_elements_mm num_mmregs - needed_mmregs
261 %if matrix_elements < matrix_elements_mm
262 %assign matrix_elements_mm matrix_elements
264 %if matrix_elements_mm < matrix_elements
265 %assign matrix_elements_stack matrix_elements - matrix_elements_mm
267 %assign matrix_elements_stack 0
; stack bytes for spilled coefficients (one mm-sized slot each); negative
; needed_stack_size presumably signals cglobal to allocate/align -- confirm
; against x86inc.asm
269 %assign matrix_stack_size matrix_elements_stack * mmsize
271 %assign needed_stack_size -1 * matrix_stack_size
272 %if ARCH_X86_32 && in_channels >= 7
; extra 16 bytes: spill slots for src pointers 5..7 on register-starved x86-32
273 %assign needed_stack_size needed_stack_size - 16
276 cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7
278 ; define src pointers on stack if needed
279 %if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
; stack slots live just above the spilled matrix coefficients (4-byte
; pointers, consistent with the x86-32 guard)
280 %define src5m [rsp+matrix_stack_size+0]
281 %define src6m [rsp+matrix_stack_size+4]
282 %define src7m [rsp+matrix_stack_size+8]
285 ; load matrix pointers
; matrix1q = matrix[1] (only meaningful for stereo output), matrix0q = matrix[0]
289 mov matrix1q, [matrix0q+gprsize]
291 mov matrix0q, [matrix0q]
293 ; define matrix coeff names
; mx_0_<i>/mx_1_<i> name coefficient i of output row 0/1; the companion
; mx_stack_0_<i>/mx_stack_1_<i> flags record whether it lives on the stack
; (1) or in mm register m<j> (0). %%i loop headers are in lines not shown.
295 %assign %%j needed_mmregs
297 %if %%i >= matrix_elements_mm
298 CAT_XDEFINE mx_stack_0_, %%i, 1
299 CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
301 CAT_XDEFINE mx_stack_0_, %%i, 0
302 CAT_XDEFINE mx_0_, %%i, m %+ %%j
; row 1 coefficients are indexed after all in_channels row-0 coefficients
310 %if in_channels + %%i >= matrix_elements_mm
311 CAT_XDEFINE mx_stack_1_, %%i, 1
312 CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
314 CAT_XDEFINE mx_stack_1_, %%i, 0
315 CAT_XDEFINE mx_1_, %%i, m %+ %%j
322 ; load/splat matrix coeffs
; stack-resident coefficients are splatted via m0 then spilled; register-
; resident ones are splatted directly into their assigned mm register
325 %if mx_stack_0_ %+ %%i
326 VBROADCASTSS m0, [matrix0q+4*%%i]
327 mova mx_0_ %+ %%i, m0
329 VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
332 %if mx_stack_1_ %+ %%i
333 VBROADCASTSS m0, [matrix1q+4*%%i]
334 mova mx_1_ %+ %%i, m0
336 VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
342 ; load channel pointers to registers as offsets from the first channel pointer
348 %rep (in_channels - 1)
349 %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
; x86-32, channels 5..7: stage pointer through src5q then park it in its
; stack slot (src<i>m)
350 mov src5q, [src0q+%%i*gprsize]
352 mov src %+ %%i %+ m, src5q
; normal case: keep pointer in its own register, biased by lenq so the loop
; can index with a single negative-to-zero counter -- the rebasing of lenq
; itself is in lines not shown; confirm against the full file
354 mov src %+ %%i %+ q, [src0q+%%i*gprsize]
355 add src %+ %%i %+ q, lenq
363 ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
364 ; pointers, so we have to load some of them from the stack each time
365 %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
367 ; mix with s16p input
; m0 = first channel samples; unpack/convert steps are in lines not shown
368 mova m0, [src0q+lenq]
379 %rep (in_channels - 1)
380 %if copy_src_from_stack
381 %define src_ptr src5q
383 %define src_ptr src %+ %%i %+ q
386 %if copy_src_from_stack
387 mov src_ptr, src %+ %%i %+ m
389 mova m4, [src_ptr+lenq]
; stereo path: accumulate row 1 into m2/m3 and row 0 into m0/m1
; (FMULADD_PS acc = a*b + c, with a scratch register as the last operand)
393 FMULADD_PS m2, m4, mx_1_ %+ %%i, m2, m6
394 FMULADD_PS m3, m5, mx_1_ %+ %%i, m3, m6
395 FMULADD_PS m0, m4, mx_0_ %+ %%i, m0, m4
396 FMULADD_PS m1, m5, mx_0_ %+ %%i, m1, m5
398 %if copy_src_from_stack
399 mov src_ptr, src %+ %%i %+ m
; mono path: only row 0 accumulators m0/m1 are updated
401 mova m2, [src_ptr+lenq]
405 FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m4
406 FMULADD_PS m1, m3, mx_0_ %+ %%i, m1, m4
; store results in place over channel 1 / channel 0 (pack back to s16
; presumably happens in lines not shown)
414 mova [src1q+lenq], m2
419 mova [src0q+lenq], m0
421 ; mix with fltp input
; for mono with the first coefficient in a register, the initial load is
; folded into the first mulps instead of a separate mova
422 %if stereo || mx_stack_0_0
423 mova m0, [src0q+lenq]
428 %if stereo || mx_stack_0_0
431 mulps m0, mx_0_0, [src0q+lenq]
434 %rep (in_channels - 1)
435 %if copy_src_from_stack
436 %define src_ptr src5q
437 mov src_ptr, src %+ %%i %+ m
439 %define src_ptr src %+ %%i %+ q
441 ; avoid extra load for mono if matrix is in a mm register
442 %if stereo || mx_stack_0_ %+ %%i
443 mova m2, [src_ptr+lenq]
446 FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3
448 %if stereo || mx_stack_0_ %+ %%i
449 FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2
; mono + register-resident coefficient: use the memory operand directly
451 FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
455 mova [src0q+lenq], m0
457 mova [src1q+lenq], m1
463 ; zero ymm high halves
470 %macro MIX_3_8_TO_1_2_FLT_FUNCS 0
; Instantiate every mix_<N>_to_<1|2>_<fmt>_flt function for N = 3..8 and
; each SIMD flavor.
; NOTE(review): non-contiguous excerpt -- the %rep loop over %%i, the
; INIT_XMM/INIT_YMM dispatch lines between groups, the %endif/%endrep lines
; and %endmacro are not visible. The repeated s16p pairs below presumably
; belong to different instruction-set groups stripped from this view.
474 MIX_3_8_TO_1_2_FLT %%i, 1, fltp
475 MIX_3_8_TO_1_2_FLT %%i, 2, fltp
477 MIX_3_8_TO_1_2_FLT %%i, 1, s16p
478 MIX_3_8_TO_1_2_FLT %%i, 2, s16p
480 MIX_3_8_TO_1_2_FLT %%i, 1, s16p
481 MIX_3_8_TO_1_2_FLT %%i, 2, s16p
482 ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
483 %if HAVE_AVX_EXTERNAL
484 %if ARCH_X86_64 || %%i < 6
489 MIX_3_8_TO_1_2_FLT %%i, 1, fltp
490 MIX_3_8_TO_1_2_FLT %%i, 2, fltp
492 MIX_3_8_TO_1_2_FLT %%i, 1, s16p
493 MIX_3_8_TO_1_2_FLT %%i, 2, s16p
495 %if HAVE_FMA4_EXTERNAL
496 %if ARCH_X86_64 || %%i < 6
501 MIX_3_8_TO_1_2_FLT %%i, 1, fltp
502 MIX_3_8_TO_1_2_FLT %%i, 2, fltp
504 MIX_3_8_TO_1_2_FLT %%i, 1, s16p
505 MIX_3_8_TO_1_2_FLT %%i, 2, s16p
; top-level invocation: actually emit all the functions defined above
511 MIX_3_8_TO_1_2_FLT_FUNCS