1 ;******************************************************************************
2 ;* x86 optimized Format Conversion Utils
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
6 ;* This file is part of Libav.
8 ;* Libav is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* Libav is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with Libav; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; Scale factors: IEEE-754 single-precision bit patterns, replicated 8 times
; (32 bytes) or 4 times (16 bytes).
28 pf_s32_inv_scale: times 8 dd 0x30000000 ; 2^-31
29 pf_s32_scale: times 8 dd 0x4f000000 ; 2^31
30 pf_s32_clip: times 8 dd 0x4effffff ; 2^31 - 128, the largest single below 2^31
31 pf_s16_inv_scale: times 4 dd 0x38000000 ; 2^-15
32 pf_s16_scale: times 4 dd 0x47000000 ; 2^15
; pshufb masks: a -1 index byte zeroes the destination byte, so every selected
; source word lands in the upper half of an otherwise zero dword.
33 pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11 ; source words 0, 1, 4, 5
34 pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15 ; source words 2, 3, 6, 7
; Word-granular shuffle masks built by SHUFFLE_MASK_W.
; NOTE(review): SHUFFLE_MASK_W is presumably provided by x86util.asm -- confirm.
35 pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
36 pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
; Word mask: even-indexed words are all zero, odd-indexed words are all ones.
37 pw_zero_even: times 4 dw 0x0000, 0xffff
41 ;------------------------------------------------------------------------------
42 ; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
43 ;------------------------------------------------------------------------------
; Widens int16 samples to int32 (see the prototype above).
; cglobal declares 3 arguments, 3 general-purpose and 3 vector registers.
46 cglobal conv_s16_to_s32, 3,3,3, dst, src, len
; Output elements are twice the size of input elements, so dst is advanced
; by 2*lenq.
48 lea dstq, [dstq+2*lenq]
; Two consecutive result vectors are stored, mmsize bytes apart.
; NOTE(review): the [dstq+2*lenq] addressing implies lenq is a negative byte
; offset that counts up toward zero -- confirm against the loop setup.
57 mova [dstq+2*lenq ], m0
58 mova [dstq+2*lenq+mmsize], m1
63 ;------------------------------------------------------------------------------
64 ; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
65 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16_to_flt (int16 -> float, see the
; prototype above).
67 %macro CONV_S16_TO_FLT 0
68 cglobal conv_s16_to_flt, 3,3,3, dst, src, len
; float output is twice the size of int16 input, so dst is advanced by 2*lenq.
71 lea dstq, [dstq + 2*lenq]
; m2 = 2^-15 in every lane (bit pattern 0x38000000).
73 mova m2, [pf_s16_inv_scale]
; Two consecutive result vectors are stored, mmsize bytes apart.
; NOTE(review): m0/m1 are presumably scaled by m2 before these stores, and
; lenq presumably is a negative byte offset -- confirm.
82 mova [dstq+2*lenq ], m0
83 mova [dstq+2*lenq+mmsize], m1
94 ;------------------------------------------------------------------------------
95 ; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
96 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s32_to_s16 (int32 -> int16, see the
; prototype above).
98 %macro CONV_S32_TO_S16 0
99 cglobal conv_s32_to_s16, 3,3,4, dst, src, len
; int32 input is twice the size of int16 output, so src is advanced by 2*lenq.
101 lea srcq, [srcq+2*lenq]
; Load four consecutive input vectors.
105 mova m0, [srcq+2*lenq ]
106 mova m1, [srcq+2*lenq+ mmsize]
107 mova m2, [srcq+2*lenq+2*mmsize]
108 mova m3, [srcq+2*lenq+3*mmsize]
; Only m0 and m2 are stored: four int32 vectors yield two int16 vectors.
115 mova [dstq+lenq ], m0
116 mova [dstq+lenq+mmsize], m2
132 ;------------------------------------------------------------------------------
133 ; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
134 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s32_to_flt (int32 -> float, see the
; prototype above).
136 %macro CONV_S32_TO_FLT 0
137 cglobal conv_s32_to_flt, 3,3,3, dst, src, len
; m0 = 2^-31 in every lane (bit pattern 0x30000000).
142 mova m0, [pf_s32_inv_scale]
; Convert two vectors of int32 to float directly from memory.
145 cvtdq2ps m1, [srcq+lenq ]
146 cvtdq2ps m2, [srcq+lenq+mmsize]
; Input and output elements have the same size, so src and dst share lenq.
; NOTE(review): m1/m2 are presumably multiplied by m0 before these stores -- confirm.
149 mova [dstq+lenq ], m1
150 mova [dstq+lenq+mmsize], m2
161 ;------------------------------------------------------------------------------
162 ; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
163 ;------------------------------------------------------------------------------
; Converts float samples to int16 (see the prototype above).
; cglobal declares 3 arguments, 3 general-purpose and 5 vector registers.
166 cglobal conv_flt_to_s16, 3,3,5, dst, src, len
; float input is twice the size of int16 output, so src is advanced by 2*lenq.
168 lea srcq, [srcq+2*lenq]
; m4 = 2^15 in every lane (bit pattern 0x47000000).
171 mova m4, [pf_s16_scale]
; Load four consecutive input vectors.
173 mova m0, [srcq+2*lenq ]
174 mova m1, [srcq+2*lenq+1*mmsize]
175 mova m2, [srcq+2*lenq+2*mmsize]
176 mova m3, [srcq+2*lenq+3*mmsize]
; Only m0 and m2 are stored: four float vectors yield two int16 vectors.
187 mova [dstq+lenq ], m0
188 mova [dstq+lenq+mmsize], m2
193 ;------------------------------------------------------------------------------
194 ; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
195 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_flt_to_s32 (float -> int32, see the
; prototype above).
197 %macro CONV_FLT_TO_S32 0
198 cglobal conv_flt_to_s32, 3,3,6, dst, src, len
; m4 = 2^31 in every lane (bit pattern 0x4f000000).
203 mova m4, [pf_s32_scale]
; m5 = 2^31 - 128 in every lane, the largest single below 2^31.
; NOTE(review): presumably used to clamp the scaled values so that positive
; full scale cannot overflow int32 -- confirm.
204 mova m5, [pf_s32_clip]
; m0-m3 = four consecutive input vectors, each multiplied by 2^31.
206 mulps m0, m4, [srcq+lenq ]
207 mulps m1, m4, [srcq+lenq+1*mmsize]
208 mulps m2, m4, [srcq+lenq+2*mmsize]
209 mulps m3, m4, [srcq+lenq+3*mmsize]
; Input and output elements have the same size: four vectors in, four out.
218 mova [dstq+lenq ], m0
219 mova [dstq+lenq+1*mmsize], m1
220 mova [dstq+lenq+2*mmsize], m2
221 mova [dstq+lenq+3*mmsize], m3
232 ;------------------------------------------------------------------------------
233 ; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
235 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16p_to_s16_2ch: two planar int16
; channels are combined into one interleaved int16 stream.
237 %macro CONV_S16P_TO_S16_2CH 0
238 cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
; src0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
239 mov src1q, [src0q+gprsize]
; The interleaved output is twice the size of one channel, so dst is advanced
; by 2*lenq.
244 lea dstq, [dstq+2*lenq]
; Load two vectors from each channel.
247 mova m0, [src0q+lenq ]
248 mova m1, [src1q+lenq ]
249 mova m2, [src0q+lenq+mmsize]
250 mova m3, [src1q+lenq+mmsize]
; Interleave the words of both channels; m4 is the scratch register.
; NOTE(review): SBUTTERFLY2 presumably leaves the low interleave in the first
; register and the high interleave in the second -- confirm in x86util.asm.
251 SBUTTERFLY2 wd, 0, 1, 4
252 SBUTTERFLY2 wd, 2, 3, 4
; Four output vectors for the two-plus-two input vectors.
253 mova [dstq+2*lenq+0*mmsize], m0
254 mova [dstq+2*lenq+1*mmsize], m1
255 mova [dstq+2*lenq+2*mmsize], m2
256 mova [dstq+2*lenq+3*mmsize], m3
267 ;------------------------------------------------------------------------------
268 ; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
270 ;------------------------------------------------------------------------------
272 ;------------------------------------------------------------------------------
273 ; NOTE: In the 6-channel functions, len could be used as an index on x86-64
274 ; instead of just a counter, which would avoid incrementing the
275 ; pointers, but the extra complexity and amount of code is not worth
276 ; the small gain. On x86-32 there are not enough registers to use len
277 ; as an index without keeping two of the pointers on the stack and
278 ; loading them in each iteration.
279 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16p_to_s16_6ch: six planar int16
; channels are combined into one interleaved int16 stream. The numbers in the
; register comments are positions in the interleaved output.
281 %macro CONV_S16P_TO_S16_6CH 0
; Two alternative declarations: the first keeps len in a register (3 args,
; 8 GPRs); the second loads only dst and src0 (2 args, 7 GPRs) and reads len
; from its stack slot through lend.
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
283 cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
285 cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
286 %define lend dword r2m
; src0q points at the array of channel pointers here; fetch channels 1-5.
288 mov src1q, [src0q+1*gprsize]
289 mov src2q, [src0q+2*gprsize]
290 mov src3q, [src0q+3*gprsize]
291 mov src4q, [src0q+4*gprsize]
292 mov src5q, [src0q+5*gprsize]
; NOTE(review): the [src0q+srcNq] operands below imply that src1q-src5q have
; been turned into offsets relative to src0q by this point -- confirm.
; Half-width variant: 4 samples per channel, using 64-bit loads and stores.
300 %if cpuflag(sse2slow)
301 movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x
302 movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
303 movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
304 movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
305 movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
306 movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
; Interleave channel pairs (0,1), (2,3) and (4,5) word by word.
308 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
309 punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
310 punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
; Regroup dwords so that each register holds two runs of adjacent output.
312 shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
313 shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
314 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
316 pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
317 pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
318 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
; Low halves carry output 0-11, high halves carry output 12-23.
319 movq [dstq+0*mmsize/2], m1
320 movq [dstq+1*mmsize/2], m0
321 movq [dstq+2*mmsize/2], m2
322 movhps [dstq+3*mmsize/2], m1
323 movhps [dstq+4*mmsize/2], m0
324 movhps [dstq+5*mmsize/2], m2
; Full-width variant: 8 samples per channel, using whole-register loads and
; stores.
329 mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
330 mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
331 mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
332 mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
333 mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
334 mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
; Interleave channel pairs word by word; m6 is the scratch register.
336 SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
337 ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
338 SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
339 ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
340 SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
341 ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
; Regroup dwords; the same three-shuffle pattern is applied to the first half
; (m0, m2, m4) and to the second half (m1, m3, m5).
343 shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
344 shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
345 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
346 SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
347 shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
348 shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
349 shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
350 SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
; Order the dwords inside each register.
352 pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
353 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
354 pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
355 pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
356 pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
357 pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
; Combine 64-bit halves into six fully ordered output vectors.
359 punpcklqdq m6, m4, m0 ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
360 punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
361 shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
362 SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
363 punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
364 punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
365 shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
366 SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
; Store in output order: m4, m2, m0, m5, m3, m1.
367 mova [dstq+0*mmsize], m4
368 mova [dstq+1*mmsize], m2
369 mova [dstq+2*mmsize], m0
370 mova [dstq+3*mmsize], m5
371 mova [dstq+4*mmsize], m3
372 mova [dstq+5*mmsize], m1
388 ;------------------------------------------------------------------------------
389 ; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
391 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16p_to_flt_2ch: two planar int16
; channels are combined into one interleaved float stream.
393 %macro CONV_S16P_TO_FLT_2CH 0
394 cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
; src0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
396 mov src1q, [src0q+gprsize]
; Interleaved float output is four times the size of one int16 channel.
398 lea dstq, [dstq+4*lenq]
; m5 = 2^-31 in every lane (bit pattern 0x30000000).
402 mova m5, [pf_s32_inv_scale]
404 mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
405 mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
; Interleave the two channels word by word; m3 is the scratch register.
406 SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
407 ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
; Each unpack puts a word of m3 in the low half and a sample in the high half
; of every dword.
; NOTE(review): m3 is expected to be zero here, which makes each dword equal
; to (sample << 16) and explains the 2^-31 scale -- confirm m3 is cleared
; after SBUTTERFLY2 used it as scratch.
409 punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3
410 punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7
411 punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11
412 punpckhwd m3, m4 ; m3 = 12, 13, 14, 15
; Four float vectors are written for every int16 vector pair loaded above.
421 mova [dstq+4*lenq ], m0
422 mova [dstq+4*lenq+ mmsize], m1
423 mova [dstq+4*lenq+2*mmsize], m2
424 mova [dstq+4*lenq+3*mmsize], m3
435 ;------------------------------------------------------------------------------
436 ; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
438 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16p_to_flt_6ch: six planar int16
; channels are combined into one interleaved float stream. The numbers in the
; register comments are positions in the interleaved output.
440 %macro CONV_S16P_TO_FLT_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
442 cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
444 cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
445 %define lend dword r2m
; srcq points at the array of channel pointers here; fetch channels 1-5.
447 mov src1q, [srcq+1*gprsize]
448 mov src2q, [srcq+2*gprsize]
449 mov src3q, [srcq+3*gprsize]
450 mov src4q, [srcq+4*gprsize]
451 mov src5q, [srcq+5*gprsize]
; m7 = 2^-31 in every lane (bit pattern 0x30000000).
458 mova m7, [pf_s32_inv_scale]
; unpack_even is kept in m6. unpack_odd is either kept in m8 or used directly
; as a memory operand.
; NOTE(review): presumably m8 is used only where 16 vector registers exist -- confirm.
460 %define unpack_even m6
461 mova m6, [pb_shuf_unpack_even]
463 %define unpack_odd m8
464 mova m8, [pb_shuf_unpack_odd]
466 %define unpack_odd [pb_shuf_unpack_odd]
; Load 4 samples per channel with 64-bit loads.
; NOTE(review): the [srcq+srcNq] operands imply src1q-src5q are offsets
; relative to srcq by this point -- confirm.
470 movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x
471 movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
472 movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
473 movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
474 movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
475 movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
; Interleave channel pairs (0,1), (2,3) and (4,5) word by word.
477 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
478 punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
479 punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
; Regroup dwords so that each register holds two runs of adjacent output.
481 shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
482 shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
483 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
; pshufb variant: the masks move each sample into the upper word of a zeroed
; dword, producing six registers of four 32-bit values.
485 pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15
486 pshufb m0, unpack_even ; m0 = 0, 1, 2, 3
487 pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19
488 pshufb m1, unpack_even ; m1 = 4, 5, 6, 7
489 pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23
490 pshufb m2, unpack_even ; m2 = 8, 9, 10, 11
; Variant without pshufb: order the dwords, then interleave with a zeroed m6 so
; that each sample again ends up in the upper word of a dword.
493 pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
494 pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
495 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
496 pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5
497 punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3
498 punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15
499 punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7
500 punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19
501 punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11
502 punpckhwd m6, m2 ; m6 = 20, 21, 22, 23
503 SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
505 cvtdq2ps m0, m0 ; convert s32 to float
511 mulps m0, m7 ; scale float from s32 range to [-1.0,1.0]
; Store the result vectors at consecutive mmsize offsets.
518 mova [dstq+ mmsize], m1
519 mova [dstq+2*mmsize], m2
520 mova [dstq+3*mmsize], m3
521 mova [dstq+4*mmsize], m4
522 mova [dstq+5*mmsize], m5
537 ;------------------------------------------------------------------------------
538 ; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
540 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_fltp_to_s16_2ch: two planar float
; channels are combined into one interleaved int16 stream.
542 %macro CONV_FLTP_TO_S16_2CH 0
543 cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
; src0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
545 mov src1q, [src0q+gprsize]
; m2 = 2^15 in every lane (bit pattern 0x47000000).
551 mova m2, [pf_s16_scale]
; m3 = word interleave mask for pshufb.
553 mova m3, [pb_interleave_words]
; Scale one vector of each channel by 2^15.
556 mulps m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
557 mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
; NOTE(review): the two sequences below are alternatives that produce the same
; interleaved result (pack + pshufb, or pack each + unpack); presumably the
; choice depends on pshufb availability -- confirm.
561 packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
562 pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
564 packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x
565 packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x
566 punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
579 ;------------------------------------------------------------------------------
580 ; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
582 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_fltp_to_s16_6ch: six planar float
; channels are combined into one interleaved int16 stream. The numbers in the
; register comments are positions in the interleaved output.
584 %macro CONV_FLTP_TO_S16_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
586 cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
588 cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
589 %define lend dword r2m
; srcq points at the array of channel pointers here; fetch channels 1-5.
591 mov src1q, [srcq+1*gprsize]
592 mov src2q, [srcq+2*gprsize]
593 mov src3q, [srcq+3*gprsize]
594 mov src4q, [srcq+4*gprsize]
595 mov src5q, [srcq+5*gprsize]
; Scale factor 2^15, loaded into xmm6 by explicit name.
; NOTE(review): presumably written as xmm6 rather than m6 because mN maps to
; MMX registers in one build of this macro (see the mm0-mm3 sequence below) -- confirm.
602 movaps xmm6, [pf_s16_scale]
; Full-register variant: scale one vector of each channel by 2^15.
; NOTE(review): the [srcq+srcNq] operands imply src1q-src5q are offsets
; relative to srcq by this point -- confirm.
605 mulps m0, m6, [srcq ]
606 mulps m1, m6, [srcq+src1q]
607 mulps m2, m6, [srcq+src2q]
608 mulps m3, m6, [srcq+src3q]
609 mulps m4, m6, [srcq+src4q]
610 mulps m5, m6, [srcq+src5q]
; Pack channel pairs (0,3), (1,4) and (2,5) into int16 with signed saturation.
617 packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
618 packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
619 packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
; Interleave words of neighbouring channels.
621 movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x
622 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
623 punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
624 punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
; Regroup dwords into runs of adjacent output positions.
626 shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
627 shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
628 shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
630 shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
631 shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
632 shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
; Store in output order: m3, m1, m0.
633 mova [dstq+0*mmsize], m3
634 mova [dstq+1*mmsize], m1
635 mova [dstq+2*mmsize], m0
; Variant with XMM loads and MMX packing: 2 samples per channel.
638 movlps xmm1, [srcq+src1q]
639 movlps xmm2, [srcq+src2q]
640 movlps xmm3, [srcq+src3q]
641 movlps xmm4, [srcq+src4q]
642 movlps xmm5, [srcq+src5q]
655 packssdw mm0, mm3 ; m0 = 0, 6, 3, 9
656 packssdw mm1, mm4 ; m1 = 1, 7, 4, 10
657 packssdw mm2, mm5 ; m2 = 2, 8, 5, 11
659 pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6
660 punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7
661 punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11
662 punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9
664 pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1
665 punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final)
666 punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final)
667 punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final)
668 mova [dstq+0*mmsize], mm0
669 mova [dstq+1*mmsize], mm1
670 mova [dstq+2*mmsize], mm2
691 ;------------------------------------------------------------------------------
692 ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
694 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_fltp_to_flt_2ch: two planar float
; channels are combined into one interleaved float stream.
696 %macro CONV_FLTP_TO_FLT_2CH 0
697 cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
; src0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
698 mov src1q, [src0q+gprsize]
; The interleaved output is twice the size of one channel, so dst is advanced
; by 2*lenq.
703 lea dstq, [dstq+2*lenq]
; Load two vectors from each channel.
706 mova m0, [src0q+lenq ]
707 mova m1, [src1q+lenq ]
708 mova m2, [src0q+lenq+mmsize]
709 mova m3, [src1q+lenq+mmsize]
; Four output vectors for the two-plus-two input vectors.
; NOTE(review): m0-m3 are presumably interleaved between the loads and these
; stores -- confirm.
712 mova [dstq+2*lenq+0*mmsize], m0
713 mova [dstq+2*lenq+1*mmsize], m1
714 mova [dstq+2*lenq+2*mmsize], m2
715 mova [dstq+2*lenq+3*mmsize], m3
726 ;-----------------------------------------------------------------------------
727 ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
729 ;-----------------------------------------------------------------------------
; Parameterless macro that emits conv_fltp_to_flt_6ch: six planar float
; channels are combined into one interleaved float stream.
731 %macro CONV_FLTP_TO_FLT_6CH 0
; Only dst and src are loaded as arguments (2 args, 8 GPRs); len is declared as
; the last named register.
732 cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
; Alternative: lend refers to the stack slot of the third argument.
; NOTE(review): presumably used where an eighth register is unavailable -- confirm.
736 %define lend dword r2m
; srcq points at the array of channel pointers here; fetch channels 1-5.
738 mov src1q, [srcq+1*gprsize]
739 mov src2q, [srcq+2*gprsize]
740 mov src3q, [srcq+3*gprsize]
741 mov src4q, [srcq+4*gprsize]
742 mov src5q, [srcq+5*gprsize]
; Load one vector from each of channels 1-5.
; NOTE(review): the [srcq+srcNq] operands imply src1q-src5q are offsets
; relative to srcq by this point -- confirm.
751 mova m1, [srcq+src1q]
752 mova m2, [srcq+src2q]
753 mova m3, [srcq+src3q]
754 mova m4, [srcq+src4q]
755 mova m5, [srcq+src5q]
; blendps with mask 1100b: lanes 0-1 come from the first source, lanes 2-3
; from the second source.
761 blendps m6, m4, m0, 1100b
764 blendps m2, m5, m1, 1100b
; Alternative interleave built from dword butterflies; m6 is the scratch
; register.
775 SBUTTERFLY dq, 0, 1, 6
776 SBUTTERFLY dq, 2, 3, 6
777 SBUTTERFLY dq, 4, 5, 6
805 ;------------------------------------------------------------------------------
806 ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
808 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16_to_s16p_2ch: one interleaved int16
; stream is split into two planar int16 channels.
810 %macro CONV_S16_TO_S16P_2CH 0
811 cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
; dst0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
813 mov dst1q, [dst0q+gprsize]
; The interleaved input is twice the size of one channel, so src is advanced
; by 2*lenq.
815 lea srcq, [srcq+2*lenq]
; m3 = word deinterleave mask for pshufb.
820 mova m3, [pb_deinterleave_words]
823 mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
824 mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
; pshufb variant: split even and odd words inside each register, then swap
; 64-bit halves between the registers.
826 pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
827 pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
828 SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
829 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
; Variant without pshufb: reorder words within each 64-bit half, then
; deinterleave dwords with DEINT2_PS.
; NOTE(review): presumably the two variants are selected by pshufb
; availability -- confirm.
831 pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
832 pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
833 pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
834 pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
835 DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
836 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
; Even samples go to channel 0, odd samples to channel 1.
838 mova [dst0q+lenq], m0
839 mova [dst1q+lenq], m1
852 ;------------------------------------------------------------------------------
853 ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
855 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16_to_s16p_6ch: one interleaved int16
; stream is split into six planar int16 channels. The numbers in the register
; comments are positions in the interleaved input.
857 %macro CONV_S16_TO_S16P_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
859 cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
861 cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
862 %define lend dword r2m
; dstq points at the array of channel pointers here; fetch channels 1-5.
864 mov dst1q, [dstq+ gprsize]
865 mov dst2q, [dstq+2*gprsize]
866 mov dst3q, [dstq+3*gprsize]
867 mov dst4q, [dstq+4*gprsize]
868 mov dst5q, [dstq+5*gprsize]
; Load 24 interleaved samples (4 per channel).
876 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
877 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
878 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
; Build registers that start 6 samples apart so that word interleaving pairs
; samples of the same channel.
879 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
880 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
881 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
882 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
883 ; m1 = 4, 10, 5, 11, x, x, x, x
884 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
885 ; m2 = 16, 22, 17, 23, x, x, x, x
886 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
887 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
888 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
; Each 64-bit half now holds 4 samples of one channel: low halves are the even
; channels, high halves the odd channels.
; NOTE(review): the [dstq+dstNq] operands imply dst1q-dst5q are offsets
; relative to dstq by this point -- confirm.
890 movhps [dstq+dst1q], m0
891 movq [dstq+dst2q], m3
892 movhps [dstq+dst3q], m3
893 movq [dstq+dst4q], m1
894 movhps [dstq+dst5q], m1
909 ;------------------------------------------------------------------------------
910 ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
912 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16_to_fltp_2ch: one interleaved int16
; stream is split into two planar float channels.
914 %macro CONV_S16_TO_FLTP_2CH 0
915 cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
; dst0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
917 mov dst1q, [dst0q+gprsize]
; m3 = 2^-31 in every lane (bit pattern 0x30000000).
923 mova m3, [pf_s32_inv_scale]
; m4 = word mask with even words zero and odd words all ones.
; NOTE(review): presumably used to isolate the odd-indexed (second channel)
; samples in the upper word of each dword -- confirm.
924 mova m4, [pw_zero_even]
; m0 goes to channel 0 and m1 to channel 1.
933 mova [dst0q+lenq], m0
934 mova [dst1q+lenq], m1
945 ;------------------------------------------------------------------------------
946 ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
948 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_s16_to_fltp_6ch: one interleaved int16
; stream is split into six planar float channels. The numbers in the register
; comments are positions in the interleaved input.
950 %macro CONV_S16_TO_FLTP_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
952 cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
954 cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
955 %define lend dword r2m
; dstq points at the array of channel pointers here; fetch channels 1-5.
957 mov dst1q, [dstq+ gprsize]
958 mov dst2q, [dstq+2*gprsize]
959 mov dst3q, [dstq+3*gprsize]
960 mov dst4q, [dstq+4*gprsize]
961 mov dst5q, [dstq+5*gprsize]
; m6 = 2^-15 in every lane (bit pattern 0x38000000).
968 mova m6, [pf_s16_inv_scale]
; Load 24 interleaved samples (4 per channel).
970 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
971 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
972 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
; Build registers that start 6 samples apart so that word interleaving pairs
; samples of the same channel.
973 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
974 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
975 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
976 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
977 ; m1 = 4, 10, 5, 11, x, x, x, x
978 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
979 ; m2 = 16, 22, 17, 23, x, x, x, x
980 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
981 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
982 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
; Widen each register of 8 int16 samples into 32-bit values.
; NOTE(review): S16_TO_S32_SX presumably sign-extends, leaving the low four
; samples in the first register and the high four in the second -- confirm.
983 S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
985 S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
987 S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
; One float vector per channel.
; NOTE(review): the [dstq+dstNq] operands imply dst1q-dst5q are offsets
; relative to dstq by this point -- confirm.
1003 mova [dstq+dst1q], m1
1004 mova [dstq+dst2q], m2
1005 mova [dstq+dst3q], m3
1006 mova [dstq+dst4q], m4
1007 mova [dstq+dst5q], m5
; The macro is instantiated four times.
; NOTE(review): presumably each instantiation targets a different
; instruction-set level -- confirm.
1016 CONV_S16_TO_FLTP_6CH
1018 CONV_S16_TO_FLTP_6CH
1020 CONV_S16_TO_FLTP_6CH
1022 CONV_S16_TO_FLTP_6CH
1024 ;------------------------------------------------------------------------------
1025 ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
1027 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_flt_to_s16p_2ch: one interleaved float
; stream is split into two planar int16 channels.
1029 %macro CONV_FLT_TO_S16P_2CH 0
1030 cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
; dst0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
1032 mov dst1q, [dst0q+gprsize]
; Interleaved float input is four times the size of one int16 channel.
1034 lea srcq, [srcq+4*lenq]
; m5 = 2^15 in every lane (bit pattern 0x47000000).
1038 mova m5, [pf_s16_scale]
; Load four consecutive input vectors.
1040 mova m0, [srcq+4*lenq ]
1041 mova m1, [srcq+4*lenq+ mmsize]
1042 mova m2, [srcq+4*lenq+2*mmsize]
1043 mova m3, [srcq+4*lenq+3*mmsize]
; Four float vectors yield one int16 vector per channel.
1056 mova [dst0q+lenq], m0
1057 mova [dst1q+lenq], m1
; The macro is instantiated twice.
; NOTE(review): presumably each instantiation targets a different
; instruction-set level -- confirm.
1064 CONV_FLT_TO_S16P_2CH
1066 CONV_FLT_TO_S16P_2CH
1068 ;------------------------------------------------------------------------------
1069 ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
1071 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_flt_to_s16p_6ch: one interleaved float
; stream is split into six planar int16 channels. The numbers in the register
; comments are positions in the interleaved input.
1073 %macro CONV_FLT_TO_S16P_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
1075 cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
1077 cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
1078 %define lend dword r2m
; dstq points at the array of channel pointers here; fetch channels 1-5.
1080 mov dst1q, [dstq+ gprsize]
1081 mov dst2q, [dstq+2*gprsize]
1082 mov dst3q, [dstq+3*gprsize]
1083 mov dst4q, [dstq+4*gprsize]
1084 mov dst5q, [dstq+5*gprsize]
; m6 = 2^15 in every lane (bit pattern 0x47000000).
1091 mova m6, [pf_s16_scale]
; Scale six consecutive input vectors; the register order (0, 3, 1, 4, 2, 5)
; sets up the pairwise packs below.
1093 mulps m0, m6, [srcq+0*mmsize]
1094 mulps m3, m6, [srcq+1*mmsize]
1095 mulps m1, m6, [srcq+2*mmsize]
1096 mulps m4, m6, [srcq+3*mmsize]
1097 mulps m2, m6, [srcq+4*mmsize]
1098 mulps m5, m6, [srcq+5*mmsize]
; Pack pairs of 32-bit vectors into int16 with signed saturation.
1105 packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
1106 packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
1107 packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
; Build registers that start 6 samples apart so that word interleaving pairs
; samples of the same channel.
1108 PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
1109 shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
1110 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
1111 SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
1112 ; m3 = 4, 10, 5, 11, x, x, x, x
1113 SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
1114 ; m2 = 16, 22, 17, 23, x, x, x, x
1115 SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
1116 ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
1117 punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
; Each 64-bit half now holds 4 samples of one channel: low halves are the even
; channels, high halves the odd channels.
; NOTE(review): the [dstq+dstNq] operands imply dst1q-dst5q are offsets
; relative to dstq by this point -- confirm.
1119 movhps [dstq+dst1q], m0
1120 movq [dstq+dst2q], m1
1121 movhps [dstq+dst3q], m1
1122 movq [dstq+dst4q], m3
1123 movhps [dstq+dst5q], m3
; The macro is instantiated three times.
; NOTE(review): presumably each instantiation targets a different
; instruction-set level -- confirm.
1132 CONV_FLT_TO_S16P_6CH
1134 CONV_FLT_TO_S16P_6CH
1136 CONV_FLT_TO_S16P_6CH
1138 ;------------------------------------------------------------------------------
1139 ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
1141 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_flt_to_fltp_2ch: one interleaved float
; stream is split into two planar float channels.
1143 %macro CONV_FLT_TO_FLTP_2CH 0
1144 cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
; dst0q still points at the array of channel pointers here; fetch the pointer
; of the second channel.
1146 mov dst1q, [dst0q+gprsize]
; The interleaved input is twice the size of one channel, so src is advanced
; by 2*lenq.
1148 lea srcq, [srcq+2*lenq]
; Load two consecutive interleaved vectors.
1153 mova m0, [srcq+2*lenq ]
1154 mova m1, [srcq+2*lenq+mmsize]
; m0 goes to channel 0 and m1 to channel 1.
; NOTE(review): m0/m1 are presumably deinterleaved between the loads and these
; stores -- confirm.
1156 mova [dst0q+lenq], m0
1157 mova [dst1q+lenq], m1
; The macro is instantiated twice.
; NOTE(review): presumably each instantiation targets a different
; instruction-set level -- confirm.
1164 CONV_FLT_TO_FLTP_2CH
1166 CONV_FLT_TO_FLTP_2CH
1168 ;------------------------------------------------------------------------------
1169 ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
1171 ;------------------------------------------------------------------------------
; Parameterless macro that emits conv_flt_to_fltp_6ch: one interleaved float
; stream is split into six planar float channels. The numbers in the register
; comments are positions in the interleaved input.
1173 %macro CONV_FLT_TO_FLTP_6CH 0
; Two alternative declarations: len kept in a register (3 args, 8 GPRs), or
; read from its stack slot through lend (2 args, 7 GPRs).
; NOTE(review): presumably selected by an x86-64 / x86-32 conditional -- confirm.
1175 cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
1177 cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
1178 %define lend dword r2m
; dstq points at the array of channel pointers here; fetch channels 1-5.
1180 mov dst1q, [dstq+ gprsize]
1181 mov dst2q, [dstq+2*gprsize]
1182 mov dst3q, [dstq+3*gprsize]
1183 mov dst4q, [dstq+4*gprsize]
1184 mov dst5q, [dstq+5*gprsize]
; Load 24 interleaved samples (4 per channel).
1192 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
1193 mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
1194 mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
1195 mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
1196 mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
1197 mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
; First butterfly stage pairs registers that are 12 samples apart; m6 is the
; scratch register.
1199 SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
; m3 = 2, 14, 3, 15
1201 SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
; m4 = 6, 18, 7, 19
1203 SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
1204 ; m5 = 10, 22, 11, 23
; Second butterfly stage leaves one channel per register.
1205 SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
; m4 = 1, 7, 13, 19
1207 SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
; m2 = 3, 9, 15, 21
1209 SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
1210 ; m5 = 5, 11, 17, 23
; Channels 1-5 are held in m4, m3, m2, m1 and m5 respectively.
; NOTE(review): the [dstq+dstNq] operands imply dst1q-dst5q are offsets
; relative to dstq by this point -- confirm.
1212 mova [dstq+dst1q], m4
1213 mova [dstq+dst2q], m3
1214 mova [dstq+dst3q], m2
1215 mova [dstq+dst4q], m1
1216 mova [dstq+dst5q], m5
; The macro is instantiated twice.
; NOTE(review): presumably each instantiation targets a different
; instruction-set level -- confirm.
1225 CONV_FLT_TO_FLTP_6CH
1227 CONV_FLT_TO_FLTP_6CH