1 ;******************************************************************************
2 ;* x86 optimized Format Conversion Utils
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; Scaling constants (raw IEEE-754 single-precision bit patterns) and shuffle
; masks shared by the conversion loops below.
pf_s32_inv_scale: times 8 dd 0x30000000     ; 2^-31: s32 range  -> [-1.0,1.0]
pf_s32_scale: times 8 dd 0x4f000000         ; 2^31:  [-1.0,1.0] -> s32 range
pf_s32_clip: times 8 dd 0x4effffff          ; largest float < 2^31; clip before
                                            ; float->s32 conversion to avoid overflow
pf_s16_inv_scale: times 4 dd 0x38000000     ; 2^-15: s16 range  -> [-1.0,1.0]
pf_s16_scale: times 4 dd 0x47000000         ; 2^15:  [-1.0,1.0] -> s16 range
; pshufb masks: a -1 byte yields zero in the result; byte pairs select words.
pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7     ; a0 b0 a1 b1 ...
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7   ; evens then odds
pw_zero_even: times 4 dw 0x0000, 0xffff     ; AND mask: zero even-indexed words,
                                            ; keep odd-indexed words
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
; Expand interleaved s16 samples to s32, two vectors per iteration.
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    ; NOTE(review): dst is advanced past the end here and lenq then serves as a
    ; negative, upward-counting byte offset (the neg/loop lines are not visible
    ; in this excerpt) -- confirm against the complete function.
    lea dstq, [dstq+2*lenq]
    mova [dstq+2*lenq ], m0             ; store two converted output vectors
    mova [dstq+2*lenq+mmsize], m1
;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
; Interleaved s16 -> float in [-1.0,1.0]; instantiated once per ISA variant.
%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    ; NOTE(review): lenq appears to be pre-scaled and negated outside this
    ; excerpt so [dstq+2*lenq] below indexes back into the buffer -- confirm.
    lea dstq, [dstq + 2*lenq]
    mova m2, [pf_s16_inv_scale]         ; 2^-15: rescale s16 range to [-1.0,1.0]
    mova [dstq+2*lenq ], m0
    mova [dstq+2*lenq+mmsize], m1
;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
; Narrow interleaved s32 samples to s16; 4 input vectors -> 2 output vectors
; per iteration.
%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea srcq, [srcq+2*lenq]             ; src end; lenq used as negative offset
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+ mmsize]
    mova m2, [srcq+2*lenq+2*mmsize]
    mova m3, [srcq+2*lenq+3*mmsize]
    ; store s16 results; m0/m2 presumably hold the pairwise-packed words
    ; (the shift/pack instructions sit between the loads and these stores)
    mova [dstq+lenq ], m0
    mova [dstq+lenq+mmsize], m2
;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
; Interleaved s32 -> float in [-1.0,1.0].
%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    mova m0, [pf_s32_inv_scale]         ; 2^-31: rescale s32 range to [-1.0,1.0]
    cvtdq2ps m1, [srcq+lenq ]           ; int32 -> float straight from memory
    cvtdq2ps m2, [srcq+lenq+mmsize]
    mova [dstq+lenq ], m1
    mova [dstq+lenq+mmsize], m2
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
; Interleaved float -> s16; 4 float vectors in, 2 packed s16 vectors out.
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea srcq, [srcq+2*lenq]             ; src end; lenq used as negative offset
    mova m4, [pf_s16_scale]             ; 2^15: scale [-1.0,1.0] to s16 range
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+1*mmsize]
    mova m2, [srcq+2*lenq+2*mmsize]
    mova m3, [srcq+2*lenq+3*mmsize]
    ; store s16 results; m0/m2 presumably hold the packssdw outputs (the
    ; scale/convert/pack instructions sit between the loads and these stores)
    mova [dstq+lenq ], m0
    mova [dstq+lenq+mmsize], m2
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
; Interleaved float -> s32: scale by 2^31, clip just below 2^31 (cvt of exactly
; 2^31 would overflow a signed 32-bit int), then convert.
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    mova m4, [pf_s32_scale]             ; 2^31
    mova m5, [pf_s32_clip]              ; largest float < 2^31 (clip bound)
    mulps m0, m4, [srcq+lenq ]
    mulps m1, m4, [srcq+lenq+1*mmsize]
    mulps m2, m4, [srcq+lenq+2*mmsize]
    mulps m3, m4, [srcq+lenq+3*mmsize]
    ; (clip + cvtps2dq of m0..m3 precede these stores)
    mova [dstq+lenq ], m0
    mova [dstq+lenq+1*mmsize], m1
    mova [dstq+lenq+2*mmsize], m2
    mova [dstq+lenq+3*mmsize], m3
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 2 planar s16 channel buffers into one packed stream.
%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov src1q, [src0q+gprsize]          ; src1 = src[1] (second channel plane)
    lea dstq, [dstq+2*lenq]             ; output advances 2x the per-plane rate
    mova m0, [src0q+lenq ]
    mova m1, [src1q+lenq ]
    mova m2, [src0q+lenq+mmsize]
    mova m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 = ch0/ch1 words interleaved (low)
                                        ; m1 = ch0/ch1 words interleaved (high)
    SBUTTERFLY2 wd, 2, 3, 4             ; same for the second vector pair
    mova [dstq+2*lenq+0*mmsize], m0
    mova [dstq+2*lenq+1*mmsize], m1
    mova [dstq+2*lenq+2*mmsize], m2
    mova [dstq+2*lenq+3*mmsize], m3
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
; instead of just a counter, which would avoid incrementing the
; pointers, but the extra complexity and amount of code is not worth
; the small gain. On x86-32 there are not enough registers to use len
; as an index without keeping two of the pointers on the stack and
; loading them in each iteration.
;------------------------------------------------------------------------------
; Interleave 6 planar s16 channels into one packed stream. Samples are
; numbered by their position in the packed output (0..23 per iteration).
%macro CONV_S16P_TO_S16_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
    ; src1q..src5q become byte offsets relative to src0q (sub lines elided)
    mov src1q, [src0q+1*gprsize]
    mov src2q, [src0q+2*gprsize]
    mov src3q, [src0q+3*gprsize]
    mov src4q, [src0q+4*gprsize]
    mov src5q, [src0q+5*gprsize]
%if cpuflag(sse2slow)
    ; half-width path: 4 samples per channel via 64-bit loads/stores
    movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x
    movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
    movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
    movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
    movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
    movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
    punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    movq [dstq+0*mmsize/2], m1
    movq [dstq+1*mmsize/2], m0
    movq [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    ; full-width path: 8 samples per channel (the %else is elided above)
    mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
    mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
    mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
    mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
    mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
    mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
    SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
                            ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
                            ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
                            ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    punpcklqdq m6, m4, m0 ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
    punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
    SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
    punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova [dstq+0*mmsize], m4
    mova [dstq+1*mmsize], m2
    mova [dstq+2*mmsize], m0
    mova [dstq+3*mmsize], m5
    mova [dstq+4*mmsize], m3
    mova [dstq+5*mmsize], m1
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 2 planar s16 channels and convert to packed float in [-1.0,1.0].
%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    mov src1q, [src0q+gprsize]          ; src1 = src[1] (second channel plane)
    lea dstq, [dstq+4*lenq]             ; float output: 4 bytes per sample
    mova m5, [pf_s32_inv_scale]         ; 2^-31 (samples end up in high words,
                                        ; i.e. value*2^16, so net scale = 2^-15)
    mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
    mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
                            ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
    ; NOTE(review): m3 must be zero for the unpacks below to place each sample
    ; in a dword's high word -- the pxor is not visible in this excerpt.
    punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3
    punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7
    punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11
    punpckhwd m3, m4 ; m3 = 12, 13, 14, 15
    ; (cvtdq2ps + mulps by m5 precede the stores)
    mova [dstq+4*lenq ], m0
    mova [dstq+4*lenq+ mmsize], m1
    mova [dstq+4*lenq+2*mmsize], m2
    mova [dstq+4*lenq+3*mmsize], m3
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 6 planar s16 channels and convert to packed float in [-1.0,1.0].
%macro CONV_S16P_TO_FLT_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
    ; src1q..src5q become byte offsets relative to srcq (sub lines elided)
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mova m7, [pf_s32_inv_scale]         ; 2^-31 (samples sit in high words)
; unpack_odd is a register on targets with enough xmm registers, otherwise a
; memory operand (the %if lines selecting between these are elided here)
%define unpack_even m6
    mova m6, [pb_shuf_unpack_even]
%define unpack_odd m8
    mova m8, [pb_shuf_unpack_odd]
%define unpack_odd [pb_shuf_unpack_odd]
    movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x
    movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
    movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
    movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
    movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
    movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
    punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
    shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    ; ssse3 path: pshufb expands s16 words into dword high halves directly
    pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15
    pshufb m0, unpack_even ; m0 = 0, 1, 2, 3
    pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19
    pshufb m1, unpack_even ; m1 = 4, 5, 6, 7
    pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23
    pshufb m2, unpack_even ; m2 = 8, 9, 10, 11
    ; pre-ssse3 path (the %else is elided above): pshufd + zero-unpack
    pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3
    punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15
    punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7
    punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19
    punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11
    punpckhwd m6, m2 ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
    cvtdq2ps m0, m0 ; convert s32 to float
    mulps m0, m7 ; scale float from s32 range to [-1.0,1.0]
    ; (cvtdq2ps/mulps for m1..m5 and the [dstq] store of m0 are elided)
    mova [dstq+ mmsize], m1
    mova [dstq+2*mmsize], m2
    mova [dstq+3*mmsize], m3
    mova [dstq+4*mmsize], m4
    mova [dstq+5*mmsize], m5
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 2 planar float channels into packed s16.
%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    mov src1q, [src0q+gprsize]          ; src1 = src[1] (second channel plane)
    mova m2, [pf_s16_scale]             ; 2^15: scale [-1.0,1.0] to s16 range
    mova m3, [pb_interleave_words]      ; ssse3-only interleave mask
    mulps m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
    mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
    ; ssse3 path (cvtps2dq lines and the %if cpuflag(ssse3) are elided):
    packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    ; pre-ssse3 path (%else elided): pack each half, then word-interleave
    packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 6 planar float channels into packed s16.
%macro CONV_FLTP_TO_S16_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
    ; src1q..src5q become byte offsets relative to srcq (sub lines elided)
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    movaps xmm6, [pf_s16_scale]         ; 2^15: scale [-1.0,1.0] to s16 range
    ; sse2 path: 4 samples per channel per iteration (24 output words)
    mulps m0, m6, [srcq ]
    mulps m1, m6, [srcq+src1q]
    mulps m2, m6, [srcq+src2q]
    mulps m3, m6, [srcq+src3q]
    mulps m4, m6, [srcq+src4q]
    mulps m5, m6, [srcq+src5q]
    ; (cvtps2dq of m0..m5 precedes the packs)
    packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
    packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
    packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
    movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x
    punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
    punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
    shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova [dstq+0*mmsize], m3
    mova [dstq+1*mmsize], m1
    mova [dstq+2*mmsize], m0
    ; sse/x86-32 path using mm registers (the movlps xmm0 load, the mulps and
    ; the cvtps2pi lines are elided in this excerpt)
    movlps xmm1, [srcq+src1q]
    movlps xmm2, [srcq+src2q]
    movlps xmm3, [srcq+src3q]
    movlps xmm4, [srcq+src4q]
    movlps xmm5, [srcq+src5q]
    packssdw mm0, mm3 ; m0 = 0, 6, 3, 9
    packssdw mm1, mm4 ; m1 = 1, 7, 4, 10
    packssdw mm2, mm5 ; m2 = 2, 8, 5, 11
    pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6
    punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7
    punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11
    punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9
    pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1
    punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final)
    punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final)
    punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final)
    mova [dstq+0*mmsize], mm0
    mova [dstq+1*mmsize], mm1
    mova [dstq+2*mmsize], mm2
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;------------------------------------------------------------------------------
; Interleave 2 planar float channels into one packed stream.
%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov src1q, [src0q+gprsize]          ; src1 = src[1] (second channel plane)
    lea dstq, [dstq+2*lenq]             ; output advances 2x the per-plane rate
    mova m0, [src0q+lenq ]
    mova m1, [src1q+lenq ]
    mova m2, [src0q+lenq+mmsize]
    mova m3, [src1q+lenq+mmsize]
    ; (dword-interleave of m0/m1 and m2/m3 precedes the stores)
    mova [dstq+2*lenq+0*mmsize], m0
    mova [dstq+2*lenq+1*mmsize], m1
    mova [dstq+2*lenq+2*mmsize], m2
    mova [dstq+2*lenq+3*mmsize], m3
%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;-----------------------------------------------------------------------------
; Interleave 6 planar float channels into one packed stream.
%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%define lend dword r2m
    ; src1q..src5q become byte offsets relative to srcq (sub lines elided)
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    ; (the [srcq] load of m0 is elided in this excerpt)
    mova m1, [srcq+src1q]
    mova m2, [srcq+src2q]
    mova m3, [srcq+src3q]
    mova m4, [srcq+src4q]
    mova m5, [srcq+src5q]
    ; sse4/avx path: merge channel vectors with blends (surrounding shuffle
    ; lines are elided, so no per-register contents are asserted here)
    blendps m6, m4, m0, 1100b
    blendps m2, m5, m1, 1100b
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 2-channel s16 into two planar buffers.
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    mov dst1q, [dst0q+gprsize]          ; dst1 = dst[1] (second channel plane)
    lea srcq, [srcq+2*lenq]             ; input advances 2x the per-plane rate
    mova m3, [pb_deinterleave_words]    ; ssse3-only de-interleave mask
    mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    ; ssse3 path (%if cpuflag(ssse3) elided):
    pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                             ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
    ; pre-ssse3 path (%else elided): shuffle words then split dwords
    pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
    pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
    pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
    pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                      ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 6-channel s16 into six planar buffers.
%macro CONV_S16_TO_S16P_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
    ; NOTE(review): dst1q..dst5q are later used as byte offsets from dstq
    ; ([dstq+dstNq] below), so the mov dstq,[dstq] / sub lines must sit
    ; between here and the loop -- they are elided in this excerpt.
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                            ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                            ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                            ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    ; (the movq [dstq], m0 store of channel 0 is elided in this excerpt)
    movhps [dstq+dst1q], m0
    movq [dstq+dst2q], m3
    movhps [dstq+dst3q], m3
    movq [dstq+dst4q], m1
    movhps [dstq+dst5q], m1
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 2-channel s16 into two planar float buffers in
; [-1.0,1.0].
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    mov dst1q, [dst0q+gprsize]          ; dst1 = dst[1] (second channel plane)
    mova m3, [pf_s32_inv_scale]         ; 2^-31 (samples placed in high words)
    mova m4, [pw_zero_even]             ; AND mask: zero even words, keep odd
    ; (load / mask-split / cvtdq2ps / mulps lines are elided in this excerpt)
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 6-channel s16 into six planar float buffers in
; [-1.0,1.0].
%macro CONV_S16_TO_FLTP_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
    ; NOTE(review): dst1q..dst5q are later used as byte offsets from dstq;
    ; the mov dstq,[dstq] / sub lines are elided in this excerpt.
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mova m6, [pf_s16_inv_scale]         ; 2^-15: rescale s16 to [-1.0,1.0]
    mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                            ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                            ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                            ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
                       ; m2 = 1, 7, 13, 19
    S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
                       ; m4 = 3, 9, 15, 21
    S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
                       ; m5 = 5, 11, 17, 23
    ; (register renumbering, cvtdq2ps, mulps by m6, and the [dstq] store of
    ; channel 0 are elided in this excerpt)
    mova [dstq+dst1q], m1
    mova [dstq+dst2q], m2
    mova [dstq+dst3q], m3
    mova [dstq+dst4q], m4
    mova [dstq+dst5q], m5
; per-ISA instantiations (the INIT_XMM/INIT_YMM lines between them are elided)
CONV_S16_TO_FLTP_6CH
CONV_S16_TO_FLTP_6CH
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
CONV_S16_TO_FLTP_6CH
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 2-channel float into two planar s16 buffers.
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    mov dst1q, [dst0q+gprsize]          ; dst1 = dst[1] (second channel plane)
    lea srcq, [srcq+4*lenq]             ; float input: 4 bytes per sample
    mova m5, [pf_s16_scale]             ; 2^15: scale [-1.0,1.0] to s16 range
    mova m0, [srcq+4*lenq ]
    mova m1, [srcq+4*lenq+ mmsize]
    mova m2, [srcq+4*lenq+2*mmsize]
    mova m3, [srcq+4*lenq+3*mmsize]
    ; (scale/convert/pack/de-interleave lines are elided in this excerpt)
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
; per-ISA instantiations (INIT_XMM lines between them are elided)
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
CONV_FLT_TO_S16P_2CH
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 6-channel float into six planar s16 buffers.
%macro CONV_FLT_TO_S16P_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
    ; NOTE(review): dst1q..dst5q are later used as byte offsets from dstq;
    ; the mov dstq,[dstq] / sub lines are elided in this excerpt.
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mova m6, [pf_s16_scale]             ; 2^15: scale [-1.0,1.0] to s16 range
    mulps m0, m6, [srcq+0*mmsize]
    mulps m3, m6, [srcq+1*mmsize]
    mulps m1, m6, [srcq+2*mmsize]
    mulps m4, m6, [srcq+3*mmsize]
    mulps m2, m6, [srcq+4*mmsize]
    mulps m5, m6, [srcq+5*mmsize]
    ; (cvtps2dq of m0..m5 precedes the packs)
    packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
    shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                            ; m3 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                            ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                            ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
    ; (the movq [dstq], m0 store of channel 0 is elided in this excerpt)
    movhps [dstq+dst1q], m0
    movq [dstq+dst2q], m1
    movhps [dstq+dst3q], m1
    movq [dstq+dst4q], m3
    movhps [dstq+dst5q], m3
; per-ISA instantiations (INIT_XMM lines between them are elided)
CONV_FLT_TO_S16P_6CH
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
CONV_FLT_TO_S16P_6CH
;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 2-channel float into two planar buffers.
%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    mov dst1q, [dst0q+gprsize]          ; dst1 = dst[1] (second channel plane)
    lea srcq, [srcq+2*lenq]             ; input advances 2x the per-plane rate
    mova m0, [srcq+2*lenq ]
    mova m1, [srcq+2*lenq+mmsize]
    ; (de-interleave of m0/m1 precedes the stores)
    mova [dst0q+lenq], m0
    mova [dst1q+lenq], m1
; per-ISA instantiations (INIT_XMM lines between them are elided)
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
CONV_FLT_TO_FLTP_2CH
;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;------------------------------------------------------------------------------
; De-interleave packed 6-channel float into six planar buffers. Samples are
; numbered by their position in the packed input (0..23 per iteration).
%macro CONV_FLT_TO_FLTP_6CH 0
; NOTE(review): the two cglobal lines below are the x86-64 and x86-32 variants;
; the surrounding %if ARCH_X86_64 / %else is not visible in this excerpt.
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
    ; NOTE(review): dst1q..dst5q are later used as byte offsets from dstq;
    ; the mov dstq,[dstq] / sub lines are elided in this excerpt.
    mov dst1q, [dstq+ gprsize]
    mov dst2q, [dstq+2*gprsize]
    mov dst3q, [dstq+3*gprsize]
    mov dst4q, [dstq+4*gprsize]
    mov dst5q, [dstq+5*gprsize]
    mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
    mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
    mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
    mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
    mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
    mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
    SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
                            ; m3 = 2, 14, 3, 15
    SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
                            ; m4 = 6, 18, 7, 19
    SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
                            ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
                            ; m4 = 1, 7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
                            ; m2 = 3, 9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
                            ; m5 = 5, 11, 17, 23
    ; (the mova [dstq], m0 store of channel 0 is elided in this excerpt)
    mova [dstq+dst1q], m4
    mova [dstq+dst2q], m3
    mova [dstq+dst3q], m2
    mova [dstq+dst4q], m1
    mova [dstq+dst5q], m5
; per-ISA instantiations (INIT_XMM lines between them are elided)
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
CONV_FLT_TO_FLTP_6CH