1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
; ps_cos_vec: table of single-precision constants, four dwords (16 bytes) per
; row, addressed by the code below as [ps_cos_vec+byte_offset].
; Offsets referenced in this file: +0, +16, +32, +48, +64, +80, +96, +128,
; +160 and +192 (the last row, four copies of 0.707107).
; The rows at +96/+112, +128/+144 and +160/+176 are identical pairs.
; NOTE(review): presumably duplicated so that a 32-byte load yields the same
; four factors in both 128-bit lanes -- confirm against the AVX path.
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
29 dd 0.553104, 0.582935, 0.622504, 0.674808
30 dd -10.190008, -3.407609, -2.057781, -1.484165
31 dd -1.169440, -0.972568, -0.839350, -0.744536
32 dd 0.502419, 0.522499, 0.566944, 0.646822
33 dd 0.788155, 1.060678, 1.722447, 5.101149
34 dd 0.509796, 0.601345, 0.899976, 2.562916
35 dd 0.509796, 0.601345, 0.899976, 2.562916
36 dd 1.000000, 1.000000, 1.306563, 0.541196
37 dd 1.000000, 1.000000, 1.306563, 0.541196
38 dd 1.000000, 0.707107, 1.000000, -0.707107
39 dd 1.000000, 0.707107, 1.000000, -0.707107
40 dd 0.707107, 0.707107, 0.707107, 0.707107
; ps_p1p1m1m1: eight dwords in the pattern 0, 0, 0x80000000, 0x80000000,
; repeated twice. 0x80000000 is the sign bit of an IEEE-754 single.
; The code below loads it into a register and hands that register (directly,
; or after a reshuffle) to BUTTERFLY2/BUTTERFLY3 as their second argument.
; NOTE(review): the name suggests a (+1,+1,-1,-1) sign pattern applied by
; flipping sign bits; the macro body that consumes it is not visible here --
; confirm.
43 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; Butterfly helper macros. In this excerpt only the %macro header lines and a
; few isolated body lines are visible; the remaining bodies and the %endmacro
; lines are not shown, so the notes below cover the visible lines only.
; The number after each macro name is its parameter count.
45 %macro BUTTERFLY_SSE 4
52 %macro BUTTERFLY_AVX 4
58 %macro BUTTERFLY0_SSE 5
66 %macro BUTTERFLY0_SSE2 5
73 %macro BUTTERFLY0_AVX 5
; %4 = copy of %1 with its dwords permuted by the immediate %5 (both shuffle
; sources are %1, so every output dword comes from %1).
74 vshufps %4, %1, %1, %5
; With a shuffle control of 0x1b (as consumed by the vshufps above) the four
; dwords of each 128-bit lane are reversed.
; NOTE(review): the enclosing %macro line for this call is not visible.
81 BUTTERFLY0 %1, %2, %3, %4, 0x1b
; With a shuffle control of 0xb1 the dwords of each adjacent pair are swapped
; (0<->1, 2<->3).
; NOTE(review): the enclosing %macro line for this call is not visible.
85 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; Multiply by the row at ps_cos_vec+192, which holds four copies of 0.707107.
93 mulps m%2, [ps_cos_vec+192]
97 mulps m%4, [ps_cos_vec+192]
; Zero-parameter macro; its body is not visible in this excerpt.
100 %macro PASS6_AND_PERMUTE 0
; Route the generic BUTTERFLY/BUTTERFLY0 names to the AVX variants for the
; function that follows.
209 %define BUTTERFLY BUTTERFLY_AVX
210 %define BUTTERFLY0 BUTTERFLY0_AVX
; AVX implementation. Arguments are named out, in and tmp, giving the
; outq/inq register aliases used below.
; NOTE(review): 2,3,8 presumably means 2 arguments, 3 general-purpose and
; 8 vector registers per the x86inc cglobal convention -- x86inc itself is not
; visible here. Several instructions of this function are not shown in this
; excerpt; the comments describe the visible lines only.
215 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
216 cglobal dct32_float_avx, 2,3,8, out, in, tmp
; Build m5 from the last 32 input bytes: bytes 96..111 go to the upper
; 128-bit lane, bytes 112..127 to the lower lane. The 0x1b shuffle then
; reverses the dwords inside each lane, so m5 holds the dwords at byte
; offsets 124 down to 96 in descending order.
219 vinsertf128 m5, m5, [inq+96], 1
220 vinsertf128 m5, m5, [inq+112], 0
221 vshufps m5, m5, m5, 0x1b
; Butterfly with the first 32-byte row group of ps_cos_vec; m6 is passed as
; the fourth argument (the load of m4 is not visible in this excerpt).
222 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same construction for input bytes 32..63: m6 ends up with the dwords at
; byte offsets 60 down to 32 in descending order.
225 vinsertf128 m6, m6, [inq+32], 1
226 vinsertf128 m6, m6, [inq+48], 0
227 vshufps m6, m6, m6, 0x1b
228 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Both butterflies of this step use the same constants at ps_cos_vec+64.
232 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
233 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup 128-bit lanes: m3 = {high lane of m6, high lane of m4},
; m1 = {low lane of m6, low lane of m4}; then reverse the dwords in each lane
; of m3 before pairing it with m1.
237 vperm2f128 m3, m6, m4, 0x31
238 vperm2f128 m1, m6, m4, 0x20
239 vshufps m3, m3, m3, 0x1b
241 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; Same lane regrouping for m5/m2: m4 takes the two low lanes. m5 is
; overwritten only by the second vperm2f128, after it has been read for m4.
244 vperm2f128 m4, m5, m2, 0x20
245 vperm2f128 m5, m5, m2, 0x31
246 vshufps m5, m5, m5, 0x1b
248 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; m6 = sign-bit mask (0, 0, sign, sign per lane), m2 = constants at +128;
; both are shared by the four BUTTERFLY2 invocations.
251 vmovaps m6, [ps_p1p1m1m1+0]
252 vmovaps m2, [ps_cos_vec+128]
254 BUTTERFLY2 m5, m6, m2, m7
255 BUTTERFLY2 m4, m6, m2, m7
256 BUTTERFLY2 m1, m6, m2, m7
257 BUTTERFLY2 m3, m6, m2, m7
; Control 0xcc picks dwords 0,3,0,3 of each lane, turning the mask
; (0, 0, sign, sign) into (0, sign, 0, sign). m2 = constants at +160.
261 vshufps m6, m6, m6, 0xcc
262 vmovaps m2, [ps_cos_vec+160]
264 BUTTERFLY3 m5, m6, m2, m7
265 BUTTERFLY3 m4, m6, m2, m7
266 BUTTERFLY3 m1, m6, m2, m7
267 BUTTERFLY3 m3, m6, m2, m7
; m6 = high lane of m3 replicated into both lanes.
269 vperm2f128 m6, m3, m3, 0x31
; Write out 128-bit halves: m5 high -> out+64, m5 low -> out+32,
; m4 high -> out+80, m4 low -> out+48.
272 vextractf128 [outq+64], m5, 1
273 vextractf128 [outq+32], m5, 0
275 vextractf128 [outq+80], m4, 1
276 vextractf128 [outq+48], m4, 0
; m0 = high lane of m1 replicated into both lanes; m1 itself is stored at
; out+96 with an aligned store.
; NOTE(review): assumes outq+96 meets vmovaps alignment -- confirm caller.
278 vperm2f128 m0, m1, m1, 0x31
279 vmovaps [outq+96], m1
; Route the generic BUTTERFLY/BUTTERFLY0 names to the SSE variants for the
; code that follows.
289 %define BUTTERFLY BUTTERFLY_SSE
290 %define BUTTERFLY0 BUTTERFLY0_SSE
; The lines below are bodies of pass macros whose %macro headers are not
; visible in this excerpt. They use registers m8..m15.
297 nop ; FIXME code alignment
; NOTE(review): PERMUTE is defined outside this excerpt; its arguments look
; like register-number pairs -- confirm the (dst, src) order before editing.
303 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; Transpose followed by BUTTERFLY3V, first on registers 8..11, then on 12..15.
; The trailing 0 is the fifth argument; BUTTERFLY3V uses its fifth parameter
; as a register number (m%5 is not visible here, but m%2/m%4 are, above).
; NOTE(review): presumably m0 serves as scratch in both macros -- confirm.
304 TRANSPOSE4x4PS 8, 9, 10, 11, 0
305 BUTTERFLY3V 8, 9, 10, 11, 0
307 TRANSPOSE4x4PS 12, 13, 14, 15, 0
308 BUTTERFLY3V 12, 13, 14, 15, 0
; Store the low dword of m8..m14 at out+0x00, +0x10, ... +0x60
; (one scalar every 16 bytes).
318 movss [outq+0x00], m8
320 movss [outq+0x10], m9
322 movss [outq+0x20], m10
324 movss [outq+0x30], m11
326 movss [outq+0x40], m12
328 movss [outq+0x50], m13
330 movss [outq+0x60], m14
; Unlike the scalar stores above, this writes all 16 bytes of m15 at out+0x70.
; The movss to out+0x78 further down then overwrites one dword inside this
; range, so the order of these stores matters.
332 movaps [outq+0x70], m15
; Store the low dword of m0..m7 at out+0x08, +0x18, ... +0x78.
336 movss [outq+0x08], m0
338 movss [outq+0x18], m1
340 movss [outq+0x28], m2
342 movss [outq+0x38], m3
344 movss [outq+0x48], m4
346 movss [outq+0x58], m5
347 movss [outq+0x68], m6
348 movss [outq+0x78], m7
350 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; NOTE(review): SWAP is presumably the x86inc register-renaming macro (no
; instructions emitted) -- confirm. The first two pairs rotate the even and
; the odd register numbers separately; the last one rotates all sixteen.
353 SWAP 0, 2, 4, 6, 8, 10, 12, 14
354 SWAP 1, 3, 5, 7, 9, 11, 13, 15
359 SWAP 0, 2, 4, 6, 8, 10, 12, 14
360 SWAP 1, 3, 5, 7, 9, 11, 13, 15
366 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL: store vector register m%1 to a 16-byte slot inside the output
; buffer. Slot number %2 maps to out + (%2 - 8) * 16, so slot 8 is out+0.
; The store is aligned (movaps).
372 %macro SPILL 2 ; xmm#, mempos
373 movaps [outq+(%2-8)*16], m%1
; Reload of m%1 from the same slot address.
; NOTE(review): presumably the body of the matching UNSPILL macro; its %macro
; header is not visible in this excerpt.
376 movaps m%1, [outq+(%2-8)*16]
; PASS6 resolves to PASS6_AND_PERMUTE in this configuration.
379 %define PASS6 PASS6_AND_PERMUTE
; m2 = constants at ps_cos_vec+160, shared by every BUTTERFLY3 below. Each
; invocation passes m3 as second and m2 as third argument; instructions
; between the invocations are not visible in this excerpt.
381 movaps m2, [ps_cos_vec+160]
384 BUTTERFLY3 m5, m3, m2, m1
388 BUTTERFLY3 m1, m3, m2, m5
391 BUTTERFLY3 m4, m3, m2, m5
394 BUTTERFLY3 m7, m3, m2, m5
398 BUTTERFLY3 m5, m3, m2, m7
402 BUTTERFLY3 m4, m3, m2, m7
405 BUTTERFLY3 m6, m3, m2, m7
408 BUTTERFLY3 m0, m3, m2, m7
; SSE implementation template: %1 is a macro parameter that supplies the
; function-name suffix, so this sits inside a %macro whose header is not
; visible in this excerpt. Arguments are named out, in and tmp.
; NOTE(review): 2,3,16 presumably means 2 arguments, 3 general-purpose and
; 16 vector registers per the x86inc cglobal convention -- confirm.
; Several instructions (loads, shuffles, spills) between the visible lines
; are not shown; comments describe the visible lines only.
416 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
417 cglobal dct32_float_%1, 2,3,16, out, in, tmp
; NOTE(review): LOAD_INV presumably loads 16 bytes with the dword order
; reversed (it is selected from LOAD_INV_SSE/LOAD_INV_SSE2 elsewhere in the
; file; bodies not visible) -- confirm.
; Input bytes 112..127 are paired with m0 using the constants at +0.
421 LOAD_INV m1, [inq+112]
422 BUTTERFLY m0, m1, [ps_cos_vec], m3
; Input bytes 48..63 are paired with m7 using the constants at +32.
425 LOAD_INV m4, [inq+48]
426 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; m2 = constants at +64; it stays live and is reused by the
; "BUTTERFLY m0, m7" further down.
429 movaps m2, [ps_cos_vec+64]
430 BUTTERFLY m1, m4, m2, m3
; Input bytes 96..111 with the constants at +16.
436 LOAD_INV m6, [inq+96]
437 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
; Input bytes 32..47 with the constants at +48.
440 LOAD_INV m5, [inq+32]
441 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
444 BUTTERFLY m0, m7, m2, m3
; m2 = constants at +80 for the next two butterflies.
446 movaps m2, [ps_cos_vec+80]
447 BUTTERFLY m6, m5, m2, m3
449 BUTTERFLY m1, m4, m2, m3
; m2 = constants at +96, shared by the next four butterflies.
452 movaps m2, [ps_cos_vec+96]
454 BUTTERFLY m0, m1, m2, m3
460 BUTTERFLY m0, m5, m2, m3
464 BUTTERFLY m1, m6, m2, m3
468 BUTTERFLY m7, m4, m2, m3
; m3 = sign-bit mask (0, 0, sign, sign), m2 = constants at +128; both are
; shared by every BUTTERFLY2 below, with m1 as the fourth argument.
471 movaps m3, [ps_p1p1m1m1+0]
472 movaps m2, [ps_cos_vec+128]
474 BUTTERFLY2 m5, m3, m2, m1
476 BUTTERFLY2 m0, m3, m2, m1
479 BUTTERFLY2 m6, m3, m2, m1
483 BUTTERFLY2 m0, m3, m2, m1
486 BUTTERFLY2 m4, m3, m2, m1
488 BUTTERFLY2 m7, m3, m2, m1
491 BUTTERFLY2 m6, m3, m2, m1
494 BUTTERFLY2 m0, m3, m2, m1
; Two-parameter load helpers, one per instruction-set variant. Their bodies
; and %endmacro lines are not visible in this excerpt.
501 %macro LOAD_INV_SSE 2
; Select the SSE variant as LOAD_INV for the code that follows.
506 %define LOAD_INV LOAD_INV_SSE
509 %macro LOAD_INV_SSE2 2
; Switch LOAD_INV and BUTTERFLY0 to their SSE2 variants for the code that
; follows.
513 %define LOAD_INV LOAD_INV_SSE2
514 %define BUTTERFLY0 BUTTERFLY0_SSE2