1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
; Table of single-precision multipliers used by the butterfly macros below.
; The code addresses it as [ps_cos_vec + byte offset]; every `dd` row holds
; four 32-bit floats (16 bytes), so row k starts at byte offset 16*k.
; Rows at offsets 96/112, 128/144 and 160/176 are identical pairs --
; presumably so that a 32-byte (ymm) load from +96, +128 or +160 sees the same
; four values in both 128-bit halves (TODO confirm against the AVX path).
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 ; offset   0
29 dd 0.553104, 0.582935, 0.622504, 0.674808 ; offset  16
30 dd -10.190008, -3.407609, -2.057781, -1.484165 ; offset  32
31 dd -1.169440, -0.972568, -0.839350, -0.744536 ; offset  48
32 dd 0.502419, 0.522499, 0.566944, 0.646822 ; offset  64
33 dd 0.788155, 1.060678, 1.722447, 5.101149 ; offset  80
34 dd 0.509796, 0.601345, 0.899976, 2.562916 ; offset  96
35 dd 0.509796, 0.601345, 0.899976, 2.562916 ; offset 112 (same values as offset 96)
36 dd 1.000000, 1.000000, 1.306563, 0.541196 ; offset 128
37 dd 1.000000, 1.000000, 1.306563, 0.541196 ; offset 144 (same values as offset 128)
38 dd 1.000000, 0.707107, 1.000000, -0.707107 ; offset 160
39 dd 1.000000, 0.707107, 1.000000, -0.707107 ; offset 176 (same values as offset 160)
40 dd 0.707107, 0.707107, 0.707107, 0.707107 ; offset 192
; Eight dwords: elements 0,1,4,5 are zero and elements 2,3,6,7 are 0x80000000,
; which is the sign bit of an IEEE-754 single-precision float.
; NOTE(review): the name suggests a (+1,+1,-1,-1) sign pattern applied by
; XOR-ing with this mask; the XOR itself is not visible in this excerpt.
43 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; Butterfly helper macros.
; NOTE(review): this excerpt shows only the %macro headers and a few body
; lines; the remaining bodies and the %endmacro lines are not visible here,
; so the comments below describe the visible lines only.
; BUTTERFLY_SSE / BUTTERFLY_AVX take 4 arguments; the function bodies select
; one of them through `%define BUTTERFLY ...`.
45 %macro BUTTERFLY_SSE 4
52 %macro BUTTERFLY_AVX 4
; BUTTERFLY0_SSE / BUTTERFLY0_AVX take 5 arguments; selected through
; `%define BUTTERFLY0 ...`.
58 %macro BUTTERFLY0_SSE 5
66 %macro BUTTERFLY0_AVX 5
; %4 = dwords of %1 permuted by the immediate %5 (both shuffle sources are %1;
; the permutation is applied within each 128-bit lane).
67 vshufps %4, %1, %1, %5
; Invocation of BUTTERFLY0 with shuffle immediate 0x1b, i.e. dword order
; 3,2,1,0 (reverses the four dwords of a lane).
74 BUTTERFLY0 %1, %2, %3, %4, 0x1b
; Invocation of BUTTERFLY0 with shuffle immediate 0xb1, i.e. dword order
; 1,0,3,2 (swaps the two dwords of each adjacent pair).
78 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; Scale by the ps_cos_vec row at byte offset 192 (four copies of 0.707107).
86 mulps m%2, [ps_cos_vec+192]
90 mulps m%4, [ps_cos_vec+192]
; Zero-argument macro; its body is not visible in this excerpt.
93 %macro PASS6_AND_PERMUTE 0
; Route the generic BUTTERFLY / BUTTERFLY0 names to the AVX macro variants for
; the code that follows.
202 %define BUTTERFLY BUTTERFLY_AVX
203 %define BUTTERFLY0 BUTTERFLY0_AVX
; AVX entry point. Reads from `in` and writes to `out`.
; NOTE(review): per x86inc's cglobal convention the numeric fields should mean
; 2 arguments, 3 general-purpose registers, 8 vector registers -- confirm
; against the x86inc.asm in this tree.
; NOTE(review): several instructions of this function (initial loads, the
; first output store, the return) are not visible in this excerpt.
; The vinsertf128 / vperm2f128 / vextractf128 instructions below require
; 256-bit operands, so the mN names are ymm registers in this function.
208 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
209 cglobal dct32_float_avx, 2,3,8, out, in, tmp
; Build m5 from the 32 bytes at in+96: bytes in+96..in+111 go to the upper
; 128-bit half, bytes in+112..in+127 to the lower half, then each half has its
; four dwords reversed (0x1b). Net effect: the eight dwords at in+96..in+127
; in fully reversed order.
212 vinsertf128 m5, m5, [inq+96], 1
213 vinsertf128 m5, m5, [inq+112], 0
214 vshufps m5, m5, m5, 0x1b
; Butterfly of m4 and m5 with the table rows at offset 0 (32 bytes); m6 is
; passed as the fourth macro argument.
215 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same reversed-load pattern for the 32 bytes at in+32..in+63, into m6.
218 vinsertf128 m6, m6, [inq+32], 1
219 vinsertf128 m6, m6, [inq+48], 0
220 vshufps m6, m6, m6, 0x1b
221 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Two butterflies sharing the table rows at offset 64.
225 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
226 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup 128-bit halves: m3 = (high half of m6, high half of m4),
; m1 = (low half of m6, low half of m4); then reverse the dwords within each
; half of m3.
230 vperm2f128 m3, m6, m4, 0x31
231 vperm2f128 m1, m6, m4, 0x20
232 vshufps m3, m3, m3, 0x1b
; Table offset 96: the rows at 96 and 112 hold the same four values.
234 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; Same regrouping for m5/m2: m4 = (low m5, low m2), m5 = (high m5, high m2),
; then reverse the dwords within each half of m5.
237 vperm2f128 m4, m5, m2, 0x20
238 vperm2f128 m5, m5, m2, 0x31
239 vshufps m5, m5, m5, 0x1b
241 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; m6 = sign-bit mask (0,0,S,S per 128-bit half, S = 0x80000000);
; m2 = table rows at offset 128.
244 vmovaps m6, [ps_p1p1m1m1+0]
245 vmovaps m2, [ps_cos_vec+128]
; BUTTERFLY2 on each of the four data registers; m7 is passed as the fourth
; macro argument.
247 BUTTERFLY2 m5, m6, m2, m7
248 BUTTERFLY2 m4, m6, m2, m7
249 BUTTERFLY2 m1, m6, m2, m7
250 BUTTERFLY2 m3, m6, m2, m7
; 0xcc picks dwords 0,3,0,3 of each half, so a (0,0,S,S) mask becomes
; (0,S,0,S) -- assuming m6 still holds the mask loaded above (TODO confirm the
; BUTTERFLY2 expansion does not modify it).
254 vshufps m6, m6, m6, 0xcc
; m2 = table rows at offset 160.
255 vmovaps m2, [ps_cos_vec+160]
; BUTTERFLY3 on the same four data registers with the new mask and constants.
257 BUTTERFLY3 m5, m6, m2, m7
258 BUTTERFLY3 m4, m6, m2, m7
259 BUTTERFLY3 m1, m6, m2, m7
260 BUTTERFLY3 m3, m6, m2, m7
; Both halves of m6 = upper half of m3.
262 vperm2f128 m6, m3, m3, 0x31
; Store m5: upper half to out+64..out+79, lower half to out+32..out+47.
265 vextractf128 [outq+64], m5, 1
266 vextractf128 [outq+32], m5, 0
; Store m4: upper half to out+80..out+95, lower half to out+48..out+63.
268 vextractf128 [outq+80], m4, 1
269 vextractf128 [outq+48], m4, 0
; Both halves of m0 = upper half of m1.
271 vperm2f128 m0, m1, m1, 0x31
; Aligned 32-byte store of m1 to out+96..out+127 (vmovaps requires the
; address to be 32-byte aligned for a ymm operand).
272 vmovaps [outq+96], m1
; Route the generic BUTTERFLY / BUTTERFLY0 names to the SSE macro variants for
; the code that follows.
; NOTE(review): the %macro / %endmacro / %if lines that delimit the fragments
; below are not visible in this excerpt; comments describe visible lines only.
282 %define BUTTERFLY BUTTERFLY_SSE
283 %define BUTTERFLY0 BUTTERFLY0_SSE
290 nop ; FIXME code alignment
; PERMUTE, TRANSPOSE4x4PS and SWAP are not defined in the visible part of this
; file -- presumably provided by x86util.asm / x86inc.asm (TODO confirm).
; Registers 8..15 are referenced here.
296 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; Transpose registers 8..11 (last argument 0 is presumably a scratch register
; number), then run BUTTERFLY3V on them.
297 TRANSPOSE4x4PS 8, 9, 10, 11, 0
298 BUTTERFLY3V 8, 9, 10, 11, 0
; Same for registers 12..15.
300 TRANSPOSE4x4PS 12, 13, 14, 15, 0
301 BUTTERFLY3V 12, 13, 14, 15, 0
; Write the low float of m8..m14 to out+0x00, +0x10, ..., +0x60 (one scalar
; every 16 bytes).
311 movss [outq+0x00], m8
313 movss [outq+0x10], m9
315 movss [outq+0x20], m10
317 movss [outq+0x30], m11
319 movss [outq+0x40], m12
321 movss [outq+0x50], m13
323 movss [outq+0x60], m14
; Unlike the scalar stores above, this is a full 16-byte aligned store of m15
; covering out+0x70..out+0x7f.
325 movaps [outq+0x70], m15
; Write the low float of m0..m7 to out+0x08, +0x18, ..., +0x78 (the slots
; 8 bytes after the ones written above).
329 movss [outq+0x08], m0
331 movss [outq+0x18], m1
333 movss [outq+0x28], m2
335 movss [outq+0x38], m3
337 movss [outq+0x48], m4
339 movss [outq+0x58], m5
340 movss [outq+0x68], m6
341 movss [outq+0x78], m7
343 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; Register renaming over the even and the odd register numbers.
; NOTE(review): the instructions between these SWAP groups are not visible.
346 SWAP 0, 2, 4, 6, 8, 10, 12, 14
347 SWAP 1, 3, 5, 7, 9, 11, 13, 15
352 SWAP 0, 2, 4, 6, 8, 10, 12, 14
353 SWAP 1, 3, 5, 7, 9, 11, 13, 15
359 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL: save register m%1 into the output buffer, in the 16-byte slot at
; index (%2 - 8) from outq. Uses an aligned store, so outq must be 16-byte
; aligned.
365 %macro SPILL 2 ; xmm#, mempos
366 movaps [outq+(%2-8)*16], m%1
; Inverse of the store above: reload m%1 from the same slot (the enclosing
; %macro header is not visible in this excerpt).
369 movaps m%1, [outq+(%2-8)*16]
; PASS6 expands to PASS6_AND_PERMUTE in this configuration.
372 %define PASS6 PASS6_AND_PERMUTE
; m2 = table row at offset 160; the BUTTERFLY3 invocations below all pass m3
; as the second argument and m2 as the third.
; NOTE(review): the instructions between the BUTTERFLY3 invocations
; (presumably spills/reloads) are not visible here.
374 movaps m2, [ps_cos_vec+160]
377 BUTTERFLY3 m5, m3, m2, m1
381 BUTTERFLY3 m1, m3, m2, m5
384 BUTTERFLY3 m4, m3, m2, m5
387 BUTTERFLY3 m7, m3, m2, m5
391 BUTTERFLY3 m5, m3, m2, m7
395 BUTTERFLY3 m4, m3, m2, m7
398 BUTTERFLY3 m6, m3, m2, m7
401 BUTTERFLY3 m0, m3, m2, m7
408 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
409 cglobal dct32_float_sse, 2,3,16, out, in, tmp
415 BUTTERFLY m0, m1, [ps_cos_vec], m3
420 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
423 movaps m2, [ps_cos_vec+64]
424 BUTTERFLY m1, m4, m2, m3
432 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
437 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
440 BUTTERFLY m0, m7, m2, m3
442 movaps m2, [ps_cos_vec+80]
443 BUTTERFLY m6, m5, m2, m3
445 BUTTERFLY m1, m4, m2, m3
448 movaps m2, [ps_cos_vec+96]
450 BUTTERFLY m0, m1, m2, m3
456 BUTTERFLY m0, m5, m2, m3
460 BUTTERFLY m1, m6, m2, m3
464 BUTTERFLY m7, m4, m2, m3
467 movaps m3, [ps_p1p1m1m1+0]
468 movaps m2, [ps_cos_vec+128]
470 BUTTERFLY2 m5, m3, m2, m1
472 BUTTERFLY2 m0, m3, m2, m1
475 BUTTERFLY2 m6, m3, m2, m1
479 BUTTERFLY2 m0, m3, m2, m1
482 BUTTERFLY2 m4, m3, m2, m1
484 BUTTERFLY2 m7, m3, m2, m1
487 BUTTERFLY2 m6, m3, m2, m1
490 BUTTERFLY2 m0, m3, m2, m1