1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
23 %include "x86util.asm"
; ps_cos_vec: table of single-precision multipliers, 13 rows of four dwords
; (16 bytes per row, 208 bytes in total). Code in this file addresses it by
; byte offset (ps_cos_vec+N); the offset of each row is noted at the end of
; the row.
; The values numerically match 0.5/cos(k*pi/D) for successively smaller D
; (64, 32, 16, 8, 4), with rows +32/+48 negated and in descending angle order.
; Rows +96, +128 and +160 are each stored twice in a row, so a 32-byte load at
; those offsets yields the same four coefficients in both 16-byte halves.
; The 32-byte and 16-byte aligned loads seen in this file (vmovaps / movaps)
; require the table to be suitably aligned; the alignment directive is not
; part of the visible excerpt - TODO confirm.
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 ; +0
29 dd 0.553104, 0.582935, 0.622504, 0.674808 ; +16
30 dd -10.190008, -3.407609, -2.057781, -1.484165 ; +32
31 dd -1.169440, -0.972568, -0.839350, -0.744536 ; +48
32 dd 0.502419, 0.522499, 0.566944, 0.646822 ; +64
33 dd 0.788155, 1.060678, 1.722447, 5.101149 ; +80
34 dd 0.509796, 0.601345, 0.899976, 2.562916 ; +96
35 dd 0.509796, 0.601345, 0.899976, 2.562916 ; +112 (copy of +96)
36 dd 1.000000, 1.000000, 1.306563, 0.541196 ; +128
37 dd 1.000000, 1.000000, 1.306563, 0.541196 ; +144 (copy of +128)
38 dd 1.000000, 0.707107, 1.000000, -0.707107 ; +160
39 dd 1.000000, 0.707107, 1.000000, -0.707107 ; +176 (copy of +160)
40 dd 0.707107, 0.707107, 0.707107, 0.707107 ; +192
; ps_p1p1m1m1: eight dwords (32 bytes) holding the pattern 0, 0, 0x80000000,
; 0x80000000 twice. 0x80000000 is the sign bit of an IEEE-754 single, so in
; each 16-byte half elements 0-1 are clear and elements 2-3 have only the sign
; bit set. The callers in this file load it into a register and hand it to
; BUTTERFLY2 / BUTTERFLY3 as the second operand.
; NOTE(review): presumably XORed into a vector to negate elements 2-3 ("plus,
; plus, minus, minus"); the macro body doing so is not visible here - confirm.
43 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
52 %if cpuflag(sse2) && notcpuflag(avx)
66 BUTTERFLY0 %1, %2, %3, %4, 0x1b
70 BUTTERFLY0 %1, %2, %3, %4, 0xb1
78 mulps m%2, [ps_cos_vec+192]
82 mulps m%4, [ps_cos_vec+192]
85 %macro PASS6_AND_PERMUTE 0
; AVX entry point for the 32-point float DCT.
; In:  inq  = source buffer, read here at byte offsets 32..63 and 96..127
;      outq = destination buffer, written here at byte offsets 32..127
; The m# registers must be 256-bit in this function, since vinsertf128 /
; vperm2f128 / vextractf128 are applied to them. "lo"/"hi" below mean the
; lower / upper 128-bit half of such a register.
; BUTTERFLY, BUTTERFLY2 and BUTTERFLY3 are macros whose bodies are outside the
; visible excerpt; the comments only name the operands passed to them.
; NOTE(review): this excerpt has gaps - e.g. the loads of m4 and m2 consumed
; by the first two BUTTERFLY calls, and the function's return, are not shown.
197 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
; cglobal operands "2,3,8" are presumably x86inc's argument count, GPR count
; and SIMD register count - the macro comes from the include; confirm there.
198 cglobal dct32_float, 2,3,8, out, in, tmp
; m5 = the 32 bytes at in+96..127 with their eight dwords in reversed order:
; in+96 goes to hi, in+112 goes to lo, then 0x1b reverses the four dwords
; inside each half.
201 vinsertf128 m5, m5, [inq+96], 1
202 vinsertf128 m5, m5, [inq+112], 0
203 vshufps m5, m5, m5, 0x1b
204 BUTTERFLY m4, m5, [ps_cos_vec], m6
; m6 = the 32 bytes at in+32..63, dword order reversed in the same way.
207 vinsertf128 m6, m6, [inq+32], 1
208 vinsertf128 m6, m6, [inq+48], 0
209 vshufps m6, m6, m6, 0x1b
210 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Next stage: both register pairs use the same coefficient rows at +64.
214 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
215 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup halves across registers: m3 = {lo: m6.hi, hi: m4.hi} and
; m1 = {lo: m6.lo, hi: m4.lo}; m3 is then dword-reversed within each half.
219 vperm2f128 m3, m6, m4, 0x31
220 vperm2f128 m1, m6, m4, 0x20
221 vshufps m3, m3, m3, 0x1b
223 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; Same regrouping for the other pair: m4 = {lo: m5.lo, hi: m2.lo} is formed
; first (m5 still intact), then m5 = {lo: m5.hi, hi: m2.hi}, dword-reversed.
226 vperm2f128 m4, m5, m2, 0x20
227 vperm2f128 m5, m5, m2, 0x31
228 vshufps m5, m5, m5, 0x1b
230 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; Load the sign-bit mask and the coefficient rows at +128, then apply
; BUTTERFLY2 to each of the four data registers with m7 as the fourth operand.
233 vmovaps m6, [ps_p1p1m1m1+0]
234 vmovaps m2, [ps_cos_vec+128]
236 BUTTERFLY2 m5, m6, m2, m7
237 BUTTERFLY2 m4, m6, m2, m7
238 BUTTERFLY2 m1, m6, m2, m7
239 BUTTERFLY2 m3, m6, m2, m7
; 0xcc picks elements 0,3,0,3 of each half, turning the mask pattern
; {0, 0, sign, sign} into {0, sign, 0, sign} for the BUTTERFLY3 stage.
243 vshufps m6, m6, m6, 0xcc
244 vmovaps m2, [ps_cos_vec+160]
246 BUTTERFLY3 m5, m6, m2, m7
247 BUTTERFLY3 m4, m6, m2, m7
248 BUTTERFLY3 m1, m6, m2, m7
249 BUTTERFLY3 m3, m6, m2, m7
; m6 = m3.hi replicated into both halves.
251 vperm2f128 m6, m3, m3, 0x31
; Scatter the halves of m5 and m4 to the output: m5.hi -> out+64,
; m5.lo -> out+32, m4.hi -> out+80, m4.lo -> out+48.
254 vextractf128 [outq+64], m5, 1
255 vextractf128 [outq+32], m5, 0
257 vextractf128 [outq+80], m4, 1
258 vextractf128 [outq+48], m4, 0
; m0 = m1.hi replicated into both halves; all 32 bytes of m1 go to
; out+96..127 (aligned store, so out+96 must be 32-byte aligned).
260 vperm2f128 m0, m1, m1, 0x31
261 vmovaps [outq+96], m1
276 nop ; FIXME code alignment
282 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
283 TRANSPOSE4x4PS 8, 9, 10, 11, 0
284 BUTTERFLY3V 8, 9, 10, 11, 0
286 TRANSPOSE4x4PS 12, 13, 14, 15, 0
287 BUTTERFLY3V 12, 13, 14, 15, 0
297 movss [outq+0x00], m8
299 movss [outq+0x10], m9
301 movss [outq+0x20], m10
303 movss [outq+0x30], m11
305 movss [outq+0x40], m12
307 movss [outq+0x50], m13
309 movss [outq+0x60], m14
311 movaps [outq+0x70], m15
315 movss [outq+0x08], m0
317 movss [outq+0x18], m1
319 movss [outq+0x28], m2
321 movss [outq+0x38], m3
323 movss [outq+0x48], m4
325 movss [outq+0x58], m5
326 movss [outq+0x68], m6
327 movss [outq+0x78], m7
329 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
332 SWAP 0, 2, 4, 6, 8, 10, 12, 14
333 SWAP 1, 3, 5, 7, 9, 11, 13, 15
338 SWAP 0, 2, 4, 6, 8, 10, 12, 14
339 SWAP 1, 3, 5, 7, 9, 11, 13, 15
345 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL xmm#, mempos: park register m<xmm#> in the output buffer, using it as
; scratch space. Slot "mempos" maps to byte offset (mempos-8)*16 from outq, so
; slot 8 is out+0, slot 9 is out+16, and so on. The store is an aligned
; 16-byte movaps, so outq must be 16-byte aligned.
351 %macro SPILL 2 ; xmm#, mempos
352 movaps [outq+(%2-8)*16], m%1
; Reverse direction: reload m<xmm#> from the slot given by the same formula.
; NOTE(review): the macro terminator and the header of the reload macro that
; should sit between these lines are not visible in this excerpt.
355 movaps m%1, [outq+(%2-8)*16]
358 %define PASS6 PASS6_AND_PERMUTE
360 movaps m2, [ps_cos_vec+160]
363 BUTTERFLY3 m5, m3, m2, m1
367 BUTTERFLY3 m1, m3, m2, m5
370 BUTTERFLY3 m4, m3, m2, m5
373 BUTTERFLY3 m7, m3, m2, m5
377 BUTTERFLY3 m5, m3, m2, m7
381 BUTTERFLY3 m4, m3, m2, m7
384 BUTTERFLY3 m6, m3, m2, m7
387 BUTTERFLY3 m0, m3, m2, m7
393 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
395 cglobal dct32_float, 2, 3, 16, out, in, tmp
399 LOAD_INV m1, [inq+112]
400 BUTTERFLY m0, m1, [ps_cos_vec], m3
403 LOAD_INV m4, [inq+48]
404 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
407 movaps m2, [ps_cos_vec+64]
408 BUTTERFLY m1, m4, m2, m3
414 LOAD_INV m6, [inq+96]
415 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
418 LOAD_INV m5, [inq+32]
419 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
422 BUTTERFLY m0, m7, m2, m3
424 movaps m2, [ps_cos_vec+80]
425 BUTTERFLY m6, m5, m2, m3
427 BUTTERFLY m1, m4, m2, m3
430 movaps m2, [ps_cos_vec+96]
432 BUTTERFLY m0, m1, m2, m3
438 BUTTERFLY m0, m5, m2, m3
442 BUTTERFLY m1, m6, m2, m3
446 BUTTERFLY m7, m4, m2, m3
449 movaps m3, [ps_p1p1m1m1+0]
450 movaps m2, [ps_cos_vec+128]
452 BUTTERFLY2 m5, m3, m2, m1
454 BUTTERFLY2 m0, m3, m2, m1
457 BUTTERFLY2 m6, m3, m2, m1
461 BUTTERFLY2 m0, m3, m2, m1
464 BUTTERFLY2 m4, m3, m2, m1
466 BUTTERFLY2 m7, m3, m2, m1
469 BUTTERFLY2 m6, m3, m2, m1
472 BUTTERFLY2 m0, m3, m2, m1