1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Coefficient table read by the transform code as [ps_cos_vec + byte offset].
; Each dd row is four 32-bit floats (16 bytes), 13 rows / 208 bytes in total.
; The rows at offsets 96, 128 and 160 are emitted twice in a row, so a 32-byte
; load from those offsets sees the same four values in both 128-bit halves.
; NOTE(review): the values look like 1/(2*cos(x)) DCT factors -- confirm
; against the reference C implementation before changing any of them.
ps_cos_vec:
    dd        0.500603,  0.505471,  0.515447,  0.531043    ; +0
    dd        0.553104,  0.582935,  0.622504,  0.674808    ; +16
    dd      -10.190008, -3.407609, -2.057781, -1.484165    ; +32
    dd       -1.169440, -0.972568, -0.839350, -0.744536    ; +48
    dd        0.502419,  0.522499,  0.566944,  0.646822    ; +64
    dd        0.788155,  1.060678,  1.722447,  5.101149    ; +80
    times 2 dd 0.509796, 0.601345, 0.899976, 2.562916      ; +96, +112
    times 2 dd 1.000000, 1.000000, 1.306563, 0.541196      ; +128, +144
    times 2 dd 1.000000, 0.707107, 1.000000, -0.707107     ; +160, +176
    dd        0.707107,  0.707107,  0.707107,  0.707107    ; +192
; Sign-bit mask, 32 bytes made of two identical 16-byte halves: dwords 0-1 are
; zero, dwords 2-3 have only bit 31 set (the sign bit of a 32-bit float).
; NOTE(review): going by the name (p1 p1 m1 m1) this is presumably XORed into
; data to negate the upper two floats of each half -- confirm in BUTTERFLY0.
ps_p1p1m1m1:
    times 2 dd 0, 0, 0x80000000, 0x80000000
; Assembly-time dispatch: the branch below is emitted only when the sse2 CPU
; flag is set and the avx flag is not.
51 %if cpuflag(sse2) && notcpuflag(avx)
; Forward the four macro arguments to BUTTERFLY0 with immediate 0x1b appended.
; As a shufps/pshufd control, 0x1b selects dwords 3,2,1,0 (full reversal).
; NOTE(review): presumably BUTTERFLY0 uses %5 as such a shuffle control -- confirm.
65 BUTTERFLY0 %1, %2, %3, %4, 0x1b
; Same forwarding with immediate 0xb1; as a shuffle control this selects
; dwords 1,0,3,2 (swap inside each adjacent pair).
69 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; Multiply register m%2, then m%4, by the four floats at byte offset 192 of
; ps_cos_vec (the last table row, 0.707107 in every lane).
77 mulps m%2, [ps_cos_vec+192]
81 mulps m%4, [ps_cos_vec+192]
; Macro declared with zero parameters; PASS6 is %define'd to this name further
; down in the file.
84 %macro PASS6_AND_PERMUTE 0
; AVX entry point. cglobal is invoked with: 2 arguments, 3 general-purpose
; registers, 8 vector registers, and the register names out, in, tmp (used
; below as outq / inq). The 128-bit-lane instructions used on mN
; (vinsertf128 / vperm2f128 / vextractf128) imply mN are 256-bit registers here.
; NOTE(review): BUTTERFLY, BUTTERFLY2 and BUTTERFLY3 are macros; the comments
; below describe only the operands passed to them, not what they compute.
195 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
196 cglobal dct32_float, 2,3,8, out, in, tmp
; m5: high lane <- 16 bytes at in+96, low lane <- 16 bytes at in+112, then the
; four dwords of each lane are reversed (0x1b). Net effect: m5 holds the eight
; dwords of in+96..in+127 in descending address order.
199 vinsertf128 m5, m5, [inq+96], 1
200 vinsertf128 m5, m5, [inq+112], 0
201 vshufps m5, m5, m5, 0x1b
; Coefficients: first 32 bytes of ps_cos_vec; m6 is passed as the fourth operand.
202 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same construction for in+32..in+63: m6 holds those eight dwords in
; descending address order.
205 vinsertf128 m6, m6, [inq+32], 1
206 vinsertf128 m6, m6, [inq+48], 0
207 vshufps m6, m6, m6, 0x1b
208 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Next stage: both invocations use the 32 bytes of coefficients at offset 64.
212 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
213 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup 128-bit lanes (low lane listed first):
;   m3 = { high(m6), high(m4) },  m1 = { low(m6), low(m4) }
; then reverse the dwords inside each lane of m3.
217 vperm2f128 m3, m6, m4, 0x31
218 vperm2f128 m1, m6, m4, 0x20
219 vshufps m3, m3, m3, 0x1b
; Coefficients at offset 96 (that table row is stored twice, filling both lanes).
221 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; m4 = { low(m5), low(m2) } is built first, while m5 is still intact;
; then m5 = { high(m5), high(m2) } and its lanes are dword-reversed.
224 vperm2f128 m4, m5, m2, 0x20
225 vperm2f128 m5, m5, m2, 0x31
226 vshufps m5, m5, m5, 0x1b
228 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; m6 = sign-bit mask (0, 0, 0x80000000, 0x80000000 in each lane);
; m2 = coefficients at offset 128.
231 vmovaps m6, [ps_p1p1m1m1+0]
232 vmovaps m2, [ps_cos_vec+128]
; The same mask (m6), coefficients (m2) and fourth operand (m7) are passed for
; each of the four data registers.
234 BUTTERFLY2 m5, m6, m2, m7
235 BUTTERFLY2 m4, m6, m2, m7
236 BUTTERFLY2 m1, m6, m2, m7
237 BUTTERFLY2 m3, m6, m2, m7
; 0xcc selects dwords 0,3,0,3 of each lane. Applied to the (0,0,S,S) mask this
; gives (0,S,0,S) -- assuming BUTTERFLY2 left m6 unmodified (confirm).
241 vshufps m6, m6, m6, 0xcc
; m2 = coefficients at offset 160.
242 vmovaps m2, [ps_cos_vec+160]
244 BUTTERFLY3 m5, m6, m2, m7
245 BUTTERFLY3 m4, m6, m2, m7
246 BUTTERFLY3 m1, m6, m2, m7
247 BUTTERFLY3 m3, m6, m2, m7
; m6 = high lane of m3 copied into both lanes.
249 vperm2f128 m6, m3, m3, 0x31
; Write 128-bit halves to the output buffer:
;   high(m5) -> out+64, low(m5) -> out+32, high(m4) -> out+80, low(m4) -> out+48
252 vextractf128 [outq+64], m5, 1
253 vextractf128 [outq+32], m5, 0
255 vextractf128 [outq+80], m4, 1
256 vextractf128 [outq+48], m4, 0
; m0 = high lane of m1 copied into both lanes.
258 vperm2f128 m0, m1, m1, 0x31
; Aligned 32-byte store of all of m1 to out+96..out+127; vmovaps faults if the
; address is not 32-byte aligned, so out must be 32-byte aligned.
259 vmovaps [outq+96], m1
; This sequence uses m8-m15, which exist only in 64-bit mode.
; NOTE(review): PERMUTE, TRANSPOSE4x4PS and SWAP are not defined in this file;
; they presumably come from the included x86util.asm / x86inc -- confirm.
273 nop ; FIXME code alignment
; PERMUTE is given register-index pairs covering m9-m14.
279 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; m8-m11: TRANSPOSE4x4PS, then BUTTERFLY3V on the same four registers; m0 is
; passed as the extra (fifth) operand to both.
280 TRANSPOSE4x4PS 8, 9, 10, 11, 0
281 BUTTERFLY3V 8, 9, 10, 11, 0
; Same two steps for m12-m15.
283 TRANSPOSE4x4PS 12, 13, 14, 15, 0
284 BUTTERFLY3V 12, 13, 14, 15, 0
; Scalar stores: the lowest float of m8..m14 goes to byte offsets 0x00..0x60
; of out, one every 16 bytes.
294 movss [outq+0x00], m8
296 movss [outq+0x10], m9
298 movss [outq+0x20], m10
300 movss [outq+0x30], m11
302 movss [outq+0x40], m12
304 movss [outq+0x50], m13
306 movss [outq+0x60], m14
; Unlike the stores above, this writes all 16 bytes of m15 to out+0x70..0x7f
; (aligned store); the later movss to out+0x78 overwrites part of that range.
308 movaps [outq+0x70], m15
; Scalar stores of the lowest float of m0..m7 to byte offsets 0x08..0x78,
; again one every 16 bytes.
312 movss [outq+0x08], m0
314 movss [outq+0x18], m1
316 movss [outq+0x28], m2
318 movss [outq+0x38], m3
320 movss [outq+0x48], m4
322 movss [outq+0x58], m5
323 movss [outq+0x68], m6
324 movss [outq+0x78], m7
; Register-index pairs exchanging odd indices 1..13 with indices 8..14.
326 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; Multi-operand SWAPs over the even and the odd register indices.
; NOTE(review): in x86inc SWAP renames registers at assembly time and emits no
; instructions -- confirm the rotation direction before reordering these lines.
329 SWAP 0, 2, 4, 6, 8, 10, 12, 14
330 SWAP 1, 3, 5, 7, 9, 11, 13, 15
335 SWAP 0, 2, 4, 6, 8, 10, 12, 14
336 SWAP 1, 3, 5, 7, 9, 11, 13, 15
; Single SWAP across all sixteen register indices.
342 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL: save vector register m<%1> into the output buffer, which doubles as
; scratch space. Slot number %2 maps to byte offset (%2-8)*16, i.e. slot 8 is
; out+0, slot 9 is out+16, and so on. movaps requires that address to be aligned.
348 %macro SPILL 2 ; xmm#, mempos
349 movaps [outq+(%2-8)*16], m%1
; Opposite direction: reload m<%1> from the slot given by the same formula.
352 movaps m%1, [outq+(%2-8)*16]
; PASS6 expands to PASS6_AND_PERMUTE.
355 %define PASS6 PASS6_AND_PERMUTE
; m2 = the four coefficients at byte offset 160 of ps_cos_vec
; (1.0, 0.707107, 1.0, -0.707107).
357 movaps m2, [ps_cos_vec+160]
; Eight BUTTERFLY3 invocations. All pass m3 as the second operand and m2 as the
; coefficients; the first operand is the data register and the last (m1, m5 or
; m7) is the fourth macro operand.
; NOTE(review): m3 is presumably the sign mask and the fourth operand a scratch
; register -- confirm against BUTTERFLY0.
360 BUTTERFLY3 m5, m3, m2, m1
364 BUTTERFLY3 m1, m3, m2, m5
367 BUTTERFLY3 m4, m3, m2, m5
370 BUTTERFLY3 m7, m3, m2, m5
374 BUTTERFLY3 m5, m3, m2, m7
378 BUTTERFLY3 m4, m3, m2, m7
381 BUTTERFLY3 m6, m3, m2, m7
384 BUTTERFLY3 m0, m3, m2, m7
; SSE entry point. cglobal is invoked with: 2 arguments, 3 general-purpose
; registers, 16 vector registers, and the register names out, in, tmp.
; Coefficient offsets here advance in 16-byte steps (+0, +16, +32, +48, ...),
; i.e. one table row per operand.
; NOTE(review): LOAD_INV and BUTTERFLY/BUTTERFLY2 are macros; the comments
; below describe only their operands. Going by its name, LOAD_INV presumably
; loads four floats in reversed order -- confirm.
390 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
392 cglobal dct32_float, 2, 3, 16, out, in, tmp
; m1 <- LOAD_INV of the 16 bytes at in+112; coefficients: table row at +0.
396 LOAD_INV m1, [inq+112]
397 BUTTERFLY m0, m1, [ps_cos_vec], m3
; m4 <- LOAD_INV of in+48; coefficients: row at +32.
400 LOAD_INV m4, [inq+48]
401 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; m2 = row at +64, reused by the BUTTERFLY m0, m7 further down.
404 movaps m2, [ps_cos_vec+64]
405 BUTTERFLY m1, m4, m2, m3
; m6 <- LOAD_INV of in+96; coefficients: row at +16.
411 LOAD_INV m6, [inq+96]
412 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
; m5 <- LOAD_INV of in+32; coefficients: row at +48.
415 LOAD_INV m5, [inq+32]
416 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; Still using m2 = row at +64.
419 BUTTERFLY m0, m7, m2, m3
; m2 = row at +80 for the next two invocations.
421 movaps m2, [ps_cos_vec+80]
422 BUTTERFLY m6, m5, m2, m3
424 BUTTERFLY m1, m4, m2, m3
; m2 = row at +96 for the next four invocations.
427 movaps m2, [ps_cos_vec+96]
429 BUTTERFLY m0, m1, m2, m3
435 BUTTERFLY m0, m5, m2, m3
439 BUTTERFLY m1, m6, m2, m3
443 BUTTERFLY m7, m4, m2, m3
; m3 = first 16 bytes of the sign-bit mask (0, 0, 0x80000000, 0x80000000);
; m2 = row at +128. Both are shared by all BUTTERFLY2 invocations below, with
; m1 passed as the fourth operand.
446 movaps m3, [ps_p1p1m1m1+0]
447 movaps m2, [ps_cos_vec+128]
449 BUTTERFLY2 m5, m3, m2, m1
451 BUTTERFLY2 m0, m3, m2, m1
454 BUTTERFLY2 m6, m3, m2, m1
458 BUTTERFLY2 m0, m3, m2, m1
461 BUTTERFLY2 m4, m3, m2, m1
463 BUTTERFLY2 m7, m3, m2, m1
466 BUTTERFLY2 m6, m3, m2, m1
469 BUTTERFLY2 m0, m3, m2, m1