1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Sign-mask constants: eight dwords in the repeating pattern
; 0, 0, 0x80000000, 0x80000000. 0x80000000 is the sign bit of a 32-bit float,
; so (going by the name) this presumably keeps elements 0-1 and flips the sign
; of elements 2-3 in each group of four -- the macro that consumes it is not
; visible in this excerpt, confirm there.
26 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; Coefficient table: four 32-bit floats (16 bytes) per row. The code addresses
; it by byte offset; the offsets noted below assume the rows shown here are
; contiguous in the real file.
; rows at +0 and +16
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
29 dd 0.553104, 0.582935, 0.622504, 0.674808
; rows at +32 and +48
30 dd -10.190008, -3.407609, -2.057781, -1.484165
31 dd -1.169440, -0.972568, -0.839350, -0.744536
; rows at +64 and +80
32 dd 0.502419, 0.522499, 0.566944, 0.646822
33 dd 0.788155, 1.060678, 1.722447, 5.101149
; rows at +96 and +112 (identical pair; presumably duplicated so that a 32-byte
; load sees the same four values in both 16-byte halves -- TODO confirm)
34 dd 0.509796, 0.601345, 0.899976, 2.562916
35 dd 0.509796, 0.601345, 0.899976, 2.562916
; rows at +128 and +144 (identical pair)
36 dd 1.000000, 1.000000, 1.306563, 0.541196
37 dd 1.000000, 1.000000, 1.306563, 0.541196
; rows at +160 and +176 (identical pair)
38 dd 1.000000, 0.707107, 1.000000, -0.707107
39 dd 1.000000, 0.707107, 1.000000, -0.707107
; row at +192
40 dd 0.707107, 0.707107, 0.707107, 0.707107
; NOTE(review): the lines in this span are fragments of helper macros; their
; %macro headers, remaining bodies and %endmacro lines are not visible in this
; excerpt, so only what each individual line does is described.
; Taken only when the sse2 cpuflag is set and the avx cpuflag is not.
49 %if cpuflag(sse2) && notcpuflag(avx)
; Two wrappers around BUTTERFLY0 that forward %1-%4 unchanged and differ only
; in the trailing immediate. Read as a 4x2-bit shuffle control, 0x1b selects
; elements 3,2,1,0 (full reversal) and 0xb1 selects 1,0,3,2 (swap within each
; pair) -- how BUTTERFLY0 consumes the value is not shown here; confirm.
63 BUTTERFLY0 %1, %2, %3, %4, 0x1b
67 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; Multiply registers m%2 and m%4 element-wise by the packed floats stored at
; byte offset 192 of ps_cos_vec.
75 mulps m%2, [ps_cos_vec+192]
79 mulps m%4, [ps_cos_vec+192]
; Opens PASS6_AND_PERMUTE, a macro that takes no parameters (body not shown).
82 %macro PASS6_AND_PERMUTE 0
; AVX variant; assembled only when HAVE_AVX_EXTERNAL is non-zero.
; NOTE(review): several instructions of this function (e.g. the loads of m4 and
; m2, and the macro bodies) are not visible in this excerpt.
193 %if HAVE_AVX_EXTERNAL
194 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
; cglobal operands (x86inc convention): 2 arguments, 3 general-purpose
; registers, 8 vector registers; named out, in, tmp (hence outq / inq below).
195 cglobal dct32_float, 2,3,8, out, in, tmp
; Build m5 from the 32 bytes at in+96: in+96 goes to the upper 128-bit lane,
; in+112 to the lower lane, then 0x1b reverses the four 32-bit elements inside
; each lane -- net effect: the eight elements at in+96..in+127 in reverse order.
198 vinsertf128 m5, m5, [inq+96], 1
199 vinsertf128 m5, m5, [inq+112], 0
200 vshufps m5, m5, m5, 0x1b
; Butterfly of m4 against the reversed vector using the coefficients at +0
; (m6 is passed as the fourth operand; presumably scratch -- macro not shown).
201 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same construction for the 32 bytes at in+32: m6 = elements at in+32..in+63
; in reverse order.
204 vinsertf128 m6, m6, [inq+32], 1
205 vinsertf128 m6, m6, [inq+48], 0
206 vshufps m6, m6, m6, 0x1b
207 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Next stage: both butterflies use the coefficient rows at +64.
211 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
212 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup 128-bit lanes: m3 = {high lane of m6, high lane of m4},
; m1 = {low lane of m6, low lane of m4}; then reverse the elements inside each
; lane of m3 before pairing it with m1.
216 vperm2f128 m3, m6, m4, 0x31
217 vperm2f128 m1, m6, m4, 0x20
218 vshufps m3, m3, m3, 0x1b
220 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; Same regrouping for m5/m2. m4 must be written first: it reads the old m5,
; which the following instruction overwrites.
223 vperm2f128 m4, m5, m2, 0x20
224 vperm2f128 m5, m5, m2, 0x31
225 vshufps m5, m5, m5, 0x1b
227 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; Load the sign mask into m6 and the coefficients at +128 into m2; both are
; shared by the four BUTTERFLY2 invocations (one per data register).
230 vmovaps m6, [ps_p1p1m1m1+0]
231 vmovaps m2, [ps_cos_vec+128]
233 BUTTERFLY2 m5, m6, m2, m7
234 BUTTERFLY2 m4, m6, m2, m7
235 BUTTERFLY2 m1, m6, m2, m7
236 BUTTERFLY2 m3, m6, m2, m7
; 0xcc picks elements 0,3,0,3 in each lane. If m6 still holds the mask loaded
; above (pattern 0,0,S,S) this turns it into 0,S,0,S -- TODO confirm that
; BUTTERFLY2 leaves m6 untouched.
240 vshufps m6, m6, m6, 0xcc
; Coefficients at +160 for the four BUTTERFLY3 invocations.
241 vmovaps m2, [ps_cos_vec+160]
243 BUTTERFLY3 m5, m6, m2, m7
244 BUTTERFLY3 m4, m6, m2, m7
245 BUTTERFLY3 m1, m6, m2, m7
246 BUTTERFLY3 m3, m6, m2, m7
; m6 = high lane of m3 replicated into both lanes.
248 vperm2f128 m6, m3, m3, 0x31
; Scatter 16-byte lanes to the output: m5 high -> out+64, m5 low -> out+32,
; m4 high -> out+80, m4 low -> out+48.
251 vextractf128 [outq+64], m5, 1
252 vextractf128 [outq+32], m5, 0
254 vextractf128 [outq+80], m4, 1
255 vextractf128 [outq+48], m4, 0
; m0 = high lane of m1 replicated into both lanes.
257 vperm2f128 m0, m1, m1, 0x31
; Aligned full-register store of m1 at out+96.
258 vmovaps [outq+96], m1
; NOTE(review): fragment of a macro body whose %macro header is not visible in
; this excerpt. PERMUTE and TRANSPOSE4x4PS are not defined in the visible lines
; (presumably provided by the included x86util.asm / x86inc -- confirm).
273 nop ; FIXME code alignment
; Register-number pairs handed to PERMUTE; exact renaming semantics are defined
; by that macro, not here.
279 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; Registers 8-11: 4x4 transpose followed by BUTTERFLY3V on the same four
; registers. Register 0 is passed as the fifth operand to both macros
; (presumably scratch -- confirm against the macro definitions).
280 TRANSPOSE4x4PS 8, 9, 10, 11, 0
281 BUTTERFLY3V 8, 9, 10, 11, 0
; Same two steps for registers 12-15.
283 TRANSPOSE4x4PS 12, 13, 14, 15, 0
284 BUTTERFLY3V 12, 13, 14, 15, 0
; NOTE(review): fragment of a macro body; the instructions interleaved between
; these stores (and the enclosing %macro header) are not visible here.
; Store the low 32-bit element of m8..m14 at out+0x00, +0x10, ... +0x60
; (one scalar every 16 bytes).
294 movss [outq+0x00], m8
296 movss [outq+0x10], m9
298 movss [outq+0x20], m10
300 movss [outq+0x30], m11
302 movss [outq+0x40], m12
304 movss [outq+0x50], m13
306 movss [outq+0x60], m14
; Unlike the scalar stores above, this is an aligned store of the whole m15
; register starting at out+0x70.
308 movaps [outq+0x70], m15
; Store the low 32-bit element of m0..m7 at out+0x08, +0x18, ... +0x78
; (again one scalar every 16 bytes, offset by 8 from the first series).
312 movss [outq+0x08], m0
314 movss [outq+0x18], m1
316 movss [outq+0x28], m2
318 movss [outq+0x38], m3
320 movss [outq+0x48], m4
322 movss [outq+0x58], m5
323 movss [outq+0x68], m6
324 movss [outq+0x78], m7
; Register-name bookkeeping via PERMUTE / SWAP: these are assembler macros that
; are not defined in the visible lines, and the code that runs between them is
; not shown -- consult the macro definitions for the resulting mapping.
326 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; Even-numbered registers, then odd-numbered registers.
329 SWAP 0, 2, 4, 6, 8, 10, 12, 14
330 SWAP 1, 3, 5, 7, 9, 11, 13, 15
; The same even/odd pair of SWAPs is applied a second time.
335 SWAP 0, 2, 4, 6, 8, 10, 12, 14
336 SWAP 1, 3, 5, 7, 9, 11, 13, 15
; SWAP across all sixteen register numbers in ascending order.
342 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL reg#, mempos: save vector register m<reg#> with an aligned store into
; the output buffer, which doubles as scratch space. Slot "mempos" lives at
; out + (mempos-8)*16, i.e. mempos 8 maps to out+0 and each further slot is 16
; bytes higher.
348 %macro SPILL 2 ; xmm#, mempos
349 movaps [outq+(%2-8)*16], m%1
; Inverse operation: reload m%1 from the same slot address.
; NOTE(review): the %endmacro of SPILL and the %macro header this load belongs
; to are not visible in this excerpt.
352 movaps m%1, [outq+(%2-8)*16]
; In this configuration PASS6 is simply an alias for PASS6_AND_PERMUTE.
355 %define PASS6 PASS6_AND_PERMUTE
; NOTE(review): the lines below are a fragment of a macro body; its header and
; the instructions between the BUTTERFLY3 calls are not visible here.
; m2 = coefficients at byte offset 160 of ps_cos_vec, shared by every
; BUTTERFLY3 below.
357 movaps m2, [ps_cos_vec+160]
; Each call passes a data register first, m3 second and m2 third; the fourth
; operand varies (m1, m5, m7) -- presumably whichever register is free as
; scratch at that point; confirm against BUTTERFLY3's definition.
360 BUTTERFLY3 m5, m3, m2, m1
364 BUTTERFLY3 m1, m3, m2, m5
367 BUTTERFLY3 m4, m3, m2, m5
370 BUTTERFLY3 m7, m3, m2, m5
374 BUTTERFLY3 m5, m3, m2, m7
378 BUTTERFLY3 m4, m3, m2, m7
381 BUTTERFLY3 m6, m3, m2, m7
384 BUTTERFLY3 m0, m3, m2, m7
; NOTE(review): only part of this function is visible in this excerpt; loads of
; several registers and the bodies of LOAD_INV / BUTTERFLY / BUTTERFLY2 are not
; shown, so the comments below describe operand usage only.
390 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; cglobal operands (x86inc convention): 2 arguments, 3 general-purpose
; registers, 16 vector registers; named out, in, tmp.
392 cglobal dct32_float, 2, 3, 16, out, in, tmp
; LOAD_INV fetches from in+112 into m1 (presumably element-reversed, by analogy
; with the name -- confirm); butterfly against m0 with coefficients at +0.
396 LOAD_INV m1, [inq+112]
397 BUTTERFLY m0, m1, [ps_cos_vec], m3
; Same pattern for in+48, coefficients at +32.
400 LOAD_INV m4, [inq+48]
401 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; m2 caches the coefficients at +64; it is reused by a later BUTTERFLY below
; before being reloaded.
404 movaps m2, [ps_cos_vec+64]
405 BUTTERFLY m1, m4, m2, m3
; Second half of the first stage: in+96 with coefficients at +16 ...
411 LOAD_INV m6, [inq+96]
412 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
; ... and in+32 with coefficients at +48.
415 LOAD_INV m5, [inq+32]
416 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; m2 still holds the +64 coefficients here.
419 BUTTERFLY m0, m7, m2, m3
; Switch m2 to the coefficients at +80 for the next two butterflies.
421 movaps m2, [ps_cos_vec+80]
422 BUTTERFLY m6, m5, m2, m3
424 BUTTERFLY m1, m4, m2, m3
; Switch m2 to the coefficients at +96 for the next four butterflies.
427 movaps m2, [ps_cos_vec+96]
429 BUTTERFLY m0, m1, m2, m3
435 BUTTERFLY m0, m5, m2, m3
439 BUTTERFLY m1, m6, m2, m3
443 BUTTERFLY m7, m4, m2, m3
; m3 (the fourth BUTTERFLY operand so far) is now loaded with the sign mask,
; and m2 with the coefficients at +128; both feed every BUTTERFLY2 below, with
; m1 passed as the fourth operand.
446 movaps m3, [ps_p1p1m1m1+0]
447 movaps m2, [ps_cos_vec+128]
449 BUTTERFLY2 m5, m3, m2, m1
451 BUTTERFLY2 m0, m3, m2, m1
454 BUTTERFLY2 m6, m3, m2, m1
458 BUTTERFLY2 m0, m3, m2, m1
461 BUTTERFLY2 m4, m3, m2, m1
463 BUTTERFLY2 m7, m3, m2, m1
466 BUTTERFLY2 m6, m3, m2, m1
469 BUTTERFLY2 m0, m3, m2, m1