1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Coefficient table: 13 rows of four single-precision values, 16 bytes per
; row (208 bytes total). Code in this file addresses it by byte offset; the
; offsets visible in this excerpt are +0, +16, +32, +48, +64, +80, +96, +128,
; +160 and +192.
; NOTE(review): the values look like cosine-derived DCT scale factors, as the
; label name and the file header suggest -- confirm against the C reference.
; Rows +0 .. +80: six distinct rows (the rows at +32 and +48 are negative).
27 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
28 dd 0.553104, 0.582935, 0.622504, 0.674808
29 dd -10.190008, -3.407609, -2.057781, -1.484165
30 dd -1.169440, -0.972568, -0.839350, -0.744536
31 dd 0.502419, 0.522499, 0.566944, 0.646822
32 dd 0.788155, 1.060678, 1.722447, 5.101149
; Rows +96/+112, +128/+144 and +160/+176 are stored as identical pairs.
; NOTE(review): presumably so that a 32-byte load sees the same four
; coefficients in both 128-bit halves -- confirm against the AVX code path.
33 dd 0.509796, 0.601345, 0.899976, 2.562916
34 dd 0.509796, 0.601345, 0.899976, 2.562916
35 dd 1.000000, 1.000000, 1.306563, 0.541196
36 dd 1.000000, 1.000000, 1.306563, 0.541196
37 dd 1.000000, 0.707107, 1.000000, -0.707107
38 dd 1.000000, 0.707107, 1.000000, -0.707107
; Row +192: 0.707107 (approximately 1/sqrt(2)) in all four lanes.
39 dd 0.707107, 0.707107, 0.707107, 0.707107
; Eight dwords: the pattern {0, 0, 0x80000000, 0x80000000} repeated twice.
; 0x80000000 is the sign bit of an IEEE-754 single-precision value.
; NOTE(review): the name suggests a (+1, +1, -1, -1) sign mask to be XORed
; onto packed floats; the macro that consumes it is not visible here.
42 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; NOTE(review): the lines below are bodies of several macros whose %macro /
; %endmacro lines are not part of this excerpt; comments describe only what
; each visible line does.
; Taken when the sse2 cpuflag is set and the avx cpuflag is not.
51 %if cpuflag(sse2) && notcpuflag(avx)
; Forwards the enclosing macro's four parameters to BUTTERFLY0 and appends
; the immediate 0x1b. Read as a 4-element shuffle selector, 0x1b picks
; elements 3,2,1,0 (reverse order) -- BUTTERFLY0's use of it is not shown.
65 BUTTERFLY0 %1, %2, %3, %4, 0x1b
; Same forwarding with immediate 0xb1; read as a shuffle selector it picks
; elements 1,0,3,2 (swaps the two members of each adjacent pair).
69 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; Scale registers m%2 / m%4 by the four floats at ps_cos_vec+192
; (the last table row, 0.707107 in every lane).
77 mulps m%2, [ps_cos_vec+192]
81 mulps m%4, [ps_cos_vec+192]
; Opens macro PASS6_AND_PERMUTE, which takes no parameters; its body is not
; part of this excerpt.
84 %macro PASS6_AND_PERMUTE 0
; AVX variant, assembled only when HAVE_AVX_EXTERNAL is non-zero.
; NOTE(review): several instructions of this function (the initial loads of
; m4/m2, some stores, the epilogue) are not part of this excerpt; BUTTERFLY,
; BUTTERFLY2 and BUTTERFLY3 are macros defined outside the visible lines.
195 %if HAVE_AVX_EXTERNAL
196 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
; cglobal arguments: 2 function arguments, 3 general-purpose registers,
; 8 vector registers; named registers are out, in and tmp.
197 cglobal dct32_float, 2,3,8, out, in, tmp
; Build m5 from input bytes 96..127: bytes 96..111 go to the upper 128-bit
; lane, bytes 112..127 to the lower lane, then 0x1b reverses the four floats
; inside each lane. Element 0 upward therefore holds in[31], in[30], ...,
; in[24] (with 4-byte samples).
200 vinsertf128 m5, m5, [inq+96], 1
201 vinsertf128 m5, m5, [inq+112], 0
202 vshufps m5, m5, m5, 0x1b
; m4 is set up by an instruction that is not visible in this excerpt.
203 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same construction for input bytes 32..63: m6 holds in[15] ... in[8].
206 vinsertf128 m6, m6, [inq+32], 1
207 vinsertf128 m6, m6, [inq+48], 0
208 vshufps m6, m6, m6, 0x1b
; m2 is set up by an instruction that is not visible in this excerpt.
209 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; Two butterflies using the coefficients at table offset +64.
213 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
214 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; Regroup 128-bit lanes: m3 = {upper(m6), upper(m4)}, m1 = {lower(m6),
; lower(m4)}; m3 then has the floats of each lane reversed.
218 vperm2f128 m3, m6, m4, 0x31
219 vperm2f128 m1, m6, m4, 0x20
220 vshufps m3, m3, m3, 0x1b
222 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
; Same regrouping for m5/m2: m4 = {lower(m5), lower(m2)},
; m5 = {upper(m5), upper(m2)} with each lane reversed.
225 vperm2f128 m4, m5, m2, 0x20
226 vperm2f128 m5, m5, m2, 0x31
227 vshufps m5, m5, m5, 0x1b
229 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; Load the {0, 0, sign, sign} mask and the coefficients at +128, then apply
; BUTTERFLY2 to each of the four data registers (m7 is the fourth argument
; every time).
232 vmovaps m6, [ps_p1p1m1m1+0]
233 vmovaps m2, [ps_cos_vec+128]
235 BUTTERFLY2 m5, m6, m2, m7
236 BUTTERFLY2 m4, m6, m2, m7
237 BUTTERFLY2 m1, m6, m2, m7
238 BUTTERFLY2 m3, m6, m2, m7
; 0xcc selects elements 0,3,0,3 of each lane. If m6 still holds the mask
; loaded above this yields {0, sign, 0, sign} -- assumes BUTTERFLY2 does not
; modify its second argument; confirm in the macro definition.
242 vshufps m6, m6, m6, 0xcc
243 vmovaps m2, [ps_cos_vec+160]
245 BUTTERFLY3 m5, m6, m2, m7
246 BUTTERFLY3 m4, m6, m2, m7
247 BUTTERFLY3 m1, m6, m2, m7
248 BUTTERFLY3 m3, m6, m2, m7
; Copy the upper 128-bit lane of m3 into both lanes of m6.
250 vperm2f128 m6, m3, m3, 0x31
; Store m5: upper lane to out+64, lower lane to out+32.
253 vextractf128 [outq+64], m5, 1
254 vextractf128 [outq+32], m5, 0
; Store m4: upper lane to out+80, lower lane to out+48.
256 vextractf128 [outq+80], m4, 1
257 vextractf128 [outq+48], m4, 0
; Copy the upper lane of m1 into both lanes of m0, then store all of m1 at
; out+96 with an aligned store.
259 vperm2f128 m0, m1, m1, 0x31
260 vmovaps [outq+96], m1
; NOTE(review): fragment of a macro body; the enclosing %macro line is not
; part of this excerpt.
275 nop ; FIXME code alignment
; PERMUTE takes (a, b) register-number pairs. NOTE(review): in x86inc,
; PERMUTE and SWAP rename m# registers at assembly time instead of emitting
; moves -- confirm the exact pair direction in x86inc.asm.
281 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; Registers 8..11: 4x4 transpose (register 0 passed as the fifth argument),
; then BUTTERFLY3V on the same four registers with the same fifth argument.
282 TRANSPOSE4x4PS 8, 9, 10, 11, 0
283 BUTTERFLY3V 8, 9, 10, 11, 0
; Same two steps for registers 12..15.
285 TRANSPOSE4x4PS 12, 13, 14, 15, 0
286 BUTTERFLY3V 12, 13, 14, 15, 0
; NOTE(review): fragment of a macro body; instructions between the visible
; lines (and the enclosing %macro line) are not part of this excerpt.
; Store the low float of m8..m14 at out+0x00, 0x10, ..., 0x60, i.e. one
; float every 16 bytes.
296 movss [outq+0x00], m8
298 movss [outq+0x10], m9
300 movss [outq+0x20], m10
302 movss [outq+0x30], m11
304 movss [outq+0x40], m12
306 movss [outq+0x50], m13
308 movss [outq+0x60], m14
; m15 is written with a full 16-byte aligned store covering out+0x70..0x7f;
; the float at out+0x78 is overwritten again by the movss from m7 below.
310 movaps [outq+0x70], m15
; Store the low float of m0..m7 at out+0x08, 0x18, ..., 0x78.
314 movss [outq+0x08], m0
316 movss [outq+0x18], m1
318 movss [outq+0x28], m2
320 movss [outq+0x38], m3
322 movss [outq+0x48], m4
324 movss [outq+0x58], m5
325 movss [outq+0x68], m6
326 movss [outq+0x78], m7
; Register renaming for the steps that follow. NOTE(review): PERMUTE/SWAP
; are x86inc assembly-time renames, not data moves -- confirm semantics in
; x86inc.asm.
328 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; The even/odd SWAP pair appears twice, followed by one SWAP over all
; sixteen registers; the instructions between them are not visible here.
331 SWAP 0, 2, 4, 6, 8, 10, 12, 14
332 SWAP 1, 3, 5, 7, 9, 11, 13, 15
337 SWAP 0, 2, 4, 6, 8, 10, 12, 14
338 SWAP 1, 3, 5, 7, 9, 11, 13, 15
344 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL reg, mempos: save vector register m<reg> into the output buffer at
; byte offset (mempos-8)*16, so mempos 8 maps to out+0 and each further
; position is 16 bytes higher. The store is an aligned 16-byte movaps.
350 %macro SPILL 2 ; xmm#, mempos
351 movaps [outq+(%2-8)*16], m%1
; Reverse operation: reload m<reg> from the same slot.
; NOTE(review): the %endmacro of SPILL and the %macro line that owns this
; load (presumably UNSPILL) are not part of this excerpt.
354 movaps m%1, [outq+(%2-8)*16]
; Make PASS6 an alias for the PASS6_AND_PERMUTE macro.
357 %define PASS6 PASS6_AND_PERMUTE
; NOTE(review): fragment of a macro body; the instructions between the
; visible lines (and the enclosing %macro line) are not part of this excerpt.
; Load the coefficients at table offset +160 into m2; every BUTTERFLY3 below
; passes m3 as its second argument and m2 as its third. m3 is set up outside
; the visible lines.
359 movaps m2, [ps_cos_vec+160]
362 BUTTERFLY3 m5, m3, m2, m1
366 BUTTERFLY3 m1, m3, m2, m5
369 BUTTERFLY3 m4, m3, m2, m5
372 BUTTERFLY3 m7, m3, m2, m5
; From here on m7 is passed as the fourth argument instead of m5/m1.
376 BUTTERFLY3 m5, m3, m2, m7
380 BUTTERFLY3 m4, m3, m2, m7
383 BUTTERFLY3 m6, m3, m2, m7
386 BUTTERFLY3 m0, m3, m2, m7
392 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
394 cglobal dct32_float, 2, 3, 16, out, in, tmp
398 LOAD_INV m1, [inq+112]
399 BUTTERFLY m0, m1, [ps_cos_vec], m3
402 LOAD_INV m4, [inq+48]
403 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
406 movaps m2, [ps_cos_vec+64]
407 BUTTERFLY m1, m4, m2, m3
413 LOAD_INV m6, [inq+96]
414 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
417 LOAD_INV m5, [inq+32]
418 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
421 BUTTERFLY m0, m7, m2, m3
423 movaps m2, [ps_cos_vec+80]
424 BUTTERFLY m6, m5, m2, m3
426 BUTTERFLY m1, m4, m2, m3
429 movaps m2, [ps_cos_vec+96]
431 BUTTERFLY m0, m1, m2, m3
437 BUTTERFLY m0, m5, m2, m3
441 BUTTERFLY m1, m6, m2, m3
445 BUTTERFLY m7, m4, m2, m3
448 movaps m3, [ps_p1p1m1m1+0]
449 movaps m2, [ps_cos_vec+128]
451 BUTTERFLY2 m5, m3, m2, m1
453 BUTTERFLY2 m0, m3, m2, m1
456 BUTTERFLY2 m6, m3, m2, m1
460 BUTTERFLY2 m0, m3, m2, m1
463 BUTTERFLY2 m4, m3, m2, m1
465 BUTTERFLY2 m7, m3, m2, m1
468 BUTTERFLY2 m6, m3, m2, m1
471 BUTTERFLY2 m0, m3, m2, m1