1 ;******************************************************************************
2 ;* 36 point SSE-optimized IMDCT transform
3 ;* Copyright (c) 2011 Vitor Sessak
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; ---------------------------------------------------------------------------
; Read-only constant tables. Each row is 4 dwords (16 bytes) so it can be used
; directly as a packed-single memory operand.
; NOTE(review): the section/alignment directives are not visible in this view.
; ---------------------------------------------------------------------------
; Lane masks, applied with andps in imdct36_float: a lane holding ~0 (all ones)
; is kept, a lane holding 0 is cleared.
26 ps_mask: dd 0, ~0, ~0, ~0 ; keep lanes 1..3, clear lane 0
27 ps_mask2: dd 0, ~0, 0, ~0 ; keep lanes 1 and 3
28 ps_mask3: dd 0, 0, 0, ~0 ; keep lane 3 only
29 ps_mask4: dd 0, ~0, 0, 0 ; keep lane 1 only
; Multiplier vectors used as mulps operands in imdct36_float. Numerically the
; magnitudes are sines/cosines of multiples of 10 degrees:
;   0.8660254038 = cos 30   0.1736481777 = sin 10   0.3420201433 = sin 20
;   0.7660444431 = cos 40   0.9396926208 = cos 20   0.9848077530 = cos 10
;   0.6427876097 = sin 40
31 ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
32 ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
33 ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
34 ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
35 ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
36 ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
37 ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
; Sign-flip masks, applied with xorps: 0x80000000 is the sign bit of a 32-bit
; float, so XOR negates exactly the lanes that carry it.
39 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 ; negate lanes 2 and 3
40 ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 ; negate lanes 1 and 3
; ps_cosh: five 16-byte rows, addressed as [ps_cosh + byte_offset] by the
; helper fragments that take the offset as macro argument %3.
; Numerically the non-unit magnitudes equal 1/(2*cos(k*5 degrees)) for odd k.
42 ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
43 dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
44 dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
45 dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
46 dd 1.0, 0.70710678118654752439, 0.0, 0.0
; ps_cosh_sse3: same rows as ps_cosh with the sign of lanes 1 and 3 flipped
; (the 0.0 in the last row is unchanged). In the visible code the ps_cosh
; multiply is paired with an xorps by ps_p1m1p1m1, which flips those same
; lanes; this table has no such xorps next to it.
; NOTE(review): lines between the pairs are not visible - confirm.
48 ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
49 dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
50 dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
51 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
52 dd 1.0, -0.70710678118654752439, 0.0, 0.0
; costabs: each constant is replicated into all 4 lanes ("times 4 dd"), giving
; one 16-byte row per constant, addressed as [costabs + 16*k] in
; four_imdct36_float.
; NOTE(review): the code indexes rows up to 16*17, but only six rows are
; visible in this view - the table is longer than what is shown here, so do not
; derive row numbers from the visible lines alone.
54 costabs: times 4 dd 0.98480773
57 times 4 dd -0.76604444
58 times 4 dd -0.64278764
60 times 4 dd -0.50000000
61 times 4 dd -0.34202015
62 times 4 dd -0.17364818
; ---------------------------------------------------------------------------
; Helper macro fragments. Several macro headers, %else/%endif and %endmacro
; lines are not visible in this view; comments below describe only the
; instructions that are shown.
; Lane numbering: lane 0 is the lowest 32 bits of the xmm register.
; ---------------------------------------------------------------------------
; Assemble-time test: true when the selected instruction set has SSE2 but not
; AVX. The body guarded by this condition is not visible here.
77 %if cpuflag(sse2) && notcpuflag(avx)
84 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
85 ; output %1={x3,x4,y1,y2}
86 %macro BUILDINVHIGHLOW 3
; imm 0x4e selects lanes 2,3 of %2 for the low half and lanes 0,1 of %3 for the
; high half, i.e. {x3,x4,y1,y2}.
88 shufps %1, %2, %3, 0x4e
95 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
96 ; output %1={x4,y1,y2,y3}
; Variant 1: one palignr - concatenate %3:%2 and shift right by 12 bytes.
; NOTE(review): presumably guarded by an SSSE3 check that is not visible here.
99 palignr %1, %3, %2, 12
; Variant 2: two shuffles. First %1={x3,x4,y1,y2}, then imm 0x99 picks lanes
; 1,2 of %1 and lanes 1,2 of %3, giving {x4,y1,y2,y3}.
101 BUILDINVHIGHLOW %1, %2, %3
102 shufps %1, %1, %3, 0x99
; Negate lanes 2 and 3 of %1 (sign-bit XOR).
117 xorps %1, [ps_p1p1m1m1]
; Scale by the ps_cosh row at byte offset %3. Two forms are visible: a single
; multiply by the sign-adjusted ps_cosh_sse3 table, or a multiply by ps_cosh
; followed by negating lanes 1 and 3.
; NOTE(review): the selecting conditional is not visible.
120 mulps %1, %1, [ps_cosh_sse3 + %3]
124 mulps %1, [ps_cosh + %3]
126 xorps %1, [ps_p1m1p1m1]
; Same multiply/sign-flip pair again; presumably belongs to a second helper
; macro whose header is not visible.
133 mulps %1, %1, [ps_cosh_sse3 + %3]
137 mulps %1, [ps_cosh + %3]
139 xorps %1, [ps_p1m1p1m1]
; Strided store: write lanes 1..3 of %1 to base %3 plus 1x, 2x and 3x the
; stride %4. The lane-0 store is not visible in this view.
147 extractps dword [%3 + %4], %1, 1
148 extractps dword [%3 + 2*%4], %1, 2
149 extractps dword [%3 + 3*%4], %1, 3
; Alternative store form: movss writes lane 0 of %2 to the strided slots.
; NOTE(review): %2 is presumably a scratch register that receives the wanted
; lane between these stores; the moves that fill it are not visible.
153 movss [%3 + 2*%4], %2
157 movss [%3 + 3*%4], %2
; Strided load fragment: fill the low and the high 64 bits of %2 from
; base + 2*stride and base + 3*stride respectively.
164 movlps %2, [%3 + 2*%4]
165 movhps %2, [%3 + 3*%4]
; ---------------------------------------------------------------------------
; DEFINE_IMDCT: emits imdct36_float for the currently selected instruction set.
; Arguments (named registers outq, bufq, inq, winq): out, buf, in, win.
; The "4,4,9" triple follows the x86inc cglobal convention (argument count,
; general-purpose registers used, xmm registers used).
; Only part of the body is visible in this view; loads of the input, most
; adds/subs and the macro terminator are missing, so the comments below
; describe individual visible instructions only.
; ---------------------------------------------------------------------------
178 %macro DEFINE_IMDCT 0
179 cglobal imdct36_float, 4,4,9, out, buf, in, win
181 ; for(i=17;i>=1;i--) in[i] += in[i-1];
; ps_mask clears lane 0 and keeps lanes 1..3 of m6.
188 andps m6, m6, [ps_mask]
202 BUILDINVHIGHLOW m6, m3, m4
203 shufps m6, m6, m4, 0xa9
209 ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
; ps_mask3 keeps only lane 3; ps_mask2 keeps lanes 1 and 3; ps_mask4 keeps only
; lane 1. Every other lane of the masked register becomes 0.
211 andps m5, m5, [ps_mask3]
213 BUILDINVHIGHLOW m7, m0, m1
214 andps m7, m7, [ps_mask2]
218 BUILDINVHIGHLOW m6, m1, m2
219 andps m6, m6, [ps_mask2]
223 BUILDINVHIGHLOW m7, m2, m3
224 andps m7, m7, [ps_mask2]
229 andps m6, m6, [ps_mask4]
235 movlhps m6, m1, m5 ; zero out high values
; Multiplications by the ps_val* constant vectors (see the constant tables).
244 mulps m7, m2, [ps_val1]
247 mulps m5, m8, [ps_val2]
249 mulps m5, m5, [ps_val2]
253 mulps m5, m6, [ps_val1]
; imm 0xe4 keeps the lane order: result = {src1[0], src1[1], src2[2], src2[3]}.
265 shufps m6, m4, m3, 0xe4
267 mulps m6, m6, [ps_val3]
270 mulps m4, m4, [ps_val4]
272 shufps m1, m1, m0, 0xe4
274 mulps m1, m1, [ps_val5]
276 mulps m3, m3, [ps_val6]
277 mulps m0, m0, [ps_val7]
; XOR with ps_p1p1m1m1 negates lanes 2 and 3.
280 xorps m2, m1, [ps_p1p1m1m1]
286 xorps m3, m3, [ps_p1p1m1m1]
288 shufps m0, m0, m4, 0xe4
292 BUILDINVHIGHLOW m4, m2, m3
293 shufps m3, m3, m2, 0x4e
295 ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
304 ; m0 0 1 2 3 => 2 6 10 14 m1
305 ; m7 4 5 6 7 => 3 7 11 15 m2
306 ; m3 8 9 10 11 => 17 13 9 5 m3
307 ; m4 12 13 14 15 => 16 12 8 4 m5
308 ; m5 16 17 xx xx => 0 1 xx xx m0
; Output stage. Visible pattern: multiply by a window row from win, read values
; from buf, then store to out with a stride of 4*SBLIMIT bytes.
; SBLIMIT is defined outside this view.
323 movss m4, [bufq + 4*68]
324 movss m7, [bufq + 4*64]
326 mulps m6, m6, [winq + 16*4]
; Lane 0 goes to out + 64*SBLIMIT; imm 0xb1 then swaps lanes pairwise
; (0<->1, 2<->3) so the former lane 1 is stored to out + 68*SBLIMIT.
328 movss [outq + 64*SBLIMIT], m6
329 shufps m6, m6, m6, 0xb1
330 movss [outq + 68*SBLIMIT], m6
332 mulps m6, m3, [winq + 4*4]
333 LOAD m4, m7, bufq + 4*16, 16
335 STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
337 shufps m4, m0, m3, 0xb5
338 mulps m4, m4, [winq + 8*4]
339 LOAD m7, m6, bufq + 4*32, 16
341 STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
343 shufps m3, m3, m2, 0xb1
344 mulps m3, m3, [winq + 12*4]
345 LOAD m7, m6, bufq + 4*48, 16
347 STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
350 LOAD m6, m7, bufq, 16
352 STORE m2, m7, outq, 4*SBLIMIT
; From here on, windowed values (win offsets 20*4 .. 36*4) are written back
; into buf with a 16-byte stride.
; NOTE(review): presumably the state consumed by the next call - confirm
; against the C reference.
354 mulps m4, m1, [winq + 20*4]
355 STORE m4, m7, bufq, 16
357 mulps m3, m5, [winq + 24*4]
358 STORE m3, m7, bufq + 4*16, 16
360 shufps m0, m0, m5, 0xb0
361 mulps m0, m0, [winq + 28*4]
362 STORE m0, m7, bufq + 4*32, 16
364 shufps m5, m5, m1, 0xb1
365 mulps m5, m5, [winq + 32*4]
366 STORE m5, m7, bufq + 4*48, 16
; Pairwise lane swap, window multiply, then two scalar stores of lane 0 of m1.
; NOTE(review): an instruction between the two movss lines is not visible; it
; presumably moves a different lane into lane 0 before the second store.
368 shufps m1, m1, m1, 0xb1
369 mulps m1, m1, [winq + 36*4]
370 movss [bufq + 4*64], m1
372 movss [bufq + 4*68], m1
; ---------------------------------------------------------------------------
; Register-spill helpers used by four_imdct36_float.
; NOTE(review): the %if/%else/%endif and %endmacro lines that structure this
; region are not visible in this view; the guard on the next line presumably
; belongs to an AVX instantiation that precedes these helpers.
; ---------------------------------------------------------------------------
390 %if HAVE_AVX_EXTERNAL
; SPILLED(x), register form: token-pastes to the register name m<x>.
400 %define SPILLED(x) m %+ x
; SPILLED(x), memory form: a 16-byte slot in the tmp buffer at byte offset
; (x-8)*16 + 128, so x=8 maps to tmpq+128.
; NOTE(review): which form is active is chosen by a conditional not shown here.
402 %define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
403 %macro SPILL 2 ; xmm#, mempos
; Save register m%1 into slot %2. movaps needs a 16-byte-aligned memory
; operand, so the memory form assumes tmpq is 16-byte aligned.
404 movaps SPILLED(%2), m%1
; Reverse direction: reload register m%1 from slot %2.
; NOTE(review): presumably the body of a matching unspill macro whose header
; is not visible.
407 movaps m%1, SPILLED(%2)
; ---------------------------------------------------------------------------
; DEFINE_FOUR_IMDCT: emits four_imdct36_float for the currently selected
; instruction set.
; Arguments (named registers outq, bufq, inq, winq, tmpq):
;   out, buf, in, win, tmp.
; The "5,5,16" triple follows the x86inc cglobal convention (argument count,
; general-purpose registers used, xmm registers used).
; Visible structure: rows are read from in at offsets k*72 bytes (72 bytes =
; 18 dwords) and transposed 4x4, combined with costabs rows, then multiplied by
; win rows and added to buf rows.
; Only part of the body is visible in this view (stores to out, most adds/subs
; and the macro terminator are missing).
; ---------------------------------------------------------------------------
411 %macro DEFINE_FOUR_IMDCT 0
412 cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
; Last two dwords (byte offset 64) of the rows at +72, +2*72 and +3*72.
; The load of the row at +0 is not visible.
414 movhps m0, [inq+64 + 72]
415 movlps m3, [inq+64 + 2*72]
416 movhps m3, [inq+64 + 3*72]
; imm 0xdd gathers the odd lanes (1,3) of both sources, 0x88 the even lanes
; (0,2): this de-interleaves the dword pairs loaded above.
418 shufps m5, m0, m3, 0xdd
419 shufps m0, m0, m3, 0x88
; Rows at +72 and +3*72 use unaligned loads; +2*72 = 144 is a multiple of 16
; and uses an aligned load.
; NOTE(review): assumes inq itself is 16-byte aligned - confirm with caller.
422 movu m6, [inq+48 + 72]
423 mova m7, [inq+48 + 2*72]
424 movu m3, [inq+48 + 3*72]
; TRANSPOSE4x4PS comes from the included x86util.asm.
; NOTE(review): presumably four row registers followed by one scratch register.
426 TRANSPOSE4x4PS 1, 6, 7, 3, 4
441 movu m5, [inq+32 + 72]
442 mova m2, [inq+32 + 2*72]
443 movu m7, [inq+32 + 3*72]
445 TRANSPOSE4x4PS 4, 5, 2, 7, 3
461 movu m7, [inq+16 + 72]
462 mova m1, [inq+16 + 2*72]
463 movu m6, [inq+16 + 3*72]
465 TRANSPOSE4x4PS 2, 7, 1, 6, 3
474 mulps m6, [costabs + 16*2]
481 mova m3, [inq + 2*72]
482 movu m5, [inq + 3*72]
484 TRANSPOSE4x4PS 1, 6, 3, 5, 0
; Arithmetic stage: adds/subs against spilled values and multiplies by
; broadcast constants costabs[k] (one 16-byte row per k).
494 addps m6, m4, SPILLED(12)
499 mulps m7, [costabs + 16*5]
501 mulps m0, m6, [costabs + 16*6]
507 mulps m6, [costabs + 16*1]
508 subps m4, SPILLED(12)
509 mulps m4, [costabs + 16*8]
510 addps m2, SPILLED(12)
511 mulps m2, [costabs + 16*3]
523 mulps m5, [costabs + 16*7]
525 mulps m1, [costabs + 16*2]
527 mulps m4, [costabs + 16*4]
532 mulps m3, [costabs + 16*2]
; The same constant rows (5, 6, 1, 8, 3, then 2, 7, 4) are applied again below
; to a second set of registers.
543 addps m1, m0, SPILLED(15)
545 mova m4, [costabs + 16*5]
550 mulps m5, m1, [costabs + 16*6]
558 mulps m5, [costabs + 16*1]
559 mulps m7, [costabs + 16*8]
561 mulps m0, [costabs + 16*3]
573 subps m0, SPILLED(11)
574 mulps m0, [costabs + 16*2]
575 addps m4, m7, SPILLED(11)
578 mulps m7, [costabs + 16*7]
579 addps m2, SPILLED(11)
580 mulps m2, [costabs + 16*4]
; Reads a value kept in the tmp buffer; the store that wrote it is not visible.
581 addps m1, m7, [tmpq+4*8]
; Output stage. The visible pattern repeats per group: form a sum and a
; difference against a spilled value, scale with a pair of costabs rows
; (9/17, 10/16, 11/15, 12/14, 13), multiply by rows of win and, where shown,
; add the matching row of buf. The stores of the results are not visible.
588 addps m4, m6, SPILLED(10)
589 subps m6, SPILLED(10)
591 mulps m2, [costabs + 16*9]
593 mulps m5, [costabs + 16*17]
596 mulps m2, m1, [winq+4*36]
597 addps m2, [bufq+4*36]
599 mulps m1, [winq+4*32]
600 addps m1, [bufq+4*32]
602 mulps m1, m4, [winq+4*116]
604 mulps m4, [winq+4*112]
608 mulps m1, m6, [winq+4*68]
609 addps m1, [bufq+4*68]
614 mulps m1, m2, [winq+4*148]
616 mulps m2, [winq+4*80]
; Second read from the tmp buffer (dword offset 24).
618 addps m5, m3, [tmpq+4*24]
623 mulps m1, [costabs + 16*10]
625 mulps m0, [costabs + 16*16]
628 mulps m3, m5, [winq+4*40]
629 addps m3, [bufq+4*40]
631 mulps m5, [winq+4*28]
632 addps m5, [bufq+4*28]
634 mulps m1, m6, [winq+4*120]
636 mulps m6, [winq+4*108]
640 mulps m5, m2, [winq+4*64]
641 addps m5, [bufq+4*64]
646 mulps m0, m1, [winq+4*144]
648 mulps m1, [winq+4*84]
652 addps m1, SPILLED(13)
653 subps m5, SPILLED(13)
656 mulps m2, [costabs + 16*11]
658 mulps m3, [costabs + 16*15]
662 mulps m6, m1, [winq+4*44]
663 addps m6, [bufq+4*44]
665 mulps m1, [winq+4*24]
666 addps m1, [bufq+4*24]
668 mulps m0, m2, [winq+4*124]
670 mulps m2, [winq+4*104]
674 mulps m1, m5, [winq+4*60]
675 addps m1, [bufq+4*60]
680 mulps m1, m0, [winq+4*140]
682 mulps m0, [winq+4*88]
685 addps m1, SPILLED(12)
687 subps m2, SPILLED(12)
689 subps m0, m7, SPILLED(11)
690 addps m7, SPILLED(11)
691 mulps m4, m7, [costabs + 16*12]
692 mulps m0, [costabs + 16*14]
695 mulps m7, m1, [winq+4*48]
696 addps m7, [bufq+4*48]
698 mulps m1, [winq+4*20]
699 addps m1, [bufq+4*20]
701 mulps m1, m5, [winq+4*128]
703 mulps m5, [winq+4*100]
707 mulps m1, m2, [winq+4*56]
708 addps m1, [bufq+4*56]
710 mulps m2, [winq+4*12]
711 addps m2, [bufq+4*12]
713 mulps m0, m6, [winq+4*136]
715 mulps m6, [winq+4*92]
718 mulps m0, [costabs + 16*13]
722 mulps m0, m3, [winq+4*52]
723 addps m0, [bufq+4*52]
725 mulps m3, [winq+4*16]
726 addps m3, [bufq+4*16]
728 mulps m0, m2, [winq+4*132]
730 mulps m2, [winq+4*96]
; Assemble-time guard on AVX availability.
; NOTE(review): the macro terminator before it and the guarded body after it
; are not visible in this view.
738 %if HAVE_AVX_EXTERNAL