1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
; ps_cos_vec: table of single-precision constants, nine rows of four dwords
; (16 bytes per row, 144 bytes in total).  The code in this file addresses it
; by byte offset, one whole row at a time:
;   +0, +16, +32, +48      passed directly as memory operands to BUTTERFLY
;   +64, +80, +96, +112, +128   loaded into m2 with movaps before use
; movaps with a memory operand requires that address to be 16-byte aligned.
; NOTE(review): no section/alignment directive is visible in this excerpt --
; confirm the label is 16-byte aligned where it is defined.
; NOTE(review): the values look like cosine-derived DCT scale factors (the
; last row holds +/-0.707107, i.e. about 1/sqrt(2)); their derivation is not
; shown here, so treat that reading as an assumption.
27 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 ; row 0, offset +0
28 dd 0.553104, 0.582935, 0.622504, 0.674808 ; row 1, offset +16
29 dd -1.169440, -0.972568, -0.839350, -0.744536 ; row 2, offset +32
30 dd -10.190008, -3.407609, -2.057781, -1.484165 ; row 3, offset +48
31 dd 0.502419, 0.522499, 0.566944, 0.646822 ; row 4, offset +64
32 dd 0.788155, 1.060678, 1.722447, 5.101149 ; row 5, offset +80
33 dd 0.509796, 0.601345, 0.899976, 2.562916 ; row 6, offset +96
34 dd 1.000000, 1.000000, 1.306563, 0.541196 ; row 7, offset +112
35 dd 1.000000, 0.707107, 1.000000, -0.707107 ; row 8, offset +128
; ps_p1p1m1m1: four dwords; elements 0-1 are zero, elements 2-3 have only
; bit 31 set (0x80000000), which is the sign bit of an IEEE-754 single.
; The function loads this row into m3 and passes m3 as the second operand of
; every BUTTERFLY2 / BUTTERFLY3 invocation.
; NOTE(review): the name reads as "+1,+1,-1,-1", i.e. presumably a mask that
; flips the sign of lanes 2-3 when XORed into a vector -- the macro that
; consumes it is not visible in this excerpt, so confirm.
38 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
; Forwards macro parameters %1-%4 unchanged to BUTTERFLY0 and appends the
; fixed fifth argument 0x1b.
; NOTE(review): the enclosing %macro/%endmacro lines and BUTTERFLY0 itself are
; not visible in this excerpt; going by the 4-operand call sites this is
; presumably the body of BUTTERFLY2 or BUTTERFLY3 -- confirm which one.
; NOTE(review): if the immediate is used as a shufps-style lane selector,
; 0x1b (00 01 10 11b) picks source lanes 3,2,1,0, i.e. reverses the vector.
56 BUTTERFLY0 %1, %2, %3, %4, 0x1b
; Forwards macro parameters %1-%4 unchanged to BUTTERFLY0 and appends the
; fixed fifth argument 0xb1.
; NOTE(review): the enclosing %macro/%endmacro lines and BUTTERFLY0 itself are
; not visible in this excerpt; going by the 4-operand call sites this is
; presumably the body of BUTTERFLY2 or BUTTERFLY3 -- confirm which one.
; NOTE(review): if the immediate is used as a shufps-style lane selector,
; 0xb1 (10 11 00 01b) picks source lanes 1,0,3,2, i.e. swaps adjacent lanes.
60 BUTTERFLY0 %1, %2, %3, %4, 0xb1
; NOTE(review): the embedded original line numbers are not contiguous
; (e.g. 72 -> 77 -> 81), so the loads, shuffles and stores that sit between
; the visible instructions are missing from this excerpt, and no return is
; visible either.  The comments below describe only what the visible lines
; show; they do not describe the full algorithm.
64 section .text align=16
65 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; Declares the function with the argument names out and in plus a third
; name, tmp.
; NOTE(review): cglobal is presumably the x86inc.asm declaration macro, in
; which case "2,3,8" would mean 2 arguments, 3 general-purpose registers and
; 8 vector registers; the visible code does use exactly m0-m7.  Confirm
; against the include, which is not shown here.
66 cglobal dct32_float_sse, 2,3,8, out, in, tmp
; BUTTERFLY invocations.  m3 is always the fourth operand.  Table rows 0-3
; (+0, +16, +32, +48) are passed straight from memory as the third operand;
; rows 4-6 are first loaded into m2.
72 BUTTERFLY m0, m1, [ps_cos_vec], m3
77 BUTTERFLY m7, m4, [ps_cos_vec+48], m3
; m2 = row 4 (+64); no visible instruction reloads m2 before the m0/m7
; butterfly further down, which takes m2 as its third operand too.
81 movaps m2, [ps_cos_vec+64]
82 BUTTERFLY m1, m4, m2, m3
90 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
95 BUTTERFLY m4, m5, [ps_cos_vec+32], m3
98 BUTTERFLY m0, m7, m2, m3
; m2 = row 5 (+80), third operand of the next two butterflies.
100 movaps m2, [ps_cos_vec+80]
101 BUTTERFLY m6, m5, m2, m3
103 BUTTERFLY m1, m4, m2, m3
; m2 = row 6 (+96), third operand of the next four butterflies.
106 movaps m2, [ps_cos_vec+96]
108 BUTTERFLY m0, m1, m2, m3
; Write m0 and m1 to out+112 and out+96.  out+112 is read back into m0 before
; the last BUTTERFLY2 below, so the output buffer presumably doubles as
; temporary storage here -- confirm against the missing lines.
; movaps stores require the destination address to be 16-byte aligned.
109 movaps [outq+112], m0
110 movaps [outq+ 96], m1
114 BUTTERFLY m0, m5, m2, m3
118 BUTTERFLY m1, m6, m2, m3
122 BUTTERFLY m7, m4, m2, m3
; From here on m3 no longer appears as the fourth operand: it holds the
; ps_p1p1m1m1 row and is passed as the second operand of every
; BUTTERFLY2/BUTTERFLY3.  m2 = row 7 (+112) for the BUTTERFLY2 group.
125 movaps m3, [ps_p1p1m1m1+0]
126 movaps m2, [ps_cos_vec+112]
; Eight BUTTERFLY2 invocations, m1 as fourth operand throughout.  m0 and m6
; each appear more than once as first operand; the instructions that change
; their contents in between are not visible in this excerpt.
128 BUTTERFLY2 m5, m3, m2, m1
130 BUTTERFLY2 m0, m3, m2, m1
133 BUTTERFLY2 m6, m3, m2, m1
137 BUTTERFLY2 m0, m3, m2, m1
140 BUTTERFLY2 m4, m3, m2, m1
142 BUTTERFLY2 m7, m3, m2, m1
145 BUTTERFLY2 m6, m3, m2, m1
; Reload the vector last stored to out+112 by the visible code above.
147 movaps m0, [outq+112]
148 BUTTERFLY2 m0, m3, m2, m1
; m2 = row 8 (+128) for the BUTTERFLY3 group.
151 movaps m2, [ps_cos_vec+128]
; Eight BUTTERFLY3 invocations.  The fourth operand changes along the way:
; m1 for the first, m5 for the next three, m7 for the remaining four.
154 BUTTERFLY3 m5, m3, m2, m1
158 BUTTERFLY3 m1, m3, m2, m5
161 BUTTERFLY3 m4, m3, m2, m5
164 BUTTERFLY3 m7, m3, m2, m5
168 BUTTERFLY3 m5, m3, m2, m7
172 BUTTERFLY3 m4, m3, m2, m7
175 BUTTERFLY3 m6, m3, m2, m7
178 BUTTERFLY3 m0, m3, m2, m7
; Store the result of the last visible BUTTERFLY3 to out+112.
179 movaps [outq+112], m0