1 ;******************************************************************************
2 ;* 36 point SSE-optimized IMDCT transform
3 ;* Copyright (c) 2011 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86inc.asm"
23 %include "libavutil/x86/x86util.asm"
; Per-lane bit masks, one dword per single-precision lane (lowest lane first).
; ~0 sets every bit and 0 sets none, so ANDing a vector with one of these
; (the andps uses in the transform) keeps the ~0 lanes untouched and forces
; the 0 lanes to +0.0.
28 ps_mask: dd 0, ~0, ~0, ~0 ; clear lane 0, keep lanes 1-3
29 ps_mask2: dd 0, ~0, 0, ~0 ; keep the odd lanes (1 and 3)
30 ps_mask3: dd 0, 0, 0, ~0 ; keep lane 3 only
31 ps_mask4: dd 0, ~0, 0, 0 ; keep lane 1 only
; Packed single-precision multipliers consumed by the mulps steps of the
; transform. Every constant is stored twice, so lanes {0,1} share one factor
; and lanes {2,3} share another.
; Numeric identities (angles in degrees):
;   0.8660254038 = cos(30)   0.1736481777 = cos(80)   0.3420201433 = cos(70)
;   0.7660444431 = cos(40)   0.9396926208 = cos(20)   0.9848077530 = cos(10)
;   0.6427876097 = cos(50)
33 ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
34 ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
35 ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
36 ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
37 ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
38 ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
39 ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
; Sign-flip masks. 0x80000000 is the sign bit of an IEEE-754 single, so
; XORing a vector with one of these (the xorps uses below) negates exactly
; the flagged lanes and leaves the others unchanged. The label spells out the
; resulting per-lane factor, lowest lane first (p1 = +1, m1 = -1).
41 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 ; negate lanes 2 and 3
42 ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 ; negate lanes 1 and 3
; Butterfly scale table: five rows of four floats (16 bytes per row).
; BUTTERF_SSE12 multiplies by the row at [ps_cosh + offset]; the transform
; body also reads the last row directly as [ps_cosh + 64].
; The non-unit magnitudes equal 0.5 / cos((2k+1) * pi / 36):
;   row 0: k = 0 and 8    row 1: k = 1 and 7    row 2: k = 2 and 6
;   row 3: k = 3 and 5    row 4: k = 4 (upper two lanes are zero)
44 ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
45 dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
46 dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
47 dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
48 dd 1.0, 0.70710678118654752439, 0.0, 0.0
; Variant of ps_cosh read by BUTTERF_SSE3 (same 5 x 16-byte row layout).
; Rows 0-3 hold the ps_cosh values with lanes 1 and 3 negated; the last row
; is identical to the last row of ps_cosh.
; NOTE(review): presumably this folds in the odd-lane sign flip that the
; SSE1/SSE2 butterfly applies with a separate xorps against ps_p1m1p1m1 --
; confirm against the full BUTTERF_SSE3 body.
50 ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
51 dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
52 dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
53 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
54 dd 1.0, 0.70710678118654752439, 0.0, 0.0
59 %macro PSHUFD_SSE_AVX 3
66 ; input %1={x1,x2,x3,x4}, %2={y1,y2,y3,y4}
67 ; output %3={x3,x4,y1,y2}
68 %macro BUILDINVHIGHLOW_SSE 3
72 %macro BUILDINVHIGHLOW_AVX 3
73 shufps %3, %1, %2, 0x4e
76 ; input %1={x1,x2,x3,x4}, %2={y1,y2,y3,y4}
77 ; output %3={x4,y1,y2,y3}
79 BUILDINVHIGHLOW %1, %2, %3
80 shufps %3, %3, %2, 0x99
83 %macro ROTLEFT_SSSE3 3
84 palignr %3, %2, %1, 12
87 %macro INVERTHL_SSE1 2
92 %macro INVERTHL_SSE2 2
96 %macro BUTTERF_SSE12 3
98 xorps %1, [ps_p1p1m1m1]
100 mulps %1, [ps_cosh + %3]
102 xorps %1, [ps_p1m1p1m1]
105 %macro BUTTERF_SSE3 3
107 xorps %1, %1, [ps_p1p1m1m1]
109 mulps %1, %1, [ps_cosh_sse3 + %3]
117 movss [%3 + 8*SBLIMIT], %2
119 movss [%3 + 4*SBLIMIT], %1
121 movss [%3 + 12*SBLIMIT], %2
134 %macro DEFINE_IMDCT 1
135 cglobal imdct36_float_%1, 4,4,9, out, buf, in, win
137 ; for(i=17;i>=1;i--) in[i] += in[i-1];
144 andps m6, m6, [ps_mask]
158 BUILDINVHIGHLOW m3, m4, m6
159 shufps m6, m6, m4, 0xa9
165 ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
167 andps m5, m5, [ps_mask3]
169 BUILDINVHIGHLOW m0, m1, m7
170 andps m7, m7, [ps_mask2]
174 BUILDINVHIGHLOW m1, m2, m6
175 andps m6, m6, [ps_mask2]
179 BUILDINVHIGHLOW m2, m3, m7
180 andps m7, m7, [ps_mask2]
185 andps m6, m6, [ps_mask4]
191 movlhps m6, m1, m5 ; zero out high values
200 mulps m7, m2, [ps_val1]
203 mulps m5, m8, [ps_val2]
205 mulps m5, m5, [ps_val2]
209 mulps m5, m6, [ps_val1]
221 shufps m6, m4, m3, 0xe4
223 mulps m6, m6, [ps_val3]
226 mulps m4, m4, [ps_val4]
228 shufps m1, m1, m0, 0xe4
230 mulps m1, m1, [ps_val5]
232 mulps m3, m3, [ps_val6]
233 mulps m0, m0, [ps_val7]
236 xorps m2, m1, [ps_p1p1m1m1]
242 xorps m3, m3, [ps_p1p1m1m1]
244 shufps m0, m0, m4, 0xe4
248 BUILDINVHIGHLOW m2, m3, m4
249 shufps m3, m3, m2, 0x4e
251 ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
258 mulps m5, m5, [ps_cosh + 64]
260 xorps m5, m5, [ps_p1m1p1m1]
264 ; m0 0 1 2 3 => 2 6 10 14 m1
265 ; m7 4 5 6 7 => 3 7 11 15 m2
266 ; m3 8 9 10 11 => 17 13 9 5 m3
267 ; m4 12 13 14 15 => 16 12 8 4 m5
268 ; m5 16 17 xx xx => 0 1 xx xx m0
283 movlps m7, [bufq + 64]
284 mulps m6, m6, [winq + 16*4]
286 movss [outq + 64*SBLIMIT], m6
287 shufps m6, m6, m6, 0xb1
288 movss [outq + 68*SBLIMIT], m6
290 mulps m6, m3, [winq + 4*4]
291 LOADA64 m4, bufq + 16
293 STORE m6, m7, outq + 16*SBLIMIT
295 shufps m4, m0, m3, 0xb5
296 mulps m4, m4, [winq + 8*4]
297 LOADA64 m7, bufq + 32
299 STORE m4, m6, outq + 32*SBLIMIT
301 shufps m3, m3, m2, 0xb1
302 mulps m3, m3, [winq + 12*4]
303 LOADA64 m7, bufq + 48
305 STORE m3, m7, outq + 48*SBLIMIT
312 mulps m4, m1, [winq + 20*4]
315 mulps m3, m5, [winq + 24*4]
316 STOREA64 bufq + 16, m3
318 shufps m0, m0, m5, 0xb0
319 mulps m0, m0, [winq + 28*4]
320 STOREA64 bufq + 32, m0
322 shufps m5, m5, m1, 0xb1
323 mulps m5, m5, [winq + 32*4]
324 STOREA64 bufq + 48, m5
326 shufps m1, m1, m1, 0xb1
327 mulps m1, m1, [winq + 36*4]
328 movlps [bufq + 64], m1
; Helper-name bindings. The function template refers to its helpers through
; generic names (e.g. BUILDINVHIGHLOW); each group below points those names at
; the macro variant for one instruction-set level. A later %define of the same
; name replaces the earlier binding.
; NOTE(review): the DEFINE_IMDCT invocations that consume each group are not
; visible in this excerpt -- confirm the pairing against the full file.

; Baseline set: SSE variants of every helper.
332 %define PSHUFD PSHUFD_SSE_AVX
333 %define INVERTHL INVERTHL_SSE1
334 %define BUTTERF BUTTERF_SSE12
335 %define BUTTERF0 BUTTERF0_SSE12
336 %define BUILDINVHIGHLOW BUILDINVHIGHLOW_SSE
337 %define ROTLEFT ROTLEFT_SSE
; Rebind the shuffle helpers to their SSE2 variants.
343 %define PSHUFD PSHUFD_SSE2
344 %define INVERTHL INVERTHL_SSE2
; Rebind the butterflies to the SSE3 variants (these read ps_cosh_sse3).
348 %define BUTTERF BUTTERF_SSE3
349 %define BUTTERF0 BUTTERF0_SSE3
; Rebind the rotate helper to the palignr-based SSSE3 variant.
353 %define ROTLEFT ROTLEFT_SSSE3
; Rebind to the AVX variant (three-operand shufps) and the shared
; SSE/AVX shuffle helper.
357 %define BUILDINVHIGHLOW BUILDINVHIGHLOW_AVX
358 %define PSHUFD PSHUFD_SSE_AVX