2 * ARM NEON optimised MDCT
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Alias the FFT entry point through the X() macro so builds with a
@ symbol prefix resolve the correct external name.  X() itself is
@ defined outside this chunk -- presumably in asm.S; TODO confirm.
28 #define ff_fft_calc_neon X(ff_fft_calc_neon)
@-----------------------------------------------------------------------
@ void ff_imdct_half_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ First half of the inverse MDCT: pre-rotate the input with the
@ tcos/tsin twiddle table, run an FFT (the call is in a span missing
@ from this chunk), then post-rotate and store interleaved pairs.
@ In:  r0 = context (reads mdct_bits at #20, tcos at #24, revtab at #8)
@      r1 = output base, r2 = input (per the loads below)
@ NOTE(review): the numeric prefix on each line and the gaps between
@ those numbers show this is an incomplete extraction -- push/pop,
@ loop labels/branches, the fft call and several ALU lines are absent.
@ Code left byte-identical; comments only.
30 function ff_imdct_half_neon, export=1
@ fetch transform size and lookup tables from the context struct
34 ldr lr, [r0, #20] @ mdct_bits
35 ldr r4, [r0, #24] @ tcos
36 ldr r3, [r0, #8] @ revtab
@ r12 presumably preloaded with 1 in a missing line -- TODO confirm
37 lsl r12, r12, lr @ n = 1 << nbits
38 lsr lr, r12, #2 @ n4 = n >> 2
39 add r7, r2, r12, lsl #1 @ r7 = in + n/2 floats (upper half of input)
@ --- pre-rotation: interleaved loads from both ends of the input,
@ plus the cos/sin twiddles; r12 is reused as a (negative?) stride
@ for r7 -- set in a missing line, TODO confirm
43 vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
44 vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
46 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
@ scatter the rotated pairs through the bit-reversal table; r6/r8 are
@ presumably revtab entries loaded in missing lines -- TODO confirm.
@ lsl #3 scales an index to one complex (2 x f32) pair
58 add r8, r1, r8, lsl #3
59 add r6, r1, r6, lsl #3
@ software-pipelined loads for the next iteration, overlapping the
@ stores of the current one
61 vld2.32 {d16-d17},[r7,:128],r12
62 vld2.32 {d0-d1}, [r2,:128]!
64 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
67 vst2.32 {d4[0],d5[0]}, [r6,:64]
68 vst2.32 {d4[1],d5[1]}, [r8,:64]
@ loop epilogue: flush the final pre-rotated pair
71 vst2.32 {d4[0],d5[0]}, [r6,:64]
72 vst2.32 {d4[1],d5[1]}, [r8,:64]
@ --- post-rotation (runs after the fft call, not visible here) ---
@ r4 presumably holds the context pointer again at this point -- TODO
@ confirm against the missing lines
79 ldr lr, [r4, #20] @ mdct_bits
80 ldr r4, [r4, #24] @ tcos
@ r12 presumably re-seeded with 1 before this shift -- TODO confirm
81 lsl r12, r12, lr @ n = 1 << nbits
82 lsr lr, r12, #3 @ n8 = n >> 3
@ advance twiddle/data pointers by n8 complex pairs; the loop below
@ reads forward via r6 and (per the r7 post-increment) strided via r3
84 add r4, r4, lr, lsl #3
85 add r6, r6, lr, lsl #3
93 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
94 vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
95 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
99 vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
@ partial products of the complex rotation (i*s, r*s, i*c, r*c);
@ the combining vadd/vsub instructions fall in a missing span
101 vmul.f32 d5, d21, d19
102 vmul.f32 d6, d20, d19
103 vmul.f32 d22, d1, d16
104 vmul.f32 d23, d21, d17
105 vmul.f32 d24, d0, d16
106 vmul.f32 d25, d20, d17
@ pipelined loads for the next loop iteration
112 vld2.32 {d0-d1}, [r3,:128], r7
113 vld2.32 {d20-d21},[r6,:128]!
114 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
@ write post-rotated results from both ends of the output
116 vst2.32 {d4,d6}, [r0,:128], r7
117 vst2.32 {d5,d7}, [r8,:128]!
@ epilogue: store the last post-rotated vectors
121 vst2.32 {d4,d6}, [r0,:128]
122 vst2.32 {d5,d7}, [r8,:128]
@-----------------------------------------------------------------------
@ void ff_imdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ Full inverse MDCT: delegates to ff_imdct_half_neon, then expands the
@ half-length result into the full output by mirrored copies.
@ NOTE(review): the prologue (register saves, moving arguments into
@ r4/r5, computing the strides in r3/r6) and the copy-loop label and
@ branch are in lines missing from this chunk; code left byte-identical.
127 function ff_imdct_calc_neon, export=1
136 bl ff_imdct_half_neon
@ r5 presumably = out and r4 = a quarter-size saved before the call,
@ so r0/r1 address the upper regions of the output -- TODO confirm
138 add r0, r5, r4, lsl #2
139 add r1, r5, r4, lsl #1
@ read the half-transform result forward (r1, post-increment) and
@ backward (r2 with stride r3 -- presumably negative, TODO confirm)
146 vld1.32 {d0-d1}, [r2,:128], r3
149 vld1.32 {d2-d3}, [r1,:128]!
@ scatter the mirrored halves; r6 is a store stride set in a missing
@ line -- TODO confirm its sign
154 vst1.32 {d2}, [r0,:64], r6
155 vst1.32 {d3}, [r0,:64], r6
156 vst1.32 {d4-d5}, [r5,:128]!
@-----------------------------------------------------------------------
@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ Forward MDCT: folds the four input quarters into n/4 complex values
@ (the R/I combinations annotated below), rotates them by the
@ tcos/tsin twiddles, scatters them through revtab, then -- after an
@ FFT call that falls in a missing span -- post-rotates the result.
@ NOTE(review): as elsewhere in this chunk, the embedded line numbers
@ jump, so prologue, loop labels, branches and some combining
@ arithmetic are not visible.  Code left byte-identical; comments only.
163 function ff_mdct_calc_neon, export=1
@ fetch transform parameters from the context
167 ldr lr, [r0, #20] @ mdct_bits
168 ldr r4, [r0, #24] @ tcos
169 ldr r3, [r0, #8] @ revtab
@ r12 presumably preloaded with 1 in a missing line -- TODO confirm
170 lsl lr, r12, lr @ n = 1 << nbits
@ set up the four read cursors: two walking up (u) and two walking
@ down (d) over the quarters of the input window
171 add r7, r2, lr @ in4u
172 sub r9, r7, #16 @ in4d
173 add r2, r7, lr, lsl #1 @ in3u
174 add r8, r9, lr, lsl #1 @ in3d
175 add r5, r4, lr, lsl #1 @ second twiddle cursor (upper half of tcos)
@ --- folding + pre-rotation loop body ---
@ r12 here is a load stride for the descending cursors, set in a
@ missing line (presumably negative) -- TODO confirm
180 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
181 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
182 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
183 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
184 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
@ fold the quarters: produces the real (R) and imaginary (I) parts
@ that feed the complex rotation (signs per the -R/-I annotations)
185 vsub.f32 d0, d18, d0 @ in4d-in4u I
186 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
187 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
188 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
189 vadd.f32 d1, d1, d19 @ in3u+in3d -R
190 vsub.f32 d16, d16, d2 @ in0u-in2d R
191 vadd.f32 d17, d17, d3 @ in2u+in1d -I
@ complex rotation by (c,s): partial products first, combined below
193 vmul.f32 d7, d0, d21 @ I*s
@ A = ARM-mode-only line (A/T macros from the shared asm header,
@ outside this chunk -- TODO confirm); loads a packed revtab word
194 A ldr r10, [r3, lr, lsr #1]
197 vmul.f32 d6, d1, d20 @ -R*c
199 vmul.f32 d4, d1, d21 @ -R*s
200 vmul.f32 d5, d0, d20 @ I*c
201 vmul.f32 d24, d16, d30 @ R*c
202 vmul.f32 d25, d17, d31 @ -I*s
203 vmul.f32 d22, d16, d31 @ R*s
204 vmul.f32 d23, d17, d30 @ I*c
205 @ combine into rotated (real, imag) pairs
206 vsub.f32 d6, d6, d7 @ -R*c-I*s
207 vadd.f32 d7, d4, d5 @ -R*s+I*c
208 vsub.f32 d24, d25, d24 @ I*s-R*c
209 vadd.f32 d25, d22, d23 @ R*s-I*c
@ software-pipelined loads/folds for the next iteration, overlapping
@ the bit-reversed scatter stores of the current one
212 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
213 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
214 vneg.f32 d7, d7 @ R*s-I*c
215 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
216 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
217 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
218 vsub.f32 d0, d18, d0 @ in4d-in4u I
219 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
220 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
221 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
222 vadd.f32 d1, d1, d19 @ in3u+in3d -R
223 vsub.f32 d16, d16, d2 @ in0u-in2d R
224 vadd.f32 d17, d17, d3 @ in2u+in1d -I
@ r6 presumably holds a packed revtab word (two 16-bit indices, loaded
@ in a missing line -- TODO confirm); extract the high halfword
225 uxth r12, r6, ror #16
@ lsl #3 scales each index to one complex (2 x f32) slot in out
227 add r12, r1, r12, lsl #3
228 add r6, r1, r6, lsl #3
229 vst2.32 {d6[0],d7[0]}, [r6,:64]
230 vst2.32 {d6[1],d7[1]}, [r12,:64]
@ same scatter for the second pair, indices packed in r10
231 uxth r6, r10, ror #16
233 add r6 , r1, r6, lsl #3
234 add r10, r1, r10, lsl #3
235 vst2.32 {d24[0],d25[0]},[r10,:64]
236 vst2.32 {d24[1],d25[1]},[r6,:64]
@ --- loop epilogue: final negate + scatter of the last iteration ---
239 vneg.f32 d7, d7 @ R*s-I*c
240 uxth r12, r6, ror #16
242 add r12, r1, r12, lsl #3
243 add r6, r1, r6, lsl #3
244 vst2.32 {d6[0],d7[0]}, [r6,:64]
245 vst2.32 {d6[1],d7[1]}, [r12,:64]
246 uxth r6, r10, ror #16
248 add r6 , r1, r6, lsl #3
249 add r10, r1, r10, lsl #3
250 vst2.32 {d24[0],d25[0]},[r10,:64]
251 vst2.32 {d24[1],d25[1]},[r6,:64]
@ --- post-rotation (after the fft call, which is not visible here);
@ this mirrors the post-rotation in ff_imdct_half_neon above ---
@ r4 presumably holds the context pointer again -- TODO confirm
258 ldr lr, [r4, #20] @ mdct_bits
259 ldr r4, [r4, #24] @ tcos
@ r12 presumably re-seeded with 1 before this shift -- TODO confirm
260 lsl r12, r12, lr @ n = 1 << nbits
261 lsr lr, r12, #3 @ n8 = n >> 3
@ advance twiddle/data cursors by n8 complex pairs
263 add r4, r4, lr, lsl #3
264 add r6, r6, lr, lsl #3
272 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
273 vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
274 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
@ complex rotation: out = (i*s + r*c, i*c - r*s) per the annotations
277 vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
278 vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
279 vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
280 vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
281 vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
282 vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
283 vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
284 vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
285 vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
286 vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
287 vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
288 vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
289 vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
@ pipelined loads for the next iteration
292 vld2.32 {d0-d1}, [r3,:128], r7
293 vld2.32 {d20-d21},[r6,:128]!
294 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
@ interleaved stores from both ends of the output
296 vst2.32 {d4,d6}, [r0,:128], r7
297 vst2.32 {d5,d7}, [r8,:128]!
@ epilogue: flush the last post-rotated vectors; the function's
@ pop/endfunc lie beyond the end of this chunk
301 vst2.32 {d4,d6}, [r0,:128]
302 vst2.32 {d5,d7}, [r8,:128]