@ FFmpeg -- libavcodec/arm/aacpsdsp_neon.S
@ NOTE(review): the lines above/below were scraped from a gitweb page view;
@ unrelated page chrome has been reduced to this comment.
1 /*
2  * Copyright (c) 2012 Mans Rullgard
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
@ void ff_ps_add_squares(float *dst, const float (*src)[2], int n)
@   dst[i] += src[i][0]^2 + src[i][1]^2   for i in [0, n)
@ In:  r0 = dst, r1 = src (interleaved re/im float pairs), r2 = n
@ Software-pipelined: two src vectors are loaded and squared before the
@ loop, and the final group of 4 outputs is flushed after it.
@ NOTE(review): layout assumes n is a multiple of 4 and n >= 8 -- confirm
@ with callers.
23 function ff_ps_add_squares_neon, export=1
24         mov             r3,  r0        @ r3 = dst store pointer; r0 keeps reading ahead
25         sub             r2,  r2,  #4   @ last 4 outputs handled in the epilogue
26         vld1.32         {q0},     [r1,:128]!  @ q0 = src[0..1] = {re0,im0,re1,im1}
27         vmul.f32        q0,  q0,  q0   @ square each component
28         vld1.32         {q2},     [r1,:128]!  @ q2 = src[2..3]
29         vmul.f32        q2,  q2,  q2
30         vld1.32         {q1},     [r0,:128]!  @ q1 = dst[0..3] (accumulator input)
31 1:
32         vpadd.f32       d6,  d0,  d1   @ d6 = {re0^2+im0^2, re1^2+im1^2}
33         vld1.32         {q0},     [r1,:128]!  @ prefetch next src group
34         vpadd.f32       d7,  d4,  d5   @ d7 = {re2^2+im2^2, re3^2+im3^2}
35         vmul.f32        q0,  q0,  q0
36         vld1.32         {q2},     [r1,:128]!
37         vadd.f32        q3,  q1,  q3   @ q3 = dst + magnitude-squared sums
38         vld1.32         {q1},     [r0,:128]!  @ prefetch next dst group
39         vmul.f32        q2,  q2,  q2
40         vst1.32         {q3},     [r3,:128]!  @ store 4 updated dst values
41         subs            r2,  r2,  #4
42         bgt             1b
43         vpadd.f32       d6,  d0,  d1   @ epilogue: flush the last pipelined group
44         vpadd.f32       d7,  d4,  d5
45         vadd.f32        q1,  q1,  q3
46         vst1.32         {q1},     [r3,:128]!
47         bx              lr
48 endfunc
49
@ void ff_ps_mul_pair_single(float (*dst)[2], const float (*src0)[2],
@                            const float *src1, int n)
@   dst[i] = src0[i] * src1[i]   (complex pair scaled by a real)
@ In:  r0 = dst, r1 = src0, r2 = src1, r3 = n
@ Two paths keyed off the 16-byte alignment of src0 (tst r1, #8): the
@ aligned path uses whole-q loads; the 8-mod-16 path (label 2) starts
@ with a d-sized load and keeps a rolling 3-pair window in d0-d2.
@ NOTE(review): assumes n is a multiple of 4 and n >= 8 -- confirm with
@ callers.
50 function ff_ps_mul_pair_single_neon, export=1
51         sub             r3,  r3,  #4   @ last 4 results produced after the loop
52         tst             r1,  #8        @ src0 16-byte aligned?
53         bne             2f             @ no: take the realigning path
54         vld1.32         {q0},     [r1,:128]!  @ q0 = src0[0..1]
55 1:
56         vld1.32         {q3},     [r2,:128]!  @ d6/d7 lanes = src1[0..3] scalars
57         vmul.f32        d4,  d0,  d6[0]  @ dst[0] = src0[0] * src1[0]
58         vmul.f32        d5,  d1,  d6[1]  @ dst[1] = src0[1] * src1[1]
59         vld1.32         {q1},     [r1,:128]!  @ q1 = src0[2..3]
60         vmul.f32        d6,  d2,  d7[0]  @ q3 reused as output: dst[2]
61         vmul.f32        d7,  d3,  d7[1]  @ dst[3] (scalar lanes already consumed)
62         vld1.32         {q0},     [r1,:128]!  @ prefetch next src0 group
63         vst1.32         {q2,q3},  [r0,:128]!  @ store 4 complex results
64         subs            r3,  r3,  #4
65         bgt             1b
66         vld1.32         {q3},     [r2,:128]!  @ epilogue: final 4 results
67         vmul.f32        d4,  d0,  d6[0]
68         vmul.f32        d5,  d1,  d6[1]
69         vld1.32         {q1},     [r1,:128]!
70         vmul.f32        d6,  d2,  d7[0]
71         vmul.f32        d7,  d3,  d7[1]
72         vst1.32         {q2,q3},  [r0,:128]!
73         bx              lr
74 2:
75         vld1.32         {d0},     [r1,:64]!   @ d0 = src0[0]; r1 now 16-aligned
76         vld1.32         {d1,d2},  [r1,:128]!  @ d1/d2 = src0[1..2]
77 1:
78         vld1.32         {q3},     [r2,:128]!  @ four src1 scalars
79         vmul.f32        d4,  d0,  d6[0]  @ dst[0]
80         vmul.f32        d5,  d1,  d6[1]  @ dst[1]
81         vld1.32         {d0,d1},  [r1,:128]!  @ src0[3..4]
82         vmul.f32        d6,  d2,  d7[0]  @ dst[2]
83         vmul.f32        d7,  d0,  d7[1]  @ dst[3]
84         vmov            d0,  d1          @ slide window: d0 = src0[4]
85         vld1.32         {d1,d2},  [r1,:128]!  @ src0[5..6]
86         vst1.32         {q2,q3},  [r0,:128]!  @ store 4 complex results
87         subs            r3,  r3,  #4
88         bgt             1b
89         vld1.32         {q3},     [r2,:128]!  @ epilogue for the unaligned path
90         vmul.f32        d4,  d0,  d6[0]
91         vmul.f32        d5,  d1,  d6[1]
92         vld1.32         {d0},     [r1,:64]!   @ final odd pair
93         vmul.f32        d6,  d2,  d7[0]
94         vmul.f32        d7,  d0,  d7[1]
95         vst1.32         {q2,q3},  [r0,:128]!
96         bx              lr
97 endfunc
98
@ void ff_ps_hybrid_synthesis_deint(float out[2][38][64],
@                                   float in[91][32][2], int i, int n)
@ Deinterleaves complex input into two planar planes:
@   out[0][j][k] = in[k][j][0];  out[1][j][k] = in[k][j][1]
@ for channels k = i..63 and time slots j = 0..n-1.
@ In:  r0 = out, r1 = in, r2 = i (first channel), r3 = n (slots)
@ Strategy: one scalar channel first, then an optional 2-channel block
@ (label 6) so the remaining count is a multiple of 4, then 4 channels
@ per pass with transposing vst4 stores.
@ NOTE(review): the #32*2*4 channel strides imply the in[] inner
@ dimension is 32 slots; the r1 bumps after each multi-channel pass are
@ only exact when n == 32 -- confirm with callers.
99 function ff_ps_hybrid_synthesis_deint_neon, export=1
100         push            {r4-r8,lr}
101         add             r0,  r0,  r2,  lsl #2   @ out += i floats (column i)
102         add             r1,  r1,  r2,  lsl #5+1+2  @ in += i*32*2*4 bytes = in[i]
103         rsb             r2,  r2,  #64           @ r2 = 64-i channels remaining
104         mov             r5,  #64*4              @ byte stride between out rows
105         mov             lr,  r0                 @ lr -> out[0] column
106         add             r4,  r0,  #38*64*4      @ r4 -> out[1] column (2nd plane)
107         mov             r12, r3                 @ slot counter
108 2:                                              @ --- single channel, 2 slots/iter
109         vld1.32         {d0,d1},  [r1,:128]!    @ two complex samples of channel i
110         vst1.32         {d0[0]},  [lr,:32], r5  @ out[0][j][i]   = re
111         vst1.32         {d0[1]},  [r4,:32], r5  @ out[1][j][i]   = im
112         vst1.32         {d1[0]},  [lr,:32], r5  @ out[0][j+1][i] = re
113         vst1.32         {d1[1]},  [r4,:32], r5  @ out[1][j+1][i] = im
114         subs            r12, r12, #2
115         bgt             2b
116         add             r0,  r0,  #4            @ advance one output column
117         sub             r2,  r2,  #1            @ one channel done
118         tst             r2,  #2                 @ remaining ≡ 2 (mod 4)?
119         bne             6f                      @ yes: peel a 2-channel block first
120 1:                                              @ --- main loop: 4 channels/pass
121         mov             lr,  r0
122         add             r4,  r0,  #38*64*4
123         add             r6,  r1,  #  32*2*4     @ r6 -> channel +1
124         add             r7,  r1,  #2*32*2*4     @ r7 -> channel +2
125         add             r8,  r1,  #3*32*2*4     @ r8 -> channel +3
126         mov             r12, r3
127 2:
128         vld1.32         {d0,d1},  [r1,:128]!    @ 2 slots of each of 4 channels
129         vld1.32         {d2,d3},  [r6,:128]!
130         vld1.32         {d4,d5},  [r7,:128]!
131         vld1.32         {d6,d7},  [r8,:128]!
132         vst4.32         {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5  @ 4 re -> out[0] row j
133         vst4.32         {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5  @ 4 im -> out[1] row j
134         vst4.32         {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5  @ row j+1
135         vst4.32         {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
136         subs            r12, r12, #2
137         bgt             2b
138         add             r0,  r0,  #16           @ 4 output columns consumed
139         add             r1,  r1,  #3*32*2*4     @ skip channels walked via r6-r8
140         subs            r2,  r2,  #4
141         bgt             1b
142         pop             {r4-r8,pc}
143 6:                                              @ --- peel block: 2 channels/pass
144         mov             lr,  r0
145         add             r4,  r0,  #38*64*4
146         add             r6,  r1,  #32*2*4       @ r6 -> channel +1
147         mov             r12, r3
148 2:
149         vld1.32         {d0,d1},  [r1,:128]!
150         vld1.32         {d2,d3},  [r6,:128]!
151         vst2.32         {d0[0],d2[0]}, [lr,:64], r5  @ 2 re -> out[0] row j
152         vst2.32         {d0[1],d2[1]}, [r4,:64], r5  @ 2 im -> out[1] row j
153         vst2.32         {d1[0],d3[0]}, [lr,:64], r5  @ row j+1
154         vst2.32         {d1[1],d3[1]}, [r4,:64], r5
155         subs            r12, r12, #2
156         bgt             2b
157         add             r0,  r0,  #8            @ 2 output columns consumed
158         add             r1,  r1,  #32*2*4       @ skip channel walked via r6
159         sub             r2,  r2,  #2
160         b               1b                      @ rejoin the 4-wide loop
161 endfunc
162
@ void ff_ps_hybrid_analysis(float (*out)[2], float (*in)[2],
@                            const float (*filter)[8][2],
@                            ptrdiff_t stride, int n)
@ 13-tap complex FIR per output sample, exploiting the symmetry of the
@ filters around the centre tap 6: in[k] and in[12-k] are pre-combined
@ into sum/difference terms so each of the n outputs needs only the 8
@ stored filter coefficients.  The centre tap uses only filter[6][0]
@ (its real part), matching the C reference.
@ In:  r0 = out, r1 = in (13 complex floats, constant across outputs),
@      r2 = filter (n sets of 8 complex coeffs), r3 = stride (in
@      float[2] units), [sp] = n
@ The loop is pipelined: while one output is reduced and stored, the
@ next output's accumulators (q14/q15) are already being filled.
163 function ff_ps_hybrid_analysis_neon, export=1
164         vldm            r1,  {d19-d31}   @ d19..d31 = in[0]..in[12] (re,im)
165         ldr             r12, [sp]        @ r12 = n (output count)
166         lsl             r3,  r3,  #3     @ stride in bytes (sizeof(float[2]))
167         vadd.f32        d16, d19, d31    @ in[0]+in[12]   (symmetric sums...)
168         vadd.f32        d17, d20, d30    @ in[1]+in[11]
169         vsub.f32        d18, d19, d31    @ in[0]-in[12]   (...and differences)
170         vsub.f32        d19, d20, d30    @ in[1]-in[11]
171         vsub.f32        d0,  d21, d29    @ in[2]-in[10]
172         vsub.f32        d1,  d22, d28    @ in[3]-in[9]
173         vadd.f32        d2,  d21, d29    @ in[2]+in[10]
174         vadd.f32        d3,  d22, d28    @ in[3]+in[9]
175         vadd.f32        d20, d23, d27    @ in[4]+in[8]
176         vadd.f32        d21, d24, d26    @ in[5]+in[7]
177         vsub.f32        d22, d23, d27    @ in[4]-in[8]
178         vsub.f32        d23, d24, d26    @ in[5]-in[7]
179         vmov.i32        d6,  #1<<31      @ build sign mask...
180         vmov.i32        d7,  #0
181         vmov.f32        q14, #0.0        @ clear first output's accumulators
182         vmov.f32        q15, #0.0
183         vtrn.32         d6,  d7          @ ...q3 = {-0,+0,-0,+0} per lane
184         vrev64.32       q9,  q9          @ swap re/im of the difference terms
185         vrev64.32       q0,  q0
186         vrev64.32       q11, q11
187         veor            q9,  q9,  q3     @ negate one lane: forms the cross
188         veor            q0,  q0,  q3     @ (imaginary) part of the complex MAC
189         veor            q11, q11, q3
190         vld1.32         {q13},    [r2,:128]!  @ filter[0..1]
191         vtrn.32         q8,  q9          @ interleave sum/diff terms so each
192         vtrn.32         q1,  q0          @ vmla lane pairs with the matching
193         vtrn.32         q10, q11         @ filter coefficient lane
194         sub             r12, r12, #1     @ last output handled after the loop
195         vmla.f32        q14, q8,  q13    @ acc += taps(0,1) * filter[0..1]
196         vld1.32         {q2},     [r2,:128]!  @ filter[2..3]
197         vmla.f32        q15, q9,  q13
198 1:
199         vmla.f32        q14, q1,  q2     @ acc += taps(2,3) * filter[2..3]
200         vld1.32         {q13},    [r2,:128]!  @ filter[4..5]
201         vmla.f32        q15, q0,  q2
202         vmla.f32        q14, q10, q13    @ acc += taps(4,5) * filter[4..5]
203         vld1.32         {q2},     [r2,:128]!  @ filter[6..7]
204         vmla.f32        q15, q11, q13
205         vld1.32         {q13},    [r2,:128]!  @ next output's filter[0..1]
206         vadd.f32        d6,  d28, d29    @ horizontal reduce re accumulator
207         vadd.f32        d7,  d30, d31    @ horizontal reduce im accumulator
208         vmov.f32        q14, #0.0        @ restart accumulators (pipelined)
209         vmov.f32        q15, #0.0
210         vmla.f32        q14, q8,  q13    @ begin next output with filter[0..1]
211         vpadd.f32       d6,  d6,  d7    @ d6 = {sum_re, sum_im}
212         vmla.f32        q15, q9,  q13
213         vmla.f32        d6,  d25, d4[0] @ + in[6] * filter[6][0] (centre tap)
214         vld1.32         {q2},     [r2,:128]!  @ next output's filter[2..3]
215         vst1.32         {d6},     [r0,:64], r3  @ out[i] = {re, im}; advance
216         subs            r12, r12, #1
217         bgt             1b
218         vmla.f32        q14, q1,  q2     @ epilogue: finish the last output
219         vld1.32         {q13},    [r2,:128]!
220         vmla.f32        q15, q0,  q2
221         vmla.f32        q14, q10, q13
222         vld1.32         {q2},     [r2,:128]!
223         vmla.f32        q15, q11, q13
224         vadd.f32        d6,  d28, d29
225         vadd.f32        d7,  d30, d31
226         vpadd.f32       d6,  d6,  d7
227         vmla.f32        d6,  d25, d4[0]  @ centre tap again
228         vst1.32         {d6},     [r0,:64], r3
229         bx              lr
230 endfunc
231
@ void ff_ps_stereo_interpolate(float (*l)[2], float (*r)[2],
@                               float h[2][4], float h_step[2][4], int len)
@ Per sample (complex pairs, updated in place):
@   l' = h0*l + h2*r ;  r' = h1*l + h3*r
@ with the four coefficients h0..h3 advanced by h_step every sample.
@ In:  r0 = l, r1 = r, r2 = h, r3 = h_step, [sp] = len
@ Processes two samples per loop iteration: q1 holds the coefficients
@ for the even sample, q0 for the odd one; both advance by 2*h_step per
@ iteration.  An odd trailing sample is handled at label 2.
232 function ff_ps_stereo_interpolate_neon, export=1
233         vld1.32         {q0},     [r2]   @ q0 = {h0,h1,h2,h3}
234         vld1.32         {q14},    [r3]   @ q14 = h_step
235         mov             r2,  r0          @ r2/r3 reused as the store pointers
236         mov             r3,  r1          @ (l and r are updated in place)
237         ldr             r12, [sp]        @ r12 = len
238         vadd.f32        q1,  q0,  q14    @ q1 = coeffs for sample 0
239         vadd.f32        q0,  q1,  q14    @ q0 = coeffs for sample 1
240         vld1.32         {q2},     [r0,:64]!  @ q2 = l[0..1] (two complex)
241         vld1.32         {q3},     [r1,:64]!  @ q3 = r[0..1]
242         subs            r12, r12, #1
243         beq             2f               @ len == 1: tail only
244 1:
245         vmul.f32        d16, d4,  d2[0]  @ l0*h0        (even sample)
246         vmul.f32        d17, d5,  d0[0]  @ l1*h0'       (odd sample)
247         vmul.f32        d18, d4,  d2[1]  @ l0*h1
248         vmul.f32        d19, d5,  d0[1]  @ l1*h1'
249         vmla.f32        d16, d6,  d3[0]  @ + r0*h2  -> new l0
250         vmla.f32        d17, d7,  d1[0]  @ + r1*h2' -> new l1
251         vmla.f32        d18, d6,  d3[1]  @ + r0*h3  -> new r0
252         vmla.f32        d19, d7,  d1[1]  @ + r1*h3' -> new r1
253         vadd.f32        q1,  q1,  q14    @ advance both coefficient sets
254         vadd.f32        q0,  q0,  q14    @ by two steps each
255         vadd.f32        q1,  q1,  q14
256         vadd.f32        q0,  q0,  q14
257         vld1.32         {q2},     [r0,:64]!  @ prefetch next l pair
258         vld1.32         {q3},     [r1,:64]!  @ prefetch next r pair
259         vst1.32         {q8},     [r2,:64]!  @ write back two l samples
260         vst1.32         {q9},     [r3,:64]!  @ write back two r samples
261         subs            r12, r12, #2
262         bgt             1b
263         it              lt               @ Thumb IT block for the cond. return
264         bxlt            lr               @ count < 0: even len, all done
265 2:                                       @ count == 0: one trailing sample
266         vmul.f32        d16, d4,  d2[0]  @ l*h0
267         vmul.f32        d18, d4,  d2[1]  @ l*h1
268         vmla.f32        d16, d6,  d3[0]  @ + r*h2 -> new l
269         vmla.f32        d18, d6,  d3[1]  @ + r*h3 -> new r
270         vst1.32         {d16},    [r2,:64]!
271         vst1.32         {d18},    [r3,:64]!
272         bx              lr
273 endfunc