2 * Copyright (c) 2012 Mans Rullgard
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
/*
 * ff_ps_add_squares_neon
 *
 * Streams 128-bit vectors in through two read pointers (r1 and r0) and
 * writes 128-bit vectors out through r3.
 *
 * NOTE(review): the numeric prefixes on the lines of this routine are not
 * consecutive, so part of the routine is not visible in this view.  The
 * arithmetic that produces q3, the setup of r3, the loop control and the
 * return are not shown; the comments below describe the visible loads and
 * stores only.
 *
 * Every access carries the :128 qualifier, so r0, r1 and r3 must be
 * 16-byte aligned; "!" post-increments the pointer by the 16 bytes moved.
 */
23 function ff_ps_add_squares_neon, export=1
/* First group: two consecutive vectors from r1 (q0, q2), one from r0 (q1). */
26 vld1.32 {q0}, [r1,:128]!
28 vld1.32 {q2}, [r1,:128]!
30 vld1.32 {q1}, [r0,:128]!
/* Same three loads again: r1 advances 32 bytes for every 16 bytes of r0. */
33 vld1.32 {q0}, [r1,:128]!
36 vld1.32 {q2}, [r1,:128]!
38 vld1.32 {q1}, [r0,:128]!
/* q3 is written out; the instructions that fill it are not visible here. */
40 vst1.32 {q3}, [r3,:128]!
/*
 * q1 goes to the same output stream.  It was last loaded from r0 above, so
 * presumably it is modified by lines not visible here -- confirm against
 * the complete file.
 */
46 vst1.32 {q1}, [r3,:128]!
/*
 * ff_ps_mul_pair_single_neon
 *
 * Multiplies each pair of 32-bit floats read through r1 by one 32-bit float
 * read through r2 and stores the products through r0.
 *
 * Per visible group: 8 floats from r1 (four d-register pairs) times
 * 4 scalars from r2 (q3 = d6,d7) give 8 floats written to r0 (q2,q3).
 *
 * Two variants are visible.  The first reads r1 with :128 alignment
 * throughout.  The second starts with one 64-bit load (:64) from r1 and
 * then uses :128 loads -- presumably the path for an r1 that is only
 * 8-byte aligned; the test that selects the variant is not visible.
 *
 * NOTE(review): the numeric prefixes on the lines are not consecutive, so
 * counter updates, labels, branches and the return are not visible here.
 */
50 function ff_ps_mul_pair_single_neon, export=1
/* Variant 1: all r1 loads are 128-bit aligned. */
54 vld1.32 {q0}, [r1,:128]!
56 vld1.32 {q3}, [r2,:128]!
/* q2 = pairs d0,d1 scaled by the two lanes of d6 (scalars 0 and 1). */
57 vmul.f32 d4, d0, d6[0]
58 vmul.f32 d5, d1, d6[1]
59 vld1.32 {q1}, [r1,:128]!
/*
 * d6 has been consumed as a scalar source above, so it is reused as an
 * output.  d7 is scaled in place: its lane 1 is read as the scalar by the
 * same instruction that overwrites it.  q3 now holds pairs d2,d3 scaled by
 * scalars 2 and 3.
 */
60 vmul.f32 d6, d2, d7[0]
61 vmul.f32 d7, d3, d7[1]
/* The first vector of the following group is fetched before the store. */
62 vld1.32 {q0}, [r1,:128]!
63 vst1.32 {q2,q3}, [r0,:128]!
/* Same multiply pattern once more, this time with no trailing q0 preload. */
66 vld1.32 {q3}, [r2,:128]!
67 vmul.f32 d4, d0, d6[0]
68 vmul.f32 d5, d1, d6[1]
69 vld1.32 {q1}, [r1,:128]!
70 vmul.f32 d6, d2, d7[0]
71 vmul.f32 d7, d3, d7[1]
72 vst1.32 {q2,q3}, [r0,:128]!
/*
 * Variant 2: a single 64-bit load fills d0, then 128-bit aligned loads fill
 * d1,d2.  The data is therefore held one d register later than in
 * variant 1, and the pairs are taken from d0, d1, d2 and a reloaded d0.
 */
75 vld1.32 {d0}, [r1,:64]!
76 vld1.32 {d1,d2}, [r1,:128]!
78 vld1.32 {q3}, [r2,:128]!
79 vmul.f32 d4, d0, d6[0]
80 vmul.f32 d5, d1, d6[1]
/* d0 and d1 are consumed; reload them.  New d0 is this group's 4th pair. */
81 vld1.32 {d0,d1}, [r1,:128]!
82 vmul.f32 d6, d2, d7[0]
83 vmul.f32 d7, d0, d7[1]
/*
 * NOTE(review): this load overwrites the d1 fetched just above before any
 * visible instruction reads it.  A line that is not visible here presumably
 * moves that value first -- confirm against the complete file.
 */
85 vld1.32 {d1,d2}, [r1,:128]!
86 vst1.32 {q2,q3}, [r0,:128]!
/* Last visible group of variant 2: the 4th pair comes from a 64-bit load. */
89 vld1.32 {q3}, [r2,:128]!
90 vmul.f32 d4, d0, d6[0]
91 vmul.f32 d5, d1, d6[1]
92 vld1.32 {d0}, [r1,:64]!
93 vmul.f32 d6, d2, d7[0]
94 vmul.f32 d7, d0, d7[1]
95 vst1.32 {q2,q3}, [r0,:128]!
/*
 * ff_ps_hybrid_synthesis_deint_neon
 *
 * Loads 128-bit vectors of 32-bit elements and scatters their lanes to two
 * write streams: lanes d?[0] are stored through lr and lanes d?[1] through
 * r4, and each store post-increments its pointer by the register stride r5.
 * Three shapes are visible: one input row (vst1 lanes), four input rows
 * interleaved (vst4 lanes) and two input rows interleaved (vst2 lanes).
 *
 * NOTE(review): the numeric prefixes on the lines are not consecutive.  The
 * setup of lr, r4, r5 and r6, the loop control, the return and any
 * save/restore of r4-r8 and lr are not visible here.  AAPCS treats r4-r8 as
 * callee-saved, so confirm that the complete routine preserves them.
 */
99 function ff_ps_hybrid_synthesis_deint_neon, export=1
/* r0 += r2 * 4: advance r0 by r2 32-bit elements. */
101 add r0, r0, r2, lsl #2
/*
 * r1 += r2 << 8, i.e. r2 * 256 bytes.  256 = 32*2*4, the same row size used
 * by the #N*32*2*4 offsets further down.
 */
102 add r1, r1, r2, lsl #5+1+2
/* One input row: four 32-bit lanes, alternating between the two outputs. */
109 vld1.32 {d0,d1}, [r1,:128]!
110 vst1.32 {d0[0]}, [lr,:32], r5
111 vst1.32 {d0[1]}, [r4,:32], r5
112 vst1.32 {d1[0]}, [lr,:32], r5
113 vst1.32 {d1[1]}, [r4,:32], r5
/*
 * Four input rows: r7 and r8 point 2 and 3 rows (512 and 768 bytes) past
 * r1.  r6 is presumably 1 row past r1; its setup is not visible here.
 */
124 add r7, r1, #2*32*2*4
125 add r8, r1, #3*32*2*4
128 vld1.32 {d0,d1}, [r1,:128]!
129 vld1.32 {d2,d3}, [r6,:128]!
130 vld1.32 {d4,d5}, [r7,:128]!
131 vld1.32 {d6,d7}, [r8,:128]!
/*
 * Each vst4 writes the same lane of all four rows as 16 contiguous bytes
 * (:128 aligned), again alternating between the lr and r4 outputs.
 */
132 vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
133 vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
134 vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
135 vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
/* r1 += 768: step over three rows. */
139 add r1, r1, #3*32*2*4
/* Two input rows (r1, r6): each vst2 writes 8 contiguous bytes (:64). */
149 vld1.32 {d0,d1}, [r1,:128]!
150 vld1.32 {d2,d3}, [r6,:128]!
151 vst2.32 {d0[0],d2[0]}, [lr,:64], r5
152 vst2.32 {d0[1],d2[1]}, [r4,:64], r5
153 vst2.32 {d1[0],d3[0]}, [lr,:64], r5
154 vst2.32 {d1[1],d3[1]}, [r4,:64], r5
/*
 * ff_ps_hybrid_analysis_neon
 *
 * Visible structure:
 *   1. Sums and differences of mirrored register pairs taken from d19..d31
 *      (d19 with d31, d20 with d30, ... d24 with d26).  d25 has no partner
 *      and is used later on its own.
 *   2. Multiply-accumulate of those sums/differences into q14 and q15
 *      against 128-bit vectors streamed from r2.
 *   3. Each result is one d register (two floats) stored through r0, which
 *      is then post-incremented by the register stride r3.
 *
 * NOTE(review): the numeric prefixes on the lines are not consecutive.  The
 * loads that fill d19..d31, the initialisation of q14/q15, the loop control
 * and the return are not visible here.
 */
163 function ff_ps_hybrid_analysis_neon, export=1
/*
 * Order matters in this run: several destinations are also sources of
 * earlier instructions.  d19 is overwritten only after both of its reads,
 * and d20..d23 are reused as outputs once their inputs are consumed.
 * Results: q8 = (d16,d17) sums, q9 = (d18,d19) differences,
 *          q0 = (d0,d1) differences, q1 = (d2,d3) sums,
 *          q10 = (d20,d21) sums, q11 = (d22,d23) differences.
 */
167 vadd.f32 d16, d19, d31
168 vadd.f32 d17, d20, d30
169 vsub.f32 d18, d19, d31
170 vsub.f32 d19, d20, d30
171 vsub.f32 d0, d21, d29
172 vsub.f32 d1, d22, d28
173 vadd.f32 d2, d21, d29
174 vadd.f32 d3, d22, d28
175 vadd.f32 d20, d23, d27
176 vadd.f32 d21, d24, d26
177 vsub.f32 d22, d23, d27
178 vsub.f32 d23, d24, d26
/*
 * r2 streams 128-bit vectors alternately into q13 and q2.  q14 and q15
 * accumulate elementwise products with q13; the instructions that use q2 as
 * a full vector are not visible here.
 */
190 vld1.32 {q13}, [r2,:128]!
195 vmla.f32 q14, q8, q13
196 vld1.32 {q2}, [r2,:128]!
197 vmla.f32 q15, q9, q13
200 vld1.32 {q13}, [r2,:128]!
202 vmla.f32 q14, q10, q13
203 vld1.32 {q2}, [r2,:128]!
204 vmla.f32 q15, q11, q13
205 vld1.32 {q13}, [r2,:128]!
/* Fold each accumulator: low half + high half of q14 -> d6, of q15 -> d7. */
206 vadd.f32 d6, d28, d29
207 vadd.f32 d7, d30, d31
/*
 * Accumulation for the next output starts here.  q14/q15 are presumably
 * re-initialised by lines not visible here -- confirm.
 */
210 vmla.f32 q14, q8, q13
212 vmla.f32 q15, q9, q13
/*
 * d6 += d25 * d4[0]: the unpaired register scaled by lane 0 of the last q2.
 * This must run before the next load overwrites q2.  Only d6 is stored;
 * how d7 is merged into it is not visible here.
 */
213 vmla.f32 d6, d25, d4[0]
214 vld1.32 {q2}, [r2,:128]!
215 vst1.32 {d6}, [r0,:64], r3
/* Last visible output: same accumulate, fold and store sequence. */
219 vld1.32 {q13}, [r2,:128]!
221 vmla.f32 q14, q10, q13
222 vld1.32 {q2}, [r2,:128]!
223 vmla.f32 q15, q11, q13
224 vadd.f32 d6, d28, d29
225 vadd.f32 d7, d30, d31
227 vmla.f32 d6, d25, d4[0]
228 vst1.32 {d6}, [r0,:64], r3
/*
 * ff_ps_stereo_interpolate_neon
 *
 * Reads two input streams (r0 into q2, r1 into q3) and writes two output
 * streams (r2 and r3).  Each output pair is a two-term linear combination
 * of the matching input pairs:
 *     out_r2 = in_r0 * c[0] + in_r1 * c'[0]
 *     out_r3 = in_r0 * c[1] + in_r1 * c'[1]
 * where c and c' are d registers taken from q0/q1 and used by scalar lane.
 *
 * NOTE(review): the numeric prefixes on the lines are not consecutive.  The
 * code that loads and updates q0, q1, q14 and q15, the loop control and the
 * return are not visible here.
 */
232 function ff_ps_stereo_interpolate_neon, export=1
/* q15 = 2 * q14; how q14 is produced and q15 consumed is not visible here. */
235 vadd.f32 q15, q14, q14
/* Two pairs from each input; only 8-byte alignment is asserted (:64). */
241 vld1.32 {q2}, [r0,:64]!
242 vld1.32 {q3}, [r1,:64]!
/*
 * First pair (d4, d6) uses the lanes of q1 = (d2,d3); second pair (d5, d7)
 * uses the lanes of q0 = (d0,d1).  Lane 0 feeds the r2 output (q8) and
 * lane 1 feeds the r3 output (q9).
 */
246 vmul.f32 d16, d4, d2[0]
247 vmul.f32 d17, d5, d0[0]
248 vmul.f32 d18, d4, d2[1]
249 vmul.f32 d19, d5, d0[1]
250 vmla.f32 d16, d6, d3[0]
251 vmla.f32 d17, d7, d1[0]
252 vmla.f32 d18, d6, d3[1]
253 vmla.f32 d19, d7, d1[1]
/* The next inputs are fetched before the results are stored. */
256 vld1.32 {q2}, [r0,:64]!
257 vld1.32 {q3}, [r1,:64]!
258 vst1.32 {q8}, [r2,:64]!
259 vst1.32 {q9}, [r3,:64]!
/*
 * Single-pair form: only d4/d6 are combined, with the q1 lanes, and 8 bytes
 * go to each output.  Presumably this handles a leftover pair; the branch
 * that selects it is not visible here.
 */
265 vmul.f32 d16, d4, d2[0]
266 vmul.f32 d18, d4, d2[1]
267 vmla.f32 d16, d6, d3[0]
268 vmla.f32 d18, d6, d3[1]
269 vst1.32 {d16}, [r2,:64]!
270 vst1.32 {d18}, [r3,:64]!