/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

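        @ Pixel copy/average helpers used to generate the put/avg pixels
        @ functions below.  Per the dsputil convention, r0 = destination
        @ block, r1 = source pixels, r2 = line size in bytes, r3 = height.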
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

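        @ The h264 qpel mc00 (full-pel) entry points only load the block
        @ height into r3 and then fall through into the put/avg pixels
        @ function that the following pixfunc invocation expands.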
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
        .endfunc

        pixfunc  avg_ pixels8,, 1

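        @ Clamp a block of 64 16-bit coefficients at r0 to unsigned 8-bit
        @ and store them as an 8x8 pixel block at r1 with byte stride r2.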
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc

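        @ As above, but narrow with signed saturation and add a bias of 128
        @ so the signed values land in the unsigned 8-bit pixel range.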
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc

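        @ Add an 8x8 block of 16-bit coefficients at r0 to the pixels at r1
        @ (byte stride r2) and clamp the sums to unsigned 8-bit.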
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
        .endfunc

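        @ Convert floats at r1 (assumed already scaled to the int16 range)
        @ to signed 16-bit integers at r0; r2 = number of samples, a
        @ multiple of 8.  Uses a saturating Q16 vcvt followed by a
        @ narrowing shift right by 16.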
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc

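        @ Interleaving variant: r0 = dst, r1 = array of per-channel source
        @ pointers, r2 = samples per channel, r3 = channel count.  Mono
        @ tail-calls ff_float_to_int16_neon; stereo and multichannel paths
        @ interleave while converting.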
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc

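        @ In-place element-wise multiply: dst[i] *= src[i] with r0 = dst,
        @ r1 = src, r2 = length (a multiple of 8).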
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
        .endfunc

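        @ Overlap windowing (as used for MDCT overlap-add): builds the
        @ output in dst (r0) from src0 (r1), src1 (r2, read backwards) and
        @ the window (r3), plus a bias scalar.  The VFP/NOVFP prefixes pick
        @ the hard-float or soft-float argument passing for the bias and
        @ the length, which is the last argument.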
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc

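        @ Vorbis inverse channel coupling: r0 = magnitude vector, r1 =
        @ angle vector, r2 = length.  Reconstructs both channels in place
        @ from the coupled magnitude/angle representation.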
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
        .endfunc
#endif

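        @ dst[i] = src[i] * scalar.  With a hard-float ABI (VFP) the scalar
        @ arrives in s0 and the length in r2; with soft-float the scalar is
        @ in r2 and the length in r3.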
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .endfunc

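        @ The *_sv_* variants operate on short vectors gathered through a
        @ pointer table: vector_fmul_sv_scalar_{2,4} multiply a contiguous
        @ source (r1) by a scalar and by 2- or 4-float vectors addressed
        @ via r2; sv_fmul_scalar_{2,4} multiply only the gathered vectors
        @ by the scalar.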
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
        .endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
        .endfunc

function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
        .endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
        .endfunc

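        @ Butterfly: v1[i] += v2[i] and v2[i] = old v1[i] - v2[i], with
        @ r0 = v1, r1 = v2, r2 = length.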
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
        .endfunc

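        @ Dot product of the float vectors at r0 and r1, length r2.  The
        @ result is left in d0[0] (s0) for hard-float; NOVFP moves it to r0
        @ for the soft-float return.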
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
        .endfunc

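        @ dst[i] = src[i] * scalar, converting 32-bit integers to floats.
        @ r0 = dst, r1 = src; scalar and length arrive in s0/r2 with
        @ hard-float or r2/r3 with soft-float.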
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]
VFP     len     .req    r2
NOVFP   vdup.32         q0,  r2
NOVFP   len     .req    r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq  len
        .endfunc

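        @ dst[i] = src0[i] * src1[len-1-i] with r0 = dst, r1 = src0,
        @ r2 = src1, r3 = len (a multiple of 8).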
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
        .endfunc

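        @ dst[i] = src0[i] * src1[i] + src2[i] with r0 = dst, r1 = src0,
        @ r2 = src1, r3 = src2 and the length on the stack.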
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
        .endfunc

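        @ Clip each element of src (r1) to [min,max] and store to dst (r0).
        @ The bounds arrive in s0/s1 (VFP) or r2/r3 (NOVFP), followed by
        @ the length.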
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
        .endfunc