/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

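@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients at r0 (128 bytes).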
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

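@ ff_clear_blocks_neon: zero six consecutive 8x8 blocks of 16-bit coefficients at r0.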
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

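@ pixels16: copy (avg=0) or average into (avg=1) a 16-pixel-wide block.
@ r0 = dst, r1 = src, r2 = line stride, r3 = height; four rows per iteration.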
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

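@ pixels16_x2: horizontal half-pel; averages each pixel with its right-hand
@ neighbour.  \vhadd selects rounding (vrhadd) or truncating (vhadd) averaging.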
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

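@ pixels16_y2: vertical half-pel; averages each row with the row below it.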
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

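@ pixels16_xy2: 2D half-pel; averages each 2x2 neighbourhood using widened
@ 16-bit sums.  The rounding variant narrows with vrshrn, i.e. (sum+2)>>2;
@ with no_rnd=1 a bias of 1 is added and vshrn truncates, giving (sum+1)>>2.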
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

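@ pixels8 and the _x2/_y2/_xy2 variants below are the 8-pixel-wide
@ counterparts of the 16-pixel macros above, with the same register usage.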
        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

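@ pixfunc emits ff_<pfx><name><suf>_neon as a wrapper around one of the macros
@ above; pixfunc2 emits both the rounding and the _no_rnd variant.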
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

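@ The qpel16/qpel8 mc00 functions below only set the block height in r3 and
@ then fall through into the put/avg pixels function emitted directly after
@ them; they intentionally have no return of their own.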
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  avg_ pixels8,, 1

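@ ff_put_pixels_clamped_neon: saturate an 8x8 block of 16-bit coefficients
@ (r0) to unsigned 8-bit pixels, writing 8 rows to r1 with stride r2.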
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

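@ ff_put_signed_pixels_clamped_neon: as above, but narrows with signed
@ saturation and adds a 128 bias to map signed samples onto the unsigned
@ pixel range.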
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

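@ ff_add_pixels_clamped_neon: add the 8x8 coefficient block at r0 to the
@ pixels at r1 (stride r2) and store the saturated result back to r1.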
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

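@ ff_vector_fmul_neon: elementwise single-precision multiply; r0 = dst,
@ r1 and r2 = sources, r3 = length in floats (assumed a multiple of 8).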
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

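@ ff_vector_fmul_window_neon: MDCT overlap-add windowing; r0 = dst,
@ r1 = src0, r2 = src1, r3 = window, [sp] = len.  src1 and the second half of
@ the window are read backwards, and 2*len samples are written from both ends
@ of dst towards the middle.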
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

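@ ff_vorbis_inverse_coupling_neon: Vorbis magnitude/angle inverse coupling;
@ r0 = mag, r1 = ang, r2 = length.  Both vectors are rewritten in place.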
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

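@ ff_vector_fmul_scalar_neon: dst = src * scalar.  The VFP/NOVFP lines (from
@ asm.S) pick the scalar and length registers for the hard-float and
@ soft-float calling conventions respectively.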
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

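@ The two functions below multiply src (r1) by a scalar and by short vectors
@ gathered through the pointer table in r2 (two floats per pointer for the _2
@ variant, four for the _4 variant); the scalar and length follow the
@ VFP/NOVFP convention as above.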
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        it              eq
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc

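@ The two functions below scale short vectors gathered through the pointer
@ table in r1 by a scalar and store them contiguously to r0 (two floats per
@ pointer for the _2 variant, four for the _4 variant).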
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

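@ ff_butterflies_float_neon: in-place butterflies over r2 floats;
@ r0[i] becomes the sum and r1[i] the difference.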
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

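@ ff_scalarproduct_float_neon: dot product of the r2-element float vectors at
@ r0 and r1; the result is returned in s0 (hard-float) or r0 (soft-float).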
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

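@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i];
@ r0 = dst, r1 = src0, r2 = src1, r3 = len.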
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

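@ ff_vector_fmul_add_neon: dst[i] = src0[i]*src1[i] + src2[i];
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, [sp] = len.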
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

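@ ff_vector_clipf_neon: clamp each float from r1 to [min, max] and store to
@ r0; min, max and the length come from VFP or core registers depending on
@ the float ABI.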
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

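@ ff_apply_window_int16_neon: multiply 16-bit samples by a 16-bit Q15 window
@ using vqrdmulh (roughly (x*w) >> 15, rounded); r0 = dst, r1 = src,
@ r2 = window, r3 = length.  The window holds the first half and is mirrored
@ over the second, so the buffer is processed from both ends at once.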
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

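@ ff_vector_clip_int32_neon: clamp 32-bit integers from r1 to [r2, r3] and
@ store to r0; the element count is taken from the stack.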
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc