/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

        preserve8

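@ Zero one 8x8 block of 16-bit coefficients: eight 16-byte stores
@ through r0, which must be 16-byte aligned.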
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

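@ Same as above for six consecutive blocks (768 bytes at r0).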
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

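@ The pixels16* / pixels8* macros below are the dsputil put/avg pixel
@ primitives: r0 = dst, r1 = src, r2 = line stride, r3 = block height.
@ The _x2 / _y2 / _xy2 variants average horizontally, vertically or in
@ both directions for half-pel interpolation.  They are written in
@ terms of the avg, shrn and NRND helper macros, which the pixfunc
@ macro further down binds to rounding or no-rounding instructions;
@ avg=1 additionally averages the result with the existing dst pixels.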
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

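@ 16 pixels wide, averaging each pixel with its right neighbour.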
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

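@ 16 pixels wide, averaging each line with the next one.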
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

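@ 16 pixels wide, 2x2 average of the source (x and y half-pel); the
@ NRND lines add the +1 bias used only in the no-rounding variant.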
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

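@ 8-pixel-wide versions of the same operations; arguments as above.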
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

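@ pixfunc defines avg/shrn as rounding (vrhadd/vrshrn) or truncating
@ (vhadd/vshrn) according to rnd, makes NRND emit its argument only in
@ the no-rounding case, then expands the named macro inside
@ ff_<pfx><name><suf>_neon and purges the helpers again.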
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

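@ pixfunc2 emits both the rounding and the _no_rnd variant of a function.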
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

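@ The h264 qpel mc00 cases are straight copies/averages, so each stub
@ below just sets the fixed height in r3 and falls through into the
@ pixels function that the following pixfunc expands.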
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

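@ Clamp 64 16-bit coefficients at r0 to 0..255 and store them as an
@ 8x8 block of bytes: r1 = dst, r2 = line stride.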
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

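@ As above for signed output: saturate to -128..127, then add 128,
@ which is equivalent to clamping x+128 to 0..255.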
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

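@ Add 64 16-bit coefficients at r0 to the 8x8 block of pixels at r1
@ (line stride r2), clamping each sum to 0..255.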
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

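@ Elementwise single-precision multiply, dst[i] = src0[i] * src1[i]:
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (a multiple of 8).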
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[r0,:128]!
        bx              lr
endfunc

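@ MDCT-style windowed overlap-add of two vectors (cf. the C reference
@ vector_fmul_window): r0 = dst, r1 = src0, r2 = src1, r3 = window,
@ len on the stack; the loop works inwards from both ends at once.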
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

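@ Vorbis inverse channel coupling, converting magnitude/angle pairs
@ in place: r0 = mag, r1 = ang, r2 = element count (a multiple of 4).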
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

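@ dst[i] = src[i] * scalar.  With a hard-float ABI the scalar arrives
@ in s0 and len in r2; with soft-float it arrives in r2 and len in r3.
@ The VFP/NOVFP macros from asm.S select the right lines at build time.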
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

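@ dst[i] += src[i] * scalar, with the same VFP/NOVFP argument split.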
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

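@ In-place butterflies over two vectors of r2 floats (a multiple of 4):
@ v1[i] += v2[i] and v2[i] = old v1[i] - v2[i], with v1 = r0, v2 = r1.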
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

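@ Dot product of two float vectors of length r2 (a multiple of 4);
@ the result is returned in s0 (hard-float) or r0 (soft-float).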
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

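@ dst[i] = src0[i] * src1[len-1-i]: r0 = dst, r1 = src0, r2 = src1,
@ r3 = len (a multiple of 8).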
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

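@ dst[i] = src0[i] * src1[i] + src2[i]: r0 = dst, r1 = src0, r2 = src1,
@ r3 = src2, len on the stack (a multiple of 8).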
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

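@ Clamp each float in src to [min, max]: hard-float passes min/max in
@ s0/s1 with len in r2; soft-float passes them in r2/r3 with len on
@ the stack.  len is a multiple of 8.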
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

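@ Multiply samples by a symmetric 16-bit window, out = (x*w + 0x4000) >> 15
@ via the rounded doubling multiply vqrdmulh: r0 = dst, r1 = src,
@ r2 = first half of the window, r3 = total sample count (a multiple of 16).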
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

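@ Clamp each int32 in src to [min, max]: r0 = dst, r1 = src, r2 = min,
@ r3 = max, len on the stack (a multiple of 8).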
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc