/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

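@ ff_clear_block_neon:  zero one 8x8 block of 16-bit coefficients (128 bytes) at r0.
@ ff_clear_blocks_neon: zero six such blocks (768 bytes) at r0.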
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

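@ The pixelsNN macros below implement the put/avg pixel primitives:
@ plain copy, plus horizontal (_x2), vertical (_y2) and two-dimensional
@ (_xy2) half-pel interpolation.  The rnd argument selects rounded
@ averaging (vrhadd/vrshrn) or truncating averaging (vhadd/vshrn)
@ through the avg/shrn/NRND helper macros defined by pixfunc below.
@
@ pixels16: put/avg a 16-byte wide block.
@ r0 = dst, r1 = src, r2 = line_size, r3 = h (4 rows per iteration);
@ with avg=1 the result is averaged into the existing destination.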
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

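@ pixels16_x2: 16 wide, horizontal half-pel - each output pixel is the
@ average of a source pixel and its right-hand neighbour (2 rows per loop).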
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

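@ pixels16_y2: 16 wide, vertical half-pel - each output row is the
@ average of two consecutive source rows.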
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

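@ pixels16_xy2: 16 wide, 2D half-pel - each output pixel is the average of
@ a 2x2 source neighbourhood, computed with widening adds and a shift by 2.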
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

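@ pixels8: put/avg an 8-byte wide block (same register usage as pixels16).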
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

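@ pixels8_x2: 8 wide, horizontal half-pel.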
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

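@ pixels8_y2: 8 wide, vertical half-pel.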
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

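@ pixels8_xy2: 8 wide, 2D half-pel.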
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

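@ pixfunc expands one of the macros above into an exported function
@ ff_<pfx><name><suf>_neon, first defining the avg/shrn/NRND helpers
@ according to rnd (rounded vs. truncating averaging).  pixfunc2 emits
@ both the rounded and the _no_rnd variant of a function.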
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

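@ The h264 qpel mc00 entry points only set the block height in r3 and then
@ fall through into the plain put/avg pixels function emitted directly below.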
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

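@ ff_put_pixels_clamped_neon: convert an 8x8 block of 16-bit coefficients at
@ r0 to pixels, saturating to [0,255], and store 8 rows of 8 bytes at r1 with
@ stride r2.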
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

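@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients are
@ saturated to [-128,127] and offset by +128 before being stored.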
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

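@ ff_add_pixels_clamped_neon: add the 8x8 block of 16-bit coefficients at r0
@ to the existing pixels at r1 (stride r2), saturate to [0,255] and store
@ back in place (r3 keeps the original destination pointer).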
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

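@ ff_vector_fmul_window_neon: windowed overlap-add of src0 and a reversed
@ src1 (typically used for MDCT windowing).  r0 = dst, r1 = src0, r2 = src1,
@ r3 = win, with len passed on the stack.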
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
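@ ff_vorbis_inverse_coupling_neon: in-place magnitude/angle to channel
@ conversion for the Vorbis decoder.  r0 = mag, r1 = ang, r2 = element count.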
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

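@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar.  r0 = dst, r1 = src;
@ the scalar arrives in s0 with the length in r2 (hard-float VFP ABI), or in
@ r2 with the length in r3 (soft-float).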
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

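@ ff_butterflies_float_neon: in-place butterfly; v1[i] += v2[i] and
@ v2[i] = old v1[i] - v2[i].  r0 = v1, r1 = v2, r2 = len.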
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

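@ ff_scalarproduct_float_neon: dot product of two float vectors (r0, r1) of
@ length r2; the result is returned in s0 (hard-float) or r0 (soft-float).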
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

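@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = len.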
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

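@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, len on the stack.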
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

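@ ff_vector_clipf_neon: clamp floats from src (r1) into dst (r0) to the range
@ [min, max].  Hard-float: min/max in s0/s1 and len in r2; soft-float: min in
@ r2, max in r3, len on the stack.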
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

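@ ff_apply_window_int16_neon: multiply 16-bit samples by a symmetric
@ fixed-point window (vqrdmulh), walking the buffer from both ends.
@ r0 = dst, r1 = src, r2 = window, r3 = len.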
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

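@ ff_vector_clip_int32_neon: clamp 32-bit integers from src (r1) into dst (r0)
@ to the range [min (r2), max (r3)]; len on the stack.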
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc