/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

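@ Zero one 8x8 block of 16-bit coefficients (128 bytes).
@ r0 = block (16-byte aligned)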
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

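@ Zero six consecutive 8x8 blocks of 16-bit coefficients (768 bytes).
@ r0 = blocks (16-byte aligned)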
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

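@ Copy a 16-pixel-wide block, four rows per iteration; with avg=1 the
@ result is rounding-averaged (vrhadd) with the data already at dst.
@ r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = height (multiple of 4)
@ \rnd is unused here: a straight copy involves no rounding, so only one
@ variant of this macro is ever emitted.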
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
        vld1.64         {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

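@ Horizontal half-pel interpolation, 16 pixels wide, two rows per
@ iteration: each output is the average of src[x] and src[x+1] (vext
@ builds the byte-shifted vector; the avg helper rounds or truncates
@ according to \rnd).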
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

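@ Vertical half-pel interpolation, 16 pixels wide: each output is the
@ average of the same column in rows y and y+1; the previous row is
@ kept in registers so each source row is loaded only once.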
.macro  pixels16_y2     rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.64         {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.64         {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

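@ 2D half-pel interpolation, 16 pixels wide:
@ out = (a + b + c + d + 2) >> 2 with rounding (vrshrn), or
@ out = (a + b + c + d + 1) >> 2 in the no_rnd variant (the +1 bias is
@ kept in q13 and added before a plain vshrn). The widened horizontal
@ pair sums of the previous row are kept in q8-q11 so each row's sums
@ are computed only once.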
.macro  pixels16_xy2    rnd=1, avg=0
        vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
  .ifeq \rnd
        vmov.i16        q13, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
  .ifeq \rnd
        vadd.u16        q1,  q1,  q13
  .endif
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  .ifeq \rnd
        vadd.u16        q0,  q0,  q13
  .endif
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {q15},    [r0,:128], r2
        bgt             1b
        bx              lr
.endm

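@ 8-pixel-wide counterparts of the macros above: same register usage,
@ with 64-bit loads/stores and :64 destination alignment. This one is
@ the plain copy (or avg=1 rounding average), four rows per iteration.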
.macro  pixels8         rnd=1, avg=0
1:      vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
        vld1.64         {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        vst1.64         {d2},     [r0,:64], r2
        vst1.64         {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

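@ Horizontal half-pel, 8 pixels wide: vswp packs the two rows into one
@ q register pair so a single avg covers both rows at once.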
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.64         {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

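@ Vertical half-pel, 8 pixels wide, two rows per iteration.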
.macro  pixels8_y2      rnd=1, avg=0
        vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.64         {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.64         {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d4},     [r0,:64], r2
        vst1.64         {d5},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

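@ 2D half-pel, 8 pixels wide; same rounding scheme as pixels16_xy2
@ (bias in q11 for the no_rnd variant), with the widened horizontal
@ pair sums of the previous row carried across iterations in q8/q9.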
.macro  pixels8_xy2     rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
  .ifeq \rnd
        vmov.i16        q11, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.64         {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vst1.64         {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},     [r0,:64], r2
        bgt             1b
        bx              lr
.endm

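@ Emit ff_<pfx><name><suf>_neon from one of the macros above, binding
@ the local avg/shrn helpers to the rounding (vrhadd/vrshrn) or
@ truncating (vhadd/vshrn) instructions selected by \rnd.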
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
.endm

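@ Emit both the rounding variant and the truncating _no_rnd variant of
@ a pixel function.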
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

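@ mc00 means no subpel offset, so the H.264 qpel mc00 cases reduce to a
@ plain copy (put_) or average (avg_): load the block height into r3
@ and fall through into the pixels function that pixfunc emits
@ immediately below. The other *_mc00 stubs in this file work the same way.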
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

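@ Saturate an 8x8 block of 16-bit coefficients to unsigned 8-bit
@ (vqmovun.s16) and store it as 8 rows of 8 pixels.
@ r0 = block (16-byte aligned), r1 = pixels, r2 = line size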
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

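@ As above, but for coefficients centred on zero: narrow with signed
@ saturation to [-128,127] (vqmovn.s16), then add 128 (d31) to bias the
@ result into [0,255].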
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

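@ Add an 8x8 block of 16-bit coefficients to the existing pixels: widen
@ each row of pixels to 16 bits (vaddw.u8), add, then narrow back with
@ unsigned saturation (vqmovun.s16).
@ r0 = block, r1 = pixels (read via r1, written back via r3), r2 = line size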
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

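@ Elementwise single-precision multiply: dst[i] = src0[i] * src1[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (multiple of 8); blocks of
@ 16 go through the unrolled loop at 1:, an 8-element tail through 2:.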
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

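@ Overlap-add windowing (as used after an inverse MDCT): src0 and win
@ are read forward, src1 and the far end of win backward, producing one
@ output from each end per step: the forward half gets s0*wj - s1*wi,
@ the mirrored half s0*wi + s1*wj.
@ r0 = dst, r1 = src0, r2 = src1, r3 = win, len on the stack (multiple of 4)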
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
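@ Vorbis inverse channel coupling: rebuilds the two channels from the
@ magnitude/angle representation branch-free, using the sign bit of the
@ magnitude (mask in q10) and the sign of the angle to select between
@ sum and difference.
@ r0 = mag, r1 = ang, r2 = blocksize (multiple of 4)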
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

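@ dst[i] = src[i] * scalar.
@ VFP (hard-float) ABI: scalar in s0, len in r2; soft-float: scalar in
@ r2, len in r3. len is a multiple of 4; blocks of 16 use the unrolled
@ loop at 1:, the tail the short loop at 3:.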
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

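@ dst[i] += src[i] * scalar, with the same VFP/NOVFP ABI handling as
@ above; acc is a second pointer into dst used for the reads.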
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

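@ In-place butterfly: t = v1[i] - v2[i]; v1[i] += v2[i]; v2[i] = t.
@ r0 = v1, r1 = v2, r2 = len (multiple of 4)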
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

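@ Dot product: returns the sum of v1[i]*v2[i] over len elements.
@ r0 = v1, r1 = v2, r2 = len (multiple of 4); the four partial sums in
@ q2 are reduced with vadd/vpadd, and the soft-float ABI copies the
@ result to r0.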
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

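@ dst[i] = src0[i] * src1[len-1-i]: src1 is walked backward 32 bytes at
@ a time and its elements reversed with vrev64.32.
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (multiple of 8)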
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

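@ dst[i] = src0[i]*src1[i] + src2[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, len on the stack (multiple of 8)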
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

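@ Clamp each float to [min, max] (vmin against the max, then vmax
@ against the min).
@ VFP ABI: min in s0, max in s1, len in r2; soft-float: min in r2,
@ max in r3, len on the stack. len is a multiple of 8.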
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

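@ Apply a symmetric int16 window with saturating rounding fixed-point
@ multiplies (vqrdmulh): 8 samples from the front and 8 from the back
@ per iteration, so only the first half of the window is loaded and is
@ reused (reversed via vrev64.16) for the mirrored samples.
@ r0 = dst, r1 = src, r2 = window, r3 = length (multiple of 16)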
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

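@ Clamp each int32 to [min, max].
@ r0 = dst, r1 = src, r2 = min, r3 = max, len on the stack (multiple of 8)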
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc