/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

        preserve8

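/* void ff_clear_block_neon(int16_t *block)
 * Zeroes one 8x8 block of 16-bit coefficients (128 bytes) with eight
 * 128-bit stores of q0.  r0 = block, which the :128 alignment qualifier
 * requires to be 16-byte aligned. */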
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

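/* void ff_clear_blocks_neon(int16_t *blocks)
 * Same as above for six consecutive 8x8 blocks (one macroblock's worth
 * of coefficients), i.e. 6 * 128 bytes starting at r0. */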
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

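/* put/avg pixel primitives for motion compensation, instantiated as
 * exported functions by the pixfunc macros further down.  Register
 * layout follows the dsputil op_pixels_func interface of this era,
 * void (*op)(uint8_t *dst, const uint8_t *src, int line_size, int h):
 * r0 = dst, r1 = src, r2 = line_size, r3 = h (a multiple of 4 for the
 * plain copies, of 2 for the interpolating variants).  \avg selects
 * read-modify-write averaging into dst; \rnd selects the rounding mode
 * via the avg/shrn/NRND helper macros bound in pixfunc. */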
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

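/* pixels16_x2: horizontal half-pel interpolation.  Each output byte is
 * avg(src[x], src[x+1]); 24 bytes are loaded into d0-d2 to cover the 17
 * needed, and the one-byte-shifted copy is formed with vext. */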
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

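/* pixels16_y2: vertical half-pel interpolation, avg(row[y], row[y+1]).
 * The last two source rows stay live in q0/q1 so each row is loaded
 * only once; the code after the loop handles the final output pair. */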
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

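/* pixels16_xy2: half-pel in both directions.  Each output byte is
 * (a + b + c + d + 2) >> 2 over the 2x2 source neighbourhood in the
 * rounded variant (vrshrn supplies the +2); the no-rounding variant
 * truncates (vshrn) and instead adds the q13 = #1 bias through the
 * NRND-guarded instructions, giving (sum + 1) >> 2.  The widened
 * horizontal pair sums (vaddl.u8) are carried across iterations so each
 * row is summed only once. */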
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

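/* 8-pixel-wide versions of the four macros above, working in d registers
 * instead of q registers; the structure is otherwise identical. */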
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

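/* pixfunc instantiates one of the pixels* macros as an exported
 * function, binding the avg/shrn/NRND helpers to the requested rounding
 * mode:
 *   rnd=1: avg = vrhadd.u8 (round-half-up average), shrn = vrshrn.u16
 *          (rounding narrowing shift), NRND expands to nothing.
 *   rnd=0: avg = vhadd.u8, shrn = vshrn.u16 (both truncating), and NRND
 *          emits its argument, enabling the +1 bias adds in the xy2
 *          paths.
 * pixfunc2 emits both the rounded and the _no_rnd variant. */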
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

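/* The h264 qpel mc00 (full-pel) cases are plain 16x16 or 8x8 copies, so
 * each of these stubs only sets the height in r3 and then deliberately
 * falls through (note the missing return) into the put/avg pixels
 * function instantiated immediately below it.  The same trick is used
 * for all four qpel16/qpel8 put/avg stubs in this block. */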
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

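/* void ff_put_pixels_clamped_neon(const int16_t *block, uint8_t *pixels,
 *                                 int line_size)
 * Converts an 8x8 block of 16-bit IDCT coefficients to bytes with
 * unsigned saturation (vqmovun) and stores them row by row.
 * r0 = block, r1 = pixels, r2 = line_size. */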
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

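/* void ff_put_signed_pixels_clamped_neon(const int16_t *block,
 *                                        uint8_t *pixels, int line_size)
 * As above, but saturates to signed 8-bit (vqmovn) and then adds the
 * +128 bias held in d31, mapping [-128,127] to [0,255]. */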
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

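/* void ff_add_pixels_clamped_neon(const int16_t *block, uint8_t *pixels,
 *                                 int line_size)
 * Adds the 8x8 coefficient block to the existing pixels (vaddw.u8
 * widens each pixel row to 16 bits) and narrows back with unsigned
 * saturation.  r3 keeps a second pixel pointer so the loads through r1
 * can run ahead of the stores. */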
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

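/* void ff_vector_fmul_window_neon(float *dst, const float *src0,
 *                                 const float *src1, const float *win,
 *                                 int len)
 * Overlap windowing as used for MDCT.  A scalar sketch of what the loop
 * below computes (win holds 2*len coefficients; r0-r3 = dst, src0,
 * src1, win, with len taken from the stack into lr):
 *     for (i = 0, j = len - 1; j >= 0; i++, j--) {
 *         dst[i]       = src0[i] * win[len + j] - src1[j] * win[i];
 *         dst[len + j] = src0[i] * win[i]       + src1[j] * win[len + j];
 *     }
 * src1 and the second halves of win and dst are walked backwards
 * (r5 = -16) and reversed within each quad by vrev64. */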
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
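/* void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
 *                                      intptr_t blocksize)
 * In-place magnitude/angle decoupling for Vorbis.  The data-dependent
 * branches of the C reference are replaced by sign-bit arithmetic:
 * q10 holds the sign mask, ang is conditionally negated by mag's sign
 * (veor), and vcle/vand/vbic route the result into an add on one output
 * and a subtract on the other.  r0 = mag, r1 = ang, r2 = blocksize
 * (a multiple of 4; the 3: tail handles a final lone quad). */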
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

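/* void ff_vector_fmul_scalar_neon(float *dst, const float *src,
 *                                 float mul, int len)
 * dst[i] = src[i] * mul.  The VFP/NOVFP prefixes select the calling
 * convention: with a hard-float ABI mul arrives in s0 (d0[0]) and len
 * in r2; with soft-float, mul's bits are in r2 and len in r3.  Groups
 * of 16 elements run through the unrolled 1: loop; the remainder (len
 * assumed a multiple of 4) through 3:. */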
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

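/* void ff_butterflies_float_neon(float *v1, float *v2, int len)
 * In-place butterfly, four elements per iteration:
 *     t = v1[i] - v2[i];  v1[i] += v2[i];  v2[i] = t; */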
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

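/* float ff_scalarproduct_float_neon(const float *v1, const float *v2,
 *                                   int len)
 * Accumulates v1[i]*v2[i] in q2, then reduces the four partial sums
 * with vadd/vpadd.  The result is left in d0 for hard-float callers;
 * NOVFP copies it to r0 for the soft-float return convention. */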
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

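/* void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
 *                                  const float *src1, int len)
 * dst[i] = src0[i] * src1[len-1-i]: src1 is walked backwards in 32-byte
 * steps (r12 = -32) and reversed within each octet by vrev64 plus the
 * crossed d-register operands. */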
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

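/* void ff_vector_fmul_add_neon(float *dst, const float *src0,
 *                              const float *src1, const float *src2,
 *                              int len)
 * dst[i] = src0[i] * src1[i] + src2[i], eight elements per iteration;
 * len comes from the stack. */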
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

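/* void ff_vector_clipf_neon(float *dst, const float *src, float min,
 *                           float max, int len)
 * Clamps src into [min, max] with vmin/vmax, eight elements per
 * iteration.  Hard-float: min/max arrive in s0/s1 (the two halves of
 * d0) with len in r2; soft-float: min/max in r2/r3 with len on the
 * stack. */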
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

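/* void ff_apply_window_int16_neon(int16_t *output, const int16_t *input,
 *                                 const int16_t *window, unsigned n)
 * output[i] = (input[i] * window[i] + (1 << 14)) >> 15 with saturation,
 * which is exactly what vqrdmulh.s16 computes.  The window is assumed
 * symmetric, so it is read only from the front (r2) while input and
 * output are walked from both ends (r1/r0 forwards, r4/lr backwards,
 * the window reversed for the back half by vrev64 and the swapped
 * d6/d7 operands); n is assumed a multiple of 16. */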
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

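/* void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src,
 *                                int32_t min, int32_t max, unsigned len)
 * Clamps 32-bit integers into [min, max], eight per iteration; len
 * comes from the stack. */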
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc