2 * ARM NEON optimised Float DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
// ff_vector_fmul_neon: element-wise product of two single-precision arrays.
// The visible instructions process 16 floats: [x0][i] = [x1][i] * [x2][i].
//   x0  store pointer                 (post-incremented by 64 bytes in total)
//   x1  load pointer, first operand   (post-incremented by 64 bytes in total)
//   x2  load pointer, second operand  (post-incremented by 64 bytes in total)
// The visible lines touch only v0-v7 and v16-v19.
// NOTE(review): no loop counter, branch or return is visible here, and the
// number embedded at the start of each line skips from 26 to 28, so
// instructions appear to be missing from this view -- confirm against the
// complete file before relying on this description.
26 function ff_vector_fmul_neon, export=1
28 ld1 {v0.4S, v1.4S}, [x1], #32 // first operand, floats 0..7
29 ld1 {v2.4S, v3.4S}, [x1], #32 // first operand, floats 8..15
30 ld1 {v4.4S, v5.4S}, [x2], #32 // second operand, floats 0..7
31 ld1 {v6.4S, v7.4S}, [x2], #32 // second operand, floats 8..15
32 fmul v16.4S, v0.4S, v4.4S // lane-wise products, 4 floats per register
33 fmul v17.4S, v1.4S, v5.4S
34 fmul v18.4S, v2.4S, v6.4S
35 fmul v19.4S, v3.4S, v7.4S
36 st1 {v16.4S, v17.4S}, [x0], #32 // results 0..7
37 st1 {v18.4S, v19.4S}, [x0], #32 // results 8..15
// ff_vector_fmac_scalar_neon: multiply-accumulate by a scalar into a buffer.
// The visible instructions process 16 floats: acc[i] += [x1][i] * v0.S[0],
// where acc is read from and written back through x0.
//   x0       accumulator pointer (loaded first, then stored)
//   x1       load pointer for the values being scaled (advances 64 bytes)
//   v0.S[0]  multiplier; only lane 0 of v0 is read
//   x3       post-index step applied to x0 after the second accumulator load
// NOTE(review): x3 is not set in the visible lines. The stores below only land
// on the same 64 bytes that were just loaded if x3 == -32 (rewinding x0 to
// where it started) -- confirm against the complete file.
// NOTE(review): loop control and the return are not visible in this view.
42 function ff_vector_fmac_scalar_neon, export=1
45 ld1 {v16.4S, v17.4S}, [x0], #32 // accumulators 0..7, x0 += 32
46 ld1 {v18.4S, v19.4S}, [x0], x3 // accumulators 8..15, then x0 += x3
47 ld1 {v4.4S, v5.4S}, [x1], #32 // inputs 0..7
48 ld1 {v6.4S, v7.4S}, [x1], #32 // inputs 8..15
49 fmla v16.4S, v4.4S, v0.S[0] // acc += input * multiplier (by-element form)
50 fmla v17.4S, v5.4S, v0.S[0]
51 fmla v18.4S, v6.4S, v0.S[0]
52 fmla v19.4S, v7.4S, v0.S[0]
53 st1 {v16.4S, v17.4S}, [x0], #32 // write back results 0..7
54 st1 {v18.4S, v19.4S}, [x0], #32 // write back results 8..15
// ff_vector_fmul_scalar_neon: scale single-precision values by v16.
// Every multiply uses all four lanes of v16 as the second operand.
//   x0   store pointer
//   x1   load pointer
//   v16  multiplier vector
// The main sequence interleaves loads, multiplies and stores for 16 floats
// (v0-v3); the sequence at label 3 handles 4 floats at a time.
// NOTE(review): v16 is not written in the visible lines; presumably the scalar
// argument is broadcast into it beforehand -- confirm.
// NOTE(review): the embedded line numbers have gaps (59 -> 64, 64 -> 66,
// 71 -> 73, 74 -> 77, 81 -> 87), and no counter update, branch or return is
// visible. The control flow between the sequences below cannot be determined
// from this view.
59 function ff_vector_fmul_scalar_neon, export=1
64 ld1 {v0.4S, v1.4S}, [x1], #32 // inputs 0..7
66 fmul v0.4S, v0.4S, v16.4S // scale in place
67 ld1 {v2.4S, v3.4S}, [x1], #32 // inputs 8..15, issued between multiplies
68 fmul v1.4S, v1.4S, v16.4S
69 fmul v2.4S, v2.4S, v16.4S
70 st1 {v0.4S, v1.4S}, [x0], #32 // results 0..7; v0/v1 are free after this
71 fmul v3.4S, v3.4S, v16.4S
73 ld1 {v0.4S, v1.4S}, [x1], #32 // next 8 inputs into the freed v0/v1
74 st1 {v2.4S, v3.4S}, [x0], #32 // results 8..15
// NOTE(review): same store as the line above. Presumably this is an alternate
// exit path reached via a branch that is not visible, rather than by
// fall-through (which would write v2/v3 twice) -- confirm.
77 st1 {v2.4S, v3.4S}, [x0], #32
// 4-float step. The branch that targets local label 3 is not visible here.
79 3: ld1 {v0.4S}, [x1], #16 // 4 inputs
80 fmul v0.4S, v0.4S, v16.4S
81 st1 {v0.4S}, [x0], #16 // 4 results
// ff_vector_dmul_scalar_neon: scale double-precision values by v16.
// Each register holds two doubles (.2D), so every 32-byte load or store moves
// four doubles; the visible sequence multiplies eight doubles (v0-v3).
//   x0   store pointer
//   x1   load pointer
//   v16  multiplier vector (both lanes used)
// NOTE(review): v16 is not written in the visible lines; presumably the scalar
// argument is broadcast into it beforehand -- confirm.
// NOTE(review): the embedded line numbers have gaps (87 -> 89, 89 -> 91,
// 98 -> 103), and no counter update, branch or return is visible here.
87 function ff_vector_dmul_scalar_neon, export=1
89 ld1 {v0.2D, v1.2D}, [x1], #32 // inputs 0..3
91 fmul v0.2D, v0.2D, v16.2D // scale in place
92 ld1 {v2.2D, v3.2D}, [x1], #32 // inputs 4..7, issued between multiplies
93 fmul v1.2D, v1.2D, v16.2D
94 fmul v2.2D, v2.2D, v16.2D
95 st1 {v0.2D, v1.2D}, [x0], #32 // results 0..3; v0/v1 are free after this
96 fmul v3.2D, v3.2D, v16.2D
97 ld1 {v0.2D, v1.2D}, [x1], #32 // next 4 inputs into the freed v0/v1
98 st1 {v2.2D, v3.2D}, [x0], #32 // results 4..7
// ff_vector_fmul_window_neon: windowed combination of two float inputs.
// Per 4-float step the visible arithmetic produces two result vectors:
//   v16 = s0 * wj_r - s1_r * wi        stored through x0 (x0 += 16)
//   v17 = (s0 * wi)_rev + s1 * wj      stored through x5 (x5 += x7)
// Pointer roles as used by the visible loads and stores:
//   x0  result pointer, steps forward by 16 bytes
//   x1  s0 load pointer, steps forward by 16 bytes
//   x3  wi load pointer, steps forward by 16 bytes
//   x2  s1 load pointer, steps by x7
//   x6  wj load pointer (derived from x3 below), steps by x7
//   x5  offset on entry to this sequence; afterwards the second result pointer
// NOTE(review): x5 (as offset) and x7 (as step) are not set in the visible
// lines. The existing comments place x2/x6/x5 near the end of their buffers,
// so x7 is presumably negative -- confirm.
// NOTE(review): v4 and v5 are only seen as operands of ext; the instructions
// that derive them from s1 and wj are not visible. ext #8 alone swaps the two
// 64-bit halves of a register, so a full 4-lane reversal needs a further step
// that is not shown here -- confirm.
// NOTE(review): loop control and the return are not visible in this view.
103 function ff_vector_fmul_window_neon, export=1
107 add x2, x2, x5, lsl #2 // src1 + 4 * (len - 4)
108 add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
109 add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
111 ld1 {v0.4S}, [x1], #16 // s0
112 ld1 {v2.4S}, [x3], #16 // wi
113 ld1 {v1.4S}, [x2], x7 // s1
114 1: ld1 {v3.4S}, [x6], x7 // wj
116 fmul v17.4S, v0.4S, v2.4S // s0 * wi
120 ext v4.16B, v4.16B, v4.16B, #8 // s1_r
121 ext v5.16B, v5.16B, v5.16B, #8 // wj_r
122 ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
123 fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
124 fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
// v0 has been fully consumed above, so the next s0 can be loaded now.
126 ld1 {v0.4S}, [x1], #16
127 fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
128 st1 {v17.4S}, [x5], x7
// v2 and v1 were last read by the fmls/fmla above; load the next wi and s1.
129 ld1 {v2.4S}, [x3], #16
130 ld1 {v1.4S}, [x2], x7
131 st1 {v16.4S}, [x0], #16
// NOTE(review): the three lines below repeat the final subtract and both
// stores without any reloads. Presumably this is the exit path for the last
// step, reached via a branch that is not visible -- confirm.
134 fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
135 st1 {v17.4S}, [x5], x7
136 st1 {v16.4S}, [x0], #16
// ff_vector_fmul_add_neon: fused multiply-add of three single-precision arrays.
// The visible arithmetic handles 8 floats per step:
//   [x0][i] = [x1][i] * [x2][i] + [x3][i]
//   x0  store pointer
//   x1  load pointer, first factor
//   x2  load pointer, second factor
//   x3  load pointer, addend (loaded into v4/v5, which then accumulate)
// All four pointers advance by 32 bytes per access.
// NOTE(review): the embedded line numbers have gaps (143 -> 145, 146 -> 148,
// 151 -> 153), and no counter update, branch or return is visible. The branch
// that targets local label 2 is also not shown -- confirm against the
// complete file.
140 function ff_vector_fmul_add_neon, export=1
141 ld1 {v0.4S, v1.4S}, [x1], #32 // first factor, 8 floats
142 ld1 {v2.4S, v3.4S}, [x2], #32 // second factor, 8 floats
143 ld1 {v4.4S, v5.4S}, [x3], #32 // addend, 8 floats
145 fmla v4.4S, v0.4S, v2.4S // addend += factor0 * factor1
146 fmla v5.4S, v1.4S, v3.4S
148 ld1 {v0.4S, v1.4S}, [x1], #32 // next factors, loaded before the store
149 ld1 {v2.4S, v3.4S}, [x2], #32
150 st1 {v4.4S, v5.4S}, [x0], #32 // store results, freeing v4/v5
151 ld1 {v4.4S, v5.4S}, [x3], #32 // next addend into the freed registers
153 2: st1 {v4.4S, v5.4S}, [x0], #32 // store of v4/v5 at label 2
// ff_vector_fmul_reverse_neon: multiply one float array by another that is
// walked from the far end.
// The visible arithmetic handles 8 floats per step. The operands are paired
// across registers (v0 with v3, v1 with v2) after the 64-bit halves of v2 and
// v3 have been swapped.
//   x0  store pointer, advances 32 bytes per store
//   x1  load pointer for the forward operand, advances 32 bytes per load
//   x2  load pointer for the other operand; offset by x3 * 4 bytes below,
//       then stepped by x4 after each load
//   x3  element offset applied to x2
// NOTE(review): x4 is not set in the visible lines; presumably a negative
// step so x2 walks backwards -- confirm.
// NOTE(review): ext #8 only swaps register halves; the step that would swap
// the two floats inside each half is not visible here -- confirm.
// NOTE(review): the embedded line numbers have gaps (157 -> 159, 159 -> 162,
// 163 -> 167), and no counter update, branch or return is visible. The branch
// that targets local label 2 is also not shown.
157 function ff_vector_fmul_reverse_neon, export=1
159 add x2, x2, x3, lsl #2 // x2 += x3 * 4 bytes
162 ld1 {v2.4S, v3.4S}, [x2], x4 // 8 floats via x2, then x2 += x4
163 ld1 {v0.4S, v1.4S}, [x1], #32 // 8 floats via x1
167 ext v3.16B, v3.16B, v3.16B, #8 // swap the 64-bit halves of v3
168 ext v2.16B, v2.16B, v2.16B, #8 // swap the 64-bit halves of v2
169 fmul v16.4S, v0.4S, v3.4S // first 4 of x1 with the upper register from x2
170 fmul v17.4S, v1.4S, v2.4S // next 4 of x1 with the lower register from x2
172 ld1 {v2.4S, v3.4S}, [x2], x4 // next operands, loaded before the store
173 ld1 {v0.4S, v1.4S}, [x1], #32
174 st1 {v16.4S, v17.4S}, [x0], #32 // 8 results
176 2: st1 {v16.4S, v17.4S}, [x0], #32 // store of v16/v17 at label 2
// ff_butterflies_float_neon: sum/difference of two single-precision vectors.
// For the 4 floats held in v0 and v1, the visible instructions write
//   v0 + v1 through x0   and   v0 - v1 through x1,
// advancing each pointer by 16 bytes.
// NOTE(review): the loads that fill v0 and v1 are not visible (the embedded
// line numbers skip from 180 to 184); presumably they read from x0 and x1 so
// the operation is in place -- confirm.
// NOTE(review): loop control and the return are not visible in this view.
180 function ff_butterflies_float_neon, export=1
184 fsub v2.4S, v0.4S, v1.4S // difference: v0 - v1
185 fadd v3.4S, v0.4S, v1.4S // sum: v0 + v1
186 st1 {v2.4S}, [x1], #16 // difference goes out through x1
187 st1 {v3.4S}, [x0], #16 // sum goes out through x0
// ff_scalarproduct_float_neon: dot product of two single-precision arrays.
// Each visible step loads 4 floats via x0 and 4 via x1 (both pointers advance
// by 16 bytes) and accumulates the lane-wise products into v2.
// NOTE(review): the initialisation of v2, the loop counter and the branch back
// to label 1 are not visible (the embedded line numbers skip 193, 196 and
// 198), and the function may continue past the end of this view -- confirm
// against the complete file.
192 function ff_scalarproduct_float_neon, export=1
194 1: ld1 {v0.4S}, [x0], #16 // 4 floats from the first array
195 ld1 {v1.4S}, [x1], #16 // 4 floats from the second array
197 fmla v2.4S, v0.4S, v1.4S // v2 += v0 * v1, four partial sums
// Pairwise add: v0 = { v2[0]+v2[1], v2[2]+v2[3], v2[0]+v2[1], v2[2]+v2[3] }.
// Two partial sums remain after this step; the final reduction to a single
// value is not visible here.
199 faddp v0.4S, v2.4S, v2.4S