git.sesse.net Git - ffmpeg/blob - libavutil/arm/float_dsp_neon.S
Merge commit 'e39a9212ab37a55b346801c77487d8a47b6f9fe2'
[ffmpeg] / libavutil / arm / float_dsp_neon.S
1 /*
2  * ARM NEON optimised Float DSP functions
3  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "config.h"
23 #include "asm.S"
24
/*
 * ff_vector_fmul_neon: element-wise product of two float vectors.
 *
 * Register use, as read from the code below:
 *   r0 = destination pointer (stores, post-incremented)
 *   r1 = first source pointer (loads, post-incremented)
 *   r2 = second source pointer (loads, post-incremented)
 *   r3 = element count
 *   ip = scratch counter for the 16-element loop
 * Every load and store carries a :128 alignment qualifier, so all three
 * pointers are expected to be 16-byte aligned.
 *
 * The routine is software pipelined: the products of one group of 8
 * elements are kept pending in q8/q9 and stored while the next group is
 * being loaded and multiplied.
 *
 * NOTE(review): the first 8 elements are loaded before the count is
 * examined and the rest is handled in steps of 16 and 8, so the count
 * is presumably required to be a non-zero multiple of 8 -- confirm
 * against the C-side documentation.
 */
25 function ff_vector_fmul_neon, export=1
           /* r3 = elements left after the first group of 8.  The beq two
            * instructions after the multiplies tests the flags set here. */
26         subs            r3,  r3,  #8
           /* Prime the pipeline: q8,q9 = products of the first 8 elements. */
27         vld1.32         {d0-d3},  [r1,:128]!
28         vld1.32         {d4-d7},  [r2,:128]!
29         vmul.f32        q8,  q0,  q2
30         vmul.f32        q9,  q1,  q3
           /* Nothing left after the first 8: go store q8,q9 and return. */
31         beq             3f
           /* ip = remaining count rounded down to a multiple of 16. */
32         bics            ip,  r3,  #15
           /* Fewer than 16 remaining: skip the 16-element loop. */
33         beq             2f
           /* Main loop, 16 elements per pass.  Each pass stores the 8 pending
            * products (q8,q9) and 8 new ones (q10,q11), and leaves the 8
            * products after those pending in q8,q9. */
34 1:      subs            ip,  ip,  #16
35         vld1.32         {d0-d1},  [r1,:128]!
36         vld1.32         {d4-d5},  [r2,:128]!
37         vmul.f32        q10, q0,  q2
38         vld1.32         {d2-d3},  [r1,:128]!
39         vld1.32         {d6-d7},  [r2,:128]!
40         vmul.f32        q11, q1,  q3
41         vst1.32         {d16-d19},[r0,:128]!
42         vld1.32         {d0-d1},  [r1,:128]!
43         vld1.32         {d4-d5},  [r2,:128]!
44         vmul.f32        q8,  q0,  q2
45         vld1.32         {d2-d3},  [r1,:128]!
46         vld1.32         {d6-d7},  [r2,:128]!
47         vmul.f32        q9,  q1,  q3
48         vst1.32         {d20-d23},[r0,:128]!
           /* Tests the flags from the subs at the top of the loop. */
49         bne             1b
           /* r3 = remaining count modulo 16; zero means only the pending
            * q8,q9 are left to store. */
50         ands            r3,  r3,  #15
51         beq             3f
           /* One more group of 8: store the pending products register by
            * register while the next 8 products replace them in q8,q9. */
52 2:      vld1.32         {d0-d1},  [r1,:128]!
53         vld1.32         {d4-d5},  [r2,:128]!
54         vst1.32         {d16-d17},[r0,:128]!
55         vmul.f32        q8,  q0,  q2
56         vld1.32         {d2-d3},  [r1,:128]!
57         vld1.32         {d6-d7},  [r2,:128]!
58         vst1.32         {d18-d19},[r0,:128]!
59         vmul.f32        q9,  q1,  q3
           /* Flush the last 8 pending products and return. */
60 3:      vst1.32         {d16-d19},[r0,:128]!
61         bx              lr
62 endfunc
63
/*
 * ff_vector_fmac_scalar_neon: multiply a float vector by a scalar and
 * accumulate the result into the destination (dst[i] += src[i] * mul).
 *
 * Register use, as read from the code below:
 *   r0  = destination pointer, used for the stores
 *   acc = copy of r0, used to load the old destination values
 *   r1  = source pointer
 *   len = element count
 *   r12 = len rounded down to a multiple of 16 (main-loop counter)
 *   q15 = the scalar broadcast to all four lanes
 * The VFP/NOVFP line prefixes are defined outside this file section
 * (asm.S); presumably they select the hard-float layout (scalar in
 * d0[0], len in r2) and the soft-float layout (scalar in r2, len in r3).
 * All loads and stores carry a :128 alignment qualifier.
 *
 * NOTE(review): when len < 16 on entry (including len == 0) control goes
 * straight to label 3, which always processes at least one group of 4;
 * the tail loop also rounds the count up to a multiple of 4.  Confirm
 * the length contract on the C side.
 */
64 function ff_vector_fmac_scalar_neon, export=1
65 VFP     len .req r2
66 VFP     acc .req r3
67 NOVFP   len .req r3
68 NOVFP   acc .req r2
           /* Broadcast the scalar first: in the NOVFP layout acc aliases r2,
            * which still holds the scalar until the mov below. */
69 VFP     vdup.32         q15, d0[0]
70 NOVFP   vdup.32         q15, r2
           /* r12 = number of elements handled by the 16-wide loop. */
71         bics            r12, len, #15
           /* mov does not alter the flags tested by the beq that follows. */
72         mov             acc, r0
73         beq             3f
           /* Prime the pipeline with the first two groups of 4. */
74         vld1.32         {q0},     [r1,:128]!
75         vld1.32         {q8},     [acc,:128]!
76         vld1.32         {q1},     [r1,:128]!
77         vld1.32         {q9},     [acc,:128]!
           /* Main loop, 16 elements per pass: q8..q11 accumulate four groups
            * while loads for later groups and stores of finished groups are
            * interleaved with the multiply-accumulates. */
78 1:      vmla.f32        q8,  q0,  q15
79         vld1.32         {q2},     [r1,:128]!
80         vld1.32         {q10},    [acc,:128]!
81         vmla.f32        q9,  q1,  q15
82         vld1.32         {q3},     [r1,:128]!
83         vld1.32         {q11},    [acc,:128]!
84         vmla.f32        q10, q2,  q15
85         vst1.32         {q8},     [r0,:128]!
86         vmla.f32        q11, q3,  q15
87         vst1.32         {q9},     [r0,:128]!
88         subs            r12, r12, #16
           /* Last pass: leave without loading beyond the 16-wide region. */
89         beq             2f
           /* Load the first two groups of the next pass while storing the
            * last two results of this one. */
90         vld1.32         {q0},     [r1,:128]!
91         vld1.32         {q8},     [acc,:128]!
92         vst1.32         {q10},    [r0,:128]!
93         vld1.32         {q1},     [r1,:128]!
94         vld1.32         {q9},     [acc,:128]!
95         vst1.32         {q11},    [r0,:128]!
96         b               1b
           /* Flush the two results still pending from the final pass. */
97 2:      vst1.32         {q10},    [r0,:128]!
98         vst1.32         {q11},    [r0,:128]!
           /* len = leftover elements (len mod 16); return if there are none.
            * The "it eq" makes the conditional bx valid in Thumb-2 builds. */
99         ands            len, len, #15
100         it              eq
101         bxeq            lr
            /* Tail loop: 4 elements per pass while len stays above zero. */
102 3:      vld1.32         {q0},     [r1,:128]!
103         vld1.32         {q8},     [acc,:128]!
104         vmla.f32        q8,  q0,  q15
105         vst1.32         {q8},     [r0,:128]!
106         subs            len, len, #4
107         bgt             3b
108         bx              lr
            /* Release both register aliases so neither name leaks into the
             * code that follows this function. */
109         .unreq          len
            .unreq          acc
110 endfunc
111
/*
 * ff_vector_fmul_scalar_neon: multiply a float vector by a scalar
 * (dst[i] = src[i] * mul).
 *
 * Register use, as read from the code below:
 *   r0  = destination pointer (stores, post-incremented)
 *   r1  = source pointer (loads, post-incremented)
 *   len = element count
 *   r12 = len rounded down to a multiple of 16 (main-loop counter)
 *   q8  = the scalar broadcast to all four lanes
 * The VFP/NOVFP line prefixes are defined outside this file section
 * (asm.S); presumably they select the hard-float layout (scalar in
 * d0[0], len in r2) and the soft-float layout (scalar in r2, len in r3).
 * All loads and stores carry a :128 alignment qualifier.
 *
 * NOTE(review): when len < 16 on entry (including len == 0) control goes
 * straight to label 3, which always processes at least one group of 4;
 * the tail loop also rounds the count up to a multiple of 4.  Confirm
 * the length contract on the C side.
 */
112 function ff_vector_fmul_scalar_neon, export=1
113 VFP     len .req r2
114 NOVFP   len .req r3
115 VFP     vdup.32         q8,  d0[0]
116 NOVFP   vdup.32         q8,  r2
            /* r12 = number of elements handled by the 16-wide loop; skip
             * that loop entirely when it is zero. */
117         bics            r12, len, #15
118         beq             3f
            /* Prime the pipeline with the first two groups of 4. */
119         vld1.32         {q0},[r1,:128]!
120         vld1.32         {q1},[r1,:128]!
            /* Main loop, 16 elements per pass, with loads and stores
             * interleaved between the four multiplies. */
121 1:      vmul.f32        q0,  q0,  q8
122         vld1.32         {q2},[r1,:128]!
123         vmul.f32        q1,  q1,  q8
124         vld1.32         {q3},[r1,:128]!
125         vmul.f32        q2,  q2,  q8
126         vst1.32         {q0},[r0,:128]!
127         vmul.f32        q3,  q3,  q8
128         vst1.32         {q1},[r0,:128]!
129         subs            r12, r12, #16
            /* Last pass: leave without loading beyond the 16-wide region. */
130         beq             2f
            /* Load the first two groups of the next pass while storing the
             * last two results of this one. */
131         vld1.32         {q0},[r1,:128]!
132         vst1.32         {q2},[r0,:128]!
133         vld1.32         {q1},[r1,:128]!
134         vst1.32         {q3},[r0,:128]!
135         b               1b
            /* Flush the two results still pending from the final pass. */
136 2:      vst1.32         {q2},[r0,:128]!
137         vst1.32         {q3},[r0,:128]!
            /* len = leftover elements (len mod 16); return if there are none.
             * The "it eq" makes the conditional bx valid in Thumb-2 builds. */
138         ands            len, len, #15
139         it              eq
140         bxeq            lr
            /* Tail loop: 4 elements per pass while len stays above zero. */
141 3:      vld1.32         {q0},[r1,:128]!
142         vmul.f32        q0,  q0,  q8
143         vst1.32         {q0},[r0,:128]!
144         subs            len, len, #4
145         bgt             3b
146         bx              lr
147         .unreq          len
148 endfunc
149
/*
 * ff_vector_fmul_window_neon: windowed combination of two float vectors,
 * producing output from both ends of the destination towards the middle.
 *
 * Register use, as read from the code below:
 *   r0 = destination, written forwards
 *   ip = r0 + 8*len - 16, written backwards
 *   r1 = first source, read forwards
 *   r2 = second source + 4*len - 16, read backwards
 *   r3 = window, read forwards
 *   r4 = r3 + 8*len - 16, read backwards
 *   lr = len, fetched from the stack (the word just above the three
 *        registers pushed on entry)
 *   r5 = first a scratch value (len - 2), then -16, the post-index step
 *        of the three backward pointers
 *
 * Per group of four elements, with s0 = first-source values, s1 =
 * second-source values in reversed order, wf = forward window values and
 * wb = backward window values in reversed order:
 *   forward output  (q10) = s0 * wb - s1 * wf
 *   backward output (q11) = s0 * wf + s1 * wb, reversed before the store
 *
 * NOTE(review): the loop subtracts 4 from len per pass and leaves only on
 * exactly zero, so len is presumably a non-zero multiple of 4 -- confirm
 * against the C-side documentation.
 */
150 function ff_vector_fmul_window_neon, export=1
151         push            {r4,r5,lr}
            /* Three registers were pushed, so the first stack argument now
             * sits at sp + 12. */
152         ldr             lr,  [sp, #12]
            /* r2 = r2 + 4*len - 16: last four floats of the second source. */
153         sub             r2,  r2,  #8
154         sub             r5,  lr,  #2
155         add             r2,  r2,  r5, lsl #2
            /* r4 = r3 + 8*len - 16 and ip = r0 + 8*len - 16: the last four
             * floats of regions holding 2*len floats. */
156         add             r4,  r3,  r5, lsl #3
157         add             ip,  r0,  r5, lsl #3
158         mov             r5,  #-16
            /* Prime the pipeline with one group from each of the four streams. */
159         vld1.32         {d0,d1},  [r1,:128]!
160         vld1.32         {d2,d3},  [r2,:128], r5
161         vld1.32         {d4,d5},  [r3,:128]!
162         vld1.32         {d6,d7},  [r4,:128], r5
            /* The flags set by this subs are tested by the beq after the four
             * multiplies. */
163 1:      subs            lr,  lr,  #4
164         vmul.f32        d22, d0,  d4
            /* Swap the two lanes inside each d register of the backward data;
             * together with the crossed d-register pairing in the multiplies
             * this reverses the four-element group. */
165         vrev64.32       q3,  q3
166         vmul.f32        d23, d1,  d5
167         vrev64.32       q1,  q1
168         vmul.f32        d20, d0,  d7
169         vmul.f32        d21, d1,  d6
            /* Last group: finish without loading any further data. */
170         beq             2f
            /* Finish this group while loading the next one.  The next
             * second-source and forward-window groups go to q9/q12 because
             * q1/q2 are still being read here; they are moved into place
             * below. */
171         vmla.f32        d22, d3,  d7
172         vld1.32         {d0,d1},  [r1,:128]!
173         vmla.f32        d23, d2,  d6
174         vld1.32         {d18,d19},[r2,:128], r5
175         vmls.f32        d20, d3,  d4
176         vld1.32         {d24,d25},[r3,:128]!
177         vmls.f32        d21, d2,  d5
178         vld1.32         {d6,d7},  [r4,:128], r5
179         vmov            q1,  q9
            /* Lane swap plus d-register swap = full reversal of q11 for the
             * backward store. */
180         vrev64.32       q11, q11
181         vmov            q2,  q12
182         vswp            d22, d23
183         vst1.32         {d20,d21},[r0,:128]!
184         vst1.32         {d22,d23},[ip,:128], r5
185         b               1b
            /* Final group: same arithmetic as above, without the loads. */
186 2:      vmla.f32        d22, d3,  d7
187         vmla.f32        d23, d2,  d6
188         vmls.f32        d20, d3,  d4
189         vmls.f32        d21, d2,  d5
190         vrev64.32       q11, q11
191         vswp            d22, d23
192         vst1.32         {d20,d21},[r0,:128]!
193         vst1.32         {d22,d23},[ip,:128], r5
            /* Restore r4/r5 and return by popping the saved lr into pc. */
194         pop             {r4,r5,pc}
195 endfunc
196
/*
 * ff_vector_fmul_add_neon: element-wise multiply-add of float vectors
 * (destination = first source * second source + third source).
 *
 * Register use, as read from the code below:
 *   r0  = destination pointer (stores, post-incremented)
 *   r1  = first source pointer
 *   r2  = second source pointer
 *   r3  = third source pointer (the addend)
 *   r12 = element count, loaded from the top of the stack on entry
 * All loads and stores carry a :128 alignment qualifier.
 *
 * NOTE(review): 8 elements are loaded before the count is looked at and
 * the loop leaves only when the count reaches exactly zero, so the count
 * is presumably a non-zero multiple of 8 -- confirm against the C-side
 * documentation.
 */
197 function ff_vector_fmul_add_neon, export=1
            /* Nothing has been pushed, so [sp] is the first stack argument. */
198         ldr             r12, [sp]
            /* Prime the pipeline: q10,q11 = products of the first 8 elements,
             * q2,q3 = the matching addends. */
199         vld1.32         {q0-q1},  [r1,:128]!
200         vld1.32         {q8-q9},  [r2,:128]!
201         vld1.32         {q2-q3},  [r3,:128]!
202         vmul.f32        q10, q0,  q8
203         vmul.f32        q11, q1,  q9
            /* q12,q13 = finished results for the current group of 8. */
204 1:      vadd.f32        q12, q2,  q10
205         vadd.f32        q13, q3,  q11
            /* Prefetch hints for the three input streams. */
206         pld             [r1, #16]
207         pld             [r2, #16]
208         pld             [r3, #16]
209         subs            r12, r12, #8
            /* Last group: store it without loading any further data. */
210         beq             2f
            /* Load and multiply the next group, then store the finished one. */
211         vld1.32         {q0},     [r1,:128]!
212         vld1.32         {q8},     [r2,:128]!
213         vmul.f32        q10, q0,  q8
214         vld1.32         {q1},     [r1,:128]!
215         vld1.32         {q9},     [r2,:128]!
216         vmul.f32        q11, q1,  q9
217         vld1.32         {q2-q3},  [r3,:128]!
218         vst1.32         {q12-q13},[r0,:128]!
219         b               1b
220 2:      vst1.32         {q12-q13},[r0,:128]!
221         bx              lr
222 endfunc
223
/*
 * ff_vector_fmul_reverse_neon: multiply one float vector by another that
 * is read back to front.
 *
 * Register use, as read from the code below:
 *   r0  = destination pointer (stores, post-incremented)
 *   r1  = first source pointer, read forwards
 *   r2  = second source pointer, moved to its last 8 floats and then
 *         stepped backwards
 *   r3  = element count
 *   r12 = -32, the post-index step for r2
 * All loads and stores carry a :128 alignment qualifier.
 *
 * NOTE(review): 8 elements are loaded before the count is looked at and
 * the loop leaves only when the count reaches exactly zero, so the count
 * is presumably a non-zero multiple of 8 -- confirm against the C-side
 * documentation.
 */
224 function ff_vector_fmul_reverse_neon, export=1
            /* r2 = r2 + 4*len - 32: the last 8 floats of the second source. */
225         add             r2,  r2,  r3,  lsl #2
226         sub             r2,  r2,  #32
227         mov             r12, #-32
            /* Prime the pipeline with the first group of 8 from each source. */
228         vld1.32         {q0-q1},  [r1,:128]!
229         vld1.32         {q2-q3},  [r2,:128], r12
            /* Prefetch hint for the forward stream. */
230 1:      pld             [r1, #32]
            /* Swapping the lanes inside each d register and pairing d0..d3
             * with d7..d4 multiplies the forward group by the backward group
             * in reversed element order. */
231         vrev64.32       q3,  q3
232         vmul.f32        d16, d0,  d7
233         vmul.f32        d17, d1,  d6
            /* Prefetch hint for the backward stream. */
234         pld             [r2, #-32]
235         vrev64.32       q2,  q2
236         vmul.f32        d18, d2,  d5
237         vmul.f32        d19, d3,  d4
238         subs            r3,  r3,  #8
            /* Last group: store it without loading any further data. */
239         beq             2f
            /* Load the next group from both sources, then store the finished
             * products. */
240         vld1.32         {q0-q1},  [r1,:128]!
241         vld1.32         {q2-q3},  [r2,:128], r12
242         vst1.32         {q8-q9},  [r0,:128]!
243         b               1b
244 2:      vst1.32         {q8-q9},  [r0,:128]!
245         bx              lr
246 endfunc
247
/*
 * ff_butterflies_float_neon: in-place butterfly of two float vectors.
 *
 * For every group of four floats the buffer at r0 receives the sum of
 * the two inputs and the buffer at r1 receives (r0 input - r1 input).
 *   r0 = first buffer  (read, then overwritten with the sums)
 *   r1 = second buffer (read, then overwritten with the differences)
 *   r2 = element count; the loop runs while it stays above zero after
 *        each step of 4, so at least one group is always processed
 * Both buffers are accessed with a :128 alignment qualifier.
 */
248 function ff_butterflies_float_neon, export=1
            /* Fetch both inputs (no write-back: the same addresses are
             * stored to below). */
249 0:      vld1.32         {d2-d3},  [r1,:128]
250         vld1.32         {d0-d1},  [r0,:128]
            /* Count down early; the NEON instructions that follow leave the
             * flags alone for the bgt at the bottom. */
251         subs            r2,  r2,  #4
252         vsub.f32        q2,  q0,  q1
253         vadd.f32        q1,  q0,  q1
            /* Difference goes back through r1 first, then the sum through r0. */
254         vst1.32         {d4-d5},  [r1,:128]!
255         vst1.32         {d2-d3},  [r0,:128]!
256         bgt             0b
257         bx              lr
258 endfunc
259
/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *
 *   r0 = first vector  (read, post-incremented)
 *   r1 = second vector (read, post-incremented)
 *   r2 = element count; the loop runs while it stays above zero after
 *        each step of 4, so at least one group is always processed
 * Four partial sums are kept in q2 and folded into lane 0 of d0 at the
 * end.  The NOVFP-prefixed line (prefix defined outside this section,
 * in asm.S) additionally copies that lane to r0.
 * Both vectors are accessed with a :128 alignment qualifier.
 */
260 function ff_scalarproduct_float_neon, export=1
            /* Clear the four running sums. */
261         vmov.f32        q2,  #0.0
262 0:      vld1.32         {d2-d3},  [r1,:128]!
263         vld1.32         {d0-d1},  [r0,:128]!
            /* Count down early; vmla leaves the flags alone for the bgt. */
264         subs            r2,  r2,  #4
265         vmla.f32        q2,  q0,  q1
266         bgt             0b
            /* Horizontal reduction: add the two halves of q2, then add the
             * two lanes of the result pairwise. */
267         vadd.f32        d0,  d4,  d5
268         vpadd.f32       d0,  d0,  d0
269 NOVFP   vmov.32         r0,  d0[0]
270         bx              lr
271 endfunc