/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// ff_ps_add_squares_neon: accumulate squared magnitudes of complex floats.
//
//   for each i:  dst[i] += src[i].re * src[i].re + src[i].im * src[i].im
//
// In:    x0 = dst  (float accumulator array, read-modify-write)
//        x1 = src  (interleaved {re, im} float pairs)
//        w2 = n    (element count; presumably the 3rd C argument --
//                   TODO confirm against the C prototype)
// Clobb: v0-v3, x0, x1, w2, flags
//
// Four elements are handled per iteration and the counter is tested after
// the body, so the body always runs at least once and n is effectively
// rounded up to a multiple of 4.
function ff_ps_add_squares_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32    // 4 complex inputs = 8 floats
        fmul        v0.4S, v0.4S, v0.4S         // {re0^2, im0^2, re1^2, im1^2}
        fmul        v1.4S, v1.4S, v1.4S         // {re2^2, im2^2, re3^2, im3^2}
        faddp       v2.4S, v0.4S, v1.4S         // pairwise add -> |src[0..3]|^2
        ld1         {v3.4S}, [x0]               // current accumulator values
        fadd        v3.4S, v3.4S, v2.4S
        st1         {v3.4S}, [x0], #16
        subs        w2, w2, #4
        b.gt        1b
        ret
endfunc

// ff_ps_mul_pair_single_neon: scale complex floats by real floats.
//
//   for each i:  dst[i].re = src0[i].re * src1[i]
//                dst[i].im = src0[i].im * src1[i]
//
// In:    x0 = dst   (interleaved {re, im} float pairs, written)
//        x1 = src0  (interleaved {re, im} float pairs)
//        x2 = src1  (real float factors)
//        w3 = n     (element count; presumably the 4th C argument --
//                    TODO confirm against the C prototype)
// Clobb: v0-v4, x0-x2, w3, flags
//
// Four elements are handled per iteration and the counter is tested after
// the body, so the body always runs at least once and n is effectively
// rounded up to a multiple of 4.
function ff_ps_mul_pair_single_neon, export=1
1:      ld1         {v0.4S,v1.4S}, [x1], #32    // 4 complex inputs
        ld1         {v2.4S},       [x2], #16    // 4 real factors {s0, s1, s2, s3}
        zip1        v3.4S, v2.4S, v2.4S         // {s0, s0, s1, s1}
        zip2        v4.4S, v2.4S, v2.4S         // {s2, s2, s3, s3}
        fmul        v0.4S, v0.4S, v3.4S
        fmul        v1.4S, v1.4S, v4.4S
        st1         {v0.4S,v1.4S}, [x0], #32
        subs        w3, w3, #4
        b.gt        1b
        ret
endfunc

// ff_ps_stereo_interpolate_neon: in-place 2x2 real mixing of a complex
// left/right pair with linearly stepped coefficients.
//
//   per sample:  h[k] += h_step[k]            (k = 0..3, stepped before use)
//                l' = h[0] * l + h[2] * r
//                r' = h[1] * l + h[3] * r
//
// In:    x0 = l       (complex floats, read and overwritten)
//        x1 = r       (complex floats, read and overwritten)
//        x2 = h       (4 floats read here)
//        x3 = h_step  (4 floats read here)
//        w4 = len     (sample count; presumably the 5th C argument --
//                      TODO confirm against the C prototype)
// Clobb: v0-v7, x0, x1, w4, flags
//
// The counter is tested after the body, so the body always runs at least once.
function ff_ps_stereo_interpolate_neon, export=1
        ld1         {v0.4S}, [x2]               // {h0, h1, h2, h3}
        ld1         {v1.4S}, [x3]               // {s0, s1, s2, s3}
        zip1        v4.4S, v0.4S, v0.4S         // {h0, h0, h1, h1}
        zip2        v5.4S, v0.4S, v0.4S         // {h2, h2, h3, h3}
        zip1        v6.4S, v1.4S, v1.4S         // {s0, s0, s1, s1}
        zip2        v7.4S, v1.4S, v1.4S         // {s2, s2, s3, s3}
1:      ld1         {v2.2S}, [x0]               // {l_re, l_im}
        ld1         {v3.2S}, [x1]               // {r_re, r_im}
        fadd        v4.4S, v4.4S, v6.4S         // step the coefficients
        fadd        v5.4S, v5.4S, v7.4S
        mov         v2.D[1], v2.D[0]            // {l_re, l_im, l_re, l_im}
        mov         v3.D[1], v3.D[0]            // {r_re, r_im, r_re, r_im}
        fmul        v2.4S, v2.4S, v4.4S
        fmla        v2.4S, v3.4S, v5.4S         // low half = l', high half = r'
        st1         {v2.D}[0], [x0], #8
        st1         {v2.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

// ff_ps_stereo_interpolate_ipdopd_neon: in-place 2x2 complex mixing of a
// complex left/right pair with linearly stepped coefficients.
//
// Coefficients are read as two rows of four floats: h0[k] (real parts) and
// h1[k] (imaginary parts), with matching step rows.  Per sample, every
// coefficient is stepped first, then:
//
//   l'.re = h0[0]*l.re + h0[2]*r.re - h1[0]*l.im - h1[2]*r.im
//   l'.im = h0[0]*l.im + h0[2]*r.im + h1[0]*l.re + h1[2]*r.re
//   r'.re = h0[1]*l.re + h0[3]*r.re - h1[1]*l.im - h1[3]*r.im
//   r'.im = h0[1]*l.im + h0[3]*r.im + h1[1]*l.re + h1[3]*r.re
//
// In:    x0 = l       (complex floats, read and overwritten)
//        x1 = r       (complex floats, read and overwritten)
//        x2 = h       (8 floats read here: row h0 then row h1)
//        x3 = h_step  (8 floats read here, same layout)
//        w4 = len     (sample count; presumably the 5th C argument --
//                      TODO confirm against the C prototype)
// Clobb: v0-v7, v16-v23, x0, x1, w4, flags
//
// The counter is tested after the body, so the body always runs at least once.
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1         {v0.4S,v1.4S}, [x2]         // v0 = h0[0..3], v1 = h1[0..3]
        ld1         {v6.4S,v7.4S}, [x3]         // v6 = step0[0..3], v7 = step1[0..3]
        fneg        v2.4S, v1.4S                // -h1, for the "re -= h1 * im" terms
        fneg        v3.4S, v7.4S                // -step1, stepped alongside -h1
        zip1        v16.4S, v0.4S, v0.4S        // {h00, h00, h01, h01}
        zip2        v17.4S, v0.4S, v0.4S        // {h02, h02, h03, h03}
        zip1        v18.4S, v2.4S, v1.4S        // {-h10, h10, -h11, h11}
        zip2        v19.4S, v2.4S, v1.4S        // {-h12, h12, -h13, h13}
        zip1        v20.4S, v6.4S, v6.4S        // steps laid out like v16
        zip2        v21.4S, v6.4S, v6.4S        // steps laid out like v17
        zip1        v22.4S, v3.4S, v7.4S        // steps laid out like v18
        zip2        v23.4S, v3.4S, v7.4S        // steps laid out like v19
1:      ld1         {v2.2S}, [x0]               // {l_re, l_im}
        ld1         {v3.2S}, [x1]               // {r_re, r_im}
        fadd        v16.4S, v16.4S, v20.4S      // step the real coefficients
        fadd        v17.4S, v17.4S, v21.4S
        mov         v2.D[1], v2.D[0]            // {l_re, l_im, l_re, l_im}
        mov         v3.D[1], v3.D[0]            // {r_re, r_im, r_re, r_im}
        fmul        v4.4S, v2.4S, v16.4S
        fmla        v4.4S, v3.4S, v17.4S
        fadd        v18.4S, v18.4S, v22.4S      // step the imaginary coefficients
        fadd        v19.4S, v19.4S, v23.4S
        ext         v2.16B, v2.16B, v2.16B, #4  // {l_im, l_re, l_im, l_re}
        ext         v3.16B, v3.16B, v3.16B, #4  // {r_im, r_re, r_im, r_re}
        fmla        v4.4S, v2.4S, v18.4S
        fmla        v4.4S, v3.4S, v19.4S        // low half = l', high half = r'
        st1         {v4.D}[0], [x0], #8
        st1         {v4.D}[1], [x1], #8
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc

// ff_ps_hybrid_analysis_neon: apply n complex filters of length 13 to one
// 13-sample complex input window, writing one complex output per filter.
//
// Only coefficients 0..6 of each filter row are read; the arithmetic below
// folds input k with input 12-k so the remaining taps are implied by a
// symmetry of the filter (re mirrored, im negated -- NOTE(review): confirm
// against the scalar reference implementation).
//
// In:    x0 = out     (complex float output, advanced by stride each row)
//        x1 = in      (13 complex floats, read once before the loop)
//        x2 = filter  (rows of 8 complex floats = 64 bytes each)
//        x3 = stride  (in complex elements; scaled to bytes below)
//        w4 = n       (row count; presumably the 5th C argument --
//                      TODO confirm against the C prototype)
// Clobb: v0-v7, v16-v24, x0-x3, w4, flags
//
// The counter is tested after the body, so the body always runs at least once.
function ff_ps_hybrid_analysis_neon, export=1
        lsl         x3, x3, #3                  // stride: complex elements -> bytes
        ld2         {v0.4S,v1.4S}, [x1], #32    // v0 = re0..3,   v1 = im0..3
        ld2         {v2.2S,v3.2S}, [x1], #16    // v2 = re4,re5   v3 = im4,im5
        ld1         {v24.2S},      [x1], #8     // v24 = {re6, im6} (centre tap)
        ld2         {v4.2S,v5.2S}, [x1], #16    // v4 = re7,re8   v5 = im7,im8
        ld2         {v6.4S,v7.4S}, [x1]         // v6 = re9..12,  v7 = im9..12
        rev64       v6.4S, v6.4S
        rev64       v7.4S, v7.4S
        ext         v6.16B, v6.16B, v6.16B, #8  // v6 = {re12, re11, re10, re9}
        ext         v7.16B, v7.16B, v7.16B, #8  // v7 = {im12, im11, im10, im9}
        rev64       v4.2S, v4.2S                // v4 = {re8, re7}
        rev64       v5.2S, v5.2S                // v5 = {im8, im7}
        mov         v2.D[1], v3.D[0]            // v2 = {re4, re5, im4, im5}
        mov         v4.D[1], v5.D[0]            // v4 = {re8, re7, im8, im7}
        mov         v5.D[1], v2.D[0]            // v5 = {im8, im7, re4, re5}
        mov         v3.D[1], v4.D[0]            // v3 = {im4, im5, re8, re7}
        fadd        v16.4S, v0.4S, v6.4S        // re[k] + re[12-k], k = 0..3
        fadd        v17.4S, v1.4S, v7.4S        // im[k] + im[12-k]
        fsub        v18.4S, v1.4S, v7.4S        // im[k] - im[12-k]
        fsub        v19.4S, v0.4S, v6.4S        // re[k] - re[12-k]
        fadd        v22.4S, v2.4S, v4.4S
        fsub        v23.4S, v5.4S, v3.4S
        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2         {v2.4S,v3.4S}, [x2], #32    // v2 = f_re0..3, v3 = f_im0..3
        ld2         {v4.2S,v5.2S}, [x2], #16    // v4 = f_re4,5   v5 = f_im4,5
        ld1         {v6.2S},       [x2], #8     // v6 = {f_re6, f_im6}
        add         x2, x2, #8                  // skip coefficient 7 -> next 64-byte row
        mov         v4.D[1], v5.D[0]            // v4 = {f_re4, f_re5, f_im4, f_im5}
        mov         v6.S[1], v6.S[0]            // v6 = {f_re6, f_re6}
        fmul        v6.2S, v6.2S, v24.2S        // centre tap: f_re6 * {re6, im6}
        fmul        v0.4S, v2.4S, v16.4S        // real-part partial sums
        fmul        v1.4S, v2.4S, v17.4S        // imaginary-part partial sums
        fmls        v0.4S, v3.4S, v18.4S
        fmla        v1.4S, v3.4S, v19.4S
        fmla        v0.4S, v4.4S, v20.4S
        fmla        v1.4S, v4.4S, v21.4S
        faddp       v0.4S, v0.4S, v1.4S         // horizontal reduction, step 1
        faddp       v0.4S, v0.4S, v0.4S         // step 2 -> {sum_re, sum_im, ...}
        fadd        v0.2S, v0.2S, v6.2S         // add the centre-tap contribution
        st1         {v0.2S}, [x0], x3
        subs        w4, w4, #1
        b.gt        1b
        ret
endfunc