4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
8 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #include "libavutil/aarch64/asm.S"
// Fixed-point cosine constants. Per the formula on each line, Zi is
// cos(i*pi/16) * sqrt(2) * 2^14 rounded to nearest, where i is the digit in the
// name (for example Z1 = 22725 and Z2 = 21407).
// NOTE(review): for i = 4 the formula evaluates to exactly 16384, but Z4 is
// 16383 -- presumably deliberate; confirm against the reference C IDCT before
// touching it.
29 #define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
30 #define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
31 #define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
32 #define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
33 #define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35 #define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Z4c = 2^(COL_SHIFT-1) / Z4.
// NOTE(review): presumably the column-pass rounding bias pre-divided by Z4 so
// that it can be added before the multiply by z4; COL_SHIFT is not defined in
// the lines visible here -- confirm its value and definition.
36 #define Z4c ((1<<(COL_SHIFT-1))/Z4)
// Coefficient table: eight 16-bit values in the order Z1..Z7, Z4c (16 bytes).
// idct_start takes the address of this table in x3.
// NOTE(review): const is not defined in the lines visible here (presumably a
// macro from the included asm.S); the exact meaning of align=4 depends on it.
49 const idct_coeff_neon, align=4
50 .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
// idct_start data
// Common entry sequence for the exported IDCT functions.
//   data : general-purpose register holding the address to prefetch
// Clobbers (visible lines only): x3.
// NOTE(review): only part of this macro body is visible in this view (no .endm
// is shown); any further setup it performs is not documented here.
53 .macro idct_start data
// Prefetch-for-load hint into L1 for the memory at [data].
54 prfm pldl1keep, [\data]
// movrel is not defined in the visible lines; presumably it materialises the
// address of idct_coeff_neon in x3 -- confirm against asm.S.
56 movrel x3, idct_coeff_neon
// idct_col4_top y1, y2, y3, y4, i, l
// Builds the partial sums contributed by inputs y2, y3 and y4, four 32-bit
// lanes at a time.
//   y1..y4 : vector register names; y1 is not referenced by this macro
//   i      : suffix appended to smull/smlal/smlsl (call sites in this file pass
//            1 or 2). With i = 2 these are the smull2/smlal2/smlsl2 upper-half
//            forms. With i = 1 they become smull1/smlal1/smlsl1, which are not
//            A64 instructions and must be provided as macros elsewhere (not
//            visible in this view).
//   l      : arrangement suffix for the inputs (.4H or .8H at the call sites)
// In : v23.4S = common term, prepared by the caller
// Out: v19 = v23 + y3*z2    v20 = v23 + y3*z6
//      v21 = v23 - y3*z6    v22 = v23 - y3*z2
//      v17 = y2*z1 + y4*z3  v18 = y2*z3 - y4*z7
//      v5  = y2*z5 - y4*z1  v6  = y2*z7 - y4*z5
// Clobbers: v7, v16 (hold y3*z2 and y3*z6).
// NOTE(review): z1..z7 are not defined in the lines visible here; presumably
// they name lanes of the register holding idct_coeff_neon -- confirm.
76 .macro idct_col4_top y1, y2, y3, y4, i, l
// Widening multiplies are interleaved with the adds/subs that consume earlier
// products; keep the order as is.
77 smull\i v7.4S, \y3\l, z2
78 smull\i v16.4S, \y3\l, z6
79 smull\i v17.4S, \y2\l, z1
80 add v19.4S, v23.4S, v7.4S
81 smull\i v18.4S, \y2\l, z3
82 add v20.4S, v23.4S, v16.4S
83 smull\i v5.4S, \y2\l, z5
84 sub v21.4S, v23.4S, v16.4S
85 smull\i v6.4S, \y2\l, z7
86 sub v22.4S, v23.4S, v7.4S
// Fold the y4 products into the four y2-based accumulators.
88 smlal\i v17.4S, \y4\l, z3
89 smlsl\i v18.4S, \y4\l, z7
90 smlsl\i v5.4S, \y4\l, z1
91 smlsl\i v6.4S, \y4\l, z5
// idct_row4_neon y1, y2, y3, y4, pass
// First (row) pass over 64 bytes of coefficients read from [x2].
//   y1..y4 : vector registers that receive the input and, on exit, the
//            shifted and transposed results
//   pass   : text used as the name of the local label inside the macro (call
//            sites pass 1 and 2)
// In : x2 = source pointer; advanced by 64 bytes by the two post-indexed loads
// Out: y1..y4 as eight 16-bit lanes each
// Clobbers (visible lines only): v5, v6, v7, v16-v23.
// NOTE(review): this view does not show the whole macro body (no .endm, and no
// compare or branch to the pass label); ROW_SHIFT and z1..z7 are not defined
// in the visible lines either.
94 .macro idct_row4_neon y1, y2, y3, y4, pass
95 ld1 {\y1\().2D,\y2\().2D}, [x2], #32
// Every 32-bit lane of v23 = 4 << 8 = 1024 (1 << 10). Presumably the rounding
// bias for the later shift by ROW_SHIFT -- confirm ROW_SHIFT.
96 movi v23.4S, #1<<2, lsl #8
// v5 = y1 | y2 | y3 | y4. The consumer of this OR is not visible here, and v5
// is overwritten by idct_col4_top below; presumably its upper half is tested to
// skip the upper-half work when it is all zero -- confirm.
97 orr v5.16B, \y1\().16B, \y2\().16B
98 ld1 {\y3\().2D,\y4\().2D}, [x2], #32
99 orr v6.16B, \y3\().16B, \y4\().16B
100 orr v5.16B, v5.16B, v6.16B
// v23 = bias + low(y1) * z4: the common term expected by idct_col4_top.
102 smlal v23.4S, \y1\().4H, z4
// Contributions of the low four lanes of y2, y3, y4.
104 idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
// Contributions of the high four lanes of y1..y4 (the "2" instruction forms):
//   v7 = hi(y1)*z4 is applied as v19 += v7, v20 -= v7, v21 -= v7, v22 += v7
//   hi(y2) is accumulated with z5 (v17 +), z1 (v18 -), z7 (v5 +), z3 (v6 +)
//   hi(y3)*z6 (in v7) is applied as v19 +, v22 -; hi(y3)*z2 (in v16) as v20 -, v21 +
//   hi(y4) is accumulated with z7 (v17 +), z5 (v18 -), z3 (v5 +), z1 (v6 -)
109 smull2 v7.4S, \y1\().8H, z4
110 smlal2 v17.4S, \y2\().8H, z5
111 smlsl2 v18.4S, \y2\().8H, z1
112 smull2 v16.4S, \y3\().8H, z2
113 smlal2 v5.4S, \y2\().8H, z7
114 add v19.4S, v19.4S, v7.4S
115 sub v20.4S, v20.4S, v7.4S
116 sub v21.4S, v21.4S, v7.4S
117 add v22.4S, v22.4S, v7.4S
118 smlal2 v6.4S, \y2\().8H, z3
119 smull2 v7.4S, \y3\().8H, z6
120 smlal2 v17.4S, \y4\().8H, z7
121 smlsl2 v18.4S, \y4\().8H, z5
122 smlal2 v5.4S, \y4\().8H, z3
123 smlsl2 v6.4S, \y4\().8H, z1
124 add v19.4S, v19.4S, v7.4S
125 sub v20.4S, v20.4S, v16.4S
126 add v21.4S, v21.4S, v16.4S
127 sub v22.4S, v22.4S, v7.4S
// Local label named by the pass argument. Below it, sums and differences of the
// two accumulator groups are narrowed to 16 bits with a truncating shift right
// by ROW_SHIFT:
//   y1 = [ v19+v17 | v22-v6 ]   y2 = [ v20+v18 | v21-v5 ]
//   y3 = [ v21+v5  | v20-v18 ]  y4 = [ v22+v6  | v19-v17 ]
// y3 and y4 are used as 32-bit temporaries before being overwritten.
129 \pass: add \y3\().4S, v19.4S, v17.4S
130 add \y4\().4S, v20.4S, v18.4S
131 shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
132 shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
133 add v7.4S, v21.4S, v5.4S
134 add v16.4S, v22.4S, v6.4S
135 shrn \y3\().4H, v7.4S, #ROW_SHIFT
136 shrn \y4\().4H, v16.4S, #ROW_SHIFT
137 sub v22.4S, v22.4S, v6.4S
138 sub v19.4S, v19.4S, v17.4S
139 sub v21.4S, v21.4S, v5.4S
140 shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
141 sub v20.4S, v20.4S, v18.4S
142 shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
143 shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
144 shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
// Transpose the 4x4 matrix of 16-bit lanes held in each 64-bit half of y1..y4:
// first pairs of 16-bit lanes (trn on .8H), then pairs of 32-bit lanes (trn on
// .4S). Afterwards yN holds lane N-1 (low half) and lane N+3 (high half) of the
// four pre-transpose registers.
146 trn1 v16.8H, \y1\().8H, \y2\().8H
147 trn2 v17.8H, \y1\().8H, \y2\().8H
148 trn1 v18.8H, \y3\().8H, \y4\().8H
149 trn2 v19.8H, \y3\().8H, \y4\().8H
150 trn1 \y1\().4S, v16.4S, v18.4S
151 trn1 \y2\().4S, v17.4S, v19.4S
152 trn2 \y3\().4S, v16.4S, v18.4S
153 trn2 \y4\().4S, v17.4S, v19.4S
// declare_idct_col4_neon i, l
// Emits the internal function idct_col4_neon<i>: the second (column) pass over
// one group of four 16-bit lanes of v24..v31.
//   i : 1 or 2 at the instantiations; selects the 64-bit half (D[i-1]) and the
//       plain or "2" form of the widening multiplies
//   l : arrangement suffix matching i (.4H for 1, .8H for 2)
// In : v24..v31 = results of the two row passes; v23.4H = bias term
// Out: v7  = [ hi16(v19+v17) | hi16(v20+v18) ]
//      v16 = [ hi16(v21+v5)  | hi16(v22+v6)  ]
//      v17 = [ hi16(v22-v6)  | hi16(v21-v5)  ]
//      v18 = [ hi16(v20-v18) | hi16(v19-v17) ]
//      hi16 keeps the upper 16 bits of each 32-bit lane (a shift right by 16);
//      the callers apply the remaining shift of COL_SHIFT-16.
// Clobbers (visible lines only): x4, x5, v5, v6, v19-v23.
// NOTE(review): this view does not show the whole body. The initialisation of
// v23, the compares and branches that use x4/x5 and labels 1, 2, 4, label 3,
// the return, and the closing endfunc/.endm are all not visible.
156 .macro declare_idct_col4_neon i, l
157 function idct_col4_neon\i
// NOTE(review): the next three instructions look like two alternatives (add the
// low half of v24 directly, or move the high half of v24 into v5 and add that);
// the directives selecting one of them by i are not visible -- confirm.
160 add v23.4H, v23.4H, v24.4H
162 mov v5.D[0], v24.D[1]
163 add v23.4H, v23.4H, v5.4H
// v23.4S = (bias + selected half of v24) * z4: common term for idct_col4_top.
165 smull v23.4S, v23.4H, z4
167 idct_col4_top v24, v25, v26, v27, \i, \l
// Copy the selected 64-bit half of v28 and v29 to x4/x5. Presumably these are
// tested for zero so that the matching block below can be skipped -- confirm.
169 mov x4, v28.D[\i - 1]
170 mov x5, v29.D[\i - 1]
// v28 contribution: v7 = v28*z4, applied as v19 +, v20 -, v21 -, v22 +.
174 smull\i v7.4S, v28\l, z4
175 add v19.4S, v19.4S, v7.4S
176 sub v20.4S, v20.4S, v7.4S
177 sub v21.4S, v21.4S, v7.4S
178 add v22.4S, v22.4S, v7.4S
180 1: mov x4, v30.D[\i - 1]
// v29 contribution: z5 into v17 (+), z1 into v18 (-), z7 into v5 (+), z3 into v6 (+).
184 smlal\i v17.4S, v29\l, z5
185 smlsl\i v18.4S, v29\l, z1
186 smlal\i v5.4S, v29\l, z7
187 smlal\i v6.4S, v29\l, z3
189 2: mov x5, v31.D[\i - 1]
// v30 contribution: v30*z6 (v7) applied as v19 +, v22 -; v30*z2 (v16) as v20 -, v21 +.
193 smull\i v7.4S, v30\l, z6
194 smull\i v16.4S, v30\l, z2
195 add v19.4S, v19.4S, v7.4S
196 sub v22.4S, v22.4S, v7.4S
197 sub v20.4S, v20.4S, v16.4S
198 add v21.4S, v21.4S, v16.4S
// v31 contribution: z7 into v17 (+), z5 into v18 (-), z3 into v5 (+), z1 into v6 (-).
203 smlal\i v17.4S, v31\l, z7
204 smlsl\i v18.4S, v31\l, z5
205 smlal\i v5.4S, v31\l, z3
206 smlsl\i v6.4S, v31\l, z1
// Combine the two accumulator groups and keep the high 16 bits of each sum or
// difference. The order matters: v18 and v17 are read as sources before the
// instructions that overwrite them.
208 4: addhn v7.4H, v19.4S, v17.4S
209 addhn2 v7.8H, v20.4S, v18.4S
210 subhn v18.4H, v20.4S, v18.4S
211 subhn2 v18.8H, v19.4S, v17.4S
213 addhn v16.4H, v21.4S, v5.4S
214 addhn2 v16.8H, v22.4S, v6.4S
215 subhn v17.4H, v22.4S, v6.4S
216 subhn2 v17.8H, v21.4S, v5.4S
// Instantiate the column pass twice: idct_col4_neon1 works on the low four
// 16-bit lanes (.4H) and idct_col4_neon2 on the high four (.8H, "2" forms).
222 declare_idct_col4_neon 1, .4H
223 declare_idct_col4_neon 2, .8H
// ff_simple_idct_put_neon (exported)
// Runs the IDCT on the 128 bytes of coefficients at [x2] and stores the result
// as eight rows of eight unsigned bytes.
// In : x0 = destination pointer, advanced by x1 after every 8-byte row store
//      x1 = distance in bytes between destination rows
//      x2 = coefficient pointer (consumed by idct_row4_neon)
//      NOTE(review): presumably (uint8_t *dest, ptrdiff_t stride, int16_t *block)
//      on the C side -- confirm against the declaration.
// Visible code writes only v1-v7 and v16-v31, which AAPCS64 treats as
// caller-saved.
// NOTE(review): this view does not show the whole body. The entry sequence, the
// calls that produce v7/v16/v17/v18 before each group of four narrowing shifts
// (presumably idct_col4_neon1, then idct_col4_neon2) and the return are not
// visible.
225 function ff_simple_idct_put_neon, export=1
// Row pass: first 64 bytes into v24..v27, next 64 bytes into v28..v31.
228 idct_row4_neon v24, v25, v26, v27, 1
229 idct_row4_neon v28, v29, v30, v31, 2
// Apply the remaining COL_SHIFT-16 shift and saturate to unsigned 8 bit.
// First set of column results goes to v1 and v3.
232 sqshrun v1.8B, v7.8H, #COL_SHIFT-16
233 sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
234 sqshrun v3.8B, v17.8H, #COL_SHIFT-16
235 sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
// Second set of column results goes to v2 and v4.
239 sqshrun v2.8B, v7.8H, #COL_SHIFT-16
240 sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
241 sqshrun v4.8B, v17.8H, #COL_SHIFT-16
242 sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
// Interleave 32-bit groups so that each 64-bit half of v16..v19 pairs four
// bytes from the first set (v1/v3) with the matching four bytes from the second
// set (v2/v4), i.e. one complete 8-byte output row.
244 zip1 v16.4S, v1.4S, v2.4S
245 zip2 v17.4S, v1.4S, v2.4S
247 st1 {v16.D}[0], [x0], x1
248 st1 {v16.D}[1], [x0], x1
250 zip1 v18.4S, v3.4S, v4.4S
251 zip2 v19.4S, v3.4S, v4.4S
253 st1 {v17.D}[0], [x0], x1
254 st1 {v17.D}[1], [x0], x1
255 st1 {v18.D}[0], [x0], x1
256 st1 {v18.D}[1], [x0], x1
257 st1 {v19.D}[0], [x0], x1
258 st1 {v19.D}[1], [x0], x1
// ff_simple_idct_add_neon (exported)
// Runs the IDCT on the 128 bytes of coefficients at [x2], adds the result to
// eight rows of eight unsigned bytes read from [x0], saturates to unsigned
// 8 bit and stores the rows through x9.
// In : x0 = pointer to the pixel rows, advanced by x1 after every row load
//      x1 = distance in bytes between rows
//      x2 = coefficient pointer (consumed by idct_row4_neon)
// Visible code writes only v1-v7 and v16-v31, which AAPCS64 treats as
// caller-saved.
// NOTE(review): this view does not show the whole body. The entry sequence, the
// column-pass calls before each group of four sshr, the return, and the
// initialisation of x9 are not visible. x9 is presumably a copy of x0 taken
// before the loads advance it -- confirm.
263 function ff_simple_idct_add_neon, export=1
// Row pass: first 64 bytes into v24..v27, next 64 bytes into v28..v31.
266 idct_row4_neon v24, v25, v26, v27, 1
267 idct_row4_neon v28, v29, v30, v31, 2
// Remaining COL_SHIFT-16 arithmetic shift; first set of column results is
// copied out to v1..v4.
270 sshr v1.8H, v7.8H, #COL_SHIFT-16
271 sshr v2.8H, v16.8H, #COL_SHIFT-16
272 sshr v3.8H, v17.8H, #COL_SHIFT-16
273 sshr v4.8H, v18.8H, #COL_SHIFT-16
// Second set of column results is shifted in place.
277 sshr v7.8H, v7.8H, #COL_SHIFT-16
278 sshr v16.8H, v16.8H, #COL_SHIFT-16
279 sshr v17.8H, v17.8H, #COL_SHIFT-16
280 sshr v18.8H, v18.8H, #COL_SHIFT-16
// From here the row loads (two 8-byte rows per register in v19..v22) are
// interleaved with the arithmetic. zip1/zip2 on .2D join one 64-bit half from
// the first set with the matching half from the second set, giving one row of
// eight 16-bit values per register in v23..v30.
283 ld1 {v19.D}[0], [x0], x1
284 zip1 v23.2D, v1.2D, v7.2D
285 zip2 v24.2D, v1.2D, v7.2D
286 ld1 {v19.D}[1], [x0], x1
287 zip1 v25.2D, v2.2D, v16.2D
288 zip2 v26.2D, v2.2D, v16.2D
289 ld1 {v20.D}[0], [x0], x1
290 zip1 v27.2D, v3.2D, v17.2D
291 zip2 v28.2D, v3.2D, v17.2D
292 ld1 {v20.D}[1], [x0], x1
293 zip1 v29.2D, v4.2D, v18.2D
294 zip2 v30.2D, v4.2D, v18.2D
295 ld1 {v21.D}[0], [x0], x1
// uaddw/uaddw2 zero-extend the loaded bytes and add them to the 16-bit rows;
// sqxtun/sqxtun2 saturate the sums back to unsigned bytes, two rows per
// register. v24..v27 are reused as destinations once their old contents have
// been consumed, so the instruction order matters.
296 uaddw v23.8H, v23.8H, v19.8B
297 uaddw2 v24.8H, v24.8H, v19.16B
298 ld1 {v21.D}[1], [x0], x1
299 sqxtun v23.8B, v23.8H
300 sqxtun2 v23.16B, v24.8H
301 ld1 {v22.D}[0], [x0], x1
302 uaddw v24.8H, v25.8H, v20.8B
303 uaddw2 v25.8H, v26.8H, v20.16B
304 ld1 {v22.D}[1], [x0], x1
305 sqxtun v24.8B, v24.8H
306 sqxtun2 v24.16B, v25.8H
// Stores go through x9 with the same row distance x1.
307 st1 {v23.D}[0], [x9], x1
308 uaddw v25.8H, v27.8H, v21.8B
309 uaddw2 v26.8H, v28.8H, v21.16B
310 st1 {v23.D}[1], [x9], x1
311 sqxtun v25.8B, v25.8H
312 sqxtun2 v25.16B, v26.8H
313 st1 {v24.D}[0], [x9], x1
314 uaddw v26.8H, v29.8H, v22.8B
315 uaddw2 v27.8H, v30.8H, v22.16B
316 st1 {v24.D}[1], [x9], x1
317 sqxtun v26.8B, v26.8H
318 sqxtun2 v26.16B, v27.8H
319 st1 {v25.D}[0], [x9], x1
320 st1 {v25.D}[1], [x9], x1
321 st1 {v26.D}[0], [x9], x1
322 st1 {v26.D}[1], [x9], x1
// ff_simple_idct_neon (exported)
// Runs the IDCT on 128 bytes of 16-bit coefficients and stores the 16-bit
// results as eight rows of eight values (four 32-byte stores through x2).
// Visible code writes only v1-v7 and v16-v31, which AAPCS64 treats as
// caller-saved.
// NOTE(review): this view does not show the whole body. The entry sequence, the
// setup of x2, the column-pass calls before each group of four sshr, and the
// return are not visible. idct_row4_neon advances x2 by 128 bytes in total and
// the stores below also go through x2, so x2 is presumably rewound to the start
// of the block in a line not shown -- confirm.
327 function ff_simple_idct_neon, export=1
// Row pass: first 64 bytes into v24..v27, next 64 bytes into v28..v31.
331 idct_row4_neon v24, v25, v26, v27, 1
332 idct_row4_neon v28, v29, v30, v31, 2
// Remaining COL_SHIFT-16 arithmetic shift; first set of column results is
// copied out to v1..v4.
336 sshr v1.8H, v7.8H, #COL_SHIFT-16
337 sshr v2.8H, v16.8H, #COL_SHIFT-16
338 sshr v3.8H, v17.8H, #COL_SHIFT-16
339 sshr v4.8H, v18.8H, #COL_SHIFT-16
// Second set of column results is shifted in place.
343 sshr v7.8H, v7.8H, #COL_SHIFT-16
344 sshr v16.8H, v16.8H, #COL_SHIFT-16
345 sshr v17.8H, v17.8H, #COL_SHIFT-16
346 sshr v18.8H, v18.8H, #COL_SHIFT-16
// zip1/zip2 on .2D join one 64-bit half from the first set with the matching
// half from the second set, giving one row of eight 16-bit values per register;
// each st1 writes two rows and advances x2 by 32 bytes.
348 zip1 v23.2D, v1.2D, v7.2D
349 zip2 v24.2D, v1.2D, v7.2D
350 st1 {v23.2D,v24.2D}, [x2], #32
351 zip1 v25.2D, v2.2D, v16.2D
352 zip2 v26.2D, v2.2D, v16.2D
353 st1 {v25.2D,v26.2D}, [x2], #32
354 zip1 v27.2D, v3.2D, v17.2D
355 zip2 v28.2D, v3.2D, v17.2D
356 st1 {v27.2D,v28.2D}, [x2], #32
357 zip1 v29.2D, v4.2D, v18.2D
358 zip2 v30.2D, v4.2D, v18.2D
359 st1 {v29.2D,v30.2D}, [x2], #32