2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
// ff_h264_idct_add_neon
// Judging by the name and the arithmetic below: a 4x4 inverse transform of the
// sixteen 16-bit coefficients at [x1], whose result is added to four 4-byte
// rows at [x0], consecutive rows being x2 bytes apart.
// NOTE(review): the embedded original line numbers skip several lines here
// (e.g. 28-30, 54, 57, 67-70, 75-79), so part of this routine is not visible.
// The comments describe only the instructions that are shown.
25 function ff_h264_idct_add_neon, export=1
// Local alias; the add16/add16intra/add8 routines take its address via movrel.
26 .L_ff_h264_idct_add_neon:
// Load all 16 coefficients, four halfwords per register (v0-v3).
27 ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
// First 1-D pass. The two st1 instructions interleaved below overwrite the
// 32-byte coefficient block with v30 and advance x1 by 32 in total.
// v30 is not initialised in the visible lines -- presumably zeroed just
// above; TODO confirm.
31 add v4.4H, v0.4H, v2.4H // v4 = v0 + v2
32 sshr v16.4H, v1.4H, #1 // v16 = v1 >> 1
33 st1 {v30.8H}, [x1], #16
34 sshr v17.4H, v3.4H, #1 // v17 = v3 >> 1
35 st1 {v30.8H}, [x1], #16
36 sub v5.4H, v0.4H, v2.4H // v5 = v0 - v2
37 sub v6.4H, v16.4H, v3.4H // v6 = (v1 >> 1) - v3
38 add v7.4H, v1.4H, v17.4H // v7 = v1 + (v3 >> 1)
39 add v0.4H, v4.4H, v7.4H // out0 = v4 + v7
40 add v1.4H, v5.4H, v6.4H // out1 = v5 + v6
41 sub v2.4H, v5.4H, v6.4H // out2 = v5 - v6
42 sub v3.4H, v4.4H, v7.4H // out3 = v4 - v7
// transpose_4x4H is a macro defined outside this view; by its name it
// transposes the 4x4 halfword matrix held in v0-v3, with v4-v7 as scratch.
44 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
// Second 1-D pass, interleaved with loading the four destination rows as
// 32-bit lanes: v18 = {row 0, row 1}, v19 = {row 3, row 2}.
46 add v4.4H, v0.4H, v2.4H // v4 = v0 + v2
47 ld1 {v18.S}[0], [x0], x2
48 sshr v16.4H, v3.4H, #1 // v16 = v3 >> 1
49 sshr v17.4H, v1.4H, #1 // v17 = v1 >> 1
50 ld1 {v18.S}[1], [x0], x2
51 sub v5.4H, v0.4H, v2.4H // v5 = v0 - v2
52 ld1 {v19.S}[1], [x0], x2
53 add v6.4H, v16.4H, v1.4H // v6 = (v3 >> 1) + v1
55 sub v7.4H, v17.4H, v3.4H // v7 = (v1 >> 1) - v3
56 ld1 {v19.S}[0], [x0], x2
58 sub x0, x0, x2, lsl #2 // rewind x0 by the four rows just read
// The 8H operations below read the upper halves of v4 and v6, which no
// visible instruction fills (original lines 54 and 57 are absent).
// Presumably v5 and v7 are merged into those halves there; TODO confirm.
59 add v0.8H, v4.8H, v6.8H
60 sub v1.8H, v4.8H, v6.8H
// Rounding arithmetic shift right by 6 on every halfword.
62 srshr v0.8H, v0.8H, #6
63 srshr v1.8H, v1.8H, #6
// Add the loaded bytes, zero-extended to halfwords, to the results.
65 uaddw v0.8H, v0.8H, v18.8B
66 uaddw v1.8H, v1.8H, v19.8B
// NOTE(review): no narrowing of v0/v1 back to bytes is visible before these
// stores (original lines 67-70 are absent) -- confirm against the full file.
// The lane order below mirrors the load order used for v18/v19 above.
71 st1 {v0.S}[0], [x0], x2
72 st1 {v0.S}[1], [x0], x2
73 st1 {v1.S}[1], [x0], x2
74 st1 {v1.S}[0], [x0], x2
// ff_h264_idct_dc_add_neon
// By its name, the DC-only counterpart of the 4x4 routine: the halfwords of v2
// (after a rounding shift right by 6) are added to four 4-byte rows at [x0],
// rows being x2 bytes apart.
// NOTE(review): original lines 82-85, 93-94 and 100-102 are absent from this
// view; v2 is never loaded and v3/v4 are never consumed in the lines shown.
80 function ff_h264_idct_dc_add_neon, export=1
// Local alias; other routines in this file take its address via movrel.
81 .L_ff_h264_idct_dc_add_neon:
86 srshr v2.8H, v2.8H, #6 // v2 = (v2 + 32) >> 6 per halfword
// Rows 0 and 1 go to v0 lanes 0/1, rows 2 and 3 to v1 lanes 0/1.
87 ld1 {v0.S}[0], [x0], x2
88 ld1 {v0.S}[1], [x0], x2
89 uaddw v3.8H, v2.8H, v0.8B // v3 = v2 + zero-extended bytes of rows 0-1
90 ld1 {v1.S}[0], [x0], x2
91 ld1 {v1.S}[1], [x0], x2
92 uaddw v4.8H, v2.8H, v1.8B // v4 = v2 + zero-extended bytes of rows 2-3
95 sub x0, x0, x2, lsl #2 // rewind x0 by the four rows just read
// As shown, v0/v1 still hold the bytes loaded above; presumably the missing
// lines 93-94 narrow v3/v4 back into v0/v1 first -- TODO confirm.
96 st1 {v0.S}[0], [x0], x2
97 st1 {v0.S}[1], [x0], x2
98 st1 {v1.S}[0], [x0], x2
99 st1 {v1.S}[1], [x0], x2
// ff_h264_idct_add16_neon
// NOTE(review): only a handful of this routine's lines are visible (the
// embedded original numbers run 103-122 with most of them absent); the loop,
// the flag-setting instructions and the call itself are not shown.
103 function ff_h264_idct_add16_neon, export=1
106 mov x5, x1 // block_offset
// x13 / x14 = addresses of the DC-only and full 4x4 routines (movrel is a
// macro defined outside this view; presumably a PC-relative address load).
111 movrel x13, .L_ff_h264_idct_dc_add_neon
112 movrel x14, .L_ff_h264_idct_add_neon
// w3 = byte at x4 + zero-extended w3 (annotated as nnzc[scan8[i]] on the
// equivalent line of ff_h264_idct_add8_neon).
116 ldrb w3, [x4, w3, uxtw]
// x15 = x13 (DC-only routine) when NE, otherwise x14 (full routine).
// The instruction that sets the flags is not in the visible lines.
122 csel x15, x13, x14, ne
// ff_h264_idct_add16intra_neon
// NOTE(review): only a handful of this routine's lines are visible (the
// embedded original numbers run 130-147 with most of them absent); the loop,
// the flag-setting instructions and the call itself are not shown.
130 function ff_h264_idct_add16intra_neon, export=1
133 mov x5, x1 // block_offset
// x13 / x14 = addresses of the DC-only and full 4x4 routines (movrel is a
// macro defined outside this view; presumably a PC-relative address load).
138 movrel x13, .L_ff_h264_idct_dc_add_neon
139 movrel x14, .L_ff_h264_idct_add_neon
// w3 = byte at x4 + zero-extended w3 (annotated as nnzc[scan8[i]] on the
// equivalent line of ff_h264_idct_add8_neon).
143 ldrb w3, [x4, w3, uxtw]
// x15 = x13 (DC-only routine) when EQ, otherwise x14 (full routine).
// The instruction that sets the flags is not in the visible lines.
147 csel x15, x13, x14, eq
// ff_h264_idct_add8_neon
// NOTE(review): several original lines (158-160, 167-170, 176, 179-183, 185+)
// are absent from this view, including the loop label, the flag-setting
// instructions and the call. x7, x10 and x11 are read below without a visible
// initialisation. w19 and x20 are written here; x19/x20 are callee-saved under
// AAPCS64 and their save/restore is not in the visible lines -- confirm.
157 function ff_h264_idct_add8_neon, export=1
// x0 points at a pair of pointers; both are loaded at once.
161 ldp x6, x15, [x0] // dest[0], dest[1]
// Skip the first 16 entries: 4-byte offsets (see the ldrsw with lsl #2 below)
// and 32-byte coefficient blocks (see the lsl #5 below).
162 add x5, x1, #16*4 // block_offset
163 add x9, x2, #16*32 // block
164 mov w19, w3 // stride
// x13 / x14 = addresses of the DC-only and full 4x4 routines (movrel is a
// macro defined outside this view).
165 movrel x13, .L_ff_h264_idct_dc_add_neon
166 movrel x14, .L_ff_h264_idct_add_neon
// Per-block work, indexed by x10.
171 ldrb w3, [x7, x10] // scan8[i]
172 ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
173 ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
174 add x0, x0, x6 // block_offset[i] + dst[j-1]
175 add x1, x9, x10, lsl #5 // block + i * 16
177 ldrsh w3, [x1] // block[i*16]
// x20 = x13 (DC-only routine) when EQ, otherwise x14 (full routine).
178 csel x20, x13, x14, eq
// x10 = x11 when EQ, otherwise unchanged; the flags come from a compare that
// is not visible here.
184 csel x10, x11, x10, eq // mov x10, #16
// idct8x8_cols \pass
// One 1-D pass of the 8x8 transform over v24-v31 (eight halfword lanes each),
// with v16-v19 used as temporaries.
// NOTE(review): original lines 194-196, 206-208, 215, 244, 253 and 262+ are
// absent from this view. What remains shows two alternative openings (197-205
// and 209-214) and two alternative endings (245-252 and 254-261) back to back,
// and uses the names va/vb, which are not defined in the visible lines.
// Presumably conditional directives keyed on \pass pick one alternative and
// bind va/vb to v18/v30 in one order or the other -- TODO confirm.
193 .macro idct8x8_cols pass
// Opening, alternative A: v30/v31 are loaded from [x1] here, and the 32 bytes
// just read are overwritten with v19 (still holding its pre-macro value at
// that point). Results: v16 = v24 + v28, v17 = v24 - v28,
// v18 = (v26 >> 1) - v30, v19 = (v30 >> 1) + v26.
197 sshr v18.8H, v26.8H, #1
198 add v16.8H, v24.8H, v28.8H
199 ld1 {v30.8H, v31.8H}, [x1]
200 st1 {v19.8H}, [x1], #16
201 st1 {v19.8H}, [x1], #16
202 sub v17.8H, v24.8H, v28.8H
203 sshr v19.8H, v30.8H, #1
204 sub v18.8H, v18.8H, v30.8H
205 add v19.8H, v19.8H, v26.8H
// Opening, alternative B: same arithmetic with the roles of v18 and v30
// swapped and no memory access. Results: v16 = v24 + v28, v17 = v24 - v28,
// v30 = (v26 >> 1) - v18, v19 = (v18 >> 1) + v26.
209 sshr v30.8H, v26.8H, #1
210 sshr v19.8H, v18.8H, #1
211 add v16.8H, v24.8H, v28.8H
212 sub v17.8H, v24.8H, v28.8H
213 sub v30.8H, v30.8H, v18.8H
214 add v19.8H, v19.8H, v26.8H
// Combine the four values above into v24, v26, v28 and vb.
216 add v26.8H, v17.8H, va.8H
217 sub v28.8H, v17.8H, va.8H
218 add v24.8H, v16.8H, v19.8H
219 sub vb.8H, v16.8H, v19.8H
// Terms built from v25, v27, v29, v31. After the twelve add/sub and four
// halving instructions below (inputs refer to the values on entry here):
//   v16 = v29 - v27 - v31 - (v31 >> 1)
//   v17 = v31 + v25 - v27 - (v27 >> 1)
//   va  = v31 - v25 + v29 + (v29 >> 1)
//   v19 = v29 + v27 + v25 + (v25 >> 1)
220 sub v16.8H, v29.8H, v27.8H
221 add v17.8H, v31.8H, v25.8H
222 sub va.8H, v31.8H, v25.8H
223 add v19.8H, v29.8H, v27.8H
224 sub v16.8H, v16.8H, v31.8H
225 sub v17.8H, v17.8H, v27.8H
226 add va.8H, va.8H, v29.8H
227 add v19.8H, v19.8H, v25.8H
228 sshr v25.8H, v25.8H, #1
229 sshr v27.8H, v27.8H, #1
230 sshr v29.8H, v29.8H, #1
231 sshr v31.8H, v31.8H, #1
232 sub v16.8H, v16.8H, v31.8H
233 sub v17.8H, v17.8H, v27.8H
234 add va.8H, va.8H, v29.8H
235 add v19.8H, v19.8H, v25.8H
// Quarter-scaled copies are taken first (v25/v27/v29/v31 serve as scratch),
// then cross-combined:
//   v19 -= v16 >> 2;  va = (v17 >> 2) - va;  v17 += va_old >> 2;  v16 += v19_old >> 2
236 sshr v25.8H, v16.8H, #2
237 sshr v27.8H, v17.8H, #2
238 sshr v29.8H, va.8H, #2
239 sshr v31.8H, v19.8H, #2
240 sub v19.8H, v19.8H, v25.8H
241 sub va.8H, v27.8H, va.8H
242 add v17.8H, v17.8H, v29.8H
243 add v16.8H, v16.8H, v31.8H
// Ending, alternative A: final add/sub pairs writing v24-v29 and v31, with
// v18 updated in place (v18 = v26 - v18) and v30 used as an input.
245 sub v31.8H, v24.8H, v19.8H
246 add v24.8H, v24.8H, v19.8H
247 add v25.8H, v26.8H, v18.8H
248 sub v18.8H, v26.8H, v18.8H
249 add v26.8H, v28.8H, v17.8H
250 add v27.8H, v30.8H, v16.8H
251 sub v29.8H, v28.8H, v17.8H
252 sub v28.8H, v30.8H, v16.8H
// Ending, alternative B: the same add/sub pairs with v18 and v30 exchanged
// (v30 updated in place, v18 used as an input).
254 sub v31.8H, v24.8H, v19.8H
255 add v24.8H, v24.8H, v19.8H
256 add v25.8H, v26.8H, v30.8H
257 sub v30.8H, v26.8H, v30.8H
258 add v26.8H, v28.8H, v17.8H
259 sub v29.8H, v28.8H, v17.8H
260 add v27.8H, v18.8H, v16.8H
261 sub v28.8H, v18.8H, v16.8H
// ff_h264_idct8_add_neon
// By its name, the 8x8 counterpart of the 4x4 routine: coefficients are read
// from [x1] and the result is combined with eight 8-byte rows at [x0], rows
// being x2 bytes apart.
// NOTE(review): original lines 269-270, 280-281, 283-285, 305, 307, 309, 312,
// 315, 318, 320-321 and 326+ are absent from this view. In the lines shown,
// v19 and x3 are used without being initialised, no idct8x8_cols invocation
// appears, and nothing narrows v24-v31 back to bytes.
267 function ff_h264_idct8_add_neon, export=1
// Local alias; ff_h264_idct8_add4_neon takes its address via movrel.
268 .L_ff_h264_idct8_add_neon:
// Load 3 x 32 bytes of coefficients into v24-v29, overwriting each 32-byte
// chunk with v19 right after it is read (presumably zero; TODO confirm).
// x1 ends up advanced by 96.
271 ld1 {v24.8H, v25.8H}, [x1]
272 st1 {v19.8H}, [x1], #16
273 st1 {v19.8H}, [x1], #16
274 ld1 {v26.8H, v27.8H}, [x1]
275 st1 {v19.8H}, [x1], #16
276 st1 {v19.8H}, [x1], #16
277 ld1 {v28.8H, v29.8H}, [x1]
278 st1 {v19.8H}, [x1], #16
279 st1 {v19.8H}, [x1], #16
// transpose_8x8H is a macro defined outside this view; by its name it
// transposes the 8x8 halfword matrix in the first eight registers listed,
// with v6/v7 as scratch. Note that the seventh register is v18, not v30.
282 transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
// Rounding shift right by 6 on each of v24-v31, interleaved with loading the
// eight 8-byte rows from [x0] into v0-v7.
286 srshr v24.8H, v24.8H, #6
287 ld1 {v0.8B}, [x0], x2
288 srshr v25.8H, v25.8H, #6
289 ld1 {v1.8B}, [x0], x2
290 srshr v26.8H, v26.8H, #6
291 ld1 {v2.8B}, [x0], x2
292 srshr v27.8H, v27.8H, #6
293 ld1 {v3.8B}, [x0], x2
294 srshr v28.8H, v28.8H, #6
295 ld1 {v4.8B}, [x0], x2
296 srshr v29.8H, v29.8H, #6
297 ld1 {v5.8B}, [x0], x2
298 srshr v30.8H, v30.8H, #6
299 ld1 {v6.8B}, [x0], x2
300 srshr v31.8H, v31.8H, #6
301 ld1 {v7.8B}, [x0], x2
// Add the loaded bytes, zero-extended to halfwords, to each row's result.
// The stores go through x3 rather than x0; x3 is not set in the visible lines
// (presumably a copy of the original x0; TODO confirm). As shown, v0-v7 still
// hold the loaded bytes; the missing lines presumably narrow v24-v31 into them.
302 uaddw v24.8H, v24.8H, v0.8B
303 uaddw v25.8H, v25.8H, v1.8B
304 uaddw v26.8H, v26.8H, v2.8B
306 uaddw v27.8H, v27.8H, v3.8B
308 uaddw v28.8H, v28.8H, v4.8B
310 st1 {v0.8B}, [x3], x2
311 uaddw v29.8H, v29.8H, v5.8B
313 st1 {v1.8B}, [x3], x2
314 uaddw v30.8H, v30.8H, v6.8B
316 st1 {v2.8B}, [x3], x2
317 uaddw v31.8H, v31.8H, v7.8B
319 st1 {v3.8B}, [x3], x2
322 st1 {v4.8B}, [x3], x2
323 st1 {v5.8B}, [x3], x2
324 st1 {v6.8B}, [x3], x2
325 st1 {v7.8B}, [x3], x2
// ff_h264_idct8_dc_add_neon
// By its name, the DC-only 8x8 variant: the halfwords of v31 (after a rounding
// shift right by 6) are added to eight 8-byte rows at [x0], rows being x2
// bytes apart.
// NOTE(review): original lines 333-336, 354-357, 360, 362, 364, 366 and 371+
// are absent from this view; v31 is never loaded and nothing narrows v24-v31
// back to bytes in the lines shown.
331 function ff_h264_idct8_dc_add_neon, export=1
// Local alias; ff_h264_idct8_add4_neon takes its address via movrel.
332 .L_ff_h264_idct8_dc_add_neon:
// Load the eight rows into v0-v7, interleaved with forming row sums in v24-v31.
337 ld1 {v0.8B}, [x0], x2
338 srshr v31.8H, v31.8H, #6 // v31 = (v31 + 32) >> 6 per halfword
339 ld1 {v1.8B}, [x0], x2
340 ld1 {v2.8B}, [x0], x2
341 uaddw v24.8H, v31.8H, v0.8B // row 0: v31 + zero-extended bytes
342 ld1 {v3.8B}, [x0], x2
343 uaddw v25.8H, v31.8H, v1.8B
344 ld1 {v4.8B}, [x0], x2
345 uaddw v26.8H, v31.8H, v2.8B
346 ld1 {v5.8B}, [x0], x2
347 uaddw v27.8H, v31.8H, v3.8B
348 ld1 {v6.8B}, [x0], x2
349 uaddw v28.8H, v31.8H, v4.8B
350 ld1 {v7.8B}, [x0], x2
351 uaddw v29.8H, v31.8H, v5.8B
352 uaddw v30.8H, v31.8H, v6.8B
// Row 7 comes last because this overwrites v31, the common addend.
353 uaddw v31.8H, v31.8H, v7.8B
358 sub x0, x0, x2, lsl #3 // rewind x0 by the eight rows just read
// As shown, v0-v7 still hold the bytes loaded above; presumably the missing
// lines narrow v24-v31 back into v0-v7 before each store -- TODO confirm.
359 st1 {v0.8B}, [x0], x2
361 st1 {v1.8B}, [x0], x2
363 st1 {v2.8B}, [x0], x2
365 st1 {v3.8B}, [x0], x2
367 st1 {v4.8B}, [x0], x2
368 st1 {v5.8B}, [x0], x2
369 st1 {v6.8B}, [x0], x2
370 st1 {v7.8B}, [x0], x2
// ff_h264_idct8_add4_neon
// NOTE(review): only a handful of this routine's lines are visible (the
// embedded original numbers run 374-392 with most of them absent); the loop,
// the flag-setting instructions and the call itself are not shown.
374 function ff_h264_idct8_add4_neon, export=1
// x13 / x14 = addresses of the DC-only and full 8x8 routines (movrel is a
// macro defined outside this view; presumably a PC-relative address load).
382 movrel x13, .L_ff_h264_idct8_dc_add_neon
383 movrel x14, .L_ff_h264_idct8_add_neon
// w9 = byte at x4 + zero-extended w9.
386 ldrb w9, [x4, w9, UXTW]
// x15 = x13 (DC-only routine) when NE, otherwise x14 (full routine).
// The instruction that sets the flags is not in the visible lines.
392 csel x15, x13, x14, ne
// Byte table of 48 indices, each written as column + row*8, i.e. a position
// in a grid that is 8 entries wide. The entries come in three groups of 16
// (rows 1-4, 6-9 and 11-14), and each group of four lists a 2x2 patch.
// NOTE(review): the table's label and section directives are not visible
// here; presumably this is the scan8 table that the comments in
// ff_h264_idct_add8_neon refer to -- TODO confirm.
401 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
402 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
403 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
404 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
405 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
406 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
407 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
408 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
409 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
410 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
411 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
412 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8