2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
// ff_h264_idct_add_neon
// Loads four vectors of four 16-bit coefficients from [x1], runs two
// add/sub passes separated by a 4x4 transpose, applies a rounding right
// shift by 6 and adds the result to four 32-bit rows read from [x0]
// (row step x2).
// NOTE(review): this listing looks incomplete. v30 is never initialised,
// nothing narrows the 16-bit sums back to bytes before the final stores, and
// no return / end-of-function marker is visible. Confirm against the full file.
25 function ff_h264_idct_add_neon, export=1
26 ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
// First pass: v4/v5 = v0 +/- v2, v6 = (v1 >> 1) - v3, v7 = v1 + (v3 >> 1).
30 add v4.4H, v0.4H, v2.4H
31 sshr v16.4H, v1.4H, #1
// Overwrite the first 16 bytes of the coefficient block with v30
// (presumably zero; its setup is not visible here). x1 advances by 16.
32 st1 {v30.8H}, [x1], #16
33 sshr v17.4H, v3.4H, #1
// Overwrite the second 16 bytes of the coefficient block with v30.
34 st1 {v30.8H}, [x1], #16
35 sub v5.4H, v0.4H, v2.4H
36 sub v6.4H, v16.4H, v3.4H
37 add v7.4H, v1.4H, v17.4H
// Recombine the first-pass terms into v0..v3.
38 add v0.4H, v4.4H, v7.4H
39 add v1.4H, v5.4H, v6.4H
40 sub v2.4H, v5.4H, v6.4H
41 sub v3.4H, v4.4H, v7.4H
// transpose_4x4H is a macro defined outside this view; v4..v7 are passed as
// its trailing arguments (presumably scratch registers).
43 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
// Second pass, interleaved with the loads of the four destination rows.
// Here v6 = v1 + (v3 >> 1) and v7 = (v1 >> 1) - v3, i.e. the roles of v6/v7
// are swapped relative to the first pass.
45 add v4.4H, v0.4H, v2.4H
// Row 0 -> v18.S[0]
46 ld1 {v18.S}[0], [x0], x2
47 sshr v16.4H, v3.4H, #1
48 sshr v17.4H, v1.4H, #1
// Row 1 -> v18.S[1]
49 ld1 {v18.S}[1], [x0], x2
50 sub v5.4H, v0.4H, v2.4H
// Row 2 -> v19.S[1] (lanes of v19 are filled in reverse order)
51 ld1 {v19.S}[1], [x0], x2
52 add v6.4H, v16.4H, v1.4H
54 sub v7.4H, v17.4H, v3.4H
// Row 3 -> v19.S[0]
55 ld1 {v19.S}[0], [x0], x2
// Rewind x0 by 4 * x2 so it points at row 0 again.
57 sub x0, x0, x2, lsl #2
// Full 128-bit add/sub. No visible instruction writes the upper halves of
// v4/v6; presumably v5/v7 are inserted there by lines not shown - confirm.
58 add v0.8H, v4.8H, v6.8H
59 sub v1.8H, v4.8H, v6.8H
// Rounding right shift by 6.
61 srshr v0.8H, v0.8H, #6
62 srshr v1.8H, v1.8H, #6
// Add the zero-extended destination bytes to the 16-bit results.
64 uaddw v0.8H, v0.8H, v18.8B
65 uaddw v1.8H, v1.8H, v19.8B
// Write the four rows back; the lane order for v1 mirrors the reversed
// lane order used when loading v19.
70 st1 {v0.S}[0], [x0], x2
71 st1 {v0.S}[1], [x0], x2
72 st1 {v1.S}[1], [x0], x2
73 st1 {v1.S}[0], [x0], x2
// ff_h264_idct_dc_add_neon
// Adds one rounded 16-bit vector (v2) to four 32-bit rows read from [x0]
// (row step x2).
// NOTE(review): the load of v2, any narrowing of v3/v4 back to bytes and the
// return / end marker are not visible in this view. As listed, the stores
// write back v0/v1 exactly as loaded. Confirm against the full file.
79 function ff_h264_idct_dc_add_neon, export=1
// Rounding right shift by 6 of the value held in v2.
84 srshr v2.8H, v2.8H, #6
// Rows 0 and 1 -> v0
85 ld1 {v0.S}[0], [x0], x2
86 ld1 {v0.S}[1], [x0], x2
// v3 = v2 + zero-extended bytes of rows 0 and 1
87 uaddw v3.8H, v2.8H, v0.8B
// Rows 2 and 3 -> v1
88 ld1 {v1.S}[0], [x0], x2
89 ld1 {v1.S}[1], [x0], x2
// v4 = v2 + zero-extended bytes of rows 2 and 3
90 uaddw v4.8H, v2.8H, v1.8B
// Rewind x0 by 4 * x2 so it points at row 0 again.
93 sub x0, x0, x2, lsl #2
// Store the four rows from v0/v1.
94 st1 {v0.S}[0], [x0], x2
95 st1 {v0.S}[1], [x0], x2
96 st1 {v1.S}[0], [x0], x2
97 st1 {v1.S}[1], [x0], x2
// ff_h264_idct_add16_neon
// Visible fragment: keeps the second argument in x5, takes the addresses of
// the DC-only and full 4x4 routines, reads one byte from a table at x4 and
// selects one of the two routine addresses into x15.
// NOTE(review): the loop, the compare that sets the flags, the call through
// x15 and the return are not visible in this view - confirm in the full file.
101 function ff_h264_idct_add16_neon, export=1
104 mov x5, x1 // block_offset
// x13 = DC-only routine, x14 = full routine
109 movrel x13, X(ff_h264_idct_dc_add_neon)
110 movrel x14, X(ff_h264_idct_add_neon)
// w3 = byte at x4 + zero-extended w3
114 ldrb w3, [x4, w3, uxtw]
// x15 = x13 when the flags say "ne", otherwise x14
120 csel x15, x13, x14, ne
// ff_h264_idct_add16intra_neon
// Visible fragment: same set-up as ff_h264_idct_add16_neon, but the routine
// address is selected on the "eq" condition instead of "ne".
// NOTE(review): the loop, the compare that sets the flags, the call through
// x15 and the return are not visible in this view - confirm in the full file.
128 function ff_h264_idct_add16intra_neon, export=1
131 mov x5, x1 // block_offset
// x13 = DC-only routine, x14 = full routine
136 movrel x13, X(ff_h264_idct_dc_add_neon)
137 movrel x14, X(ff_h264_idct_add_neon)
// w3 = byte at x4 + zero-extended w3
141 ldrb w3, [x4, w3, uxtw]
// x15 = x13 when the flags say "eq", otherwise x14
145 csel x15, x13, x14, eq
// ff_h264_idct_add8_neon
// Visible fragment: loads two 64-bit values from [x0], derives a
// block_offset pointer (x1 + 16*4) and a block pointer (x2 + 16*32), then for
// index x10 computes a destination address and a coefficient pointer and
// selects the DC-only or full 4x4 routine.
// NOTE(review): x19 and x20 are written here and are callee-saved under
// AAPCS64; their save/restore, the setup of x7 and x11, the flag-setting
// compares, the calls and the return are not visible - confirm in the full file.
155 function ff_h264_idct_add8_neon, export=1
159 ldp x6, x15, [x0] // dest[0], dest[1]
160 add x5, x1, #16*4 // block_offset
161 add x9, x2, #16*32 // block
162 mov w19, w3 // stride
// x13 = DC-only routine, x14 = full routine
163 movrel x13, X(ff_h264_idct_dc_add_neon)
164 movrel x14, X(ff_h264_idct_add_neon)
// x7 is not set in the visible lines; presumably the scan8 table base.
169 ldrb w3, [x7, x10] // scan8[i]
// Sign-extending 32-bit load, indexed by x10 * 4.
170 ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
171 ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
172 add x0, x0, x6 // block_offset[i] + dst[j-1]
// x10 << 5 is a 32-byte step, i.e. 16 coefficients of 16 bits each.
173 add x1, x9, x10, lsl #5 // block + i * 16
// Sign-extending load of the first coefficient of the block.
175 ldrsh w3, [x1] // block[i*16]
// x20 = x13 when the flags say "eq", otherwise x14
176 csel x20, x13, x14, eq
// x10 = x11 when the flags say "eq", otherwise unchanged.
182 csel x10, x11, x10, eq // mov x10, #16
// idct8x8_cols pass
// Add/sub/shift network over v24..v31 (eight 16-bit lanes each), using v16,
// v17, v19 and the aliases va/vb as temporaries.
// NOTE(review): the definitions of the aliases `va`/`vb`, any conditional
// directives that use \pass, and the macro terminator are not visible in this
// view. The two near-identical instruction groups at the start, and the two
// at the end, are presumably alternatives selected by \pass - confirm against
// the full file before editing.
191 .macro idct8x8_cols pass
// Group A of the opening stage: loads v30/v31 from [x1] and then overwrites
// those 32 bytes with v19 before v19 is reused as a temporary.
// Produces v16 = v24 + v28, v17 = v24 - v28, v18 = (v26 >> 1) - v30,
// v19 = (v30 >> 1) + v26.
195 sshr v18.8H, v26.8H, #1
196 add v16.8H, v24.8H, v28.8H
197 ld1 {v30.8H, v31.8H}, [x1]
198 st1 {v19.8H}, [x1], #16
199 st1 {v19.8H}, [x1], #16
200 sub v17.8H, v24.8H, v28.8H
201 sshr v19.8H, v30.8H, #1
202 sub v18.8H, v18.8H, v30.8H
203 add v19.8H, v19.8H, v26.8H
// Group B of the opening stage: same arithmetic with v18 and v30 exchanged
// and no memory access. Produces v30 = (v26 >> 1) - v18,
// v19 = (v18 >> 1) + v26.
207 sshr v30.8H, v26.8H, #1
208 sshr v19.8H, v18.8H, #1
209 add v16.8H, v24.8H, v28.8H
210 sub v17.8H, v24.8H, v28.8H
211 sub v30.8H, v30.8H, v18.8H
212 add v19.8H, v19.8H, v26.8H
// Combine the opening-stage terms into v24, v26, v28 and vb.
214 add v26.8H, v17.8H, va.8H
215 sub v28.8H, v17.8H, va.8H
216 add v24.8H, v16.8H, v19.8H
217 sub vb.8H, v16.8H, v19.8H
// Terms built from v25, v27, v29, v31: first sums and differences of the
// unshifted inputs ...
218 sub v16.8H, v29.8H, v27.8H
219 add v17.8H, v31.8H, v25.8H
220 sub va.8H, v31.8H, v25.8H
221 add v19.8H, v29.8H, v27.8H
222 sub v16.8H, v16.8H, v31.8H
223 sub v17.8H, v17.8H, v27.8H
224 add va.8H, va.8H, v29.8H
225 add v19.8H, v19.8H, v25.8H
// ... then the same inputs halved are applied once more.
226 sshr v25.8H, v25.8H, #1
227 sshr v27.8H, v27.8H, #1
228 sshr v29.8H, v29.8H, #1
229 sshr v31.8H, v31.8H, #1
230 sub v16.8H, v16.8H, v31.8H
231 sub v17.8H, v17.8H, v27.8H
232 add va.8H, va.8H, v29.8H
233 add v19.8H, v19.8H, v25.8H
// Cross terms: each accumulator is combined with another one shifted right by 2.
234 sshr v25.8H, v16.8H, #2
235 sshr v27.8H, v17.8H, #2
236 sshr v29.8H, va.8H, #2
237 sshr v31.8H, v19.8H, #2
238 sub v19.8H, v19.8H, v25.8H
239 sub va.8H, v27.8H, va.8H
240 add v17.8H, v17.8H, v29.8H
241 add v16.8H, v16.8H, v31.8H
// Group A of the closing stage: final sums/differences into v24..v29, v31
// and v18.
243 sub v31.8H, v24.8H, v19.8H
244 add v24.8H, v24.8H, v19.8H
245 add v25.8H, v26.8H, v18.8H
246 sub v18.8H, v26.8H, v18.8H
247 add v26.8H, v28.8H, v17.8H
248 add v27.8H, v30.8H, v16.8H
249 sub v29.8H, v28.8H, v17.8H
250 sub v28.8H, v30.8H, v16.8H
// Group B of the closing stage: same arithmetic with v18 and v30 exchanged,
// so the results land in v24..v31.
252 sub v31.8H, v24.8H, v19.8H
253 add v24.8H, v24.8H, v19.8H
254 add v25.8H, v26.8H, v30.8H
255 sub v30.8H, v26.8H, v30.8H
256 add v26.8H, v28.8H, v17.8H
257 sub v29.8H, v28.8H, v17.8H
258 add v27.8H, v18.8H, v16.8H
259 sub v28.8H, v18.8H, v16.8H
// ff_h264_idct8_add_neon
// Loads six vectors of eight 16-bit coefficients from [x1] (overwriting each
// loaded region with v19), transposes, applies a rounding right shift by 6 to
// v24..v31, adds eight 8-byte rows read from [x0] (row step x2) and stores
// eight rows through x3.
// NOTE(review): the initialisation of v19 and x3, any idct8x8_cols
// invocations, the narrowing of v24..v31 back to bytes and the return / end
// marker are not visible in this view. As listed, the stores write v0..v7 as
// loaded. Confirm against the full file.
265 function ff_h264_idct8_add_neon, export=1
// Load 32 bytes, then overwrite them with v19 (presumably zero); x1 advances
// by 32 per pair of stores.
267 ld1 {v24.8H, v25.8H}, [x1]
268 st1 {v19.8H}, [x1], #16
269 st1 {v19.8H}, [x1], #16
270 ld1 {v26.8H, v27.8H}, [x1]
271 st1 {v19.8H}, [x1], #16
272 st1 {v19.8H}, [x1], #16
273 ld1 {v28.8H, v29.8H}, [x1]
274 st1 {v19.8H}, [x1], #16
275 st1 {v19.8H}, [x1], #16
// transpose_8x8H is a macro defined outside this view. Note that the seventh
// row operand is v18 rather than v30; v6/v7 are presumably scratch registers.
278 transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
// Rounding right shift by 6 of each result row, interleaved with loading the
// eight destination rows into v0..v7.
282 srshr v24.8H, v24.8H, #6
283 ld1 {v0.8B}, [x0], x2
284 srshr v25.8H, v25.8H, #6
285 ld1 {v1.8B}, [x0], x2
286 srshr v26.8H, v26.8H, #6
287 ld1 {v2.8B}, [x0], x2
288 srshr v27.8H, v27.8H, #6
289 ld1 {v3.8B}, [x0], x2
290 srshr v28.8H, v28.8H, #6
291 ld1 {v4.8B}, [x0], x2
292 srshr v29.8H, v29.8H, #6
293 ld1 {v5.8B}, [x0], x2
294 srshr v30.8H, v30.8H, #6
295 ld1 {v6.8B}, [x0], x2
296 srshr v31.8H, v31.8H, #6
297 ld1 {v7.8B}, [x0], x2
// Add the zero-extended destination bytes to each 16-bit row, interleaved
// with the row stores through x3 (row step x2).
298 uaddw v24.8H, v24.8H, v0.8B
299 uaddw v25.8H, v25.8H, v1.8B
300 uaddw v26.8H, v26.8H, v2.8B
302 uaddw v27.8H, v27.8H, v3.8B
304 uaddw v28.8H, v28.8H, v4.8B
306 st1 {v0.8B}, [x3], x2
307 uaddw v29.8H, v29.8H, v5.8B
309 st1 {v1.8B}, [x3], x2
310 uaddw v30.8H, v30.8H, v6.8B
312 st1 {v2.8B}, [x3], x2
313 uaddw v31.8H, v31.8H, v7.8B
315 st1 {v3.8B}, [x3], x2
318 st1 {v4.8B}, [x3], x2
319 st1 {v5.8B}, [x3], x2
320 st1 {v6.8B}, [x3], x2
321 st1 {v7.8B}, [x3], x2
// ff_h264_idct8_dc_add_neon
// Adds one rounded 16-bit vector (v31) to eight 8-byte rows read from [x0]
// (row step x2), then rewinds x0 and stores eight rows.
// NOTE(review): the load of v31, any narrowing of v24..v31 back to bytes and
// the return / end marker are not visible in this view. As listed, the stores
// write v0..v7 as loaded. Confirm against the full file.
327 function ff_h264_idct8_dc_add_neon, export=1
332 ld1 {v0.8B}, [x0], x2
// Rounding right shift by 6 of the value held in v31.
333 srshr v31.8H, v31.8H, #6
334 ld1 {v1.8B}, [x0], x2
335 ld1 {v2.8B}, [x0], x2
// Each uaddw adds one zero-extended destination row to v31; the loads of the
// remaining rows are interleaved between them.
336 uaddw v24.8H, v31.8H, v0.8B
337 ld1 {v3.8B}, [x0], x2
338 uaddw v25.8H, v31.8H, v1.8B
339 ld1 {v4.8B}, [x0], x2
340 uaddw v26.8H, v31.8H, v2.8B
341 ld1 {v5.8B}, [x0], x2
342 uaddw v27.8H, v31.8H, v3.8B
343 ld1 {v6.8B}, [x0], x2
344 uaddw v28.8H, v31.8H, v4.8B
345 ld1 {v7.8B}, [x0], x2
346 uaddw v29.8H, v31.8H, v5.8B
347 uaddw v30.8H, v31.8H, v6.8B
// This one overwrites v31 itself, so it has to be the last sum.
348 uaddw v31.8H, v31.8H, v7.8B
// Rewind x0 by 8 * x2 so it points at row 0 again.
353 sub x0, x0, x2, lsl #3
354 st1 {v0.8B}, [x0], x2
356 st1 {v1.8B}, [x0], x2
358 st1 {v2.8B}, [x0], x2
360 st1 {v3.8B}, [x0], x2
362 st1 {v4.8B}, [x0], x2
363 st1 {v5.8B}, [x0], x2
364 st1 {v6.8B}, [x0], x2
365 st1 {v7.8B}, [x0], x2
// ff_h264_idct8_add4_neon
// Visible fragment: takes the addresses of the DC-only and full 8x8 routines,
// reads one byte from a table at x4 and selects one of the two addresses
// into x15.
// NOTE(review): the loop, the setup of w9, the compare that sets the flags,
// the call through x15 and the return are not visible in this view - confirm
// in the full file.
369 function ff_h264_idct8_add4_neon, export=1
// x13 = DC-only routine, x14 = full routine
377 movrel x13, X(ff_h264_idct8_dc_add_neon)
378 movrel x14, X(ff_h264_idct8_add_neon)
// w9 = byte at x4 + zero-extended w9
381 ldrb w9, [x4, w9, UXTW]
// x15 = x13 when the flags say "ne", otherwise x14
387 csel x15, x13, x14, ne
// Table of 48 byte entries, each written as column + row*8 with columns 4..7.
// Rows 1..4 fill the first 16 entries, rows 6..9 the next 16 and rows 11..14
// the last 16; within each line the four entries form a 2x2 group.
// NOTE(review): the label / section directive that names this table is not
// visible in this view; it is presumably the scan8 table indexed by the
// routines in this file - confirm against the full file.
396 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
397 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
398 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
399 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
400 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
401 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
402 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
403 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
404 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
405 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
406 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
407 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8