2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
// ff_h264_idct_add_neon: two butterfly passes over a 4x4 block of halfword
// coefficients, then adds the result to four rows of pixels.
// Register use visible in this block:
//   x0 = pixel pointer (rows are read and written as single 32-bit lanes)
//   x1 = pointer to 16 halfword coefficients
//   x2 = post-index step between pixel rows
// NOTE(review): this view appears to be missing lines of the routine (nothing
// here initialises v30 or the upper halves of v4/v6, narrows the 16-bit sums
// before the final stores, or returns) -- confirm against the full file.
25 function ff_h264_idct_add_neon, export=1
26 ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
// First pass. The two st1 instructions interleaved below overwrite the
// 32 bytes just loaded with v30 (presumably zero -- TODO confirm).
30 add v4.4H, v0.4H, v2.4H
31 sshr v16.4H, v1.4H, #1
32 st1 {v30.8H}, [x1], #16
33 sshr v17.4H, v3.4H, #1
34 st1 {v30.8H}, [x1], #16
35 sub v5.4H, v0.4H, v2.4H
36 sub v6.4H, v16.4H, v3.4H
37 add v7.4H, v1.4H, v17.4H
38 add v0.4H, v4.4H, v7.4H
39 add v1.4H, v5.4H, v6.4H
40 sub v2.4H, v5.4H, v6.4H
41 sub v3.4H, v4.4H, v7.4H
// Transpose so the second pass works along the other dimension.
43 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
// Second pass, interleaved with loading the four pixel rows.
45 add v4.4H, v0.4H, v2.4H
46 ld1 {v18.S}[0], [x0], x2
47 sshr v16.4H, v3.4H, #1
48 sshr v17.4H, v1.4H, #1
49 ld1 {v18.S}[1], [x0], x2
50 sub v5.4H, v0.4H, v2.4H
// The third row goes into lane 1 of v19 and the fourth into lane 0; the
// stores at the end use the same swapped lane order for v1.
51 ld1 {v19.S}[1], [x0], x2
52 add v6.4H, v16.4H, v1.4H
54 sub v7.4H, v17.4H, v3.4H
55 ld1 {v19.S}[0], [x0], x2
// Rewind x0 by four rows so the stores start at the first row again.
57 sub x0, x0, x2, lsl #2
// NOTE(review): these are 8-lane operations, but only the low four lanes of
// v4 and v6 are written by the visible code above.
58 add v0.8H, v4.8H, v6.8H
59 sub v1.8H, v4.8H, v6.8H
// Rounding shift right by 6, then add the widened pixel bytes.
61 srshr v0.8H, v0.8H, #6
62 srshr v1.8H, v1.8H, #6
64 uaddw v0.8H, v0.8H, v18.8B
65 uaddw v1.8H, v1.8H, v19.8B
// Write the four rows back, one 32-bit lane per row.
70 st1 {v0.S}[0], [x0], x2
71 st1 {v0.S}[1], [x0], x2
72 st1 {v1.S}[1], [x0], x2
73 st1 {v1.S}[0], [x0], x2
// ff_h264_idct_dc_add_neon: adds the rounded value held in v2 to four rows
// of four pixels.
// Register use visible in this block:
//   x0 = pixel pointer, x2 = post-index step between rows.
// NOTE(review): the instructions that fill v2 and that narrow v3/v4 back to
// bytes before the stores are not visible in this view -- confirm against
// the full file.
79 function ff_h264_idct_dc_add_neon, export=1
// Rounding shift right by 6 of every halfword lane of v2.
84 srshr v2.8H, v2.8H, #6
// Rows 0 and 1 go into v0, rows 2 and 3 into v1, one 32-bit lane per row;
// each pair is widened and added to v2.
85 ld1 {v0.S}[0], [x0], x2
86 ld1 {v0.S}[1], [x0], x2
87 uaddw v3.8H, v2.8H, v0.8B
88 ld1 {v1.S}[0], [x0], x2
89 ld1 {v1.S}[1], [x0], x2
90 uaddw v4.8H, v2.8H, v1.8B
// Rewind x0 by four rows and store the rows in the order they were loaded.
93 sub x0, x0, x2, lsl #2
94 st1 {v0.S}[0], [x0], x2
95 st1 {v0.S}[1], [x0], x2
96 st1 {v1.S}[0], [x0], x2
97 st1 {v1.S}[1], [x0], x2
// ff_h264_idct_add16_neon: picks between the DC-only and the full 4x4 add
// routine and keeps the chosen address in x15.
// NOTE(review): only a fragment of this routine is visible here; the loop,
// the compare that sets the flags for csel, and the call are not shown.
101 function ff_h264_idct_add16_neon, export=1
104 mov x5, x1 // block_offset
// x13 / x14 = addresses of the two candidate routines.
109 movrel x13, X(ff_h264_idct_dc_add_neon)
110 movrel x14, X(ff_h264_idct_add_neon)
// Byte lookup in the table at x4, indexed by the previous value of w3.
114 ldrb w3, [x4, w3, uxtw]
// x15 = x13 (dc add) when the condition is ne, otherwise x14 (full add).
120 csel x15, x13, x14, ne
// ff_h264_idct_add16intra_neon: picks between the DC-only and the full 4x4
// add routine and keeps the chosen address in x15.
// NOTE(review): only a fragment of this routine is visible here; the loop,
// the compare that sets the flags for csel, and the call are not shown.
128 function ff_h264_idct_add16intra_neon, export=1
131 mov x5, x1 // block_offset
// x13 / x14 = addresses of the two candidate routines.
136 movrel x13, X(ff_h264_idct_dc_add_neon)
137 movrel x14, X(ff_h264_idct_add_neon)
// Byte lookup in the table at x4, indexed by the previous value of w3.
141 ldrb w3, [x4, w3, uxtw]
// x15 = x13 (dc add) when the condition is eq, otherwise x14 (full add).
145 csel x15, x13, x14, eq
// ff_h264_idct_add8_neon: per-block dispatch that computes a destination
// address and a coefficient address and selects one of the two 4x4 add
// routines.
// NOTE(review): the loop labels, the compares that set the flags for the
// csel instructions, the setup of x7/x10/x11 and the call itself are not
// visible in this view. x19 and x20 are callee-saved under AAPCS64 and no
// save/restore is visible here -- confirm against the full file.
155 function ff_h264_idct_add8_neon, export=1
159 ldp x6, x15, [x0] // dest[0], dest[1]
// Skip the first 16 four-byte block_offset entries and the first 16*32 bytes
// of the coefficient buffer.
160 add x5, x1, #16*4 // block_offset
161 add x9, x2, #16*32 // block
162 mov w19, w3 // stride
// x13 / x14 = addresses of the two candidate routines.
163 movrel x13, X(ff_h264_idct_dc_add_neon)
164 movrel x14, X(ff_h264_idct_add_neon)
// x10 is the index used for all table lookups below.
169 ldrb w3, [x7, x10] // scan8[i]
170 ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
171 ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
172 add x0, x0, x6 // block_offset[i] + dst[j-1]
// x10 << 5 is a byte offset of 32 per block.
173 add x1, x9, x10, lsl #5 // block + i * 16
// Sign-extended load of the block's first halfword coefficient.
175 ldrsh w3, [x1] // block[i*16]
// x20 = x13 (dc add) when the condition is eq, otherwise x14 (full add).
176 csel x20, x13, x14, eq
// Replace x10 with x11 when the condition is eq, otherwise keep x10.
182 csel x10, x11, x10, eq // mov x10, #16
// idct8x8_cols pass: one pass of the 8x8 butterfly over v24-v31, using
// v16-v19 as scratch.
// NOTE(review): the names va and vb are not defined in the visible lines;
// presumably register aliases (.req) set up elsewhere -- confirm.
// NOTE(review): the opening section and the closing section each appear
// twice below with different register assignments. This looks like two
// alternatives selected on the pass argument whose conditional directives
// are not visible in this view -- confirm against the full file.
191 .macro idct8x8_cols pass
// Even-part inputs, first form: loads v30/v31 from [x1] and then overwrites
// those 32 bytes with v19 (v19's contents at this point are not set by the
// visible code -- presumably zero, TODO confirm).
195 sshr v18.8H, v26.8H, #1
196 add v16.8H, v24.8H, v28.8H
197 ld1 {v30.8H, v31.8H}, [x1]
198 st1 {v19.8H}, [x1], #16
199 st1 {v19.8H}, [x1], #16
200 sub v17.8H, v24.8H, v28.8H
201 sshr v19.8H, v30.8H, #1
202 sub v18.8H, v18.8H, v30.8H
203 add v19.8H, v19.8H, v26.8H
// Even-part inputs, second form: same arithmetic with v18 and v30 swapping
// roles, and no memory access.
207 sshr v30.8H, v26.8H, #1
208 sshr v19.8H, v18.8H, #1
209 add v16.8H, v24.8H, v28.8H
210 sub v17.8H, v24.8H, v28.8H
211 sub v30.8H, v30.8H, v18.8H
212 add v19.8H, v19.8H, v26.8H
// Even-part outputs are left in v24, v26, v28 and vb.
214 add v26.8H, v17.8H, va.8H
215 sub v28.8H, v17.8H, va.8H
216 add v24.8H, v16.8H, v19.8H
217 sub vb.8H, v16.8H, v19.8H
// Odd part: sums and differences of v25, v27, v29 and v31.
218 sub v16.8H, v29.8H, v27.8H
219 add v17.8H, v31.8H, v25.8H
220 sub va.8H, v31.8H, v25.8H
221 add v19.8H, v29.8H, v27.8H
222 sub v16.8H, v16.8H, v31.8H
223 sub v17.8H, v17.8H, v27.8H
224 add va.8H, va.8H, v29.8H
225 add v19.8H, v19.8H, v25.8H
// Halve the odd inputs and apply the same four corrections again.
226 sshr v25.8H, v25.8H, #1
227 sshr v27.8H, v27.8H, #1
228 sshr v29.8H, v29.8H, #1
229 sshr v31.8H, v31.8H, #1
230 sub v16.8H, v16.8H, v31.8H
231 sub v17.8H, v17.8H, v27.8H
232 add va.8H, va.8H, v29.8H
233 add v19.8H, v19.8H, v25.8H
// Cross terms: each odd value is combined with another one shifted right
// by 2.
234 sshr v25.8H, v16.8H, #2
235 sshr v27.8H, v17.8H, #2
236 sshr v29.8H, va.8H, #2
237 sshr v31.8H, v19.8H, #2
238 sub v19.8H, v19.8H, v25.8H
239 sub va.8H, v27.8H, va.8H
240 add v17.8H, v17.8H, v29.8H
241 add v16.8H, v16.8H, v31.8H
// Final combine, first form: results are written to v24-v29, v31 and v18.
243 sub v31.8H, v24.8H, v19.8H
244 add v24.8H, v24.8H, v19.8H
245 add v25.8H, v26.8H, v18.8H
246 sub v18.8H, v26.8H, v18.8H
247 add v26.8H, v28.8H, v17.8H
248 add v27.8H, v30.8H, v16.8H
249 sub v29.8H, v28.8H, v17.8H
250 sub v28.8H, v30.8H, v16.8H
// Final combine, second form: results are written to v24-v31.
252 sub v31.8H, v24.8H, v19.8H
253 add v24.8H, v24.8H, v19.8H
254 add v25.8H, v26.8H, v30.8H
255 sub v30.8H, v26.8H, v30.8H
256 add v26.8H, v28.8H, v17.8H
257 sub v29.8H, v28.8H, v17.8H
258 add v27.8H, v18.8H, v16.8H
259 sub v28.8H, v18.8H, v16.8H
// ff_h264_idct8_add_neon: 8x8 transform-and-add. The visible code loads the
// coefficients from x1, transposes them, applies a rounding shift by 6 and
// adds eight rows of eight pixels read from x0.
// Register use visible in this block:
//   x0 = pixel pointer used for the row loads
//   x1 = coefficient pointer
//   x2 = post-index step between rows
//   x3 = pixel pointer used for the row stores
// NOTE(review): several steps are not visible in this view: the
// initialisation of v19 and x3, any use of the column-pass macro, the
// narrowing of v24-v31 into v0-v7 before the stores, and the return --
// confirm against the full file.
265 function ff_h264_idct8_add_neon, export=1
// Load six coefficient vectors; after each 32-byte load the same 32 bytes
// are overwritten with v19 (presumably zero -- TODO confirm).
268 ld1 {v24.8H, v25.8H}, [x1]
269 st1 {v19.8H}, [x1], #16
270 st1 {v19.8H}, [x1], #16
271 ld1 {v26.8H, v27.8H}, [x1]
272 st1 {v19.8H}, [x1], #16
273 st1 {v19.8H}, [x1], #16
274 ld1 {v28.8H, v29.8H}, [x1]
275 st1 {v19.8H}, [x1], #16
276 st1 {v19.8H}, [x1], #16
// The 7th and 8th transpose operands are v18 and v31; v6/v7 are presumably
// scratch for the macro (defined outside this view).
279 transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
// Rounding shift right by 6 on every row, interleaved with loading the
// eight pixel rows into v0-v7.
283 srshr v24.8H, v24.8H, #6
284 ld1 {v0.8B}, [x0], x2
285 srshr v25.8H, v25.8H, #6
286 ld1 {v1.8B}, [x0], x2
287 srshr v26.8H, v26.8H, #6
288 ld1 {v2.8B}, [x0], x2
289 srshr v27.8H, v27.8H, #6
290 ld1 {v3.8B}, [x0], x2
291 srshr v28.8H, v28.8H, #6
292 ld1 {v4.8B}, [x0], x2
293 srshr v29.8H, v29.8H, #6
294 ld1 {v5.8B}, [x0], x2
295 srshr v30.8H, v30.8H, #6
296 ld1 {v6.8B}, [x0], x2
297 srshr v31.8H, v31.8H, #6
298 ld1 {v7.8B}, [x0], x2
// Add the widened pixel bytes to each row, interleaved with the stores.
299 uaddw v24.8H, v24.8H, v0.8B
300 uaddw v25.8H, v25.8H, v1.8B
301 uaddw v26.8H, v26.8H, v2.8B
303 uaddw v27.8H, v27.8H, v3.8B
305 uaddw v28.8H, v28.8H, v4.8B
307 st1 {v0.8B}, [x3], x2
308 uaddw v29.8H, v29.8H, v5.8B
310 st1 {v1.8B}, [x3], x2
311 uaddw v30.8H, v30.8H, v6.8B
313 st1 {v2.8B}, [x3], x2
314 uaddw v31.8H, v31.8H, v7.8B
316 st1 {v3.8B}, [x3], x2
319 st1 {v4.8B}, [x3], x2
320 st1 {v5.8B}, [x3], x2
321 st1 {v6.8B}, [x3], x2
322 st1 {v7.8B}, [x3], x2
// ff_h264_idct8_dc_add_neon: adds the rounded value held in v31 to eight
// rows of eight pixels.
// Register use visible in this block:
//   x0 = pixel pointer, x2 = post-index step between rows.
// NOTE(review): the instructions that fill v31 and that narrow v24-v31 into
// v0-v7 before the stores are not visible in this view -- confirm against
// the full file.
328 function ff_h264_idct8_dc_add_neon, export=1
333 ld1 {v0.8B}, [x0], x2
// Rounding shift right by 6 of every halfword lane of v31.
334 srshr v31.8H, v31.8H, #6
// Row loads are interleaved with the widening adds of earlier rows.
335 ld1 {v1.8B}, [x0], x2
336 ld1 {v2.8B}, [x0], x2
337 uaddw v24.8H, v31.8H, v0.8B
338 ld1 {v3.8B}, [x0], x2
339 uaddw v25.8H, v31.8H, v1.8B
340 ld1 {v4.8B}, [x0], x2
341 uaddw v26.8H, v31.8H, v2.8B
342 ld1 {v5.8B}, [x0], x2
343 uaddw v27.8H, v31.8H, v3.8B
344 ld1 {v6.8B}, [x0], x2
345 uaddw v28.8H, v31.8H, v4.8B
346 ld1 {v7.8B}, [x0], x2
347 uaddw v29.8H, v31.8H, v5.8B
348 uaddw v30.8H, v31.8H, v6.8B
// The last add overwrites v31 itself, so it has to come after the others.
349 uaddw v31.8H, v31.8H, v7.8B
// Rewind x0 by eight rows and store the rows in order.
354 sub x0, x0, x2, lsl #3
355 st1 {v0.8B}, [x0], x2
357 st1 {v1.8B}, [x0], x2
359 st1 {v2.8B}, [x0], x2
361 st1 {v3.8B}, [x0], x2
363 st1 {v4.8B}, [x0], x2
364 st1 {v5.8B}, [x0], x2
365 st1 {v6.8B}, [x0], x2
366 st1 {v7.8B}, [x0], x2
// ff_h264_idct8_add4_neon: picks between the DC-only and the full 8x8 add
// routine and keeps the chosen address in x15.
// NOTE(review): only a fragment of this routine is visible here; the loop,
// the compare that sets the flags for csel, and the call are not shown.
370 function ff_h264_idct8_add4_neon, export=1
// x13 / x14 = addresses of the two candidate routines.
378 movrel x13, X(ff_h264_idct8_dc_add_neon)
379 movrel x14, X(ff_h264_idct8_add_neon)
// Byte lookup in the table at x4, indexed by the previous value of w9.
382 ldrb w9, [x4, w9, UXTW]
// x15 = x13 (dc add) when the condition is ne, otherwise x14 (full add).
388 csel x15, x13, x14, ne
// Byte table with 48 entries, each written as column + row*8 with columns
// 4..7. The entries come in three groups of 16 covering rows 1-4, 6-9 and
// 11-14.
// NOTE(review): the label and section directive for this table are not
// visible in this view; presumably this is the scan8 table referred to by
// the "scan8[i]" comment in ff_h264_idct_add8_neon -- confirm.
397 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
398 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
399 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
400 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
401 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
402 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
403 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
404 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
405 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
406 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
407 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
408 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8