 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
23 function ff_h264_idct_add_neon, export=1
24 vld1.64 {d0-d3}, [r1,:128]
28 vst1.16 {q15}, [r1,:128]!
30 vst1.16 {q15}, [r1,:128]!
44 vld1.32 {d18[0]}, [r0,:32], r2
47 vld1.32 {d19[1]}, [r0,:32], r2
49 vld1.32 {d18[1]}, [r0,:32], r2
51 vld1.32 {d19[0]}, [r0,:32], r2
53 sub r0, r0, r2, lsl #2
66 vst1.32 {d0[0]}, [r0,:32], r2
67 vst1.32 {d1[1]}, [r0,:32], r2
68 vst1.32 {d0[1]}, [r0,:32], r2
69 vst1.32 {d1[0]}, [r0,:32], r2
@ ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only special case of the 4x4 transform: a single coefficient is
@ broadcast and added to all 16 pixels of the 4x4 destination block.
@ NOTE(review): fragmentary extract -- the rounding/add/narrow step
@ between the loads and the stores is not visible here.
75 function ff_h264_idct_dc_add_neon, export=1
@ Broadcast the single 16-bit DC coefficient into every lane of q1.
77 vld1.16 {d2[],d3[]}, [r1,:16]
@ Load the four 4-byte destination rows, stepping by the stride in r2.
80 vld1.32 {d0[0]}, [r0,:32], r2
81 vld1.32 {d0[1]}, [r0,:32], r2
83 vld1.32 {d1[0]}, [r0,:32], r2
84 vld1.32 {d1[1]}, [r0,:32], r2
@ Rewind dst by 4 rows (stride << 2) and write back the same four rows.
88 sub r0, r0, r2, lsl #2
89 vst1.32 {d0[0]}, [r0,:32], r2
90 vst1.32 {d0[1]}, [r0,:32], r2
91 vst1.32 {d1[0]}, [r0,:32], r2
92 vst1.32 {d1[1]}, [r0,:32], r2
@ ff_h264_idct_add16_neon: per-4x4-block dispatch loop (body mostly not
@ visible in this fragment). The visible part picks a handler per block.
96 function ff_h264_idct_add16_neon, export=1
@ Choose DC-only vs full IDCT routine based on a condition set by code
@ not visible here (presumably the nonzero-coefficient test).
@ "+ CONFIG_THUMB" sets the low (Thumb interworking) bit when built Thumb,
@ since adr yields the plain address of the label.
116 adrne lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
117 adreq lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
@ ff_h264_idct_add16intra_neon: like add16 but with the handler selection
@ inverted (ne -> full IDCT, eq -> DC-only), matching intra-block
@ semantics. Loop body not visible in this fragment.
125 function ff_h264_idct_add16intra_neon, export=1
@ Note the condition codes are swapped relative to ff_h264_idct_add16_neon.
141 adrne lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
142 adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
@ ff_h264_idct_add8_neon: chroma block loop (fragment; setup of r5/r7/r10
@ and the loop counter r12 is not visible here).
151 function ff_h264_idct_add8_neon, export=1
@ r8 = byte loaded from table r7 at index r12 -- presumably the
@ nonzero-block flag array; confirm against the full source.
161 1: ldrb r8, [r7, r12]
@ r0 = dst pointer fetched from a pointer/offset array at r5 (index * 4).
162 ldr r0, [r5, r12, lsl #2]
@ r1 = coefficient block: base r10 + index * 32 (32 bytes = one 4x4
@ block of int16 coefficients).
165 add r1, r10, r12, lsl #5
@ Pick DC-only vs full 4x4 handler, Thumb bit folded in via CONFIG_THUMB.
169 adrne lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
170 adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
@ idct8x8_cols: one pass (columns or rows, selected by \pass) of the 8x8
@ H.264 inverse transform, operating on coefficients held in q8-q15.
@ NOTE(review): heavily fragmentary -- most of the butterfly arithmetic
@ and the \pass conditional structure are missing from this view, so the
@ comments below only describe the visible instructions.
183 .macro idct8x8_cols pass
@ Load the last two coefficient rows, then clear them behind us
@ (q3 presumably holds zero -- set outside this fragment; confirm).
189 vld1.16 {q14-q15},[r1,:128]
190 vst1.16 {q3}, [r1,:128]!
191 vst1.16 {q3}, [r1,:128]!
@ Arithmetic-shift-right by 1: the >>1 terms of the 8x8 odd/even
@ butterflies (per the H.264 8x8 transform definition).
205 vshr.s16 q14, q10, #1
212 vsub.i16 q14, q14, q2
219 vsub.i16 q0, q13, q11
222 vadd.i16 q3, q13, q11
228 vshr.s16 q11, q11, #1
229 vshr.s16 q13, q13, #1
230 vshr.s16 q15, q15, #1
@ Final butterfly combinations producing the output rows; the companion
@ vsub/vadd pairs between these lines are not visible in this fragment.
249 vadd.i16 q10, q12, q1
251 vadd.i16 q11, q14, q0
252 vsub.i16 q13, q12, q1
254 vsub.i16 q12, q14, q0
258 vadd.i16 q9, q10, q14
259 vsub.i16 q14, q10, q14
260 vadd.i16 q10, q12, q1
261 vsub.i16 q13, q12, q1
@ ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride)
@ Full 8x8 H.264 inverse transform added to the prediction.
@ r0 = dst, r1 = 8x8 int16 coefficient block, r2 = stride.
@ NOTE(review): fragment -- the idct8x8_cols invocations, the q8/q9
@ rounding, the vqmovun narrowing and the r3 setup are not visible here.
269 function ff_h264_idct8_add_neon, export=1
@ Load all 64 coefficients into q8-q13 (q14/q15 loaded inside the macro),
@ clearing the buffer behind each load (q3 presumably zero; confirm).
271 vld1.16 {q8-q9}, [r1,:128]
272 vst1.16 {q3}, [r1,:128]!
273 vst1.16 {q3}, [r1,:128]!
274 vld1.16 {q10-q11},[r1,:128]
275 vst1.16 {q3}, [r1,:128]!
276 vst1.16 {q3}, [r1,:128]!
277 vld1.16 {q12-q13},[r1,:128]
278 vst1.16 {q3}, [r1,:128]!
279 vst1.16 {q3}, [r1,:128]!
@ Interleave pixel-row loads with rounding shifts: each result row is
@ rounded down by 6 bits (vrshr = rounding shift right), matching the
@ H.264 final-scaling step, while dst rows stream in through d0-d7.
286 vld1.8 {d0}, [r0,:64], r2
288 vld1.8 {d1}, [r0,:64], r2
289 vrshr.s16 q10, q10, #6
290 vld1.8 {d2}, [r0,:64], r2
291 vrshr.s16 q11, q11, #6
292 vld1.8 {d3}, [r0,:64], r2
293 vrshr.s16 q12, q12, #6
294 vld1.8 {d4}, [r0,:64], r2
295 vrshr.s16 q13, q13, #6
296 vld1.8 {d5}, [r0,:64], r2
297 vrshr.s16 q14, q14, #6
298 vld1.8 {d6}, [r0,:64], r2
299 vrshr.s16 q15, q15, #6
300 vld1.8 {d7}, [r0,:64], r2
@ Widen-add each 8-pixel row into its residual row (u8 + s16 -> s16),
@ storing finished rows through r3 (a second dst cursor -- its
@ initialization is outside this fragment; confirm).
303 vaddw.u8 q10, q10, d2
305 vaddw.u8 q11, q11, d3
307 vaddw.u8 q12, q12, d4
309 vst1.8 {d0}, [r3,:64], r2
310 vaddw.u8 q13, q13, d5
312 vst1.8 {d1}, [r3,:64], r2
313 vaddw.u8 q14, q14, d6
315 vst1.8 {d2}, [r3,:64], r2
316 vaddw.u8 q15, q15, d7
318 vst1.8 {d3}, [r3,:64], r2
@ The narrowing of q10-q15 back into d2-d7 happens between these stores
@ in the full source (not visible here).
321 vst1.8 {d4}, [r3,:64], r2
322 vst1.8 {d5}, [r3,:64], r2
323 vst1.8 {d6}, [r3,:64], r2
324 vst1.8 {d7}, [r3,:64], r2
@ ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only special case of the 8x8 transform: one coefficient, rounded by
@ 6 bits, is added to all 64 pixels of the 8x8 destination block.
@ NOTE(review): fragment -- the vqmovun narrowing back to d0-d7 before
@ the stores is not visible here.
330 function ff_h264_idct8_dc_add_neon, export=1
@ Broadcast the single 16-bit DC coefficient into every lane of q15.
332 vld1.16 {d30[],d31[]},[r1,:16]
@ Stream in the eight 8-byte destination rows while the DC value is
@ rounded (>> 6 with rounding) once for all of them.
334 vld1.32 {d0}, [r0,:64], r2
335 vrshr.s16 q15, q15, #6
336 vld1.32 {d1}, [r0,:64], r2
337 vld1.32 {d2}, [r0,:64], r2
339 vld1.32 {d3}, [r0,:64], r2
341 vld1.32 {d4}, [r0,:64], r2
@ Widen-add the broadcast DC to each row (u8 pixels + s16 DC -> s16).
342 vaddw.u8 q10, q15, d2
343 vld1.32 {d5}, [r0,:64], r2
344 vaddw.u8 q11, q15, d3
345 vld1.32 {d6}, [r0,:64], r2
346 vaddw.u8 q12, q15, d4
347 vld1.32 {d7}, [r0,:64], r2
348 vaddw.u8 q13, q15, d5
349 vaddw.u8 q14, q15, d6
@ q15 is consumed last, so reusing it as an accumulator here is safe.
350 vaddw.u8 q15, q15, d7
@ Rewind dst by 8 rows (stride << 3) and write all rows back.
355 sub r0, r0, r2, lsl #3
356 vst1.32 {d0}, [r0,:64], r2
358 vst1.32 {d1}, [r0,:64], r2
360 vst1.32 {d2}, [r0,:64], r2
362 vst1.32 {d3}, [r0,:64], r2
364 vst1.32 {d4}, [r0,:64], r2
365 vst1.32 {d5}, [r0,:64], r2
366 vst1.32 {d6}, [r0,:64], r2
367 vst1.32 {d7}, [r0,:64], r2
@ ff_h264_idct8_add4_neon: per-8x8-block dispatch loop (body mostly not
@ visible in this fragment); mirrors the 4x4 add16 dispatch pattern.
371 function ff_h264_idct8_add4_neon, export=1
@ Select DC-only vs full 8x8 routine based on a condition set by code not
@ visible here; CONFIG_THUMB folds in the Thumb interworking bit.
391 adrne lr, X(ff_h264_idct8_dc_add_neon) + CONFIG_THUMB
392 adreq lr, X(ff_h264_idct8_add_neon) + CONFIG_THUMB
@ Constant byte table of positions encoded as x + y*8, i.e. (column,row)
@ coordinates in an 8-wide grid -- presumably the scan8-style block-index
@ table consumed by the dispatch loops above (its label and section
@ directive are not visible in this fragment; confirm against full source).
@ Rows 5 and 10 of the grid are skipped, splitting the table into
@ luma-like (rows 1-4) and two chroma-like groups (rows 6-9 and 11-14).
401 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
402 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
403 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
404 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
405 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
406 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
407 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
408 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
409 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
410 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
411 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
412 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8