2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
/*
 * ff_h264_idct_add_neon is the exported entry point.  The plain label
 * h264_idct_add_neon_nothumb placed right after it is the name used by
 * the `adr lr, ... + CONFIG_THUMB` selections in the add16, add16intra
 * and add8 functions of this file.
 *
 * Registers as used by the memory accesses below:
 *   r0 - pointer used for the 32-bit row loads and, after being stepped
 *        back, for the 32-bit row stores; presumably the destination
 *        pixels - TODO confirm against the C prototype
 *   r1 - 128-bit aligned pointer to a 32-byte region that is loaded and
 *        then overwritten; presumably the coefficient block - confirm
 *   r2 - byte step added to r0 after every row access
 */
23 function ff_h264_idct_add_neon, export=1
24 h264_idct_add_neon_nothumb:
/* Load 32 bytes from [r1] into d0-d3; r1 itself is not advanced here. */
25 vld1.64 {d0-d3}, [r1,:128]
/*
 * Two 16-byte stores of q15, each post-incrementing r1: together they
 * write 32 bytes and leave r1 advanced by 32.  Presumably q15 is zero so
 * that the region just loaded is cleared - TODO confirm where q15 is set.
 */
29 vst1.16 {q15}, [r1,:128]!
31 vst1.16 {q15}, [r1,:128]!
/*
 * Read four 32-bit rows from [r0], adding r2 to r0 after each one.
 * The lanes are filled in the order d18[0], d19[1], d18[1], d19[0];
 * the row stores at the end use the same lane order on d0/d1.
 */
45 vld1.32 {d18[0]}, [r0,:32], r2
48 vld1.32 {d19[1]}, [r0,:32], r2
50 vld1.32 {d18[1]}, [r0,:32], r2
52 vld1.32 {d19[0]}, [r0,:32], r2
/* r0 -= 4 * r2: steps back over the four post-increments made above. */
54 sub r0, r0, r2, lsl #2
/*
 * Write four 32-bit rows through r0 (r0 += r2 after each), taken from
 * d0[0], d1[1], d0[1], d1[0] - the same lane order as the row loads.
 */
67 vst1.32 {d0[0]}, [r0,:32], r2
68 vst1.32 {d1[1]}, [r0,:32], r2
69 vst1.32 {d0[1]}, [r0,:32], r2
70 vst1.32 {d1[0]}, [r0,:32], r2
/*
 * ff_h264_idct_dc_add_neon is the exported entry point.  The plain label
 * h264_idct_dc_add_neon_nothumb is the name used by the
 * `adr lr, ... + CONFIG_THUMB` selections elsewhere in this file.
 *
 * Registers as used by the memory accesses below:
 *   r0 - pointer for four 32-bit row loads and, after being stepped
 *        back, four 32-bit row stores; presumably destination pixels -
 *        TODO confirm
 *   r1 - pointer to a single 16-bit value (16-bit alignment hint)
 *   r2 - byte step added to r0 after every row access
 */
76 function ff_h264_idct_dc_add_neon, export=1
77 h264_idct_dc_add_neon_nothumb:
/* Load one 16-bit value from [r1] and replicate it to every lane of d2 and d3. */
79 vld1.16 {d2[],d3[]}, [r1,:16]
/* Rows 0 and 1: 32 bits each into d0[0] and d0[1]; r0 += r2 after each. */
82 vld1.32 {d0[0]}, [r0,:32], r2
83 vld1.32 {d0[1]}, [r0,:32], r2
/* Rows 2 and 3: 32 bits each into d1[0] and d1[1]; r0 += r2 after each. */
85 vld1.32 {d1[0]}, [r0,:32], r2
86 vld1.32 {d1[1]}, [r0,:32], r2
/* r0 -= 4 * r2: steps back over the four post-increments made by the loads. */
90 sub r0, r0, r2, lsl #2
/* Store the four rows back in the same lane order they were loaded in. */
91 vst1.32 {d0[0]}, [r0,:32], r2
92 vst1.32 {d0[1]}, [r0,:32], r2
93 vst1.32 {d1[0]}, [r0,:32], r2
94 vst1.32 {d1[1]}, [r0,:32], r2
/*
 * ff_h264_idct_add16_neon: exported entry point.
 */
98 function ff_h264_idct_add16_neon, export=1
/*
 * Pick a routine address into lr from the current condition flags:
 *   NE -> h264_idct_dc_add_neon_nothumb
 *   EQ -> h264_idct_add_neon_nothumb
 * CONFIG_THUMB is added to the label address; presumably it is 1 in
 * Thumb builds so that the low (interworking) bit of lr is set - confirm.
 * NOTE(review): the flags must be set by the instruction(s) directly
 * before this pair; presumably lr is then used for an indirect call.
 */
118 adrne lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
119 adreq lr, h264_idct_add_neon_nothumb + CONFIG_THUMB
/*
 * ff_h264_idct_add16intra_neon: exported entry point.
 */
127 function ff_h264_idct_add16intra_neon, export=1
/*
 * Pick a routine address into lr from the current condition flags:
 *   NE -> h264_idct_add_neon_nothumb
 *   EQ -> h264_idct_dc_add_neon_nothumb
 * Note the NE/EQ assignment is the reverse of the one written in
 * ff_h264_idct_add16_neon.  CONFIG_THUMB is added to the label address;
 * presumably it sets the interworking bit in Thumb builds - confirm.
 */
143 adrne lr, h264_idct_add_neon_nothumb + CONFIG_THUMB
144 adreq lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
/*
 * ff_h264_idct_add8_neon: exported entry point.
 *
 * In the loop below r12 is used as an index with three different
 * scalings: bytes from r7, 32-bit words from r5, 32-byte units from r10.
 */
153 function ff_h264_idct_add8_neon, export=1
/* Numeric local label `1:`; r8 = unsigned byte at r7 + r12. */
163 1: ldrb r8, [r7, r12]
/* r0 = 32-bit word at r5 + r12 * 4. */
164 ldr r0, [r5, r12, lsl #2]
/*
 * r1 = r10 + r12 * 32, i.e. 32 bytes per index step.  32 bytes is also
 * the amount h264_idct_add_neon loads through r1.
 */
167 add r1, r10, r12, lsl #5
/*
 * Pick a routine address into lr from the current condition flags:
 *   NE -> h264_idct_add_neon_nothumb
 *   EQ -> h264_idct_dc_add_neon_nothumb
 * CONFIG_THUMB is added to the label address; presumably it sets the
 * interworking bit in Thumb builds - confirm.
 */
171 adrne lr, h264_idct_add_neon_nothumb + CONFIG_THUMB
172 adreq lr, h264_idct_dc_add_neon_nothumb + CONFIG_THUMB
/*
 * idct8x8_cols: assembler macro with one argument, `pass`.
 * All arithmetic below works on 16-bit lanes of q registers, in place.
 * Several instructions overwrite a register that an earlier instruction
 * has just read, so the instruction order must not be changed.
 */
185 .macro idct8x8_cols pass
/* Load 32 bytes from [r1] into q14-q15 (r1 not advanced by the load). */
191 vld1.16 {q14-q15},[r1,:128]
/*
 * Two 16-byte stores of q3 with post-increment: 32 bytes written and r1
 * advanced by 32.  Presumably q3 is zero here, clearing the region just
 * loaded - TODO confirm.
 */
192 vst1.16 {q3}, [r1,:128]!
193 vst1.16 {q3}, [r1,:128]!
/* q14 = q10 >> 1 (arithmetic shift on signed 16-bit lanes). */
207 vshr.s16 q14, q10, #1
/* q14 -= q2 */
214 vsub.i16 q14, q14, q2
/* Difference and sum of the same pair: q0 = q13 - q11, q3 = q13 + q11. */
221 vsub.i16 q0, q13, q11
224 vadd.i16 q3, q13, q11
/* Halve q11, q13 and q15 in place (arithmetic shift right by 1). */
230 vshr.s16 q11, q11, #1
231 vshr.s16 q13, q13, #1
232 vshr.s16 q15, q15, #1
/* q10 = q12 + q1 */
251 vadd.i16 q10, q12, q1
/* q11 = q14 + q0 */
253 vadd.i16 q11, q14, q0
/* q13 = q12 - q1; reads q12 before the next line replaces it. */
254 vsub.i16 q13, q12, q1
/* q12 = q14 - q0 */
256 vsub.i16 q12, q14, q0
/* q9 = q10 + q14, then q14 = q10 - q14 (q14 is replaced only after the sum has read it). */
260 vadd.i16 q9, q10, q14
261 vsub.i16 q14, q10, q14
/* q10 = q12 + q1 and q13 = q12 - q1. */
262 vadd.i16 q10, q12, q1
263 vsub.i16 q13, q12, q1
/*
 * ff_h264_idct8_add_neon is the exported entry point.  The plain label
 * h264_idct8_add_neon_nothumb is the name used by the
 * `adr lr, ... + CONFIG_THUMB` selection in ff_h264_idct8_add4_neon.
 *
 * Registers as used by the memory accesses below:
 *   r0 - pointer for eight 8-byte row loads (64-bit alignment hint);
 *        presumably the destination pixels - TODO confirm
 *   r1 - 128-bit aligned pointer to 16-bit data that is loaded 32 bytes
 *        at a time and overwritten with q3 as it is consumed
 *   r2 - byte step added to r0 / r3 after every row access
 *   r3 - pointer for the eight 8-byte row stores; presumably a copy of
 *        the incoming r0 - TODO confirm where it is set
 */
271 function ff_h264_idct8_add_neon, export=1
272 h264_idct8_add_neon_nothumb:
/*
 * Three identical groups: load 32 bytes from [r1] into a q-register pair,
 * then store q3 twice (16 bytes each, post-increment) over the same 32
 * bytes, which also moves r1 on to the next 32 bytes.  Presumably q3 is
 * zero so the consumed data is cleared - TODO confirm where q3 is set.
 */
274 vld1.16 {q8-q9}, [r1,:128]
275 vst1.16 {q3}, [r1,:128]!
276 vst1.16 {q3}, [r1,:128]!
277 vld1.16 {q10-q11},[r1,:128]
278 vst1.16 {q3}, [r1,:128]!
279 vst1.16 {q3}, [r1,:128]!
280 vld1.16 {q12-q13},[r1,:128]
281 vst1.16 {q3}, [r1,:128]!
282 vst1.16 {q3}, [r1,:128]!
/*
 * Row loads interleaved with scaling: each vld1.8 reads one 8-byte row
 * from [r0] into d0..d7 (r0 += r2 after each), and each vrshr.s16 applies
 * a rounding arithmetic shift right by 6 to the 16-bit lanes of one of
 * q10..q15.
 */
289 vld1.8 {d0}, [r0,:64], r2
291 vld1.8 {d1}, [r0,:64], r2
292 vrshr.s16 q10, q10, #6
293 vld1.8 {d2}, [r0,:64], r2
294 vrshr.s16 q11, q11, #6
295 vld1.8 {d3}, [r0,:64], r2
296 vrshr.s16 q12, q12, #6
297 vld1.8 {d4}, [r0,:64], r2
298 vrshr.s16 q13, q13, #6
299 vld1.8 {d5}, [r0,:64], r2
300 vrshr.s16 q14, q14, #6
301 vld1.8 {d6}, [r0,:64], r2
302 vrshr.s16 q15, q15, #6
303 vld1.8 {d7}, [r0,:64], r2
/*
 * vaddw.u8 qN, qN, dM: widen the eight unsigned bytes of dM to 16 bits
 * and add them to qN (row d2 into q10, d3 into q11, ... d7 into q15).
 * The vst1.8 instructions write 8-byte rows through r3 (r3 += r2 after
 * each).  NOTE(review): d0-d7 are stored after having been used as the
 * loaded rows; presumably they are refilled with the narrowed sums
 * before each store - confirm.
 */
306 vaddw.u8 q10, q10, d2
308 vaddw.u8 q11, q11, d3
310 vaddw.u8 q12, q12, d4
312 vst1.8 {d0}, [r3,:64], r2
313 vaddw.u8 q13, q13, d5
315 vst1.8 {d1}, [r3,:64], r2
316 vaddw.u8 q14, q14, d6
318 vst1.8 {d2}, [r3,:64], r2
319 vaddw.u8 q15, q15, d7
321 vst1.8 {d3}, [r3,:64], r2
/* Remaining four rows, stored back to back. */
324 vst1.8 {d4}, [r3,:64], r2
325 vst1.8 {d5}, [r3,:64], r2
326 vst1.8 {d6}, [r3,:64], r2
327 vst1.8 {d7}, [r3,:64], r2
/*
 * ff_h264_idct8_dc_add_neon is the exported entry point.  The plain label
 * h264_idct8_dc_add_neon_nothumb is the name used by the
 * `adr lr, ... + CONFIG_THUMB` selection in ff_h264_idct8_add4_neon.
 *
 * Registers as used by the memory accesses below:
 *   r0 - pointer for eight 8-byte row loads and, after being stepped
 *        back, eight 8-byte row stores (64-bit alignment hint);
 *        presumably the destination pixels - TODO confirm
 *   r1 - pointer to a single 16-bit value (16-bit alignment hint)
 *   r2 - byte step added to r0 after every row access
 */
333 function ff_h264_idct8_dc_add_neon, export=1
334 h264_idct8_dc_add_neon_nothumb:
/* Load one 16-bit value from [r1] and replicate it to all eight lanes of q15 (d30/d31). */
336 vld1.16 {d30[],d31[]},[r1,:16]
338 vld1.32 {d0}, [r0,:64], r2
/* q15 = rounding arithmetic shift right by 6 of each 16-bit lane. */
339 vrshr.s16 q15, q15, #6
/* Remaining row loads (8 bytes each into d1..d7, r0 += r2 after each), interleaved with the widening adds. */
340 vld1.32 {d1}, [r0,:64], r2
341 vld1.32 {d2}, [r0,:64], r2
343 vld1.32 {d3}, [r0,:64], r2
345 vld1.32 {d4}, [r0,:64], r2
/* vaddw.u8 qN, q15, dM: qN = q15 + (bytes of dM widened to 16 bits). */
346 vaddw.u8 q10, q15, d2
347 vld1.32 {d5}, [r0,:64], r2
348 vaddw.u8 q11, q15, d3
349 vld1.32 {d6}, [r0,:64], r2
350 vaddw.u8 q12, q15, d4
351 vld1.32 {d7}, [r0,:64], r2
352 vaddw.u8 q13, q15, d5
353 vaddw.u8 q14, q15, d6
/* This add overwrites q15 itself, so it has to stay the last one that reads q15. */
354 vaddw.u8 q15, q15, d7
/* r0 -= 8 * r2: steps back over the eight post-increments made by the row loads. */
359 sub r0, r0, r2, lsl #3
/*
 * Store eight 8-byte rows from d0..d7 through r0 (r0 += r2 after each).
 * NOTE(review): presumably d0-d7 have been refilled with the narrowed
 * sums before they are stored - confirm.
 */
360 vst1.32 {d0}, [r0,:64], r2
362 vst1.32 {d1}, [r0,:64], r2
364 vst1.32 {d2}, [r0,:64], r2
366 vst1.32 {d3}, [r0,:64], r2
368 vst1.32 {d4}, [r0,:64], r2
369 vst1.32 {d5}, [r0,:64], r2
370 vst1.32 {d6}, [r0,:64], r2
371 vst1.32 {d7}, [r0,:64], r2
/*
 * ff_h264_idct8_add4_neon: exported entry point.
 */
375 function ff_h264_idct8_add4_neon, export=1
/*
 * Pick a routine address into lr from the current condition flags:
 *   NE -> h264_idct8_dc_add_neon_nothumb
 *   EQ -> h264_idct8_add_neon_nothumb
 * CONFIG_THUMB is added to the label address; presumably it is 1 in
 * Thumb builds so that the low (interworking) bit of lr is set - confirm.
 */
395 adrne lr, h264_idct8_dc_add_neon_nothumb + CONFIG_THUMB
396 adreq lr, h264_idct8_add_neon_nothumb + CONFIG_THUMB
/*
 * Byte table: 12 rows of 4 entries (48 bytes).  Every entry is written as
 * x + y*8 with x in 4..7.  The rows form three groups of 16 entries whose
 * y values are 1..4, 6..9 and 11..14.  Within a group the entries walk
 * 2x2 squares: (x, y), (x+1, y), (x, y+1), (x+1, y+1).
 * The largest value is 7 + 14*8 = 119, so every entry fits in one byte.
 * NOTE(review): presumably this is the lookup table read with ldrb by the
 * add16/add8 style functions in this file - confirm against its label.
 */
/* Group 1: y = 1..4 */
405 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
406 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
407 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
408 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
/* Group 2: y = 6..9 */
409 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
410 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
411 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
412 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
/* Group 3: y = 11..14 */
413 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
414 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
415 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
416 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8