2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
/*
 * ff_h264_idct_add_neon
 *
 * Register use visible in this routine:
 *   r0 - byte pointer that is read row by row and later written back
 *   r1 - pointer to 32 bytes of input, loaded with a 128-bit alignment hint
 *   r2 - byte offset added to r0 after every row access
 *
 * Four 32-bit rows are read from r0, r0 is rewound, and four 32-bit rows
 * are written to the same addresses.
 * NOTE(review): the arithmetic that turns the data loaded from r1 into the
 * d0/d1 values stored below is not visible here - confirm against the full
 * routine before relying on these notes.
 */
23 function ff_h264_idct_add_neon, export=1
/* Load d0-d3 (32 bytes) from r1. */
24 vld1.64 {d0-d3}, [r1,:128]
/* Rows 0..3 go to lanes d18[0], d19[1], d18[1], d19[0]; r0 advances by r2 each time. */
41 vld1.32 {d18[0]}, [r0,:32], r2
44 vld1.32 {d19[1]}, [r0,:32], r2
46 vld1.32 {d18[1]}, [r0,:32], r2
48 vld1.32 {d19[0]}, [r0,:32], r2
/* Undo the four post-increments: r0 -= 4 * r2. */
50 sub r0, r0, r2, lsl #2
/* Rows 0..3 are stored from d0[0], d1[1], d0[1], d1[0] - the same lane pattern as the loads. */
63 vst1.32 {d0[0]}, [r0,:32], r2
64 vst1.32 {d1[1]}, [r0,:32], r2
65 vst1.32 {d0[1]}, [r0,:32], r2
66 vst1.32 {d1[0]}, [r0,:32], r2
/*
 * ff_h264_idct_dc_add_neon
 *
 * Register use visible in this routine:
 *   r0 - byte pointer that is read row by row and later written back
 *   r1 - pointer to a single 16-bit value
 *   r2 - byte offset added to r0 after every row access
 *
 * NOTE(review): the instructions that combine the replicated value with the
 * loaded rows are not visible here; presumably it is added to every pixel
 * with saturation - confirm against the full routine.
 */
71 function ff_h264_idct_dc_add_neon, export=1
/* Load one 16-bit element from r1 and replicate it into every lane of d2 and d3. */
72 vld1.16 {d2[],d3[]}, [r1,:16]
/* Rows 0 and 1 fill d0, rows 2 and 3 fill d1 (32 bits per row). */
74 vld1.32 {d0[0]}, [r0,:32], r2
75 vld1.32 {d0[1]}, [r0,:32], r2
77 vld1.32 {d1[0]}, [r0,:32], r2
78 vld1.32 {d1[1]}, [r0,:32], r2
/* Undo the four post-increments: r0 -= 4 * r2. */
82 sub r0, r0, r2, lsl #2
/* Write the four rows back in the same lane order they were read. */
83 vst1.32 {d0[0]}, [r0,:32], r2
84 vst1.32 {d0[1]}, [r0,:32], r2
85 vst1.32 {d1[0]}, [r0,:32], r2
86 vst1.32 {d1[1]}, [r0,:32], r2
/*
 * ff_h264_idct_add16_neon
 *
 * Only the callee selection is visible here: lr receives the address of one
 * of two routines depending on the current condition flags.
 * NOTE(review): the instruction that sets the flags, the loop around this
 * selection and the branch through lr are not visible - confirm them against
 * the full routine.
 */
90 function ff_h264_idct_add16_neon, export=1
/*
 * NE -> ff_h264_idct_dc_add_neon, EQ -> ff_h264_idct_add_neon.
 * This mapping is the reverse of the one in ff_h264_idct_add16intra_neon.
 * CONFIG_THUMB is added to the address; presumably it sets bit 0 so an
 * indirect branch stays in Thumb state - TODO confirm.
 */
110 adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
111 adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add16intra_neon
 *
 * Only the callee selection is visible here: lr receives the address of one
 * of two routines depending on the current condition flags.
 * NOTE(review): the instruction that sets the flags and the branch through
 * lr are not visible - confirm them against the full routine.
 */
119 function ff_h264_idct_add16intra_neon, export=1
/*
 * NE -> ff_h264_idct_add_neon, EQ -> ff_h264_idct_dc_add_neon.
 * This mapping is the reverse of the one in ff_h264_idct_add16_neon, so the
 * tested condition presumably differs between the two - TODO confirm.
 */
135 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
136 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add8_neon
 *
 * Visible here is the head of a loop (numeric local label 1) indexed by r12:
 *   r7 + r12      - byte table entry, loaded into r8
 *   r5 + r12 * 4  - 32-bit table entry, loaded into r0
 *   r3 + r12 * 32 - address placed in r1
 * NOTE(review): the setup of r3, r5, r7 and r12, the flag-setting compare
 * and the branch through lr are not visible - confirm against the full
 * routine.
 */
145 function ff_h264_idct_add8_neon, export=1
/* r8 = byte at r7[r12]. */
155 1: ldrb r8, [r7, r12]
/* r0 = word at r5[r12]. */
156 ldr r0, [r5, r12, lsl #2]
/*
 * r1 = r3 + 32 * r12. The 32-byte step equals the amount that
 * ff_h264_idct_add_neon loads from r1.
 * NOTE(review): r3 has to survive the indirect call; the visible parts of
 * both candidate routines only touch r0-r2 - confirm for the hidden parts.
 */
159 add r1, r3, r12, lsl #5
/* NE -> ff_h264_idct_add_neon, EQ -> ff_h264_idct_dc_add_neon. */
163 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
164 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * idct8x8_cols pass
 *
 * Macro taking one argument, pass. The visible body loads q14-q15 from r1
 * (post-incrementing r1 by 32 bytes) and then runs a series of 16-bit
 * per-lane adds, subtracts and arithmetic right shifts by one across
 * q0-q3 and q9-q15.
 * NOTE(review): how pass is used, where the macro ends, and the
 * instructions between the visible ones are not shown here - the comments
 * below describe single instructions only.
 */
177 .macro idct8x8_cols pass
/* Load 32 bytes into q14-q15 and advance r1. */
183 vld1.16 {q14-q15},[r1,:128]!
/* q14 = q10 >> 1 (signed, per 16-bit lane). */
197 vshr.s16 q14, q10, #1
204 vsub.i16 q14, q14, q2
/* Difference and sum of q13 and q11. */
211 vsub.i16 q0, q13, q11
214 vadd.i16 q3, q13, q11
/* Halve q11, q13 and q15 in place (signed shift). */
220 vshr.s16 q11, q11, #1
221 vshr.s16 q13, q13, #1
222 vshr.s16 q15, q15, #1
/* Sum/difference pairs: q12 with q1 and q14 with q0. */
241 vadd.i16 q10, q12, q1
243 vadd.i16 q11, q14, q0
244 vsub.i16 q13, q12, q1
246 vsub.i16 q12, q14, q0
/* Sum/difference pairs: q10 with q14 and q12 with q1. */
250 vadd.i16 q9, q10, q14
251 vsub.i16 q14, q10, q14
252 vadd.i16 q10, q12, q1
253 vsub.i16 q13, q12, q1
/*
 * ff_h264_idct8_add_neon
 *
 * Register use visible in this routine:
 *   r0 - byte pointer from which eight 8-byte rows are read
 *   r1 - pointer to 16-bit input; 96 bytes are loaded here into q8-q13
 *   r2 - byte offset added to r0 / r3 after every row access
 *   r3 - byte pointer to which eight 8-byte rows are written
 *
 * NOTE(review): the initialisation of r3 (presumably a copy of the incoming
 * r0), the loading of q14-q15, the transform itself and the narrowing of
 * the 16-bit sums into d0-d7 are not visible here - confirm against the
 * full routine.
 */
261 function ff_h264_idct8_add_neon, export=1
/* Load q8-q13 from r1, 32 bytes at a time, advancing r1. */
262 vld1.16 {q8-q9}, [r1,:128]!
263 vld1.16 {q10-q11},[r1,:128]!
264 vld1.16 {q12-q13},[r1,:128]!
/*
 * Row loads into d0-d7 are interleaved with rounding right shifts by 6 of
 * the 16-bit values in q10-q15.
 */
271 vld1.8 {d0}, [r0,:64], r2
273 vld1.8 {d1}, [r0,:64], r2
274 vrshr.s16 q10, q10, #6
275 vld1.8 {d2}, [r0,:64], r2
276 vrshr.s16 q11, q11, #6
277 vld1.8 {d3}, [r0,:64], r2
278 vrshr.s16 q12, q12, #6
279 vld1.8 {d4}, [r0,:64], r2
280 vrshr.s16 q13, q13, #6
281 vld1.8 {d5}, [r0,:64], r2
282 vrshr.s16 q14, q14, #6
283 vld1.8 {d6}, [r0,:64], r2
284 vrshr.s16 q15, q15, #6
285 vld1.8 {d7}, [r0,:64], r2
/*
 * Widening adds: each unsigned 8-bit row d2..d7 is added to the matching
 * 16-bit vector q10..q15. Row stores through r3 are interleaved with them.
 */
288 vaddw.u8 q10, q10, d2
290 vaddw.u8 q11, q11, d3
292 vaddw.u8 q12, q12, d4
294 vst1.8 {d0}, [r3,:64], r2
295 vaddw.u8 q13, q13, d5
297 vst1.8 {d1}, [r3,:64], r2
298 vaddw.u8 q14, q14, d6
300 vst1.8 {d2}, [r3,:64], r2
301 vaddw.u8 q15, q15, d7
303 vst1.8 {d3}, [r3,:64], r2
306 vst1.8 {d4}, [r3,:64], r2
307 vst1.8 {d5}, [r3,:64], r2
308 vst1.8 {d6}, [r3,:64], r2
309 vst1.8 {d7}, [r3,:64], r2
/*
 * ff_h264_idct8_dc_add_neon
 *
 * Register use visible in this routine:
 *   r0 - byte pointer; eight 8-byte rows are read, r0 is rewound, and eight
 *        rows are written to the same addresses
 *   r1 - pointer to a single 16-bit value
 *   r2 - byte offset added to r0 after every row access
 *
 * NOTE(review): the sums for rows 0 and 1 and the narrowing of the 16-bit
 * sums back into d0-d7 are not visible here - confirm against the full
 * routine.
 */
315 function ff_h264_idct8_dc_add_neon, export=1
/* Load one 16-bit element from r1 and replicate it into every lane of q15. */
316 vld1.16 {d30[],d31[]},[r1,:16]
317 vld1.32 {d0}, [r0,:64], r2
/* Rounding right shift by 6 of the replicated value. */
318 vrshr.s16 q15, q15, #6
319 vld1.32 {d1}, [r0,:64], r2
320 vld1.32 {d2}, [r0,:64], r2
322 vld1.32 {d3}, [r0,:64], r2
324 vld1.32 {d4}, [r0,:64], r2
/* Widening adds: q10..q14 = q15 + unsigned 8-bit rows d2..d6. */
325 vaddw.u8 q10, q15, d2
326 vld1.32 {d5}, [r0,:64], r2
327 vaddw.u8 q11, q15, d3
328 vld1.32 {d6}, [r0,:64], r2
329 vaddw.u8 q12, q15, d4
330 vld1.32 {d7}, [r0,:64], r2
331 vaddw.u8 q13, q15, d5
332 vaddw.u8 q14, q15, d6
/* Last use of the replicated value: q15 is overwritten with the row-7 sum. */
333 vaddw.u8 q15, q15, d7
/* Undo the eight post-increments: r0 -= 8 * r2. */
338 sub r0, r0, r2, lsl #3
/* Write the eight rows back from d0-d7. */
339 vst1.32 {d0}, [r0,:64], r2
341 vst1.32 {d1}, [r0,:64], r2
343 vst1.32 {d2}, [r0,:64], r2
345 vst1.32 {d3}, [r0,:64], r2
347 vst1.32 {d4}, [r0,:64], r2
348 vst1.32 {d5}, [r0,:64], r2
349 vst1.32 {d6}, [r0,:64], r2
350 vst1.32 {d7}, [r0,:64], r2
/*
 * ff_h264_idct8_add4_neon
 *
 * Only the callee selection is visible here: lr receives the address of one
 * of the two 8x8 routines depending on the current condition flags.
 * NOTE(review): the instruction that sets the flags, the loop around this
 * selection and the branch through lr are not visible - confirm them against
 * the full routine.
 */
354 function ff_h264_idct8_add4_neon, export=1
/* NE -> ff_h264_idct8_dc_add_neon, EQ -> ff_h264_idct8_add_neon. */
374 adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
375 adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
/*
 * Byte table, four entries per line. Every entry is written as x + y*8 with
 * x in 4..7; the 48 entries fall into three groups of 16 with y in 1..4,
 * 6..9 and 11..14.
 * NOTE(review): the label that names this table is not visible here;
 * presumably each entry is a position in an 8-column grid that the
 * add16/add8 loops use as an index - confirm against the loops that read it.
 */
384 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
385 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
386 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
387 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
388 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
389 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
390 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
391 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
392 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
393 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
394 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
395 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8