2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* ff_h264_idct_add_neon
 * Reads 32 bytes of data from [r1], reads four 32-bit rows from [r0]
 * (stepping by r2 after each row), and writes four 32-bit rows back to
 * the same addresses.
 * NOTE(review): judging by the name, r0 is presumably the destination
 * pixel pointer, r1 the 4x4 coefficient block and r2 the line stride;
 * confirm against the C prototype. */
26 function ff_h264_idct_add_neon, export=1
/* Load d0-d3 (32 bytes) from r1; :128 asserts 16-byte alignment. */
27 vld1.64 {d0-d3}, [r1,:128]
/* Fetch four 32-bit rows from r0, post-incrementing r0 by r2 each time.
 * Rows land in lanes d18[0], d19[1], d18[1], d19[0], in that order. */
44 vld1.32 {d18[0]}, [r0,:32], r2
47 vld1.32 {d19[1]}, [r0,:32], r2
49 vld1.32 {d18[1]}, [r0,:32], r2
51 vld1.32 {d19[0]}, [r0,:32], r2
/* Rewind r0 by 4 * r2 so the stores below start at the first row again. */
53 sub r0, r0, r2, lsl #2
/* Store four 32-bit rows, using the same lane order as the loads above
 * (reg0[0], reg1[1], reg0[1], reg1[0]). */
66 vst1.32 {d0[0]}, [r0,:32], r2
67 vst1.32 {d1[1]}, [r0,:32], r2
68 vst1.32 {d0[1]}, [r0,:32], r2
69 vst1.32 {d1[0]}, [r0,:32], r2
/* ff_h264_idct_dc_add_neon
 * Reads a single 16-bit value from [r1], reads four 32-bit rows from [r0]
 * (stepping by r2), and writes four 32-bit rows back to the same addresses.
 * NOTE(review): presumably r0 = destination pixels, r1 = coefficient block
 * (only its first element is read), r2 = line stride; confirm against the
 * C prototype. */
74 function ff_h264_idct_dc_add_neon, export=1
/* Load one 16-bit element and replicate it into every lane of d2 and d3. */
75 vld1.16 {d2[],d3[]}, [r1,:16]
/* Rows 0 and 1 go to d0, rows 2 and 3 to d1; r0 advances by r2 per row. */
77 vld1.32 {d0[0]}, [r0,:32], r2
78 vld1.32 {d0[1]}, [r0,:32], r2
80 vld1.32 {d1[0]}, [r0,:32], r2
81 vld1.32 {d1[1]}, [r0,:32], r2
/* Rewind r0 by 4 * r2 back to the first row. */
85 sub r0, r0, r2, lsl #2
/* Write the four rows back from the same lanes they were loaded into. */
86 vst1.32 {d0[0]}, [r0,:32], r2
87 vst1.32 {d0[1]}, [r0,:32], r2
88 vst1.32 {d1[0]}, [r0,:32], r2
89 vst1.32 {d1[1]}, [r0,:32], r2
/* ff_h264_idct_add16_neon
 * Selects one of the two 4x4 routines by the current condition flags:
 * NE picks ff_h264_idct_dc_add_neon, EQ picks ff_h264_idct_add_neon.
 * The selected address is placed in lr.
 * NOTE(review): presumably lr is then used as an indirect call target;
 * confirm. Note the NE/EQ mapping is the opposite of the one used in
 * ff_h264_idct_add16intra_neon and ff_h264_idct_add8_neon. */
93 function ff_h264_idct_add16_neon, export=1
111 adrne lr, ff_h264_idct_dc_add_neon
112 adreq lr, ff_h264_idct_add_neon
/* ff_h264_idct_add16intra_neon
 * Selects one of the two 4x4 routines by the current condition flags:
 * NE picks ff_h264_idct_add_neon, EQ picks ff_h264_idct_dc_add_neon.
 * The selected address is placed in lr.
 * NOTE(review): presumably lr is then used as an indirect call target;
 * confirm. */
120 function ff_h264_idct_add16intra_neon, export=1
135 adrne lr, ff_h264_idct_add_neon
136 adreq lr, ff_h264_idct_dc_add_neon
/* ff_h264_idct_add8_neon
 * Loop body indexed by r12: fetches a per-block byte and word, computes
 * the per-block data pointer, then selects a 4x4 routine into lr.
 * NOTE(review): r7 presumably points into the scan8 table, r5 at a table
 * of 32-bit block offsets and r3 at the coefficient data; confirm against
 * the setup code and the C prototype. */
145 function ff_h264_idct_add8_neon, export=1
/* r8 = byte at r7 + r12. */
155 1: ldrb r8, [r7, r12]
/* r0 = 32-bit word at r5 + r12 * 4. */
156 ldr r0, [r5, r12, lsl #2]
/* r1 = r3 + r12 * 32, i.e. 32 bytes of data per index step. */
159 add r1, r3, r12, lsl #5
/* NE picks the full 4x4 routine, EQ picks the DC-only routine. */
162 adrne lr, ff_h264_idct_add_neon
163 adreq lr, ff_h264_idct_dc_add_neon
/* idct8x8_cols pass
 * Macro operating on q registers as vectors of eight signed 16-bit lanes.
 * It takes one argument, pass; the instructions below do not reference it.
 * NOTE(review): going by the name, this is one pass of the 8x8 inverse
 * transform; confirm against the full macro body. */
175 .macro idct8x8_cols pass
/* Load 32 bytes into q14-q15 from r1 (16-byte aligned); the trailing "!"
 * writes the advanced address back to r1. */
181 vld1.16 {q14-q15},[r1,:128]!
/* q14 = q10 >> 1 (arithmetic shift per lane), then q14 -= q2. */
195 vshr.s16 q14, q10, #1
202 vsub.i16 q14, q14, q2
/* Difference and sum of q13 and q11. */
209 vsub.i16 q0, q13, q11
212 vadd.i16 q3, q13, q11
/* Halve q11, q13 and q15 in place (arithmetic shift right by one). */
218 vshr.s16 q11, q11, #1
219 vshr.s16 q13, q13, #1
220 vshr.s16 q15, q15, #1
/* Sum/difference pairs. q13 = q12 - q1 reads q12 before the following
 * instruction overwrites q12 with q14 - q0. */
239 vadd.i16 q10, q12, q1
241 vadd.i16 q11, q14, q0
242 vsub.i16 q13, q12, q1
244 vsub.i16 q12, q14, q0
/* q9 = q10 + q14 is computed first because the next instruction
 * overwrites q14 with q10 - q14. */
248 vadd.i16 q9, q10, q14
249 vsub.i16 q14, q10, q14
250 vadd.i16 q10, q12, q1
251 vsub.i16 q13, q12, q1
/* ff_h264_idct8_add_neon
 * Loads 96 bytes of 16-bit data from [r1] into q8-q13, loads eight 8-byte
 * rows from [r0] (stepping by r2), applies a rounding shift right by 6 to
 * q10-q15, widen-adds the loaded rows onto q10-q15, and stores eight
 * 8-byte rows through r3 (stepping by r2).
 * NOTE(review): presumably r0 = destination pixels, r1 = 8x8 coefficient
 * block, r2 = line stride, and r3 holds a copy of the original r0 so the
 * stores hit the rows that were loaded; confirm. */
259 function ff_h264_idct8_add_neon, export=1
/* Three 32-byte loads; each "!" advances r1 past the data just read. */
260 vld1.16 {q8-q9}, [r1,:128]!
261 vld1.16 {q10-q11},[r1,:128]!
262 vld1.16 {q12-q13},[r1,:128]!
/* Row loads (8 bytes each into d0..d7, 8-byte aligned) are interleaved
 * with the rounding shifts; vrshr.s16 #6 computes (x + 32) >> 6 per lane. */
269 vld1.8 {d0}, [r0,:64], r2
271 vld1.8 {d1}, [r0,:64], r2
272 vrshr.s16 q10, q10, #6
273 vld1.8 {d2}, [r0,:64], r2
274 vrshr.s16 q11, q11, #6
275 vld1.8 {d3}, [r0,:64], r2
276 vrshr.s16 q12, q12, #6
277 vld1.8 {d4}, [r0,:64], r2
278 vrshr.s16 q13, q13, #6
279 vld1.8 {d5}, [r0,:64], r2
280 vrshr.s16 q14, q14, #6
281 vld1.8 {d6}, [r0,:64], r2
282 vrshr.s16 q15, q15, #6
283 vld1.8 {d7}, [r0,:64], r2
/* vaddw.u8 zero-extends the eight bytes of dN to 16 bits and adds them to
 * the matching q register. Stores are interleaved with the remaining adds. */
286 vaddw.u8 q10, q10, d2
288 vaddw.u8 q11, q11, d3
290 vaddw.u8 q12, q12, d4
292 vst1.8 {d0}, [r3,:64], r2
293 vaddw.u8 q13, q13, d5
295 vst1.8 {d1}, [r3,:64], r2
296 vaddw.u8 q14, q14, d6
298 vst1.8 {d2}, [r3,:64], r2
299 vaddw.u8 q15, q15, d7
301 vst1.8 {d3}, [r3,:64], r2
304 vst1.8 {d4}, [r3,:64], r2
305 vst1.8 {d5}, [r3,:64], r2
306 vst1.8 {d6}, [r3,:64], r2
307 vst1.8 {d7}, [r3,:64], r2
/* ff_h264_idct8_dc_add_neon
 * Reads a single 16-bit value from [r1], rounds it down by 6 bits, loads
 * eight 8-byte rows from [r0] (stepping by r2), widen-adds rows onto the
 * replicated value, then rewinds r0 and stores eight 8-byte rows.
 * NOTE(review): presumably r0 = destination pixels, r1 = coefficient block
 * (only its first element is read), r2 = line stride; confirm against the
 * C prototype. */
313 function ff_h264_idct8_dc_add_neon, export=1
/* Load one 16-bit element and replicate it across all eight lanes of q15. */
314 vld1.16 {d30[],d31[]},[r1,:16]
315 vld1.32 {d0}, [r0,:64], r2
/* q15 = (q15 + 32) >> 6 per lane (rounding shift). */
316 vrshr.s16 q15, q15, #6
317 vld1.32 {d1}, [r0,:64], r2
318 vld1.32 {d2}, [r0,:64], r2
320 vld1.32 {d3}, [r0,:64], r2
322 vld1.32 {d4}, [r0,:64], r2
/* Each vaddw.u8 zero-extends one row to 16 bits and adds q15 to it. */
323 vaddw.u8 q10, q15, d2
324 vld1.32 {d5}, [r0,:64], r2
325 vaddw.u8 q11, q15, d3
326 vld1.32 {d6}, [r0,:64], r2
327 vaddw.u8 q12, q15, d4
328 vld1.32 {d7}, [r0,:64], r2
329 vaddw.u8 q13, q15, d5
330 vaddw.u8 q14, q15, d6
/* q15 is overwritten last because every add above reads it as a source. */
331 vaddw.u8 q15, q15, d7
/* Rewind r0 by 8 * r2 back to the first row. */
336 sub r0, r0, r2, lsl #3
337 vst1.32 {d0}, [r0,:64], r2
339 vst1.32 {d1}, [r0,:64], r2
341 vst1.32 {d2}, [r0,:64], r2
343 vst1.32 {d3}, [r0,:64], r2
345 vst1.32 {d4}, [r0,:64], r2
346 vst1.32 {d5}, [r0,:64], r2
347 vst1.32 {d6}, [r0,:64], r2
348 vst1.32 {d7}, [r0,:64], r2
/* ff_h264_idct8_add4_neon
 * Selects one of the two 8x8 routines by the current condition flags:
 * NE picks ff_h264_idct8_dc_add_neon, EQ picks ff_h264_idct8_add_neon.
 * The selected address is placed in lr.
 * NOTE(review): presumably lr is then used as an indirect call target;
 * confirm. */
352 function ff_h264_idct8_add4_neon, export=1
370 adrne lr, ff_h264_idct8_dc_add_neon
371 adreq lr, ff_h264_idct8_add_neon
/* scan8: 48-entry byte table. Every entry has the form column + row * 8
 * with column in 4..7. Each run of four entries walks a 2x2 cell in the
 * order (c, r), (c+1, r), (c, r+1), (c+1, r+1).
 * NOTE(review): presumably these are positions in an 8-entry-wide cache
 * indexed by block number; confirm against the C-side scan8 table. */
/* Entries 0-15: rows 1-4. */
380 scan8: .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
381 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
382 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
383 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
/* Entries 16-31: rows 6-9. */
384 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
385 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
386 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
387 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
/* Entries 32-47: rows 11-14. */
388 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
389 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
390 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
391 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8