2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * ff_h264_idct_add_neon
 *
 * Judging by the name, this adds a 4x4 H.264 inverse-transform result to a
 * block of pixels -- TODO confirm against the C prototype.
 *
 * Register use in the visible instructions:
 *   r0  address of the 32-bit rows that are read and later written back
 *   r1  16-byte aligned address of the 32 input bytes (presumably the
 *       coefficient block -- confirm)
 *   r2  byte step added to r0 after each row access (presumably the stride)
 *
 * NOTE(review): only part of the routine is visible here; the arithmetic
 * between the loads and the stores is not shown.
 */
26 function ff_h264_idct_add_neon, export=1
/* Load 32 bytes from [r1] into d0-d3; :128 is a 16-byte alignment hint. */
27 vld1.64 {d0-d3}, [r1,:128]
/*
 * Read four 32-bit rows from [r0], post-incrementing r0 by r2 each time.
 * The rows are placed in the lane order d18[0], d19[1], d18[1], d19[0],
 * i.e. not sequentially.
 */
44 vld1.32 {d18[0]}, [r0,:32], r2
47 vld1.32 {d19[1]}, [r0,:32], r2
49 vld1.32 {d18[1]}, [r0,:32], r2
51 vld1.32 {d19[0]}, [r0,:32], r2
/* r0 -= 4 * r2: undo the four post-increments so the stores hit the same rows. */
53 sub r0, r0, r2, lsl #2
/*
 * Write four 32-bit rows to [r0], again stepping by r2.  The lane order
 * d0[0], d1[1], d0[1], d1[0] mirrors the interleave used by the loads.
 */
66 vst1.32 {d0[0]}, [r0,:32], r2
67 vst1.32 {d1[1]}, [r0,:32], r2
68 vst1.32 {d0[1]}, [r0,:32], r2
69 vst1.32 {d1[0]}, [r0,:32], r2
/*
 * ff_h264_idct_dc_add_neon
 *
 * Judging by the name, the DC-only variant of the 4x4 IDCT-and-add --
 * TODO confirm against the C prototype.
 *
 * Register use in the visible instructions:
 *   r0  address of the 32-bit rows that are read and later written back
 *   r1  address of a single 16-bit value (presumably the DC coefficient)
 *   r2  byte step added to r0 after each row access (presumably the stride)
 *
 * NOTE(review): the instructions that combine q1 with d0/d1 are not visible
 * in this excerpt.
 */
74 function ff_h264_idct_dc_add_neon, export=1
/* Load one 16-bit element from [r1] and replicate it into every lane of d2 and d3 (q1). */
75 vld1.16 {d2[],d3[]}, [r1,:16]
/* Read four 32-bit rows from [r0] into d0[0], d0[1], d1[0], d1[1]; r0 += r2 after each. */
77 vld1.32 {d0[0]}, [r0,:32], r2
78 vld1.32 {d0[1]}, [r0,:32], r2
80 vld1.32 {d1[0]}, [r0,:32], r2
81 vld1.32 {d1[1]}, [r0,:32], r2
/* r0 -= 4 * r2: return to the first row before storing. */
85 sub r0, r0, r2, lsl #2
/* Write the four rows back in the same lane order they were loaded. */
86 vst1.32 {d0[0]}, [r0,:32], r2
87 vst1.32 {d0[1]}, [r0,:32], r2
88 vst1.32 {d1[0]}, [r0,:32], r2
89 vst1.32 {d1[1]}, [r0,:32], r2
/*
 * ff_h264_idct_add16_neon
 *
 * NOTE(review): only the target selection is visible in this excerpt; the
 * setup, the flag-setting compare and the branch through lr are not shown.
 */
93 function ff_h264_idct_add16_neon, export=1
/*
 * Put the address of one of the two 4x4 routines in lr, chosen by the Z flag:
 *   NE -> ff_h264_idct_dc_add_neon
 *   EQ -> ff_h264_idct_add_neon
 * CONFIG_THUMB is defined outside this excerpt; presumably it is 1 in Thumb
 * builds so the address carries the Thumb bit -- confirm.
 */
113 adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
114 adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add16intra_neon
 *
 * NOTE(review): only the target selection is visible in this excerpt; the
 * setup, the flag-setting compare and the branch through lr are not shown.
 */
122 function ff_h264_idct_add16intra_neon, export=1
/*
 * Put the address of one of the two 4x4 routines in lr, chosen by the Z flag:
 *   NE -> ff_h264_idct_add_neon
 *   EQ -> ff_h264_idct_dc_add_neon
 * This is the opposite mapping to the one in ff_h264_idct_add16_neon, so the
 * condition tested before this point presumably has a different meaning.
 * CONFIG_THUMB is defined outside this excerpt (presumably the Thumb bit).
 */
138 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
139 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add8_neon
 *
 * NOTE(review): the prologue, the loop control and the branch through lr
 * are not visible in this excerpt, so the contents of r3, r5 and r7 can only
 * be inferred from how they are indexed below.
 */
148 function ff_h264_idct_add8_neon, export=1
/* Local label 1 (presumably the loop head): r8 = byte at [r7 + r12], r12 being the index. */
158 1: ldrb r8, [r7, r12]
/* r0 = 32-bit word at [r5 + r12 * 4], i.e. entry r12 of a table of words. */
159 ldr r0, [r5, r12, lsl #2]
/* r1 = r3 + r12 * 32: entry r12 of an array with 32-byte elements. */
162 add r1, r3, r12, lsl #5
/*
 * Put the address of one of the two 4x4 routines in lr, chosen by the Z flag:
 *   NE -> ff_h264_idct_add_neon
 *   EQ -> ff_h264_idct_dc_add_neon
 * CONFIG_THUMB is defined outside this excerpt (presumably the Thumb bit).
 */
166 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
167 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * idct8x8_cols pass
 *
 * Macro taking one argument, `pass`.  All arithmetic visible here works on
 * signed 16-bit lanes of q registers.
 *
 * NOTE(review): the excerpt shows only some of the macro's instructions and
 * none of the conditional directives that use `pass`.  Presumably this is one
 * pass of an 8x8 inverse transform -- confirm against the complete file.
 */
180 .macro idct8x8_cols pass
/* Load 32 bytes from [r1] into q14-q15; the `!` advances r1 past them. */
186 vld1.16 {q14-q15},[r1,:128]!
/* q14 = q10 >> 1 (arithmetic shift), then q14 -= q2. */
200 vshr.s16 q14, q10, #1
207 vsub.i16 q14, q14, q2
/* Difference and sum of the same pair: q0 = q13 - q11, q3 = q13 + q11. */
214 vsub.i16 q0, q13, q11
217 vadd.i16 q3, q13, q11
/* Halve q11, q13 and q15 in place (arithmetic shift right by one). */
223 vshr.s16 q11, q11, #1
224 vshr.s16 q13, q13, #1
225 vshr.s16 q15, q15, #1
/*
 * Sum/difference pairs: q10 = q12 + q1, q13 = q12 - q1, q11 = q14 + q0 and
 * q12 = q14 - q0.  q12 is written last because the two instructions that
 * use q12 as an input come before it.
 */
244 vadd.i16 q10, q12, q1
246 vadd.i16 q11, q14, q0
247 vsub.i16 q13, q12, q1
249 vsub.i16 q12, q14, q0
/*
 * q9 = q10 + q14 and q14 = q10 - q14, then q10 = q12 + q1 and q13 = q12 - q1.
 * q10 and q14 are only overwritten after their old values have been read.
 * The last two instructions repeat ones from the group above; presumably the
 * two groups sit in different conditional branches of the macro -- confirm.
 */
253 vadd.i16 q9, q10, q14
254 vsub.i16 q14, q10, q14
255 vadd.i16 q10, q12, q1
256 vsub.i16 q13, q12, q1
/*
 * ff_h264_idct8_add_neon
 *
 * Judging by the name, this adds an 8x8 H.264 inverse-transform result to a
 * block of pixels -- TODO confirm against the C prototype.
 *
 * Register use in the visible instructions:
 *   r0  address of the 8-byte rows that are read (post-incremented by r2)
 *   r1  16-byte aligned address of the 16-bit input (presumably coefficients)
 *   r2  byte step between rows (presumably the stride)
 *   r3  address the rows are written to; it is not set in the visible lines,
 *       presumably a copy of the initial r0 -- confirm
 *
 * NOTE(review): the transform itself, the handling of q8/q9 and the
 * instructions that refill d0-d7 before the stores are not visible here.
 */
264 function ff_h264_idct8_add_neon, export=1
/* Load 3 x 32 bytes from [r1] into q8-q13; each `!` advances r1 by 32. */
265 vld1.16 {q8-q9}, [r1,:128]!
266 vld1.16 {q10-q11},[r1,:128]!
267 vld1.16 {q12-q13},[r1,:128]!
/*
 * Two things are interleaved below (presumably to overlap memory access with
 * arithmetic):
 *   - eight 8-byte rows are read from [r0] into d0..d7, r0 += r2 after each;
 *   - q10..q15 are each shifted right by 6 with rounding (vrshr.s16).
 */
274 vld1.8 {d0}, [r0,:64], r2
276 vld1.8 {d1}, [r0,:64], r2
277 vrshr.s16 q10, q10, #6
278 vld1.8 {d2}, [r0,:64], r2
279 vrshr.s16 q11, q11, #6
280 vld1.8 {d3}, [r0,:64], r2
281 vrshr.s16 q12, q12, #6
282 vld1.8 {d4}, [r0,:64], r2
283 vrshr.s16 q13, q13, #6
284 vld1.8 {d5}, [r0,:64], r2
285 vrshr.s16 q14, q14, #6
286 vld1.8 {d6}, [r0,:64], r2
287 vrshr.s16 q15, q15, #6
288 vld1.8 {d7}, [r0,:64], r2
/*
 * vaddw.u8 widens the eight unsigned bytes of dN to 16 bits and adds them to
 * the matching q register (q10 += d2 ... q15 += d7).  Interleaved with that,
 * the 8-byte rows d0..d7 are written to [r3], r3 += r2 after each.
 */
291 vaddw.u8 q10, q10, d2
293 vaddw.u8 q11, q11, d3
295 vaddw.u8 q12, q12, d4
297 vst1.8 {d0}, [r3,:64], r2
298 vaddw.u8 q13, q13, d5
300 vst1.8 {d1}, [r3,:64], r2
301 vaddw.u8 q14, q14, d6
303 vst1.8 {d2}, [r3,:64], r2
304 vaddw.u8 q15, q15, d7
306 vst1.8 {d3}, [r3,:64], r2
309 vst1.8 {d4}, [r3,:64], r2
310 vst1.8 {d5}, [r3,:64], r2
311 vst1.8 {d6}, [r3,:64], r2
312 vst1.8 {d7}, [r3,:64], r2
/*
 * ff_h264_idct8_dc_add_neon
 *
 * Judging by the name, the DC-only variant of the 8x8 IDCT-and-add --
 * TODO confirm against the C prototype.
 *
 * Register use in the visible instructions:
 *   r0  address of the 8-byte rows that are read and later written back
 *   r1  address of a single 16-bit value (presumably the DC coefficient)
 *   r2  byte step added to r0 after each row access (presumably the stride)
 *
 * NOTE(review): the sums for d0/d1 and the instructions that narrow the
 * 16-bit sums back into d0-d7 are not visible in this excerpt.
 */
318 function ff_h264_idct8_dc_add_neon, export=1
/* Load one 16-bit element from [r1] and replicate it into every lane of q15. */
319 vld1.16 {d30[],d31[]},[r1,:16]
/*
 * Read eight 8-byte rows from [r0] into d0..d7 (r0 += r2 after each).
 * Interleaved with the loads:
 *   - q15 is shifted right by 6 with rounding;
 *   - each row dN is widened from unsigned bytes to 16 bits and added to q15,
 *     giving q10 = q15 + d2 ... q14 = q15 + d6.
 */
320 vld1.32 {d0}, [r0,:64], r2
321 vrshr.s16 q15, q15, #6
322 vld1.32 {d1}, [r0,:64], r2
323 vld1.32 {d2}, [r0,:64], r2
325 vld1.32 {d3}, [r0,:64], r2
327 vld1.32 {d4}, [r0,:64], r2
328 vaddw.u8 q10, q15, d2
329 vld1.32 {d5}, [r0,:64], r2
330 vaddw.u8 q11, q15, d3
331 vld1.32 {d6}, [r0,:64], r2
332 vaddw.u8 q12, q15, d4
333 vld1.32 {d7}, [r0,:64], r2
334 vaddw.u8 q13, q15, d5
335 vaddw.u8 q14, q15, d6
/* Must be the last sum: it overwrites q15, which every sum above reads. */
336 vaddw.u8 q15, q15, d7
/* r0 -= 8 * r2: return to the first row before storing. */
341 sub r0, r0, r2, lsl #3
/* Write the eight 8-byte rows d0..d7 back to [r0], stepping by r2. */
342 vst1.32 {d0}, [r0,:64], r2
344 vst1.32 {d1}, [r0,:64], r2
346 vst1.32 {d2}, [r0,:64], r2
348 vst1.32 {d3}, [r0,:64], r2
350 vst1.32 {d4}, [r0,:64], r2
351 vst1.32 {d5}, [r0,:64], r2
352 vst1.32 {d6}, [r0,:64], r2
353 vst1.32 {d7}, [r0,:64], r2
/*
 * ff_h264_idct8_add4_neon
 *
 * NOTE(review): only the target selection is visible in this excerpt; the
 * setup, the flag-setting compare and the branch through lr are not shown.
 */
357 function ff_h264_idct8_add4_neon, export=1
/*
 * Put the address of one of the two 8x8 routines in lr, chosen by the Z flag:
 *   NE -> ff_h264_idct8_dc_add_neon
 *   EQ -> ff_h264_idct8_add_neon
 * CONFIG_THUMB is defined outside this excerpt; presumably it is 1 in Thumb
 * builds so the address carries the Thumb bit -- confirm.
 */
377 adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
378 adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
/*
 * Table of 48 byte constants; its label is not visible in this excerpt.
 *
 * Every entry is written as  col + row*8  with col in 4..7.  Each source line
 * lists one 2x2 group (two adjacent columns on two adjacent rows), and every
 * four lines together cover a 4x4 region.
 * Presumably these are offsets into an array with 8 entries per row --
 * confirm against the code that indexes this table.
 */
/* Rows 1-4. */
387 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
388 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
389 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
390 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
/* Rows 6-9. */
391 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
392 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
393 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
394 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
/* Rows 11-14. */
395 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
396 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
397 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
398 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8