2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
/*
 * ff_h264_idct_add_neon: exported routine. The visible instructions read a
 * 32-byte block from [r1], read four 32-bit words from [r0] spaced r2 bytes
 * apart, rewind r0, and write four 32-bit words back with the same spacing.
 * NOTE(review): presumably r0 = destination pixels, r1 = 4x4 block of 16-bit
 * coefficients, r2 = line stride -- confirm against the C prototype.
 * NOTE(review): the arithmetic that combines d0-d3 with d18/d19, and the
 * return/endfunc, are not visible in this excerpt.
 */
25 function ff_h264_idct_add_neon, export=1
/* Load d0-d3 (32 bytes) from [r1]; :128 asserts 16-byte alignment. */
26 vld1.64 {d0-d3}, [r1,:128]
/*
 * Gather one 32-bit word per row from [r0], post-incrementing r0 by r2 after
 * each load. Successive rows land in lanes d18[0], d19[1], d18[1], d19[0].
 */
43 vld1.32 {d18[0]}, [r0,:32], r2
46 vld1.32 {d19[1]}, [r0,:32], r2
48 vld1.32 {d18[1]}, [r0,:32], r2
50 vld1.32 {d19[0]}, [r0,:32], r2
/* r0 -= 4*r2: undo the four post-increments so the stores revisit the rows. */
52 sub r0, r0, r2, lsl #2
/*
 * Scatter four 32-bit words, one per row, in the same lane order as the
 * loads (d0 where the loads used d18, d1 where they used d19).
 */
65 vst1.32 {d0[0]}, [r0,:32], r2
66 vst1.32 {d1[1]}, [r0,:32], r2
67 vst1.32 {d0[1]}, [r0,:32], r2
68 vst1.32 {d1[0]}, [r0,:32], r2
/*
 * ff_h264_idct_dc_add_neon: exported routine. The visible instructions
 * broadcast a single 16-bit value from [r1], read four 32-bit words from [r0]
 * spaced r2 bytes apart into d0/d1, rewind r0, and store d0/d1 back to the
 * same four locations.
 * NOTE(review): presumably r0 = destination pixels, r1 = coefficient block
 * whose first element is the DC term, r2 = line stride -- confirm.
 * NOTE(review): the instructions that combine q1 with d0/d1, and the
 * return/endfunc, are not visible in this excerpt.
 */
73 function ff_h264_idct_dc_add_neon, export=1
/* Load one halfword from [r1] and replicate it to every 16-bit lane of d2 and d3. */
74 vld1.16 {d2[],d3[]}, [r1,:16]
/* Rows 0..3: one 32-bit word each into d0[0], d0[1], d1[0], d1[1]; r0 += r2 per load. */
76 vld1.32 {d0[0]}, [r0,:32], r2
77 vld1.32 {d0[1]}, [r0,:32], r2
79 vld1.32 {d1[0]}, [r0,:32], r2
80 vld1.32 {d1[1]}, [r0,:32], r2
/* r0 -= 4*r2: back to the first row for the write-back. */
84 sub r0, r0, r2, lsl #2
/* Store the four words in the same lane order they were loaded. */
85 vst1.32 {d0[0]}, [r0,:32], r2
86 vst1.32 {d0[1]}, [r0,:32], r2
87 vst1.32 {d1[0]}, [r0,:32], r2
88 vst1.32 {d1[1]}, [r0,:32], r2
/*
 * ff_h264_idct_add16_neon: exported routine. Only its routine-selection pair
 * is visible in this excerpt; the instruction that sets the condition flags
 * and the branch that consumes lr are not shown.
 */
92 function ff_h264_idct_add16_neon, export=1
/*
 * Pick the routine address into lr from the current flags:
 *   NE -> ff_h264_idct_dc_add_neon, EQ -> ff_h264_idct_add_neon.
 * This polarity is the opposite of the selection pairs in
 * ff_h264_idct_add16intra_neon and ff_h264_idct_add8_neon.
 * NOTE(review): "+ CONFIG_THUMB" presumably sets bit 0 of the address in
 * Thumb builds so an interworking branch stays in Thumb state -- confirm.
 */
112 adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
113 adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add16intra_neon: exported routine. Only its routine-selection
 * pair is visible in this excerpt; the instruction that sets the condition
 * flags and the branch that consumes lr are not shown.
 */
121 function ff_h264_idct_add16intra_neon, export=1
/*
 * Pick the routine address into lr from the current flags:
 *   NE -> ff_h264_idct_add_neon, EQ -> ff_h264_idct_dc_add_neon.
 * NOTE(review): "+ CONFIG_THUMB" presumably sets bit 0 of the address in
 * Thumb builds so an interworking branch stays in Thumb state -- confirm.
 */
137 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
138 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * ff_h264_idct_add8_neon: exported routine. The visible fragment is the head
 * of a loop (local label 1) indexed by r12, followed by a routine-selection
 * pair. Register setup, the flag-setting compare, the call through lr and the
 * loop tail are not visible in this excerpt.
 */
147 function ff_h264_idct_add8_neon, export=1
/* r8 = byte at r7[r12]. NOTE(review): presumably a scan-order table lookup -- confirm. */
157 1: ldrb r8, [r7, r12]
/* r0 = 32-bit word at r5 + r12*4 (r12 indexes an array of words). */
158 ldr r0, [r5, r12, lsl #2]
/*
 * r1 = r3 + r12*32, i.e. entry r12 of an array with a 32-byte element size.
 * NOTE(review): 32 bytes presumably equals one 4x4 block of 16-bit
 * coefficients -- confirm.
 */
161 add r1, r3, r12, lsl #5
/*
 * Pick the routine address into lr from the current flags:
 *   NE -> ff_h264_idct_add_neon, EQ -> ff_h264_idct_dc_add_neon.
 * NOTE(review): "+ CONFIG_THUMB" presumably sets bit 0 of the address in
 * Thumb builds so an interworking branch stays in Thumb state -- confirm.
 */
165 adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
166 adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
/*
 * idct8x8_cols: macro taking a single argument, pass. The visible body is
 * per-lane 16-bit arithmetic on q registers (add, subtract, arithmetic shift
 * right by one) plus one load that advances r1.
 * NOTE(review): presumably one 1-D pass of the 8x8 inverse transform --
 * confirm. How the pass argument is used, the instructions between the ones
 * shown, and the closing .endm are not visible in this excerpt, so the
 * comments below describe each instruction on its own rather than a data flow.
 */
179 .macro idct8x8_cols pass
/* Load q14-q15 (32 bytes) from [r1], 16-byte aligned; '!' advances r1 past them. */
185 vld1.16 {q14-q15},[r1,:128]!
/* q14 = q10 >> 1 (signed, per 16-bit lane). */
199 vshr.s16 q14, q10, #1
/* q14 = q14 - q2 */
206 vsub.i16 q14, q14, q2
/* Difference and sum of the same pair: q0 = q13 - q11, q3 = q13 + q11. */
213 vsub.i16 q0, q13, q11
216 vadd.i16 q3, q13, q11
/* Halve q11, q13 and q15 in place (signed shift right by one). */
222 vshr.s16 q11, q11, #1
223 vshr.s16 q13, q13, #1
224 vshr.s16 q15, q15, #1
/* Sum/difference pairs: q10 = q12 + q1, q13 = q12 - q1; q11 = q14 + q0, q12 = q14 - q0. */
243 vadd.i16 q10, q12, q1
245 vadd.i16 q11, q14, q0
246 vsub.i16 q13, q12, q1
/* q12 is overwritten only here, after both instructions that read it above. */
248 vsub.i16 q12, q14, q0
/* q9 = q10 + q14, then q14 = q10 - q14 (the sum is taken before q14 is replaced). */
252 vadd.i16 q9, q10, q14
253 vsub.i16 q14, q10, q14
/* q10 = q12 + q1, q13 = q12 - q1 */
254 vadd.i16 q10, q12, q1
255 vsub.i16 q13, q12, q1
/*
 * ff_h264_idct8_add_neon: exported routine. The visible instructions load
 * 96 bytes of 16-bit data from [r1] into q8-q13, load eight 8-byte rows from
 * [r0] (spaced r2 bytes apart) into d0-d7, apply a rounding shift right by 6
 * to q10-q15, add the rows (zero-extended to 16 bits) into q10-q15, and store
 * d0-d7 as eight 8-byte rows through r3.
 * NOTE(review): presumably r0 = destination pixels, r1 = 8x8 block of 16-bit
 * coefficients, r2 = line stride -- confirm against the C prototype.
 * NOTE(review): not visible in this excerpt: the transform passes, the
 * initialisation of r3, the handling of q8/q9 with d0/d1, the narrowing of
 * the 16-bit sums back into d0-d7, and the return/endfunc.
 */
263 function ff_h264_idct8_add_neon, export=1
/* Load q8-q13 from [r1] in three 32-byte steps; '!' advances r1 each time. */
264 vld1.16 {q8-q9}, [r1,:128]!
265 vld1.16 {q10-q11},[r1,:128]!
266 vld1.16 {q12-q13},[r1,:128]!
/*
 * Row loads (8 bytes each into d0..d7, r0 += r2 per load) are interleaved
 * with rounding shifts right by 6 on q10..q15. The two streams touch disjoint
 * registers (q0-q3 versus q10-q15), so the interleaving does not change the
 * results.
 */
273 vld1.8 {d0}, [r0,:64], r2
275 vld1.8 {d1}, [r0,:64], r2
276 vrshr.s16 q10, q10, #6
277 vld1.8 {d2}, [r0,:64], r2
278 vrshr.s16 q11, q11, #6
279 vld1.8 {d3}, [r0,:64], r2
280 vrshr.s16 q12, q12, #6
281 vld1.8 {d4}, [r0,:64], r2
282 vrshr.s16 q13, q13, #6
283 vld1.8 {d5}, [r0,:64], r2
284 vrshr.s16 q14, q14, #6
285 vld1.8 {d6}, [r0,:64], r2
286 vrshr.s16 q15, q15, #6
287 vld1.8 {d7}, [r0,:64], r2
/*
 * Widening adds: q(8+n) += zero-extended bytes of d(n), shown here for
 * n = 2..7. Stores of d0..d7 through r3 (r3 += r2 per store) are interleaved
 * with the later adds.
 * NOTE(review): each dN must have been refilled with its final 8-bit result
 * before its store; those instructions are not visible here.
 */
290 vaddw.u8 q10, q10, d2
292 vaddw.u8 q11, q11, d3
294 vaddw.u8 q12, q12, d4
296 vst1.8 {d0}, [r3,:64], r2
297 vaddw.u8 q13, q13, d5
299 vst1.8 {d1}, [r3,:64], r2
300 vaddw.u8 q14, q14, d6
302 vst1.8 {d2}, [r3,:64], r2
303 vaddw.u8 q15, q15, d7
305 vst1.8 {d3}, [r3,:64], r2
/* Remaining four rows. */
308 vst1.8 {d4}, [r3,:64], r2
309 vst1.8 {d5}, [r3,:64], r2
310 vst1.8 {d6}, [r3,:64], r2
311 vst1.8 {d7}, [r3,:64], r2
/*
 * ff_h264_idct8_dc_add_neon: exported routine. The visible instructions
 * broadcast one 16-bit value from [r1] across q15, round-shift it right by 6,
 * load eight 8-byte rows from [r0] (spaced r2 bytes apart) into d0-d7, add
 * the broadcast value to the zero-extended rows, rewind r0 by eight rows and
 * store d0-d7 back to the same eight locations.
 * NOTE(review): presumably r0 = destination pixels, r1 = coefficient block
 * whose first element is the DC term, r2 = line stride -- confirm.
 * NOTE(review): not visible in this excerpt: the adds for rows d0/d1, the
 * narrowing of the 16-bit sums back into d0-d7, and the return/endfunc.
 */
317 function ff_h264_idct8_dc_add_neon, export=1
/* Load one halfword from [r1] and replicate it to all eight 16-bit lanes of q15. */
318 vld1.16 {d30[],d31[]},[r1,:16]
319 vld1.32 {d0}, [r0,:64], r2
/* q15 = (q15 + 32) >> 6 per lane (rounding shift); placed between the row loads. */
320 vrshr.s16 q15, q15, #6
321 vld1.32 {d1}, [r0,:64], r2
322 vld1.32 {d2}, [r0,:64], r2
324 vld1.32 {d3}, [r0,:64], r2
326 vld1.32 {d4}, [r0,:64], r2
/* From here each add computes q(8+n) = q15 + zero-extended bytes of d(n). */
327 vaddw.u8 q10, q15, d2
328 vld1.32 {d5}, [r0,:64], r2
329 vaddw.u8 q11, q15, d3
330 vld1.32 {d6}, [r0,:64], r2
331 vaddw.u8 q12, q15, d4
332 vld1.32 {d7}, [r0,:64], r2
333 vaddw.u8 q13, q15, d5
334 vaddw.u8 q14, q15, d6
/* This add overwrites q15, the shared source operand, so it is issued last. */
335 vaddw.u8 q15, q15, d7
/* r0 -= 8*r2: undo the eight post-increments so the stores revisit the rows. */
340 sub r0, r0, r2, lsl #3
/* Store d0..d7, one 8-byte row each, r0 += r2 per store. */
341 vst1.32 {d0}, [r0,:64], r2
343 vst1.32 {d1}, [r0,:64], r2
345 vst1.32 {d2}, [r0,:64], r2
347 vst1.32 {d3}, [r0,:64], r2
349 vst1.32 {d4}, [r0,:64], r2
350 vst1.32 {d5}, [r0,:64], r2
351 vst1.32 {d6}, [r0,:64], r2
352 vst1.32 {d7}, [r0,:64], r2
/*
 * ff_h264_idct8_add4_neon: exported routine. Only its routine-selection pair
 * is visible in this excerpt; the instruction that sets the condition flags
 * and the branch that consumes lr are not shown.
 */
356 function ff_h264_idct8_add4_neon, export=1
/*
 * Pick the routine address into lr from the current flags:
 *   NE -> ff_h264_idct8_dc_add_neon, EQ -> ff_h264_idct8_add_neon.
 * NOTE(review): "+ CONFIG_THUMB" presumably sets bit 0 of the address in
 * Thumb builds so an interworking branch stays in Thumb state -- confirm.
 */
376 adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
377 adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
/*
 * Table of 48 byte-sized indices, each written as column + row*8 with the
 * column in 4..7. Every line lists one 2x2 group: two adjacent columns over
 * two adjacent rows, top row first. The rows used are 1-4, 6-9 and 11-14;
 * rows 0, 5 and 10 never appear. The largest entry is 7+14*8 = 119.
 * NOTE(review): the label for this table is not visible in this excerpt;
 * presumably it is the scan-order table that the add16/add8/add4 routines
 * index -- confirm.
 */
/* Rows 1-4 */
386 .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
387 .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
388 .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
389 .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
/* Rows 6-9 */
390 .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
391 .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
392 .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
393 .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
/* Rows 11-14 */
394 .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
395 .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
396 .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
397 .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8