/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
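/* The functions below implement the standard H.264-style bilinear chroma
 * interpolation with 1/8-pel weights derived from the fractional motion
 * vector components x and y.  As a rough scalar reference (a sketch for
 * orientation only, not code used by this file), each output pixel of a
 * "put" variant is:
 *
 *     int A = (8 - x) * (8 - y);
 *     int B =      x  * (8 - y);
 *     int C = (8 - x) *      y;
 *     int D =      x  *      y;
 *     dst[i] = (A * src[i]          + B * src[i + 1] +
 *               C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
 *
 * The "avg" variants average that result with the pixels already in dst,
 * with rounding; the RV40 variants replace the fixed +32 rounding term
 * with a bias from the rv40bias table at the end of this file.
 */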
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]        @ r4 = x, r5 = y
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]
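@ RV40 uses a position-dependent rounding bias instead of the fixed +32;
@ it is fetched from the rv40bias table (end of file) at byte offset
@ (y >> 1) * 8 + (x >> 1) * 2 and broadcast to every lane of q11.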
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

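@ Bilinear weights from the fractional offsets:
@ r4 = (8-x)*(8-y), r12 = x*(8-y), r6 = (8-x)*y, r7 = x*y.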
        mul             r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64
        cmp             r7,  #0
        beq             2f

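@ x != 0 && y != 0: full four-tap filter, two output rows per iteration.
@ Rows are fetched through two pointers (r1 and r5 = r1 + stride), each
@ stepping by twice the stride (r4); d5/d7 hold the rows shifted left by
@ one pixel for the horizontal taps.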
        add             r5,  r1,  r2
        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6, d7}, [r5], r4
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vld1.8          {d6, d7}, [r5], r4
        pld             [r1]
        vext.8          d7,  d6,  d7,  #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
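@ avg only: round-halving add with the pixels already in the destination.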
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

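@ x == 0 or y == 0: the filter collapses to two taps, with the remaining
@ cross weight summed into r12.  y == 0 (r6 == 0) dispatches to the
@ horizontal loop at 4:, which also covers the x == y == 0 plain copy
@ (d0 = 64, d1 = 0); otherwise fall through to the vertical loop at 3:.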
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4
3:      vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.8          {d6}, [r5], r4
        subs            r3,  r3,  #2
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

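@ Horizontal-only filtering (y == 0), two rows per iteration from a
@ single pointer; also serves the unfiltered copy case.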
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
        vext.8          d5,  d4,  d5,  #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
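@ Same bilinear scheme as mc8 above, but rows are only four pixels wide:
@ a row and its one-pixel-shifted copy share the two 32-bit halves of one
@ d register (vtrn.32), so each vmull covers both horizontal taps at once.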
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]        @ r4 = x, r5 = y
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

        mul             r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64
        cmp             r7,  #0
        beq             2f

        add             r5,  r1,  r2
        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vtrn.32         d0,  d1                    @ d0 = [ A | B ]
        vtrn.32         d2,  d3                    @ d2 = [ C | D ]
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5                    @ d4 = [ px 0-3 | px 1-4 ] of row 0
        vtrn.32         d6,  d7                    @ d6 = same for row 1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.8          {d6}, [r5], r4
        vadd.i16        d16, d16, d17              @ output row n
        vadd.i16        d17, d18, d19              @ output row n+1
        subs            r3,  r3,  #2
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

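@ One-dimensional case for 4-pixel rows: two adjacent source rows are
@ packed into the two 32-bit halves of d4, and the two taps ride in the
@ matching halves of the weight vectors built below.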
2:      tst             r6,  r6
        add             r12, r12, r6               @ r12 = S, the remaining cross weight
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vext.32         d1,  d0,  d1,  #1          @ d1 = [ A | S ]
        beq             4f

        vext.32         d2,  d1,  d1,  #1          @ d2 = [ S | A ]
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4          @ d4 = [ row n | row n+1 ]
3:      pld             [r5]
        vmull.u8        q8,  d4,  d1               @ [ A*row n | S*row n+1 ]
        vld1.32         {d4[0]}, [r1], r4          @ d4 = [ row n+2 | row n+1 ]
        vmull.u8        q9,  d4,  d2               @ [ S*row n+2 | A*row n+1 ]
        vld1.32         {d4[1]}, [r5], r4          @ d4 = [ row n+2 | row n+3 ]
        vadd.i16        d16, d16, d17              @ output row n
        vadd.i16        d17, d18, d19              @ output row n+1
        subs            r3,  r3,  #2
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

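@ Horizontal-only filtering (and the x == y == 0 copy), 4-pixel rows.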
4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmov            d0,  d1                    @ d0 = [ A | S ]: both taps of one row

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17              @ output row n
        vadd.i16        d17, d18, d19              @ output row n+1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

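/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */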
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]             @ r4 = x
        ldr             lr,  [sp, #20]             @ lr = y
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f                         @ x == y == 0: plain copy / average
        mul             r5,  r4,  lr               @ r5  = x * y
        rsb             r6,  r5,  lr,  lsl #3      @ r6  = (8 - x) * y
        rsb             r12, r5,  r4,  lsl #3      @ r12 = x * (8 - y)
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64              @ r4  = (8 - x) * (8 - y)
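@ A whole 2x2 output block is computed at once: the weights are
@ interleaved as [A,A,B,B,...] / [C,C,D,D,...] (vtrn.16) to line up with
@ pixel pairs interleaved the same way below.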
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1                    @ d0 = [A,A,B,B,...], d1 = [C,C,D,D,...]
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2          @ d4 = [ row n | row n+1 ]
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]              @ d5 = [ row n+1 | row n+2 ]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3                    @ pair each pixel with its right neighbour
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b

        pop             {r4-r6, pc}

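@ x == y == 0: no filtering, just copy (or average) two rows at a time.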
2:      vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             2b

        pop             {r4-r6, pc}
endfunc
.endm

#if CONFIG_H264_DECODER
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
#endif

#if CONFIG_RV40_DECODER
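@ RV40 chroma rounding bias, indexed by (y >> 1, x >> 1); see the lookup
@ at the top of the mc8/mc4 functions.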
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif