/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
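/* All variants below implement the standard eighth-pel bilinear blend: with
 * A, B, C, D the four source pixels surrounding the fractional position
 * (x, y), 0 <= x, y <= 7, each output pixel is
 *
 *     dst = (A*(8-x)*(8-y) + B*x*(8-y) + C*(8-x)*y + D*x*y + 32) >> 6
 *
 * for H.264; RV40 and VC-1 substitute a different rounding bias for the 32
 * (see the q11 setup and the vshrn paths below).  The "avg" variants then
 * average with the pixels already in dst, rounding up:
 *
 *     dst = (dst + interp + 1) >> 1        (vrhadd.u8)
 */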
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        ldrd            r4,  r5,  [sp, #20]
        add             r6,  r6,  r7,  lsl #3
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]   @ broadcast bias into q11
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
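/* Assuming r4 = x, r5 = y (the stack arguments) and r7 = x*y from the
 * preceding multiply, the bilinear weights are now:
 *     r6  = 8*y - x*y = (8-x)*y        (bottom-left taps)
 *     r12 = 8*x - x*y = x*(8-y)        (top-right taps)
 *     r4  = x*y - 8*x - 8*y            (becomes (8-x)*(8-y) once 64 is added)
 *     r7  = x*y                        (bottom-right taps)
 */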
        vld1.8          {d4, d5}, [r1], r2

1:      vld1.8          {d6, d7}, [r1], r2
        vld1.8          {d4, d5}, [r1], r2
        vrshrn.u16      d16, q8,  #6              @ h264: (sum + 32) >> 6
        vrshrn.u16      d17, q9,  #6
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vld1.8          {d4}, [r1], r2
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
        vshrn.u16       d16, q8,  #6              @ rv40/vc1: bias from q11 already added
        vshrn.u16       d17, q9,  #6
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10             @ avg: (dst + interp + 1) >> 1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1         @ d5 = pixels at x+1
        vext.8          d7,  d6,  d7,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2

5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
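/* Same interpolation on 4-pixel-wide blocks, two output rows per iteration:
 * 4-byte quantities (a row and its x+1 shift, or two successive rows) are
 * packed into single d registers so each multiply covers two sets of taps,
 * and both finished rows are narrowed into one d register for the 32-bit
 * lane loads and stores ([0]/[1]). */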
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        ldrd            r4,  r5,  [sp, #20]
        add             r6,  r6,  r7,  lsl #3
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vadd.i16        d16, d16, d17             @ fold partial sums: d16 = row 0
        vadd.i16        d17, d18, d19             @                    d17 = row 1
        vrshrn.u16      d16, q8,  #6              @ both 4-pixel rows packed in d16
        vshrn.u16       d16, q8,  #6
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vld1.32         {d4[0]}, [r1], r2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        vshrn.u16       d16, q8,  #6
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        vshrn.u16       d16, q8,  #6
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrshrn.u16      d16, q8,  #6
        vshrn.u16       d16, q8,  #6
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
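/* The 2-pixel-wide case, again two output rows per iteration, with 16-bit
 * lane loads and stores.  In outline: source bytes for both rows are
 * gathered into q2, their x+1 neighbours are built with vext, and the
 * per-tap products are folded with vadd.i16 before the rounding shift. */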
        rsb             r6,  r5,  lr,  lsl #3     @ same weights as mc8, with
        rsb             r12, r5,  r4,  lsl #3     @ x in r4, y in lr, x*y in r5
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1         @ shifted copies for the x+1 taps
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2

#if CONFIG_RV40_DECODER
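/* RV40 rounds its chroma MC with a bias that depends on the fractional
 * position (32 or 28) instead of H.264's constant 32: the macros above
 * broadcast the selected table entry into q11 and round with vadd + vshrn
 * in place of vrshrn. */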
        .short          32, 28, 32, 28
        .short          32, 28, 32, 28
        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif