2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 * Gathers single bytes from memory into individual lanes of D register \rd.
 * Each vld1.8 reads one byte at [\rs] and then post-increments \rs by \rt,
 * so successive lanes come from addresses \rt bytes apart.
 *   \n == 8            : lanes 0-7 are loaded (both groups below).
 *   \n != 8, \hi == 0  : only lanes 0-3 are loaded.
 *   \n != 8, \hi == 1  : only lanes 4-7 are loaded.
 * Side effect: \rs is advanced by \rt once per byte loaded.
 * NOTE(review): the closing .endif/.endm lines are not among the lines shown
 * here; the description assumes each .if group is closed before the next.
 */
23 .macro ldcol.8 rd, rs, rt, n=8, hi=0
/* Low half: lanes 0-3. */
24 .if \n == 8 || \hi == 0
25 vld1.8 {\rd[0]}, [\rs], \rt
26 vld1.8 {\rd[1]}, [\rs], \rt
27 vld1.8 {\rd[2]}, [\rs], \rt
28 vld1.8 {\rd[3]}, [\rs], \rt
/* High half: lanes 4-7. */
30 .if \n == 8 || \hi == 1
31 vld1.8 {\rd[4]}, [\rs], \rt
32 vld1.8 {\rd[5]}, [\rs], \rt
33 vld1.8 {\rd[6]}, [\rs], \rt
34 vld1.8 {\rd[7]}, [\rs], \rt
/*
 * add16x8 dq, dl, dh, rl, rh
 * Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
 *   vaddl.u8 : \dq = widening add of \rl and \rh (eight 16-bit partial sums).
 *   vadd.u16 : \dl = \dl + \dh (four partial sums).
 *   vpadd x2 : pairwise adds fold those four into one total, which ends up
 *              replicated in every 16-bit lane of \dl.
 * The visible callers pass \dl/\dh as the low/high halves of \dq
 * (q0, d0, d1); the macro relies on that relationship.
 */
38 .macro add16x8 dq, dl, dh, rl, rh
39 vaddl.u8 \dq, \rl, \rh
40 vadd.u16 \dl, \dl, \dh
41 vpadd.u16 \dl, \dl, \dl
42 vpadd.u16 \dl, \dl, \dl
/*
 * Exported entry point, declared with the `function` macro (not defined in
 * this file; presumably provided by libavutil/arm/asm.S).
 * NOTE(review): going by the name, this presumably fills a 16x16 block with
 * the constant DC value 128 - confirm against the function body.
 */
45 function ff_pred16x16_128_dc_neon, export=1
/*
 * Exported entry point for the 16x16 "top DC" predictor.
 * Steps shown here: load 16 bytes from [r2] (the :128 qualifier asserts a
 * 128-bit aligned address) into q0, then sum them with add16x8 so that each
 * 16-bit lane of d0 holds the total.
 * NOTE(review): presumably r2 points at the row above the block and the sum
 * is later rounded and stored - neither step appears in the lines shown.
 */
50 function ff_pred16x16_top_dc_neon, export=1
52 vld1.8 {q0}, [r2,:128]
53 add16x8 q0, d0, d1, d0, d1
/*
 * Exported entry point for the 16x16 "left DC" predictor.
 * The one instruction shown sums the sixteen bytes held in d0/d1 with
 * add16x8, leaving the total in each 16-bit lane of d0.
 * NOTE(review): how d0/d1 are filled is not shown; presumably ldcol.8 gathers
 * the left-neighbour column - confirm.
 */
59 function ff_pred16x16_left_dc_neon, export=1
63 add16x8 q0, d0, d1, d0, d1
/*
 * Exported entry point for the 16x16 DC predictor.
 * Steps shown here: load 16 bytes from [r2] (128-bit aligned) into q0, and
 * at local label 6 store q0 to two consecutive rows.
 * NOTE(review): presumably r0 = destination block and r1 = row stride in
 * bytes; the DC computation between the load and the stores is not shown.
 */
69 function ff_pred16x16_dc_neon, export=1
71 vld1.8 {q0}, [r2,:128]
/* Each store writes 16 bytes at [r0] and then advances r0 by r1. Label 6 is
   a branch target; the branches that reach it are not among the lines shown. */
85 6: vst1.8 {q0}, [r0,:128], r1
86 vst1.8 {q0}, [r0,:128], r1
/*
 * Exported entry point for the 16x16 horizontal predictor.
 * Per row: one byte is read from [r2] and replicated into all 16 lanes of
 * q0 (d0[] and d1[]), r2 is advanced by r1, and q0 is stored as a full
 * 16-byte row at [r0], which is then advanced by r1.
 * NOTE(review): presumably r2 walks the left-neighbour column; its setup and
 * the loop counter/branch for label 1 are not shown.
 */
92 function ff_pred16x16_hor_neon, export=1
95 1: vld1.8 {d0[],d1[]},[r2], r1
96 vst1.8 {q0}, [r0,:128], r1
/*
 * Exported entry point for the 16x16 vertical predictor.
 * One 16-byte row is loaded from [r0] into q0 (r0 then advances by r1), and
 * the same q0 is written to each following row at label 1.
 * NOTE(review): any adjustment of r0 before the load (e.g. stepping back one
 * row to reach the row above the block) is not shown - confirm.
 */
102 function ff_pred16x16_vert_neon, export=1
104 vld1.8 {q0}, [r0,:128], r1
/* Two rows per pass; the loop counter and branch back to 1 are not shown. */
106 1: vst1.8 {q0}, [r0,:128], r1
107 vst1.8 {q0}, [r0,:128], r1
/*
 * Exported entry point for the 16x16 plane predictor.
 * Only a sparse sample of the body is shown, so the comments below describe
 * individual instructions, not the complete algorithm.
 * NOTE(review): presumably r0 = destination, r1 = stride and r3 = address of
 * the p16weight table - the instructions that set these up are not shown.
 */
113 function ff_pred16x16_plane_neon, export=1
/* Load 8 bytes from [r2] (64-bit aligned), then advance r2 by r1. */
118 vld1.8 {d2}, [r2,:64], r1
/* Load 16 bytes from [r3] (128-bit aligned) into q0. */
127 vld1.8 {q0}, [r3,:128]
/* Rounding shift right by 6, narrowing 32-bit lanes of q2 to 16-bit in d4. */
136 vrshrn.s32 d4, q2, #6
143 vadd.i16 d16, d16, d0
/* Rotate the eight 16-bit lanes of q0 by one: lane 7 moves to lane 0. */
147 vext.16 q0, q0, q0, #7
/* Scale every 16-bit lane of q0 by the scalar in d4[0]. */
150 vmul.i16 q0, q0, d4[0]
/* Shift right by 5 and saturate signed 16-bit lanes to unsigned bytes.
   Both halves read q1; the lines between them are not shown, so q1 may be
   updated in between. */
159 vqshrun.s16 d0, q1, #5
161 vqshrun.s16 d1, q1, #5
/* Store one 16-byte row at [r0], then advance r0 by r1. */
163 vst1.8 {q0}, [r0,:128], r1
/*
 * Constant table of eight 16-bit values, 1 through 8.
 * NOTE(review): `const` is a macro defined outside this file; the exact
 * meaning of align=4 (bytes vs. power of two) depends on that macro.
 * The name suggests these are weights for the plane predictors - confirm.
 */
169 const p16weight, align=4
170 .short 1,2,3,4,5,6,7,8
/*
 * Exported entry point for the 8x8 horizontal predictor.
 * Per row: one byte is read from [r2] and replicated into all 8 lanes of d0,
 * r2 is advanced by r1, and d0 is stored as an 8-byte row at [r0]
 * (64-bit aligned), which is then advanced by r1.
 * NOTE(review): presumably r2 walks the left-neighbour column; its setup and
 * the loop counter/branch for label 1 are not shown.
 */
173 function ff_pred8x8_hor_neon, export=1
176 1: vld1.8 {d0[]}, [r2], r1
177 vst1.8 {d0}, [r0,:64], r1
/*
 * Exported entry point for the 8x8 vertical predictor.
 * One 8-byte row is loaded from [r0] into d0 (r0 then advances by r1), and
 * the same d0 is written to each following row at label 1.
 * NOTE(review): any adjustment of r0 before the load is not shown - confirm.
 */
183 function ff_pred8x8_vert_neon, export=1
185 vld1.8 {d0}, [r0,:64], r1
/* Two rows per pass; the loop counter and branch back to 1 are not shown. */
187 1: vst1.8 {d0}, [r0,:64], r1
188 vst1.8 {d0}, [r0,:64], r1
/*
 * Exported entry point for the 8x8 plane predictor.
 * Only a sparse sample of the body is shown, so the comments below describe
 * individual instructions, not the complete algorithm.
 * NOTE(review): presumably r0 = destination and r1 = stride; the setup of
 * r2/r3 is not shown, and r3 is evidently re-pointed before the 128-bit
 * aligned table load further down - confirm.
 */
194 function ff_pred8x8_plane_neon, export=1
/* Load 4 bytes from [r3] into the low 32-bit lane of d0. */
198 vld1.32 {d0[0]}, [r3]
/* Load 4 bytes from [r2] (32-bit aligned) into d2 lane 0; r2 += r1. */
199 vld1.32 {d2[0]}, [r2,:32], r1
/* n=4, hi=1: gather 4 bytes, r1 apart starting at r3, into d0 lanes 4-7. */
200 ldcol.8 d0, r3, r1, 4, hi=1
/* n=4, hi=0: gather the next 4 bytes into d3 lanes 0-3. */
202 ldcol.8 d3, r3, r1, 4
/* Load eight 16-bit values from [r3] (128-bit aligned) into q0. */
208 vld1.16 {q0}, [r3,:128]
/* Rounding shift right by 5, narrowing 32-bit lanes of q2 to 16-bit in d4. */
215 vrshrn.s32 d4, q2, #5
222 vadd.i16 d16, d16, d0
/* Rotate the eight 16-bit lanes of q0 by one: lane 7 moves to lane 0. */
226 vext.16 q0, q0, q0, #7
/* Scale every 16-bit lane of q0 by the scalar in d4[0]. */
229 vmul.i16 q0, q0, d4[0]
/* Shift right by 5 and saturate signed 16-bit lanes to unsigned bytes. */
238 vqshrun.s16 d0, q1, #5
/* Store one 8-byte row at [r0], then advance r0 by r1. */
240 vst1.8 {d0}, [r0,:64], r1
/*
 * Exported entry point, declared with the `function` macro (not defined in
 * this file; presumably provided by libavutil/arm/asm.S).
 * NOTE(review): going by the name, this presumably fills an 8x8 block with
 * the constant DC value 128 - confirm against the function body.
 */
246 function ff_pred8x8_128_dc_neon, export=1
/*
 * Exported entry point for the 8x8 "top DC" predictor.
 * Steps shown here: load 8 bytes from [r2] (64-bit aligned) into d0, and
 * later narrow 16-bit sums in q0 to bytes with a rounding shift by 2,
 * i.e. (sum + 2) >> 2, the rounded average of four samples.
 * NOTE(review): the summation between the two lines is not shown; presumably
 * r2 points at the row above the block - confirm.
 */
251 function ff_pred8x8_top_dc_neon, export=1
253 vld1.8 {d0}, [r2,:64]
256 vrshrn.u16 d0, q0, #2
/*
 * Exported entry point for the 8x8 "left DC" predictor.
 * The one instruction shown narrows 16-bit sums in q0 to bytes in d0 with a
 * rounding shift by 2, i.e. (sum + 2) >> 2.
 * NOTE(review): how the sums are produced is not shown; presumably the left
 * column is gathered with ldcol.8 and summed - confirm.
 */
263 function ff_pred8x8_left_dc_neon, export=1
268 vrshrn.u16 d0, q0, #2
/*
 * Exported entry point for the 8x8 DC predictor.
 * Steps shown here: load 8 bytes from [r2] into d0, narrow 16-bit sums in
 * q0 with two different rounding shifts, then write the result rows through
 * two pointers that are four rows apart.
 * NOTE(review): presumably r0 = destination and r1 = stride; the summation
 * and the assembly of d0/d1 before the stores are not shown.
 */
274 function ff_pred8x8_dc_neon, export=1
276 vld1.8 {d0}, [r2,:64]
/* (sum + 4) >> 3 into d2 and (sum + 2) >> 2 into d3: rounded averages over
   eight and four samples respectively. */
283 vrshrn.u16 d2, q0, #3
284 vrshrn.u16 d3, q0, #2
/* r2 = r0 + 4 * r1, i.e. the address four rows below r0. */
292 add r2, r0, r1, lsl #2
/* d0 goes to the row at r0 and d1 to the row at r2; both pointers advance by
   r1. The branch back to label 6 is not shown. */
293 6: vst1.8 {d0}, [r0,:64], r1
294 vst1.8 {d1}, [r2,:64], r1
/*
 * Exported entry point for the 8x8 "l0t" DC predictor variant.
 * Steps shown here: load 8 bytes from [r2] into d0, gather 4 bytes spaced
 * r1 apart into d1 lanes 0-3 (ldcol.8 with n=4, hi=0), then narrow 16-bit
 * sums in q0 with rounding shifts by 3 and by 2.
 * NOTE(review): r2 is used for both loads, so it is presumably re-pointed
 * in between on a line that is not shown. The name suggests "left upper
 * half + top available" - confirm.
 */
300 function ff_pred8x8_l0t_dc_neon, export=1
302 vld1.8 {d0}, [r2,:64]
304 ldcol.8 d1, r2, r1, 4
309 vrshrn.u16 d2, q0, #3
310 vrshrn.u16 d3, q0, #2
/*
 * Exported entry point for the 8x8 "l00" DC predictor variant.
 * Steps shown here: gather 4 bytes spaced r1 apart, starting at r2, into d0
 * lanes 0-3 (ldcol.8 with n=4, hi=0), then narrow 16-bit sums in q0 to bytes
 * with a rounding shift by 2, i.e. (sum + 2) >> 2.
 * NOTE(review): the setup of r2 and the summation are not shown; the name
 * suggests only the upper half of the left column is available - confirm.
 */
318 function ff_pred8x8_l00_dc_neon, export=1
320 ldcol.8 d0, r2, r1, 4
323 vrshrn.u16 d0, q0, #2
/*
 * Exported entry point for the 8x8 "0lt" DC predictor variant.
 * Steps shown here: load 8 bytes from [r2] into d0, point r2 four rows below
 * r0, gather 4 bytes spaced r1 apart into d1 lanes 4-7 (ldcol.8 with n=4,
 * hi=1), then narrow 16-bit sums in q0 with rounding shifts by 2 and by 3.
 * NOTE(review): any offset applied to r2 between the add and the gather is
 * not shown. The name suggests "left lower half + top available" - confirm.
 */
329 function ff_pred8x8_0lt_dc_neon, export=1
331 vld1.8 {d0}, [r2,:64]
/* r2 = r0 + 4 * r1. */
332 add r2, r0, r1, lsl #2
334 ldcol.8 d1, r2, r1, 4, hi=1
339 vrshrn.u16 d3, q0, #2
340 vrshrn.u16 d2, q0, #3
/*
 * Exported entry point for the 8x8 "0l0" DC predictor variant.
 * Steps shown here: point r2 four rows below r0, gather 4 bytes spaced r1
 * apart into d1 lanes 0-3 (ldcol.8 with n=4, hi=0), then narrow 16-bit sums
 * in q1 to bytes in d1 with a rounding shift by 2, i.e. (sum + 2) >> 2.
 * NOTE(review): any offset applied to r2 before the gather, the summation
 * and the stores are not shown. The name suggests only the lower half of the
 * left column is available - confirm.
 */
349 function ff_pred8x8_0l0_dc_neon, export=1
/* r2 = r0 + 4 * r1. */
350 add r2, r0, r1, lsl #2
352 ldcol.8 d1, r2, r1, 4
355 vrshrn.u16 d1, q1, #2