2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/aarch64/asm.S"
// ldcol.8: gather bytes one at a time into consecutive byte lanes of \rd.
//   rd - vector register name; lanes are addressed as \rd.b[i]
//   rs - address register; each load post-indexes it by \rt
//   rt - post-index increment applied to \rs after every single-byte load
//   n  - element-count selector (default 8)
//   hi - for n < 8: 0 selects lanes 0-3, 1 selects lanes 4-7 (default 0)
// Callers in this file pass x1 as \rt (presumably the row stride, so the
// macro reads a column of pixels -- confirm against the C prototype).
// NOTE(review): the closing .endif/.endm directives and the condition that
// guards lanes 8-15 are not visible in this view (the embedded line numbers
// skip 29, 35-36 and 45-46); presumably lanes 8-15 are loaded only when
// n is 16 -- confirm against the full file.
23 .macro ldcol.8 rd, rs, rt, n=8, hi=0
// Lanes 0-3: loaded for a full column (n >= 8) or when the low half is chosen.
24 .if \n >= 8 || \hi == 0
25 ld1 {\rd\().b}[0], [\rs], \rt
26 ld1 {\rd\().b}[1], [\rs], \rt
27 ld1 {\rd\().b}[2], [\rs], \rt
28 ld1 {\rd\().b}[3], [\rs], \rt
// Lanes 4-7: loaded for a full column (n >= 8) or when the high half is chosen.
30 .if \n >= 8 || \hi == 1
31 ld1 {\rd\().b}[4], [\rs], \rt
32 ld1 {\rd\().b}[5], [\rs], \rt
33 ld1 {\rd\().b}[6], [\rs], \rt
34 ld1 {\rd\().b}[7], [\rs], \rt
// Lanes 8-15 (guarding condition not visible here).
37 ld1 {\rd\().b}[8], [\rs], \rt
38 ld1 {\rd\().b}[9], [\rs], \rt
39 ld1 {\rd\().b}[10], [\rs], \rt
40 ld1 {\rd\().b}[11], [\rs], \rt
41 ld1 {\rd\().b}[12], [\rs], \rt
42 ld1 {\rd\().b}[13], [\rs], \rt
43 ld1 {\rd\().b}[14], [\rs], \rt
44 ld1 {\rd\().b}[15], [\rs], \rt
// ff_pred16x16_128_dc_neon: entry point declared with export=1.
// NOTE(review): the body is not visible in this view; going by the name it
// presumably fills a 16x16 block with the constant 128 -- confirm.
48 function ff_pred16x16_128_dc_neon, export=1
// ff_pred16x16_top_dc_neon: entry point declared with export=1.
// NOTE(review): only one instruction of the body is visible here; the load
// and the summation that produce v0 are not shown -- confirm against the
// full file.
53 function ff_pred16x16_top_dc_neon, export=1
// Rounding right shift by 4 of the 16-bit lanes of v0, narrowed to bytes,
// i.e. (x + 8) >> 4.
57 rshrn v0.8b, v0.8h, #4
// ff_pred16x16_left_dc_neon: entry point declared with export=1.
// NOTE(review): the setup of x2, the summation step and the tail of the
// function are not visible in this view.
62 function ff_pred16x16_left_dc_neon, export=1
// Gather bytes into v0 via ldcol.8 with n=16, reading from [x2] and
// post-indexing x2 by x1 after each byte.
64 ldcol.8 v0, x2, x1, 16
// Rounding right shift by 4, narrowed to bytes: (x + 8) >> 4.
66 rshrn v0.8b, v0.8h, #4
// ff_pred16x16_dc_neon: entry point declared with export=1.
// Visible steps: gather a second set of bytes into v1, add the 16-bit lanes
// of v0 and v1, apply a rounding shift by 5, then store v0 row by row.
// NOTE(review): address setup, the load of v0, the reductions that feed the
// add, the broadcast before the stores and the loop counter/branch are not
// visible in this view.
71 function ff_pred16x16_dc_neon, export=1
// Gather bytes into v1 via ldcol.8 with n=16 from [x3], stepping x3 by x1.
75 ldcol.8 v1, x3, x1, 16
78 add v0.4h, v0.4h, v1.4h
// Rounding right shift by 5, narrowed to bytes: (x + 16) >> 5.
79 rshrn v0.8b, v0.8h, #5
// Label 6: each st1 writes the 16 bytes of v0 to [x0] and then advances x0
// by x1, so one pass through these two lines emits two rows.
83 6: st1 {v0.16b}, [x0], x1
84 st1 {v0.16b}, [x0], x1
// ff_pred16x16_hor_neon: entry point declared with export=1.
// NOTE(review): the setup of x2 and the loop counter/branch are not visible
// in this view.
90 function ff_pred16x16_hor_neon, export=1
// Label 1: load one byte from [x2], replicate it to all 16 lanes of v0 and
// advance x2 by x1; then store those 16 bytes to [x0] and advance x0 by x1.
93 1: ld1r {v0.16b}, [x2], x1
94 st1 {v0.16b}, [x0], x1
// ff_pred16x16_vert_neon: entry point declared with export=1.
// NOTE(review): the instructions that set up x2 and x1 and the loop
// counter/branch are not visible in this view, so the relationship between
// the x0 and x2 store streams cannot be verified from here.
100 function ff_pred16x16_vert_neon, export=1
// Load 16 bytes from [x2] into v0, then advance x2 by x1.
103 ld1 {v0.16b}, [x2], x1
// Label 1: write the same 16 bytes through two pointers, x0 and x2, each
// post-indexed by x1.
105 1: st1 {v0.16b}, [x0], x1
106 st1 {v0.16b}, [x2], x1
// ff_pred16x16_plane_neon: entry point declared with export=1.
// NOTE(review): many interior lines are not visible in this view (address
// setup, the loads of v0/v1/v3, several shifts and broadcasts, the loop
// counter and branch). The comments below describe only the visible
// instructions; the contents of registers written by missing lines are not
// asserted.
112 function ff_pred16x16_plane_neon, export=1
// Load 8 bytes from [x2] into v2 and advance x2 by x1.
118 ld1 {v2.8b}, [x2], x1
// Widening arithmetic on bytes: v7 = v2 + v3, v2 = v2 - v0, v3 = v3 - v1,
// all producing 16-bit lanes.
124 uaddl v7.8h, v2.8b, v3.8b
125 usubl v2.8h, v2.8b, v0.8b
126 usubl v3.8h, v3.8b, v1.8b
// Multiply both difference vectors lane-wise by v0.8h (presumably the
// p16weight table; its load is not visible here).
128 mul v2.8h, v2.8h, v0.8h
129 mul v3.8h, v3.8h, v0.8h
// Three pairwise-add steps reduce the weighted differences horizontally.
130 addp v2.8h, v2.8h, v3.8h
131 addp v2.8h, v2.8h, v2.8h
132 addp v2.4h, v2.4h, v2.4h
// v3 = v2 * 4 (widened to 32 bits), v2 = v3 + v2 = 5 * v2, then a rounding
// shift right by 6 narrows the result back to 16 bits in v4.
133 sshll v3.4s, v2.4h, #2
134 saddw v2.4s, v3.4s, v2.4h
135 rshrn v4.4h, v2.4s, #6
// trn2 copies the odd 16-bit lanes of v4 into v5; v2 = v4 + v5.
136 trn2 v5.4h, v4.4h, v4.4h
137 add v2.4h, v4.4h, v5.4h
// Rotate v7 by 14 bytes: halfword lane 7 moves to lane 0 and lanes 0-6 move
// up by one.
139 ext v7.16b, v7.16b, v7.16b, #14
140 sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
141 add v7.4h, v7.4h, v0.4h
143 sub v2.4h, v2.4h, v3.4h
// Same 14-byte rotation applied to v0.
145 ext v0.16b, v0.16b, v0.16b, #14
146 sub v6.4h, v5.4h, v3.4h
// Scale every 16-bit lane of v0 by the scalar in v4.h[0].
148 mul v0.8h, v0.8h, v4.h[0]
153 add v1.8h, v1.8h, v0.8h
154 add v3.8h, v3.8h, v2.8h
// Row output: v1 >> 5 is saturated to unsigned bytes for the low 8 lanes,
// v1 is stepped by v2, the high 8 lanes are produced the same way, v1 is
// stepped by v3, and the 16-byte row is stored to [x0], which then advances
// by x1.
157 sqshrun v0.8b, v1.8h, #5
158 add v1.8h, v1.8h, v2.8h
159 sqshrun2 v0.16b, v1.8h, #5
160 add v1.8h, v1.8h, v3.8h
161 st1 {v0.16b}, [x0], x1
// p16weight: table of eight 16-bit values 1..8, declared through the
// `const` macro with align=4.
// NOTE(review): the load of this table is not visible in this view;
// presumably it supplies the weights for the plane predictors -- confirm.
167 const p16weight, align=4
168 .short 1,2,3,4,5,6,7,8
// p8weight: table of eight 16-bit values, the sequence 1..4 repeated twice,
// declared through the `const` macro with align=4.
// NOTE(review): the load of this table is not visible in this view;
// presumably it supplies the weights for the 8x8 plane predictor -- confirm.
170 const p8weight, align=4
171 .short 1,2,3,4,1,2,3,4
// ff_pred8x8_hor_neon: entry point declared with export=1.
// NOTE(review): the setup of x2 and the loop counter/branch are not visible
// in this view.
174 function ff_pred8x8_hor_neon, export=1
// Label 1: load one byte from [x2], replicate it to all 8 lanes of v0 and
// advance x2 by x1; then store those 8 bytes to [x0] and advance x0 by x1.
177 1: ld1r {v0.8b}, [x2], x1
178 st1 {v0.8b}, [x0], x1
// ff_pred8x8_vert_neon: entry point declared with export=1.
// NOTE(review): the instructions that set up x2 and x1 and the loop
// counter/branch are not visible in this view, so the relationship between
// the x0 and x2 store streams cannot be verified from here.
184 function ff_pred8x8_vert_neon, export=1
// Load 8 bytes from [x2] into v0, then advance x2 by x1.
187 ld1 {v0.8b}, [x2], x1
// Label 1: write the same 8 bytes through two pointers, x0 and x2, each
// post-indexed by x1.
189 1: st1 {v0.8b}, [x0], x1
190 st1 {v0.8b}, [x2], x1
// ff_pred8x8_plane_neon: entry point declared with export=1.
// NOTE(review): many interior lines are not visible in this view (address
// setup, the table loads into v6/v0, several shifts and broadcasts, the
// loop counter and branch). The comments below describe only the visible
// instructions; the contents of registers written by missing lines are not
// asserted.
196 function ff_pred8x8_plane_neon, export=1
// Load 4 bytes from [x2] into 32-bit lane 0 of v2 and advance x2 by x1.
203 ld1 {v2.s}[0], [x2], x1
// Gather 4 bytes into lanes 4-7 of v0 (hi=1), then 4 bytes into lanes 0-3
// of v3, both from [x3] stepping by x1.
204 ldcol.8 v0, x3, x1, 4, hi=1
206 ldcol.8 v3, x3, x1, 4
// v7 = v2 + v3 widened to 16-bit lanes.
207 uaddl v7.8h, v2.8b, v3.8b
// Pack the low 32-bit words of v2 and v3 side by side in v2, then form the
// widened difference v2 - v0.
209 trn1 v2.2s, v2.2s, v3.2s
210 usubl v2.8h, v2.8b, v0.8b
// Multiply lane-wise by v6.8h (presumably the p8weight table; its load is
// not visible here).
212 mul v2.8h, v2.8h, v6.8h
215 addp v2.4s, v2.4s, v2.4s
217 add v2.4s, v3.4s, v2.4s
// Rounding shift right by 5, narrowing 32-bit lanes to 16 bits in v5.
218 rshrn v5.4h, v2.4s, #5
219 addp v2.4h, v5.4h, v5.4h
221 add v3.4h, v3.4h, v2.4h
223 add v7.4h, v7.4h, v0.4h
225 sub v2.4h, v2.4h, v3.4h
// Rotate v0 by 14 bytes: halfword lane 7 moves to lane 0 and lanes 0-6 move
// up by one.
226 ext v0.16b, v0.16b, v0.16b, #14
// Scale every 16-bit lane of v0 by the scalar in v5.h[0].
228 mul v0.8h, v0.8h, v5.h[0]
231 add v1.8h, v1.8h, v0.8h
// Row output: v1 >> 5 is saturated to unsigned bytes, v1 is stepped by v2,
// and the 8-byte row is stored to [x0], which then advances by x1.
234 sqshrun v0.8b, v1.8h, #5
235 add v1.8h, v1.8h, v2.8h
236 st1 {v0.8b}, [x0], x1
// ff_pred8x8_128_dc_neon: entry point declared with export=1.
// NOTE(review): the body is not visible in this view; going by the name it
// presumably fills an 8x8 block with the constant 128 -- confirm.
242 function ff_pred8x8_128_dc_neon, export=1
// ff_pred8x8_top_dc_neon: entry point declared with export=1.
// NOTE(review): the load and the first widening step that produce v0, and
// the tail of the function, are not visible in this view.
248 function ff_pred8x8_top_dc_neon, export=1
// Pairwise-add the 16-bit lanes of v0, then duplicate each resulting lane
// (zip1 of v0 with itself interleaves every lane with its own copy).
252 addp v0.4h, v0.4h, v0.4h
253 zip1 v0.8h, v0.8h, v0.8h
// Rounding right shift by 2, narrowed to bytes: (x + 2) >> 2.
254 rshrn v2.8b, v0.8h, #2
// v0 and v1 receive the same byte pattern: each of the low four bytes of v2
// doubled.
255 zip1 v0.8b, v2.8b, v2.8b
256 zip1 v1.8b, v2.8b, v2.8b
// ff_pred8x8_left_dc_neon: entry point declared with export=1.
// NOTE(review): the column load, the first widening step and the broadcast
// that follow or precede these lines are not visible in this view.
260 function ff_pred8x8_left_dc_neon, export=1
// Pairwise-add the 16-bit lanes of v0.
264 addp v0.4h, v0.4h, v0.4h
// Rounding right shift by 2, narrowed to bytes in v2: (x + 2) >> 2.
265 rshrn v2.8b, v0.8h, #2
// ff_pred8x8_dc_neon: entry point declared with export=1.
// Visible steps: regroup two vectors of partial sums, reduce them, round
// with two different shifts, assemble two 8-byte rows, and store them
// through two row pointers.
// NOTE(review): the loads, the initial widening adds, the broadcasts that
// fill v0-v3 before the zip1 pair, and the loop counter/branch are not
// visible in this view.
271 function ff_pred8x8_dc_neon, export=1
// v2 = {v0.s[0], v1.s[0]}, v3 = {v0.s[1], v1.s[1]}.
278 trn1 v2.2s, v0.2s, v1.2s
279 trn2 v3.2s, v0.2s, v1.2s
// v4 holds the pairwise sums of v2 and v3; v5 holds pairwise sums of v4.
280 addp v4.4h, v2.4h, v3.4h
281 addp v5.4h, v4.4h, v4.4h
// Rounding shift by 3 of v5 into v6, and by 2 of v4 into v7.
282 rshrn v6.8b, v5.8h, #3
283 rshrn v7.8b, v4.8h, #2
// Build each output row from the low 32-bit words of two registers.
288 zip1 v0.2s, v0.2s, v2.2s
289 zip1 v1.2s, v1.2s, v3.2s
// x2 = x0 + 4 * x1: a second output pointer four x1-steps below x0.
292 add x2, x0, x1, lsl #2
// Label 6: store v0 through x0 and v1 through x2, advancing each by x1.
293 6: st1 {v0.8b}, [x0], x1
294 st1 {v1.8b}, [x2], x1
// ff_pred8x8_l0t_dc_neon: entry point declared with export=1.
// NOTE(review): address setup, the load of v0, the widening add before the
// first addp, the broadcasts that fill v4-v6, and the tail of the function
// are not visible in this view.
300 function ff_pred8x8_l0t_dc_neon, export=1
// Gather 4 bytes into lanes 0-3 of v1 from [x3], stepping x3 by x1.
304 ldcol.8 v1, x3, x1, 4
// Interleave the low 32-bit lanes of v0 and v1 into v0.
305 zip1 v0.4s, v0.4s, v1.4s
// Two pairwise-add reduction steps.
307 addp v0.8h, v0.8h, v0.8h
308 addp v1.4h, v0.4h, v0.4h
// Rounding shift by 2 of v0 into v2, and by 3 of v1 into v3.
309 rshrn v2.8b, v0.8h, #2
310 rshrn v3.8b, v1.8h, #3
// Assemble the two output rows; both take their second word from v6.
314 zip1 v0.2s, v4.2s, v6.2s
315 zip1 v1.2s, v5.2s, v6.2s
// ff_pred8x8_l00_dc_neon: entry point declared with export=1.
// NOTE(review): the setup of x2, the widening add before the addp, and the
// tail of the function are not visible in this view.
319 function ff_pred8x8_l00_dc_neon, export=1
// Gather 4 bytes into lanes 0-3 of v0 from [x2], stepping x2 by x1.
321 ldcol.8 v0, x2, x1, 4
323 addp v0.4h, v0.4h, v0.4h
// Rounding right shift by 2, narrowed to bytes: (x + 2) >> 2.
324 rshrn v0.8b, v0.8h, #2
// ff_pred8x8_0lt_dc_neon: entry point declared with export=1.
// NOTE(review): further address setup, the load of v0, the widening add
// before the first addp, the broadcasts that fill v4-v7, and the tail of
// the function are not visible in this view.
330 function ff_pred8x8_0lt_dc_neon, export=1
// x3 = x0 + 4 * x1.
331 add x3, x0, x1, lsl #2
// Gather 4 bytes into lanes 4-7 of v1 (hi=1) from [x3], stepping x3 by x1.
335 ldcol.8 v1, x3, x1, 4, hi=1
// Interleave the low 32-bit lanes of v0 and v1 into v0.
336 zip1 v0.4s, v0.4s, v1.4s
// Two pairwise-add reduction steps.
338 addp v0.8h, v0.8h, v0.8h
339 addp v1.4h, v0.4h, v0.4h
// Rounding shift by 2 of v0 into v2, and by 3 of v1 into v3.
340 rshrn v2.8b, v0.8h, #2
341 rshrn v3.8b, v1.8h, #3
// Assemble the two output rows from the low words of v4/v6 and v5/v7.
346 zip1 v0.2s, v4.2s, v6.2s
347 zip1 v1.2s, v5.2s, v7.2s
// ff_pred8x8_0l0_dc_neon: entry point declared with export=1.
// NOTE(review): further address setup, the widening add that produces v2,
// and the tail of the function are not visible in this view.
351 function ff_pred8x8_0l0_dc_neon, export=1
// x2 = x0 + 4 * x1.
352 add x2, x0, x1, lsl #2
// Gather 4 bytes into lanes 0-3 of v1 from [x2], stepping x2 by x1.
354 ldcol.8 v1, x2, x1, 4
356 addp v2.4h, v2.4h, v2.4h
// Rounding right shift by 2 of v2, narrowed to bytes in v1: (x + 2) >> 2.
357 rshrn v1.8b, v2.8h, #2