2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
// h264_loop_filter_start: prologue macro expanded at the top of every
// ff_h264_*_loop_filter_*_neon function in this file.
// The visible instructions fold w6 onto itself; after both steps bits 31:24
// of w6 hold the bitwise AND of all four original bytes of w6.
// NOTE(review): no load of w6, no branch that consumes the flags and no
// `.endm` are visible in this excerpt — lines appear to be missing; confirm
// against the complete file.
25 .macro h264_loop_filter_start
// w6 = w6 & (w6 << 16)
30 and w6, w6, w6, lsl #16
// w6 = w6 & (w6 << 8); `ands` also updates the condition flags
32 ands w6, w6, w6, lsl #8
// h264_loop_filter_luma: filter core for one luma edge, 16 byte lanes wide.
// Register roles, as given by the per-instruction comments below:
//   v20 = p2, v18 = p1, v16 = p0 | v0 = q0, v2 = q1, v4 = q2
//   w2 = alpha, w3 = beta
// Written by the visible code: v17 (p1 or its filtered value), v19 (q1 or
// its filtered value), v16 (filtered p0) and the upper half of v0.
// NOTE(review): v24 is consumed here but never loaded in the visible lines,
// and several steps (initial widening of v4/v20, the producer of the lower
// clamp bound, the low half of v0, `.endm`) do not appear in this excerpt —
// confirm against the complete file before changing anything.
39 .macro h264_loop_filter_luma
// --- Build the per-lane filter mask in v21 -------------------------------
40 dup v22.16B, w2 // alpha
42 uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
44 uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
// The two `sli` steps replicate the low byte of every 32-bit lane of v24
// across all four bytes of that lane.
45 sli v24.8H, v24.8H, #8
46 uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
47 sli v24.4S, v24.4S, #16
48 cmhi v21.16B, v22.16B, v21.16B // < alpha
49 dup v22.16B, w3 // beta
// v23 = all-ones in lanes where v24 is negative (signed byte compare)
50 cmlt v23.16B, v24.16B, #0
51 cmhi v28.16B, v22.16B, v28.16B // < beta
52 cmhi v30.16B, v22.16B, v30.16B // < beta
// Clear mask lanes whose v24 byte is negative
53 bic v21.16B, v21.16B, v23.16B
54 uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
55 and v21.16B, v21.16B, v28.16B
56 uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
57 cmhi v17.16B, v22.16B, v17.16B // < beta
58 and v21.16B, v21.16B, v30.16B
59 cmhi v19.16B, v22.16B, v19.16B // < beta
// v17 / v19 = mask lanes that additionally pass the p2 / q2 beta test
60 and v17.16B, v17.16B, v21.16B
61 and v19.16B, v19.16B, v21.16B
// Zero v24 in lanes that are not filtered
62 and v24.16B, v24.16B, v21.16B
// --- Candidate p1/q1 values and the clamp bound for the p0/q0 delta ------
// v28 = rounding average of p0 and q0
63 urhadd v28.16B, v16.16B, v0.16B
// v21 = v24 - v17 - v19: v17/v19 lanes are 0 or 0xFF (-1), so each passed
// test adds one to the bound
64 sub v21.16B, v24.16B, v17.16B
65 uqadd v23.16B, v18.16B, v24.16B
66 uhadd v20.16B, v20.16B, v28.16B
67 sub v21.16B, v21.16B, v19.16B
68 uhadd v28.16B, v4.16B, v28.16B
// v23 = (p2 + avg) >> 1 limited to [p1 - v24, p1 + v24] (saturating bounds)
69 umin v23.16B, v23.16B, v20.16B
70 uqsub v22.16B, v18.16B, v24.16B
71 uqadd v4.16B, v2.16B, v24.16B
72 umax v23.16B, v23.16B, v22.16B
// v28 = (q2 + avg) >> 1 limited to [q1 - v24, q1 + v24] (saturating bounds)
73 uqsub v22.16B, v2.16B, v24.16B
74 umin v28.16B, v4.16B, v28.16B
76 umax v28.16B, v28.16B, v22.16B
// --- 16-bit delta arithmetic: v4 = low 8 lanes, v20 = high 8 lanes -------
// NOTE(review): the instructions that initialise v4/v20 as 16-bit values,
// and a shift of v4 matching the one on v20, are not visible here.
78 usubw v4.8H, v4.8H, v16.8B
79 usubw2 v20.8H, v20.8H, v16.16B
81 shl v20.8H, v20.8H, #2
82 uaddw v4.8H, v4.8H, v18.8B
83 uaddw2 v20.8H, v20.8H, v18.16B
84 usubw v4.8H, v4.8H, v2.8B
85 usubw2 v20.8H, v20.8H, v2.16B
// Rounding shift right by 3 and narrow back to bytes in v4
86 rshrn v4.8B, v4.8H, #3
87 rshrn2 v4.16B, v20.8H, #3
// v17 = v17 ? v23 : p1, v19 = v19 ? v28 : q1 (bitwise select)
88 bsl v17.16B, v23.16B, v18.16B
89 bsl v19.16B, v28.16B, v2.16B
// Signed clamp of the delta in v4: at most v21, at least v23.
// NOTE(review): the instruction that puts the lower bound into v23 is not
// visible in this excerpt.
92 smin v4.16B, v4.16B, v21.16B
94 smax v4.16B, v4.16B, v23.16B
// Add the signed delta to v28/v21 and subtract it from v22/v24 (16-bit),
// then narrow with unsigned saturation into v16 and v0.
// NOTE(review): the widening that loads v28/v21/v22/v24 is not visible.
97 saddw v28.8H, v28.8H, v4.8B
98 saddw2 v21.8H, v21.8H, v4.16B
99 ssubw v22.8H, v22.8H, v4.8B
100 ssubw2 v24.8H, v24.8H, v4.16B
101 sqxtun v16.8B, v28.8H
102 sqxtun2 v16.16B, v21.8H
104 sqxtun2 v0.16B, v24.8H
// ff_h264_v_loop_filter_luma_neon: applies h264_loop_filter_luma to six
// 16-byte rows around the address in x0.
//   x0 = pixel pointer, x1 = row stride (used as post-increment)
//   w2 / w3 = alpha / beta (consumed inside the filter macro)
// NOTE(review): no `ret`/`endfunc` and no store of v19 are visible in this
// excerpt — lines appear to be missing; confirm against the complete file.
107 function ff_h264_v_loop_filter_luma_neon, export=1
108 h264_loop_filter_start
// Rows 0, +1, +2 relative to x0 -> v0, v2, v4 (q0, q1, q2 in the macro)
111 ld1 {v0.16B}, [x0], x1
112 ld1 {v2.16B}, [x0], x1
113 ld1 {v4.16B}, [x0], x1
// Step back 4 + 2 = 6 rows, i.e. to three rows before the original x0
114 sub x0, x0, x1, lsl #2
115 sub x0, x0, x1, lsl #1
// Rows -3, -2, -1 -> v20, v18, v16 (p2, p1, p0 in the macro)
116 ld1 {v20.16B}, [x0], x1
117 ld1 {v18.16B}, [x0], x1
118 ld1 {v16.16B}, [x0], x1
120 h264_loop_filter_luma
// x0 is back at row 0 after the loads; rewind two rows and write the
// results to rows -2, -1 and 0
122 sub x0, x0, x1, lsl #1
123 st1 {v17.16B}, [x0], x1
124 st1 {v16.16B}, [x0], x1
125 st1 {v0.16B}, [x0], x1
// ff_h264_h_loop_filter_luma_neon: loads 16 rows of 8 bytes, transposes
// them so h264_loop_filter_luma can run, then transposes four result
// registers back and writes 4 bytes per row over 16 rows.
//   x0 = pixel pointer, x1 = row stride (used as post-increment)
// transpose_8x16B / transpose_4x16B are not defined in this excerpt
// (presumably provided by the included asm.S — verify).
// NOTE(review): any horizontal adjustment of x0 before the loads or before
// the stores, and the `ret`/`endfunc`, are not visible here — confirm
// against the complete file.
131 function ff_h264_h_loop_filter_luma_neon, export=1
132 h264_loop_filter_start
// Rows 0..7 -> low 64 bits of v6, v20, v18, v16, v0, v2, v4, v26
135 ld1 {v6.8B}, [x0], x1
136 ld1 {v20.8B}, [x0], x1
137 ld1 {v18.8B}, [x0], x1
138 ld1 {v16.8B}, [x0], x1
139 ld1 {v0.8B}, [x0], x1
140 ld1 {v2.8B}, [x0], x1
141 ld1 {v4.8B}, [x0], x1
142 ld1 {v26.8B}, [x0], x1
// Rows 8..15 -> high 64 bits of the same registers, in the same order
143 ld1 {v6.D}[1], [x0], x1
144 ld1 {v20.D}[1], [x0], x1
145 ld1 {v18.D}[1], [x0], x1
146 ld1 {v16.D}[1], [x0], x1
147 ld1 {v0.D}[1], [x0], x1
148 ld1 {v2.D}[1], [x0], x1
149 ld1 {v4.D}[1], [x0], x1
150 ld1 {v26.D}[1], [x0], x1
// v21 and v23 are passed as the trailing (presumably scratch) arguments
152 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
154 h264_loop_filter_luma
// Transpose the four filter outputs back before storing
156 transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
// Rewind the 16 rows consumed by the loads
158 sub x0, x0, x1, lsl #4
// One 32-bit lane per row: lane 0 of v17, v16, v0, v19 goes to rows 0..3,
// lane 1 to rows 4..7, lane 2 to rows 8..11, lane 3 to rows 12..15
160 st1 {v17.S}[0], [x0], x1
161 st1 {v16.S}[0], [x0], x1
162 st1 {v0.S}[0], [x0], x1
163 st1 {v19.S}[0], [x0], x1
164 st1 {v17.S}[1], [x0], x1
165 st1 {v16.S}[1], [x0], x1
166 st1 {v0.S}[1], [x0], x1
167 st1 {v19.S}[1], [x0], x1
168 st1 {v17.S}[2], [x0], x1
169 st1 {v16.S}[2], [x0], x1
170 st1 {v0.S}[2], [x0], x1
171 st1 {v19.S}[2], [x0], x1
172 st1 {v17.S}[3], [x0], x1
173 st1 {v16.S}[3], [x0], x1
174 st1 {v0.S}[3], [x0], x1
175 st1 {v19.S}[3], [x0], x1
// h264_loop_filter_chroma: filter core for one chroma edge, 8 byte lanes.
// Register roles, as given by the per-instruction comments below:
//   v18 = p1, v16 = p0 | v0 = q0, v2 = q1, w2 = alpha, w3 = beta
// The visible code writes the filtered p0 to v16.
// NOTE(review): v24 and v25 are consumed but not produced in the visible
// lines, and the widening that initialises v4/v22/v28 as 16-bit values, the
// narrowing into v0 and `.endm` do not appear in this excerpt — confirm
// against the complete file.
180 .macro h264_loop_filter_chroma
181 dup v22.8B, w2 // alpha
183 uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
185 uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
// 16-bit delta in v4 is built up interleaved with the mask computation:
// v4 -= p0 here, then += p1 and -= q1 below
186 usubw v4.8H, v4.8H, v16.8B
// Replicate the low byte of each 16-bit lane of v24 into its high byte
187 sli v24.8H, v24.8H, #8
189 uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
190 uaddw v4.8H, v4.8H, v18.8B
191 cmhi v26.8B, v22.8B, v26.8B // < alpha
192 usubw v4.8H, v4.8H, v2.8B
193 dup v22.8B, w3 // beta
// Rounding shift right by 3, narrowed to bytes
194 rshrn v4.8B, v4.8H, #3
195 cmhi v28.8B, v22.8B, v28.8B // < beta
196 cmhi v30.8B, v22.8B, v30.8B // < beta
// Signed clamp of the delta: at most v24, at least v25
197 smin v4.8B, v4.8B, v24.8B
199 and v26.8B, v26.8B, v28.8B
200 smax v4.8B, v4.8B, v25.8B
// v26 = lanes passing the alpha test and both beta tests
201 and v26.8B, v26.8B, v30.8B
// Zero the delta in lanes that are not filtered
203 and v4.8B, v4.8B, v26.8B
// Add the signed delta to v28 and subtract it from v22 (16-bit), then
// narrow v28 with unsigned saturation into v16
205 saddw v28.8H, v28.8H, v4.8B
206 ssubw v22.8H, v22.8H, v4.8B
207 sqxtun v16.8B, v28.8H
// ff_h264_v_loop_filter_chroma_neon: applies h264_loop_filter_chroma to
// 8-byte rows around the address in x0 and writes back two rows.
//   x0 = pixel pointer, x1 = row stride (used as post-increment)
// NOTE(review): the load of v2 (q1 in the macro) and the `ret`/`endfunc`
// are not visible in this excerpt — confirm against the complete file.
211 function ff_h264_v_loop_filter_chroma_neon, export=1
212 h264_loop_filter_start
// Start two rows before x0
214 sub x0, x0, x1, lsl #1
// Rows -2, -1, 0 -> v18, v16, v0 (p1, p0, q0 in the macro)
215 ld1 {v18.8B}, [x0], x1
216 ld1 {v16.8B}, [x0], x1
217 ld1 {v0.8B}, [x0], x1
220 h264_loop_filter_chroma
// x0 is at row +1 after the three loads; rewinding two rows lands on
// row -1, so v16 and v0 are written to rows -1 and 0
222 sub x0, x0, x1, lsl #1
223 st1 {v16.8B}, [x0], x1
224 st1 {v0.8B}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon: loads 8 rows of 4 bytes, transposes
// them for h264_loop_filter_chroma, transposes back and stores all four
// registers to the same 8 rows.
//   x0 = pixel pointer, x1 = row stride (used as post-increment)
// transpose_4x8B is not defined in this excerpt (presumably provided by
// the included asm.S — verify).
// NOTE(review): any horizontal adjustment of x0 and the `ret`/`endfunc`
// are not visible here — confirm against the complete file.
229 function ff_h264_h_loop_filter_chroma_neon, export=1
230 h264_loop_filter_start
// Rows 0..3 -> 32-bit lane 0 of v18, v16, v0, v2
233 ld1 {v18.S}[0], [x0], x1
234 ld1 {v16.S}[0], [x0], x1
235 ld1 {v0.S}[0], [x0], x1
236 ld1 {v2.S}[0], [x0], x1
// Rows 4..7 -> 32-bit lane 1 of the same registers
237 ld1 {v18.S}[1], [x0], x1
238 ld1 {v16.S}[1], [x0], x1
239 ld1 {v0.S}[1], [x0], x1
240 ld1 {v2.S}[1], [x0], x1
// v28..v31 are passed as the trailing (presumably scratch) arguments
242 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
244 h264_loop_filter_chroma
// Same transpose again to return to the row layout used by the loads
246 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
// Rewind the 8 rows consumed by the loads
248 sub x0, x0, x1, lsl #3
// Store in the same lane/row pattern as the loads
249 st1 {v18.S}[0], [x0], x1
250 st1 {v16.S}[0], [x0], x1
251 st1 {v0.S}[0], [x0], x1
252 st1 {v2.S}[0], [x0], x1
253 st1 {v18.S}[1], [x0], x1
254 st1 {v16.S}[1], [x0], x1
255 st1 {v0.S}[1], [x0], x1
256 st1 {v2.S}[1], [x0], x1
// biweight_16 macs, macd: body for 16-byte-wide rows, two rows per pass.
//   \macd = accumulate mnemonic applied to rows read from x0 (times v0)
//   \macs = accumulate mnemonic applied to rows read from x1 (times v1)
//   x0, x1 = input pointers, x7 = output pointer, x2 = stride for all three
//   v18    = per-lane shift amount for `sshl` (negative values shift right)
// Accumulators: v4/v6 = low/high 8 lanes of the first row,
//               v24/v26 = low/high 8 lanes of the second row.
// NOTE(review): the set-up of v0, v1, v18 and the accumulators, the loop
// label/counter, the low-half narrowing of v4 and `.endm` are not visible
// in this excerpt — confirm against the complete file.
261 .macro biweight_16 macs, macd
// First row: accumulate v0 * [x0] and v1 * [x1] into v4/v6
267 ld1 {v20.16B}, [x0], x2
268 \macd v4.8H, v0.8B, v20.8B
269 \macd\()2 v6.8H, v0.16B, v20.16B
270 ld1 {v22.16B}, [x1], x2
271 \macs v4.8H, v1.8B, v22.8B
272 \macs\()2 v6.8H, v1.16B, v22.16B
// Second row: accumulate into v24/v26
274 ld1 {v28.16B}, [x0], x2
276 \macd v24.8H, v0.8B, v28.8B
277 \macd\()2 v26.8H, v0.16B, v28.16B
278 ld1 {v30.16B}, [x1], x2
279 \macs v24.8H, v1.8B, v30.8B
280 \macs\()2 v26.8H, v1.16B, v30.16B
// Scale each 16-bit lane by the shift in v18, then narrow to bytes with
// unsigned saturation
281 sshl v4.8H, v4.8H, v18.8H
282 sshl v6.8H, v6.8H, v18.8H
284 sqxtun2 v4.16B, v6.8H
285 sshl v24.8H, v24.8H, v18.8H
286 sshl v26.8H, v26.8H, v18.8H
287 sqxtun v24.8B, v24.8H
288 sqxtun2 v24.16B, v26.8H
// Write the two result rows
290 st1 {v4.16B}, [x7], x2
292 st1 {v24.16B}, [x7], x2
// biweight_8 macs, macd: body for 8-byte-wide rows, two rows per pass.
//   \macd = accumulate mnemonic applied to rows read from x0 (times v0)
//   \macs = accumulate mnemonic applied to rows read from x1 (times v1)
//   x0, x1 = input pointers, x7 = output pointer, x2 = stride for all three
//   v18    = per-lane shift amount for `sshl` (negative values shift right)
// Accumulators: v2 = first row, v20 = second row.
// NOTE(review): the set-up of v0, v1, v18 and the accumulators, the loop
// label/counter, the narrowing that produces the bytes stored from v2 and
// v4, and `.endm` are not visible in this excerpt — confirm against the
// complete file.
297 .macro biweight_8 macs, macd
// First row
303 ld1 {v4.8B}, [x0], x2
304 \macd v2.8H, v0.8B, v4.8B
305 ld1 {v5.8B}, [x1], x2
306 \macs v2.8H, v1.8B, v5.8B
// Second row
307 ld1 {v6.8B}, [x0], x2
308 \macd v20.8H, v0.8B, v6.8B
309 ld1 {v7.8B}, [x1], x2
310 \macs v20.8H, v1.8B, v7.8B
// Scale each 16-bit lane by the shift in v18
311 sshl v2.8H, v2.8H, v18.8H
313 sshl v20.8H, v20.8H, v18.8H
// Write the two result rows
316 st1 {v2.8B}, [x7], x2
318 st1 {v4.8B}, [x7], x2
// biweight_4 macs, macd: body for 4-byte-wide rows. Two rows are packed
// into the two 32-bit lanes of one 64-bit register, so the main path
// handles four rows and the path at label 2 stores two rows.
//   \macd = accumulate mnemonic applied to rows read from x0 (times v0)
//   \macs = accumulate mnemonic applied to rows read from x1 (times v1)
//   x0, x1 = input pointers, x7 = output pointer, x2 = stride for all three
//   v18    = per-lane shift amount for `sshl` (negative values shift right)
// NOTE(review): the set-up of v0, v1, v18 and the accumulators, the
// loop/branch instructions that reach label 2, the narrowing that produces
// the bytes stored from v2 and v4, and `.endm` are not visible in this
// excerpt — confirm against the complete file.
323 .macro biweight_4 macs, macd
// Rows 0 and 1 from x0 -> v4, accumulated into v2
329 ld1 {v4.S}[0], [x0], x2
330 ld1 {v4.S}[1], [x0], x2
331 \macd v2.8H, v0.8B, v4.8B
// Rows 0 and 1 from x1 -> v5, accumulated into v2
332 ld1 {v5.S}[0], [x1], x2
333 ld1 {v5.S}[1], [x1], x2
334 \macs v2.8H, v1.8B, v5.8B
// Rows 2 and 3 from x0 / x1 -> v6 / v7, accumulated into v20
336 ld1 {v6.S}[0], [x0], x2
337 ld1 {v6.S}[1], [x0], x2
338 \macd v20.8H, v0.8B, v6.8B
339 ld1 {v7.S}[0], [x1], x2
340 ld1 {v7.S}[1], [x1], x2
341 \macs v20.8H, v1.8B, v7.8B
// Scale each 16-bit lane by the shift in v18
342 sshl v2.8H, v2.8H, v18.8H
344 sshl v20.8H, v20.8H, v18.8H
// Write four result rows, one 32-bit lane each
347 st1 {v2.S}[0], [x7], x2
348 st1 {v2.S}[1], [x7], x2
350 st1 {v4.S}[0], [x7], x2
351 st1 {v4.S}[1], [x7], x2
// Label 2: scale v2 and store two rows only
354 2: sshl v2.8H, v2.8H, v18.8H
356 st1 {v2.S}[0], [x7], x2
357 st1 {v2.S}[1], [x7], x2
// biweight_func w: emits the exported function
// ff_biweight_h264_pixels_<w>_neon and expands biweight_<w> four times,
// once for each combination of umlal/umlsl passed as (macs, macd).
// NOTE(review): the argument set-up, the dispatch that selects one of the
// four expansions, the labels other than 10, and `endfunc`/`.endm` are not
// visible in this excerpt — confirm against the complete file.
361 .macro biweight_func w
362 function ff_biweight_h264_pixels_\w\()_neon, export=1
// w8 ^= (w6 >> 30), logical shift; the instruction that initialises w8 is
// not visible here
366 eor w8, w8, w6, lsr #30
// macs = umlal, macd = umlal
379 10: biweight_\w umlal, umlal
// macs = umlal, macd = umlsl
381 biweight_\w umlal, umlsl
// macs = umlsl, macd = umlsl
384 biweight_\w umlsl, umlsl
// macs = umlsl, macd = umlal
386 biweight_\w umlsl, umlal
// NOTE(review): headerless fragment. `\add` is a macro-argument reference,
// so these lines belong to a macro whose `.macro` line is not visible in
// this excerpt (presumably the 16-wide single-source weight body — verify).
// Visible behaviour: two 16-byte rows are read from x0 (stride x1),
// multiplied by v0 into 16-bit lanes, combined with v16 through \add,
// shifted with rounding by the per-lane amount in v18, narrowed to bytes
// with unsigned saturation and written to x5 (stride x1).
// The low-half narrowing of v4 is not visible here.
397 ld1 {v20.16B}, [x0], x1
398 umull v4.8H, v0.8B, v20.8B
399 umull2 v6.8H, v0.16B, v20.16B
400 ld1 {v28.16B}, [x0], x1
401 umull v24.8H, v0.8B, v28.8B
402 umull2 v26.8H, v0.16B, v28.16B
// First row: \add with v16, then rounding shift by v18
403 \add v4.8H, v16.8H, v4.8H
404 srshl v4.8H, v4.8H, v18.8H
405 \add v6.8H, v16.8H, v6.8H
406 srshl v6.8H, v6.8H, v18.8H
408 sqxtun2 v4.16B, v6.8H
// Second row: same sequence on v24/v26
409 \add v24.8H, v16.8H, v24.8H
410 srshl v24.8H, v24.8H, v18.8H
411 \add v26.8H, v16.8H, v26.8H
412 srshl v26.8H, v26.8H, v18.8H
413 sqxtun v24.8B, v24.8H
414 sqxtun2 v24.16B, v26.8H
// Write the two result rows
415 st1 {v4.16B}, [x5], x1
416 st1 {v24.16B}, [x5], x1
// NOTE(review): headerless fragment. `\add` is a macro-argument reference,
// so these lines belong to a macro whose `.macro` line is not visible in
// this excerpt (presumably the 8-wide single-source weight body — verify).
// Visible behaviour: two 8-byte rows are read from x0 (stride x1),
// multiplied by v0 into 16-bit lanes (v2, v20), combined with v16 through
// \add, shifted with rounding by the per-lane amount in v18, and two rows
// are written to x5 (stride x1).
// The narrowing that produces the bytes stored from v2 and v4 is not
// visible here.
424 ld1 {v4.8B}, [x0], x1
425 umull v2.8H, v0.8B, v4.8B
426 ld1 {v6.8B}, [x0], x1
427 umull v20.8H, v0.8B, v6.8B
// First row
428 \add v2.8H, v16.8H, v2.8H
429 srshl v2.8H, v2.8H, v18.8H
// Second row
431 \add v20.8H, v16.8H, v20.8H
432 srshl v20.8H, v20.8H, v18.8H
// Write the two result rows
434 st1 {v2.8B}, [x5], x1
435 st1 {v4.8B}, [x5], x1
// NOTE(review): headerless fragment. `\add` is a macro-argument reference,
// so these lines belong to a macro whose `.macro` line is not visible in
// this excerpt (presumably the 4-wide single-source weight body — verify).
// Visible behaviour: 4-byte rows are read from x0 (stride x1), two rows per
// 64-bit register, multiplied by v0 into 16-bit lanes, combined with v16
// through \add, shifted with rounding by the per-lane amount in v18, and
// written to x5 one 32-bit lane per row. The path at label 2 processes v2
// only and stores two rows.
// The branches that reach label 2 and the narrowing that produces the
// bytes stored from v2 and v4 are not visible here.
// Rows 0 and 1 -> v4, product in v2
443 ld1 {v4.S}[0], [x0], x1
444 ld1 {v4.S}[1], [x0], x1
445 umull v2.8H, v0.8B, v4.8B
// Rows 2 and 3 -> v6, product in v20
447 ld1 {v6.S}[0], [x0], x1
448 ld1 {v6.S}[1], [x0], x1
449 umull v20.8H, v0.8B, v6.8B
450 \add v2.8H, v16.8H, v2.8H
451 srshl v2.8H, v2.8H, v18.8H
453 \add v20.8H, v16.8H, v20.8H
454 srshl v20.8H, v20.8h, v18.8H
// Write four result rows
456 st1 {v2.S}[0], [x5], x1
457 st1 {v2.S}[1], [x5], x1
458 st1 {v4.S}[0], [x5], x1
459 st1 {v4.S}[1], [x5], x1
// Label 2: two-row path using v2 only
462 2: \add v2.8H, v16.8H, v2.8H
463 srshl v2.8H, v2.8H, v18.8H
465 st1 {v2.S}[0], [x5], x1
466 st1 {v2.S}[1], [x5], x1
471 function ff_weight_h264_pixels_\w\()_neon, export=1