2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/aarch64/asm.S"
// h264_loop_filter_start: folds the four bytes of w6 together with AND.
// After the two instructions below, bits 31:24 of w6 hold the AND of all four
// original bytes of w6 and every lower bit is zero; ANDS sets N only when
// every original byte had its top bit set.
// NOTE(review): w6 presumably carries four packed per-edge tc0 values and the
// flags presumably feed an early-exit branch -- confirm; neither the load of
// w6 nor any branch is visible in this span.
25 .macro h264_loop_filter_start
30 and w6, w6, w6, lsl #16 // bits 31:16 = high half AND low half, bits 15:0 = 0
32 ands w6, w6, w6, lsl #8 // bits 31:24 = AND of all four bytes; sets the flags
// h264_loop_filter_luma: filters 16 luma edge positions in parallel.
// Register roles, as given by the operand comments below:
//   v16 = p0, v18 = p1, v20 = p2, v0 = q0, v2 = q1, v4 = q2
//   w2 = alpha, w3 = beta (the low byte of each is broadcast to all lanes)
// Registers written with results by the visible code: v17 and v19 (selected
// p1 / q1 values), v16 and v0 (narrowed p0 / q0 sums).
// Scratch registers overwritten: v4, v20, v21, v22, v23, v24, v28, v30.
// NOTE(review): v24 is presumably the per-edge tc0 values prepared by the
// caller, and the widening moves that feed the 16-bit arithmetic are not
// visible in this span -- confirm before relying on the notes below.
39 .macro h264_loop_filter_luma
40 dup v22.16B, w2 // alpha
42 uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
44 uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
// The two SLI steps copy the low byte of each 32-bit lane of v24 into all
// four bytes of that lane.
45 sli v24.8H, v24.8H, #8
46 uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
47 sli v24.4S, v24.4S, #16
48 cmhi v21.16B, v22.16B, v21.16B // < alpha
49 dup v22.16B, w3 // beta
// v23 = mask of lanes where v24 is negative as a signed byte; the BIC below
// removes those lanes from the filter mask v21.
50 cmlt v23.16B, v24.16B, #0
51 cmhi v28.16B, v22.16B, v28.16B // < beta
52 cmhi v30.16B, v22.16B, v30.16B // < beta
53 bic v21.16B, v21.16B, v23.16B
54 uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
55 and v21.16B, v21.16B, v28.16B
56 uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
57 cmhi v17.16B, v22.16B, v17.16B // < beta
58 and v21.16B, v21.16B, v30.16B
59 cmhi v19.16B, v22.16B, v19.16B // < beta
// v21 is now the per-lane filter mask. v17 / v19 additionally require
// abs(p2 - p0) < beta and abs(q2 - q0) < beta; v24 is zeroed in lanes that
// are not filtered.
60 and v17.16B, v17.16B, v21.16B
61 and v19.16B, v19.16B, v21.16B
62 and v24.16B, v24.16B, v21.16B
63 urhadd v28.16B, v16.16B, v0.16B // v28 = (p0 + q0 + 1) >> 1
// Subtracting an all-ones mask lane adds 1, so after both SUBs
// v21 = v24 + (v17 lane set ? 1 : 0) + (v19 lane set ? 1 : 0).
64 sub v21.16B, v24.16B, v17.16B
// p1 candidate in v23: (p2 + v28) >> 1, limited to [p1 - v24, p1 + v24]
// using unsigned saturating add/sub for the bounds.
65 uqadd v23.16B, v18.16B, v24.16B
66 uhadd v20.16B, v20.16B, v28.16B
67 sub v21.16B, v21.16B, v19.16B
// q1 candidate in v28: (q2 + v28) >> 1, limited to [q1 - v24, q1 + v24].
// v4 (q2) is read here and then reused as scratch for q1 + v24.
68 uhadd v28.16B, v4.16B, v28.16B
69 umin v23.16B, v23.16B, v20.16B
70 uqsub v22.16B, v18.16B, v24.16B
71 uqadd v4.16B, v2.16B, v24.16B
72 umax v23.16B, v23.16B, v22.16B
73 uqsub v22.16B, v2.16B, v24.16B
74 umin v28.16B, v4.16B, v28.16B
76 umax v28.16B, v28.16B, v22.16B
// 16-bit delta computation in v4 (low 8 lanes) and v20 (high 8 lanes):
// subtract p0, add p1, subtract q1, then a rounding shift right by 3 while
// narrowing back to bytes.
// NOTE(review): v4.8H / v20.8H are presumably widened q0 on entry to this
// group, and only the high-half multiply by 4 (SHL) is visible -- confirm.
78 usubw v4.8H, v4.8H, v16.8B
79 usubw2 v20.8H, v20.8H, v16.16B
81 shl v20.8H, v20.8H, #2
82 uaddw v4.8H, v4.8H, v18.8B
83 uaddw2 v20.8H, v20.8H, v18.16B
84 usubw v4.8H, v4.8H, v2.8B
85 usubw2 v20.8H, v20.8H, v2.16B
86 rshrn v4.8B, v4.8H, #3
87 rshrn2 v4.16B, v20.8H, #3
// BSL keeps the candidate where the mask lane is set and the original
// p1 / q1 elsewhere.
88 bsl v17.16B, v23.16B, v18.16B
89 bsl v19.16B, v28.16B, v2.16B
// Signed clamp of the delta: upper bound v21, lower bound v23.
// NOTE(review): v23 presumably holds the negated bound at this point; the
// instruction that sets it is not visible here -- confirm.
92 smin v4.16B, v4.16B, v21.16B
94 smax v4.16B, v4.16B, v23.16B
// Add the signed delta to the v28/v21 pair and subtract it from the v22/v24
// pair in 16 bits, then narrow with unsigned saturation.
// NOTE(review): v28/v21 and v22/v24 are presumably widened p0 and q0; the
// widening and the low-half narrow into v0 are not visible -- confirm.
97 saddw v28.8H, v28.8H, v4.8B
98 saddw2 v21.8H, v21.8H, v4.16B
99 ssubw v22.8H, v22.8H, v4.8B
100 ssubw2 v24.8H, v24.8H, v4.16B
101 sqxtun v16.8B, v28.8H
102 sqxtun2 v16.16B, v21.8H
104 sqxtun2 v0.16B, v24.8H
// ff_h264_v_loop_filter_luma_neon: loads six 16-byte rows around the address
// in x0, runs h264_loop_filter_luma on them and stores filtered rows back.
//   x0 = address of the row loaded into v0 (the q0 row of the luma macro)
//   x1 = byte distance between rows (used as the post-index increment)
// w2, w3 and w6 are consumed by the two macros invoked below.
107 function ff_h264_v_loop_filter_luma_neon, export=1
108 h264_loop_filter_start
// Rows 0, +1, +2 relative to x0 -> v0 (q0), v2 (q1), v4 (q2).
111 ld1 {v0.16B}, [x0], x1
112 ld1 {v2.16B}, [x0], x1
113 ld1 {v4.16B}, [x0], x1
// x0 is three rows past its entry value; step back six rows to row -3.
114 sub x0, x0, x1, lsl #2
115 sub x0, x0, x1, lsl #1
// Rows -3, -2, -1 -> v20 (p2), v18 (p1), v16 (p0); x0 ends back at row 0.
116 ld1 {v20.16B}, [x0], x1
117 ld1 {v18.16B}, [x0], x1
118 ld1 {v16.16B}, [x0], x1
120 h264_loop_filter_luma
// Rewind to row -2 and write v17, v16, v0 to rows -2, -1, 0.
// NOTE(review): v19 (the q1 result of the macro) is not stored by any visible
// instruction; a store to row +1 is presumably expected -- confirm.
122 sub x0, x0, x1, lsl #1
123 st1 {v17.16B}, [x0], x1
124 st1 {v16.16B}, [x0], x1
125 st1 {v0.16B}, [x0], x1
// ff_h264_h_loop_filter_luma_neon: gathers 8 bytes from each of 16
// consecutive rows, transposes them, runs h264_loop_filter_luma, transposes
// four result registers back and writes 4 bytes to each of the 16 rows.
//   x0 = address of the first 8-byte row segment, x1 = byte distance between rows
// NOTE(review): any adjustment of x0 relative to the edge column, before the
// loads or before the stores, is not visible in this span -- confirm.
131 function ff_h264_h_loop_filter_luma_neon, export=1
132 h264_loop_filter_start
// Rows 0..7 fill the low 64 bits of v6, v20, v18, v16, v0, v2, v4, v26.
136 ld1 {v6.8B}, [x0], x1
137 ld1 {v20.8B}, [x0], x1
138 ld1 {v18.8B}, [x0], x1
139 ld1 {v16.8B}, [x0], x1
140 ld1 {v0.8B}, [x0], x1
141 ld1 {v2.8B}, [x0], x1
142 ld1 {v4.8B}, [x0], x1
143 ld1 {v26.8B}, [x0], x1
// Rows 8..15 fill the high 64 bits of the same registers, in the same order.
144 ld1 {v6.D}[1], [x0], x1
145 ld1 {v20.D}[1], [x0], x1
146 ld1 {v18.D}[1], [x0], x1
147 ld1 {v16.D}[1], [x0], x1
148 ld1 {v0.D}[1], [x0], x1
149 ld1 {v2.D}[1], [x0], x1
150 ld1 {v4.D}[1], [x0], x1
151 ld1 {v26.D}[1], [x0], x1
// transpose_8x16B is defined outside this span; presumably it leaves one
// pixel column per register so that v20/v18/v16/v0/v2/v4 match the
// p2/p1/p0/q0/q1/q2 roles of the luma macro, with v21 and v23 as scratch
// -- confirm.
153 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
155 h264_loop_filter_luma
// Transpose the four result registers back to row order (macro defined
// outside this span; v21, v23, v25, v27 are passed as additional operands).
157 transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
// x0 advanced 16 rows during the loads; rewind to the first row.
159 sub x0, x0, x1, lsl #4
// Each store writes one 32-bit lane (4 bytes) to one row. Rows are taken in
// the order v17, v16, v0, v19 for lane 0, then the same order for lanes 1..3.
161 st1 {v17.S}[0], [x0], x1
162 st1 {v16.S}[0], [x0], x1
163 st1 {v0.S}[0], [x0], x1
164 st1 {v19.S}[0], [x0], x1
165 st1 {v17.S}[1], [x0], x1
166 st1 {v16.S}[1], [x0], x1
167 st1 {v0.S}[1], [x0], x1
168 st1 {v19.S}[1], [x0], x1
169 st1 {v17.S}[2], [x0], x1
170 st1 {v16.S}[2], [x0], x1
171 st1 {v0.S}[2], [x0], x1
172 st1 {v19.S}[2], [x0], x1
173 st1 {v17.S}[3], [x0], x1
174 st1 {v16.S}[3], [x0], x1
175 st1 {v0.S}[3], [x0], x1
176 st1 {v19.S}[3], [x0], x1
// h264_loop_filter_chroma: filters 8 chroma edge positions in parallel.
// Register roles, as given by the operand comments below:
//   v18 = p1, v16 = p0, v0 = q0, v2 = q1
//   w2 = alpha, w3 = beta (the low byte of each is broadcast to all lanes)
// The mask computation (v26) and the delta computation (v4) are interleaved.
// The only result written by the visible code is v16.
// NOTE(review): v24 / v25 are presumably the positive and negated clip
// limits, and v4.8H, v28.8H, v22.8H are presumably widened pixel values set
// up by instructions not visible in this span -- confirm.
181 .macro h264_loop_filter_chroma
182 dup v22.8B, w2 // alpha
184 uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
186 uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
187 usubw v4.8H, v4.8H, v16.8B // delta: subtract p0
188 sli v24.8H, v24.8H, #8 // copy the low byte of each 16-bit lane into its high byte
190 uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
191 uaddw v4.8H, v4.8H, v18.8B // delta: add p1
192 cmhi v26.8B, v22.8B, v26.8B // < alpha
193 usubw v4.8H, v4.8H, v2.8B // delta: subtract q1
194 dup v22.8B, w3 // beta
195 rshrn v4.8B, v4.8H, #3 // rounding shift right by 3, narrow to bytes
196 cmhi v28.8B, v22.8B, v28.8B // < beta
197 cmhi v30.8B, v22.8B, v30.8B // < beta
198 smin v4.8B, v4.8B, v24.8B // signed clamp: upper bound v24
200 and v26.8B, v26.8B, v28.8B
201 smax v4.8B, v4.8B, v25.8B // signed clamp: lower bound v25
202 and v26.8B, v26.8B, v30.8B
// v26 = all three threshold tests passed; zero the delta in other lanes.
204 and v4.8B, v4.8B, v26.8B
// Add the signed delta to v28.8H, subtract it from v22.8H, then narrow v28
// into v16 with unsigned saturation.
206 saddw v28.8H, v28.8H, v4.8B
207 ssubw v22.8H, v22.8H, v4.8B
208 sqxtun v16.8B, v28.8H
// ff_h264_v_loop_filter_chroma_neon: loads 8-byte rows around the address in
// x0, runs h264_loop_filter_chroma and stores two rows back.
//   x0 = address of the row loaded into v0 (the q0 row of the chroma macro)
//   x1 = byte distance between rows
// NOTE(review): the chroma macro also reads v2 (q1); no load of v2 is visible
// in this span -- confirm.
212 function ff_h264_v_loop_filter_chroma_neon, export=1
213 h264_loop_filter_start
// Step back two rows, then load rows -2, -1, 0 -> v18 (p1), v16 (p0), v0 (q0).
216 sub x0, x0, x1, lsl #1
217 ld1 {v18.8B}, [x0], x1
218 ld1 {v16.8B}, [x0], x1
219 ld1 {v0.8B}, [x0], x1
222 h264_loop_filter_chroma
// x0 is one row past its entry value; rewind two rows to row -1 and write
// v16 and v0 to rows -1 and 0.
224 sub x0, x0, x1, lsl #1
225 st1 {v16.8B}, [x0], x1
226 st1 {v0.8B}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon: gathers 4 bytes from each of 8
// consecutive rows, transposes, runs h264_loop_filter_chroma, transposes back
// and rewrites the same 4 bytes of each of the 8 rows.
//   x0 = address of the first 4-byte row segment, x1 = byte distance between rows
// NOTE(review): any adjustment of x0 relative to the edge column before the
// loads is not visible in this span -- confirm.
231 function ff_h264_h_loop_filter_chroma_neon, export=1
232 h264_loop_filter_start
// Rows 0..3 -> 32-bit lane 0 of v18, v16, v0, v2.
236 ld1 {v18.S}[0], [x0], x1
237 ld1 {v16.S}[0], [x0], x1
238 ld1 {v0.S}[0], [x0], x1
239 ld1 {v2.S}[0], [x0], x1
// Rows 4..7 -> 32-bit lane 1 of the same registers.
240 ld1 {v18.S}[1], [x0], x1
241 ld1 {v16.S}[1], [x0], x1
242 ld1 {v0.S}[1], [x0], x1
243 ld1 {v2.S}[1], [x0], x1
// transpose_4x8B is defined outside this span; presumably it leaves one
// pixel column per register (v18 = p1, v16 = p0, v0 = q0, v2 = q1) with
// v28..v31 as scratch -- confirm.
245 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
247 h264_loop_filter_chroma
// Same transpose again to return to row order.
249 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
// x0 advanced 8 rows during the loads; rewind and store in load order.
251 sub x0, x0, x1, lsl #3
252 st1 {v18.S}[0], [x0], x1
253 st1 {v16.S}[0], [x0], x1
254 st1 {v0.S}[0], [x0], x1
255 st1 {v2.S}[0], [x0], x1
256 st1 {v18.S}[1], [x0], x1
257 st1 {v16.S}[1], [x0], x1
258 st1 {v0.S}[1], [x0], x1
259 st1 {v2.S}[1], [x0], x1
// biweight_16: processes two 16-byte rows from each of two sources.
//   macs = multiply-accumulate mnemonic applied to the rows read from x1 (with v1)
//   macd = multiply-accumulate mnemonic applied to the rows read from x0 (with v0)
// The "2" suffix is appended to each mnemonic for the upper eight bytes.
//   x0, x1 = source row pointers, x7 = store pointer, x2 = byte distance
//            between rows (post-index increment for all three pointers)
// NOTE(review): v0 / v1 are presumably the broadcast weights, v18 presumably
// a negative per-lane shift count, and the 16-bit accumulators v4, v6, v24,
// v26 are presumably preloaded with a rounding offset; none of that setup,
// nor the low-half narrow of v4, is visible in this span -- confirm.
264 .macro biweight_16 macs, macd
// First row pair: accumulate into v4 (low 8 bytes) and v6 (high 8 bytes).
270 ld1 {v20.16B}, [x0], x2
271 \macd v4.8H, v0.8B, v20.8B
272 \macd\()2 v6.8H, v0.16B, v20.16B
273 ld1 {v22.16B}, [x1], x2
274 \macs v4.8H, v1.8B, v22.8B
275 \macs\()2 v6.8H, v1.16B, v22.16B
// Second row pair: accumulate into v24 (low 8 bytes) and v26 (high 8 bytes).
277 ld1 {v28.16B}, [x0], x2
279 \macd v24.8H, v0.8B, v28.8B
280 \macd\()2 v26.8H, v0.16B, v28.16B
281 ld1 {v30.16B}, [x1], x2
282 \macs v24.8H, v1.8B, v30.8B
283 \macs\()2 v26.8H, v1.16B, v30.16B
// SSHL shifts each 16-bit lane by the signed count in v18 (a negative count
// shifts right); SQXTUN narrows to bytes with unsigned saturation.
284 sshl v4.8H, v4.8H, v18.8H
285 sshl v6.8H, v6.8H, v18.8H
287 sqxtun2 v4.16B, v6.8H
288 sshl v24.8H, v24.8H, v18.8H
289 sshl v26.8H, v26.8H, v18.8H
290 sqxtun v24.8B, v24.8H
291 sqxtun2 v24.16B, v26.8H
// Write the two result rows through x7.
293 st1 {v4.16B}, [x7], x2
295 st1 {v24.16B}, [x7], x2
// biweight_8: processes two 8-byte rows from each of two sources.
//   macs = multiply-accumulate mnemonic applied to the rows read from x1 (with v1)
//   macd = multiply-accumulate mnemonic applied to the rows read from x0 (with v0)
//   x0, x1 = source row pointers, x7 = store pointer, x2 = byte distance
//            between rows
// NOTE(review): the setup of v0, v1, v18 and of the accumulators v2 / v20,
// and the narrowing of v2 / v20 into the stored v2.8B / v4.8B, are not
// visible in this span -- confirm.
300 .macro biweight_8 macs, macd
// First row pair accumulates into v2.
306 ld1 {v4.8B}, [x0], x2
307 \macd v2.8H, v0.8B, v4.8B
308 ld1 {v5.8B}, [x1], x2
309 \macs v2.8H, v1.8B, v5.8B
// Second row pair accumulates into v20.
310 ld1 {v6.8B}, [x0], x2
311 \macd v20.8H, v0.8B, v6.8B
312 ld1 {v7.8B}, [x1], x2
313 \macs v20.8H, v1.8B, v7.8B
// Per-lane shift by the signed count in v18 (a negative count shifts right).
314 sshl v2.8H, v2.8H, v18.8H
316 sshl v20.8H, v20.8H, v18.8H
// Write two 8-byte rows through x7.
319 st1 {v2.8B}, [x7], x2
321 st1 {v4.8B}, [x7], x2
// biweight_4: processes 4-byte rows, two rows packed per 64-bit register
// (32-bit lanes 0 and 1), i.e. up to four rows from each source per pass.
//   macs = multiply-accumulate mnemonic applied to the rows read from x1 (with v1)
//   macd = multiply-accumulate mnemonic applied to the rows read from x0 (with v0)
//   x0, x1 = source row pointers, x7 = store pointer, x2 = byte distance
//            between rows
// NOTE(review): the setup of v0, v1, v18 and of the accumulators v2 / v20,
// the narrowing into the stored registers, and the branch that reaches label
// 2 are not visible in this span -- confirm.
326 .macro biweight_4 macs, macd
// Rows 0 and 1 of each source accumulate into v2.
332 ld1 {v4.S}[0], [x0], x2
333 ld1 {v4.S}[1], [x0], x2
334 \macd v2.8H, v0.8B, v4.8B
335 ld1 {v5.S}[0], [x1], x2
336 ld1 {v5.S}[1], [x1], x2
337 \macs v2.8H, v1.8B, v5.8B
// Rows 2 and 3 of each source accumulate into v20.
339 ld1 {v6.S}[0], [x0], x2
340 ld1 {v6.S}[1], [x0], x2
341 \macd v20.8H, v0.8B, v6.8B
342 ld1 {v7.S}[0], [x1], x2
343 ld1 {v7.S}[1], [x1], x2
344 \macs v20.8H, v1.8B, v7.8B
// Per-lane shift by the signed count in v18 (a negative count shifts right).
345 sshl v2.8H, v2.8H, v18.8H
347 sshl v20.8H, v20.8H, v18.8H
// Write four 4-byte rows through x7.
350 st1 {v2.S}[0], [x7], x2
351 st1 {v2.S}[1], [x7], x2
353 st1 {v4.S}[0], [x7], x2
354 st1 {v4.S}[1], [x7], x2
// Label 2: tail that shifts and stores only the two rows held in v2.
357 2: sshl v2.8H, v2.8H, v18.8H
359 st1 {v2.S}[0], [x7], x2
360 st1 {v2.S}[1], [x7], x2
// biweight_func: emits the exported function ff_biweight_h264_pixels_<w>_neon
// for the width suffix passed as the macro argument. The body instantiates
// the matching biweight_<w> macro four times, once for each combination of
// umlal / umlsl passed as (macs, macd).
// NOTE(review): the dispatch that selects one of the four variants, the other
// labels, and the rest of the register setup are not visible in this span.
// The EOR below presumably contributes a sign bit of w6 to the selector in w8
// -- confirm.
364 .macro biweight_func w
365 function ff_biweight_h264_pixels_\w\()_neon, export=1
369 eor w8, w8, w6, lsr #30 // w8 = w8 XOR (w6 logically shifted right by 30)
382 10: biweight_\w umlal, umlal
384 biweight_\w umlal, umlsl
387 biweight_\w umlsl, umlsl
389 biweight_\w umlsl, umlal
// 16-byte-wide weighting body: reads two 16-byte rows from x0, multiplies
// each byte by the matching byte of v0 into 16-bit lanes, combines the
// product with v16 using the mnemonic passed as the add macro argument,
// shifts by v18 with rounding, narrows with unsigned saturation and writes
// the rows through x5. x1 is the byte distance between rows for both
// pointers.
// NOTE(review): the enclosing .macro line and the setup of v0, v16, v18, x5
// and the low-half narrow of v4 are not visible in this span; v0 is
// presumably the broadcast weight, v16 the offset term and v18 a negative
// shift count -- confirm.
400 ld1 {v20.16B}, [x0], x1
401 umull v4.8H, v0.8B, v20.8B
402 umull2 v6.8H, v0.16B, v20.16B
403 ld1 {v28.16B}, [x0], x1
404 umull v24.8H, v0.8B, v28.8B
405 umull2 v26.8H, v0.16B, v28.16B
// SRSHL shifts each lane by the signed count in v18; a negative count gives a
// rounding right shift.
406 \add v4.8H, v16.8H, v4.8H
407 srshl v4.8H, v4.8H, v18.8H
408 \add v6.8H, v16.8H, v6.8H
409 srshl v6.8H, v6.8H, v18.8H
411 sqxtun2 v4.16B, v6.8H
412 \add v24.8H, v16.8H, v24.8H
413 srshl v24.8H, v24.8H, v18.8H
414 \add v26.8H, v16.8H, v26.8H
415 srshl v26.8H, v26.8H, v18.8H
416 sqxtun v24.8B, v24.8H
417 sqxtun2 v24.16B, v26.8H
418 st1 {v4.16B}, [x5], x1
419 st1 {v24.16B}, [x5], x1
// 8-byte-wide weighting body: reads two 8-byte rows from x0, multiplies each
// byte by the matching byte of v0 into 16-bit lanes (v2, v20), combines the
// product with v16 using the mnemonic passed as the add macro argument,
// shifts by v18 with rounding and writes two rows through x5. x1 is the byte
// distance between rows for both pointers.
// NOTE(review): the enclosing .macro line, the setup of v0, v16, v18, x5 and
// the narrowing of v2 / v20 into the stored v2.8B / v4.8B are not visible in
// this span -- confirm.
427 ld1 {v4.8B}, [x0], x1
428 umull v2.8H, v0.8B, v4.8B
429 ld1 {v6.8B}, [x0], x1
430 umull v20.8H, v0.8B, v6.8B
// SRSHL with a negative count in v18 is a rounding right shift.
431 \add v2.8H, v16.8H, v2.8H
432 srshl v2.8H, v2.8H, v18.8H
434 \add v20.8H, v16.8H, v20.8H
435 srshl v20.8H, v20.8H, v18.8H
437 st1 {v2.8B}, [x5], x1
438 st1 {v4.8B}, [x5], x1
// 4-byte-wide weighting body: rows are packed two per 64-bit register (32-bit
// lanes 0 and 1). Reads up to four rows from x0, multiplies each byte by the
// matching byte of v0 into 16-bit lanes (v2, v20), combines the product with
// v16 using the mnemonic passed as the add macro argument, shifts by v18 with
// rounding and writes 4-byte rows through x5. x1 is the byte distance between
// rows for both pointers.
// NOTE(review): the enclosing .macro line, the setup of v0, v16, v18, x5, the
// narrowing into the stored registers and the branch that reaches label 2 are
// not visible in this span -- confirm.
// Rows 0 and 1 -> v4, product in v2.
446 ld1 {v4.S}[0], [x0], x1
447 ld1 {v4.S}[1], [x0], x1
448 umull v2.8H, v0.8B, v4.8B
// Rows 2 and 3 -> v6, product in v20.
450 ld1 {v6.S}[0], [x0], x1
451 ld1 {v6.S}[1], [x0], x1
452 umull v20.8H, v0.8B, v6.8B
453 \add v2.8H, v16.8H, v2.8H
454 srshl v2.8H, v2.8H, v18.8H
456 \add v20.8H, v16.8H, v20.8H
// NOTE(review): the lowercase "8h" suffix on the next line differs from the
// uppercase suffixes used everywhere else in this file (cosmetic only).
457 srshl v20.8H, v20.8h, v18.8H
// Write four 4-byte rows through x5.
459 st1 {v2.S}[0], [x5], x1
460 st1 {v2.S}[1], [x5], x1
461 st1 {v4.S}[0], [x5], x1
462 st1 {v4.S}[1], [x5], x1
// Label 2: tail that finishes and stores only the two rows held in v2.
465 2: \add v2.8H, v16.8H, v2.8H
466 srshl v2.8H, v2.8H, v18.8H
468 st1 {v2.S}[0], [x5], x1
469 st1 {v2.S}[1], [x5], x1
474 function ff_weight_h264_pixels_\w\()_neon, export=1