2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/aarch64/asm.S"
// h264_loop_filter_start: folds the four bytes packed in w6 onto each other.
// After the two AND steps, bit 31 of w6 is the AND of the top bits of all four
// original bytes, so the N flag produced by ANDS is set only when every byte
// had its top bit set; Z is set when the folded word is zero.
// Clobbers w6 and the condition flags.
// NOTE(review): the load of w6 and the branches that consume the flags are not
// shown alongside these instructions - confirm against the complete macro.
26 .macro h264_loop_filter_start
// w6 &= w6 << 16: byte 3 becomes byte3 & byte1, byte 2 becomes byte2 & byte0
31 and w6, w6, w6, lsl #16
// w6 &= w6 << 8 and set NZ: byte 3 now combines all four original bytes
33 ands w6, w6, w6, lsl #8
// h264_loop_filter_luma: core of the non-intra luma edge filter on 16 pixels.
// Register roles, as established by the comments on the uabd lines below:
//   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//   w2  = alpha, w3 = beta
//   v24 = per-pixel clipping value (low byte of each 32-bit lane is broadcast
//         to the whole lane by the two SLI instructions)
// Results: v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
// Clobbers v4, v17, v19-v24, v28, v30.
// NOTE(review): several setup instructions (initial contents of v24, the
// 16-bit widening of v4/v20/v28/v21/v22/v24 and the lower clamp bound in v23)
// are not visible next to this code - confirm against the complete macro.
40 .macro h264_loop_filter_luma
41 dup v22.16B, w2 // alpha
43 uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
45 uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
// replicate the low byte of every halfword into its high byte
46 sli v24.8H, v24.8H, #8
47 uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
// replicate the low halfword of every word: each 32-bit lane is now 4x one byte
48 sli v24.4S, v24.4S, #16
49 cmhi v21.16B, v22.16B, v21.16B // < alpha
50 dup v22.16B, w3 // beta
// v23 = mask of lanes whose clipping byte is negative (signed compare)
51 cmlt v23.16B, v24.16B, #0
52 cmhi v28.16B, v22.16B, v28.16B // < beta
53 cmhi v30.16B, v22.16B, v30.16B // < beta
// drop lanes with a negative clipping byte from the filter mask
54 bic v21.16B, v21.16B, v23.16B
55 uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
56 and v21.16B, v21.16B, v28.16B
57 uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
58 and v21.16B, v21.16B, v30.16B // < beta
// compress the 16-byte mask to 64 bits (4 bits per lane); consumer not shown here
59 shrn v30.8b, v21.8h, #4
61 cmhi v17.16B, v22.16B, v17.16B // < beta
62 cmhi v19.16B, v22.16B, v19.16B // < beta
// v17 / v19 = "also filter p1" / "also filter q1" masks, limited to filtered lanes
64 and v17.16B, v17.16B, v21.16B
65 and v19.16B, v19.16B, v21.16B
// zero the clipping value in lanes that are not filtered
66 and v24.16B, v24.16B, v21.16B
// v28 = (p0 + q0 + 1) >> 1
67 urhadd v28.16B, v16.16B, v0.16B
// masks are 0x00/0xFF, so subtracting one adds 1: v21 = clip + (p1 mask set)
68 sub v21.16B, v24.16B, v17.16B
// v23 = sat(p1 + clip), upper bound for the new p1
69 uqadd v23.16B, v18.16B, v24.16B
// v20 = (p2 + avg(p0, q0)) >> 1
70 uhadd v20.16B, v20.16B, v28.16B
// v21 = clip + (p1 mask set) + (q1 mask set)
71 sub v21.16B, v21.16B, v19.16B
// v28 = (q2 + avg(p0, q0)) >> 1
72 uhadd v28.16B, v4.16B, v28.16B
73 umin v23.16B, v23.16B, v20.16B
// v22 = sat(p1 - clip), lower bound for the new p1
74 uqsub v22.16B, v18.16B, v24.16B
// v4 (q2, no longer needed) is reused: v4 = sat(q1 + clip)
75 uqadd v4.16B, v2.16B, v24.16B
// v23 = p1 candidate clamped to [p1 - clip, p1 + clip]
76 umax v23.16B, v23.16B, v22.16B
77 uqsub v22.16B, v2.16B, v24.16B
78 umin v28.16B, v4.16B, v28.16B
// v28 = q1 candidate clamped to [q1 - clip, q1 + clip]
80 umax v28.16B, v28.16B, v22.16B
// 16-bit delta accumulation in v4 (low 8 lanes) / v20 (high 8 lanes):
// subtract p0, add p1, subtract q1
82 usubw v4.8H, v4.8H, v16.8B
83 usubw2 v20.8H, v20.8H, v16.16B
85 shl v20.8H, v20.8H, #2
86 uaddw v4.8H, v4.8H, v18.8B
87 uaddw2 v20.8H, v20.8H, v18.16B
88 usubw v4.8H, v4.8H, v2.8B
89 usubw2 v20.8H, v20.8H, v2.16B
// rounding >> 3 and narrow back to 16 signed byte deltas in v4
90 rshrn v4.8B, v4.8H, #3
91 rshrn2 v4.16B, v20.8H, #3
// v17 = mask ? clamped p1 : original p1
92 bsl v17.16B, v23.16B, v18.16B
// v19 = mask ? clamped q1 : original q1
93 bsl v19.16B, v28.16B, v2.16B
// clamp the delta: upper bound v21, lower bound v23
96 smin v4.16B, v4.16B, v21.16B
98 smax v4.16B, v4.16B, v23.16B
// add the delta to the p0 accumulators (v28/v21), subtract from q0 (v22/v24)
101 saddw v28.8H, v28.8H, v4.8B
102 saddw2 v21.8H, v21.8H, v4.16B
103 ssubw v22.8H, v22.8H, v4.8B
104 ssubw2 v24.8H, v24.8H, v4.16B
// saturate back to unsigned bytes: v16 = new p0, v0 (high half) = new q0
105 sqxtun v16.8B, v28.8H
106 sqxtun2 v16.16B, v21.8H
108 sqxtun2 v0.16B, v24.8H
// ff_h264_v_loop_filter_luma_neon: non-intra luma filter, 16 pixels wide, for an
// edge lying between two rows.
// In: x0 = pointer to the q0 row, x1 = row stride, w2 = alpha, w3 = beta
//     (alpha/beta as consumed by h264_loop_filter_luma).
// Loads q0..q2 below the edge and p2..p0 above it, runs the filter macro and
// writes the modified rows back.
111 function ff_h264_v_loop_filter_luma_neon, export=1
112 h264_loop_filter_start
// rows at and below the edge: v0 = q0, v2 = q1, v4 = q2
115 ld1 {v0.16B}, [x0], x1
116 ld1 {v2.16B}, [x0], x1
117 ld1 {v4.16B}, [x0], x1
// x0 is 3 rows past the edge; step back 4 + 2 = 6 rows to reach p2
118 sub x0, x0, x1, lsl #2
119 sub x0, x0, x1, lsl #1
// rows above the edge: v20 = p2, v18 = p1, v16 = p0 (x0 ends on the q0 row)
120 ld1 {v20.16B}, [x0], x1
121 ld1 {v18.16B}, [x0], x1
122 ld1 {v16.16B}, [x0], x1
124 h264_loop_filter_luma
// back up 2 rows to p1 and store new p1, p0, q0
126 sub x0, x0, x1, lsl #1
127 st1 {v17.16B}, [x0], x1
128 st1 {v16.16B}, [x0], x1
129 st1 {v0.16B}, [x0], x1
// ff_h264_h_loop_filter_luma_neon: non-intra luma filter for an edge lying
// between two columns, 16 rows tall.
// In: x0 = pixel pointer, x1 = row stride, w2 = alpha, w3 = beta.
// Reads 16 rows of 8 bytes, transposes them so each vector holds one column
// position for all 16 rows, filters, transposes the four modified columns
// back and stores 4 bytes per row.
// transpose_8x16B / transpose_4x16B are macros defined outside this file section.
135 function ff_h264_h_loop_filter_luma_neon, export=1
136 h264_loop_filter_start
// rows 0-7 go into the low 64 bits of the eight vectors
140 ld1 {v6.8B}, [x0], x1
141 ld1 {v20.8B}, [x0], x1
142 ld1 {v18.8B}, [x0], x1
143 ld1 {v16.8B}, [x0], x1
144 ld1 {v0.8B}, [x0], x1
145 ld1 {v2.8B}, [x0], x1
146 ld1 {v4.8B}, [x0], x1
147 ld1 {v26.8B}, [x0], x1
// rows 8-15 go into the high 64 bits of the same vectors
148 ld1 {v6.D}[1], [x0], x1
149 ld1 {v20.D}[1], [x0], x1
150 ld1 {v18.D}[1], [x0], x1
151 ld1 {v16.D}[1], [x0], x1
152 ld1 {v0.D}[1], [x0], x1
153 ld1 {v2.D}[1], [x0], x1
154 ld1 {v4.D}[1], [x0], x1
155 ld1 {v26.D}[1], [x0], x1
// v21 and v23 are scratch for the transpose
157 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
159 h264_loop_filter_luma
// transpose the four result columns (new p1, p0, q0, q1) back to row order
161 transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
// rewind x0 by the 16 rows consumed by the loads
163 sub x0, x0, x1, lsl #4
// one 32-bit lane (4 pixels) per row: lane k of v17/v16/v0/v19 holds rows 4k..4k+3
165 st1 {v17.S}[0], [x0], x1
166 st1 {v16.S}[0], [x0], x1
167 st1 {v0.S}[0], [x0], x1
168 st1 {v19.S}[0], [x0], x1
169 st1 {v17.S}[1], [x0], x1
170 st1 {v16.S}[1], [x0], x1
171 st1 {v0.S}[1], [x0], x1
172 st1 {v19.S}[1], [x0], x1
173 st1 {v17.S}[2], [x0], x1
174 st1 {v16.S}[2], [x0], x1
175 st1 {v0.S}[2], [x0], x1
176 st1 {v19.S}[2], [x0], x1
177 st1 {v17.S}[3], [x0], x1
178 st1 {v16.S}[3], [x0], x1
179 st1 {v0.S}[3], [x0], x1
180 st1 {v19.S}[3], [x0], x1
// h264_loop_filter_start_intra: broadcasts the thresholds used by the intra
// filter macros: v30 = alpha (from w2) and v31 = beta (from w3) in every byte.
186 .macro h264_loop_filter_start_intra
192 dup v30.16b, w2 // alpha
193 dup v31.16b, w3 // beta
// h264_loop_filter_luma_intra: intra (strong) luma edge filter on 16 pixels.
// Register roles, per the uabd comments below and the callers' load comments:
//   v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2, v3 = q3
//   v30 = alpha, v31 = beta (both overwritten inside the macro)
// Conditions named in the comments:
//   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
//   if_2 = |p0-q0| < (alpha >> 2) + 2
//   if_3 = |p2-p0| < beta,  if_4 = |q2-q0| < beta
// Results are merged in place into v5, v6, v7, v0, v1, v2.
// Clobbers v16-v31.
// NOTE(review): v29 is read as the constant 2, and v30/v31 are read as the
// inverted (if_2 && if_3) / (if_2 && if_4) masks, but the instructions that
// set them are not visible next to this code - confirm against the full macro.
196 .macro h264_loop_filter_luma_intra
197 uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
198 uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
199 uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
200 cmhi v19.16b, v30.16b, v16.16b // < alpha
201 cmhi v17.16b, v31.16b, v17.16b // < beta
202 cmhi v18.16b, v31.16b, v18.16b // < beta
205 ushr v30.16b, v30.16b, #2 // alpha >> 2
206 add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
207 cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
// v19 = if_1
209 and v19.16b, v19.16b, v17.16b
210 and v19.16b, v19.16b, v18.16b
// compress if_1 to 64 bits (4 bits per lane); v20 is overwritten again below
211 shrn v20.8b, v19.8h, #4
// v20/v21 = 2*p1 + p0 + q1 and v22/v23 = 2*q1 + q0 + p1 (16-bit, low/high halves)
215 ushll v20.8h, v6.8b, #1
216 ushll v22.8h, v1.8b, #1
217 ushll2 v21.8h, v6.16b, #1
218 ushll2 v23.8h, v1.16b, #1
219 uaddw v20.8h, v20.8h, v7.8b
220 uaddw v22.8h, v22.8h, v0.8b
221 uaddw2 v21.8h, v21.8h, v7.16b
222 uaddw2 v23.8h, v23.8h, v0.16b
223 uaddw v20.8h, v20.8h, v1.8b
224 uaddw v22.8h, v22.8h, v6.8b
225 uaddw2 v21.8h, v21.8h, v1.16b
226 uaddw2 v23.8h, v23.8h, v6.16b
// weak-filter results: rounded >> 2 of the sums above
228 rshrn v24.8b, v20.8h, #2 // p0'_1
229 rshrn v25.8b, v22.8h, #2 // q0'_1
230 rshrn2 v24.16b, v21.8h, #2 // p0'_1
231 rshrn2 v25.16b, v23.8h, #2 // q0'_1
233 uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
234 uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
235 cmhi v17.16b, v31.16b, v17.16b // < beta
236 cmhi v18.16b, v31.16b, v18.16b // < beta
238 and v17.16b, v16.16b, v17.16b // if_2 && if_3
239 and v18.16b, v16.16b, v18.16b // if_2 && if_4
244 and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
245 and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
247 and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
248 and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
250 //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
// v26/v27 = p2 + p0 + q0
251 uaddl v26.8h, v5.8b, v7.8b
252 uaddl2 v27.8h, v5.16b, v7.16b
253 uaddw v26.8h, v26.8h, v0.8b
254 uaddw2 v27.8h, v27.8h, v0.16b
// v20/v21 = p2 + 2*p1 + 2*p0 + 2*q0 + q1
255 add v20.8h, v20.8h, v26.8h
256 add v21.8h, v21.8h, v27.8h
257 uaddw v20.8h, v20.8h, v0.8b
258 uaddw2 v21.8h, v21.8h, v0.16b
259 rshrn v20.8b, v20.8h, #3 // p0'_2
260 rshrn2 v20.16b, v21.8h, #3 // p0'_2
// v26/v27 = p2 + p1 + p0 + q0
261 uaddw v26.8h, v26.8h, v6.8b
262 uaddw2 v27.8h, v27.8h, v6.16b
263 rshrn v21.8b, v26.8h, #2 // p1'_2
264 rshrn2 v21.16b, v27.8h, #2 // p1'_2
// v28/v29 = 2*(p3 + p2) + p2 + p1 + p0 + q0
265 uaddl v28.8h, v4.8b, v5.8b
266 uaddl2 v29.8h, v4.16b, v5.16b
267 shl v28.8h, v28.8h, #1
268 shl v29.8h, v29.8h, #1
269 add v28.8h, v28.8h, v26.8h
270 add v29.8h, v29.8h, v27.8h
271 rshrn v19.8b, v28.8h, #3 // p2'_2
272 rshrn2 v19.16b, v29.8h, #3 // p2'_2
274 //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
// mirror image of the p side: v26/v27 = q2 + q0 + p0
275 uaddl v26.8h, v2.8b, v0.8b
276 uaddl2 v27.8h, v2.16b, v0.16b
277 uaddw v26.8h, v26.8h, v7.8b
278 uaddw2 v27.8h, v27.8h, v7.16b
// v22/v23 = q2 + 2*q1 + 2*q0 + 2*p0 + p1
279 add v22.8h, v22.8h, v26.8h
280 add v23.8h, v23.8h, v27.8h
281 uaddw v22.8h, v22.8h, v7.8b
282 uaddw2 v23.8h, v23.8h, v7.16b
283 rshrn v22.8b, v22.8h, #3 // q0'_2
284 rshrn2 v22.16b, v23.8h, #3 // q0'_2
// v26/v27 = q2 + q1 + q0 + p0
285 uaddw v26.8h, v26.8h, v1.8b
286 uaddw2 v27.8h, v27.8h, v1.16b
287 rshrn v23.8b, v26.8h, #2 // q1'_2
288 rshrn2 v23.16b, v27.8h, #2 // q1'_2
// v28/v29 = 2*(q2 + q3) + q2 + q1 + q0 + p0
289 uaddl v28.8h, v2.8b, v3.8b
290 uaddl2 v29.8h, v2.16b, v3.16b
291 shl v28.8h, v28.8h, #1
292 shl v29.8h, v29.8h, #1
293 add v28.8h, v28.8h, v26.8h
294 add v29.8h, v29.8h, v27.8h
295 rshrn v26.8b, v28.8h, #3 // q2'_2
296 rshrn2 v26.16b, v29.8h, #3 // q2'_2
// BIT inserts the new value only in lanes where the mask operand is set
298 bit v7.16b, v24.16b, v30.16b // p0'_1
299 bit v0.16b, v25.16b, v31.16b // q0'_1
300 bit v7.16b, v20.16b, v17.16b // p0'_2
301 bit v6.16b, v21.16b, v17.16b // p1'_2
302 bit v5.16b, v19.16b, v17.16b // p2'_2
303 bit v0.16b, v22.16b, v18.16b // q0'_2
304 bit v1.16b, v23.16b, v18.16b // q1'_2
305 bit v2.16b, v26.16b, v18.16b // q2'_2
// ff_h264_v_loop_filter_luma_intra_neon: intra luma filter, 16 pixels wide, for
// an edge lying between two rows.
// In: x0 = pointer to the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
// Loads four rows on each side of the edge, filters, and stores the three rows
// on each side that the filter may modify (p2..q2).
308 function ff_h264_v_loop_filter_luma_intra_neon, export=1
309 h264_loop_filter_start_intra
311 ld1 {v0.16b}, [x0], x1 // q0
312 ld1 {v1.16b}, [x0], x1 // q1
313 ld1 {v2.16b}, [x0], x1 // q2
314 ld1 {v3.16b}, [x0], x1 // q3
// x0 is 4 rows past the edge; go back 8 rows to reach p3
315 sub x0, x0, x1, lsl #3
316 ld1 {v4.16b}, [x0], x1 // p3
317 ld1 {v5.16b}, [x0], x1 // p2
318 ld1 {v6.16b}, [x0], x1 // p1
// no post-increment: x0 stays on the p0 row
319 ld1 {v7.16b}, [x0] // p0
321 h264_loop_filter_luma_intra
// from the p0 row back 2 rows to p2, then store six consecutive rows
323 sub x0, x0, x1, lsl #1
324 st1 {v5.16b}, [x0], x1 // p2
325 st1 {v6.16b}, [x0], x1 // p1
326 st1 {v7.16b}, [x0], x1 // p0
327 st1 {v0.16b}, [x0], x1 // q0
328 st1 {v1.16b}, [x0], x1 // q1
329 st1 {v2.16b}, [x0] // q2
// ff_h264_h_loop_filter_luma_intra_neon: intra luma filter for an edge lying
// between two columns, 16 rows tall.
// In: x0 = pixel pointer, x1 = row stride, w2 = alpha, w3 = beta.
// Reads 16 rows of 8 bytes, transposes so that v4..v7 / v0..v3 hold the column
// positions the filter macro expects (p3..p0 / q0..q3), filters, transposes
// back and rewrites all 16 rows.
// transpose_8x16B is a macro defined outside this file section.
334 function ff_h264_h_loop_filter_luma_intra_neon, export=1
335 h264_loop_filter_start_intra
// rows 0-7 into the low 64 bits
338 ld1 {v4.8b}, [x0], x1
339 ld1 {v5.8b}, [x0], x1
340 ld1 {v6.8b}, [x0], x1
341 ld1 {v7.8b}, [x0], x1
342 ld1 {v0.8b}, [x0], x1
343 ld1 {v1.8b}, [x0], x1
344 ld1 {v2.8b}, [x0], x1
345 ld1 {v3.8b}, [x0], x1
// rows 8-15 into the high 64 bits
346 ld1 {v4.d}[1], [x0], x1
347 ld1 {v5.d}[1], [x0], x1
348 ld1 {v6.d}[1], [x0], x1
349 ld1 {v7.d}[1], [x0], x1
350 ld1 {v0.d}[1], [x0], x1
351 ld1 {v1.d}[1], [x0], x1
352 ld1 {v2.d}[1], [x0], x1
353 ld1 {v3.d}[1], [x0], x1
// rows -> columns (v21, v23 are scratch)
355 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
357 h264_loop_filter_luma_intra
// columns -> rows again
359 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
// rewind x0 by the 16 rows consumed by the loads
361 sub x0, x0, x1, lsl #4
// store in the same order and layout as the loads
362 st1 {v4.8b}, [x0], x1
363 st1 {v5.8b}, [x0], x1
364 st1 {v6.8b}, [x0], x1
365 st1 {v7.8b}, [x0], x1
366 st1 {v0.8b}, [x0], x1
367 st1 {v1.8b}, [x0], x1
368 st1 {v2.8b}, [x0], x1
369 st1 {v3.8b}, [x0], x1
370 st1 {v4.d}[1], [x0], x1
371 st1 {v5.d}[1], [x0], x1
372 st1 {v6.d}[1], [x0], x1
373 st1 {v7.d}[1], [x0], x1
374 st1 {v0.d}[1], [x0], x1
375 st1 {v1.d}[1], [x0], x1
376 st1 {v2.d}[1], [x0], x1
377 st1 {v3.d}[1], [x0], x1
// h264_loop_filter_chroma: non-intra chroma edge filter on 8 pixels.
// Register roles, per the uabd comments below:
//   v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta
//   v24 = per-pixel clipping value (low byte of each halfword is replicated)
// Only p0 and q0 are modified; the visible result write is v16 = new p0.
// Clobbers v4, v22, v23, v26, v28, v30.
// NOTE(review): the 16-bit initialisation of v4/v28/v22, the lower clamp bound
// in v25 and the final narrowing into v0 are not visible next to this code -
// confirm against the complete macro.
382 .macro h264_loop_filter_chroma
383 dup v22.8B, w2 // alpha
384 dup v23.8B, w3 // beta
386 uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
387 uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
388 uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
389 cmhi v26.8B, v22.8B, v26.8B // < alpha
390 cmhi v28.8B, v23.8B, v28.8B // < beta
391 cmhi v30.8B, v23.8B, v30.8B // < beta
// v26 accumulates the combined filter mask; delta arithmetic is interleaved
393 and v26.8B, v26.8B, v28.8B
// 16-bit delta in v4: subtract p0, add p1, subtract q1
394 usubw v4.8H, v4.8H, v16.8B
395 and v26.8B, v26.8B, v30.8B
// replicate the low byte of every halfword of the clipping vector
398 sli v24.8H, v24.8H, #8
399 uaddw v4.8H, v4.8H, v18.8B
401 usubw v4.8H, v4.8H, v2.8B
// rounding >> 3, narrowed to signed byte deltas
402 rshrn v4.8B, v4.8H, #3
// clamp the delta: upper bound v24, lower bound v25
403 smin v4.8B, v4.8B, v24.8B
405 smax v4.8B, v4.8B, v25.8B
// zero the delta in lanes that must not be filtered
407 and v4.8B, v4.8B, v26.8B
// p0 accumulator += delta, q0 accumulator -= delta, then saturate p0 to bytes
409 saddw v28.8H, v28.8H, v4.8B
410 ssubw v22.8H, v22.8H, v4.8B
411 sqxtun v16.8B, v28.8H
// ff_h264_v_loop_filter_chroma_neon: non-intra chroma filter, 8 pixels wide, for
// an edge lying between two rows.
// In: x0 = pointer to the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
// NOTE(review): the filter macro also reads q1 from v2; its load is not visible
// next to this code - confirm against the complete function.
415 function ff_h264_v_loop_filter_chroma_neon, export=1
416 h264_loop_filter_start
// start two rows above the edge: v18 = p1, v16 = p0, v0 = q0
419 sub x0, x0, x1, lsl #1
420 ld1 {v18.8B}, [x0], x1
421 ld1 {v16.8B}, [x0], x1
422 ld1 {v0.8B}, [x0], x1
425 h264_loop_filter_chroma
// x0 is one row past q0; back up two rows to p0 and store new p0, q0
427 sub x0, x0, x1, lsl #1
428 st1 {v16.8B}, [x0], x1
429 st1 {v0.8B}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon: non-intra chroma filter for an edge lying
// between two columns, 8 rows tall.
// In: x0 = pixel pointer, x1 = row stride, w2 = alpha, w3 = beta.
// Reads 8 rows of 4 bytes, transposes them into the p1/p0/q0/q1 vectors
// (v18/v16/v0/v2) used by the filter macro, filters, transposes back and
// rewrites the same 8 rows.
// transpose_4x8B is a macro defined outside this file section.
434 function ff_h264_h_loop_filter_chroma_neon, export=1
435 h264_loop_filter_start
// rows 0-3 into 32-bit lane 0
439 ld1 {v18.S}[0], [x0], x1
440 ld1 {v16.S}[0], [x0], x1
441 ld1 {v0.S}[0], [x0], x1
442 ld1 {v2.S}[0], [x0], x1
// rows 4-7 into 32-bit lane 1
443 ld1 {v18.S}[1], [x0], x1
444 ld1 {v16.S}[1], [x0], x1
445 ld1 {v0.S}[1], [x0], x1
446 ld1 {v2.S}[1], [x0], x1
// rows -> columns (v28-v31 are scratch)
448 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
450 h264_loop_filter_chroma
// columns -> rows again
452 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
// rewind x0 by the 8 rows consumed by the loads, then store in load order
454 sub x0, x0, x1, lsl #3
455 st1 {v18.S}[0], [x0], x1
456 st1 {v16.S}[0], [x0], x1
457 st1 {v0.S}[0], [x0], x1
458 st1 {v2.S}[0], [x0], x1
459 st1 {v18.S}[1], [x0], x1
460 st1 {v16.S}[1], [x0], x1
461 st1 {v0.S}[1], [x0], x1
462 st1 {v2.S}[1], [x0], x1
// h264_loop_filter_chroma_intra: intra chroma edge filter on 8 pixels.
// Register roles, per the uabd comments below:
//   v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha, v31 = beta
// Where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta all hold:
//   p0' = (2*p1 + p0 + q1 + 2) >> 2
//   q0' = (2*q1 + q0 + p1 + 2) >> 2
// Results replace v16 / v17 in the selected lanes only.
// Clobbers v4, v6, v20, v22, v24-v28.
468 .macro h264_loop_filter_chroma_intra
469 uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
470 uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
471 uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
472 cmhi v26.8b, v30.8b, v26.8b // < alpha
473 cmhi v27.8b, v31.8b, v27.8b // < beta
474 cmhi v28.8b, v31.8b, v28.8b // < beta
// v26 = combined filter mask
475 and v26.8b, v26.8b, v27.8b
476 and v26.8b, v26.8b, v28.8b
// v4 = 2*p1, v6 = 2*q1 (widened to 16 bits)
479 ushll v4.8h, v18.8b, #1
480 ushll v6.8h, v19.8b, #1
// v20 = p0 + q1 + 2*p1, v22 = q0 + p1 + 2*q1
482 uaddl v20.8h, v16.8b, v19.8b
483 uaddl v22.8h, v17.8b, v18.8b
484 add v20.8h, v20.8h, v4.8h
485 add v22.8h, v22.8h, v6.8h
// rounding >> 2 with unsigned saturation back to bytes
486 uqrshrn v24.8b, v20.8h, #2
487 uqrshrn v25.8b, v22.8h, #2
// insert the filtered values only where the mask is set
488 bit v16.8b, v24.8b, v26.8b
489 bit v17.8b, v25.8b, v26.8b
// ff_h264_v_loop_filter_chroma_intra_neon: intra chroma filter, 8 pixels wide,
// for an edge lying between two rows.
// In: x0 = pointer to the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
// NOTE(review): the filter macro also reads q1 from v19; its load is not visible
// next to this code - confirm against the complete function.
492 function ff_h264_v_loop_filter_chroma_intra_neon, export=1
493 h264_loop_filter_start_intra
// start two rows above the edge: v18 = p1, v16 = p0, v17 = q0
495 sub x0, x0, x1, lsl #1
496 ld1 {v18.8b}, [x0], x1
497 ld1 {v16.8b}, [x0], x1
498 ld1 {v17.8b}, [x0], x1
501 h264_loop_filter_chroma_intra
// x0 is one row past q0; back up two rows to p0 and store new p0, q0
503 sub x0, x0, x1, lsl #1
504 st1 {v16.8b}, [x0], x1
505 st1 {v17.8b}, [x0], x1
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon: intra chroma filter for an edge
// lying between two columns, 4 rows tall.
// In: x1 = row stride, w2 = alpha, w3 = beta; x4 is the read cursor and x0 the
// write cursor.
// NOTE(review): the instructions deriving x4 and the final x0 from the incoming
// pixel pointer are not visible next to this code - confirm against the
// complete function.
// transpose_4x8B is a macro defined outside this file section.
511 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
512 h264_loop_filter_start_intra
// four rows of 8 bytes
516 ld1 {v18.8b}, [x4], x1
517 ld1 {v16.8b}, [x4], x1
518 ld1 {v17.8b}, [x4], x1
519 ld1 {v19.8b}, [x4], x1
// rows -> columns, giving p1/p0/q0/q1 in v18/v16/v17/v19 (v26-v29 scratch)
521 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
523 h264_loop_filter_chroma_intra
// per row, store the byte pair (p0, q0) taken from lane i of v16 / v17
525 st2 {v16.b,v17.b}[0], [x0], x1
526 st2 {v16.b,v17.b}[1], [x0], x1
527 st2 {v16.b,v17.b}[2], [x0], x1
528 st2 {v16.b,v17.b}[3], [x0], x1
// ff_h264_h_loop_filter_chroma_intra_neon: intra chroma filter for an edge lying
// between two columns, 8 rows tall.
// In: x1 = row stride, w2 = alpha, w3 = beta; x4 is the read cursor and x0 the
// write cursor.
// NOTE(review): the instructions deriving x4 and the final x0 from the incoming
// pixel pointer, and the lane-1 load for v19, are not visible next to this
// code - confirm against the complete function.
// transpose_4x8B is a macro defined outside this file section.
534 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
535 h264_loop_filter_start_intra
// rows 0-3: full 8-byte loads (lane 1 is overwritten by the loads that follow)
539 ld1 {v18.8b}, [x4], x1
540 ld1 {v16.8b}, [x4], x1
541 ld1 {v17.8b}, [x4], x1
542 ld1 {v19.8b}, [x4], x1
// following rows: 4 bytes each into 32-bit lane 1
543 ld1 {v18.s}[1], [x4], x1
544 ld1 {v16.s}[1], [x4], x1
545 ld1 {v17.s}[1], [x4], x1
// rows -> columns, giving p1/p0/q0/q1 in v18/v16/v17/v19 (v26-v29 scratch)
548 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
550 h264_loop_filter_chroma_intra
// per row, store the byte pair (p0, q0) taken from lane i of v16 / v17
552 st2 {v16.b,v17.b}[0], [x0], x1
553 st2 {v16.b,v17.b}[1], [x0], x1
554 st2 {v16.b,v17.b}[2], [x0], x1
555 st2 {v16.b,v17.b}[3], [x0], x1
556 st2 {v16.b,v17.b}[4], [x0], x1
557 st2 {v16.b,v17.b}[5], [x0], x1
558 st2 {v16.b,v17.b}[6], [x0], x1
559 st2 {v16.b,v17.b}[7], [x0], x1
// biweight_16 macs, macd: bi-directional weighted blend of two 16-pixel-wide
// rows per pass.
//   \macd = widening multiply-accumulate op applied to the x0 source with v0
//   \macs = widening multiply-accumulate op applied to the x1 source with v1
// (callers pass umlal or umlsl to select the sign of each term).
// x0 / x1 = source cursors, x7 = store cursor, x2 = stride for all three,
// v18 = per-lane shift count for SSHL (a negative count shifts right).
// Accumulators: v4/v6 (first row, low/high halves), v24/v26 (second row).
// NOTE(review): accumulator initialisation, the low-half narrowing of v4 and
// the loop control are not visible next to this code - confirm against the
// complete macro.
566 .macro biweight_16 macs, macd
// first row
572 ld1 {v20.16B}, [x0], x2
573 \macd v4.8H, v0.8B, v20.8B
574 \macd\()2 v6.8H, v0.16B, v20.16B
575 ld1 {v22.16B}, [x1], x2
576 \macs v4.8H, v1.8B, v22.8B
577 \macs\()2 v6.8H, v1.16B, v22.16B
// second row
579 ld1 {v28.16B}, [x0], x2
581 \macd v24.8H, v0.8B, v28.8B
582 \macd\()2 v26.8H, v0.16B, v28.16B
583 ld1 {v30.16B}, [x1], x2
584 \macs v24.8H, v1.8B, v30.8B
585 \macs\()2 v26.8H, v1.16B, v30.16B
// scale the sums, then saturate to unsigned bytes
586 sshl v4.8H, v4.8H, v18.8H
587 sshl v6.8H, v6.8H, v18.8H
589 sqxtun2 v4.16B, v6.8H
590 sshl v24.8H, v24.8H, v18.8H
591 sshl v26.8H, v26.8H, v18.8H
592 sqxtun v24.8B, v24.8H
593 sqxtun2 v24.16B, v26.8H
// write both result rows through x7
595 st1 {v4.16B}, [x7], x2
597 st1 {v24.16B}, [x7], x2
// biweight_8 macs, macd: bi-directional weighted blend of two 8-pixel-wide rows
// per pass.
//   \macd = widening multiply-accumulate op applied to the x0 source with v0
//   \macs = widening multiply-accumulate op applied to the x1 source with v1
// x0 / x1 = source cursors, x7 = store cursor, x2 = stride for all three,
// v18 = per-lane shift count for SSHL (a negative count shifts right).
// Accumulators: v2 (first row), v20 (second row).
// NOTE(review): accumulator initialisation, the narrowing of v2/v20 into the
// stored v2/v4 bytes and the loop control are not visible next to this code -
// confirm against the complete macro.
602 .macro biweight_8 macs, macd
// first row
608 ld1 {v4.8B}, [x0], x2
609 \macd v2.8H, v0.8B, v4.8B
610 ld1 {v5.8B}, [x1], x2
611 \macs v2.8H, v1.8B, v5.8B
// second row
612 ld1 {v6.8B}, [x0], x2
613 \macd v20.8H, v0.8B, v6.8B
614 ld1 {v7.8B}, [x1], x2
615 \macs v20.8H, v1.8B, v7.8B
// scale both sums
616 sshl v2.8H, v2.8H, v18.8H
618 sshl v20.8H, v20.8H, v18.8H
// write both result rows through x7
621 st1 {v2.8B}, [x7], x2
623 st1 {v4.8B}, [x7], x2
// biweight_4 macs, macd: bi-directional weighted blend of 4-pixel-wide rows.
// Two rows are packed into one 64-bit vector (32-bit lanes 0 and 1), so each
// multiply-accumulate handles two rows and a full pass handles four.
//   \macd = widening multiply-accumulate op applied to the x0 source with v0
//   \macs = widening multiply-accumulate op applied to the x1 source with v1
// x0 / x1 = source cursors, x7 = store cursor, x2 = stride for all three,
// v18 = per-lane shift count for SSHL (a negative count shifts right).
// Label 2 is a tail path that scales and stores only the first row pair.
// NOTE(review): accumulator initialisation, the narrowing steps, the branch to
// label 2 and the loop control are not visible next to this code - confirm
// against the complete macro.
628 .macro biweight_4 macs, macd
// rows 0 and 1
634 ld1 {v4.S}[0], [x0], x2
635 ld1 {v4.S}[1], [x0], x2
636 \macd v2.8H, v0.8B, v4.8B
637 ld1 {v5.S}[0], [x1], x2
638 ld1 {v5.S}[1], [x1], x2
639 \macs v2.8H, v1.8B, v5.8B
// rows 2 and 3
641 ld1 {v6.S}[0], [x0], x2
642 ld1 {v6.S}[1], [x0], x2
643 \macd v20.8H, v0.8B, v6.8B
644 ld1 {v7.S}[0], [x1], x2
645 ld1 {v7.S}[1], [x1], x2
646 \macs v20.8H, v1.8B, v7.8B
// scale both sums
647 sshl v2.8H, v2.8H, v18.8H
649 sshl v20.8H, v20.8H, v18.8H
// store four rows, 4 bytes each
652 st1 {v2.S}[0], [x7], x2
653 st1 {v2.S}[1], [x7], x2
655 st1 {v4.S}[0], [x7], x2
656 st1 {v4.S}[1], [x7], x2
// tail: only one row pair pending
659 2: sshl v2.8H, v2.8H, v18.8H
661 st1 {v2.S}[0], [x7], x2
662 st1 {v2.S}[1], [x7], x2
// biweight_func w: emits the exported function ff_biweight_h264_pixels_<w>_neon,
// where \w selects which biweight_<w> row macro is expanded.
// The row macro is instantiated four times, once for every combination of
// umlal (add the term) and umlsl (subtract the term) for its two operands.
// NOTE(review): argument setup and the dispatch that chooses one of the four
// variants are not visible next to this code - confirm against the complete
// macro.
666 .macro biweight_func w
667 function ff_biweight_h264_pixels_\w\()_neon, export=1
// w8 ^= w6 >> 30: folds the top two bits of w6 into w8
671 eor w8, w8, w6, lsr #30
// (\macs, \macd) = (add, add)
684 10: biweight_\w umlal, umlal
// (\macs, \macd) = (add, subtract)
686 biweight_\w umlal, umlsl
// (\macs, \macd) = (subtract, subtract)
689 biweight_\w umlsl, umlsl
// (\macs, \macd) = (subtract, add)
691 biweight_\w umlsl, umlal
// Single-source weighting of two 16-pixel-wide rows per pass.
// NOTE(review): this is a macro body (`\add` is a macro argument) whose .macro
// header is not visible next to this code; presumably weight_16 - confirm.
// x0 = source cursor, x5 = store cursor, x1 = stride for both,
// v0 = weight bytes, v16 = 16-bit term combined with each product via \add,
// v18 = per-lane shift count for SRSHL (rounding; a negative count shifts right).
// unsigned widening multiply of both rows by the weight
702 ld1 {v20.16B}, [x0], x1
703 umull v4.8H, v0.8B, v20.8B
704 umull2 v6.8H, v0.16B, v20.16B
705 ld1 {v28.16B}, [x0], x1
706 umull v24.8H, v0.8B, v28.8B
707 umull2 v26.8H, v0.16B, v28.16B
// first row: combine with v16, rounding shift, saturate to unsigned bytes
708 \add v4.8H, v16.8H, v4.8H
709 srshl v4.8H, v4.8H, v18.8H
710 \add v6.8H, v16.8H, v6.8H
711 srshl v6.8H, v6.8H, v18.8H
713 sqxtun2 v4.16B, v6.8H
// second row: same sequence
714 \add v24.8H, v16.8H, v24.8H
715 srshl v24.8H, v24.8H, v18.8H
716 \add v26.8H, v16.8H, v26.8H
717 srshl v26.8H, v26.8H, v18.8H
718 sqxtun v24.8B, v24.8H
719 sqxtun2 v24.16B, v26.8H
// write both result rows through x5
720 st1 {v4.16B}, [x5], x1
721 st1 {v24.16B}, [x5], x1
// Single-source weighting of two 8-pixel-wide rows per pass.
// NOTE(review): this is a macro body (`\add` is a macro argument) whose .macro
// header is not visible next to this code; presumably weight_8 - confirm.
// x0 = source cursor, x5 = store cursor, x1 = stride for both,
// v0 = weight bytes, v16 = 16-bit term combined with each product via \add,
// v18 = per-lane shift count for SRSHL (rounding; a negative count shifts right).
// NOTE(review): the narrowing of v2/v20 into the stored v2/v4 bytes is not
// visible next to this code.
729 ld1 {v4.8B}, [x0], x1
730 umull v2.8H, v0.8B, v4.8B
731 ld1 {v6.8B}, [x0], x1
732 umull v20.8H, v0.8B, v6.8B
// combine each product with v16, then rounding shift
733 \add v2.8H, v16.8H, v2.8H
734 srshl v2.8H, v2.8H, v18.8H
736 \add v20.8H, v16.8H, v20.8H
737 srshl v20.8H, v20.8H, v18.8H
// write both result rows through x5
739 st1 {v2.8B}, [x5], x1
740 st1 {v4.8B}, [x5], x1
// Single-source weighting of 4-pixel-wide rows; two rows are packed into the
// 32-bit lanes 0 and 1 of one vector, so a full pass handles four rows.
// NOTE(review): this is a macro body (`\add` is a macro argument) whose .macro
// header is not visible next to this code; presumably weight_4 - confirm.
// x0 = source cursor, x5 = store cursor, x1 = stride for both,
// v0 = weight bytes, v16 = 16-bit term combined with each product via \add,
// v18 = per-lane shift count for SRSHL (rounding; a negative count shifts right).
// Label 2 is a tail path that finishes and stores only the first row pair.
// NOTE(review): the narrowing steps and the branch to label 2 are not visible
// next to this code.
// rows 0 and 1
748 ld1 {v4.S}[0], [x0], x1
749 ld1 {v4.S}[1], [x0], x1
750 umull v2.8H, v0.8B, v4.8B
// rows 2 and 3
752 ld1 {v6.S}[0], [x0], x1
753 ld1 {v6.S}[1], [x0], x1
754 umull v20.8H, v0.8B, v6.8B
// combine each product with v16, then rounding shift
755 \add v2.8H, v16.8H, v2.8H
756 srshl v2.8H, v2.8H, v18.8H
758 \add v20.8H, v16.8H, v20.8H
759 srshl v20.8H, v20.8h, v18.8H
// store four rows, 4 bytes each
761 st1 {v2.S}[0], [x5], x1
762 st1 {v2.S}[1], [x5], x1
763 st1 {v4.S}[0], [x5], x1
764 st1 {v4.S}[1], [x5], x1
// tail: only one row pair pending
767 2: \add v2.8H, v16.8H, v2.8H
768 srshl v2.8H, v2.8H, v18.8H
770 st1 {v2.S}[0], [x5], x1
771 st1 {v2.S}[1], [x5], x1
776 function ff_weight_h264_pixels_\w\()_neon, export=1