/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 #include "libavutil/aarch64/asm.S"
// h264_loop_filter_start: preamble invoked first by the non-intra luma and
// chroma filter functions in this file.
// The two visible instructions fold the four bytes of w6 together so that the
// top byte of w8 ends up as the AND of all four bytes; because the second one
// is the flag-setting `ands`, N is set only when every byte of w6 has its
// sign bit set.
// NOTE(review): w6 is not written in the lines visible here (presumably the
// four per-edge clip bytes -- confirm), and neither a branch on the flags nor
// `.endm` is visible; verify this macro against the upstream file.
26 .macro h264_loop_filter_start
// w8 = w6 & (w6 << 16)
31 and w8, w6, w6, lsl #16
// w8 &= w8 << 8, updating the condition flags
33 ands w8, w8, w8, lsl #8
// h264_loop_filter_luma: filters 16 pixel positions across one edge.
// Register roles, per the operand comments below:
//   v16 = p0, v18 = p1, v20 = p2, v0 = q0, v2 = q1, v4 = q2
//   w2 = alpha, w3 = beta
//   v24 = per-position clip limit, preloaded by the caller (presumably tc0;
//         its load is not visible here -- confirm)
// Results visible in this excerpt: v17 (p1 result), v19 (q1 result),
// v16 (p0 result) and the upper half of v0 (q0 result).
// The instruction streams for the different sub-computations are interleaved,
// so statement order matters.
// NOTE(review): several widening/initialisation steps and the `.endm` are not
// visible in this excerpt; confirm against upstream before relying on it.
40 .macro h264_loop_filter_luma
41 dup v22.16B, w2 // alpha
43 uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
45 uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
// The two sli steps (#8 on halfwords, #16 on words) replicate the low byte of
// each 32-bit lane of v24 into all four bytes of that lane.
46 sli v24.8H, v24.8H, #8
47 uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
48 sli v24.4S, v24.4S, #16
49 cmhi v21.16B, v22.16B, v21.16B // < alpha
50 dup v22.16B, w3 // beta
// v23 = bytes of v24 that are negative; bic below removes those positions
// from the filter mask v21.
51 cmlt v23.16B, v24.16B, #0
52 cmhi v28.16B, v22.16B, v28.16B // < beta
53 cmhi v30.16B, v22.16B, v30.16B // < beta
54 bic v21.16B, v21.16B, v23.16B
55 uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
56 and v21.16B, v21.16B, v28.16B
57 uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
58 and v21.16B, v21.16B, v30.16B // < beta
// Pack the 128-bit mask into 64 bits (4 bits per byte position) in v30.
// NOTE(review): no consumer of v30 is visible in this excerpt.
59 shrn v30.8b, v21.8h, #4
61 cmhi v17.16B, v22.16B, v17.16B // < beta
62 cmhi v19.16B, v22.16B, v19.16B // < beta
// Restrict the p2/q2 conditions and the clip limit to positions where the
// main mask v21 is set.
64 and v17.16B, v17.16B, v21.16B
65 and v19.16B, v19.16B, v21.16B
66 and v24.16B, v24.16B, v21.16B
// v28 = (p0 + q0 + 1) >> 1
67 urhadd v28.16B, v16.16B, v0.16B
// v21 = v24 - v17 - v19: the masks are 0x00/0xFF per byte, so subtracting
// them adds 1 to the clip limit for each side condition that holds.
68 sub v21.16B, v24.16B, v17.16B
// p1 candidate (v23): (p2 + v28) >> 1 clamped to [p1 - v24, p1 + v24],
// with saturating add/sub for the bounds.
69 uqadd v23.16B, v18.16B, v24.16B
70 uhadd v20.16B, v20.16B, v28.16B
71 sub v21.16B, v21.16B, v19.16B
// q1 candidate (v28): (q2 + v28) >> 1 clamped to [q1 - v24, q1 + v24].
72 uhadd v28.16B, v4.16B, v28.16B
73 umin v23.16B, v23.16B, v20.16B
74 uqsub v22.16B, v18.16B, v24.16B
75 uqadd v4.16B, v2.16B, v24.16B
76 umax v23.16B, v23.16B, v22.16B
77 uqsub v22.16B, v2.16B, v24.16B
78 umin v28.16B, v4.16B, v28.16B
80 umax v28.16B, v28.16B, v22.16B
// 16-bit delta computation in v4 (low half) / v20 (high half): subtract p0,
// add p1, subtract q1, then rounding-narrow by 3.
// NOTE(review): the widening that initialises v4/v20 as 16-bit lanes and the
// matching shift of v4 are not visible here -- confirm against upstream.
82 usubw v4.8H, v4.8H, v16.8B
83 usubw2 v20.8H, v20.8H, v16.16B
85 shl v20.8H, v20.8H, #2
86 uaddw v4.8H, v4.8H, v18.8B
87 uaddw2 v20.8H, v20.8H, v18.16B
88 usubw v4.8H, v4.8H, v2.8B
89 usubw2 v20.8H, v20.8H, v2.16B
90 rshrn v4.8B, v4.8H, #3
91 rshrn2 v4.16B, v20.8H, #3
// Select per byte: v17 = mask ? p1 candidate : p1, v19 = mask ? q1 candidate : q1.
92 bsl v17.16B, v23.16B, v18.16B
93 bsl v19.16B, v28.16B, v2.16B
// Clamp the signed delta: upper bound v21, lower bound v23.
// NOTE(review): v23 is expected to hold the lower bound here; the instruction
// that sets it is not visible in this excerpt.
96 smin v4.16B, v4.16B, v21.16B
98 smax v4.16B, v4.16B, v23.16B
// Add the delta to the 16-bit values in v28/v21 and subtract it from those in
// v22/v24, then narrow with unsigned saturation into v16 and v0.
// NOTE(review): the widening of p0/q0 into these registers and the narrowing
// into the low half of v0 are not visible here.
101 saddw v28.8H, v28.8H, v4.8B
102 saddw2 v21.8H, v21.8H, v4.16B
103 ssubw v22.8H, v22.8H, v4.8B
104 ssubw2 v24.8H, v24.8H, v4.16B
105 sqxtun v16.8B, v28.8H
106 sqxtun2 v16.16B, v21.8H
108 sqxtun2 v0.16B, v24.8H
// ff_h264_v_loop_filter_luma_neon: luma filter working on whole 16-byte rows
// addressed through x0 with x1 as the row step; w2/w3 are consumed as
// alpha/beta by h264_loop_filter_luma.
// NOTE(review): only three result rows are stored in the visible lines and no
// `ret`/`endfunc` is visible -- check against upstream.
111 function ff_h264_v_loop_filter_luma_neon, export=1
112 h264_loop_filter_start
// Three consecutive rows starting at x0: q0, q1, q2 in the macro's naming.
115 ld1 {v0.16B}, [x0], x1
116 ld1 {v2.16B}, [x0], x1
117 ld1 {v4.16B}, [x0], x1
// x0 is now 3 rows past the first load; stepping back 4 + 2 rows puts it
// 3 rows before the first load.
118 sub x0, x0, x1, lsl #2
119 sub x0, x0, x1, lsl #1
// The three rows before the first load: p2, p1, p0.
120 ld1 {v20.16B}, [x0], x1
121 ld1 {v18.16B}, [x0], x1
122 ld1 {v16.16B}, [x0], x1
124 h264_loop_filter_luma
// x0 is back at the address of the first load; rewind two rows and store
// v17 (p1 result), v16 (p0 result) and v0 (q0 result) on consecutive rows.
126 sub x0, x0, x1, lsl #1
127 st1 {v17.16B}, [x0], x1
128 st1 {v16.16B}, [x0], x1
129 st1 {v0.16B}, [x0], x1
// ff_h264_h_loop_filter_luma_neon: luma filter for the transposed case. It
// reads 8 bytes from each of 16 rows, transposes them so that each register
// holds one pixel column, runs h264_loop_filter_luma, transposes the four
// result registers back and writes 4 bytes to each of the 16 rows.
// transpose_8x16B / transpose_4x16B are not defined in the visible lines
// (presumably they come from the included asm.S).
// NOTE(review): the initial x0 adjustment, any column offset applied before
// the stores, and `ret`/`endfunc` are not visible in this excerpt.
135 function ff_h264_h_loop_filter_luma_neon, export=1
136 h264_loop_filter_start
// Rows 0-7 go into the low 64 bits of each register.
140 ld1 {v6.8B}, [x0], x1
141 ld1 {v20.8B}, [x0], x1
142 ld1 {v18.8B}, [x0], x1
143 ld1 {v16.8B}, [x0], x1
144 ld1 {v0.8B}, [x0], x1
145 ld1 {v2.8B}, [x0], x1
146 ld1 {v4.8B}, [x0], x1
147 ld1 {v26.8B}, [x0], x1
// Rows 8-15 go into the high 64 bits of the same registers.
148 ld1 {v6.D}[1], [x0], x1
149 ld1 {v20.D}[1], [x0], x1
150 ld1 {v18.D}[1], [x0], x1
151 ld1 {v16.D}[1], [x0], x1
152 ld1 {v0.D}[1], [x0], x1
153 ld1 {v2.D}[1], [x0], x1
154 ld1 {v4.D}[1], [x0], x1
155 ld1 {v26.D}[1], [x0], x1
// v21 and v23 are passed as the macro's trailing (scratch) arguments.
157 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
159 h264_loop_filter_luma
// Transpose the four filtered registers (p1, p0, q0, q1 results) back.
161 transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
// Undo the 16 post-increments performed by the loads.
163 sub x0, x0, x1, lsl #4
// One 32-bit lane per row: lanes 0-3 of v17, v16, v0, v19 cover 16 rows.
165 st1 {v17.S}[0], [x0], x1
166 st1 {v16.S}[0], [x0], x1
167 st1 {v0.S}[0], [x0], x1
168 st1 {v19.S}[0], [x0], x1
169 st1 {v17.S}[1], [x0], x1
170 st1 {v16.S}[1], [x0], x1
171 st1 {v0.S}[1], [x0], x1
172 st1 {v19.S}[1], [x0], x1
173 st1 {v17.S}[2], [x0], x1
174 st1 {v16.S}[2], [x0], x1
175 st1 {v0.S}[2], [x0], x1
176 st1 {v19.S}[2], [x0], x1
177 st1 {v17.S}[3], [x0], x1
178 st1 {v16.S}[3], [x0], x1
179 st1 {v0.S}[3], [x0], x1
180 st1 {v19.S}[3], [x0], x1
// h264_loop_filter_start_intra: preamble invoked first by the intra filter
// functions in this file. The visible part broadcasts w2 (alpha) into every
// byte of v30 and w3 (beta) into every byte of v31.
// NOTE(review): any early-exit test and the `.endm` are not visible here.
186 .macro h264_loop_filter_start_intra
192 dup v30.16b, w2 // alpha
193 dup v31.16b, w3 // beta
// h264_loop_filter_luma_intra: intra luma filter over 16 pixel positions.
// Register roles, per the operand comments below:
//   v7 = p0, v6 = p1, v5 = p2, v0 = q0, v1 = q1, v2 = q2
//   v4 and v3 are the outermost samples (labelled p3 / q3 by the loads in
//   ff_h264_v_loop_filter_luma_intra_neon)
//   v30 = alpha, v31 = beta (set by h264_loop_filter_start_intra)
// Results are merged in place into v5, v6, v7, v0, v1, v2 with `bit`.
// NOTE(review): the setup of v29, the mask inversions that the comments on
// v30/v31 imply, any early exit, and `.endm` are not visible in this excerpt.
196 .macro h264_loop_filter_luma_intra
197 uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
198 uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
199 uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
200 cmhi v19.16b, v30.16b, v16.16b // < alpha
201 cmhi v17.16b, v31.16b, v17.16b // < beta
202 cmhi v18.16b, v31.16b, v18.16b // < beta
// v29 is expected to hold the constant 2 in every byte (per the comment on
// the add); the instruction that sets it is not visible here.
205 ushr v30.16b, v30.16b, #2 // alpha >> 2
206 add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
207 cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
// v19 = if_1: all three threshold tests hold.
209 and v19.16b, v19.16b, v17.16b
210 and v19.16b, v19.16b, v18.16b
// Pack the mask into 64 bits (4 bits per byte position).
// NOTE(review): no consumer of this value is visible; v20 is overwritten below.
211 shrn v20.8b, v19.8h, #4
// 16-bit sums: v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1
// (low / high 8 positions respectively).
215 ushll v20.8h, v6.8b, #1
216 ushll v22.8h, v1.8b, #1
217 ushll2 v21.8h, v6.16b, #1
218 ushll2 v23.8h, v1.16b, #1
219 uaddw v20.8h, v20.8h, v7.8b
220 uaddw v22.8h, v22.8h, v0.8b
221 uaddw2 v21.8h, v21.8h, v7.16b
222 uaddw2 v23.8h, v23.8h, v0.16b
223 uaddw v20.8h, v20.8h, v1.8b
224 uaddw v22.8h, v22.8h, v6.8b
225 uaddw2 v21.8h, v21.8h, v1.16b
226 uaddw2 v23.8h, v23.8h, v6.16b
// Rounded >> 2 of those sums gives the first set of p0/q0 results.
228 rshrn v24.8b, v20.8h, #2 // p0'_1
229 rshrn v25.8b, v22.8h, #2 // q0'_1
230 rshrn2 v24.16b, v21.8h, #2 // p0'_1
231 rshrn2 v25.16b, v23.8h, #2 // q0'_1
233 uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
234 uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
235 cmhi v17.16b, v31.16b, v17.16b // < beta
236 cmhi v18.16b, v31.16b, v18.16b // < beta
238 and v17.16b, v16.16b, v17.16b // if_2 && if_3
239 and v18.16b, v16.16b, v18.16b // if_2 && if_4
// NOTE(review): per the comments, v30/v31 should hold the inverted
// (if_2 && if_3) / (if_2 && if_4) masks at this point; the instructions that
// produce them are not visible in this excerpt.
244 and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
245 and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
247 and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
248 and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
// P side. v26/v27 = p2 + p0 + q0; added to the earlier 2*p1 + p0 + q1 sum
// plus one more q0, this gives p2 + 2*p1 + 2*p0 + 2*q0 + q1 for p0'_2.
250 //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
251 uaddl v26.8h, v5.8b, v7.8b
252 uaddl2 v27.8h, v5.16b, v7.16b
253 uaddw v26.8h, v26.8h, v0.8b
254 uaddw2 v27.8h, v27.8h, v0.16b
255 add v20.8h, v20.8h, v26.8h
256 add v21.8h, v21.8h, v27.8h
257 uaddw v20.8h, v20.8h, v0.8b
258 uaddw2 v21.8h, v21.8h, v0.16b
259 rshrn v20.8b, v20.8h, #3 // p0'_2
260 rshrn2 v20.16b, v21.8h, #3 // p0'_2
// v26/v27 += p1, giving p2 + p1 + p0 + q0 for p1'_2.
261 uaddw v26.8h, v26.8h, v6.8b
262 uaddw2 v27.8h, v27.8h, v6.16b
263 rshrn v21.8b, v26.8h, #2 // p1'_2
264 rshrn2 v21.16b, v27.8h, #2 // p1'_2
// 2*(v4 + p2) + (p2 + p1 + p0 + q0) for p2'_2. The result reuses v19, whose
// if_1 mask has already been consumed above.
265 uaddl v28.8h, v4.8b, v5.8b
266 uaddl2 v29.8h, v4.16b, v5.16b
267 shl v28.8h, v28.8h, #1
268 shl v29.8h, v29.8h, #1
269 add v28.8h, v28.8h, v26.8h
270 add v29.8h, v29.8h, v27.8h
271 rshrn v19.8b, v28.8h, #3 // p2'_2
272 rshrn2 v19.16b, v29.8h, #3 // p2'_2
// Q side: mirror image of the P side with p and q swapped.
274 //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
275 uaddl v26.8h, v2.8b, v0.8b
276 uaddl2 v27.8h, v2.16b, v0.16b
277 uaddw v26.8h, v26.8h, v7.8b
278 uaddw2 v27.8h, v27.8h, v7.16b
279 add v22.8h, v22.8h, v26.8h
280 add v23.8h, v23.8h, v27.8h
281 uaddw v22.8h, v22.8h, v7.8b
282 uaddw2 v23.8h, v23.8h, v7.16b
283 rshrn v22.8b, v22.8h, #3 // q0'_2
284 rshrn2 v22.16b, v23.8h, #3 // q0'_2
285 uaddw v26.8h, v26.8h, v1.8b
286 uaddw2 v27.8h, v27.8h, v1.16b
287 rshrn v23.8b, v26.8h, #2 // q1'_2
288 rshrn2 v23.16b, v27.8h, #2 // q1'_2
289 uaddl v28.8h, v2.8b, v3.8b
290 uaddl2 v29.8h, v2.16b, v3.16b
291 shl v28.8h, v28.8h, #1
292 shl v29.8h, v29.8h, #1
293 add v28.8h, v28.8h, v26.8h
294 add v29.8h, v29.8h, v27.8h
295 rshrn v26.8b, v28.8h, #3 // q2'_2
296 rshrn2 v26.16b, v29.8h, #3 // q2'_2
// Merge: `bit` copies bits from the second operand into the destination
// wherever the mask (third operand) is set; other positions keep their value.
298 bit v7.16b, v24.16b, v30.16b // p0'_1
299 bit v0.16b, v25.16b, v31.16b // q0'_1
300 bit v7.16b, v20.16b, v17.16b // p0'_2
301 bit v6.16b, v21.16b, v17.16b // p1'_2
302 bit v5.16b, v19.16b, v17.16b // p2'_2
303 bit v0.16b, v22.16b, v18.16b // q0'_2
304 bit v1.16b, v23.16b, v18.16b // q1'_2
305 bit v2.16b, v26.16b, v18.16b // q2'_2
// ff_h264_v_loop_filter_luma_intra_neon: intra luma filter on whole 16-byte
// rows. Loads four rows starting at x0 (q0..q3) and the four rows before it
// (p3..p0), runs h264_loop_filter_luma_intra, then writes back the six rows
// p2..q2. x1 is the row step.
// NOTE(review): no `ret`/`endfunc` is visible in this excerpt.
308 function ff_h264_v_loop_filter_luma_intra_neon, export=1
309 h264_loop_filter_start_intra
311 ld1 {v0.16b}, [x0], x1 // q0
312 ld1 {v1.16b}, [x0], x1 // q1
313 ld1 {v2.16b}, [x0], x1 // q2
314 ld1 {v3.16b}, [x0], x1 // q3
// x0 is 4 rows past q0; back 8 rows lands on p3.
315 sub x0, x0, x1, lsl #3
316 ld1 {v4.16b}, [x0], x1 // p3
317 ld1 {v5.16b}, [x0], x1 // p2
318 ld1 {v6.16b}, [x0], x1 // p1
// No post-increment on the last load: x0 stays on the p0 row.
319 ld1 {v7.16b}, [x0] // p0
321 h264_loop_filter_luma_intra
// From the p0 row, back two rows to p2, then store six consecutive rows.
323 sub x0, x0, x1, lsl #1
324 st1 {v5.16b}, [x0], x1 // p2
325 st1 {v6.16b}, [x0], x1 // p1
326 st1 {v7.16b}, [x0], x1 // p0
327 st1 {v0.16b}, [x0], x1 // q0
328 st1 {v1.16b}, [x0], x1 // q1
329 st1 {v2.16b}, [x0] // q2
// ff_h264_h_loop_filter_luma_intra_neon: intra luma filter for the transposed
// case. Reads 8 bytes from each of 16 rows, transposes so that v4..v7 and
// v0..v3 each hold one pixel column, filters, transposes back and rewrites
// all 8 bytes of each of the 16 rows.
// transpose_8x16B is not defined in the visible lines (presumably it comes
// from the included asm.S).
// NOTE(review): the initial x0 adjustment and `ret`/`endfunc` are not visible.
334 function ff_h264_h_loop_filter_luma_intra_neon, export=1
335 h264_loop_filter_start_intra
// Rows 0-7 into the low 64 bits.
338 ld1 {v4.8b}, [x0], x1
339 ld1 {v5.8b}, [x0], x1
340 ld1 {v6.8b}, [x0], x1
341 ld1 {v7.8b}, [x0], x1
342 ld1 {v0.8b}, [x0], x1
343 ld1 {v1.8b}, [x0], x1
344 ld1 {v2.8b}, [x0], x1
345 ld1 {v3.8b}, [x0], x1
// Rows 8-15 into the high 64 bits.
346 ld1 {v4.d}[1], [x0], x1
347 ld1 {v5.d}[1], [x0], x1
348 ld1 {v6.d}[1], [x0], x1
349 ld1 {v7.d}[1], [x0], x1
350 ld1 {v0.d}[1], [x0], x1
351 ld1 {v1.d}[1], [x0], x1
352 ld1 {v2.d}[1], [x0], x1
353 ld1 {v3.d}[1], [x0], x1
355 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
357 h264_loop_filter_luma_intra
// Same transpose again to return to row order.
359 transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
// Undo the 16 post-increments performed by the loads.
361 sub x0, x0, x1, lsl #4
362 st1 {v4.8b}, [x0], x1
363 st1 {v5.8b}, [x0], x1
364 st1 {v6.8b}, [x0], x1
365 st1 {v7.8b}, [x0], x1
366 st1 {v0.8b}, [x0], x1
367 st1 {v1.8b}, [x0], x1
368 st1 {v2.8b}, [x0], x1
369 st1 {v3.8b}, [x0], x1
370 st1 {v4.d}[1], [x0], x1
371 st1 {v5.d}[1], [x0], x1
372 st1 {v6.d}[1], [x0], x1
373 st1 {v7.d}[1], [x0], x1
374 st1 {v0.d}[1], [x0], x1
375 st1 {v1.d}[1], [x0], x1
376 st1 {v2.d}[1], [x0], x1
377 st1 {v3.d}[1], [x0], x1
// h264_loop_filter_chroma: non-intra chroma filter over 8 pixel positions.
// Register roles, per the operand comments below:
//   v16 = p0, v18 = p1, v0 = q0, v2 = q1; w2 = alpha, w3 = beta
//   v24 = clip limit, preloaded by the caller (presumably tc0 -- confirm)
// The only result narrowed in the visible lines is v16 (p0 result).
// NOTE(review): the widening that initialises v4, v28 and v22 as 16-bit
// lanes, the setup of v25, the q0 narrowing and `.endm` are not visible in
// this excerpt; confirm against upstream.
382 .macro h264_loop_filter_chroma
383 dup v22.8B, w2 // alpha
384 dup v23.8B, w3 // beta
386 uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
387 uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
388 uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
389 cmhi v26.8B, v22.8B, v26.8B // < alpha
390 cmhi v28.8B, v23.8B, v28.8B // < beta
391 cmhi v30.8B, v23.8B, v30.8B // < beta
// v26 accumulates the filter mask: all three threshold tests must hold.
393 and v26.8B, v26.8B, v28.8B
394 usubw v4.8H, v4.8H, v16.8B
395 and v26.8B, v26.8B, v30.8B
// Copy the low byte of each halfword of v24 into its high byte.
398 sli v24.8H, v24.8H, #8
// 16-bit delta in v4: subtract p0 (above), add p1, subtract q1, then
// rounding-narrow by 3.
399 uaddw v4.8H, v4.8H, v18.8B
401 usubw v4.8H, v4.8H, v2.8B
402 rshrn v4.8B, v4.8H, #3
// Clamp the signed delta between v25 and v24, then zero it where the mask
// is clear. v25 is presumably the negated clip limit (not set here).
403 smin v4.8B, v4.8B, v24.8B
405 smax v4.8B, v4.8B, v25.8B
407 and v4.8B, v4.8B, v26.8B
// Add the delta to v28, subtract it from v22, narrow v28 with unsigned
// saturation into v16.
409 saddw v28.8H, v28.8H, v4.8B
410 ssubw v22.8H, v22.8H, v4.8B
411 sqxtun v16.8B, v28.8H
// ff_h264_v_loop_filter_chroma_neon: chroma filter on whole 8-byte rows.
// Steps x0 back two rows, loads p1, p0, q0 on consecutive rows, runs
// h264_loop_filter_chroma and stores v16 (p0) and v0 (q0). x1 is the row step.
// NOTE(review): the load of q1 (v2, read by the macro) and `ret`/`endfunc`
// are not visible in this excerpt.
415 function ff_h264_v_loop_filter_chroma_neon, export=1
416 h264_loop_filter_start
419 sub x0, x0, x1, lsl #1
420 ld1 {v18.8B}, [x0], x1
421 ld1 {v16.8B}, [x0], x1
422 ld1 {v0.8B}, [x0], x1
425 h264_loop_filter_chroma
// Back two rows from the current x0, then store the p0 and q0 rows.
427 sub x0, x0, x1, lsl #1
428 st1 {v16.8B}, [x0], x1
429 st1 {v0.8B}, [x0], x1
// ff_h264_h_loop_filter_chroma_neon: chroma filter for the transposed case.
// Reads 4 bytes from each of 8 rows, transposes so that v18, v16, v0, v2 hold
// the p1, p0, q0, q1 columns, filters, transposes back and rewrites the same
// 4 bytes of each row.
// The label h_loop_filter_chroma420 is also the branch target used by
// ff_h264_h_loop_filter_chroma422_neon.
// transpose_4x8B is not defined in the visible lines (presumably it comes
// from the included asm.S).
// NOTE(review): the initial x0 adjustment and `ret`/`endfunc` are not visible.
434 function ff_h264_h_loop_filter_chroma_neon, export=1
435 h264_loop_filter_start
439 h_loop_filter_chroma420:
// Rows 0-3 into 32-bit lane 0, rows 4-7 into lane 1.
440 ld1 {v18.S}[0], [x0], x1
441 ld1 {v16.S}[0], [x0], x1
442 ld1 {v0.S}[0], [x0], x1
443 ld1 {v2.S}[0], [x0], x1
444 ld1 {v18.S}[1], [x0], x1
445 ld1 {v16.S}[1], [x0], x1
446 ld1 {v0.S}[1], [x0], x1
447 ld1 {v2.S}[1], [x0], x1
449 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
451 h264_loop_filter_chroma
453 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
// Undo the 8 post-increments performed by the loads.
455 sub x0, x0, x1, lsl #3
456 st1 {v18.S}[0], [x0], x1
457 st1 {v16.S}[0], [x0], x1
458 st1 {v0.S}[0], [x0], x1
459 st1 {v2.S}[0], [x0], x1
460 st1 {v18.S}[1], [x0], x1
461 st1 {v16.S}[1], [x0], x1
462 st1 {v0.S}[1], [x0], x1
463 st1 {v2.S}[1], [x0], x1
// ff_h264_h_loop_filter_chroma422_neon: runs the shared body at
// h_loop_filter_chroma420 twice -- once through `bl` (which returns here)
// and once through a tail `b`.
// NOTE(review): `bl` overwrites x30, and the register setup before and
// between the two passes (plus `endfunc`) is not visible in this excerpt --
// confirm against upstream.
468 function ff_h264_h_loop_filter_chroma422_neon, export=1
470 h264_loop_filter_start
475 bl h_loop_filter_chroma420
479 b h_loop_filter_chroma420
// h264_loop_filter_chroma_intra: intra chroma filter over 8 pixel positions.
// Register roles, per the operand comments below:
//   v16 = p0, v17 = q0, v18 = p1, v19 = q1; v30 = alpha, v31 = beta
// Results are merged in place into v16 (p0) and v17 (q0).
// NOTE(review): any early exit and the `.endm` are not visible here.
482 .macro h264_loop_filter_chroma_intra
483 uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
484 uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
485 uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
486 cmhi v26.8b, v30.8b, v26.8b // < alpha
487 cmhi v27.8b, v31.8b, v27.8b // < beta
488 cmhi v28.8b, v31.8b, v28.8b // < beta
// v26 = filter mask: all three threshold tests hold.
489 and v26.8b, v26.8b, v27.8b
490 and v26.8b, v26.8b, v28.8b
// v4 = 2*p1, v6 = 2*q1 (16-bit).
493 ushll v4.8h, v18.8b, #1
494 ushll v6.8h, v19.8b, #1
// v20 = p0 + q1 + 2*p1, v22 = q0 + p1 + 2*q1.
496 uaddl v20.8h, v16.8b, v19.8b
497 uaddl v22.8h, v17.8b, v18.8b
498 add v20.8h, v20.8h, v4.8h
499 add v22.8h, v22.8h, v6.8h
// Rounded >> 2 with unsigned saturating narrow.
500 uqrshrn v24.8b, v20.8h, #2
501 uqrshrn v25.8b, v22.8h, #2
// Replace p0/q0 only where the mask is set.
502 bit v16.8b, v24.8b, v26.8b
503 bit v17.8b, v25.8b, v26.8b
// ff_h264_v_loop_filter_chroma_intra_neon: intra chroma filter on whole
// 8-byte rows. Steps x0 back two rows, loads p1, p0, q0 on consecutive rows,
// runs h264_loop_filter_chroma_intra and stores v16 (p0) and v17 (q0).
// NOTE(review): the load of q1 (v19, read by the macro) and `ret`/`endfunc`
// are not visible in this excerpt.
506 function ff_h264_v_loop_filter_chroma_intra_neon, export=1
507 h264_loop_filter_start_intra
509 sub x0, x0, x1, lsl #1
510 ld1 {v18.8b}, [x0], x1
511 ld1 {v16.8b}, [x0], x1
512 ld1 {v17.8b}, [x0], x1
515 h264_loop_filter_chroma_intra
// Back two rows from the current x0, then store the p0 and q0 rows.
517 sub x0, x0, x1, lsl #1
518 st1 {v16.8b}, [x0], x1
519 st1 {v17.8b}, [x0], x1
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon: intra chroma filter for the
// transposed case over 4 rows. Rows are read through x4 and results written
// through x0, both stepping by x1.
// NOTE(review): the instructions that derive x4 and adjust x0, and
// `ret`/`endfunc`, are not visible in this excerpt.
525 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
526 h264_loop_filter_start_intra
530 ld1 {v18.8b}, [x4], x1
531 ld1 {v16.8b}, [x4], x1
532 ld1 {v17.8b}, [x4], x1
533 ld1 {v19.8b}, [x4], x1
535 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
537 h264_loop_filter_chroma_intra
// Each st2 writes the byte pair (v16[i], v17[i]) -- the filtered p0 and q0
// of one row -- to two adjacent bytes.
539 st2 {v16.b,v17.b}[0], [x0], x1
540 st2 {v16.b,v17.b}[1], [x0], x1
541 st2 {v16.b,v17.b}[2], [x0], x1
542 st2 {v16.b,v17.b}[3], [x0], x1
// ff_h264_h_loop_filter_chroma_intra_neon: intra chroma filter for the
// transposed case over 8 rows. Rows are read through x4 and results written
// through x0, both stepping by x1.
// The label h_loop_filter_chroma420_intra is also the branch target used by
// ff_h264_h_loop_filter_chroma422_intra_neon.
// NOTE(review): the instructions that derive x4 and adjust x0, and
// `ret`/`endfunc`, are not visible in this excerpt.
548 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
549 h264_loop_filter_start_intra
553 h_loop_filter_chroma420_intra:
// The first four rows are loaded as 8 bytes each; the next four overwrite
// 32-bit lane 1 of the same registers.
554 ld1 {v18.8b}, [x4], x1
555 ld1 {v16.8b}, [x4], x1
556 ld1 {v17.8b}, [x4], x1
557 ld1 {v19.8b}, [x4], x1
558 ld1 {v18.s}[1], [x4], x1
559 ld1 {v16.s}[1], [x4], x1
560 ld1 {v17.s}[1], [x4], x1
561 ld1 {v19.s}[1], [x4], x1
563 transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
565 h264_loop_filter_chroma_intra
// Each st2 writes the byte pair (v16[i], v17[i]) -- the filtered p0 and q0
// of one row -- to two adjacent bytes.
567 st2 {v16.b,v17.b}[0], [x0], x1
568 st2 {v16.b,v17.b}[1], [x0], x1
569 st2 {v16.b,v17.b}[2], [x0], x1
570 st2 {v16.b,v17.b}[3], [x0], x1
571 st2 {v16.b,v17.b}[4], [x0], x1
572 st2 {v16.b,v17.b}[5], [x0], x1
573 st2 {v16.b,v17.b}[6], [x0], x1
574 st2 {v16.b,v17.b}[7], [x0], x1
// ff_h264_h_loop_filter_chroma422_intra_neon: runs the shared body at
// h_loop_filter_chroma420_intra twice -- once through `bl` (which returns
// here) and once through a tail `b`.
// NOTE(review): `bl` overwrites x30, and the setup of x4/x0 for each pass
// (plus `endfunc`) is not visible in this excerpt -- confirm against upstream.
580 function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
581 h264_loop_filter_start_intra
// x5 = x0 + 8 * x1; presumably the start of the second group of 8 rows.
583 add x5, x0, x1, lsl #3
586 bl h_loop_filter_chroma420_intra
589 b h_loop_filter_chroma420_intra
// biweight_16 macs, macd: blends two 16-byte-wide sources, two rows per pass.
//   macd -- widening multiply-accumulate mnemonic applied to rows read via x0
//           with the byte weights in v0
//   macs -- the same for rows read via x1 with the byte weights in v1
// (`\macd\()2` / `\macs\()2` form the upper-half variants of the mnemonics.)
// Accumulators: v4/v6 for the first row, v24/v26 for the second. Each is
// shifted by the per-lane signed amounts in v18, narrowed with unsigned
// saturation and stored via x7. x0, x1 and x7 all step by x2.
// NOTE(review): the weight/accumulator initialisation, the low-half narrowing
// of v4, the loop control and `.endm` are not visible in this excerpt.
592 .macro biweight_16 macs, macd
598 ld1 {v20.16B}, [x0], x2
599 \macd v4.8H, v0.8B, v20.8B
600 \macd\()2 v6.8H, v0.16B, v20.16B
601 ld1 {v22.16B}, [x1], x2
602 \macs v4.8H, v1.8B, v22.8B
603 \macs\()2 v6.8H, v1.16B, v22.16B
605 ld1 {v28.16B}, [x0], x2
607 \macd v24.8H, v0.8B, v28.8B
608 \macd\()2 v26.8H, v0.16B, v28.16B
609 ld1 {v30.16B}, [x1], x2
610 \macs v24.8H, v1.8B, v30.8B
611 \macs\()2 v26.8H, v1.16B, v30.16B
612 sshl v4.8H, v4.8H, v18.8H
613 sshl v6.8H, v6.8H, v18.8H
615 sqxtun2 v4.16B, v6.8H
616 sshl v24.8H, v24.8H, v18.8H
617 sshl v26.8H, v26.8H, v18.8H
618 sqxtun v24.8B, v24.8H
619 sqxtun2 v24.16B, v26.8H
621 st1 {v4.16B}, [x7], x2
623 st1 {v24.16B}, [x7], x2
// biweight_8 macs, macd: 8-byte-wide counterpart of biweight_16, two rows
// per pass.
//   macd -- multiply-accumulate mnemonic for rows read via x0 (weights v0)
//   macs -- the same for rows read via x1 (weights v1)
// Accumulators: v2 for the first row, v20 for the second; both are shifted by
// the per-lane signed amounts in v18. Rows are stored from v2 and v4 via x7.
// x0, x1 and x7 all step by x2.
// NOTE(review): the narrowing steps that should place the results in v2/v4
// before the stores, the accumulator initialisation, the loop control and
// `.endm` are not visible in this excerpt.
628 .macro biweight_8 macs, macd
634 ld1 {v4.8B}, [x0], x2
635 \macd v2.8H, v0.8B, v4.8B
636 ld1 {v5.8B}, [x1], x2
637 \macs v2.8H, v1.8B, v5.8B
638 ld1 {v6.8B}, [x0], x2
639 \macd v20.8H, v0.8B, v6.8B
640 ld1 {v7.8B}, [x1], x2
641 \macs v20.8H, v1.8B, v7.8B
642 sshl v2.8H, v2.8H, v18.8H
644 sshl v20.8H, v20.8H, v18.8H
647 st1 {v2.8B}, [x7], x2
649 st1 {v4.8B}, [x7], x2
// biweight_4 macs, macd: 4-byte-wide counterpart of biweight_16. Two 4-byte
// rows are packed into the two 32-bit lanes of one 64-bit register, so the
// main path handles four rows per pass.
//   macd -- multiply-accumulate mnemonic for rows read via x0 (weights v0)
//   macs -- the same for rows read via x1 (weights v1)
// Accumulators v2 and v20 are shifted by the per-lane signed amounts in v18;
// rows are stored from v2 and v4 via x7. x0, x1 and x7 all step by x2.
// Label 2: is a tail that finishes and stores only the v2 pair of rows
// (presumably taken when just two rows remain -- the branch is not visible).
// NOTE(review): narrowing steps, accumulator initialisation, loop control and
// `.endm` are not visible in this excerpt.
654 .macro biweight_4 macs, macd
660 ld1 {v4.S}[0], [x0], x2
661 ld1 {v4.S}[1], [x0], x2
662 \macd v2.8H, v0.8B, v4.8B
663 ld1 {v5.S}[0], [x1], x2
664 ld1 {v5.S}[1], [x1], x2
665 \macs v2.8H, v1.8B, v5.8B
667 ld1 {v6.S}[0], [x0], x2
668 ld1 {v6.S}[1], [x0], x2
669 \macd v20.8H, v0.8B, v6.8B
670 ld1 {v7.S}[0], [x1], x2
671 ld1 {v7.S}[1], [x1], x2
672 \macs v20.8H, v1.8B, v7.8B
673 sshl v2.8H, v2.8H, v18.8H
675 sshl v20.8H, v20.8H, v18.8H
678 st1 {v2.S}[0], [x7], x2
679 st1 {v2.S}[1], [x7], x2
681 st1 {v4.S}[0], [x7], x2
682 st1 {v4.S}[1], [x7], x2
685 2: sshl v2.8H, v2.8H, v18.8H
687 st1 {v2.S}[0], [x7], x2
688 st1 {v2.S}[1], [x7], x2
// biweight_func w: expands to the exported function
// ff_biweight_h264_pixels_<w>_neon. The body instantiates biweight_<w> four
// times, once for each combination of umlal/umlsl passed as (macs, macd).
// NOTE(review): the argument setup, the branches that pick one of the four
// variants, and the closing `endfunc`/`.endm` are not visible in this
// excerpt. Presumably the variant is chosen from the signs of the two
// weights -- confirm against upstream.
692 .macro biweight_func w
693 function ff_biweight_h264_pixels_\w\()_neon, export=1
// w8 ^= w6 >> 30 (logical shift).
697 eor w8, w8, w6, lsr #30
710 10: biweight_\w umlal, umlal
712 biweight_\w umlal, umlsl
715 biweight_\w umlsl, umlsl
717 biweight_\w umlsl, umlal
// NOTE(review): this is a macro body whose `.macro` line is not visible; it
// uses a macro argument `\add`, and is presumably `weight_16 add` -- confirm.
// Processes two 16-byte rows per pass: each row is read via x0, multiplied by
// the byte weights in v0 into 16-bit lanes, combined with v16 through the
// `\add` mnemonic, rounding-shifted by the per-lane signed amounts in v18,
// narrowed with unsigned saturation and stored via x5. x0 and x5 both step
// by x1.
// The weight/offset setup, the low-half narrowing of v4, the loop control and
// `.endm` are not visible in this excerpt.
728 ld1 {v20.16B}, [x0], x1
729 umull v4.8H, v0.8B, v20.8B
730 umull2 v6.8H, v0.16B, v20.16B
731 ld1 {v28.16B}, [x0], x1
732 umull v24.8H, v0.8B, v28.8B
733 umull2 v26.8H, v0.16B, v28.16B
734 \add v4.8H, v16.8H, v4.8H
735 srshl v4.8H, v4.8H, v18.8H
736 \add v6.8H, v16.8H, v6.8H
737 srshl v6.8H, v6.8H, v18.8H
739 sqxtun2 v4.16B, v6.8H
740 \add v24.8H, v16.8H, v24.8H
741 srshl v24.8H, v24.8H, v18.8H
742 \add v26.8H, v16.8H, v26.8H
743 srshl v26.8H, v26.8H, v18.8H
744 sqxtun v24.8B, v24.8H
745 sqxtun2 v24.16B, v26.8H
746 st1 {v4.16B}, [x5], x1
747 st1 {v24.16B}, [x5], x1
// NOTE(review): this is a macro body whose `.macro` line is not visible; it
// uses a macro argument `\add`, and is presumably `weight_8 add` -- confirm.
// Processes two 8-byte rows per pass: each row is read via x0, multiplied by
// the byte weights in v0, combined with v16 through the `\add` mnemonic and
// rounding-shifted by the per-lane signed amounts in v18. Rows are stored
// from v2 and v4 via x5. x0 and x5 both step by x1.
// The narrowing steps that should place the results in v2/v4 before the
// stores, the loop control and `.endm` are not visible in this excerpt.
755 ld1 {v4.8B}, [x0], x1
756 umull v2.8H, v0.8B, v4.8B
757 ld1 {v6.8B}, [x0], x1
758 umull v20.8H, v0.8B, v6.8B
759 \add v2.8H, v16.8H, v2.8H
760 srshl v2.8H, v2.8H, v18.8H
762 \add v20.8H, v16.8H, v20.8H
763 srshl v20.8H, v20.8H, v18.8H
765 st1 {v2.8B}, [x5], x1
766 st1 {v4.8B}, [x5], x1
// NOTE(review): this is a macro body whose `.macro` line is not visible; it
// uses a macro argument `\add`, and is presumably `weight_4 add` -- confirm.
// Two 4-byte rows are packed into the two 32-bit lanes of one register, so
// the main path handles four rows per pass: multiply by the byte weights in
// v0, combine with v16 through the `\add` mnemonic, rounding-shift by the
// per-lane signed amounts in v18, then store from v2 and v4 via x5. x0 and
// x5 both step by x1.
// Label 2: is a tail that finishes and stores only the v2 pair of rows
// (presumably taken when just two rows remain -- the branch is not visible).
// Narrowing steps, loop control and `.endm` are not visible in this excerpt.
774 ld1 {v4.S}[0], [x0], x1
775 ld1 {v4.S}[1], [x0], x1
776 umull v2.8H, v0.8B, v4.8B
778 ld1 {v6.S}[0], [x0], x1
779 ld1 {v6.S}[1], [x0], x1
780 umull v20.8H, v0.8B, v6.8B
781 \add v2.8H, v16.8H, v2.8H
782 srshl v2.8H, v2.8H, v18.8H
784 \add v20.8H, v16.8H, v20.8H
785 srshl v20.8H, v20.8h, v18.8H
787 st1 {v2.S}[0], [x5], x1
788 st1 {v2.S}[1], [x5], x1
789 st1 {v4.S}[0], [x5], x1
790 st1 {v4.S}[1], [x5], x1
793 2: \add v2.8H, v16.8H, v2.8H
794 srshl v2.8H, v2.8H, v18.8H
796 st1 {v2.S}[0], [x5], x1
797 st1 {v2.S}[1], [x5], x1
802 function ff_weight_h264_pixels_\w\()_neon, export=1