/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h, \r0\().8h, \r1\().8h
        trn2            \t5\().8h, \r0\().8h, \r1\().8h
        trn1            \t6\().8h, \r2\().8h, \r3\().8h
        trn2            \t7\().8h, \r2\().8h, \r3\().8h

        trn1            \r0\().4s, \t4\().4s, \t6\().4s
        trn2            \r2\().4s, \t4\().4s, \t6\().4s
        trn1            \r1\().4s, \t5\().4s, \t7\().4s
        trn2            \r3\().4s, \t5\().4s, \t7\().4s
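
        // As a sketch of the data movement here: the trn1/trn2 .8h pairs
        // interleave 16-bit elements of adjacent rows, and the .4s pass then
        // swaps 32-bit pairs, so the four 8-element rows r0-r3 come out
        // transposed as two 4x4 blocks:
        //   in:  r0 = a0 a1 a2 a3 a4 a5 a6 a7   out: r0 = a0 b0 c0 d0 a4 b4 c4 d4
        //        r1 = b0 b1 b2 b3 b4 b5 b6 b7        r1 = a1 b1 c1 d1 a5 b5 c5 d5
        //        r2 = c0 c1 c2 c3 c4 c5 c6 c7        r2 = a2 b2 c2 d2 a6 b6 c6 d6
        //        r3 = d0 d1 d2 d3 d4 d5 d6 d7        r3 = a3 b3 c3 d3 a7 b7 c7 d7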

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        uabd            v4.8h,  v20.8h, v21.8h  // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h  // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h  // abs(q0 - q1)
        uabd            \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h  // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h   // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h  // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
        cmhs            v4.8h,  v2.8h,  v4.8h   // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h   // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b  // fm
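
        // In scalar form, per the annotations above, the filter-enable mask is
        //   fm = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= I
        //        && |p0-q0| * 2 + (|p1-q1| >> 1) <= E
        // with I in v2 and E in v0, both prescaled for the bit depth by the
        // frontends below.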
        // If no pixels need filtering, just exit as soon as possible
        uabd            v6.8h,  v20.8h, v23.8h  // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h  // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        uabd            \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h, \tmp2\().8h, \tmp3\().8h

        uabd            v7.8h,  v16.8h, v23.8h  // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h  // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h  // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h   // flat8in
        uabd            v8.8h,  v19.8h, v23.8h  // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h  // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h  // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h  // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h  // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
        // The rest of the calculation of flat8in is interleaved below

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h  // abs(p1 - p0)
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v1.8h,  v25.8h, v24.8h  // abs(q1 - q0)
        umax            v7.8h,  v7.8h,  v9.8h
        umax            v6.8h,  v6.8h,  \tmp2\().8h

        dup             \tmp2\().8h, w6         // left shift for saturation
        sub             \tmp1\().8h, v22.8h, v25.8h // p1 - q1
        neg             \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h   // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h, v24.8h, v23.8h // q0 - p0
        cmhs            v6.8h,  v0.8h,  v6.8h   // flat8in
        cmhs            v5.8h,  v3.8h,  v5.8h   // !hev
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
        sqshl           \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        cmhs            v7.8h,  v0.8h,  v7.8h   // flat8out
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
        and             v5.16b, v5.16b, v4.16b  // !hev && fm && !flat8in
        and             v7.16b, v7.16b, v6.16b  // flat8out && flat8in && fm
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
        mul             \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
        add             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        sqshl           \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h, w7         // max pixel value
        bic             v6.16b, v6.16b, v7.16b  // fm && flat8in && !flat8out
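
        // The four decision masks derived above, per their annotations:
        //   v4 = fm && !flat8in             (inner 4 pixel filter)
        //   v5 = !hev && fm && !flat8in     (inner filter, p1/q1 update)
        //   v6 = fm && flat8in && !flat8out (7-tap flat8 filter)
        //   v7 = fm && flat8in && flat8out  (15-tap filter, wd=16 only)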
        ushr            \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
        add             \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add             \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin            \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h, \tmp3\().8h, #3 // f1
        sshr            \tmp4\().8h, \tmp4\().8h, #3 // f2

        add             v0.8h,  v23.8h, \tmp4\().8h // p0 + f2
        sub             v2.8h,  v24.8h, \tmp3\().8h // q0 - f1
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
        smax            v0.8h,  v0.8h,  \tmp5\().8h // out p0
        smax            v2.8h,  v2.8h,  \tmp5\().8h // out q0
        bit             v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h,  v22.8h, \tmp3\().8h // p1 + f
        sub             v2.8h,  v25.8h, \tmp3\().8h // q1 - f
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        smax            v0.8h,  v0.8h,  \tmp5\().8h // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h // out q1
        bit             v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b
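
        // As a scalar sketch of the inner filter implemented above (following
        // the annotations; clip() is the av_clip_int2p(x, BIT_DEPTH - 1)
        // saturation done with the sqshl/sshl pairs, and clamp() is the
        // smin/smax pair against the max pixel value in \tmp6 and the lower
        // bound held in \tmp5):
        //   f   = clip(3 * (q0 - p0) + (hev ? clip(p1 - q1) : 0))
        //   f1  = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   f2  = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   p0' = clamp(p0 + f2),  q0' = clamp(q0 - f1)
        //   if (!hev) { f = (f1 + 1) >> 1; p1' = clamp(p1 + f); q1' = clamp(q1 - f); }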
        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3      // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h, v23.8h
        add             \tmp3\().8h, v24.8h, v27.8h
        urshr           v3.8h,  v0.8h,  #3      // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h, v24.8h
        add             \tmp7\().8h, v25.8h, v27.8h
        urshr           v4.8h,  v0.8h,  #3      // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h, v25.8h
        add             \tmp3\().8h, v26.8h, v27.8h
        urshr           v5.8h,  v0.8h,  #3      // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h, #3  // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h, v0.8h, #3  // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
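
        // As computed above, this is the standard VP9 flat8 filter; in scalar
        // form the first tap is
        //   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        // and each following output reuses the running sum in v0, subtracting
        // the pair that falls out of the 7-tap window and adding the one that
        // enters it (e.g. p1' drops one p3 + p2 and adds p1 + q1).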
        orr             v2.16b, v6.16b, v7.16b

        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels

        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels

        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl             v0.8h,  v16.8h, #3      // 8 * v16
        sub             v0.8h,  v0.8h,  v16.8h  // 7 * v16
        add             v0.8h,  v0.8h,  v17.8h
        add             v8.8h,  v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h,  v0.8h,  v8.8h
        add             v8.8h,  v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h,  v0.8h,  v12.8h
        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h,  #4
        bif             v17.16b, v30.16b, v7.16b
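
        // This is the wd=16 filter, computed with the same sliding-sum
        // approach as the flat8 part; in scalar form the first tap is
        //   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // and each later output updates the 15-tap sum in v0 by subtracting
        // the outgoing pair and adding the incoming one before the next
        // urshr #4.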

// For wd <= 8, we use v16-v19 and v28-v31 as temp registers,
// while for wd=16 we need those for inputs/outputs and use v8-v15
// as temp registers instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        // calculate alternative 'return' targets

.macro loop_filter_16
        // calculate alternative 'return' targets
        bl              vp9_loop_filter_16

// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
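//
// The bpp_frontend macros below adapt these 8 bpp style parameters to
// 10/12 bpp: as seen in the lsl/mov sequences, the E/I/H thresholds in
// w2-w4 are shifted left by bpp - 8, x5 is loaded with 1 << (bpp - 8) and
// x7 with the max pixel value (1 << bpp) - 1, before calling the shared
// 16 bpp filter cores.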

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon

        add             x0,  x0,  x1,  lsl #3

        bl              \func\()_\int_suffix\()_16_neon

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1

        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon

        add             x0,  x0,  x1,  lsl #3

        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12

function vp9_loop_filter_v_4_8_16_neon
        sub             x9,  x0,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v27.8h}, [x0], x1      // q3
        sub             x0,  x0,  x1,  lsl #2
        sub             x9,  x9,  x1,  lsl #1

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        add             x0,  x9,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #3
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x0,  x9,  x1,  lsl #2

        // We will only write back the mid 4 pixels; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
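        // After that transpose, each register holds two output rows of 4
        // pixels each, so the st1 {}.d[0]/[1] stores below write one 64-bit
        // half (four 16-bit pixels) per row: x9 covers the first four rows
        // and x0 (x9 + 4 * stride) the last four.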
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        sub             x9,  x0,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v27.8h}, [x0], x1      // q3
        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #2

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

        sub             x9,  x0,  x1,  lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        add             x0,  x9,  x1,  lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #3

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        add             x0,  x9,  x1,  lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x0,  x9,  x1,  lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        sub             x9,  x0,  x1,  lsl #3
        ld1             {v16.8h}, [x9], x1      // p7
        ld1             {v24.8h}, [x0], x1      // q0
        ld1             {v17.8h}, [x9], x1      // p6
        ld1             {v25.8h}, [x0], x1      // q1
        ld1             {v18.8h}, [x9], x1      // p5
        ld1             {v26.8h}, [x0], x1      // q2
        ld1             {v19.8h}, [x9], x1      // p4
        ld1             {v27.8h}, [x0], x1      // q3
        ld1             {v20.8h}, [x9], x1      // p3
        ld1             {v28.8h}, [x0], x1      // q4
        ld1             {v21.8h}, [x9], x1      // p2
        ld1             {v29.8h}, [x0], x1      // q5
        ld1             {v22.8h}, [x9], x1      // p1
        ld1             {v30.8h}, [x0], x1      // q6
        ld1             {v23.8h}, [x9], x1      // p0
        ld1             {v31.8h}, [x0], x1      // q7
        sub             x9,  x9,  x1,  lsl #3
        sub             x0,  x0,  x1,  lsl #3

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        add             x9,  x9,  x1,  lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers v21-v26.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

        sub             x9,  x0,  x1,  lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #1

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3
        sub             x9,  x9,  x1,  lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // The same writeback as in loop_filter_h_8_8
        add             x0,  x9,  x1,  lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1,  lsl #3

        // The same writeback as in loop_filter_h_4_8
        add             x0,  x9,  x1,  lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1,  lsl #3

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1