/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"
#include "neon.S"
// The main loop filter macro is templated and can produce filters for
// vectors of 8 or 16 bytes. The register mapping throughout the filter
// is close to identical to the arm version (please try to maintain this,
// if either is changed!). When the arm version uses e.g. d20 for the
// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending
// on the vector size.
//
// The number of elements in the vector is passed in via the macro parameter
// \sz, which is either .8b or .16b. For simple instructions that don't
// lengthen or narrow things, this can easily be templated like this:
//      uabd            v4\sz,  v20\sz, v21\sz
//
// For instructions that lengthen or narrow content, the arm version would
// have used q registers. For these instructions, we have macros that expand
// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2
// pair, depending on the \sz parameter. Wherever the arm version would have
// used a q register, these macros instead take two v registers, i.e. q3
// is mapped to v6+v7. For the case with 8 byte input vectors, such a
// lengthening operation only stores its result in v6.8h (what was in q3 in
// the arm case), while the 16 byte input vectors will use v6.8h + v7.8h.
// Such a macro invocation would look like this:
//      uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
//
// That is, in the 8 byte input vector case, the second register in these
// register pairs will be unused.
// Unfortunately, this makes the code quite hard to read. For readability,
// see the arm version instead.
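//
// As an illustration (a sketch, not assembled output), the invocation above
// expands to the following for the two possible \sz values:
//      uaddl_sz v8.8h, v9.8h, v17, v18, .8b   ->  uaddl  v8.8h, v17.8b,  v18.8b
//      uaddl_sz v8.8h, v9.8h, v17, v18, .16b  ->  uaddl  v8.8h, v17.8b,  v18.8b
//                                                 uaddl2 v9.8h, v17.16b, v18.16b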
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
        add             \dst1,  \in1,  \in3
.ifc \sz, .16b
        add             \dst2,  \in2,  \in4
.endif
.endm

.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz
        sub             \dst1,  \in1,  \in3
.ifc \sz, .16b
        sub             \dst2,  \in2,  \in4
.endif
.endm

.macro uaddw_sz dst1, dst2, in1, in2, in3, sz
        uaddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        uaddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro usubw_sz dst1, dst2, in1, in2, in3, sz
        usubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        usubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro usubl_sz dst1, dst2, in1, in2, sz
        usubl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        usubl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

.macro sqxtn_sz dst, in1, in2, sz
        sqxtn           \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtn2          \dst\().16b, \in2
.endif
.endm

.macro sqxtun_sz dst, in1, in2, sz
        sqxtun          \dst\().8b,  \in1
.ifc \sz, .16b
        sqxtun2         \dst\().16b, \in2
.endif
.endm

.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz
        mul             \dst1,  \in1,  \in3
.ifc \sz, .16b
        mul             \dst2,  \in2,  \in4
.endif
.endm

.macro saddw_sz dst1, dst2, in1, in2, in3, sz
        saddw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        saddw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro ssubw_sz dst1, dst2, in1, in2, in3, sz
        ssubw           \dst1,  \in1,  \in3\().8b
.ifc \sz, .16b
        ssubw2          \dst2,  \in2,  \in3\().16b
.endif
.endm

.macro uxtl_sz dst1, dst2, in, sz
        uxtl            \dst1,  \in\().8b
.ifc \sz, .16b
        uxtl2           \dst2,  \in\().16b
.endif
.endm

.macro uaddl_sz dst1, dst2, in1, in2, sz
        uaddl           \dst1,  \in1\().8b,  \in2\().8b
.ifc \sz, .16b
        uaddl2          \dst2,  \in1\().16b, \in2\().16b
.endif
.endm

.macro rshrn_sz dst, in1, in2, shift, sz
        rshrn           \dst\().8b,  \in1, \shift
.ifc \sz, .16b
        rshrn2          \dst\().16b, \in2, \shift
.endif
.endm

.macro ushll_sz dst1, dst2, in, shift, sz
        ushll           \dst1,  \in\().8b,  \shift
.ifc \sz, .16b
        ushll2          \dst2,  \in\().16b, \shift
.endif
.endm
// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// When comparing to the arm version, tmpq1 == tmp1 + tmp2,
// tmpq2 == tmp3 + tmp4, etc.
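// For example, with the wd=16 invocations at the bottom of this file the
// tmp registers are v8-v15, so what the arm version kept in the q register
// tmpq1 lives in the pair v8.8h + v9.8h here.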
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        rev16           v1.16b, v0.16b                // E
        rev16           v4.16b, v2.16b                // I
        rev16           v5.16b, v3.16b                // H
        uzp1            v0.16b, v0.16b, v1.16b
        uzp1            v2.16b, v2.16b, v4.16b
        uzp1            v3.16b, v3.16b, v5.16b
        uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
        uabd            v5\sz,  v21\sz, v22\sz        // abs(p2 - p1)
        uabd            v6\sz,  v22\sz, v23\sz        // abs(p1 - p0)
        uabd            v7\sz,  v24\sz, v25\sz        // abs(q0 - q1)
        uabd            \tmp1\sz,  v25\sz, v26\sz     // abs(q1 - q2)
        uabd            \tmp2\sz,  v26\sz, v27\sz     // abs(q2 - q3)
        umax            v4\sz,  v4\sz,  v5\sz
        umax            v5\sz,  v6\sz,  v7\sz
        umax            \tmp1\sz,  \tmp1\sz, \tmp2\sz
        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
        umax            v4\sz,  v4\sz,  v5\sz
        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
        uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
        umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5\sz,  v5\sz,  #1
        cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v5\sz,  v0\sz,  v6\sz         // ... <= E
        and             v4\sz,  v4\sz,  v5\sz         // fm
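        // In scalar terms (a sketch mirroring the reference C code), the
        // mask computed above is:
        //   fm = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= I
        //        && |p0-q0| * 2 + (|p1-q1| >> 1) <= E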
        // If no pixels need filtering, just exit as soon as possible
        uabd            v6\sz,  v20\sz, v23\sz        // abs(p3 - p0)
        uabd            v2\sz,  v21\sz, v23\sz        // abs(p2 - p0)
        uabd            v1\sz,  v22\sz, v23\sz        // abs(p1 - p0)
        uabd            \tmp1\sz,  v25\sz, v24\sz     // abs(q1 - q0)
        uabd            \tmp2\sz,  v26\sz, v24\sz     // abs(q2 - q0)
        uabd            \tmp3\sz,  v27\sz, v24\sz     // abs(q3 - q0)
        umax            v6\sz,  v6\sz,  v2\sz
        umax            v1\sz,  v1\sz,  \tmp1\sz
        umax            \tmp2\sz,  \tmp2\sz, \tmp3\sz

        uabd            v7\sz,  v16\sz, v23\sz        // abs(p7 - p0)
        umax            v6\sz,  v6\sz,  v1\sz
        uabd            v2\sz,  v17\sz, v23\sz        // abs(p6 - p0)
        umax            v6\sz,  v6\sz,  \tmp2\sz
        uabd            v1\sz,  v18\sz, v23\sz        // abs(p5 - p0)
        cmhs            v6\sz,  v0\sz,  v6\sz         // flat8in
        uabd            v8\sz,  v19\sz, v23\sz        // abs(p4 - p0)
        and             v6\sz,  v6\sz,  v4\sz         // flat8in && fm
        uabd            v9\sz,  v28\sz, v24\sz        // abs(q4 - q0)
        bic             v4\sz,  v4\sz,  v6\sz         // fm && !flat8in
        uabd            v10\sz, v29\sz, v24\sz        // abs(q5 - q0)
        uabd            v11\sz, v30\sz, v24\sz        // abs(q6 - q0)
        uabd            v12\sz, v31\sz, v24\sz        // abs(q7 - q0)

        umax            v7\sz,  v7\sz,  v2\sz
        umax            v1\sz,  v1\sz,  v8\sz
        umax            v9\sz,  v9\sz,  v10\sz
        umax            v11\sz, v11\sz, v12\sz
        // The rest of the calculation of flat8out is interleaved below
        // The rest of the calculation of flat8in is interleaved below
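        // In scalar terms, the masks accumulated above and below are
        // (a sketch; F is the flat threshold, 1 for 8 bpp content):
        //   flat8in  = max(|p3-p0|, |p2-p0|, |p1-p0|,
        //                  |q1-q0|, |q2-q0|, |q3-q0|) <= F
        //   flat8out = max(|p7-p0|, |p6-p0|, |p5-p0|, |p4-p0|,
        //                  |q4-q0|, |q5-q0|, |q6-q0|, |q7-q0|) <= F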
        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5\sz,  v22\sz, v23\sz        // abs(p1 - p0)
        umax            v7\sz,  v7\sz,  v1\sz
        umax            v9\sz,  v9\sz,  v11\sz
        umax            v6\sz,  v6\sz,  v1\sz
        uabd            v1\sz,  v25\sz, v24\sz        // abs(q1 - q0)
        umax            v7\sz,  v7\sz,  v9\sz
        umax            v6\sz,  v6\sz,  \tmp2\sz
        usubl_sz        \tmp1\().8h, \tmp2\().8h, v22, v25, \sz // p1 - q1
        umax            v5\sz,  v5\sz,  v1\sz         // max(abs(p1 - p0), abs(q1 - q0))
        usubl_sz        \tmp3\().8h, \tmp4\().8h, v24, v23, \sz // q0 - p0
        cmhs            v6\sz,  v0\sz,  v6\sz         // flat8in
        cmhs            v5\sz,  v3\sz,  v5\sz         // !hev
        // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
        and             v6\sz,  v6\sz,  v1.16b
        and             v6\sz,  v6\sz,  v4\sz         // flat8in && fm
        sqxtn_sz        \tmp1,  \tmp1\().8h, \tmp2\().8h, \sz // av_clip_int8(p1 - q1)
        cmhs            v7\sz,  v0\sz,  v7\sz         // flat8out
        bic             v4\sz,  v4\sz,  v6\sz         // fm && !flat8in
        and             v5\sz,  v5\sz,  v4\sz         // !hev && fm && !flat8in
        and             v7\sz,  v7\sz,  v6\sz         // flat8out && flat8in && fm
        mul_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0)
        bic             \tmp1\sz,  \tmp1\sz, v5\sz    // if (!hev) av_clip_int8 = 0
        saddw_sz        \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        sqxtn_sz        \tmp1,  \tmp3\().8h, \tmp4\().8h, \sz // f
        bic             v6\sz,  v6\sz,  v7\sz         // fm && flat8in && !flat8out

        sqadd           \tmp3\sz,  \tmp1\sz, v2\sz    // FFMIN(f + 4, 127)
        sqadd           \tmp4\sz,  \tmp1\sz, v3\sz    // FFMIN(f + 3, 127)
        uxtl_sz         v0.8h,  v1.8h,  v23, \sz      // p0
        sshr            \tmp3\sz,  \tmp3\sz, #3       // f1
        sshr            \tmp4\sz,  \tmp4\sz, #3       // f2
        uxtl_sz         v2.8h,  v3.8h,  v24, \sz      // q0
        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp4, \sz // p0 + f2
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q0 - f1
        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz      // out p0
        sqxtun_sz       v1,  v2.8h,  v3.8h,  \sz      // out q0
        srshr           \tmp3\sz,  \tmp3\sz, #1       // f = (f1 + 1) >> 1
        bit             v23\sz, v0\sz,  v4\sz         // if (fm && !flat8in)
        bit             v24\sz, v1\sz,  v4\sz

        uxtl_sz         v0.8h,  v1.8h,  v22, \sz      // p1
        uxtl_sz         v2.8h,  v3.8h,  v25, \sz      // q1

        saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
        ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
        sqxtun_sz       v0,  v0.8h,  v1.8h,  \sz      // out p1
        sqxtun_sz       v2,  v2.8h,  v3.8h,  \sz      // out q1

        bit             v22\sz, v0\sz,  v5\sz         // if (!hev && fm && !flat8in)
        bit             v25\sz, v2\sz,  v5\sz
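        // Scalar sketch of the inner filter applied above:
        //   f  = hev ? av_clip_int8(p1 - q1) : 0;
        //   f  = av_clip_int8(3 * (q0 - p0) + f);
        //   f1 = FFMIN(f + 4, 127) >> 3;
        //   f2 = FFMIN(f + 3, 127) >> 3;
        //   p0 = av_clip_uint8(p0 + f2);   q0 = av_clip_uint8(q0 - f1);
        //   if (!hev) { f = (f1 + 1) >> 1; p1 += f; q1 -= f; }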
        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v20, v21, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v22, v25, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h, v20, v22, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h, v23, v26, \sz
        add_sz          v0.8h,  v1.8h,  \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v23, \sz
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v24, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp5\().8h, \tmp6\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #3,  \sz // out p2

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v20, v23, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v24, v27, \sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #3,  \sz // out p1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        uaddl_sz        \tmp5\().8h, \tmp6\().8h, v21, v24, \sz
        uaddl_sz        \tmp7\().8h, \tmp8\().8h, v25, v27, \sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #3,  \sz // out p0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        sub_sz          \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz
        uaddl_sz        \tmp1\().8h, \tmp2\().8h, v22, v25, \sz
        uaddl_sz        \tmp3\().8h, \tmp4\().8h, v26, v27, \sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #3,  \sz // out q0

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp7\().8h, \tmp8\().8h, \sz
        sub_sz          \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz
        rshrn_sz        \tmp5,  v0.8h,  v1.8h,  #3,  \sz // out q1

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3\().8h, \tmp4\().8h, \sz
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21\sz, v2\sz,  v6\sz
        bit             v22\sz, v3\sz,  v6\sz
        bit             v23\sz, v4\sz,  v6\sz
        rshrn_sz        \tmp6,  v0.8h,  v1.8h,  #3,  \sz // out q2
        bit             v24\sz, v5\sz,  v6\sz
        bit             v25\sz, \tmp5\sz, v6\sz
        bit             v26\sz, \tmp6\sz, v6\sz
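        // The flat8 path implements the reference 7-tap smoother; the first
        // output computed above is
        //   op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
        // and each following output slides this window one pixel towards q3.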
        orr             v2\sz,  v6\sz,  v7\sz

        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels

        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the
        // input, possibly updated by the flat8 part above).
        ushll_sz        v0.8h,  v1.8h,  v16, #3, \sz  // 8 * v16
        usubw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v16, \sz // 7 * v16
        uaddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  v17, \sz
        uaddl_sz        v8.8h,  v9.8h,  v17, v18, \sz
        uaddl_sz        v10.8h, v11.8h, v19, v20, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v8.8h,  v9.8h,  \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v17, \sz
        uaddl_sz        v12.8h, v13.8h, v21, v22, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v18, v25, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v24, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v18, \sz
        uaddl_sz        v14.8h, v15.8h, v19, v26, \sz
        rshrn_sz        v2,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v19, \sz
        uaddl_sz        v10.8h, v11.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v2\sz,  v17\sz, v7\sz
        rshrn_sz        v3,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v20, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v3\sz,  v18\sz, v7\sz
        rshrn_sz        v4,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v8.8h,  v9.8h,  v16, v21, \sz
        uaddl_sz        v10.8h, v11.8h, v22, v29, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        bif             v4\sz,  v19\sz, v7\sz
        rshrn_sz        v5,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v16, v22, \sz
        uaddl_sz        v14.8h, v15.8h, v23, v30, \sz
        sub_sz          v10.8h, v11.8h, v10.8h, v11.8h, v8.8h,  v9.8h,  \sz
        bif             v5\sz,  v20\sz, v7\sz
        rshrn_sz        v6,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        uaddl_sz        v10.8h, v11.8h, v16, v23, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v24, v31, \sz
        bif             v6\sz,  v21\sz, v7\sz
        rshrn_sz        v8,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        sub_sz          v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v17, v24, \sz
        uaddl_sz        v14.8h, v15.8h, v25, v31, \sz
        bif             v8\sz,  v22\sz, v7\sz
        rshrn_sz        v9,  v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v10.8h, v11.8h, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v26, v31, \sz
        bif             v9\sz,  v23\sz, v7\sz
        rshrn_sz        v10, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v18, v25, \sz
        uaddl_sz        v18.8h, v19.8h, v19, v26, \sz
        sub_sz          v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v27, v31, \sz
        bif             v10\sz, v24\sz, v7\sz
        rshrn_sz        v11, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v12.8h, v13.8h, \sz
        uaddl_sz        v12.8h, v13.8h, v20, v27, \sz
        sub_sz          v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v28, v31, \sz
        bif             v11\sz, v25\sz, v7\sz
        sub_sz          v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz
        rshrn_sz        v12, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v14.8h, v15.8h, \sz
        uaddl_sz        v14.8h, v15.8h, v21, v28, \sz
        uaddl_sz        v20.8h, v21.8h, v29, v31, \sz
        bif             v12\sz, v26\sz, v7\sz
        rshrn_sz        v13, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v18.8h, v19.8h, \sz
        sub_sz          v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz
        uaddl_sz        v18.8h, v19.8h, v22, v29, \sz
        uaddl_sz        v22.8h, v23.8h, v30, v31, \sz
        bif             v13\sz, v27\sz, v7\sz
        rshrn_sz        v14, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v20.8h, v21.8h, \sz
        sub_sz          v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz
        bif             v14\sz, v28\sz, v7\sz
        rshrn_sz        v15, v0.8h,  v1.8h,  #4,  \sz

        add_sz          v0.8h,  v1.8h,  v0.8h,  v1.8h,  v22.8h, v23.8h, \sz
        bif             v15\sz, v29\sz, v7\sz
        rshrn_sz        v17, v0.8h,  v1.8h,  #4,  \sz
        bif             v17\sz, v30\sz, v7\sz
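        // The flat16 path implements the reference 15-tap smoother; the first
        // output is
        //   op6 = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // with each later output sliding the window one pixel towards q7.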
// For wd <= 8, we use v16-v19 and v28-v31 as temp registers,
// while for wd=16 we need those for inputs/outputs and use v8-v15
// as temp registers instead.
function vp9_loop_filter_4
        loop_filter     4,  .8b,  0,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_4_16b_mix_44
        loop_filter     4,  .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_8
        loop_filter     8,  .8b,  0,  v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_8_16b_mix
        loop_filter     8,  .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31

function vp9_loop_filter_16
        loop_filter     16, .8b,  0,  v8,  v9,  v10, v11, v12, v13, v14, v15
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
function vp9_loop_filter_16_16b
        loop_filter     16, .16b, 0,  v8,  v9,  v10, v11, v12, v13, v14, v15

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_4_16b_mix mix
        bl              vp9_loop_filter_4_16b_mix_\mix
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_8_16b_mix mix
        // calculate alternative 'return' targets
.if \mix == 48
        mov             x11, #0xffffffff00000000
.elseif \mix == 84
        mov             x11, #0x00000000ffffffff
.else
        mov             x11, #0xffffffffffffffff
.endif
        bl              vp9_loop_filter_8_16b_mix
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        bl              vp9_loop_filter_16
.endm

.macro loop_filter_16_16b
        // calculate alternative 'return' targets
        bl              vp9_loop_filter_16_16b
.endm
// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
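//
// In terms of the calling convention this means x0 = dst, x1 = stride,
// w2 = mb_lim (E), w3 = lim (I) and w4 = hev_thr (H). As an illustration,
// a hypothetical C caller (not part of this file) of the wd=4 vertical
// filter would look like:
//      ff_vp9_loop_filter_v_4_8_neon(dst, stride, E, I, H);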
function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
function ff_vp9_loop_filter_v_44_16_neon, export=1
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4_16b_mix 44

        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
function ff_vp9_loop_filter_h_4_8_neon, export=1
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        // Move x0/x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #2
        add             x0,  x0,  #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // We will only write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
function ff_vp9_loop_filter_h_44_16_neon, export=1
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4_16b_mix 44

        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1
function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v27.8b}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2

        loop_filter_8

        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1

        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v27.16b}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2

        loop_filter_8_16b_mix \mix

        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1

        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
function ff_vp9_loop_filter_h_8_8_neon, export=1
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2

        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1

        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.

        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
        add             x0,  x9,  x1, lsl #3
        ld1             {v20.8b},   [x9], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.8b},   [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.8b},   [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.8b},   [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.8b},   [x9], x1
        ld1             {v27.d}[1], [x0], x1

        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8_16b_mix \mix

        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b},   [x9], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.8b},   [x9], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.8b},   [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.8b},   [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.8b},   [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.8b},   [x9], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.8b},   [x9], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.8b},   [x9], x1
        st1             {v27.d}[1], [x0], x1

        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1
function ff_vp9_loop_filter_v_16_8_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
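        // Note: the AAPCS64 requires only the bottom 64 bits of v8-v15 to be
        // preserved across calls, so saving d8-d15 (rather than the full q
        // registers) is enough, even though the wd=16 filter clobbers the
        // full 128 bit registers.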
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8b}, [x9], x1 // p7
        ld1             {v24.8b}, [x0], x1 // q0
        ld1             {v17.8b}, [x9], x1 // p6
        ld1             {v25.8b}, [x0], x1 // q1
        ld1             {v18.8b}, [x9], x1 // p5
        ld1             {v26.8b}, [x0], x1 // q2
        ld1             {v19.8b}, [x9], x1 // p4
        ld1             {v27.8b}, [x0], x1 // q3
        ld1             {v20.8b}, [x9], x1 // p3
        ld1             {v28.8b}, [x0], x1 // q4
        ld1             {v21.8b}, [x9], x1 // p2
        ld1             {v29.8b}, [x0], x1 // q5
        ld1             {v22.8b}, [x9], x1 // p1
        ld1             {v30.8b}, [x0], x1 // q6
        ld1             {v23.8b}, [x9], x1 // p0
        ld1             {v31.8b}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        loop_filter_16
        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8b},  [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1

        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
function ff_vp9_loop_filter_v_16_16_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.16b}, [x9], x1 // p7
        ld1             {v24.16b}, [x0], x1 // q0
        ld1             {v17.16b}, [x9], x1 // p6
        ld1             {v25.16b}, [x0], x1 // q1
        ld1             {v18.16b}, [x9], x1 // p5
        ld1             {v26.16b}, [x0], x1 // q2
        ld1             {v19.16b}, [x9], x1 // p4
        ld1             {v27.16b}, [x0], x1 // q3
        ld1             {v20.16b}, [x9], x1 // p3
        ld1             {v28.16b}, [x0], x1 // q4
        ld1             {v21.16b}, [x9], x1 // p2
        ld1             {v29.16b}, [x0], x1 // q5
        ld1             {v22.16b}, [x9], x1 // p1
        ld1             {v30.16b}, [x0], x1 // q6
        ld1             {v23.16b}, [x9], x1 // p0
        ld1             {v31.16b}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3

        loop_filter_16_16b
        st1             {v2.16b},  [x9], x1
        st1             {v10.16b}, [x0], x1
        st1             {v3.16b},  [x9], x1
        st1             {v11.16b}, [x0], x1
        st1             {v4.16b},  [x9], x1
        st1             {v12.16b}, [x0], x1
        st1             {v5.16b},  [x9], x1
        st1             {v13.16b}, [x0], x1
        st1             {v6.16b},  [x9], x1
        st1             {v14.16b}, [x0], x1
        st1             {v8.16b},  [x9], x1
        st1             {v15.16b}, [x0], x1
        st1             {v9.16b},  [x9], x1
        st1             {v17.16b}, [x0], x1

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

        add             x9,  x9,  x1, lsl #2
        st1             {v21.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v22.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v26.16b}, [x0], x1

        sub             x9,  x0,  x1, lsl #1
        st1             {v22.16b}, [x9], x1
        st1             {v24.16b}, [x0], x1
        st1             {v23.16b}, [x9], x1
        st1             {v25.16b}, [x0], x1
function ff_vp9_loop_filter_h_16_8_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        ld1             {v16.8b}, [x9], x1
        ld1             {v24.8b}, [x0], x1
        ld1             {v17.8b}, [x9], x1
        ld1             {v25.8b}, [x0], x1
        ld1             {v18.8b}, [x9], x1
        ld1             {v26.8b}, [x0], x1
        ld1             {v19.8b}, [x9], x1
        ld1             {v27.8b}, [x0], x1
        ld1             {v20.8b}, [x9], x1
        ld1             {v28.8b}, [x0], x1
        ld1             {v21.8b}, [x9], x1
        ld1             {v29.8b}, [x0], x1
        ld1             {v22.8b}, [x9], x1
        ld1             {v30.8b}, [x0], x1
        ld1             {v23.8b}, [x9], x1
        ld1             {v31.8b}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks: the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of these, to get one column per register.
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8B  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16
        transpose_8x8B  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8B  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8b}, [x9], x1
        st1             {v10.8b}, [x0], x1
        st1             {v2.8b},  [x9], x1
        st1             {v11.8b}, [x0], x1
        st1             {v3.8b},  [x9], x1
        st1             {v12.8b}, [x0], x1
        st1             {v4.8b},  [x9], x1
        st1             {v13.8b}, [x0], x1
        st1             {v5.8b},  [x9], x1
        st1             {v14.8b}, [x0], x1
        st1             {v6.8b},  [x9], x1
        st1             {v15.8b}, [x0], x1
        st1             {v8.8b},  [x9], x1
        st1             {v17.8b}, [x0], x1
        st1             {v9.8b},  [x9], x1
        st1             {v31.8b}, [x0], x1

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        // The same writeback as in loop_filter_h_8_8

        add             x0,  x9,  x1, lsl #2
        transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b}, [x9], x1
        st1             {v24.8b}, [x0], x1
        st1             {v21.8b}, [x9], x1
        st1             {v25.8b}, [x0], x1
        st1             {v22.8b}, [x9], x1
        st1             {v26.8b}, [x0], x1
        st1             {v23.8b}, [x9], x1
        st1             {v27.8b}, [x0], x1

        // The same writeback as in loop_filter_h_4_8

        add             x0,  x9,  x1, lsl #2
        transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[1], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[1], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[1], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[1], [x0], x1
function ff_vp9_loop_filter_h_16_16_neon, export=1
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        ld1             {v16.8b},   [x9], x1
        ld1             {v24.8b},   [x0], x1
        ld1             {v17.8b},   [x9], x1
        ld1             {v25.8b},   [x0], x1
        ld1             {v18.8b},   [x9], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v19.8b},   [x9], x1
        ld1             {v27.8b},   [x0], x1
        ld1             {v20.8b},   [x9], x1
        ld1             {v28.8b},   [x0], x1
        ld1             {v21.8b},   [x9], x1
        ld1             {v29.8b},   [x0], x1
        ld1             {v22.8b},   [x9], x1
        ld1             {v30.8b},   [x0], x1
        ld1             {v23.8b},   [x9], x1
        ld1             {v31.8b},   [x0], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v19.d}[1], [x9], x1
        ld1             {v27.d}[1], [x0], x1
        ld1             {v20.d}[1], [x9], x1
        ld1             {v28.d}[1], [x0], x1
        ld1             {v21.d}[1], [x9], x1
        ld1             {v29.d}[1], [x0], x1
        ld1             {v22.d}[1], [x9], x1
        ld1             {v30.d}[1], [x0], x1
        ld1             {v23.d}[1], [x9], x1
        ld1             {v31.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #4
        sub             x9,  x9,  x1, lsl #4

        transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16_16b
        transpose_8x16B v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8b},   [x9], x1
        st1             {v10.8b},   [x0], x1
        st1             {v2.8b},    [x9], x1
        st1             {v11.8b},   [x0], x1
        st1             {v3.8b},    [x9], x1
        st1             {v12.8b},   [x0], x1
        st1             {v4.8b},    [x9], x1
        st1             {v13.8b},   [x0], x1
        st1             {v5.8b},    [x9], x1
        st1             {v14.8b},   [x0], x1
        st1             {v6.8b},    [x9], x1
        st1             {v15.8b},   [x0], x1
        st1             {v8.8b},    [x9], x1
        st1             {v17.8b},   [x0], x1
        st1             {v9.8b},    [x9], x1
        st1             {v31.8b},   [x0], x1
        st1             {v16.d}[1], [x9], x1
        st1             {v10.d}[1], [x0], x1
        st1             {v2.d}[1],  [x9], x1
        st1             {v11.d}[1], [x0], x1
        st1             {v3.d}[1],  [x9], x1
        st1             {v12.d}[1], [x0], x1
        st1             {v4.d}[1],  [x9], x1
        st1             {v13.d}[1], [x0], x1
        st1             {v5.d}[1],  [x9], x1
        st1             {v14.d}[1], [x0], x1
        st1             {v6.d}[1],  [x9], x1
        st1             {v15.d}[1], [x0], x1
        st1             {v8.d}[1],  [x9], x1
        st1             {v17.d}[1], [x0], x1
        st1             {v9.d}[1],  [x9], x1
        st1             {v31.d}[1], [x0], x1

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        add             x0,  x9,  x1, lsl #3
        transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8b},   [x9], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.8b},   [x9], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.8b},   [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.8b},   [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.8b},   [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.8b},   [x9], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.8b},   [x9], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.8b},   [x9], x1
        st1             {v27.d}[1], [x0], x1

        add             x0,  x9,  x1, lsl #3
        transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.s}[0], [x9], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x9], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x9], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x9], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x9], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x9], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x9], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x9], x1
        st1             {v25.s}[3], [x0], x1