/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp            \r1,  \r8  @ vtrn.64 \rq0, \rq4
        vswp            \r3,  \r10 @ vtrn.64 \rq1, \rq5
        vswp            \r5,  \r12 @ vtrn.64 \rq2, \rq6
        vswp            \r7,  \r14 @ vtrn.64 \rq3, \rq7
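        @ NEON has no vtrn.64, so the 64 bit step of this 8x8 transpose is
        @ done by swapping d register halves across the q registers, as the
        @ equivalent vtrn.64 annotations above indicate; vtrn.32 and vtrn.16
        @ steps then complete the transpose within each 4x4 quadrant.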
.macro transpose16_4x4 r0, r1, r2, r3
@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
.macro loop_filter_q wd
        vabd.u16        q2,  q8,  q9   @ abs(p3 - p2)
        vabd.u16        q3,  q9,  q10  @ abs(p2 - p1)
        vabd.u16        q4,  q10, q11  @ abs(p1 - p0)
        vabd.u16        q5,  q12, q13  @ abs(q0 - q1)
        vabd.u16        q6,  q13, q14  @ abs(q1 - q2)
        vabd.u16        q7,  q14, q15  @ abs(q2 - q3)
        vabd.u16        q5,  q11, q12  @ abs(p0 - q0)
        vadd.u16        q5,  q5,  q5   @ abs(p0 - q0) * 2
        vabd.u16        q6,  q10, q13  @ abs(p1 - q1)
        vmax.u16        q2,  q2,  q4   @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vcle.u16        q2,  q2,  q1   @ max(abs()) <= I
        vadd.u16        q5,  q5,  q6   @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        @ If no pixels need filtering, just exit as soon as possible
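        @ For reference, a hedged C sketch of the fm decision computed above
        @ (E and I are the scaled mb_lim/lim thresholds set up by the
        @ frontends):
        @   int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
        @            FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
        @            FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
        @            FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;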
        vabd.u16        q1,  q8,  q11  @ abs(p3 - p0)
        vabd.u16        q3,  q9,  q11  @ abs(p2 - p0)
        vabd.u16        q4,  q10, q11  @ abs(p1 - p0)
        vabd.u16        q5,  q13, q12  @ abs(q1 - q0)
        vabd.u16        q6,  q14, q12  @ abs(q2 - q0)
        vabd.u16        q7,  q15, q12  @ abs(q3 - q0)
        @ The rest of the calculation of flat8in is interleaved below
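        @ flat8in decides whether the 8 wide smoother can be used; as a hedged
        @ C sketch, with F = 1 << (BIT_DEPTH - 8) (the value the frontends
        @ leave in r5):
        @   int flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
        @                 FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
        @                 FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;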
        @ Calculate the normal inner loop filter for 2 or 4 pixels
        @ (p0 and q0, plus p1 and q1 if !hev)
        vabd.u16        q3,  q10, q11  @ abs(p1 - p0)
        vabd.u16        q4,  q13, q12  @ abs(q1 - q0)
        vsub.u16        q5,  q10, q13  @ p1 - q1
        vmax.u16        q3,  q3,  q4   @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        q6,  q12, q11  @ q0 - p0
        vcle.u16        q1,  q1,  q0   @ flat8in
        vdup.u16        q0,  r6        @ left shift for saturation
        vcle.u16        q3,  q3,  q4   @ !hev
        vand            q1,  q1,  q2   @ flat8in && fm
        vneg.s16        q4,  q0        @ negative left shift after saturation
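        @ The av_clip_intp2(x, BIT_DEPTH - 1) clips below are implemented with
        @ 16 bit saturation: a saturating left shift (vqshl) by 16 - BIT_DEPTH
        @ (presumably the value the frontends put in r6), then a plain
        @ arithmetic shift back right. A hedged C equivalent, where sat_s16()
        @ is a hypothetical helper saturating to [-32768, 32767]:
        @   clipped = sat_s16(x << (16 - BIT_DEPTH)) >> (16 - BIT_DEPTH);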
        vbic            q2,  q2,  q1   @ fm && !flat8in
        vand            q3,  q3,  q2   @ !hev && fm && !flat8in
        vshl.s16        q5,  q5,  q4   @ av_clip_intp2(p1 - q1, BIT_DEPTH - 1)
        vmul.s16        q6,  q6,  q7   @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3   @ if (!hev) av_clip_intp2 = 0
        vadd.s16        q6,  q6,  q5   @ 3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)]
        vshl.s16        q6,  q6,  q4   @ av_clip_intp2(3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        q4,  r7        @ max pixel value
        vshr.u16        q4,  q4,  #1   @ (1 << (BIT_DEPTH - 1)) - 1
        vadd.s16        q5,  q6,  q5   @ f + 4
        vadd.s16        q0,  q6,  q0   @ f + 3
        vmin.s16        q5,  q5,  q4   @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        q0,  q0,  q4   @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vdup.u16        q4,  r7        @ max pixel value
        vshr.s16        q5,  q5,  #3   @ f1
        vshr.s16        q0,  q0,  #3   @ f2
        vadd.s16        q0,  q11, q0   @ p0 + f2
        vsub.s16        q7,  q12, q5   @ q0 - f1
        vrshr.s16       q5,  q5,  #1   @ f = (f1 + 1) >> 1
        vmax.s16        q0,  q0,  q6   @ out p0
        vmax.s16        q7,  q7,  q6   @ out q0
        vbit            q11, q0,  q2   @ if (fm && !flat8in)
        vadd.s16        q0,  q10, q5   @ p1 + f
        vsub.s16        q7,  q13, q5   @ q1 - f
        vmax.s16        q0,  q0,  q6   @ out p1
        vmax.s16        q7,  q7,  q6   @ out q1
        vbit            q10, q0,  q3   @ if (!hev && fm && !flat8in)
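        @ A hedged C sketch of the inner filter computed above, following the
        @ generic VP9 code (av_clip_pixel() clips to [0, (1 << BIT_DEPTH) - 1]):
        @   f  = av_clip_intp2(3 * (q0 - p0) +
        @                      (hev ? av_clip_intp2(p1 - q1, BIT_DEPTH - 1) : 0),
        @                      BIT_DEPTH - 1);
        @   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        @   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        @   p0 = av_clip_pixel(p0 + f2);
        @   q0 = av_clip_pixel(q0 - f1);
        @   if (!hev) {
        @       f  = (f1 + 1) >> 1;
        @       p1 = av_clip_pixel(p1 + f);
        @       q1 = av_clip_pixel(q1 - f);
        @   }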
        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
        vadd.u16        q3,  q10, q13
        vadd.u16        q5,  q11, q14
        vrshr.u16       q6,  q0,  #3   @ out p2
        vadd.u16        q3,  q12, q15
        vrshr.u16       q7,  q0,  #3   @ out p1
        vadd.u16        q5,  q13, q15
        vrshr.u16       q6,  q0,  #3   @ out p0
        vadd.u16        q2,  q10, q13
        vadd.u16        q3,  q14, q15
        vrshr.u16       q7,  q0,  #3   @ out q0
        vrshr.u16       q6,  q0,  #3   @ out q1
        vrshr.u16       q7,  q0,  #3   @ out q2
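        @ The flat8 path above keeps a running sum in q0 and emits
        @ (sum + 4) >> 3 with vrshr. A hedged C sketch of the output values:
        @   p2' = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
        @   p1' = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
        @   p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
        @   q0' = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
        @   q1' = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
        @   q2' = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;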
@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ In practice, this macro is only ever instantiated once, so the macro
@ parameters could be hardcoded, but they are kept as is to stay close to
@ the 8 bpp and aarch64 versions.
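@ Each d register holds four 16 bit pixels, so this variant filters 4 pixels
@ per call, while the q register variant above handles 8 at a time.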
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        vabd.u16        d4,  d20, d21  @ abs(p3 - p2)
        vabd.u16        d5,  d21, d22  @ abs(p2 - p1)
        vabd.u16        d6,  d22, d23  @ abs(p1 - p0)
        vabd.u16        d7,  d24, d25  @ abs(q0 - q1)
        vabd.u16        \tmp1, d25, d26 @ abs(q1 - q2)
        vabd.u16        \tmp2, d26, d27 @ abs(q2 - q3)
        vmax.u16        \tmp1, \tmp1, \tmp2
        vabd.u16        d6,  d23, d24  @ abs(p0 - q0)
        vadd.u16        d6,  d6,  d6   @ abs(p0 - q0) * 2
        vabd.u16        d5,  d22, d25  @ abs(p1 - q1)
        vmax.u16        d4,  d4,  \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vcle.u16        d4,  d4,  d2   @ max(abs()) <= I
        vadd.u16        d6,  d6,  d5   @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        @ If no pixels need filtering, just exit as soon as possible
        vabd.u16        d6,  d20, d23  @ abs(p3 - p0)
        vabd.u16        d2,  d21, d23  @ abs(p2 - p0)
        vabd.u16        d1,  d22, d23  @ abs(p1 - p0)
        vabd.u16        \tmp1, d25, d24 @ abs(q1 - q0)
        vabd.u16        \tmp2, d26, d24 @ abs(q2 - q0)
        vabd.u16        \tmp3, d27, d24 @ abs(q3 - q0)
        vmax.u16        d1,  d1,  \tmp1
        vmax.u16        \tmp2, \tmp2, \tmp3
        vabd.u16        d7,  d16, d23  @ abs(p7 - p0)
        vabd.u16        d2,  d17, d23  @ abs(p6 - p0)
        vmax.u16        d6,  d6,  \tmp2
        vabd.u16        d1,  d18, d23  @ abs(p5 - p0)
        vcle.u16        d6,  d6,  d0   @ flat8in
        vabd.u16        d8,  d19, d23  @ abs(p4 - p0)
        vand            d6,  d6,  d4   @ flat8in && fm
        vabd.u16        d9,  d28, d24  @ abs(q4 - q0)
        vbic            d4,  d4,  d6   @ fm && !flat8in
        vabd.u16        d10, d29, d24  @ abs(q5 - q0)
        vabd.u16        d11, d30, d24  @ abs(q6 - q0)
        vabd.u16        d12, d31, d24  @ abs(q7 - q0)
        vmax.u16        d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
        @ The rest of the calculation of flat8in is interleaved below
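        @ flat8out is the analogous flatness test for the 16 wide smoother,
        @ again with F = 1 << (BIT_DEPTH - 8) (a hedged C sketch):
        @   int flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
        @                  FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
        @                  FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
        @                  FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;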
        @ Calculate the normal inner loop filter for 2 or 4 pixels
        @ (p0 and q0, plus p1 and q1 if !hev)
        vabd.u16        d5,  d22, d23  @ abs(p1 - p0)
        vabd.u16        d1,  d25, d24  @ abs(q1 - q0)
        vmax.u16        d6,  d6,  \tmp2
        vdup.u16        \tmp2, r6      @ left shift for saturation
        vsub.u16        \tmp1, d22, d25 @ p1 - q1
        vneg.s16        \tmp6, \tmp2   @ negative left shift after saturation
        vmax.u16        d5,  d5,  d1   @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        \tmp3, d24, d23 @ q0 - p0
        vcle.u16        d6,  d6,  d0   @ flat8in
        vcle.u16        d5,  d5,  d3   @ !hev
        vand            d6,  d6,  d4   @ flat8in && fm
        vqshl.s16       \tmp1, \tmp1, \tmp2
        vcle.u16        d7,  d7,  d0   @ flat8out
        vbic            d4,  d4,  d6   @ fm && !flat8in
        vand            d5,  d5,  d4   @ !hev && fm && !flat8in
        vand            d7,  d7,  d6   @ flat8out && flat8in && fm
        vshl.s16        \tmp1, \tmp1, \tmp6 @ av_clip_intp2(p1 - q1, BIT_DEPTH - 1)
        vmul.s16        \tmp3, \tmp3, \tmp5 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_intp2 = 0
        vadd.s16        \tmp3, \tmp3, \tmp1 @ 3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)]
        vqshl.s16       \tmp1, \tmp3, \tmp2
        vshl.s16        \tmp1, \tmp1, \tmp6 @ av_clip_intp2(3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        \tmp6, r7      @ max pixel value
        vbic            d6,  d6,  d7   @ fm && flat8in && !flat8out
        vshr.u16        \tmp2, \tmp6, #1 @ (1 << (BIT_DEPTH - 1)) - 1
        vadd.s16        \tmp3, \tmp1, d2 @ f + 4
        vadd.s16        \tmp4, \tmp1, d3 @ f + 3
        vmin.s16        \tmp3, \tmp3, \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        \tmp4, \tmp4, \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vshr.s16        \tmp3, \tmp3, #3 @ f1
        vshr.s16        \tmp4, \tmp4, #3 @ f2
        vadd.s16        d0,  d23, \tmp4 @ p0 + f2
        vsub.s16        d2,  d24, \tmp3 @ q0 - f1
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vrshr.s16       \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vmax.s16        d0,  d0,  \tmp5 @ out p0
        vmax.s16        d2,  d2,  \tmp5 @ out q0
        vbit            d23, d0,  d4   @ if (fm && !flat8in)
        vadd.s16        d0,  d22, \tmp3 @ p1 + f
        vsub.s16        d2,  d25, \tmp3 @ q1 - f
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vmax.s16        d0,  d0,  \tmp5 @ out p1
        vmax.s16        d2,  d2,  \tmp5 @ out q1
        vbit            d22, d0,  d5   @ if (!hev && fm && !flat8in)
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        vadd.u16        \tmp1, d20, d21
        vadd.u16        \tmp3, d22, d25
        vadd.u16        \tmp5, d20, d22
        vadd.u16        \tmp7, d23, d26
        vadd.u16        d0,  \tmp1, \tmp1
        vadd.u16        d0,  d0,  \tmp5
        vsub.s16        \tmp3, \tmp3, \tmp1
        vsub.s16        \tmp7, \tmp7, \tmp5
        vrshr.u16       d2,  d0,  #3   @ out p2
        vadd.u16        d0,  d0,  \tmp3
        vadd.u16        \tmp1, d20, d23
        vadd.u16        \tmp3, d24, d27
        vrshr.u16       d3,  d0,  #3   @ out p1
        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3, \tmp3, \tmp1
        vadd.u16        \tmp5, d21, d24
        vadd.u16        \tmp7, d25, d27
        vrshr.u16       d4,  d0,  #3   @ out p0
        vadd.u16        d0,  d0,  \tmp3
        vsub.s16        \tmp7, \tmp7, \tmp5
        vadd.u16        \tmp1, d22, d25
        vadd.u16        \tmp3, d26, d27
        vrshr.u16       d5,  d0,  #3   @ out q0
        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3, \tmp3, \tmp1
        vrshr.u16       \tmp5, d0, #3  @ out q1
        vadd.u16        d0,  d0,  \tmp3
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vrshr.u16       \tmp6, d0, #3  @ out q2
        @ If no pixels needed flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is
        @ the input to this section).
        vshl.u16        d0,  d16, #3   @ 8 * p7 (d16)
        vsub.u16        d0,  d0,  d16  @ 7 * p7 (d16)
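        @ The wd=16 smoother keeps a 15 tap sliding sum in d0, seeded with
        @ 7 * p7 above, and emits (sum + 8) >> 4 per output. A hedged C sketch
        @ of the first, second and last outputs:
        @   p6' = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
        @   p5' = (6 * p7 + p6 + 2 * p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
        @   ...
        @   q6' = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + 2 * q6 + 7 * q7 + 8) >> 4;
        @ Each step adds one new pair of taps and subtracts one old pair, which
        @ is what the interleaved vadd/vsub pairs below compute.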
        vadd.u16        d8,  d17, d18
        vadd.u16        d10, d19, d20
        vadd.u16        d8,  d16, d17
        vadd.u16        d12, d21, d22
        vadd.u16        d10, d18, d25
        vadd.u16        d14, d23, d24
        vsub.s16        d10, d10, d8
        vadd.u16        d12, d16, d18
        vadd.u16        d14, d19, d26
        vadd.u16        d8,  d16, d19
        vadd.u16        d10, d20, d27
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d16, d20
        vadd.u16        d14, d21, d28
        vsub.s16        d10, d10, d8
        vadd.u16        d8,  d16, d21
        vadd.u16        d10, d22, d29
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d16, d22
        vadd.u16        d14, d23, d30
        vsub.s16        d10, d10, d8
        vadd.u16        d10, d16, d23
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d24, d31
        vsub.s16        d10, d12, d10
        vadd.u16        d12, d17, d24
        vadd.u16        d14, d25, d31
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d26, d31
        vrshr.u16       d10, d0,  #4
        vadd.u16        d14, d18, d25
        vadd.u16        d18, d19, d26
        vsub.s16        d12, d12, d14
        vadd.u16        d14, d27, d31
        vrshr.u16       d11, d0,  #4
        vadd.u16        d12, d20, d27
        vsub.s16        d14, d14, d18
        vadd.u16        d18, d28, d31
        vsub.s16        d18, d18, d12
        vrshr.u16       d12, d0,  #4
        vadd.u16        d14, d21, d28
        vadd.u16        d20, d29, d31
        vrshr.u16       d13, d0,  #4
        vsub.s16        d20, d20, d14
        vadd.u16        d18, d22, d29
        vadd.u16        d22, d30, d31
        vrshr.u16       d14, d0,  #4
        vsub.s16        d22, d22, d18
        vrshr.u16       d15, d0,  #4
        vrshr.u16       d17, d0,  #4
.macro loop_filter_q_4
.macro loop_filter_q_8
.macro loop_filter_16
        loop_filter     16,  d8,  d9,  d10, d11, d12, d13, d14, d15
@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
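@ The bpp frontends scale the 8 bpp thresholds up to the target bit depth
@ before calling the shared cores; a hedged C view of the setup below
@ (E/I/H naming follows the VP9 spec; hev_thr arrives in r4 via the stack,
@ outside this excerpt):
@   E = mb_lim  << (bpp - 8);     /* r2 */
@   I = lim     << (bpp - 8);     /* r3 */
@   H = hev_thr << (bpp - 8);     /* r4 */
@   F = 1 << (bpp - 8);           /* r5, flat threshold */
@   max_pixel = (1 << bpp) - 1;   /* r7 */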
.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        movw            r7,  #((1 << \bpp) - 1)
.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1,  lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1,  lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1,  lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        bl              \func\()_\int_suffix\()_16_neon
        bl              \func\()_\int_suffix\()_16_neon
.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
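@ The _4 cores below filter 4 pixels (one d register) per call, so the rep
@ frontends cover an 8 or 16 pixel edge by calling the core \rep times and
@ stepping r0 between the calls (the exact stepping per direction lives in
@ the macro above).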
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
        add             r0,  r0,  r1,  lsl #3
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
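@ The mix2 frontends handle a 16 pixel edge whose two 8 pixel halves use
@ different filter widths: they run the \wd1 core on the first half, step
@ r0 to the second half, rescale the thresholds (apparently reloaded in
@ between, outside this excerpt) and run the \wd2 core there.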
function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1,  lsl #2
        vld1.16         {q8},  [r12, :128], r1 @ p3
        vld1.16         {q12}, [r0,  :128], r1 @ q0
        vld1.16         {q9},  [r12, :128], r1 @ p2
        vld1.16         {q13}, [r0,  :128], r1 @ q1
        vld1.16         {q10}, [r12, :128], r1 @ p1
        vld1.16         {q14}, [r0,  :128], r1 @ q2
        vld1.16         {q11}, [r12, :128], r1 @ p0
        vld1.16         {q15}, [r0,  :128], r1 @ q3
        sub             r0,  r0,  r1,  lsl #2
        sub             r12, r12, r1,  lsl #1
        vst1.16         {q10}, [r12, :128], r1
        vst1.16         {q12}, [r0,  :128], r1
        vst1.16         {q11}, [r12, :128], r1
        vst1.16         {q13}, [r0,  :128], r1
        sub             r0,  r0,  r1,  lsl #1

bpp_frontends   vp9_loop_filter_v_4_8
function vp9_loop_filter_h_4_8_16_neon
        add             r0,  r12, r1,  lsl #2
        vld1.16         {q8},  [r12, :64], r1
        vld1.16         {q12}, [r0,  :64], r1
        vld1.16         {q9},  [r12, :64], r1
        vld1.16         {q13}, [r0,  :64], r1
        vld1.16         {q10}, [r12, :64], r1
        vld1.16         {q14}, [r0,  :64], r1
        vld1.16         {q11}, [r12, :64], r1
        vld1.16         {q15}, [r0,  :64], r1
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 block, yielding 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13
        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1,  lsl #2

bpp_frontends   vp9_loop_filter_h_4_8
function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1,  lsl #2
        vld1.16         {q8},  [r12, :128], r1 @ p3
        vld1.16         {q12}, [r0,  :128], r1 @ q0
        vld1.16         {q9},  [r12, :128], r1 @ p2
        vld1.16         {q13}, [r0,  :128], r1 @ q1
        vld1.16         {q10}, [r12, :128], r1 @ p1
        vld1.16         {q14}, [r0,  :128], r1 @ q2
        vld1.16         {q11}, [r12, :128], r1 @ p0
        vld1.16         {q15}, [r0,  :128], r1 @ q3
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        vst1.16         {q9},  [r12, :128], r1
        vst1.16         {q12}, [r0,  :128], r1
        vst1.16         {q10}, [r12, :128], r1
        vst1.16         {q13}, [r0,  :128], r1
        vst1.16         {q11}, [r12, :128], r1
        vst1.16         {q14}, [r0,  :128], r1
        sub             r0,  r0,  r1,  lsl #1
        sub             r12, r0,  r1,  lsl #1
        vst1.16         {q10}, [r12, :128], r1
        vst1.16         {q12}, [r0,  :128], r1
        vst1.16         {q11}, [r12, :128], r1
        vst1.16         {q13}, [r0,  :128], r1
        sub             r0,  r0,  r1,  lsl #1

bpp_frontends   vp9_loop_filter_v_8_8
function vp9_loop_filter_h_8_8_16_neon
        add             r0,  r12, r1,  lsl #2
        vld1.16         {q8},  [r12, :64], r1
        vld1.16         {q12}, [r0,  :64], r1
        vld1.16         {q9},  [r12, :64], r1
        vld1.16         {q13}, [r0,  :64], r1
        vld1.16         {q10}, [r12, :64], r1
        vld1.16         {q14}, [r0,  :64], r1
        vld1.16         {q11}, [r12, :64], r1
        vld1.16         {q15}, [r0,  :64], r1
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vst1.16         {q8},  [r12, :64], r1
        vst1.16         {q12}, [r0,  :64], r1
        vst1.16         {q9},  [r12, :64], r1
        vst1.16         {q13}, [r0,  :64], r1
        vst1.16         {q10}, [r12, :64], r1
        vst1.16         {q14}, [r0,  :64], r1
        vst1.16         {q11}, [r12, :64], r1
        vst1.16         {q15}, [r0,  :64], r1
        sub             r12, r12, r1,  lsl #2
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        transpose16_4x4 q10, q11, q12, q13
        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1,  lsl #2

bpp_frontends   vp9_loop_filter_h_8_8
bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8
function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1,  lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12, :64], r1 @ p7
        vld1.16         {d24}, [r0,  :64], r1 @ q0
        vld1.16         {d17}, [r12, :64], r1 @ p6
        vld1.16         {d25}, [r0,  :64], r1 @ q1
        vld1.16         {d18}, [r12, :64], r1 @ p5
        vld1.16         {d26}, [r0,  :64], r1 @ q2
        vld1.16         {d19}, [r12, :64], r1 @ p4
        vld1.16         {d27}, [r0,  :64], r1 @ q3
        vld1.16         {d20}, [r12, :64], r1 @ p3
        vld1.16         {d28}, [r0,  :64], r1 @ q4
        vld1.16         {d21}, [r12, :64], r1 @ p2
        vld1.16         {d29}, [r0,  :64], r1 @ q5
        vld1.16         {d22}, [r12, :64], r1 @ p1
        vld1.16         {d30}, [r0,  :64], r1 @ q6
        vld1.16         {d23}, [r12, :64], r1 @ p0
        vld1.16         {d31}, [r0,  :64], r1 @ q7
        sub             r12, r12, r1,  lsl #3
        sub             r0,  r0,  r1,  lsl #3
        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride;
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12, :64], r1
        vst1.16         {d10}, [r0,  :64], r1
        vst1.16         {d3},  [r12, :64], r1
        vst1.16         {d11}, [r0,  :64], r1
        vst1.16         {d4},  [r12, :64], r1
        vst1.16         {d12}, [r0,  :64], r1
        vst1.16         {d5},  [r12, :64], r1
        vst1.16         {d13}, [r0,  :64], r1
        vst1.16         {d6},  [r12, :64], r1
        vst1.16         {d14}, [r0,  :64], r1
        vst1.16         {d8},  [r12, :64], r1
        vst1.16         {d15}, [r0,  :64], r1
        vst1.16         {d9},  [r12, :64], r1
        vst1.16         {d17}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #3
        add             r12, r12, r1,  lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers d21-d26; write those back instead.
        vst1.16         {d21}, [r12, :64], r1
        vst1.16         {d24}, [r0,  :64], r1
        vst1.16         {d22}, [r12, :64], r1
        vst1.16         {d25}, [r0,  :64], r1
        vst1.16         {d23}, [r12, :64], r1
        vst1.16         {d26}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #1
        sub             r12, r0,  r1,  lsl #1
        vst1.16         {d22}, [r12, :64], r1
        vst1.16         {d24}, [r0,  :64], r1
        vst1.16         {d23}, [r12, :64], r1
        vst1.16         {d25}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #1
bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v
function vp9_loop_filter_h_16_4_16_neon
        vld1.16         {d16}, [r12, :64], r1
        vld1.16         {d20}, [r0,  :64], r1
        vld1.16         {d17}, [r12, :64], r1
        vld1.16         {d21}, [r0,  :64], r1
        vld1.16         {d18}, [r12, :64], r1
        vld1.16         {d22}, [r0,  :64], r1
        vld1.16         {d19}, [r12, :64], r1
        vld1.16         {d23}, [r0,  :64], r1
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        vld1.16         {d24}, [r12, :64], r1
        vld1.16         {d28}, [r0,  :64], r1
        vld1.16         {d25}, [r12, :64], r1
        vld1.16         {d29}, [r0,  :64], r1
        vld1.16         {d26}, [r12, :64], r1
        vld1.16         {d30}, [r0,  :64], r1
        vld1.16         {d27}, [r12, :64], r1
        vld1.16         {d31}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #2
        sub             r12, r12, r1,  lsl #2
        @ The 16x4 pixels read above are in four 4x4 blocks
        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        transpose16_q_4x4 q14, q15, d28, d29, d30, d31
        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ not all d registers in the transpose are consecutive.
        transpose16_4x4 d16, d2,  d3,  d4
        transpose16_4x4 d5,  d6,  d8,  d9
        transpose16_4x4 d10, d11, d12, d13
        transpose16_4x4 d14, d15, d17, d31
        vst1.16         {d16}, [r12, :64], r1
        vst1.16         {d5},  [r0,  :64], r1
        vst1.16         {d2},  [r12, :64], r1
        vst1.16         {d6},  [r0,  :64], r1
        vst1.16         {d3},  [r12, :64], r1
        vst1.16         {d8},  [r0,  :64], r1
        vst1.16         {d4},  [r12, :64], r1
        vst1.16         {d9},  [r0,  :64], r1
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        vst1.16         {d10}, [r12, :64], r1
        vst1.16         {d14}, [r0,  :64], r1
        vst1.16         {d11}, [r12, :64], r1
        vst1.16         {d15}, [r0,  :64], r1
        vst1.16         {d12}, [r12, :64], r1
        vst1.16         {d17}, [r0,  :64], r1
        vst1.16         {d13}, [r12, :64], r1
        vst1.16         {d31}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #2
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        vst1.16         {d20}, [r12, :64], r1
        vst1.16         {d24}, [r0,  :64], r1
        vst1.16         {d21}, [r12, :64], r1
        vst1.16         {d25}, [r0,  :64], r1
        vst1.16         {d22}, [r12, :64], r1
        vst1.16         {d26}, [r0,  :64], r1
        vst1.16         {d23}, [r12, :64], r1
        vst1.16         {d27}, [r0,  :64], r1
        sub             r0,  r0,  r1,  lsl #2
        add             r0,  r12, r1,  lsl #1
        transpose16_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.16         {d22}, [r12], r1
        vst1.16         {d24}, [r0],  r1
        vst1.16         {d23}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        sub             r0,  r0,  r1,  lsl #2

bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h