 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "libavutil/arm/asm.S"
@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
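@ (Each q register aliases a consecutive d-register pair, e.g. q10 == d20/d21;
@ that aliasing is what the rq*/r* argument pairing above refers to.)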
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
@ The input to and output from this macro are in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
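@ Rough, non-normative C for the "fm" filter mask computed below (E and I are
@ the per-edge limits set up before this point):
@   fm = max(abs(p3-p2), abs(p2-p1), abs(p1-p0), abs(q0-q1),
@            abs(q1-q2), abs(q2-q3)) <= I &&
@        abs(p0-q0) * 2 + (abs(p1-q1) >> 1) <= E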
        vabd.u8         q2,  q8,  q9    @ abs(p3 - p2)
        vabd.u8         q3,  q9,  q10   @ abs(p2 - p1)
        vabd.u8         q4,  q10, q11   @ abs(p1 - p0)
        vabd.u8         q5,  q12, q13   @ abs(q0 - q1)
        vabd.u8         q6,  q13, q14   @ abs(q1 - q2)
        vabd.u8         q7,  q14, q15   @ abs(q2 - q3)
        vabd.u8         q5,  q11, q12   @ abs(p0 - q0)
        vqadd.u8        q5,  q5,  q5    @ abs(p0 - q0) * 2
        vabd.u8         q7,  q10, q13   @ abs(p1 - q1)
        vmax.u8         q2,  q2,  q4    @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vcle.u8         q2,  q2,  q1    @ max(abs()) <= I
        vqadd.u8        q5,  q5,  q7    @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)

        @ If no pixels need filtering, just exit as soon as possible

        @ Calculate the normal inner loop filter for 2 or 4 pixels
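        @ Rough C sketch (non-normative) of the inner filter applied below,
        @ using av_clip_int8()/av_clip_uint8() semantics:
        @   f  = av_clip_int8(3 * (q0 - p0) + (hev ? av_clip_int8(p1 - q1) : 0));
        @   f1 = FFMIN(f + 4, 127) >> 3;
        @   f2 = FFMIN(f + 3, 127) >> 3;
        @   p0 = av_clip_uint8(p0 + f2);
        @   q0 = av_clip_uint8(q0 - f1);
        @   if (!hev) {
        @       f  = (f1 + 1) >> 1;
        @       p1 = av_clip_uint8(p1 + f);
        @       q1 = av_clip_uint8(q1 - f);
        @   }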
        vabd.u8         q3,  q10, q11   @ abs(p1 - p0)
        vabd.u8         q4,  q13, q12   @ abs(q1 - q0)
        vsubl.u8        q5,  d20, d26   @ p1 - q1
        vsubl.u8        q6,  d21, d27   @ p1 - q1
        vmax.u8         q3,  q3,  q4    @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5         @ av_clip_int8(p1 - q1)
        vqmovn.s16      d11, q6         @ av_clip_int8(p1 - q1)
        vsubl.u8        q6,  d24, d22   @ q0 - p0
        vsubl.u8        q7,  d25, d23   @ q0 - p0
        vcle.u8         q3,  q3,  q4    @ !hev
        vand            q3,  q3,  q2    @ !hev && fm && !flat8in
        vmul.s16        q6,  q6,  q0    @ 3 * (q0 - p0)
        vmul.s16        q7,  q7,  q0    @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3    @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6,  q6,  d10   @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7,  q7,  d11   @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vqmovn.s16      d13, q7         @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]) = f
        vqadd.s8        q5,  q6,  q5    @ FFMIN(f + 4, 127)
        vqadd.s8        q0,  q6,  q0    @ FFMIN(f + 3, 127)
        vmovl.u8        q6,  d22        @ p0
        vmovl.u8        q7,  d23        @ p0
        vshr.s8         q5,  q5,  #3    @ f1
        vshr.s8         q0,  q0,  #3    @ f2
        vaddw.s8        q6,  q6,  d0    @ p0 + f2
        vaddw.s8        q7,  q7,  d1    @ p0 + f2
        vqmovun.s16     d0,  q6         @ out p0
        vmovl.u8        q6,  d24        @ q0
        vqmovun.s16     d1,  q7         @ out p0
        vmovl.u8        q7,  d25        @ q0
        vsubw.s8        q6,  q6,  d10   @ q0 - f1
        vsubw.s8        q7,  q7,  d11   @ q0 - f1
        vqmovun.s16     d12, q6         @ out q0
        vqmovun.s16     d13, q7         @ out q0
        vrshr.s8        q5,  q5,  #1    @ f = (f1 + 1) >> 1
        vbit            q11, q0,  q2    @ if (fm && !flat8in)
        vmovl.u8        q0,  d20        @ p1
        vmovl.u8        q2,  d21        @ p1
        vmovl.u8        q6,  d26        @ q1
        vmovl.u8        q7,  d27        @ q1
        vaddw.s8        q0,  q0,  d10   @ p1 + f
        vaddw.s8        q2,  q2,  d11   @ p1 + f
        vsubw.s8        q6,  q6,  d10   @ q1 - f
        vsubw.s8        q7,  q7,  d11   @ q1 - f
        vqmovun.s16     d0,  q0         @ out p1
        vqmovun.s16     d1,  q2         @ out p1
        vqmovun.s16     d12, q6         @ out q1
        vqmovun.s16     d13, q7         @ out q1
        vbit            q10, q0,  q3    @ if (!hev && fm && !flat8in)
@ The input to and output from this macro are in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
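        @ The wd argument selects how much of the filter is evaluated:
        @ wd == 4 computes only the inner filter, wd >= 8 also computes the
        @ flat8in mask and filter, and wd == 16 additionally computes flat8out.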
        vabd.u8         d4,  d20, d21   @ abs(p3 - p2)
        vabd.u8         d5,  d21, d22   @ abs(p2 - p1)
        vabd.u8         d6,  d22, d23   @ abs(p1 - p0)
        vabd.u8         d7,  d24, d25   @ abs(q0 - q1)
        vabd.u8         \tmp1, d25, d26 @ abs(q1 - q2)
        vabd.u8         \tmp2, d26, d27 @ abs(q2 - q3)
        vmax.u8         \tmp1, \tmp1, \tmp2
        vabd.u8         d6,  d23, d24   @ abs(p0 - q0)
        vqadd.u8        d6,  d6,  d6    @ abs(p0 - q0) * 2
        vabd.u8         d5,  d22, d25   @ abs(p1 - q1)
        vmax.u8         d4,  d4,  \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vcle.u8         d4,  d4,  d2    @ max(abs()) <= I
        vqadd.u8        d6,  d6,  d5    @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        @ If no pixels need filtering, just exit as soon as possible

        vabd.u8         d6,  d20, d23   @ abs(p3 - p0)
        vabd.u8         d2,  d21, d23   @ abs(p2 - p0)
        vabd.u8         d1,  d22, d23   @ abs(p1 - p0)
        vabd.u8         \tmp1, d25, d24 @ abs(q1 - q0)
        vabd.u8         \tmp2, d26, d24 @ abs(q2 - q0)
        vabd.u8         \tmp3, d27, d24 @ abs(q3 - q0)
        vmax.u8         d1,  d1,  \tmp1
        vmax.u8         \tmp2, \tmp2, \tmp3
        vabd.u8         d7,  d16, d23   @ abs(p7 - p0)
        vabd.u8         d2,  d17, d23   @ abs(p6 - p0)
        vmax.u8         d6,  d6,  \tmp2
        vabd.u8         d1,  d18, d23   @ abs(p5 - p0)
        vcle.u8         d6,  d6,  d0    @ flat8in
        vabd.u8         d8,  d19, d23   @ abs(p4 - p0)
        vand            d6,  d6,  d4    @ flat8in && fm
        vabd.u8         d9,  d28, d24   @ abs(q4 - q0)
        vbic            d4,  d4,  d6    @ fm && !flat8in
        vabd.u8         d10, d29, d24   @ abs(q5 - q0)
        vabd.u8         d11, d30, d24   @ abs(q6 - q0)
        vabd.u8         d12, d31, d24   @ abs(q7 - q0)
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
        @ The rest of the calculation of flat8in is interleaved below
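        @ Non-normative reference for the two flatness masks being built here
        @ (F is the flatness threshold, 1 for 8 bpp):
        @   flat8in  = max(abs(p3-p0), abs(p2-p0), abs(p1-p0),
        @                  abs(q1-q0), abs(q2-q0), abs(q3-q0)) <= F
        @   flat8out = max(abs(p7-p0), abs(p6-p0), abs(p5-p0), abs(p4-p0),
        @                  abs(q4-q0), abs(q5-q0), abs(q6-q0), abs(q7-q0)) <= F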
        @ Calculate the normal inner loop filter for 2 or 4 pixels
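        @ (Same inner filter as in the q-register macro above; see the C sketch
        @ given there.)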
        vabd.u8         d5,  d22, d23   @ abs(p1 - p0)
        vabd.u8         d1,  d25, d24   @ abs(q1 - q0)
        vmax.u8         d6,  d6,  \tmp2
        vsubl.u8        \tmpq1, d22, d25 @ p1 - q1
        vmax.u8         d5,  d5,  d1    @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2, d24, d23 @ q0 - p0
        vcle.u8         d6,  d6,  d0    @ flat8in
        vcle.u8         d5,  d5,  d3    @ !hev
        vand            d6,  d6,  d4    @ flat8in && fm
        vqmovn.s16      \tmp1, \tmpq1   @ av_clip_int8(p1 - q1)
        vcle.u8         d7,  d7,  d0    @ flat8out
        vbic            d4,  d4,  d6    @ fm && !flat8in
        vand            d5,  d5,  d4    @ !hev && fm && !flat8in
        vand            d7,  d7,  d6    @ flat8out && flat8in && fm
        vmul.s16        \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
        vaddw.s8        \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vqmovn.s16      \tmp1, \tmpq2   @ f
        vbic            d6,  d6,  d7    @ fm && flat8in && !flat8out
        vqadd.s8        \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
        vmovl.u8        q0,  d23        @ p0
        vshr.s8         \tmp3, \tmp3, #3 @ f1
        vshr.s8         \tmp4, \tmp4, #3 @ f2
        vmovl.u8        q1,  d24        @ q0
        vaddw.s8        q0,  q0,  \tmp4 @ p0 + f2
        vsubw.s8        q1,  q1,  \tmp3 @ q0 - f1
        vqmovun.s16     d0,  q0         @ out p0
        vqmovun.s16     d1,  q1         @ out q0
        vrshr.s8        \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vbit            d23, d0,  d4    @ if (fm && !flat8in)
        vmovl.u8        q0,  d22        @ p1
        vmovl.u8        q1,  d25        @ q1
        vaddw.s8        q0,  q0,  \tmp3 @ p1 + f
        vsubw.s8        q1,  q1,  \tmp3 @ q1 - f
        vqmovun.s16     d0,  q0         @ out p1
        vqmovun.s16     d2,  q1         @ out q1
        vbit            d22, d0,  d5    @ if (!hev && fm && !flat8in)

        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
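        @ Non-normative C for the flat8in (wd >= 8) filter computed below; the
        @ code keeps a running sum and adds/subtracts taps between outputs:
        @   p2' = (p3 + p3 + p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        @   p1' = (p3 + p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
        @   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
        @   q0' = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
        @   q1' = (p1 + p0 + q0 + 2*q1 + q2 + q3 + q3 + 4) >> 3
        @   q2' = (p0 + q0 + q1 + 2*q2 + q3 + q3 + q3 + 4) >> 3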
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0,  \tmpq1, \tmpq1
        vadd.u16        q0,  q0,  \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2,  q0,  #3    @ out p2
        vadd.u16        q0,  q0,  \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3,  q0,  #3    @ out p1
        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4,  q0,  #3    @ out p0
        vadd.u16        q0,  q0,  \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5,  q0,  #3    @ out q0
        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5, q0, #3   @ out q1
        vadd.u16        q0,  q0,  \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update lanes where
        @ the flat8out output won't be stored anyway.
        vrshrn.u16      \tmp6, q0, #3   @ out q2

        @ If no pixels needed flat8in nor flat8out, jump to a
        @ writeout of the inner 4 pixels
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels

        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the
        @ input to this section).
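        @ Non-normative reference for the flat8out outputs computed below: each
        @ result is a rounded average over the p7..q7 window with the end pixel
        @ repeated, e.g.
        @   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        @ Later outputs reuse the running sum in q0, adding one pair of pixels
        @ and subtracting another instead of recomputing the whole sum.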
        vshll.u8        q0,  d16, #3    @ 8 * d16
        vsubw.u8        q0,  q0,  d16   @ 7 * d16
        vaddl.u8        q4,  d17, d18
        vaddl.u8        q5,  d19, d20
        vaddl.u8        q4,  d16, d17
        vaddl.u8        q6,  d21, d22
        vaddl.u8        q5,  d18, d25
        vaddl.u8        q7,  d23, d24
        vaddl.u8        q6,  d16, d18
        vaddl.u8        q7,  d19, d26
        vrshrn.u16      d2,  q0,  #4
        vaddl.u8        q4,  d16, d19
        vaddl.u8        q5,  d20, d27
        vrshrn.u16      d3,  q0,  #4
        vaddl.u8        q6,  d16, d20
        vaddl.u8        q7,  d21, d28
        vrshrn.u16      d4,  q0,  #4
        vaddl.u8        q4,  d16, d21
        vaddl.u8        q5,  d22, d29
        vrshrn.u16      d5,  q0,  #4
        vaddl.u8        q6,  d16, d22
        vaddl.u8        q7,  d23, d30
        vrshrn.u16      d6,  q0,  #4
        vaddl.u8        q5,  d16, d23
        vaddl.u8        q6,  d24, d31
        vrshrn.u16      d8,  q0,  #4
        vaddl.u8        q6,  d17, d24
        vaddl.u8        q7,  d25, d31
        vrshrn.u16      d9,  q0,  #4
        vaddl.u8        q6,  d26, d31
        vrshrn.u16      d10, q0,  #4
        vaddl.u8        q7,  d18, d25
        vaddl.u8        q9,  d19, d26
        vaddl.u8        q7,  d27, d31
        vrshrn.u16      d11, q0,  #4
        vaddl.u8        q6,  d20, d27
        vaddl.u8        q9,  d28, d31
        vrshrn.u16      d12, q0,  #4
        vaddl.u8        q7,  d21, d28
        vaddl.u8        q10, d29, d31
        vrshrn.u16      d13, q0,  #4
        vsub.s16        q10, q10, q7
        vaddl.u8        q9,  d22, d29
        vaddl.u8        q11, d30, d31
        vrshrn.u16      d14, q0,  #4
        vsub.s16        q11, q11, q9
        vrshrn.u16      d15, q0,  #4
        vrshrn.u16      d17, q0,  #4
@ For wd <= 8, we use d16-d19 and d28-d31 for temp registers, while for
@ wd=16 those are needed as inputs/outputs, so d8-d15 are used as temp
@ registers there instead.
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15

        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8,  q9,  q14, q15

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4,  q5,  q6,  q7

@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
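@ With that prototype, the AAPCS passes dst in r0, stride in r1, mb_lim in r2,
@ lim in r3, and hev_thr on the stack.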
function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0,  r1,  lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0,  r0,  r1,  lsl #2
        sub             r12, r12, r1,  lsl #1
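        @ r12 now points at dst - 2 * stride (the p1 row) and r0 at dst (the q0
        @ row); the four changed rows are stored back from there below.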
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1

function ff_vp9_loop_filter_h_4_8_neon, export=1
        add             r0,  r12, r1,  lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
        @ We will only write back the middle 4 pixels; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (in practice two 4x4 transposes of the two halves
        @ of the 8x4 block, giving 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
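        @ Each 32-bit lane store below writes the four changed pixels
        @ (p1, p0, q0, q1) of one row, at the column r12/r0 now point to.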
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1

function ff_vp9_loop_filter_v_44_16_neon, export=1
        sub             r12, r0,  r1,  lsl #2
        vld1.8          {q8},  [r12,:128], r1 @ p3
        vld1.8          {q12}, [r0, :128], r1 @ q0
        vld1.8          {q9},  [r12,:128], r1 @ p2
        vld1.8          {q13}, [r0, :128], r1 @ q1
        vld1.8          {q10}, [r12,:128], r1 @ p1
        vld1.8          {q14}, [r0, :128], r1 @ q2
        vld1.8          {q11}, [r12,:128], r1 @ p0
        vld1.8          {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1,  lsl #2
        sub             r12, r12, r1,  lsl #1

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1

function ff_vp9_loop_filter_h_44_16_neon, export=1
        add             r0,  r12, r1,  lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0],  r1

        add             r0,  r0,  r1,  lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0],  r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8,  q9,  q10, q11, q12, q13, q14, q15

        sub             r12, r0,  r1,  lsl #4
        add             r0,  r12, r1,  lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        @ We will only write back the middle 4 pixels; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (in practice four 4x4 transposes of the 4x4 blocks of the 16x4
        @ pixels, giving 4x16 pixels).
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0],  r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0],  r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0],  r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0],  r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0],  r1

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0,  r1,  lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1

        sub             r12, r0,  r1,  lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1

function ff_vp9_loop_filter_h_8_8_neon, export=1
        add             r0,  r12, r1,  lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1

        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
function vp9_loop_filter_v_16_neon
        sub             r12, r0,  r1,  lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1,  lsl #3
        sub             r0,  r0,  r1,  lsl #3

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0,  r0,  r1,  lsl #3

        add             r12, r12, r1,  lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers d21-d26.
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0,  r0,  r1,  lsl #1

        sub             r12, r0,  r1,  lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0,  r0,  r1,  lsl #1

function ff_vp9_loop_filter_v_16_8_neon, export=1
        bl              vp9_loop_filter_v_16_neon

function ff_vp9_loop_filter_v_16_16_neon, export=1
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
        bl              vp9_loop_filter_v_16_neon

        bl              vp9_loop_filter_v_16_neon

function vp9_loop_filter_h_16_neon
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1,  lsl #3
        sub             r12, r12, r1,  lsl #3
        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for it, since the d registers
        @ in the transpose aren't all consecutive.
        transpose_8x8   d16, d2,  d3,  d4,  d5,  d6,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1,  lsl #3

        @ The same writeback as in loop_filter_h_8_8
        add             r0,  r12, r1,  lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
        sub             r0,  r0,  r1,  lsl #3

        @ The same writeback as in loop_filter_h_4_8
        add             r0,  r12, r1,  lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        sub             r0,  r0,  r1,  lsl #3

function ff_vp9_loop_filter_h_16_8_neon, export=1
        bl              vp9_loop_filter_h_16_neon

function ff_vp9_loop_filter_h_16_16_neon, export=1
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
        bl              vp9_loop_filter_h_16_neon
        add             r0,  r0,  r1,  lsl #3

        bl              vp9_loop_filter_h_16_neon