/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

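// Transpose the 4x8 block of 16-bit elements held in r0-r3, using t4-t7 as
// scratch; in practice this is two 4x4 transposes of the low and high
// 64-bit halves of the registers.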
25 .macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
26         trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
27         trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
28         trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
29         trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
30
31         trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
32         trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
33         trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
34         trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
35 .endm
36
37 // The input to and output from this macro is in the registers v16-v31,
38 // and v0-v7 are used as scratch registers.
39 // p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
40 // Depending on the width of the loop filter, we either use v16-v19
41 // and v28-v31 as temp registers, or v8-v15.
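// The filter parameters arrive in general purpose registers:
// w2 = E, w3 = I, w4 = H (already scaled to the bit depth), w5 = flat
// threshold, w6 = shift amount (16 - BIT_DEPTH) used for saturating to
// BIT_DEPTH - 1 bits, w7 = max pixel value. x10 holds the return address
// used by the early-exit paths, and x13-x15 hold the alternative 'return'
// targets set up by the loop_filter_8/loop_filter_16 macros below.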
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                   // E
        dup             v2.8h,  w3                   // I
        dup             v3.8h,  w4                   // H

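        // Compute the filter mask fm: the filter is applied where
        // max(abs(p3 - p2), ..., abs(q2 - q3)) <= I and
        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E.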
        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h,  v5.8h,  #1
        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b       // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        br              x10
1:

.if \wd >= 8
        dup             v0.8h,  w5

        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
.if \wd == 16
        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
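        // Roughly, in C terms (clip2p() standing for
        // av_clip_intp2(x, BIT_DEPTH - 1)):
        //   f  = clip2p(3 * (q0 - p0) + (hev ? clip2p(p1 - q1) : 0))
        //   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        //   p0 += f2, q0 -= f1
        //   if (!hev) { p1 += (f1 + 1) >> 1; q1 -= (f1 + 1) >> 1; }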
        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  v1.8h
.endif
        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v9.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  \tmp2\().8h
.endif
        dup             \tmp2\().8h,  w6                        // left shift for saturation
        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
        movi            \tmp5\().8h,  #3
.if \wd == 8
        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
.endif
        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
.endif
        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
.endif
        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_intp2(p1 - q1) = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)]
        movi            v3.8h,  #3
        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
        movi            \tmp5\().8h,  #0
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h,  w7                        // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2

        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
        smin            v0.8h,   v0.8h,   \tmp6\().8h
        smin            v2.8h,   v2.8h,   \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
        bit             v24.16b, v2.16b,  v4.16b

        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b,  v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        br              x13
1:
.endif

        // flat8in
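        // Each output is a weighted average with weights summing to 8; e.g.
        // the first one computed below is
        // p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3. v0 keeps a running
        // sum that is updated between the urshr instructions.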
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3                      // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h,  v23.8h
        add             \tmp3\().8h, v24.8h,  v27.8h
        urshr           v3.8h,  v0.8h,  #3                      // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h,  v24.8h
        add             \tmp7\().8h, v25.8h,  v27.8h
        urshr           v4.8h,  v0.8h,  #3                      // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h,  v25.8h
        add             \tmp3\().8h, v26.8h,  v27.8h
        urshr           v5.8h,  v0.8h,  #3                      // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b,  v6.16b
        bit             v22.16b, v3.16b,  v6.16b
        bit             v23.16b, v4.16b,  v6.16b
        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
        bit             v24.16b, v5.16b,  v6.16b
        bit             v25.16b, \tmp5\().16b,  v6.16b
        bit             v26.16b, \tmp6\().16b,  v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b,  v6.16b,  v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br              x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br              x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
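        // Each output is a weighted average with weights summing to 16; the
        // first one computed below is
        // p6' = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4.
        // As for flat8in, v0 keeps a running sum that is updated between the
        // urshr instructions.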
        shl             v0.8h,   v16.8h,  #3     // 8 * v16
        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
        add             v0.8h,   v0.8h,   v17.8h
        add             v8.8h,   v17.8h,  v18.8h
        add             v10.8h,  v19.8h,  v20.8h
        add             v0.8h,   v0.8h,   v8.8h
        add             v8.8h,   v16.8h,  v17.8h
        add             v12.8h,  v21.8h,  v22.8h
        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v18.8h,  v25.8h
        add             v14.8h,  v23.8h,  v24.8h
        sub             v10.8h,  v10.8h,  v8.8h
        add             v0.8h,   v0.8h,   v12.8h
        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v18.8h
        add             v14.8h,  v19.8h,  v26.8h
        urshr           v2.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v19.8h
        add             v10.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v2.16b,  v17.16b, v7.16b
        urshr           v3.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v20.8h
        add             v14.8h,  v21.8h,  v28.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v3.16b,  v18.16b, v7.16b
        urshr           v4.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v21.8h
        add             v10.8h,  v22.8h,  v29.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v4.16b,  v19.16b, v7.16b
        urshr           v5.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v22.8h
        add             v14.8h,  v23.8h,  v30.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v5.16b,  v20.16b, v7.16b
        urshr           v6.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v16.8h,  v23.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v24.8h,  v31.8h
        bif             v6.16b,  v21.16b, v7.16b
        urshr           v8.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        sub             v10.8h,  v12.8h,  v10.8h
        add             v12.8h,  v17.8h,  v24.8h
        add             v14.8h,  v25.8h,  v31.8h
        bif             v8.16b,  v22.16b, v7.16b
        urshr           v9.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v26.8h,  v31.8h
        bif             v9.16b,  v23.16b, v7.16b
        urshr           v10.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v18.8h,  v25.8h
        add             v18.8h,  v19.8h,  v26.8h
        sub             v12.8h,  v12.8h,  v14.8h
        add             v14.8h,  v27.8h,  v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v12.8h
        add             v12.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v18.8h
        add             v18.8h,  v28.8h,  v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h,  v18.8h,  v12.8h
        urshr           v12.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v21.8h,  v28.8h
        add             v20.8h,  v29.8h,  v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v18.8h
        sub             v20.8h,  v20.8h,  v14.8h
        add             v18.8h,  v22.8h,  v29.8h
        add             v22.8h,  v30.8h,  v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v20.8h
        sub             v22.8h,  v22.8h,  v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h,  v0.8h,   #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8 we use v16-v19 and v28-v31 as temp registers; for wd=16 those
// registers are needed for inputs/outputs, so v8-v15 are used as temp
// registers there instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm
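
// The wd=8 and wd=16 filters can return early through these alternative
// targets instead of falling through to the full writeback:
// x13 (wd=8,  label 6:) - only the inner 4 pixels were changed
// x14 (wd=16, label 7:) - only the inner 4 pixels were changed
// x15 (wd=16, label 8:) - only the inner 6 pixels were changed
// The calling functions define those labels with the corresponding smaller
// writeback code.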


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
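// For the 10/12 bpp versions below, dst actually points to 16-bit pixels,
// and the E, I and H thresholds are passed in the same scale as for 8 bpp;
// the frontends scale them up to the bit depth and also pass the flat
// threshold, the saturation shift and the max pixel value in x5-x7.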

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

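// This frontend handles a 16 pixel long edge by calling the corresponding
// 8 pixel function twice, stepping either 8 rows down (dir=h) or 8 pixels
// (16 bytes) to the right (dir=v) between the two calls.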
.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

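// The mix2 frontends handle two adjacent 8 pixel blocks with (potentially)
// different filter widths and filter levels; the E, I and H values for the
// second block are packed into the second byte of each parameter.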
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8,  w2,  #8
        lsr             w14, w3,  #8
        lsr             w15, w4,  #8
        and             w2,  w2,  #0xff
        and             w3,  w3,  #0xff
        and             w4,  w4,  #0xff
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br              x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1

        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2

        // We will only write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4

        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        br              x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0,  x9,  x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        br              x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        br              x10
8:
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        br              x10
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

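        // The filtered output is now in v16, v2-v6, v8, v9 (p7, p6-p0) for
        // the left half and in v10-v15, v17, v31 (q0-q6, q7) for the right
        // half; transpose it back to rows before storing.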
        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3

        br              x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        br              x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1