]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp8dsp_neon.S
Merge commit '49f9c4272c4029b57ff300d908ba03c6332fc9c4'
[ffmpeg] / libavcodec / aarch64 / vp8dsp_neon.S
1 /*
2  * VP8 NEON optimisations
3  *
4  * Copyright (c) 2010 Rob Clark <rob@ti.com>
5  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6  * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
7  * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
8  *
9  * This file is part of FFmpeg.
10  *
11  * FFmpeg is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * FFmpeg is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with FFmpeg; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25
26 #include "libavutil/aarch64/asm.S"
27 #include "neon.S"
28
// void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16])
// Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients.
//   x0 = block: receives one DC value per 16-short sub-block, so the store
//        stride between consecutive outputs is 32 bytes (mov x3, #32 below)
//   x1 = dc: 4x4 input coefficients; zeroed before returning
29 function ff_vp8_luma_dc_wht_neon, export=1
30         ld1             {v0.4h - v3.4h}, [x1]
31         movi            v30.8h, #0
32
        // First (vertical) butterfly pass; the dc[] buffer is cleared with
        // the two interleaved stores while the arithmetic proceeds.
33         add             v4.4h,  v0.4h,  v3.4h
34         add             v6.4h,  v1.4h,  v2.4h
35         st1             {v30.8h}, [x1], #16
36         sub             v7.4h,  v1.4h,  v2.4h
37         sub             v5.4h,  v0.4h,  v3.4h
38         st1             {v30.8h}, [x1]
39         add             v0.4h,  v4.4h,  v6.4h
40         add             v1.4h,  v5.4h,  v7.4h
41         sub             v2.4h,  v4.4h,  v6.4h
42         sub             v3.4h,  v5.4h,  v7.4h
43
44         movi            v16.4h, #3
45
46         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
47
        // Rounding bias for the final >> 3, added once to row 0: it enters
        // v4 (v0+v3) and v5 (v0-v3) and from there reaches every second-pass
        // output with coefficient +1.
48         add             v0.4h,  v0.4h,  v16.4h
49
        // Second (horizontal) butterfly pass.
50         add             v4.4h,  v0.4h,  v3.4h
51         add             v6.4h,  v1.4h,  v2.4h
52         sub             v7.4h,  v1.4h,  v2.4h
53         sub             v5.4h,  v0.4h,  v3.4h
54         add             v0.4h,  v4.4h,  v6.4h
55         add             v1.4h,  v5.4h,  v7.4h
56         sub             v2.4h,  v4.4h,  v6.4h
57         sub             v3.4h,  v5.4h,  v7.4h
58
59         sshr            v0.4h,  v0.4h,  #3
60         sshr            v1.4h,  v1.4h,  #3
61         sshr            v2.4h,  v2.4h,  #3
62         sshr            v3.4h,  v3.4h,  #3
63
        // Scatter the 16 results, one per sub-block (stride 32 bytes).
        // v0..v3 hold transform columns, so stores interleave the registers.
64         mov             x3,  #32
65         st1             {v0.h}[0],  [x0], x3
66         st1             {v1.h}[0],  [x0], x3
67         st1             {v2.h}[0],  [x0], x3
68         st1             {v3.h}[0],  [x0], x3
69         st1             {v0.h}[1],  [x0], x3
70         st1             {v1.h}[1],  [x0], x3
71         st1             {v2.h}[1],  [x0], x3
72         st1             {v3.h}[1],  [x0], x3
73         st1             {v0.h}[2],  [x0], x3
74         st1             {v1.h}[2],  [x0], x3
75         st1             {v2.h}[2],  [x0], x3
76         st1             {v3.h}[2],  [x0], x3
77         st1             {v0.h}[3],  [x0], x3
78         st1             {v1.h}[3],  [x0], x3
79         st1             {v2.h}[3],  [x0], x3
80         st1             {v3.h}[3],  [x0], x3
81
82         ret
83 endfunc
84
// void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// Full 4x4 inverse DCT, result added to dst; block[] is zeroed on return.
//   x0 = dst, x1 = block, x2 = stride
// Constants (per the VP8 spec): 20091 = (sqrt(2)*cos(pi/8) - 1) * 2^16 and
// 35468 = sqrt(2)*sin(pi/8) * 2^16.  The second is halved here because
// sqdmulh doubles the product before taking the high half.
85 function ff_vp8_idct_add_neon, export=1
86         ld1             {v0.8b - v3.8b},  [x1]
87         mov             w4,  #20091
88         movk            w4,  #35468/2, lsl #16
89         dup             v4.2s, w4
90
        // First transform pass (columns): smull+shrn #16 yields
        // x*20091 >> 16, and adding x back gives x * sqrt(2)*cos(pi/8).
91         smull           v26.4s, v1.4h,  v4.h[0]
92         smull           v27.4s, v3.4h,  v4.h[0]
93         sqdmulh         v20.4h, v1.4h,  v4.h[1]
94         sqdmulh         v23.4h, v3.4h,  v4.h[1]
95         shrn            v21.4h, v26.4s, #16
96         shrn            v22.4h, v27.4s, #16
97         add             v21.4h, v21.4h, v1.4h
98         add             v22.4h, v22.4h, v3.4h
99
100         add             v16.4h,  v0.4h,   v2.4h
101         sub             v17.4h,  v0.4h,   v2.4h
102
103         add             v18.4h,  v21.4h,  v23.4h
104         sub             v19.4h,  v20.4h,  v22.4h
105
106         add             v0.4h,   v16.4h,  v18.4h
107         add             v1.4h,   v17.4h,  v19.4h
108         sub             v3.4h,   v16.4h,  v18.4h
109         sub             v2.4h,   v17.4h,  v19.4h
110
111         transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7
112
        // Second pass (rows), interleaved with zeroing block[] and loading
        // the destination rows.  The second halves of the butterflies are
        // paired into 128-bit registers (zip1) to halve the instruction count.
113         movi            v29.8h, #0
114         smull           v26.4s,     v1.4h,  v4.h[0]
115         st1             {v29.8h},   [x1],   #16
116         smull           v27.4s,     v3.4h,  v4.h[0]
117         st1             {v29.16b},  [x1]
118         sqdmulh         v21.4h,     v1.4h,  v4.h[1]
119         sqdmulh         v23.4h,     v3.4h,  v4.h[1]
120         shrn            v20.4h,     v26.4s, #16
121         shrn            v22.4h,     v27.4s, #16
122         add             v20.4h,     v20.4h, v1.4h
123         add             v22.4h,     v22.4h, v3.4h
124         add             v16.4h,     v0.4h,  v2.4h
125         sub             v17.4h,     v0.4h,  v2.4h
126
127         add             v18.4h,     v20.4h, v23.4h
128         ld1             {v24.d}[0], [x0],   x2
129         zip1            v16.2d,     v16.2d, v17.2d
130         sub             v19.4h,     v21.4h, v22.4h
131         ld1             {v25.d}[0], [x0],   x2
132         zip1            v18.2d,     v18.2d, v19.2d
133         add             v0.8h,      v16.8h, v18.8h
134         ld1             {v25.d}[1], [x0],   x2
135         sub             v1.8h,      v16.8h, v18.8h
136         ld1             {v24.d}[1], [x0],   x2
137         srshr           v0.8h,      v0.8h,  #3
138         trn1            v24.4s,     v24.4s, v25.4s
139         srshr           v1.8h,      v1.8h,  #3
140         sub             x0,  x0,  x2,  lsl #2
141
        // Transpose the result back to row order to match the dst layout.
142         ext             v1.16b, v1.16b, v1.16b, #8
143         trn1            v3.2d,  v0.2d,  v1.2d
144         trn2            v0.2d,  v0.2d,  v1.2d
145         trn1            v1.8h,  v3.8h,  v0.8h
146         trn2            v3.8h,  v3.8h,  v0.8h
147         uzp1            v0.4s,  v1.4s,  v3.4s
148         uzp2            v1.4s,  v3.4s,  v1.4s
149
        // Add residual to the loaded pixels, saturate to u8, store back.
        // The [0],[1],[3],[2] store order matches how the rows were packed
        // into v24/v25 by the interleaved loads above.
150         uaddw           v0.8h,  v0.8h, v24.8b
151         uaddw2          v1.8h,  v1.8h, v24.16b
152         sqxtun          v0.8b,  v0.8h
153         sqxtun2         v0.16b, v1.8h
154         st1             {v0.s}[0],  [x0], x2
155         st1             {v0.s}[1],  [x0], x2
156         st1             {v0.s}[3],  [x0], x2
157         st1             {v0.s}[2],  [x0], x2
158
159         ret
160 endfunc
161
// void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only IDCT+add for four 4x4 chroma blocks arranged 2x2 in an 8x8 area:
// v16 holds (dc0 x4 | dc1 x4) for rows 0-3, v18 holds (dc2 x4 | dc3 x4)
// for rows 4-7.  Each block's DC coefficient (at 32-byte intervals in
// block[]) is cleared as it is read.
//   x0 = dst, x1 = block, x2 = stride
162 function ff_vp8_idct_dc_add4uv_neon, export=1
163         movi            v0.4h,  #0
164         mov             x3,     #32
165         ld1r            {v16.4h},  [x1]
166         st1             {v0.h}[0], [x1], x3
167         ld1r            {v17.4h},  [x1]
168         st1             {v0.h}[0], [x1], x3
169         ld1r            {v18.4h},  [x1]
170         st1             {v0.h}[0], [x1], x3
171         ld1r            {v19.4h},  [x1]
172         st1             {v0.h}[0], [x1], x3
173         ins             v16.d[1],  v17.d[0]
174         ins             v18.d[1],  v19.d[0]
175         mov             x3,  x0
176         srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
        // Loads (via x0) run ahead of the add/narrow/store chain (via x3)
        // so memory latency overlaps with the arithmetic.
177         ld1             {v0.8b},   [x0], x2
178         srshr           v18.8h,    v18.8h,  #3
179         ld1             {v1.8b},   [x0], x2
180         uaddw           v20.8h,    v16.8h, v0.8b
181         ld1             {v2.8b},   [x0], x2
182         uaddw           v0.8h,     v16.8h, v1.8b
183         ld1             {v3.8b},   [x0], x2
184         uaddw           v22.8h,    v16.8h, v2.8b
185         ld1             {v4.8b},   [x0], x2
186         uaddw           v2.8h,     v16.8h, v3.8b
187         ld1             {v5.8b},   [x0], x2
188         uaddw           v24.8h,    v18.8h, v4.8b
189         ld1             {v6.8b},   [x0], x2
190         uaddw           v4.8h,     v18.8h, v5.8b
191         ld1             {v7.8b},   [x0], x2
192         uaddw           v26.8h,    v18.8h, v6.8b
193         sqxtun          v20.8b,    v20.8h
194         uaddw           v6.8h,     v18.8h, v7.8b
195         sqxtun          v21.8b,    v0.8h
196         sqxtun          v22.8b,    v22.8h
197         st1             {v20.8b},  [x3], x2
198         sqxtun          v23.8b,    v2.8h
199         st1             {v21.8b},  [x3], x2
200         sqxtun          v24.8b,    v24.8h
201         st1             {v22.8b},  [x3], x2
202         sqxtun          v25.8b,    v4.8h
203         st1             {v23.8b},  [x3], x2
204         sqxtun          v26.8b,    v26.8h
205         st1             {v24.8b},  [x3], x2
206         sqxtun          v27.8b,    v6.8h
207         st1             {v25.8b},  [x3], x2
208         st1             {v26.8b},  [x3], x2
209         st1             {v27.8b},  [x3], x2
210
211         ret
212 endfunc
213
// void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only IDCT+add for four 4x4 luma blocks side by side in a 16-wide row:
// v16 = (dc0 x4 | dc1 x4) covers the left 8 pixels, v18 = (dc2 x4 | dc3 x4)
// the right 8 (applied via uaddw / uaddw2).  DC coefficients (32 bytes
// apart in block[]) are cleared as they are read.
//   x0 = dst, x1 = block, x2 = stride
214 function ff_vp8_idct_dc_add4y_neon, export=1
215         movi            v0.16b,  #0
216         mov             x3,  #32
217         ld1r            {v16.4h},    [x1]
218         st1             {v0.h}[0],   [x1], x3
219         ld1r            {v17.4h},    [x1]
220         st1             {v0.h}[0],   [x1], x3
221         zip1            v16.2d,      v16.2d, v17.2d
222         ld1r            {v18.4h},    [x1]
223         st1             {v0.h}[0],   [x1], x3
224         ld1r            {v19.4h},    [x1]
225         st1             {v0.h}[0],   [x1], x3
226         zip1            v18.2d,      v18.2d, v19.2d
227         srshr           v16.8h,      v16.8h,  #3            // dc >>= 3
228         ld1             {v0.16b},     [x0], x2
229         srshr           v18.8h,       v18.8h,  #3
230         ld1             {v1.16b},     [x0], x2
231         uaddw           v20.8h,       v16.8h,  v0.8b
232         ld1             {v2.16b},     [x0], x2
233         uaddw2          v0.8h,        v18.8h,   v0.16b
234         ld1             {v3.16b},     [x0], x2
235         uaddw           v21.8h, v16.8h,  v1.8b
236         uaddw2          v1.8h,  v18.8h,  v1.16b
237         uaddw           v22.8h, v16.8h,  v2.8b
238         uaddw2          v2.8h,  v18.8h,  v2.16b
239         uaddw           v23.8h, v16.8h,  v3.8b
240         uaddw2          v3.8h,  v18.8h,  v3.16b
        // Rewind dst by the four rows just read, then saturate and store.
241         sub             x0,  x0,  x2,  lsl #2
242         sqxtun          v20.8b,  v20.8h
243         sqxtun2         v20.16b, v0.8h
244         sqxtun          v21.8b,  v21.8h
245         sqxtun2         v21.16b, v1.8h
246         sqxtun          v22.8b,  v22.8h
247         st1             {v20.16b},    [x0], x2
248         sqxtun2         v22.16b, v2.8h
249         st1             {v21.16b},    [x0], x2
250         sqxtun          v23.8b,  v23.8h
251         st1             {v22.16b},    [x0], x2
252         sqxtun2         v23.16b, v3.8h
253         st1             {v23.16b},    [x0], x2
254
255         ret
256 endfunc
257
// void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// DC-only IDCT+add for one 4x4 block: broadcast the single DC coefficient,
// round-shift by 3, add to the four 4-pixel rows, saturate and store.
// The DC coefficient in block[] is cleared (strh).
//   x0 = dst, x1 = block, x2 = stride
258 function ff_vp8_idct_dc_add_neon, export=1
259         mov             w3,       #0
260         ld1r            {v2.8h},  [x1]
261         strh            w3,       [x1]
262         srshr           v2.8h,  v2.8h,  #3
        // Two rows per 64-bit lane pair: v0 = rows 0-1, v1 = rows 2-3.
263         ld1             {v0.s}[0],  [x0], x2
264         ld1             {v0.s}[1],  [x0], x2
265         uaddw           v3.8h,  v2.8h,  v0.8b
266         ld1             {v1.s}[0],  [x0], x2
267         ld1             {v1.s}[1],  [x0], x2
268         uaddw           v4.8h,  v2.8h,  v1.8b
269         sqxtun          v0.8b,  v3.8h
270         sqxtun          v1.8b,  v4.8h
271         sub             x0,  x0,  x2, lsl #2
272         st1             {v0.s}[0],  [x0], x2
273         st1             {v0.s}[1],  [x0], x2
274         st1             {v1.s}[0],  [x0], x2
275         st1             {v1.s}[1],  [x0], x2
276         ret
277 endfunc
278
279 // Register layout:
280 //   P3..Q3 -> v0..v7
281 //   flim_E -> v22
282 //   flim_I -> v23
283 //   hev_thresh -> x5
284 //
// Core VP8 loop filter, processing 16 pixel positions at once.
// Inputs (see the register-layout comment above): P3..Q3 in v0..v7,
// flim_E in v22, flim_I in v23 (unused when \simple), hev threshold in
// the GPR named by \hev_thresh (unused when \simple).
// On exit: \simple updates P0/Q0 (v3/v4); \inner updates P1..Q1 (v2..v5);
// the full edge filter updates P2..Q2 (v1..v6).  v0/v7 are preserved.
285 .macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
286     .if \simple
287         uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
288         uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
289         uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
290         ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
291         uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
292         movi            v21.16b, #0x80
293         cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
294     .else
295         // calculate hev and normal_limit:
296         uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
297         uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
298         uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
299         uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
300         cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
301         cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
302         cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
303         cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
304         and             v16.16b, v17.16b, v16.16b
305         uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
306         and             v16.16b, v16.16b, v19.16b
307         uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
308         and             v16.16b, v16.16b, v18.16b
309         cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
310         cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
311         uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
312         uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
313         and             v16.16b, v16.16b, v18.16b
314         uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
315         and             v16.16b, v16.16b, v19.16b
316         ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
317         dup             v23.16b, \hev_thresh          // hev_thresh
318         uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
319         cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
320         cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
321         cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
322         and             v16.16b, v16.16b, v19.16b
323         movi            v21.16b, #0x80
324         orr             v17.16b, v20.16b, v22.16b
325     .endif
326
327         // at this point:
328         //   v16: normal_limit
329         //   v17: hev
330
331         // convert to signed value:
332         eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
333         eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
334
335         movi           v20.8h, #3
336         ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
337         ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
338         eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
339         eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
340         mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
341         mul            v19.8h, v19.8h, v20.8h
342
343         sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
344         movi           v22.16b, #4
345         movi           v23.16b, #3
346     .if \inner
347         and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
348     .endif
349         saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
350         saddw2         v19.8h,  v19.8h, v20.16b
351         sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
352         sqxtn2         v18.16b, v19.8h
353     .if !\inner && !\simple
354         eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
355         eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
356     .endif
357         and            v18.16b, v18.16b, v16.16b         // w &= normal_limit
358
359         // registers used at this point..
360         //   v0 -> P3  (don't corrupt)
361         //   v1-v6 -> PS2-QS2
362         //   v7 -> Q3  (don't corrupt)
363         //   v17 -> hev
364         //   v18 -> w
365         //   v21 -> #0x80
366         //   v22 -> #4
367         //   v23 -> #3
368         //   v16, v19, v29 -> unused
369         //
370         // filter_common:   is4tap==1
371         //   c1 = clamp(w + 4) >> 3;
372         //   c2 = clamp(w + 3) >> 3;
373         //   Q0 = s2u(QS0 - c1);
374         //   P0 = s2u(PS0 + c2);
375
376     .if \simple
377         sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
378         sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
379         sshr           v19.16b, v19.16b, #3                // c1 >>= 3
380         sshr           v20.16b, v20.16b, #3                // c2 >>= 3
381         sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
382         sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
383         eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
384         eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
385         eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
386         eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
387     .elseif \inner
388         // the !is4tap case of filter_common, only used for inner blocks
389         //   c3 = ((c1&~hev) + 1) >> 1;
390         //   Q1 = s2u(QS1 - c3);
391         //   P1 = s2u(PS1 + c3);
392         sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
393         sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
394         sshr           v19.16b, v19.16b, #3                // c1 >>= 3
395         sshr           v20.16b, v20.16b, #3                // c2 >>= 3
396         sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
397         sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
398         bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
399         eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
400         srshr          v19.16b, v19.16b, #1                // c3 >>= 1
401         eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
402         sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
403         sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
404         eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
405         eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
406     .else
407         and            v20.16b, v18.16b, v17.16b           // w & hev
408         sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
409         sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
410         sshr           v19.16b, v19.16b, #3                // c1 >>= 3
411         sshr           v20.16b, v20.16b, #3                // c2 >>= 3
412         bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
413         sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
414         sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
415
416         // filter_mbedge:
417         //   a = clamp((27*w + 63) >> 7);
418         //   Q0 = s2u(QS0 - a);
419         //   P0 = s2u(PS0 + a);
420         //   a = clamp((18*w + 63) >> 7);
421         //   Q1 = s2u(QS1 - a);
422         //   P1 = s2u(PS1 + a);
423         //   a = clamp((9*w + 63) >> 7);
424         //   Q2 = s2u(QS2 - a);
425         //   P2 = s2u(PS2 + a);
        // 9*w is built as w*8 + w (sshll #3 + saddw); 18*w and 27*w are
        // then accumulated by repeatedly adding w*9 and w*... via v22/v23.
426         movi           v17.8h,  #63
427         sshll          v22.8h,  v18.8b, #3
428         sshll2         v23.8h,  v18.16b, #3
429         saddw          v22.8h,  v22.8h, v18.8b
430         saddw2         v23.8h,  v23.8h, v18.16b
431         add            v16.8h,  v17.8h, v22.8h
432         add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
433         add            v19.8h,  v16.8h, v22.8h
434         add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
435         add            v22.8h,  v19.8h, v22.8h
436         add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
437         sqshrn         v16.8b,  v16.8h,  #7
438         sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
439         sqshrn         v19.8b,  v19.8h, #7
440         sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
441         sqshrn         v22.8b,  v22.8h, #7
442         sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
443         sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
444         sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
445         sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
446         sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
447         sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
448         sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
449         eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
450         eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
451         eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
452         eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
453         eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
454         eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
455     .endif
456 .endm
457
// Generates ff_vp8_v_loop_filter16\name\()_neon: filter a horizontal edge
// (vertical filtering) across 16 pixels.
//   x0 = dst (points at the first row below the edge, i.e. Q0)
//   x1 = stride, w2 = flim_E, w3 = flim_I, w4 = hev_thresh
//   (the simple variant uses only w2)
458 .macro  vp8_v_loop_filter16 name, inner=0, simple=0
459 function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // Back up to the first input row: 4 rows (P3), or 2 (P1) if simple.
460         sub             x0,  x0,  x1,  lsl #1+!\simple
461
462         // Load pixels:
463     .if !\simple
464         ld1             {v0.16b},     [x0], x1 // P3
465         ld1             {v1.16b},     [x0], x1 // P2
466     .endif
467         ld1             {v2.16b},     [x0], x1 // P1
468         ld1             {v3.16b},     [x0], x1 // P0
469         ld1             {v4.16b},     [x0], x1 // Q0
470         ld1             {v5.16b},     [x0], x1 // Q1
471     .if !\simple
472         ld1             {v6.16b},     [x0], x1 // Q2
473         ld1             {v7.16b},     [x0]     // Q3
474         dup             v23.16b, w3                 // flim_I
475     .endif
476         dup             v22.16b, w2                 // flim_E
477
478         vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
479
480         // back up to P2:  dst -= stride * 6
481         sub             x0,  x0,  x1,  lsl #2
482     .if !\simple
483         sub             x0,  x0,  x1,  lsl #1
484
485         // Store pixels:
486         st1             {v1.16b},     [x0], x1 // P2
487     .endif
488         st1             {v2.16b},     [x0], x1 // P1
489         st1             {v3.16b},     [x0], x1 // P0
490         st1             {v4.16b},     [x0], x1 // Q0
491         st1             {v5.16b},     [x0], x1 // Q1
492     .if !\simple
493         st1             {v6.16b},     [x0]     // Q2
494     .endif
495
496         ret
497 endfunc
498 .endm
499
// Instantiate the normal, inner-edge, and simple variants.
500 vp8_v_loop_filter16
501 vp8_v_loop_filter16 _inner,  inner=1
502 vp8_v_loop_filter16 _simple, simple=1
503
// Generates ff_vp8_v_loop_filter8uv\name\()_neon: filter a horizontal edge
// on both chroma planes at once, packing the 8 U pixels into the low half
// of each vector and the 8 V pixels into the high half.
//   x0 = u, x1 = v (both point at Q0), x2 = stride,
//   w3 = flim_E, w4 = flim_I, w5 = hev_thresh
504 .macro  vp8_v_loop_filter8uv name, inner=0
505 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        // Back up 4 rows to P3 on both planes.
506         sub             x0,  x0,  x2,  lsl #2
507         sub             x1,  x1,  x2,  lsl #2
508         // Load pixels:
509         ld1          {v0.d}[0],     [x0], x2  // P3
510         ld1          {v0.d}[1],     [x1], x2  // P3
511         ld1          {v1.d}[0],     [x0], x2  // P2
512         ld1          {v1.d}[1],     [x1], x2  // P2
513         ld1          {v2.d}[0],     [x0], x2  // P1
514         ld1          {v2.d}[1],     [x1], x2  // P1
515         ld1          {v3.d}[0],     [x0], x2  // P0
516         ld1          {v3.d}[1],     [x1], x2  // P0
517         ld1          {v4.d}[0],     [x0], x2  // Q0
518         ld1          {v4.d}[1],     [x1], x2  // Q0
519         ld1          {v5.d}[0],     [x0], x2  // Q1
520         ld1          {v5.d}[1],     [x1], x2  // Q1
521         ld1          {v6.d}[0],     [x0], x2  // Q2
522         ld1          {v6.d}[1],     [x1], x2  // Q2
523         ld1          {v7.d}[0],     [x0]      // Q3
524         ld1          {v7.d}[1],     [x1]      // Q3
525
526         dup          v22.16b, w3                 // flim_E
527         dup          v23.16b, w4                 // flim_I
528
529         vp8_loop_filter inner=\inner, hev_thresh=w5
530
531         // back up to P2:  u,v -= stride * 6
532         sub          x0,  x0,  x2,  lsl #2
533         sub          x1,  x1,  x2,  lsl #2
534         sub          x0,  x0,  x2,  lsl #1
535         sub          x1,  x1,  x2,  lsl #1
536
537         // Store pixels:
538
539         st1          {v1.d}[0],     [x0], x2  // P2
540         st1          {v1.d}[1],     [x1], x2  // P2
541         st1          {v2.d}[0],     [x0], x2  // P1
542         st1          {v2.d}[1],     [x1], x2  // P1
543         st1          {v3.d}[0],     [x0], x2  // P0
544         st1          {v3.d}[1],     [x1], x2  // P0
545         st1          {v4.d}[0],     [x0], x2  // Q0
546         st1          {v4.d}[1],     [x1], x2  // Q0
547         st1          {v5.d}[0],     [x0], x2  // Q1
548         st1          {v5.d}[1],     [x1], x2  // Q1
549         st1          {v6.d}[0],     [x0]      // Q2
550         st1          {v6.d}[1],     [x1]      // Q2
551
552         ret
553 endfunc
554 .endm
555
// Instantiate the normal and inner-edge chroma variants.
556 vp8_v_loop_filter8uv
557 vp8_v_loop_filter8uv _inner, inner=1
558
// Generates ff_vp8_h_loop_filter16\name\()_neon: filter a vertical edge
// (horizontal filtering) across 16 rows.  Sixteen 8-byte rows are loaded,
// transposed so that each of v0..v7 holds one pixel column (P3..Q3),
// filtered, transposed back and stored.
//   x0 = dst (points at the column right of the edge, i.e. Q0)
//   x1 = stride, w2 = flim_E, w3 = flim_I, w4 = hev_thresh
559 .macro  vp8_h_loop_filter16 name, inner=0, simple=0
560 function ff_vp8_h_loop_filter16\name\()_neon, export=1
561
        // Back up 4 pixels so each 8-byte load covers P3..Q3.
562         sub             x0,  x0,  #4
563         // Load pixels:
564         ld1             {v0.d}[0], [x0], x1
565         ld1             {v1.d}[0], [x0], x1
566         ld1             {v2.d}[0], [x0], x1
567         ld1             {v3.d}[0], [x0], x1
568         ld1             {v4.d}[0], [x0], x1
569         ld1             {v5.d}[0], [x0], x1
570         ld1             {v6.d}[0], [x0], x1
571         ld1             {v7.d}[0], [x0], x1
572         ld1             {v0.d}[1], [x0], x1
573         ld1             {v1.d}[1], [x0], x1
574         ld1             {v2.d}[1], [x0], x1
575         ld1             {v3.d}[1], [x0], x1
576         ld1             {v4.d}[1], [x0], x1
577         ld1             {v5.d}[1], [x0], x1
578         ld1             {v6.d}[1], [x0], x1
579         ld1             {v7.d}[1], [x0], x1
580
581         transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
582
583         dup             v22.16b, w2                 // flim_E
584     .if !\simple
585         dup             v23.16b, w3                 // flim_I
586     .endif
587
588         vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
589
590         sub             x0,  x0,  x1, lsl #4    // backup 16 rows
591
592         transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
593
594         // Store pixels:
595         st1             {v0.d}[0], [x0], x1
596         st1             {v1.d}[0], [x0], x1
597         st1             {v2.d}[0], [x0], x1
598         st1             {v3.d}[0], [x0], x1
599         st1             {v4.d}[0], [x0], x1
600         st1             {v5.d}[0], [x0], x1
601         st1             {v6.d}[0], [x0], x1
602         st1             {v7.d}[0], [x0], x1
603         st1             {v0.d}[1], [x0], x1
604         st1             {v1.d}[1], [x0], x1
605         st1             {v2.d}[1], [x0], x1
606         st1             {v3.d}[1], [x0], x1
607         st1             {v4.d}[1], [x0], x1
608         st1             {v5.d}[1], [x0], x1
609         st1             {v6.d}[1], [x0], x1
610         st1             {v7.d}[1], [x0]
611
612         ret
613 endfunc
614 .endm
615
// Instantiate the normal, inner-edge, and simple variants.
616 vp8_h_loop_filter16
617 vp8_h_loop_filter16 _inner,  inner=1
618 vp8_h_loop_filter16 _simple, simple=1
619
// Generates ff_vp8_h_loop_filter8uv\name\()_neon: filter a vertical edge
// on both chroma planes at once.  8 rows of U go into the low halves of
// v0..v7, 8 rows of V into the high halves; after the transpose each
// register holds one pixel column (P3..Q3) across both planes.
//   x0 = u, x1 = v (both point at the column right of the edge), x2 = stride,
//   w3 = flim_E, w4 = flim_I, w5 = hev_thresh
620 .macro  vp8_h_loop_filter8uv name, inner=0
621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        // Back up 4 pixels so each 8-byte load covers P3..Q3.
622         sub             x0,  x0,  #4
623         sub             x1,  x1,  #4
624
625         // Load pixels:
626         ld1          {v0.d}[0],     [x0], x2 // load u
627         ld1          {v0.d}[1],     [x1], x2 // load v
628         ld1          {v1.d}[0],     [x0], x2
629         ld1          {v1.d}[1],     [x1], x2
630         ld1          {v2.d}[0],     [x0], x2
631         ld1          {v2.d}[1],     [x1], x2
632         ld1          {v3.d}[0],     [x0], x2
633         ld1          {v3.d}[1],     [x1], x2
634         ld1          {v4.d}[0],     [x0], x2
635         ld1          {v4.d}[1],     [x1], x2
636         ld1          {v5.d}[0],     [x0], x2
637         ld1          {v5.d}[1],     [x1], x2
638         ld1          {v6.d}[0],     [x0], x2
639         ld1          {v6.d}[1],     [x1], x2
640         ld1          {v7.d}[0],     [x0], x2
641         ld1          {v7.d}[1],     [x1], x2
642
643         transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
644
645         dup             v22.16b, w3                 // flim_E
646         dup             v23.16b, w4                 // flim_I
647
648         vp8_loop_filter inner=\inner, hev_thresh=w5
649
650         sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
651         sub             x1,  x1,  x2, lsl #3    // backup v 8 rows
652
653         transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
654
655         // Store pixels:
656         st1          {v0.d}[0],     [x0], x2 // load u
657         st1          {v0.d}[1],     [x1], x2 // load v
658         st1          {v1.d}[0],     [x0], x2
659         st1          {v1.d}[1],     [x1], x2
660         st1          {v2.d}[0],     [x0], x2
661         st1          {v2.d}[1],     [x1], x2
662         st1          {v3.d}[0],     [x0], x2
663         st1          {v3.d}[1],     [x1], x2
664         st1          {v4.d}[0],     [x0], x2
665         st1          {v4.d}[1],     [x1], x2
666         st1          {v5.d}[0],     [x0], x2
667         st1          {v5.d}[1],     [x1], x2
668         st1          {v6.d}[0],     [x0], x2
669         st1          {v6.d}[1],     [x1], x2
670         st1          {v7.d}[0],     [x0]
671         st1          {v7.d}[1],     [x1]
672
673         ret
674
675 endfunc
676 .endm
677
// Instantiate the normal and inner-edge chroma variants.
678 vp8_h_loop_filter8uv
679 vp8_h_loop_filter8uv _inner, inner=1
680
681
// void ff_put_vp8_pixels16_neon(uint8_t *dst, ptrdiff_t dststride,
//                               const uint8_t *src, ptrdiff_t srcstride,
//                               int h, ...)
// Plain 16xh copy, four rows per iteration.
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h
// Assumes h is a multiple of 4 (the loop copies rows in groups of 4).
682 function ff_put_vp8_pixels16_neon, export=1
683 1:
684         subs            w4, w4, #4
685         ld1             {v0.16b},     [x2], x3
686         ld1             {v1.16b},     [x2], x3
687         ld1             {v2.16b},     [x2], x3
688         ld1             {v3.16b},     [x2], x3
689         st1             {v0.16b},     [x0], x1
690         st1             {v1.16b},     [x0], x1
691         st1             {v2.16b},     [x0], x1
692         st1             {v3.16b},     [x0], x1
693         b.gt            1b
694         ret
695 endfunc
696
// void ff_put_vp8_pixels8_neon(uint8_t *dst, ptrdiff_t dststride,
//                              const uint8_t *src, ptrdiff_t srcstride,
//                              int h, ...)
// Plain 8xh copy, four rows per iteration (two rows packed per q-register).
//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h
// Assumes h is a multiple of 4.
697 function ff_put_vp8_pixels8_neon, export=1
698 1:
699         subs            w4, w4, #4
700         ld1             {v0.8b},   [x2], x3
701         ld1             {v0.d}[1], [x2], x3
702         ld1             {v1.8b},   [x2], x3
703         ld1             {v1.d}[1], [x2], x3
704         st1             {v0.8b},   [x0], x1
705         st1             {v0.d}[1], [x0], x1
706         st1             {v1.8b},   [x0], x1
707         st1             {v1.d}[1], [x0], x1
708         b.gt            1b
709         ret
710 endfunc
711
/* 4/6-tap 8th-pel MC */

// 6-tap horizontal subpel filter producing 8 output pixels.
//   \d        output, 8 bytes
//   \s0, \s1  16 consecutive source bytes (bytes 0..12 of s0||s1 are read)
// v0.8h holds the six filter coefficients as magnitudes; taps 1 and 4
// are the negative taps in VP8's filters, hence the mls.  The taps sum
// to 128, so the result is rounded/narrowed with a saturating shift by 7.
// Clobbers v18, v19, v21-v26.
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]         // negative tap
        mls             v22.8h, v25.8h, v0.h[4]         // negative tap
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h          // combine both partial sums
        sqrshrun        \d\().8b, v22.8h, #7            // round, narrow, clamp to u8
.endm
735
// 6-tap horizontal subpel filter producing 16 output pixels.
//   \d0       output, 16 bytes
//   \v0, \v1  32 consecutive source bytes
// NB: the macro parameters are named v0/v1; only the \-prefixed uses are
// substituted — a bare "v0.h[n]" below is the real coefficient register.
// Same filter layout as vp8_epel8_h6 (magnitudes in v0.8h, taps 1 and 4
// negative).  Clobbers v1-v3, v16-v23.
.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]        // negative tap
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]        // negative tap
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h         // low 8 pixels
        sqadd           v22.8h,  v3.8h,  v22.8h         // high 8 pixels
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm
771
// 6-tap vertical subpel filter producing two 8-pixel output rows at once.
//   \d0, \d1   output rows n and n+1 (8 bytes each)
//   \s0..\s6   seven consecutive source rows; destroyed (widened in
//              place and reused as accumulators)
// Row n = filter(s0..s5), row n+1 = filter(s1..s6); coefficient layout
// as in vp8_epel8_h6 (magnitudes in v0.8h, taps 1/4 negative).
// Clobbers v31.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h   , \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]   // negative taps
        mls             v31.8h   , \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h   , \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h   , \s0\().8h, v31.8h    // row n
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h // row n+1
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
799
// 4-tap horizontal subpel filter producing 8 output pixels.
//   \d        output, 8 bytes
//   \v0, \v1  source bytes (bytes 0..10 of v0||v1 are read)
// Uses coefficient lanes 1..4 of v0.8h (lanes 0 and 5 are zero for the
// 4-tap filter rows); taps 1 and 4 are negative, hence the mls.
// Clobbers v19, v20, v22, v23, v25.
.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]         // negative tap
        mls             v22.8h, v25.8h, v0.h[4]         // negative tap
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7            // round, narrow, clamp
.endm
815
// 4-tap vertical subpel filter producing two 8-pixel rows packed into
// a single register:
//   \d0        low 8 bytes = row n, high 8 bytes = row n+1
//   \s0..\s4   five consecutive source rows; destroyed
// Row n = filter(s0..s3), row n+1 = filter(s1..s4); coefficient lanes
// 1..4 of v0.8h as in vp8_epel8_h4.  Clobbers v21-v23.
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b
        mul             v21.8h,     \s1\().8h, v0.h[2]
        mul             v23.8h,     \s2\().8h, v0.h[3]
        mul             \s2\().8h,  \s2\().8h, v0.h[2]
        mul             v22.8h,     \s3\().8h, v0.h[3]
        mls             v21.8h,     \s0\().8h, v0.h[1]  // negative taps
        mls             v23.8h,     \s3\().8h, v0.h[4]
        mls             \s2\().8h,  \s1\().8h, v0.h[1]
        mls             v22.8h,     \s4\().8h, v0.h[4]
        sqadd           v21.8h,     v21.8h,    v23.8h   // row n
        sqadd           \s2\().8h,  \s2\().8h, v22.8h   // row n+1
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm
835
836
// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
// VP8 subpel filter coefficients (stored as magnitudes; the sign of
// taps 1 and 4 is applied with mls in the filter macros above).
// One 16-byte row per subpel position mx/my = 1..7; position 0
// (full-pel) has no row, hence the -16 bias the callers apply before
// indexing with (mx|my) << 4.
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
848
// 16px wide, 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Produces two output rows per iteration (h assumed even).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 6-tap: start 2 rows above

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17,  subpel_filters, -16
        add             x6,  x17,  x6, lsl #4  // y
        ld1             {v0.8h},     [x6]       // filter coefficients
1:
        // Load 7 source rows, each as two 8-byte halves.
        ld1             {v1.1d - v2.1d},    [x2], x3
        ld1             {v3.1d - v4.1d},    [x2], x3
        ld1             {v16.1d - v17.1d},  [x2], x3
        ld1             {v18.1d - v19.1d},  [x2], x3
        ld1             {v20.1d - v21.1d},  [x2], x3
        ld1             {v22.1d - v23.1d},  [x2], x3
        ld1             {v24.1d - v25.1d},  [x2]
        sub             x2,  x2,  x3, lsl #2    // rewind: net advance 2 rows

        // Filter the left and right 8-pixel halves separately.
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4, x4, #2
        b.ne            1b

        ret
endfunc
877
// 16px wide, 6-tap horizontal subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2            // 6-tap: start 2 pixels left
        sxtw            x5,  w5 // x

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        add             x5,  x17,  x5, lsl #4 // x
        ld1             {v0.8h},  [x5]          // filter coefficients
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
895
896
// 16px wide, 6-tap horizontal + 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Two passes over a 16-byte-aligned stack scratch buffer of (h+5)
// filtered rows x 16 bytes ((16+5)*16 = 336 max, +16 for alignment).
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows above and
        sub             x2,  x2,  #2            // 2 pixels left

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5 // x
        add             x16,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15           // align scratch to 16
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b


        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,  #15
        ld1             {v0.8h},     [x6]
        bic             x7,  x7,  #15
2:
        // Load 7 scratch rows, each split into 8-byte lo/hi halves.
        ld1             {v1.8b - v4.8b},    [x7], #32
        ld1             {v16.8b - v19.8b},  [x7], #32
        ld1             {v20.8b - v23.8b},  [x7], #32
        ld1             {v24.8b - v25.8b},  [x7]
        sub             x7,  x7,  #64           // net advance: 2 rows (32 bytes)

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        trn1            v1.2d, v1.2d, v2.2d     // rejoin lo/hi halves
        trn1            v3.2d, v3.2d, v4.2d

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc
945
// 8px wide, 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 6-tap: start 2 rows above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},  [x6]          // filter coefficients
1:
        ld1             {v2.8b},  [x2], x3      // 7 source rows
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2   // rewind: net advance 2 rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
972
// 8px wide, 6-tap horizontal subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2            // 6-tap: start 2 pixels left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},        [x5]    // filter coefficients
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
990
// 8px wide, 6-tap horizontal + 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Two passes over a 16-byte-aligned stack scratch buffer of (h+5)
// filtered rows x 8 bytes ((16+5)*8 = 168 max, +16 for alignment).
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows above and
        sub             x2,  x2,  #2            // 2 pixels left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15           // align scratch to 16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32      // rows n..n+3
        ld1             {v5.8b - v7.8b}, [x7]           // rows n+4..n+6

        sub             x7,  x7,  #16           // net advance: 2 rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1036
// 8px wide, 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3            // 4-tap: start 1 row above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},     [x6]       // filter coefficients
1:
        ld1             {v2.8b},     [x2], x3   // 5 source rows
        ld1             {v3.8b},     [x2], x3
        ld1             {v4.8b},     [x2], x3
        ld1             {v5.8b},     [x2], x3
        ld1             {v6.8b},     [x2]
        sub             x2,  x2,  x3,  lsl #1   // rewind: net advance 2 rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        // v2 holds both output rows: low half = row n, high half = row n+1.
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
1060
// 8px wide, 4-tap horizontal subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1            // 4-tap: start 1 pixel left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // filter coefficients
1:
        ld1             {v2.8b,v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1078
// 8px wide, 4-tap horizontal + 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Two passes over a 16-byte-aligned stack scratch buffer: (h+5) rows
// (6-tap vertical needs 5 extra) x 8 bytes, 168 max, +16 for alignment.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows above and
        sub             x2,  x2,  #1            // 1 pixel left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15           // align scratch to 16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32      // rows n..n+3
        ld1             {v5.8b - v7.8b}, [x7]           // rows n+4..n+6

        sub             x7,  x7,  #16           // net advance: 2 rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1124
// 8px wide, 4-tap horizontal + 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Two passes over a 16-byte-aligned stack scratch buffer of (h+3)
// filtered rows x 8 bytes (4-tap vertical needs 3 extra rows).
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3            // 1 row above and
        sub             x2,  x2,  #1            // 1 pixel left
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15           // align scratch to 16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16      // rows n, n+1
        ld1             {v3.8b - v5.8b}, [x7]           // rows n+2..n+4

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        // v1 holds both output rows: low half = row n, high half = row n+1.
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1169
// 8px wide, 6-tap horizontal + 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Two passes over a 16-byte-aligned stack scratch buffer of (h+3)
// filtered rows x 8 bytes.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3            // 1 row above and
        sub             x2,  x2,  #2            // 2 pixels left
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15           // align scratch to 16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16      // rows n, n+1
        ld1             {v3.8b - v5.8b}, [x7]           // rows n+2..n+4

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        // v1 holds both output rows: low half = row n, high half = row n+1.
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1214
// 4px wide, 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Processes four output rows per iteration: rows n..n+2 go into lane 0
// and rows n+2..n+4 into lane 1 of each source register, so one 8-wide
// filter pass yields rows n..n+3.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 6-tap: start 2 rows above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},    [x6]        // filter coefficients
1:
        ld1r            {v2.2s},    [x2], x3    // lane 0: 7 rows
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2
        ld1             {v2.s}[1],  [x2], x3    // lane 1: same 7 rows, 2 lower
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2   // net advance: 4 rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        // Output rows: v2 = {n, n+2}, v3 = {n+1, n+3} — store in row order.
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
1250
// 4px wide, 6-tap horizontal subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// Uses the 8-wide filter macro; only the low 4 result pixels are stored.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             x2,  x2,  #2            // 6-tap: start 2 pixels left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // filter coefficients
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1266
// 4px wide, 6-tap horizontal + 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// First pass writes (h+5) 4-byte filtered rows to a stack scratch
// buffer ((8+5)*4 = 52 bytes max); the second pass interleaves pairs
// of 4-px rows with trn1/trn2 so the 8-wide vertical macro produces
// four output rows at a time.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows above and
        sub             x2,  x2,  #2            // 2 pixels left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // horizontal coefficients

        sub             sp,  sp,  #52
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]     // vertical coefficients
        mov             x9,  sp
2:
        // Load scratch rows n..n+6 (lane 0) and n+2..n+8 (lane 1).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16           // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s     // regroup into 7 row-pair regs
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        // Output rows: v2 = {n, n+2}, v3 = {n+1, n+3} — store in row order.
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc
1314
// 4px wide, 4-tap horizontal + 6-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// Same two-pass scheme as ff_put_vp8_epel4_h6v6_neon, with the 4-tap
// horizontal filter in the first pass.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows above and
        sub             x2,  x2,  #1            // 1 pixel left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // horizontal coefficients

        sub             sp,  sp,  #52
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        // v2 doubles as the \v1 source; only the low 4 output pixels
        // (source bytes 0..6, all within v2) are kept below.
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]     // vertical coefficients
        mov             x9,  sp
2:
        // Load scratch rows n..n+6 (lane 0) and n+2..n+8 (lane 1).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16           // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s     // regroup into 7 row-pair regs
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        // Output rows: v2 = {n, n+2}, v3 = {n+1, n+3} — store in row order.
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc
1362
// 4px wide, 6-tap horizontal + 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// First pass writes (h+3) 4-byte rows to a stack scratch buffer
// ((8+3)*4 = 44 bytes max); second pass interleaves row pairs so the
// 8-wide 4-tap vertical macro yields four output rows per iteration.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3            // 1 row above and
        sub             x2,  x2,  #2            // 2 pixels left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // horizontal coefficients

        sub             sp,  sp,  #44
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2, v2, v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]     // vertical coefficients
        mov             x9,  sp
2:
        // Load scratch rows n..n+4 (lane 0) and n+2..n+6 (lane 1).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8            // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s     // regroup into 5 row-pair regs
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // v1 lanes hold rows {n, n+2, n+1, n+3} — store in row order.
        st1             {v1.s}[0],  [x0], x1
        st1             {v1.s}[2],  [x0], x1
        st1             {v1.s}[1],  [x0], x1
        st1             {v1.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc
1406
// 4px wide, 4-tap horizontal subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
function ff_put_vp8_epel4_h4_neon, export=1
        sub             x2,  x2,  #1            // 4-tap: start 1 pixel left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},    [x5]        // filter coefficients
1:
        ld1             {v2.8b},    [x2], x3
        // v2 doubles as the \v1 source; only the low 4 output pixels
        // (source bytes 0..6, all within v2) are stored.
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1422
// 4px wide, 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Four output rows per iteration: lane 0 of each source register holds
// rows n..n+4, lane 1 holds rows n+2..n+6.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             x2,  x2,  x3            // 4-tap: start 1 row above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]         // filter coefficients
1:
        ld1r            {v2.2s},   [x2], x3     // lane 0: 5 rows
        ld1r            {v3.2s},   [x2], x3
        ld1r            {v4.2s},   [x2], x3
        ld1r            {v5.2s},   [x2], x3
        ld1r            {v6.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #1
        ld1             {v2.s}[1], [x2], x3     // lane 1: same 5 rows, 2 lower
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        ld1             {v6.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #1   // net advance: 4 rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        // v2 lanes hold rows {n, n+2, n+1, n+3} — store in row order.
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[2], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v2.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
1454
// 4px wide, 4-tap horizontal + 4-tap vertical subpel MC.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h, w5 = mx, w6 = my
// First pass writes (h+3) 4-byte rows to a 44-byte stack scratch
// buffer; second pass interleaves row pairs for the 8-wide 4-tap
// vertical macro (four output rows per iteration).
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3            // 1 row above and
        sub             x2,  x2,  #1            // 1 pixel left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]     // horizontal coefficients

        sub             sp,  sp,  #44
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        // NOTE(review): v3 is never loaded here, so the \v1 bytes are
        // uninitialized.  Harmless for the stored low 4 pixels (they
        // only read source bytes 0..6, all within v2), but the sibling
        // epel4 h4 users pass v2 twice — confirm and align for clarity.
        vp8_epel8_h4    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]     // vertical coefficients
        mov             x9,  sp
2:
        // Load scratch rows n..n+4 (lane 0) and n+2..n+6 (lane 1).
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8            // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s     // regroup into 5 row-pair regs
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // v1 lanes hold rows {n, n+2, n+1, n+3} — store in row order.
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc
1498
/* Bilinear MC */
1500
// 16px wide horizontal bilinear MC:
//   dst[x] = (src[x] * (8 - mx) + src[x+1] * mx + 4) >> 3
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// Two rows per iteration (h assumed even).
function ff_put_vp8_bilin16_h_neon, export=1
        mov             w7,     #8
        dup             v0.8b,  w5              // mx
        sub             w5,     w7,     w5
        dup             v1.8b,  w5              // 8 - mx
1:
        subs            w4,     w4,     #2
        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
        ext             v5.8b,  v3.8b,  v4.8b,  #1      // src+1, high half
        ext             v4.8b,  v2.8b,  v3.8b,  #1      // src+1, low half
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v4.8b,  v16.8h, #3      // round and scale back
        rshrn2          v4.16b, v6.8h,  #3
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
1532
// 16px wide vertical bilinear MC:
//   dst[y][x] = (src[y][x] * (8 - my) + src[y+1][x] * my + 4) >> 3
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Two rows per iteration; the bottom source row of one pair is reused
// as the top row of the next.
function ff_put_vp8_bilin16_v_neon, export=1
        mov             w7,     #8
        dup             v0.16b, w6              // my
        sub             w6,     w7,     w6
        dup             v1.16b, w6              // 8 - my

        ld1             {v2.16b}, [x2], x3      // first source row
1:
        subs            w4,     w4,     #2
        ld1             {v4.16b}, [x2], x3
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3      // also next iteration's top row
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        rshrn           v4.8b,  v6.8h,  #3      // round and scale back
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
1562
// 2-D (horizontal then vertical) 2-tap bilinear MC, 16 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w5 = mx, w6 = my.
// First pass per row: h[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3,
// then vertically:    out  = (hN*(8-my) + hN+1*my + 4) >> 3.
// The previous row's horizontally filtered result is carried in v4
// across loop iterations, so each source row is h-filtered exactly once.
1563 function ff_put_vp8_bilin16_hv_neon, export=1
1564         mov             w7,      #8
1565         dup             v0.8b,   w5            // mx
1566         sub             w5,      w7,     w5
1567         dup             v1.8b,   w5            // 8 - mx
1568         dup             v2.16b,  w6            // my
1569         sub             w6,      w7,     w6
1570         dup             v3.16b,  w6            // 8 - my
1571
1572         ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3 // 24 bytes of row 0 (pixel 16 for ext)
1573
1574         ext             v7.8b,   v5.8b,  v6.8b, #1      // src[9..16]
1575         ext             v6.8b,   v4.8b,  v5.8b, #1      // src[1..8]
1576         umull           v16.8h,  v4.8b,  v1.8b          // h-filter row 0, lo half
1577         umlal           v16.8h,  v6.8b,  v0.8b
1578         umull           v18.8h,  v5.8b,  v1.8b          // h-filter row 0, hi half
1579         umlal           v18.8h,  v7.8b,  v0.8b
1580         rshrn           v4.8b,   v16.8h, #3             // v4 = h-filtered row 0 (carried)
1581         rshrn2          v4.16b,  v18.8h, #3
1582 1:
1583         subs            w4,  w4,  #2                    // two output rows per iteration
1584         ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
1585         ext             v21.8b,  v19.8b, v20.8b, #1
1586         ext             v20.8b,  v18.8b, v19.8b, #1
1587         umull           v22.8h,  v18.8b, v1.8b          // h-filter row N+1, lo
1588         umlal           v22.8h,  v20.8b, v0.8b
1589         ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
1590         umull           v24.8h,  v19.8b, v1.8b          // h-filter row N+1, hi
1591         umlal           v24.8h,  v21.8b, v0.8b
1592         ext             v29.8b,  v27.8b, v28.8b, #1
1593         ext             v28.8b,  v26.8b, v27.8b, #1
1594         umull           v16.8h,  v26.8b, v1.8b          // h-filter row N+2, lo
1595         umlal           v16.8h,  v28.8b, v0.8b
1596         umull           v18.8h,  v27.8b, v1.8b          // h-filter row N+2, hi
1597         umlal           v18.8h,  v29.8b, v0.8b
1598         rshrn           v6.8b,   v22.8h, #3             // v6 = h-filtered row N+1
1599         rshrn2          v6.16b,  v24.8h, #3
1600         umull           v24.8h,  v4.8b,  v3.8b          // v-filter: hN*(8-my)
1601         umlal           v24.8h,  v6.8b,  v2.8b          //         + hN+1*my
1602         umull2          v30.8h,  v4.16b, v3.16b
1603         umlal2          v30.8h,  v6.16b, v2.16b
1604         rshrn           v4.8b,   v16.8h, #3             // v4 = h-filtered row N+2 (next carry)
1605         rshrn2          v4.16b,  v18.8h, #3
1606         umull           v20.8h,  v6.8b,  v3.8b          // v-filter: hN+1*(8-my)
1607         umlal           v20.8h,  v4.8b,  v2.8b          //         + hN+2*my
1608         umull2          v22.8h,  v6.16b, v3.16b
1609         umlal2          v22.8h,  v4.16b, v2.16b
1610         rshrn           v24.8b,  v24.8h, #3             // final rounding of output row N
1611         rshrn2          v24.16b, v30.8h, #3
1612         st1             {v24.16b}, [x0], x1
1613         rshrn           v20.8b,  v20.8h, #3             // final rounding of output row N+1
1614         rshrn2          v20.16b, v22.8h, #3
1615         st1             {v20.16b}, [x0], x1
1616         b.gt            1b
1617
1618         ret
1619 endfunc
1620
// Horizontal 2-tap bilinear MC, 8 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w5 = mx; my (w6) unused.
// out[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3; two rows per iteration.
1621 function ff_put_vp8_bilin8_h_neon, export=1
1622         mov             w7,     #8
1623         dup             v0.8b,  w5                      // v0 = mx
1624         sub             w5,     w7,     w5
1625         dup             v1.8b,  w5                      // v1 = 8 - mx
1626 1:
1627         subs            w4,     w4,     #2              // two rows per iteration
1628         ld1             {v2.8b,v3.8b},  [x2],  x3       // 16 bytes: pixel 8 feeds the ext
1629         ext             v3.8b,  v2.8b,  v3.8b, #1       // v3 = src[1..8]
1630         umull           v4.8h,  v2.8b,  v1.8b           // src[x]*(8-mx)
1631         umlal           v4.8h,  v3.8b,  v0.8b           // + src[x+1]*mx
1632         ld1             {v6.8b,v7.8b},  [x2],  x3       // second row
1633         ext             v7.8b,  v6.8b,  v7.8b, #1
1634         umull           v16.8h, v6.8b,  v1.8b
1635         umlal           v16.8h, v7.8b,  v0.8b
1636         rshrn           v4.8b,  v4.8h,  #3              // (acc + 4) >> 3
1637         rshrn           v16.8b, v16.8h, #3
1638         st1             {v4.8b},  [x0], x1
1639         st1             {v16.8b}, [x0], x1
1640         b.gt            1b
1641
1642         ret
1643 endfunc
1644
// Vertical 2-tap bilinear MC, 8 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w6 = my; mx (w5) unused.
// out = (rowN*(8-my) + rowN+1*my + 4) >> 3; the last-loaded row is
// carried in v2 so each source row is loaded once.
1645 function ff_put_vp8_bilin8_v_neon, export=1
1646         mov             w7,      #8
1647         dup             v0.8b,   w6                     // v0 = my
1648         sub             w6,      w7,    w6
1649         dup             v1.8b,   w6                     // v1 = 8 - my
1650
1651         ld1             {v2.8b}, [x2],  x3              // prime v2 with row 0
1652 1:
1653         subs            w4,      w4,    #2              // two rows per iteration
1654         ld1             {v3.8b}, [x2],  x3              // row N+1
1655         umull           v4.8h,   v2.8b, v1.8b           // rowN*(8-my)
1656         umlal           v4.8h,   v3.8b, v0.8b           // + rowN+1*my
1657         ld1             {v2.8b}, [x2],  x3              // row N+2, carried to next iter
1658         umull           v6.8h,   v3.8b, v1.8b
1659         umlal           v6.8h,   v2.8b, v0.8b
1660         rshrn           v4.8b,   v4.8h, #3              // (acc + 4) >> 3
1661         rshrn           v6.8b,   v6.8h, #3
1662         st1             {v4.8b}, [x0],  x1
1663         st1             {v6.8b}, [x0],  x1
1664         b.gt            1b
1665
1666         ret
1667 endfunc
1668
// 2-D (horizontal then vertical) 2-tap bilinear MC, 8 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w5 = mx, w6 = my.
// h[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3, then
// out  = (hN*(8-my) + hN+1*my + 4) >> 3.
// v22 carries the previous row's horizontally filtered result across
// iterations, so each source row is h-filtered exactly once.
1669 function ff_put_vp8_bilin8_hv_neon, export=1
1670         mov             w7,     #8
1671         dup             v0.8b,  w5             // mx
1672         sub             w5,     w7,     w5
1673         dup             v1.8b,  w5             // 8 - mx
1674         dup             v2.8b,  w6             // my
1675         sub             w6,     w7,     w6
1676         dup             v3.8b,  w6             // 8 - my
1677
1678         ld1             {v4.8b,v5.8b},  [x2],  x3 // 16 bytes of row 0
1679         ext             v5.8b,  v4.8b,  v5.8b, #1 // src[1..8]
1680         umull           v18.8h, v4.8b,  v1.8b   // h-filter row 0
1681         umlal           v18.8h, v5.8b,  v0.8b
1682         rshrn           v22.8b, v18.8h, #3      // v22 = h-filtered row 0 (carried)
1683 1:
1684         subs            w4,     w4,     #2      // two output rows per iteration
1685         ld1             {v6.8b,v7.8b},  [x2],  x3
1686         ext             v7.8b,  v6.8b,  v7.8b, #1
1687         umull           v16.8h, v6.8b,  v1.8b   // h-filter row N+1
1688         umlal           v16.8h, v7.8b,  v0.8b
1689         ld1             {v4.8b,v5.8b},  [x2],  x3
1690         ext             v5.8b,  v4.8b,  v5.8b, #1
1691         umull           v18.8h, v4.8b,  v1.8b   // h-filter row N+2
1692         umlal           v18.8h, v5.8b,  v0.8b
1693         rshrn           v16.8b, v16.8h, #3      // v16 = h-filtered row N+1
1694         umull           v20.8h, v22.8b, v3.8b   // v-filter: hN*(8-my)
1695         umlal           v20.8h, v16.8b, v2.8b   //         + hN+1*my
1696         rshrn           v22.8b, v18.8h, #3      // v22 = h-filtered row N+2 (next carry)
1697         umull           v24.8h, v16.8b, v3.8b   // v-filter: hN+1*(8-my)
1698         umlal           v24.8h, v22.8b, v2.8b   //         + hN+2*my
1699         rshrn           v20.8b, v20.8h, #3      // final rounding, output row N
1700         st1             {v20.8b}, [x0], x1
1701         rshrn           v23.8b, v24.8h, #3      // output row N+1 (v23: keep v22 intact)
1702         st1             {v23.8b}, [x0], x1
1703         b.gt            1b
1704
1705         ret
1706 endfunc
1707
// Horizontal 2-tap bilinear MC, 4 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w5 = mx; my (w6) unused.
// Two 4-pixel rows are packed into one 8-lane vector (trn1) so a single
// umull/umlal/rshrn sequence filters both rows at once.
1708 function ff_put_vp8_bilin4_h_neon, export=1
1709         mov             w7,      #8
1710         dup             v0.8b,   w5                     // v0 = mx
1711         sub             w5,      w7,     w5
1712         dup             v1.8b,   w5                     // v1 = 8 - mx
1713 1:
1714         subs            w4,      w4,     #2             // two rows per iteration
1715         ld1             {v2.8b}, [x2],   x3             // 8 bytes; only src[0..4] matter
1716         ext             v3.8b,   v2.8b,  v3.8b,  #1     // v3 = src[1..]; stale hi lanes unused
1717         ld1             {v6.8b}, [x2],   x3             // second row
1718         ext             v7.8b,   v6.8b,  v7.8b,  #1
1719         trn1            v2.2s,   v2.2s,  v6.2s          // pack {row0, row1} src[x]
1720         trn1            v3.2s,   v3.2s,  v7.2s          // pack {row0, row1} src[x+1]
1721         umull           v4.8h,   v2.8b,  v1.8b          // src[x]*(8-mx)
1722         umlal           v4.8h,   v3.8b,  v0.8b          // + src[x+1]*mx
1723         rshrn           v4.8b,   v4.8h,  #3             // (acc + 4) >> 3
1724         st1             {v4.s}[0], [x0], x1             // row 0 (4 bytes)
1725         st1             {v4.s}[1], [x0], x1             // row 1
1726         b.gt            1b
1727
1728         ret
1729 endfunc
1730
// Vertical 2-tap bilinear MC, 4 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w6 = my; mx (w5) unused.
// Rows are packed in pairs: v2 = {rowN, rowN+1}, v3 = {rowN+1, rowN+2},
// so one umull/umlal pair v-filters two output rows at once.
1731 function ff_put_vp8_bilin4_v_neon, export=1
1732         mov             w7,     #8
1733         dup             v0.8b,  w6                      // v0 = my
1734         sub             w6,     w7,  w6
1735         dup             v1.8b,  w6                      // v1 = 8 - my
1736
1737         ld1r            {v2.2s},    [x2], x3            // prime v2 lane 0 with row 0
1738 1:
1739         ld1r            {v3.2s},   [x2]                 // row N+1 in both lanes (no advance)
1740         ld1             {v2.s}[1], [x2], x3             // v2 = {rowN, rowN+1}
1741         ld1             {v3.s}[1], [x2], x3             // v3 = {rowN+1, rowN+2}
1742         umull           v4.8h,  v2.8b,  v1.8b           // rowN*(8-my)
1743         umlal           v4.8h,  v3.8b,  v0.8b           // + rowN+1*my
1744         trn2            v2.2s,  v3.2s,  v2.2s           // carry rowN+2 into v2 lane 0
1745         rshrn           v4.8b,  v4.8h,  #3              // (acc + 4) >> 3
1746         st1             {v4.s}[0], [x0], x1             // output row N
1747         st1             {v4.s}[1], [x0], x1             // output row N+1
1748         subs            w4,     w4,     #2
1749         b.gt            1b
1750
1751         ret
1752 endfunc
1753
// 2-D (horizontal then vertical) 2-tap bilinear MC, 4 pixels wide.
// x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = h (assumes even -- TODO confirm), w5 = mx, w6 = my.
// h[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3, then
// out  = (hN*(8-my) + hN+1*my + 4) >> 3.
// Two rows are h-filtered per iteration packed into v16 = {hA, hB};
// v22 lane 0 carries the previous iteration's last h-filtered row.
1754 function ff_put_vp8_bilin4_hv_neon, export=1
1755         mov             w7,      #8
1756         dup             v0.8b,   w5             // mx
1757         sub             w5,      w7,     w5
1758         dup             v1.8b,   w5             // 8 - mx
1759         dup             v2.8b,   w6             // my
1760         sub             w6,      w7,     w6
1761         dup             v3.8b,   w6             // 8 - my
1762
1763         ld1             {v4.8b}, [x2],   x3     // 8 bytes of row 0
1764         ext             v5.8b,   v4.8b,  v4.8b,  #1 // src[1..]; only low 4 lanes matter
1765         umull           v18.8h,  v4.8b,  v1.8b  // h-filter row 0
1766         umlal           v18.8h,  v5.8b,  v0.8b
1767         rshrn           v22.8b,  v18.8h, #3     // v22 lane 0 = h-filtered row 0 (carried)
1768 1:
1769         subs            w4,      w4,     #2     // two output rows per iteration
1770         ld1             {v6.8b}, [x2],   x3     // row N+1
1771         ext             v7.8b,   v6.8b,  v6.8b,  #1
1772         ld1             {v4.8b}, [x2],   x3     // row N+2
1773         ext             v5.8b,   v4.8b,  v4.8b,  #1
1774         trn1            v6.2s,   v6.2s,  v4.2s  // pack {rowN+1, rowN+2} src[x]
1775         trn1            v7.2s,   v7.2s,  v5.2s  // pack {rowN+1, rowN+2} src[x+1]
1776         umull           v16.8h,  v6.8b,  v1.8b  // h-filter both rows at once
1777         umlal           v16.8h,  v7.8b,  v0.8b
1778         rshrn           v16.8b,  v16.8h, #3     // v16 = {hN+1, hN+2}
1779         umull           v20.8h,  v16.8b, v2.8b  // {hN+1, hN+2} * my
1780         trn1            v22.2s,  v22.2s, v16.2s // v22 = {hN, hN+1}
1781         umlal           v20.8h,  v22.8b, v3.8b  // + {hN, hN+1} * (8-my)
1782         rev64           v22.2s,  v16.2s         // carry hN+2 into v22 lane 0
1783         rshrn           v20.8b,  v20.8h, #3     // final rounding
1784         st1             {v20.s}[0], [x0], x1    // output row N
1785         st1             {v20.s}[1], [x0], x1    // output row N+1
1786         b.gt            1b
1787
1788         ret
1789 endfunc