/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

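// Inverse Walsh-Hadamard transform of the 16 luma DC coefficients:
//   x0 = output blocks; each result goes into the DC slot of one 4x4
//        sub-block, i.e. results are stored 32 bytes apart
//   x1 = the 16 input coefficients, cleared on return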
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        movi            v30.8h, #0

        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        st1             {v30.8h}, [x1], #16
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        st1             {v30.8h}, [x1]
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        movi            v16.4h, #3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        add             v0.4h,  v0.4h,  v16.4h

        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        sshr            v0.4h,  v0.4h,  #3
        sshr            v1.4h,  v1.4h,  #3
        sshr            v2.4h,  v2.4h,  #3
        sshr            v3.4h,  v3.4h,  #3

        mov             x3,  #32
        st1             {v0.h}[0],  [x0], x3
        st1             {v1.h}[0],  [x0], x3
        st1             {v2.h}[0],  [x0], x3
        st1             {v3.h}[0],  [x0], x3
        st1             {v0.h}[1],  [x0], x3
        st1             {v1.h}[1],  [x0], x3
        st1             {v2.h}[1],  [x0], x3
        st1             {v3.h}[1],  [x0], x3
        st1             {v0.h}[2],  [x0], x3
        st1             {v1.h}[2],  [x0], x3
        st1             {v2.h}[2],  [x0], x3
        st1             {v3.h}[2],  [x0], x3
        st1             {v0.h}[3],  [x0], x3
        st1             {v1.h}[3],  [x0], x3
        st1             {v2.h}[3],  [x0], x3
        st1             {v3.h}[3],  [x0], x3

        ret
endfunc

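// Add the inverse 4x4 transform of the 16 coefficients at x1 to the 4x4
// block of pixels at x0 (row stride x2); the coefficient block is cleared
// on return.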
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b},  [x1]
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4

        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        sqshrn          v21.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h,  v0.4h,   v2.4h
        sub             v17.4h,  v0.4h,   v2.4h

        add             v18.4h,  v21.4h,  v23.4h
        sub             v19.4h,  v20.4h,  v22.4h

        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        movi            v29.8h, #0
        smull           v26.4s,     v1.4h,  v4.h[0]
        st1             {v29.8h},   [x1],   #16
        smull           v27.4s,     v3.4h,  v4.h[0]
        st1             {v29.16b},  [x1]
        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
        sqshrn          v20.4h,     v26.4s, #16
        sqshrn          v22.4h,     v27.4s, #16
        add             v20.4h,     v20.4h, v1.4h
        add             v22.4h,     v22.4h, v3.4h
        add             v16.4h,     v0.4h,  v2.4h
        sub             v17.4h,     v0.4h,  v2.4h

        add             v18.4h,     v20.4h, v23.4h
        ld1             {v24.d}[0], [x0],   x2
        zip1            v16.2d,     v16.2d, v17.2d
        sub             v19.4h,     v21.4h, v22.4h
        ld1             {v25.d}[0], [x0],   x2
        zip1            v18.2d,     v18.2d, v19.2d
        add             v0.8h,      v16.8h, v18.8h
        ld1             {v25.d}[1], [x0],   x2
        sub             v1.8h,      v16.8h, v18.8h
        ld1             {v24.d}[1], [x0],   x2
        srshr           v0.8h,      v0.8h,  #3
        trn1            v24.4s,     v24.4s, v25.4s
        srshr           v1.8h,      v1.8h,  #3
        sub             x0,  x0,  x2,  lsl #2

        ext             v1.16b, v1.16b, v1.16b, #8
        trn1            v3.2d,  v0.2d,  v1.2d
        trn2            v0.2d,  v0.2d,  v1.2d
        trn1            v1.8h,  v3.8h,  v0.8h
        trn2            v3.8h,  v3.8h,  v0.8h
        uzp1            v0.4s,  v1.4s,  v3.4s
        uzp2            v1.4s,  v3.4s,  v1.4s

        uaddw           v0.8h,  v0.8h, v24.8b
        uaddw2          v1.8h,  v1.8h, v24.16b
        sqxtun          v0.8b,  v0.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v0.s}[3],  [x0], x2
        st1             {v0.s}[2],  [x0], x2

        ret
endfunc

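// DC-only idct_add of four blocks at once, arranged 2x2 in an 8x8 chroma
// area: x0 = dst, x1 = four coefficient blocks 32 bytes apart (each DC
// value is cleared), x2 = stride. Blocks 0/1 cover the top four rows,
// blocks 2/3 the bottom four.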
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,  #0
        mov             x3,     #32
        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1],  v17.d[0]
        ins             v18.d[1],  v19.d[0]
        mov             x3,  x0
        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
        ld1             {v0.8b},   [x0], x2
        srshr           v18.8h,    v18.8h,  #3
        ld1             {v1.8b},   [x0], x2
        uaddw           v20.8h,    v16.8h, v0.8b
        ld1             {v2.8b},   [x0], x2
        uaddw           v0.8h,     v16.8h, v1.8b
        ld1             {v3.8b},   [x0], x2
        uaddw           v22.8h,    v16.8h, v2.8b
        ld1             {v4.8b},   [x0], x2
        uaddw           v2.8h,     v16.8h, v3.8b
        ld1             {v5.8b},   [x0], x2
        uaddw           v24.8h,    v18.8h, v4.8b
        ld1             {v6.8b},   [x0], x2
        uaddw           v4.8h,     v18.8h, v5.8b
        ld1             {v7.8b},   [x0], x2
        uaddw           v26.8h,    v18.8h, v6.8b
        sqxtun          v20.8b,    v20.8h
        uaddw           v6.8h,     v18.8h, v7.8b
        sqxtun          v21.8b,    v0.8h
        sqxtun          v22.8b,    v22.8h
        st1             {v20.8b},  [x3], x2
        sqxtun          v23.8b,    v2.8h
        st1             {v21.8b},  [x3], x2
        sqxtun          v24.8b,    v24.8h
        st1             {v22.8b},  [x3], x2
        sqxtun          v25.8b,    v4.8h
        st1             {v23.8b},  [x3], x2
        sqxtun          v26.8b,    v26.8h
        st1             {v24.8b},  [x3], x2
        sqxtun          v27.8b,    v6.8h
        st1             {v25.8b},  [x3], x2
        st1             {v26.8b},  [x3], x2
        st1             {v27.8b},  [x3], x2

        ret
endfunc

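// DC-only idct_add of four blocks laid out side by side across a 16 pixel
// wide luma row: x0 = dst (4 rows of 16 pixels), x1 = four coefficient
// blocks 32 bytes apart (each DC value is cleared), x2 = stride.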
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b,  #0
        mov             x3,  #32
        ld1r            {v16.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v17.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        zip1            v16.2d,      v16.2d, v17.2d
        ld1r            {v18.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v19.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        zip1            v18.2d,      v18.2d, v19.2d
        srshr           v16.8h,      v16.8h,  #3            // dc >>= 3
        ld1             {v0.16b},     [x0], x2
        srshr           v18.8h,       v18.8h,  #3
        ld1             {v1.16b},     [x0], x2
        uaddw           v20.8h,       v16.8h,  v0.8b
        ld1             {v2.16b},     [x0], x2
        uaddw2          v0.8h,        v18.8h,   v0.16b
        ld1             {v3.16b},     [x0], x2
        uaddw           v21.8h, v16.8h,  v1.8b
        uaddw2          v1.8h,  v18.8h,  v1.16b
        uaddw           v22.8h, v16.8h,  v2.8b
        uaddw2          v2.8h,  v18.8h,  v2.16b
        uaddw           v23.8h, v16.8h,  v3.8b
        uaddw2          v3.8h,  v18.8h,  v3.16b
        sub             x0,  x0,  x2,  lsl #2
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b},    [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b},    [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b},    [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b},    [x0], x2

        ret
endfunc

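// DC-only idct_add of a single 4x4 block: x0 = dst, x1 = coefficient block
// (the DC value is cleared), x2 = stride.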
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]
        strh            w3,       [x1]
        srshr           v2.8h,  v2.8h,  #3
        ld1             {v0.s}[0],  [x0], x2
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc

// Register layout:
//   P3..Q3 -> v0..v7
//   flim_E -> v22
//   flim_I -> v23
//   hev_thresh -> passed as a macro argument (w4 or w5 depending on the caller)
//
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

        movi           v20.8h, #3
        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
        mul            v19.8h, v19.8h, v20.8h

        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
        movi           v22.16b, #4
        movi           v23.16b, #3
    .if \inner
        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
        saddw2         v19.8h,  v19.8h, v20.16b
        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
        and            v20.16b, v18.16b, v17.16b           // w & hev
        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi           v17.8h,  #63
        sshll          v22.8h,  v18.8b, #3
        sshll2         v23.8h,  v18.16b, #3
        saddw          v22.8h,  v22.8h, v18.8b
        saddw2         v23.8h,  v23.8h, v18.16b
        add            v16.8h,  v17.8h, v22.8h
        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
        add            v19.8h,  v16.8h, v22.8h
        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
        add            v22.8h,  v19.8h, v22.8h
        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
        sqshrn         v16.8b,  v16.8h,  #7
        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
        sqshrn         v19.8b,  v19.8h, #7
        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
        sqshrn         v22.8b,  v22.8h, #7
        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
.endm

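// Vertical loop filter (filtering across a horizontal edge) on a 16 pixel
// wide edge: x0 = dst, pointing at the first row below the edge (Q0),
// x1 = stride, w2 = flim_E (or flim for the simple filter), w3 = flim_I,
// w4 = hev_thresh.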
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b},     [x0], x1 // P3
        ld1             {v1.16b},     [x0], x1 // P2
    .endif
        ld1             {v2.16b},     [x0], x1 // P1
        ld1             {v3.16b},     [x0], x1 // P0
        ld1             {v4.16b},     [x0], x1 // Q0
        ld1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b},     [x0], x1 // Q2
        ld1             {v7.16b},     [x0]     // Q3
        dup             v23.16b, w3                 // flim_I
    .endif
        dup             v22.16b, w2                 // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2:  dst -= stride * 6
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b},     [x0], x1 // P2
    .endif
        st1             {v2.16b},     [x0], x1 // P1
        st1             {v3.16b},     [x0], x1 // P0
        st1             {v4.16b},     [x0], x1 // Q0
        st1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b},     [x0]     // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

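// Same filter on an 8 pixel wide edge of both chroma planes at once:
// x0 = dst U, x1 = dst V (both pointing at row Q0), x2 = stride,
// w3 = flim_E, w4 = flim_I, w5 = hev_thresh. U is kept in the low and V in
// the high half of each vector.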
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2  // P3
        ld1          {v0.d}[1],     [x1], x2  // P3
        ld1          {v1.d}[0],     [x0], x2  // P2
        ld1          {v1.d}[1],     [x1], x2  // P2
        ld1          {v2.d}[0],     [x0], x2  // P1
        ld1          {v2.d}[1],     [x1], x2  // P1
        ld1          {v3.d}[0],     [x0], x2  // P0
        ld1          {v3.d}[1],     [x1], x2  // P0
        ld1          {v4.d}[0],     [x0], x2  // Q0
        ld1          {v4.d}[1],     [x1], x2  // Q0
        ld1          {v5.d}[0],     [x0], x2  // Q1
        ld1          {v5.d}[1],     [x1], x2  // Q1
        ld1          {v6.d}[0],     [x0], x2  // Q2
        ld1          {v6.d}[1],     [x1], x2  // Q2
        ld1          {v7.d}[0],     [x0]      // Q3
        ld1          {v7.d}[1],     [x1]      // Q3

        dup          v22.16b, w3                 // flim_E
        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub          x0,  x0,  x2,  lsl #2
        sub          x1,  x1,  x2,  lsl #2
        sub          x0,  x0,  x2,  lsl #1
        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1          {v1.d}[0],     [x0], x2  // P2
        st1          {v1.d}[1],     [x1], x2  // P2
        st1          {v2.d}[0],     [x0], x2  // P1
        st1          {v2.d}[1],     [x1], x2  // P1
        st1          {v3.d}[0],     [x0], x2  // P0
        st1          {v3.d}[1],     [x1], x2  // P0
        st1          {v4.d}[0],     [x0], x2  // Q0
        st1          {v4.d}[1],     [x1], x2  // Q0
        st1          {v5.d}[0],     [x0], x2  // Q1
        st1          {v5.d}[1],     [x1], x2  // Q1
        st1          {v6.d}[0],     [x0]      // Q2
        st1          {v6.d}[1],     [x1]      // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

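// Horizontal loop filter (filtering across a vertical edge) on a 16 pixel
// high edge: x0 = dst, pointing at the first pixel right of the edge (Q0),
// x1 = stride, w2 = flim_E (or flim for the simple filter), w3 = flim_I,
// w4 = hev_thresh. 16 rows of 8 pixels are loaded and transposed so the
// same filter code as in the vertical case can be used.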
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
        dup             v23.16b, w3                 // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

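// Horizontal loop filter on an 8 pixel high edge of both chroma planes:
// x0 = dst U, x1 = dst V (both pointing at Q0), x2 = stride, w3 = flim_E,
// w4 = flim_I, w5 = hev_thresh. Eight rows of each plane are loaded, U in
// the low and V in the high half of each vector, and transposed.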
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4
        sub             x1,  x1,  #4

        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2 // load u
        ld1          {v0.d}[1],     [x1], x2 // load v
        ld1          {v1.d}[0],     [x0], x2
        ld1          {v1.d}[1],     [x1], x2
        ld1          {v2.d}[0],     [x0], x2
        ld1          {v2.d}[1],     [x1], x2
        ld1          {v3.d}[0],     [x0], x2
        ld1          {v3.d}[1],     [x1], x2
        ld1          {v4.d}[0],     [x0], x2
        ld1          {v4.d}[1],     [x1], x2
        ld1          {v5.d}[0],     [x0], x2
        ld1          {v5.d}[1],     [x1], x2
        ld1          {v6.d}[0],     [x0], x2
        ld1          {v6.d}[1],     [x1], x2
        ld1          {v7.d}[0],     [x0], x2
        ld1          {v7.d}[1],     [x1], x2

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1          {v0.d}[0],     [x0], x2 // store u
        st1          {v0.d}[1],     [x1], x2 // store v
        st1          {v1.d}[0],     [x0], x2
        st1          {v1.d}[1],     [x1], x2
        st1          {v2.d}[0],     [x0], x2
        st1          {v2.d}[1],     [x1], x2
        st1          {v3.d}[0],     [x0], x2
        st1          {v3.d}[1],     [x1], x2
        st1          {v4.d}[0],     [x0], x2
        st1          {v4.d}[1],     [x1], x2
        st1          {v5.d}[0],     [x0], x2
        st1          {v5.d}[1],     [x1], x2
        st1          {v6.d}[0],     [x0], x2
        st1          {v6.d}[1],     [x1], x2
        st1          {v7.d}[0],     [x0]
        st1          {v7.d}[1],     [x1]

        ret

endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1


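// Plain copy: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = height; four rows are copied per loop iteration.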
function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.16b},     [x2], x3
        ld1             {v1.16b},     [x2], x3
        ld1             {v2.16b},     [x2], x3
        ld1             {v3.16b},     [x2], x3
        st1             {v0.16b},     [x0], x1
        st1             {v1.16b},     [x0], x1
        st1             {v2.16b},     [x0], x1
        st1             {v3.16b},     [x0], x1
        b.gt            1b
        ret
endfunc

function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

/* 4/6-tap 8th-pel MC */

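// Common argument layout of the subpel MC functions below:
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = height,
//   w5 = mx, w6 = my. mx/my (1..7) select a row of subpel_filters; index 0
//   (no filtering in that direction) never reaches these functions, hence
//   the -16 bias when forming the table address.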
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2, s3, s4, s5
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             \s3\().8h, \s3\().8h, v0.h[3]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             \s3\().8h, \s4\().8h, v0.h[4]
        mla             \s2\().8h, \s0\().8h, v0.h[0]
        mla             \s3\().8h, \s5\().8h, v0.h[5]
        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
        sqrshrun        \d0\().8b, \s3\().8h, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h   , \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h   , \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h   , \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h   , \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm

.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b
        mul             v21.8h,     \s1\().8h, v0.h[2]
        mul             v23.8h,     \s2\().8h, v0.h[3]
        mul             \s2\().8h,  \s2\().8h, v0.h[2]
        mul             v22.8h,     \s3\().8h, v0.h[3]
        mls             v21.8h,     \s0\().8h, v0.h[1]
        mls             v23.8h,     \s3\().8h, v0.h[4]
        mls             \s2\().8h,  \s1\().8h, v0.h[1]
        mls             v22.8h,     \s4\().8h, v0.h[4]
        sqadd           v21.8h,     v21.8h,    v23.8h
        sqadd           \s2\().8h,  \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm


// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17,  subpel_filters, -16
        add             x6,  x17,  x6, lsl #4  // y
        ld1             {v0.8h},     [x6]
1:
        ld1             {v1.1d - v2.1d},    [x2], x3
        ld1             {v3.1d - v4.1d},    [x2], x3
        ld1             {v16.1d - v17.1d},  [x2], x3
        ld1             {v18.1d - v19.1d},  [x2], x3
        ld1             {v20.1d - v21.1d},  [x2], x3
        ld1             {v22.1d - v23.1d},  [x2], x3
        ld1             {v24.1d - v25.1d},  [x2]
        sub             x2,  x2,  x3, lsl #2

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4, x4, #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2
        sxtw            x5,  w5 // x

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        add             x5,  x17,  x5, lsl #4 // x
        ld1             {v0.8h},  [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc


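// Combined 6-tap horizontal + 6-tap vertical filter. The first pass writes
// h+5 filtered rows of 16 bytes each into a 16 byte aligned scratch buffer
// on the stack (the 336 bytes cover up to 21 rows, i.e. h = 16); the second
// pass then filters vertically out of that buffer.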
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5 // x
        add             x16,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b


        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,  #15
        ld1             {v0.8h},     [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b},    [x7], #32
        ld1             {v16.8b - v19.8b},  [x7], #32
        ld1             {v20.8b - v23.8b},  [x7]
        sub             x7,  x7,  #48

        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
        trn1            v2.2d, v5.2d, v2.2d

        st1             {v2.16b}, [x0], x1
        subs            x4, x4, #1
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},        [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

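// Same two-pass scheme as ff_put_vp8_epel16_h6v6 above, but 8 pixels wide:
// the scratch buffer holds h+5 rows of 8 bytes (the 168 bytes cover up to
// 21 rows).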
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},     [x6]
1:
        ld1             {v2.8b},     [x2], x3
        ld1             {v3.8b},     [x2], x3
        ld1             {v4.8b},     [x2], x3
        ld1             {v5.8b},     [x2], x3
        ld1             {v6.8b},     [x2]
        sub             x2,  x2,  x3,  lsl #1

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]
1:
        ld1             {v2.8b,v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc