/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

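// The 4x4 inverse transform below is the usual VP8 two-pass scheme (one 1-D
// pass, a transpose, then the second pass fused with the add to the
// destination), built around the fixed-point constants
//   20091 = (sqrt(2)*cos(pi/8) - 1) * 65536
//   35468 =  sqrt(2)*sin(pi/8)      * 65536
// A rough scalar sketch of one 1-D pass (names are illustrative, with
// MUL_x(a) = (a * x) >> 16):
//
//   t0 = in[0] + in[2];
//   t1 = in[0] - in[2];
//   t2 = MUL_35468(in[1]) - (in[3] + MUL_20091(in[3]));
//   t3 = (in[1] + MUL_20091(in[1])) + MUL_35468(in[3]);
//   out[0] = t0 + t3;  out[1] = t1 + t2;
//   out[2] = t1 - t2;  out[3] = t0 - t3;
//
// smull+sqshrn+add computes a + MUL_20091(a); sqdmulh with 35468/2
// computes MUL_35468(a), the halved constant compensating for the
// doubling that sqdmulh performs.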
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b},  [x1]
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4

        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        sqshrn          v21.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h,  v0.4h,   v2.4h
        sub             v17.4h,  v0.4h,   v2.4h

        add             v18.4h,  v21.4h,  v23.4h
        sub             v19.4h,  v20.4h,  v22.4h

        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        movi            v29.8h, #0
        smull           v26.4s,     v1.4h,  v4.h[0]
        st1             {v29.8h},   [x1],   #16
        smull           v27.4s,     v3.4h,  v4.h[0]
        st1             {v29.16b},  [x1]
        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
        sqshrn          v20.4h,     v26.4s, #16
        sqshrn          v22.4h,     v27.4s, #16
        add             v20.4h,     v20.4h, v1.4h
        add             v22.4h,     v22.4h, v3.4h
        add             v16.4h,     v0.4h,  v2.4h
        sub             v17.4h,     v0.4h,  v2.4h

        add             v18.4h,     v20.4h, v23.4h
        ld1             {v24.d}[0], [x0],   x2
        zip1            v16.2d,     v16.2d, v17.2d
        sub             v19.4h,     v21.4h, v22.4h
        ld1             {v25.d}[0], [x0],   x2
        zip1            v18.2d,     v18.2d, v19.2d
        add             v0.8h,      v16.8h, v18.8h
        ld1             {v25.d}[1], [x0],   x2
        sub             v1.8h,      v16.8h, v18.8h
        ld1             {v24.d}[1], [x0],   x2
        srshr           v0.8h,      v0.8h,  #3
        trn1            v24.4s,     v24.4s, v25.4s
        srshr           v1.8h,      v1.8h,  #3
        sub             x0,  x0,  x2,  lsl #2

        ext             v1.16b, v1.16b, v1.16b, #8
        trn1            v3.2d,  v0.2d,  v1.2d
        trn2            v0.2d,  v0.2d,  v1.2d
        trn1            v1.8h,  v3.8h,  v0.8h
        trn2            v3.8h,  v3.8h,  v0.8h
        uzp1            v0.4s,  v1.4s,  v3.4s
        uzp2            v1.4s,  v3.4s,  v1.4s

        uaddw           v0.8h,  v0.8h, v24.8b
        uaddw2          v1.8h,  v1.8h, v24.16b
        sqxtun          v0.8b,  v0.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v0.s}[3],  [x0], x2
        st1             {v0.s}[2],  [x0], x2

        ret
endfunc

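// DC-only inverse transform plus add, for four horizontally adjacent 4x4
// luma blocks at once (one 16-pixel wide, 4-row stripe).  Per block,
// roughly (a sketch; names are illustrative):
//
//   dc = (block[0] + 4) >> 3;                  // the srshr #3 below
//   for (y = 0; y < 4; y++)
//       for (x = 0; x < 4; x++)
//           dst[y*stride + x] = clamp_u8(dst[y*stride + x] + dc);
//
// The DC coefficients are zeroed as they are read; the #32 stride steps
// from one block's coefficient buffer to the next (16 halfwords each).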
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b,  #0
        mov             x3,  #32
        ld1r            {v16.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v17.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        zip1            v16.2d,      v16.2d, v17.2d
        ld1r            {v18.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v19.4h},    [x1]
        st1             {v0.h}[0],   [x1], x3
        zip1            v18.2d,      v18.2d, v19.2d
        srshr           v16.8h,      v16.8h,  #3            // dc = (dc + 4) >> 3
        ld1             {v0.16b},     [x0], x2
        srshr           v18.8h,       v18.8h,  #3
        ld1             {v1.16b},     [x0], x2
        uaddw           v20.8h,       v16.8h,  v0.8b
        ld1             {v2.16b},     [x0], x2
        uaddw2          v0.8h,        v18.8h,   v0.16b
        ld1             {v3.16b},     [x0], x2
        uaddw           v21.8h, v16.8h,  v1.8b
        uaddw2          v1.8h,  v18.8h,  v1.16b
        uaddw           v22.8h, v16.8h,  v2.8b
        uaddw2          v2.8h,  v18.8h,  v2.16b
        uaddw           v23.8h, v16.8h,  v3.8b
        uaddw2          v3.8h,  v18.8h,  v3.16b
        sub             x0,  x0,  x2,  lsl #2
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b},    [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b},    [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b},    [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b},    [x0], x2

        ret
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]
        strh            w3,       [x1]
        srshr           v2.8h,  v2.8h,  #3
        ld1             {v0.s}[0],  [x0], x2
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc

// Register layout:
//   P3..Q3 -> v0..v7
//   flim_E -> v22
//   flim_I -> v23
//   hev_thresh -> x5
//
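// The filter body works on sign-flipped samples: XOR with 0x80 maps the
// unsigned 0..255 pixel range onto signed -128..127, so saturating signed
// arithmetic implements the clamp() calls of the scalar reference.  For
// the 4-tap filter_common case this is roughly (a sketch; s2u(v) = v ^ 0x80):
//
//   w  = clamp(PS1 - QS1);            // hev-masked in the inner variant
//   w  = clamp(w + 3 * (QS0 - PS0));
//   c1 = clamp(w + 4) >> 3;
//   c2 = clamp(w + 3) >> 3;
//   Q0 = s2u(QS0 - c1);
//   P0 = s2u(PS0 + c2);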
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

        movi           v20.8h, #3
        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
        mul            v19.8h, v19.8h, v20.8h

        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
        movi           v22.16b, #4
        movi           v23.16b, #3
    .if \inner
        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
        saddw2         v19.8h,  v19.8h, v20.16b
        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
        and            v20.16b, v18.16b, v17.16b           // w & hev
        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi           v17.8h,  #63
        sshll          v22.8h,  v18.8b, #3
        sshll2         v23.8h,  v18.16b, #3
        saddw          v22.8h,  v22.8h, v18.8b
        saddw2         v23.8h,  v23.8h, v18.16b
        add            v16.8h,  v17.8h, v22.8h
        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
        add            v19.8h,  v16.8h, v22.8h
        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
        add            v22.8h,  v19.8h, v22.8h
        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
        sqshrn         v16.8b,  v16.8h,  #7
        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
        sqshrn         v19.8b,  v19.8h, #7
        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
        sqshrn         v22.8b,  v22.8h, #7
        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
.endm

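// Wrappers for filtering horizontal edges: whole pixel rows can be loaded
// and stored directly, so these only need pointer stepping around the
// filter macro.  The 8uv versions put the u row in the low and the v row
// in the high 64 bits of each register, filtering both chroma planes in
// one 16-lane pass.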
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  x1,  lsl #1+!\simple  // dst -= 4*stride (2 for simple)

        // Load pixels:
    .if !\simple
        ld1             {v0.16b},     [x0], x1 // P3
        ld1             {v1.16b},     [x0], x1 // P2
    .endif
        ld1             {v2.16b},     [x0], x1 // P1
        ld1             {v3.16b},     [x0], x1 // P0
        ld1             {v4.16b},     [x0], x1 // Q0
        ld1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b},     [x0], x1 // Q2
        ld1             {v7.16b},     [x0]     // Q3
        dup             v23.16b, w3                 // flim_I
    .endif
        dup             v22.16b, w2                 // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2 (P1 for simple):  dst -= stride * 6 (or 4)
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b},     [x0], x1 // P2
    .endif
        st1             {v2.16b},     [x0], x1 // P1
        st1             {v3.16b},     [x0], x1 // P0
        st1             {v4.16b},     [x0], x1 // Q0
        st1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b},     [x0]     // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2  // P3
        ld1          {v0.d}[1],     [x1], x2  // P3
        ld1          {v1.d}[0],     [x0], x2  // P2
        ld1          {v1.d}[1],     [x1], x2  // P2
        ld1          {v2.d}[0],     [x0], x2  // P1
        ld1          {v2.d}[1],     [x1], x2  // P1
        ld1          {v3.d}[0],     [x0], x2  // P0
        ld1          {v3.d}[1],     [x1], x2  // P0
        ld1          {v4.d}[0],     [x0], x2  // Q0
        ld1          {v4.d}[1],     [x1], x2  // Q0
        ld1          {v5.d}[0],     [x0], x2  // Q1
        ld1          {v5.d}[1],     [x1], x2  // Q1
        ld1          {v6.d}[0],     [x0], x2  // Q2
        ld1          {v6.d}[1],     [x1], x2  // Q2
        ld1          {v7.d}[0],     [x0]      // Q3
        ld1          {v7.d}[1],     [x1]      // Q3

        dup          v22.16b, w3                 // flim_E
        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub          x0,  x0,  x2,  lsl #2
        sub          x1,  x1,  x2,  lsl #2
        sub          x0,  x0,  x2,  lsl #1
        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1          {v1.d}[0],     [x0], x2  // P2
        st1          {v1.d}[1],     [x1], x2  // P2
        st1          {v2.d}[0],     [x0], x2  // P1
        st1          {v2.d}[1],     [x1], x2  // P1
        st1          {v3.d}[0],     [x0], x2  // P0
        st1          {v3.d}[1],     [x1], x2  // P0
        st1          {v4.d}[0],     [x0], x2  // Q0
        st1          {v4.d}[1],     [x1], x2  // Q0
        st1          {v5.d}[0],     [x0], x2  // Q1
        st1          {v5.d}[1],     [x1], x2  // Q1
        st1          {v6.d}[0],     [x0]      // Q2
        st1          {v6.d}[1],     [x1]      // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

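// Wrappers for filtering vertical edges: the taps sit side by side within
// each pixel row, so 16 rows of 8 bytes are loaded and run through
// transpose_8x16B to bring each pixel column into one register, and are
// transposed back before storing.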
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
        dup             v23.16b, w3                 // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4
        sub             x1,  x1,  #4

        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2 // load u
        ld1          {v0.d}[1],     [x1], x2 // load v
        ld1          {v1.d}[0],     [x0], x2
        ld1          {v1.d}[1],     [x1], x2
        ld1          {v2.d}[0],     [x0], x2
        ld1          {v2.d}[1],     [x1], x2
        ld1          {v3.d}[0],     [x0], x2
        ld1          {v3.d}[1],     [x1], x2
        ld1          {v4.d}[0],     [x0], x2
        ld1          {v4.d}[1],     [x1], x2
        ld1          {v5.d}[0],     [x0], x2
        ld1          {v5.d}[1],     [x1], x2
        ld1          {v6.d}[0],     [x0], x2
        ld1          {v6.d}[1],     [x1], x2
        ld1          {v7.d}[0],     [x0], x2
        ld1          {v7.d}[1],     [x1], x2

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1          {v0.d}[0],     [x0], x2 // store u
        st1          {v0.d}[1],     [x1], x2 // store v
        st1          {v1.d}[0],     [x0], x2
        st1          {v1.d}[1],     [x1], x2
        st1          {v2.d}[0],     [x0], x2
        st1          {v2.d}[1],     [x1], x2
        st1          {v3.d}[0],     [x0], x2
        st1          {v3.d}[1],     [x1], x2
        st1          {v4.d}[0],     [x0], x2
        st1          {v4.d}[1],     [x1], x2
        st1          {v5.d}[0],     [x0], x2
        st1          {v5.d}[1],     [x1], x2
        st1          {v6.d}[0],     [x0], x2
        st1          {v6.d}[1],     [x1], x2
        st1          {v7.d}[0],     [x0]
        st1          {v7.d}[1],     [x1]

        ret
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

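// Full-pel copies (mx = my = 0): plain block copies, four rows per
// iteration (h is assumed to be a multiple of 4 here, which holds for the
// VP8 block sizes).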
function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.16b},     [x2], x3
        ld1             {v1.16b},     [x2], x3
        ld1             {v2.16b},     [x2], x3
        ld1             {v3.16b},     [x2], x3
        st1             {v0.16b},     [x0], x1
        st1             {v1.16b},     [x0], x1
        st1             {v2.16b},     [x0], x1
        st1             {v3.16b},     [x0], x1
        b.gt            1b
        ret
endfunc

function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

/* 4/6-tap 8th-pel MC */

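// Each 6-tap macro below evaluates, per output pixel (a scalar sketch;
// f[] is the subpel_filters row picked by the fractional MV position):
//
//   dst[x] = clamp_u8((  f[0]*src[x-2] - f[1]*src[x-1] + f[2]*src[x]
//                      + f[3]*src[x+1] - f[4]*src[x+2] + f[5]*src[x+3]
//                      + 64) >> 7);
//
// Taps 1 and 4 are stored as magnitudes and applied with mls; the sum is
// accumulated in two halves and merged with sqadd so that the 16-bit
// intermediates saturate instead of wrapping.  The _h4/_v4 variants drop
// the two outer taps.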
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2, s3, s4, s5
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             \s3\().8h, \s3\().8h, v0.h[3]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             \s3\().8h, \s4\().8h, v0.h[4]
        mla             \s2\().8h, \s0\().8h, v0.h[0]
        mla             \s3\().8h, \s5\().8h, v0.h[5]
        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
        sqrshrun        \d0\().8b, \s3\().8h, #7
.endm

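// The _y2 variants produce two output rows per invocation: of the seven
// (or five) input rows, all but the first and last contribute to both
// outputs, so each shared row is widened with uxtl only once.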
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h   , \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h   , \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h   , \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h   , \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm

.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b
        mul             v21.8h,     \s1\().8h, v0.h[2]
        mul             v23.8h,     \s2\().8h, v0.h[3]
        mul             \s2\().8h,  \s2\().8h, v0.h[2]
        mul             v22.8h,     \s3\().8h, v0.h[3]
        mls             v21.8h,     \s0\().8h, v0.h[1]
        mls             v23.8h,     \s3\().8h, v0.h[4]
        mls             \s2\().8h,  \s1\().8h, v0.h[1]
        mls             v22.8h,     \s4\().8h, v0.h[4]
        sqadd           v21.8h,     v21.8h,    v23.8h
        sqadd           \s2\().8h,  \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm


// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

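// Filter selection in the functions below: mx/my take the values 1..7
// (0 means full-pel, handled by the put_vp8_pixels functions), and each
// table row is 8 halfwords = 16 bytes, so
//
//   filter = subpel_filters + (mx - 1) * 16
//
// which is what the movrel ..., -16 bias plus the "add ..., lsl #4" compute.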
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17,  subpel_filters, -16
        add             x6,  x17,  x6, lsl #4  // y
        ld1             {v0.8h},     [x6]
1:
        ld1             {v1.1d - v2.1d},    [x2], x3
        ld1             {v3.1d - v4.1d},    [x2], x3
        ld1             {v16.1d - v17.1d},  [x2], x3
        ld1             {v18.1d - v19.1d},  [x2], x3
        ld1             {v20.1d - v21.1d},  [x2], x3
        ld1             {v22.1d - v23.1d},  [x2], x3
        ld1             {v24.1d - v25.1d},  [x2]
        sub             x2,  x2,  x3, lsl #2

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4, x4, #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2
        sxtw            x5,  w5 // x

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        add             x5,  x17,  x5, lsl #4 // x
        ld1             {v0.8h},  [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

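// The two-dimensional (hXvY) functions below run two passes through a
// temporary buffer on the stack: the horizontal pass filters h plus
// (taps-1) extra source rows, and the vertical pass consumes them.  The
// buffer sizes presumably follow from the worst case (an assumption read
// off the constants, not stated in the source):
//
//   16 bytes/row * (16 + 5) rows = 336 for the 16-pixel-wide version,
//    8 bytes/row * (16 + 5) rows = 168 for the 8-pixel-wide versions,
//
// plus 16 bytes so the row pointer can be rounded up to 16-byte alignment
// (add x7, sp, #15; bic x7, x7, #15).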
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5 // x
        add             x16,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4, #5   // h + 5 rows for the 6-tap second pass
        bic             x7,  x7,  #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,  #15
        ld1             {v0.8h},     [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b},    [x7], #32
        ld1             {v16.8b - v19.8b},  [x7], #32
        ld1             {v20.8b - v23.8b},  [x7]
        sub             x7,  x7,  #48

        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
        trn1            v2.2d, v5.2d, v2.2d

        st1             {v2.16b}, [x0], x1
        subs            x4, x4, #1
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h + 5 rows for the 6-tap second pass
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #5   // h + 5 rows for the 6-tap second pass
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h + 3 rows for the 4-tap second pass
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h + 3 rows for the 4-tap second pass
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc