2 * VP8 NEON optimisations
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
7 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
9 * This file is part of Libav.
11 * Libav is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
16 * Libav is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with Libav; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 #include "libavutil/aarch64/asm.S"
29 function ff_vp8_luma_dc_wht_neon, export=1
30 ld1 {v0.4h - v3.4h}, [x1]
33 add v4.4h, v0.4h, v3.4h
34 add v6.4h, v1.4h, v2.4h
35 st1 {v30.8h}, [x1], #16
36 sub v7.4h, v1.4h, v2.4h
37 sub v5.4h, v0.4h, v3.4h
39 add v0.4h, v4.4h, v6.4h
40 add v1.4h, v5.4h, v7.4h
41 sub v2.4h, v4.4h, v6.4h
42 sub v3.4h, v5.4h, v7.4h
46 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
48 add v0.4h, v0.4h, v16.4h
50 add v4.4h, v0.4h, v3.4h
51 add v6.4h, v1.4h, v2.4h
52 sub v7.4h, v1.4h, v2.4h
53 sub v5.4h, v0.4h, v3.4h
54 add v0.4h, v4.4h, v6.4h
55 add v1.4h, v5.4h, v7.4h
56 sub v2.4h, v4.4h, v6.4h
57 sub v3.4h, v5.4h, v7.4h
65 st1 {v0.h}[0], [x0], x3
66 st1 {v1.h}[0], [x0], x3
67 st1 {v2.h}[0], [x0], x3
68 st1 {v3.h}[0], [x0], x3
69 st1 {v0.h}[1], [x0], x3
70 st1 {v1.h}[1], [x0], x3
71 st1 {v2.h}[1], [x0], x3
72 st1 {v3.h}[1], [x0], x3
73 st1 {v0.h}[2], [x0], x3
74 st1 {v1.h}[2], [x0], x3
75 st1 {v2.h}[2], [x0], x3
76 st1 {v3.h}[2], [x0], x3
77 st1 {v0.h}[3], [x0], x3
78 st1 {v1.h}[3], [x0], x3
79 st1 {v2.h}[3], [x0], x3
80 st1 {v3.h}[3], [x0], x3
// void ff_vp8_idct_add(uint8_t *dst /* x0 */, int16_t block[16] /* x1 */,
//                      ptrdiff_t stride /* x2 */)
// 4x4 inverse DCT of one block, result added to dst with saturation.
// Uses the VP8 constants 20091 (cos) and 35468 (sin) via v4.h[0]/v4.h[1].
// NOTE(review): lines appear elided — the movz pairing this movk, the mov of
// the constants into v4, and the epilogue/endfunc are not visible here.
85 function ff_vp8_idct_add_neon, export=1
86 ld1 {v0.8b - v3.8b}, [x1]
// NOTE(review): movk with no visible preceding movz — the low half
// (20091) is presumably set on an elided line; confirm upstream.
88 movk w4, #35468/2, lsl #16
// first (vertical) pass on columns: v21/v22 = row*20091>>16 + row,
// v20/v23 = row*35468>>16 (via sqdmulh)
91 smull v26.4s, v1.4h, v4.h[0]
92 smull v27.4s, v3.4h, v4.h[0]
93 sqdmulh v20.4h, v1.4h, v4.h[1]
94 sqdmulh v23.4h, v3.4h, v4.h[1]
95 sqshrn v21.4h, v26.4s, #16
96 sqshrn v22.4h, v27.4s, #16
97 add v21.4h, v21.4h, v1.4h
98 add v22.4h, v22.4h, v3.4h
100 add v16.4h, v0.4h, v2.4h
101 sub v17.4h, v0.4h, v2.4h
103 add v18.4h, v21.4h, v23.4h
104 sub v19.4h, v20.4h, v22.4h
106 add v0.4h, v16.4h, v18.4h
107 add v1.4h, v17.4h, v19.4h
108 sub v3.4h, v16.4h, v18.4h
109 sub v2.4h, v17.4h, v19.4h
// switch to rows for the second pass
111 transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7
// second (horizontal) pass; v29 store presumably clears the coefficient
// buffer (v29 has no visible definition here — confirm upstream)
114 smull v26.4s, v1.4h, v4.h[0]
115 st1 {v29.8h}, [x1], #16
116 smull v27.4s, v3.4h, v4.h[0]
118 sqdmulh v21.4h, v1.4h, v4.h[1]
119 sqdmulh v23.4h, v3.4h, v4.h[1]
120 sqshrn v20.4h, v26.4s, #16
121 sqshrn v22.4h, v27.4s, #16
122 add v20.4h, v20.4h, v1.4h
123 add v22.4h, v22.4h, v3.4h
124 add v16.4h, v0.4h, v2.4h
125 sub v17.4h, v0.4h, v2.4h
// interleave the destination loads with the final butterfly
127 add v18.4h, v20.4h, v23.4h
128 ld1 {v24.d}[0], [x0], x2
129 zip1 v16.2d, v16.2d, v17.2d
130 sub v19.4h, v21.4h, v22.4h
131 ld1 {v25.d}[0], [x0], x2
132 zip1 v18.2d, v18.2d, v19.2d
133 add v0.8h, v16.8h, v18.8h
134 ld1 {v25.d}[1], [x0], x2
135 sub v1.8h, v16.8h, v18.8h
136 ld1 {v24.d}[1], [x0], x2
137 srshr v0.8h, v0.8h, #3 // (x + 4) >> 3 rounding
138 trn1 v24.4s, v24.4s, v25.4s
139 srshr v1.8h, v1.8h, #3
140 sub x0, x0, x2, lsl #2 // rewind dst 4 rows
// reorder the transform output rows to match the dst row order
142 ext v1.16b, v1.16b, v1.16b, #8
143 trn1 v3.2d, v0.2d, v1.2d
144 trn2 v0.2d, v0.2d, v1.2d
145 trn1 v1.8h, v3.8h, v0.8h
146 trn2 v3.8h, v3.8h, v0.8h
147 uzp1 v0.4s, v1.4s, v3.4s
148 uzp2 v1.4s, v3.4s, v1.4s
// add residual to pixels, saturate back to u8
150 uaddw v0.8h, v0.8h, v24.8b
151 uaddw2 v1.8h, v1.8h, v24.16b
153 sqxtun2 v0.16b, v1.8h
// store rows; note the [3]/[2] lane order compensates the trn shuffles
154 st1 {v0.s}[0], [x0], x2
155 st1 {v0.s}[1], [x0], x2
156 st1 {v0.s}[3], [x0], x2
157 st1 {v0.s}[2], [x0], x2
// void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16],
//                            ptrdiff_t stride)
// DC-only IDCT for four 4x4 chroma blocks: each block's DC is rounded
// (dc >> 3), broadcast, added to the 8x8 pixel area, and the DC slots in
// the coefficient buffer are cleared.
// NOTE(review): elided lines are apparent — v16..v19 are combined below but
// their DC loads are not visible, the cleared-DC stores use x1/x3 whose
// setup is missing, and several sqxtun results (v21/v23/v25/v27) are stored
// without a visible narrowing instruction. Verify against upstream.
162 function ff_vp8_idct_dc_add4uv_neon, export=1
// clear the DC coefficient of each of the 4 blocks (v0 presumed zero —
// its definition is on an elided line)
166 st1 {v0.h}[0], [x1], x3
168 st1 {v0.h}[0], [x1], x3
170 st1 {v0.h}[0], [x1], x3
172 st1 {v0.h}[0], [x1], x3
// pack the four DC values pairwise: v16 = blocks 0|1, v18 = blocks 2|3
173 ins v16.d[1], v17.d[0]
174 ins v18.d[1], v19.d[0]
176 srshr v16.8h, v16.8h, #3 // dc >>= 3
177 ld1 {v0.8b}, [x0], x2
178 srshr v18.8h, v18.8h, #3
179 ld1 {v1.8b}, [x0], x2
// widen each pixel row and add the rounded DC
180 uaddw v20.8h, v16.8h, v0.8b
181 ld1 {v2.8b}, [x0], x2
182 uaddw v0.8h, v16.8h, v1.8b
183 ld1 {v3.8b}, [x0], x2
184 uaddw v22.8h, v16.8h, v2.8b
185 ld1 {v4.8b}, [x0], x2
186 uaddw v2.8h, v16.8h, v3.8b
187 ld1 {v5.8b}, [x0], x2
188 uaddw v24.8h, v18.8h, v4.8b
189 ld1 {v6.8b}, [x0], x2
190 uaddw v4.8h, v18.8h, v5.8b
191 ld1 {v7.8b}, [x0], x2
192 uaddw v26.8h, v18.8h, v6.8b
193 sqxtun v20.8b, v20.8h
194 uaddw v6.8h, v18.8h, v7.8b
// saturate back to u8 and store the 8 rows (dst base in x3 — its setup is
// on an elided line; confirm upstream)
196 sqxtun v22.8b, v22.8h
197 st1 {v20.8b}, [x3], x2
199 st1 {v21.8b}, [x3], x2
200 sqxtun v24.8b, v24.8h
201 st1 {v22.8b}, [x3], x2
203 st1 {v23.8b}, [x3], x2
204 sqxtun v26.8b, v26.8h
205 st1 {v24.8b}, [x3], x2
207 st1 {v25.8b}, [x3], x2
208 st1 {v26.8b}, [x3], x2
209 st1 {v27.8b}, [x3], x2
// void ff_vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16],
//                           ptrdiff_t stride)
// DC-only IDCT for four horizontally adjacent 4x4 luma blocks (a 16x4
// strip): round each DC, broadcast across its block, add to the pixels.
// NOTE(review): the loads of the four DCs into v16..v19 and the x1/x3
// setup for the clearing stores are on elided lines — verify upstream.
214 function ff_vp8_idct_dc_add4y_neon, export=1
// clear each block's DC coefficient in the source buffer
218 st1 {v0.h}[0], [x1], x3
220 st1 {v0.h}[0], [x1], x3
// v16 = DCs of blocks 0|1, v18 = DCs of blocks 2|3
221 zip1 v16.2d, v16.2d, v17.2d
223 st1 {v0.h}[0], [x1], x3
225 st1 {v0.h}[0], [x1], x3
226 zip1 v18.2d, v18.2d, v19.2d
227 srshr v16.8h, v16.8h, #3 // dc >>= 3
228 ld1 {v0.16b}, [x0], x2
229 srshr v18.8h, v18.8h, #3
230 ld1 {v1.16b}, [x0], x2
// add DC to all 4 rows: low 8 pixels get v16, high 8 get v18
231 uaddw v20.8h, v16.8h, v0.8b
232 ld1 {v2.16b}, [x0], x2
233 uaddw2 v0.8h, v18.8h, v0.16b
234 ld1 {v3.16b}, [x0], x2
235 uaddw v21.8h, v16.8h, v1.8b
236 uaddw2 v1.8h, v18.8h, v1.16b
237 uaddw v22.8h, v16.8h, v2.8b
238 uaddw2 v2.8h, v18.8h, v2.16b
239 uaddw v23.8h, v16.8h, v3.8b
240 uaddw2 v3.8h, v18.8h, v3.16b
241 sub x0, x0, x2, lsl #2 // rewind dst 4 rows
// saturate to u8 and write back the 16x4 strip
242 sqxtun v20.8b, v20.8h
243 sqxtun2 v20.16b, v0.8h
244 sqxtun v21.8b, v21.8h
245 sqxtun2 v21.16b, v1.8h
246 sqxtun v22.8b, v22.8h
247 st1 {v20.16b}, [x0], x2
248 sqxtun2 v22.16b, v2.8h
249 st1 {v21.16b}, [x0], x2
250 sqxtun v23.8b, v23.8h
251 st1 {v22.16b}, [x0], x2
252 sqxtun2 v23.16b, v3.8h
253 st1 {v23.16b}, [x0], x2
// void ff_vp8_idct_dc_add(uint8_t *dst /* x0 */, int16_t block[16] /* x1 */,
//                         ptrdiff_t stride /* x2 */)
// DC-only IDCT for a single 4x4 block: broadcast (dc+4)>>3 and add it to
// the 4x4 pixel area with unsigned saturation.
// NOTE(review): the DC load/broadcast into v2, the DC clear, the sqxtun
// narrowing of v3/v4 into v0/v1, and endfunc are on elided lines.
258 function ff_vp8_idct_dc_add_neon, export=1
262 srshr v2.8h, v2.8h, #3 // dc = (dc + 4) >> 3
// load 4 rows of 4 pixels, two rows per 64-bit lane
263 ld1 {v0.s}[0], [x0], x2
264 ld1 {v0.s}[1], [x0], x2
265 uaddw v3.8h, v2.8h, v0.8b
266 ld1 {v1.s}[0], [x0], x2
267 ld1 {v1.s}[1], [x0], x2
268 uaddw v4.8h, v2.8h, v1.8b
271 sub x0, x0, x2, lsl #2 // rewind dst 4 rows
272 st1 {v0.s}[0], [x0], x2
273 st1 {v0.s}[1], [x0], x2
274 st1 {v1.s}[0], [x0], x2
275 st1 {v1.s}[1], [x0], x2
// Core VP8 loop-filter kernel operating on 16 pixels at a time.
// In:  v0..v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3 (one register per pixel row/col),
//      v22 = flim_E (edge limit), v23 = flim_I (interior limit),
//      \hev_thresh = high-edge-variance threshold (GPR).
// Out: filtered P2..Q2 in v1..v6 (subset depending on inner/simple mode).
// NOTE(review): this listing is missing lines — the `.if !\inner && !\simple`
// at "353" has no visible matching .else/.endif, the simple-filter prologue,
// the v21=0x80 / v20=3 / v22=+4 / v23=+3 constant setup, and the 63-bias
// load into v17.8h are all elided. The structure below should be read as
// fragments of the upstream macro, not a complete expansion.
285 .macro vp8_loop_filter, inner=0, simple=0, hev_thresh
// simple-filter limit check: |P0-Q0|*2 + |P1-Q1|/2 <= flim
287 uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
288 uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
289 uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
290 ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
291 uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
293 cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
295 // calculate hev and normal_limit:
296 uabd v20.16b, v2.16b, v3.16b // abs(P1-P0)
297 uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0)
298 uabd v18.16b, v0.16b, v1.16b // abs(P3-P2)
299 uabd v19.16b, v1.16b, v2.16b // abs(P2-P1)
300 cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I
301 cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I
302 cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I
303 cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I
// v16 accumulates the AND of all interior-limit comparisons
304 and v16.16b, v17.16b, v16.16b
305 uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2)
306 and v16.16b, v16.16b, v19.16b
307 uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1)
308 and v16.16b, v16.16b, v18.16b
309 cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I
310 cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I
311 uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
312 uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
313 and v16.16b, v16.16b, v18.16b
314 uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
315 and v16.16b, v16.16b, v19.16b
316 ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
317 dup v23.16b, \hev_thresh // hev_thresh
318 uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
319 cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh
320 cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
321 cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh
322 and v16.16b, v16.16b, v19.16b // v16 = normal filter mask
324 orr v17.16b, v20.16b, v22.16b // v17 = hev mask
// (v21 = 0x80 sign-flip constant — its setup is on an elided line)
331 // convert to signed value:
332 eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
333 eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
// (v20.8h = 3 multiplier — setup elided)
336 ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
337 ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
338 eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
339 eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
340 mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
341 mul v19.8h, v19.8h, v20.8h
343 sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
347 and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
349 saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
350 saddw2 v19.8h, v19.8h, v20.16b
351 sqxtn v18.8b, v18.8h // narrow result back into v18
352 sqxtn2 v18.16b, v19.8h
// NOTE(review): matching .else/.endif for this .if is not visible below —
// the three code paths (normal outer, inner, 6-tap "mbfilter") that follow
// are alternative macro arms in the upstream file.
353 .if !\inner && !\simple
354 eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
355 eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
357 and v18.16b, v18.16b, v16.16b // w &= normal_limit
359 // registers used at this point..
360 // v0 -> P3 (don't corrupt)
362 // v7 -> Q3 (don't corrupt)
368 // v16, v19, v29 -> unused
370 // filter_common: is4tap==1
371 // c1 = clamp(w + 4) >> 3;
372 // c2 = clamp(w + 3) >> 3;
373 // Q0 = s2u(QS0 - c1);
374 // P0 = s2u(PS0 + c2);
// (v22 = +4, v23 = +3 constants — setup elided)
377 sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
378 sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
379 sshr v19.16b, v19.16b, #3 // c1 >>= 3
380 sshr v20.16b, v20.16b, #3 // c2 >>= 3
381 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
382 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
383 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
384 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
385 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
386 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
388 // the !is4tap case of filter_common, only used for inner blocks
389 // c3 = ((c1&~hev) + 1) >> 1;
390 // Q1 = s2u(QS1 - c3);
391 // P1 = s2u(PS1 + c3);
392 sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
393 sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
394 sshr v19.16b, v19.16b, #3 // c1 >>= 3
395 sshr v20.16b, v20.16b, #3 // c2 >>= 3
396 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
397 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
398 bic v19.16b, v19.16b, v17.16b // c1 & ~hev
399 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
400 srshr v19.16b, v19.16b, #1 // c3 >>= 1
401 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
402 sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
403 sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
404 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
405 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
// outer macroblock edge: apply the 4-tap part only where hev is set,
// then the wide 6-tap smoothing below where it is not
407 and v20.16b, v18.16b, v17.16b // w & hev
408 sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
409 sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
410 sshr v19.16b, v19.16b, #3 // c1 >>= 3
411 sshr v20.16b, v20.16b, #3 // c2 >>= 3
412 bic v18.16b, v18.16b, v17.16b // w &= ~hev
413 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
414 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
417 // a = clamp((27*w + 63) >> 7);
418 // Q0 = s2u(QS0 - a);
419 // P0 = s2u(PS0 + a);
420 // a = clamp((18*w + 63) >> 7);
421 // Q1 = s2u(QS1 - a);
422 // P1 = s2u(PS1 + a);
423 // a = clamp((9*w + 63) >> 7);
424 // Q2 = s2u(QS2 - a);
425 // P2 = s2u(PS2 + a);
// build 9*w, 18*w, 27*w incrementally from 8*w + w
// (v17.8h = 63 bias — its setup is on an elided line)
427 sshll v22.8h, v18.8b, #3
428 sshll2 v23.8h, v18.16b, #3
429 saddw v22.8h, v22.8h, v18.8b
430 saddw2 v23.8h, v23.8h, v18.16b
431 add v16.8h, v17.8h, v22.8h
432 add v17.8h, v17.8h, v23.8h // 9*w + 63
433 add v19.8h, v16.8h, v22.8h
434 add v20.8h, v17.8h, v23.8h // 18*w + 63
435 add v22.8h, v19.8h, v22.8h
436 add v23.8h, v20.8h, v23.8h // 27*w + 63
437 sqshrn v16.8b, v16.8h, #7
438 sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
439 sqshrn v19.8b, v19.8h, #7
440 sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
441 sqshrn v22.8b, v22.8h, #7
442 sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
443 sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
444 sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
445 sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
446 sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
447 sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
448 sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
449 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
450 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
451 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
452 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
453 eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
454 eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
// Vertical (horizontal-edge) loop filter across a 16-pixel-wide edge.
// Generates ff_vp8_v_loop_filter16{,_inner,_simple}_neon:
//   x0 = dst (points at the edge), x1 = stride, w2 = flim_E, w3 = flim_I,
//   w4 = hev_thresh (unused by the simple filter).
// Loads P3..Q3 as whole rows, runs vp8_loop_filter, writes back P2..Q2.
// NOTE(review): the .endm/endfunc lines and the simple-filter conditional
// load/store variants are elided in this listing.
458 .macro vp8_v_loop_filter16 name, inner=0, simple=0
459 function ff_vp8_v_loop_filter16\name\()_neon, export=1
460 sub x0, x0, x1, lsl #1+!\simple // back up 2 (simple) or 4 rows
464 ld1 {v0.16b}, [x0], x1 // P3
465 ld1 {v1.16b}, [x0], x1 // P2
467 ld1 {v2.16b}, [x0], x1 // P1
468 ld1 {v3.16b}, [x0], x1 // P0
469 ld1 {v4.16b}, [x0], x1 // Q0
470 ld1 {v5.16b}, [x0], x1 // Q1
472 ld1 {v6.16b}, [x0], x1 // Q2
473 ld1 {v7.16b}, [x0] // Q3
474 dup v23.16b, w3 // flim_I
476 dup v22.16b, w2 // flim_E
478 vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
480 // back up to P2: dst -= stride * 6
481 sub x0, x0, x1, lsl #2
483 sub x0, x0, x1, lsl #1
486 st1 {v1.16b}, [x0], x1 // P2
488 st1 {v2.16b}, [x0], x1 // P1
489 st1 {v3.16b}, [x0], x1 // P0
490 st1 {v4.16b}, [x0], x1 // Q0
491 st1 {v5.16b}, [x0], x1 // Q1
493 st1 {v6.16b}, [x0] // Q2
// instantiate the three filter flavours (plain instantiation elided here)
501 vp8_v_loop_filter16 _inner, inner=1
502 vp8_v_loop_filter16 _simple, simple=1
// Vertical (horizontal-edge) loop filter for the two 8-pixel-wide chroma
// planes, processed together (U rows in the low d-lane, V in the high one).
// Generates ff_vp8_v_loop_filter8uv{,_inner}_neon:
//   x0 = u dst, x1 = v dst, x2 = stride, w3 = flim_E, w4 = flim_I,
//   w5 = hev_thresh.
// NOTE(review): .endm/endfunc and the base instantiation line are elided.
504 .macro vp8_v_loop_filter8uv name, inner=0
505 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
506 sub x0, x0, x2, lsl #2 // back up 4 rows in each plane
507 sub x1, x1, x2, lsl #2
// interleave U (lane 0) and V (lane 1) rows into one 16-byte register each
509 ld1 {v0.d}[0], [x0], x2 // P3
510 ld1 {v0.d}[1], [x1], x2 // P3
511 ld1 {v1.d}[0], [x0], x2 // P2
512 ld1 {v1.d}[1], [x1], x2 // P2
513 ld1 {v2.d}[0], [x0], x2 // P1
514 ld1 {v2.d}[1], [x1], x2 // P1
515 ld1 {v3.d}[0], [x0], x2 // P0
516 ld1 {v3.d}[1], [x1], x2 // P0
517 ld1 {v4.d}[0], [x0], x2 // Q0
518 ld1 {v4.d}[1], [x1], x2 // Q0
519 ld1 {v5.d}[0], [x0], x2 // Q1
520 ld1 {v5.d}[1], [x1], x2 // Q1
521 ld1 {v6.d}[0], [x0], x2 // Q2
522 ld1 {v6.d}[1], [x1], x2 // Q2
523 ld1 {v7.d}[0], [x0] // Q3
524 ld1 {v7.d}[1], [x1] // Q3
526 dup v22.16b, w3 // flim_E
527 dup v23.16b, w4 // flim_I
529 vp8_loop_filter inner=\inner, hev_thresh=w5
531 // back up to P2: u,v -= stride * 6
532 sub x0, x0, x2, lsl #2
533 sub x1, x1, x2, lsl #2
534 sub x0, x0, x2, lsl #1
535 sub x1, x1, x2, lsl #1
// de-interleave on store: lane 0 back to U, lane 1 back to V
539 st1 {v1.d}[0], [x0], x2 // P2
540 st1 {v1.d}[1], [x1], x2 // P2
541 st1 {v2.d}[0], [x0], x2 // P1
542 st1 {v2.d}[1], [x1], x2 // P1
543 st1 {v3.d}[0], [x0], x2 // P0
544 st1 {v3.d}[1], [x1], x2 // P0
545 st1 {v4.d}[0], [x0], x2 // Q0
546 st1 {v4.d}[1], [x1], x2 // Q0
547 st1 {v5.d}[0], [x0], x2 // Q1
548 st1 {v5.d}[1], [x1], x2 // Q1
549 st1 {v6.d}[0], [x0] // Q2
550 st1 {v6.d}[1], [x1] // Q2
557 vp8_v_loop_filter8uv _inner, inner=1
// Horizontal (vertical-edge) loop filter across a 16-pixel-tall edge.
// Generates ff_vp8_h_loop_filter16{,_inner,_simple}_neon:
//   x0 = dst, x1 = stride, w2 = flim_E, w3 = flim_I, w4 = hev_thresh.
// Loads 16 rows of 8 bytes straddling the edge, transposes so that each
// register holds one column (P3..Q3), filters, transposes back, stores.
// NOTE(review): the initial `sub x0, x0, #4` (or similar) positioning, the
// final row store, and .endm/endfunc are on elided lines.
559 .macro vp8_h_loop_filter16 name, inner=0, simple=0
560 function ff_vp8_h_loop_filter16\name\()_neon, export=1
// gather 16 rows x 8 pixels; rows 0-7 in d-lane 0, rows 8-15 in lane 1
564 ld1 {v0.d}[0], [x0], x1
565 ld1 {v1.d}[0], [x0], x1
566 ld1 {v2.d}[0], [x0], x1
567 ld1 {v3.d}[0], [x0], x1
568 ld1 {v4.d}[0], [x0], x1
569 ld1 {v5.d}[0], [x0], x1
570 ld1 {v6.d}[0], [x0], x1
571 ld1 {v7.d}[0], [x0], x1
572 ld1 {v0.d}[1], [x0], x1
573 ld1 {v1.d}[1], [x0], x1
574 ld1 {v2.d}[1], [x0], x1
575 ld1 {v3.d}[1], [x0], x1
576 ld1 {v4.d}[1], [x0], x1
577 ld1 {v5.d}[1], [x0], x1
578 ld1 {v6.d}[1], [x0], x1
579 ld1 {v7.d}[1], [x0], x1
// rows -> columns: v0..v7 become P3..Q3
581 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
583 dup v22.16b, w2 // flim_E
585 dup v23.16b, w3 // flim_I
588 vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
590 sub x0, x0, x1, lsl #4 // backup 16 rows
// columns -> rows before writing back
592 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
595 st1 {v0.d}[0], [x0], x1
596 st1 {v1.d}[0], [x0], x1
597 st1 {v2.d}[0], [x0], x1
598 st1 {v3.d}[0], [x0], x1
599 st1 {v4.d}[0], [x0], x1
600 st1 {v5.d}[0], [x0], x1
601 st1 {v6.d}[0], [x0], x1
602 st1 {v7.d}[0], [x0], x1
603 st1 {v0.d}[1], [x0], x1
604 st1 {v1.d}[1], [x0], x1
605 st1 {v2.d}[1], [x0], x1
606 st1 {v3.d}[1], [x0], x1
607 st1 {v4.d}[1], [x0], x1
608 st1 {v5.d}[1], [x0], x1
609 st1 {v6.d}[1], [x0], x1
617 vp8_h_loop_filter16 _inner, inner=1
618 vp8_h_loop_filter16 _simple, simple=1
// Horizontal (vertical-edge) loop filter for the chroma planes, U and V
// processed together (U rows in d-lane 0, V rows in d-lane 1).
// Generates ff_vp8_h_loop_filter8uv{,_inner}_neon:
//   x0 = u dst, x1 = v dst, x2 = stride, w3 = flim_E, w4 = flim_I,
//   w5 = hev_thresh.
// NOTE(review): the initial pointer adjustment, the final row store,
// .endm/endfunc and the base instantiation are on elided lines.
620 .macro vp8_h_loop_filter8uv name, inner=0
621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
626 ld1 {v0.d}[0], [x0], x2 // load u
627 ld1 {v0.d}[1], [x1], x2 // load v
628 ld1 {v1.d}[0], [x0], x2
629 ld1 {v1.d}[1], [x1], x2
630 ld1 {v2.d}[0], [x0], x2
631 ld1 {v2.d}[1], [x1], x2
632 ld1 {v3.d}[0], [x0], x2
633 ld1 {v3.d}[1], [x1], x2
634 ld1 {v4.d}[0], [x0], x2
635 ld1 {v4.d}[1], [x1], x2
636 ld1 {v5.d}[0], [x0], x2
637 ld1 {v5.d}[1], [x1], x2
638 ld1 {v6.d}[0], [x0], x2
639 ld1 {v6.d}[1], [x1], x2
640 ld1 {v7.d}[0], [x0], x2
641 ld1 {v7.d}[1], [x1], x2
// rows -> columns: v0..v7 become P3..Q3
643 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
645 dup v22.16b, w3 // flim_E
646 dup v23.16b, w4 // flim_I
648 vp8_loop_filter inner=\inner, hev_thresh=w5
650 sub x0, x0, x2, lsl #3 // backup u 8 rows
651 sub x1, x1, x2, lsl #3 // backup v 8 rows
// columns -> rows before writing back
653 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
656 st1 {v0.d}[0], [x0], x2 // store u
657 st1 {v0.d}[1], [x1], x2 // store v
658 st1 {v1.d}[0], [x0], x2
659 st1 {v1.d}[1], [x1], x2
660 st1 {v2.d}[0], [x0], x2
661 st1 {v2.d}[1], [x1], x2
662 st1 {v3.d}[0], [x0], x2
663 st1 {v3.d}[1], [x1], x2
664 st1 {v4.d}[0], [x0], x2
665 st1 {v4.d}[1], [x1], x2
666 st1 {v5.d}[0], [x0], x2
667 st1 {v5.d}[1], [x1], x2
668 st1 {v6.d}[0], [x0], x2
669 st1 {v6.d}[1], [x1], x2
679 vp8_h_loop_filter8uv _inner, inner=1
// void ff_put_vp8_pixels16(uint8_t *dst /* x0 */, ptrdiff_t dststride /* x1 */,
//                          const uint8_t *src /* x2 */, ptrdiff_t srcstride /* x3 */,
//                          int h, int x, int y)
// Plain 16xH copy, 4 rows per iteration.
// NOTE(review): the loop label / counter decrement / branch and endfunc
// are on elided lines.
682 function ff_put_vp8_pixels16_neon, export=1
685 ld1 {v0.16b}, [x2], x3
686 ld1 {v1.16b}, [x2], x3
687 ld1 {v2.16b}, [x2], x3
688 ld1 {v3.16b}, [x2], x3
689 st1 {v0.16b}, [x0], x1
690 st1 {v1.16b}, [x0], x1
691 st1 {v2.16b}, [x0], x1
692 st1 {v3.16b}, [x0], x1
// void ff_put_vp8_pixels8(uint8_t *dst /* x0 */, ptrdiff_t dststride /* x1 */,
//                         const uint8_t *src /* x2 */, ptrdiff_t srcstride /* x3 */,
//                         int h, int x, int y)
// Plain 8xH copy, 4 rows per iteration (two rows packed per q-register).
// NOTE(review): loop label / branch / endfunc are on elided lines.
697 function ff_put_vp8_pixels8_neon, export=1
700 ld1 {v0.8b}, [x2], x3
701 ld1 {v0.d}[1], [x2], x3
702 ld1 {v1.8b}, [x2], x3
703 ld1 {v1.d}[1], [x2], x3
704 st1 {v0.8b}, [x0], x1
705 st1 {v0.d}[1], [x0], x1
706 st1 {v1.8b}, [x0], x1
707 st1 {v1.d}[1], [x0], x1
712 /* 4/6-tap 8th-pel MC */
// 6-tap horizontal filter over 8 output pixels.
// In:  \s0,\s1 = 16 consecutive source bytes; v0.8h = filter coefficients
//      (subpel_filters row). Out: \d = 8 filtered, rounded u8 pixels.
// Clobbers v18..v26 (scratch).
// NOTE(review): the uxtl widenings of v19/v21/v22/v25/v26 from the ext
// results are not visible here — elided lines. .endm also elided.
714 .macro vp8_epel8_h6 d, s0, s1
715 ext v22.8b, \s0\().8b, \s1\().8b, #1
716 uxtl v18.8h, \s0\().8b
717 ext v23.8b, \s0\().8b, \s1\().8b, #2
719 ext v24.8b, \s0\().8b, \s1\().8b, #3
721 ext v25.8b, \s0\().8b, \s1\().8b, #4
723 ext v26.8b, \s0\().8b, \s1\().8b, #5
// accumulate taps: +c2, +c3, -c1, -c4, +c0, +c5, then saturating combine
725 mul v21.8h, v21.8h, v0.h[2]
727 mul v22.8h, v22.8h, v0.h[3]
728 mls v21.8h, v19.8h, v0.h[1]
729 mls v22.8h, v25.8h, v0.h[4]
730 mla v21.8h, v18.8h, v0.h[0]
731 mla v22.8h, v26.8h, v0.h[5]
732 sqadd v22.8h, v21.8h, v22.8h
733 sqrshrun \d\().8b, v22.8h, #7 // round and narrow to u8
// 6-tap horizontal filter over 16 output pixels (low half via uxtl,
// high half via uxtl2 of the shifted ext windows).
// In:  \v0,\v1 = 32 consecutive source bytes; v0.8h = filter coefficients.
// Out: \d0 = 16 filtered, rounded u8 pixels.
// Clobbers v1..v3, v16..v23 (scratch).
// NOTE(review): several uxtl widenings (v17..v21 low halves) are on elided
// lines, as is .endm.
736 .macro vp8_epel16_h6 d0, v0, v1
737 ext v22.16b, \v0\().16b, \v1\().16b, #3
738 ext v23.16b, \v0\().16b, \v1\().16b, #4
740 uxtl2 v22.8h, v22.16b
741 ext v3.16b, \v0\().16b, \v1\().16b, #2
743 uxtl2 v23.8h, v23.16b
744 ext v16.16b, \v0\().16b, \v1\().16b, #1
747 ext v2.16b, \v0\().16b, \v1\().16b, #5
751 uxtl2 v16.8h, v16.16b
// accumulate: v18/v19 = low 8 pixels, v3/v22 = high 8 pixels
752 mul v19.8h, v19.8h, v0.h[3]
753 mul v18.8h, v18.8h, v0.h[2]
754 mul v3.8h, v3.8h, v0.h[2]
755 mul v22.8h, v22.8h, v0.h[3]
756 mls v19.8h, v20.8h, v0.h[4]
757 uxtl v20.8h, \v0\().8b
758 uxtl2 v1.8h, \v0\().16b
759 mls v18.8h, v17.8h, v0.h[1]
760 mls v3.8h, v16.8h, v0.h[1]
761 mls v22.8h, v23.8h, v0.h[4]
762 mla v18.8h, v20.8h, v0.h[0]
763 mla v19.8h, v21.8h, v0.h[5]
764 mla v3.8h, v1.8h, v0.h[0]
765 mla v22.8h, v2.8h, v0.h[5]
766 sqadd v19.8h, v18.8h, v19.8h
767 sqadd v22.8h, v3.8h, v22.8h
768 sqrshrun \d0\().8b, v19.8h, #7 // low 8 pixels
769 sqrshrun2 \d0\().16b, v22.8h, #7 // high 8 pixels
// 6-tap vertical filter producing TWO output rows at once.
// In:  \s0..\s6 = 7 consecutive source rows (8 u8 pixels each);
//      v0.8h = filter coefficients.
// Out: \d0 = filtered row for \s0..\s5, \d1 = filtered row for \s1..\s6.
// Clobbers \s0..\s6 (widened in place) and v31.
// NOTE(review): .endm is on an elided line.
772 .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
// widen all 7 rows to 16 bit
773 uxtl \s0\().8h, \s0\().8b
774 uxtl \s3\().8h, \s3\().8b
775 uxtl \s6\().8h, \s6\().8b
776 uxtl \s1\().8h, \s1\().8b
777 uxtl \s4\().8h, \s4\().8b
778 uxtl \s2\().8h, \s2\().8b
779 uxtl \s5\().8h, \s5\().8b
// row 0 accumulates in \s0/v31, row 1 accumulates in \s3/\s6
780 mul \s0\().8h, \s0\().8h, v0.h[0]
781 mul v31.8h , \s3\().8h, v0.h[3]
782 mul \s3\().8h, \s3\().8h, v0.h[2]
783 mul \s6\().8h, \s6\().8h, v0.h[5]
785 mls \s0\().8h, \s1\().8h, v0.h[1]
786 mls v31.8h , \s4\().8h, v0.h[4]
787 mls \s3\().8h, \s2\().8h, v0.h[1]
788 mls \s6\().8h, \s5\().8h, v0.h[4]
790 mla \s0\().8h, \s2\().8h, v0.h[2]
791 mla v31.8h , \s5\().8h, v0.h[5]
792 mla \s3\().8h, \s1\().8h, v0.h[0]
793 mla \s6\().8h, \s4\().8h, v0.h[3]
// saturating combine, round, narrow to u8
794 sqadd v31.8h , \s0\().8h, v31.8h
795 sqadd \s6\().8h, \s3\().8h, \s6\().8h
796 sqrshrun \d0\().8b, v31.8h, #7
797 sqrshrun \d1\().8b, \s6\().8h, #7
// 4-tap horizontal filter over 8 output pixels.
// In:  \v0,\v1 = consecutive source bytes; v0.8h = filter coefficients
//      (taps 1..4 in v0.h[1]..v0.h[4]).
// Out: \d = 8 filtered, rounded u8 pixels. Clobbers v19..v25.
// NOTE(review): the uxtl widenings of v20/v22/v25 from the ext windows are
// on elided lines, as is .endm.
800 .macro vp8_epel8_h4 d, v0, v1
801 ext v22.8b, \v0\().8b, \v1\().8b, #1
802 uxtl v19.8h, \v0\().8b
803 ext v23.8b, \v0\().8b, \v1\().8b, #2
805 ext v25.8b, \v0\().8b, \v1\().8b, #3
// accumulate: +c2, +c3, -c1, -c4
808 mul v20.8h, v20.8h, v0.h[2]
809 mul v22.8h, v22.8h, v0.h[3]
810 mls v20.8h, v19.8h, v0.h[1]
811 mls v22.8h, v25.8h, v0.h[4]
812 sqadd v22.8h, v20.8h, v22.8h
813 sqrshrun \d\().8b, v22.8h, #7 // round and narrow to u8
// 4-tap vertical filter producing two output rows packed into one
// 16-byte register (row 0 in the low half, row 1 in the high half).
// In:  \s0..\s4 = 5 consecutive source rows (8 u8 each); v0.8h = coeffs.
// Out: \d0.16b = both filtered rows. Clobbers \s0..\s4, v21..v23.
// NOTE(review): .endm is on an elided line.
816 .macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
817 uxtl \s0\().8h, \s0\().8b
818 uxtl \s1\().8h, \s1\().8b
819 uxtl \s2\().8h, \s2\().8b
820 uxtl \s3\().8h, \s3\().8b
821 uxtl \s4\().8h, \s4\().8b
// row 0: taps on s0..s3 (v21/v23); row 1: taps on s1..s4 (\s2/v22)
822 mul v21.8h, \s1\().8h, v0.h[2]
823 mul v23.8h, \s2\().8h, v0.h[3]
824 mul \s2\().8h, \s2\().8h, v0.h[2]
825 mul v22.8h, \s3\().8h, v0.h[3]
826 mls v21.8h, \s0\().8h, v0.h[1]
827 mls v23.8h, \s3\().8h, v0.h[4]
828 mls \s2\().8h, \s1\().8h, v0.h[1]
829 mls v22.8h, \s4\().8h, v0.h[4]
830 sqadd v21.8h, v21.8h, v23.8h
831 sqadd \s2\().8h, \s2\().8h, v22.8h
832 sqrshrun \d0\().8b, v21.8h, #7 // row 0 -> low half
833 sqrshrun2 \d0\().16b, \s2\().8h, #7 // row 1 -> high half
837 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
838 // arithmetic can be used to apply filters
// VP8 6-tap subpel filter coefficient table, one 16-byte row per fractional
// position 1..7 (position 0 = plain copy, handled elsewhere). Functions
// index it as `base + mv*16` with a -16 bias applied at the movrel.
// NOTE(review): the closing endconst is on an elided line.
839 const subpel_filters, align=4
840 .short 0, 6, 123, 12, 1, 0, 0, 0
841 .short 2, 11, 108, 36, 8, 1, 0, 0
842 .short 0, 9, 93, 50, 6, 0, 0, 0
843 .short 3, 16, 77, 77, 16, 3, 0, 0
844 .short 0, 6, 50, 93, 9, 0, 0, 0
845 .short 1, 8, 36, 108, 11, 2, 0, 0
846 .short 0, 1, 12, 123, 6, 0, 0, 0
// void ff_put_vp8_epel16_v6(uint8_t *dst, ptrdiff_t dststride,
//                           const uint8_t *src, ptrdiff_t srcstride,
//                           int h, int x, int my)
// 16-wide, 6-tap vertical-only interpolation; two output rows per pass.
// NOTE(review): the coefficient load from [x6] into v0, the loop label /
// h decrement / branch, and endfunc are on elided lines.
849 function ff_put_vp8_epel16_v6_neon, export=1
850 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride (filter history)
854 movrel x17, subpel_filters, -16
855 add x6, x17, x6, lsl #4 // y
// load 7 source rows, 16 px each split into two d-registers
858 ld1 {v1.1d - v2.1d}, [x2], x3
859 ld1 {v3.1d - v4.1d}, [x2], x3
860 ld1 {v16.1d - v17.1d}, [x2], x3
861 ld1 {v18.1d - v19.1d}, [x2], x3
862 ld1 {v20.1d - v21.1d}, [x2], x3
863 ld1 {v22.1d - v23.1d}, [x2], x3
864 ld1 {v24.1d - v25.1d}, [x2]
865 sub x2, x2, x3, lsl #2 // rewind for the next row pair
// filter left and right 8-px halves, two rows each
867 vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
868 vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
870 st1 {v1.1d - v2.1d}, [x0], x1
871 st1 {v3.1d - v4.1d}, [x0], x1
// void ff_put_vp8_epel16_h6(uint8_t *dst, ptrdiff_t dststride,
//                           const uint8_t *src, ptrdiff_t srcstride,
//                           int h, int mx, int y)
// 16-wide, 6-tap horizontal-only interpolation, one row per iteration.
// NOTE(review): the `sub x2, x2, #2` source adjustment, coefficient load
// into v0, loop control, and endfunc are on elided lines.
878 function ff_put_vp8_epel16_h6_neon, export=1
882 // first pass (horizontal):
883 movrel x17, subpel_filters, -16
884 add x5, x17, x5, lsl #4 // x
887 ld1 {v1.16b, v2.16b}, [x2], x3
888 vp8_epel16_h6 v1, v1, v2
889 st1 {v1.16b}, [x0], x1
// void ff_put_vp8_epel16_h6v6(uint8_t *dst, ptrdiff_t dststride,
//                             const uint8_t *src, ptrdiff_t srcstride,
//                             int h, int mx, int my)
// 16-wide 2D 6-tap interpolation: horizontal pass into a temp buffer on
// the stack (x7), then a vertical pass from the temp into dst.
// NOTE(review): stack allocation for the temp buffer, coefficient loads
// into v0 for each pass, both loop controls, the x7 reset between passes,
// and the epilogue/endfunc are on elided lines.
897 function ff_put_vp8_epel16_h6v6_neon, export=1
898 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
901 // first pass (horizontal):
902 movrel x17, subpel_filters, -16
904 add x16, x17, x5, lsl #4 // x
912 ld1 {v1.16b, v2.16b}, [x2], x3
913 vp8_epel16_h6 v1, v1, v2
914 st1 {v1.16b}, [x7], #16 // to temp buffer
919 // second pass (vertical):
921 add x6, x17, x6, lsl #4 // y
// 7 rows of 16 temp pixels = 14 d-registers
926 ld1 {v1.8b - v4.8b}, [x7], #32
927 ld1 {v16.8b - v19.8b}, [x7], #32
928 ld1 {v20.8b - v23.8b}, [x7], #32
929 ld1 {v24.8b - v25.8b}, [x7]
932 vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
933 vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
// recombine the 8-px halves into full 16-px rows
934 trn1 v1.2d, v1.2d, v2.2d
935 trn1 v3.2d, v3.2d, v4.2d
937 st1 {v1.16b}, [x0], x1
938 st1 {v3.16b}, [x0], x1
// void ff_put_vp8_epel8_v6(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int my)
// 8-wide, 6-tap vertical-only interpolation, two rows per pass.
// NOTE(review): the coefficient load into v0, the 7th source-row load
// (v28 is passed below but not loaded here), loop control, and endfunc
// are on elided lines.
946 function ff_put_vp8_epel8_v6_neon, export=1
947 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
949 movrel x7, subpel_filters, -16
950 add x6, x7, w6, uxtw #4
952 // (6 of the 7 needed rows loaded here; v28 load elided)
953 ld1 {v2.8b}, [x2], x3
954 ld1 {v3.8b}, [x2], x3
955 ld1 {v4.8b}, [x2], x3
956 ld1 {v5.8b}, [x2], x3
957 ld1 {v6.8b}, [x2], x3
958 ld1 {v7.8b}, [x2], x3
961 sub x2, x2, x3, lsl #2 // rewind for next row pair
963 vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
965 st1 {v2.8b}, [x0], x1
966 st1 {v3.8b}, [x0], x1
// void ff_put_vp8_epel8_h6(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int mx, int y)
// 8-wide, 6-tap horizontal-only interpolation, one row per iteration.
// NOTE(review): the `sub x2, x2, #2`, coefficient load into v0, loop
// control, and endfunc are on elided lines.
973 function ff_put_vp8_epel8_h6_neon, export=1
976 movrel x7, subpel_filters, -16
977 add x5, x7, w5, uxtw #4
980 ld1 {v2.8b, v3.8b}, [x2], x3
982 vp8_epel8_h6 v2, v2, v3
984 st1 {v2.8b}, [x0], x1
// void ff_put_vp8_epel8_h6v6(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 8-wide 2D 6-tap interpolation: horizontal pass into a stack temp (x7),
// then a vertical pass producing two rows per iteration.
// NOTE(review): temp-buffer allocation, coefficient loads, loop controls,
// x7 reset, epilogue and endfunc are on elided lines.
991 function ff_put_vp8_epel8_h6v6_neon, export=1
992 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
996 // first pass (horizontal):
997 movrel x17, subpel_filters, -16
999 add x5, x17, x5, lsl #4 // x
1003 add x16, x4, #5 // h (+5 edge rows for the vertical taps)
1006 ld1 {v1.8b, v2.8b}, [x2], x3
1008 vp8_epel8_h6 v1, v1, v2
1010 st1 {v1.8b}, [x7], #8 // to temp buffer
1014 // second pass (vertical):
1016 add x6, x17, x6, lsl #4 // y
// 7 consecutive temp rows
1021 ld1 {v1.8b - v4.8b}, [x7], #32
1022 ld1 {v5.8b - v7.8b}, [x7]
1026 vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
1028 st1 {v1.8b}, [x0], x1
1029 st1 {v2.8b}, [x0], x1
// void ff_put_vp8_epel8_v4(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int my)
// 8-wide, 4-tap vertical-only interpolation, two rows per pass (packed
// into one q-register by vp8_epel8_v4_y2).
// NOTE(review): the `sub x2, x2, srcstride` pre-adjust, the 5th row load
// (v6 is consumed below but not loaded here), coefficient load, loop
// control and endfunc are on elided lines.
1037 function ff_put_vp8_epel8_v4_neon, export=1
1040 movrel x7, subpel_filters, -16
1041 add x6, x7, w6, uxtw #4
1044 ld1 {v2.8b}, [x2], x3
1045 ld1 {v3.8b}, [x2], x3
1046 ld1 {v4.8b}, [x2], x3
1047 ld1 {v5.8b}, [x2], x3
1049 sub x2, x2, x3, lsl #1 // rewind for next row pair
1051 vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
1053 st1 {v2.d}[0], [x0], x1
1054 st1 {v2.d}[1], [x0], x1
// void ff_put_vp8_epel8_h4(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int mx, int y)
// 8-wide, 4-tap horizontal-only interpolation, one row per iteration.
// NOTE(review): source pre-adjust, coefficient load, loop control and
// endfunc are on elided lines.
1061 function ff_put_vp8_epel8_h4_neon, export=1
1064 movrel x7, subpel_filters, -16
1065 add x5, x7, w5, uxtw #4
1068 ld1 {v2.8b,v3.8b}, [x2], x3
1070 vp8_epel8_h4 v2, v2, v3
1072 st1 {v2.8b}, [x0], x1
// void ff_put_vp8_epel8_h4v6(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 8-wide 2D interpolation: 4-tap horizontal pass into a stack temp (x7),
// then a 6-tap vertical pass producing two rows per iteration.
// NOTE(review): temp allocation, coefficient loads, loop controls, x7
// reset, epilogue and endfunc are on elided lines.
1079 function ff_put_vp8_epel8_h4v6_neon, export=1
1080 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
1084 // first pass (horizontal):
1085 movrel x17, subpel_filters, -16
1087 add x5, x17, x5, lsl #4 // x
1091 add x16, x4, #5 // h (+5 edge rows for the 6-tap vertical)
1094 ld1 {v1.8b, v2.8b}, [x2], x3
1096 vp8_epel8_h4 v1, v1, v2
1098 st1 {v1.8b}, [x7], #8 // to temp buffer
1102 // second pass (vertical):
1104 add x6, x17, x6, lsl #4 // y
1109 ld1 {v1.8b - v4.8b}, [x7], #32
1110 ld1 {v5.8b - v7.8b}, [x7]
1114 vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
1116 st1 {v1.8b}, [x0], x1
1117 st1 {v2.8b}, [x0], x1
// void ff_put_vp8_epel8_h4v4(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 8-wide 2D interpolation: 4-tap horizontal into a stack temp (x7),
// then 4-tap vertical, two rows per iteration.
// NOTE(review): src pre-adjust, temp allocation, coefficient loads, loop
// controls, epilogue and endfunc are on elided lines.
1125 function ff_put_vp8_epel8_h4v4_neon, export=1
1131 // first pass (horizontal):
1132 movrel x17, subpel_filters, -16
1134 add x5, x17, x5, lsl #4 // x
1138 add x16, x4, #3 // h (+3 edge rows for the 4-tap vertical)
1141 ld1 {v1.8b, v2.8b}, [x2], x3
1143 vp8_epel8_h4 v1, v1, v2
1145 st1 {v1.8b}, [x7], #8 // to temp buffer
1149 // second pass (vertical):
1151 add x6, x17, x6, lsl #4 // y
1156 ld1 {v1.8b - v2.8b}, [x7], #16
1157 ld1 {v3.8b - v5.8b}, [x7]
1159 vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
1161 st1 {v1.d}[0], [x0], x1
1162 st1 {v1.d}[1], [x0], x1
// void ff_put_vp8_epel8_h6v4(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 8-wide 2D interpolation: 6-tap horizontal into a stack temp (x7),
// then 4-tap vertical, two rows per iteration.
// NOTE(review): src pre-adjust, temp allocation, coefficient loads, loop
// controls, epilogue and endfunc are on elided lines.
1170 function ff_put_vp8_epel8_h6v4_neon, export=1
1176 // first pass (horizontal):
1177 movrel x17, subpel_filters, -16
1179 add x5, x17, x5, lsl #4 // x
1183 add x16, x4, #3 // h (+3 edge rows for the 4-tap vertical)
1186 ld1 {v1.8b, v2.8b}, [x2], x3
1188 vp8_epel8_h6 v1, v1, v2
1190 st1 {v1.8b}, [x7], #8 // to temp buffer
1194 // second pass (vertical):
1196 add x6, x17, x6, lsl #4 // y
1201 ld1 {v1.8b - v2.8b}, [x7], #16
1202 ld1 {v3.8b - v5.8b}, [x7]
1204 vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
1206 st1 {v1.d}[0], [x0], x1
1207 st1 {v1.d}[1], [x0], x1
// void ff_put_vp8_epel4_v6(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int my)
// 4-wide, 6-tap vertical interpolation. Two 4-px columns are packed per
// d-register (lane 0 = rows n..n+5, lane 1 = rows n+1..n+6) so the 8-wide
// vertical macro produces four output rows per pass.
// NOTE(review): coefficient load into v0, the v28 lane-0 load, loop
// control and endfunc are on elided lines.
1215 function ff_put_vp8_epel4_v6_neon, export=1
1216 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
1218 movrel x7, subpel_filters, -16
1219 add x6, x7, w6, uxtw #4
// first 4-px column set into lane 0 (ld1r then lane loads overwrite [1])
1222 ld1r {v2.2s}, [x2], x3
1223 ld1r {v3.2s}, [x2], x3
1224 ld1r {v4.2s}, [x2], x3
1225 ld1r {v5.2s}, [x2], x3
1226 ld1r {v6.2s}, [x2], x3
1227 ld1r {v7.2s}, [x2], x3
1229 sub x2, x2, x3, lsl #2
// second column set (one row later) into lane 1
1230 ld1 {v2.s}[1], [x2], x3
1231 ld1 {v3.s}[1], [x2], x3
1232 ld1 {v4.s}[1], [x2], x3
1233 ld1 {v5.s}[1], [x2], x3
1234 ld1 {v6.s}[1], [x2], x3
1235 ld1 {v7.s}[1], [x2], x3
1236 ld1 {v28.s}[1], [x2]
1237 sub x2, x2, x3, lsl #2
1239 vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
// output rows interleave across the two lanes
1241 st1 {v2.s}[0], [x0], x1
1242 st1 {v3.s}[0], [x0], x1
1243 st1 {v2.s}[1], [x0], x1
1244 st1 {v3.s}[1], [x0], x1
// void ff_put_vp8_epel4_h6(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int mx, int y)
// 4-wide, 6-tap horizontal interpolation (computes 8, stores low 4).
// NOTE(review): src pre-adjust, coefficient load, loop control and
// endfunc are on elided lines.
1251 function ff_put_vp8_epel4_h6_neon, export=1
1254 movrel x7, subpel_filters, -16
1255 add x5, x7, w5, uxtw #4
1258 ld1 {v2.8b,v3.8b}, [x2], x3
1259 vp8_epel8_h6 v2, v2, v3
1260 st1 {v2.s}[0], [x0], x1 // only 4 pixels needed
// void ff_put_vp8_epel4_h6v6(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 4-wide 2D 6-tap: horizontal pass into a 4-byte-per-row stack temp (x9),
// then the temp rows are trn-interleaved so the 8-wide vertical macro
// produces four output rows at once.
// NOTE(review): temp allocation, coefficient loads, loop controls, x9
// reset, the v28 lane-0 value, epilogue and endfunc are on elided lines.
1267 function ff_put_vp8_epel4_h6v6_neon, export=1
1268 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
1271 movrel x7, subpel_filters, -16
1272 add x5, x7, w5, uxtw #4
1279 ld1 {v2.8b,v3.8b}, [x2], x3
1280 vp8_epel8_h6 v2, v2, v3
1281 st1 {v2.s}[0], [x9], #4 // to temp buffer
// second pass (vertical):
1285 add x6, x7, w6, uxtw #4
// reload temp rows: 4 px per 32-bit lane, 2 rows per d-register
1289 ld1 {v2.8b,v3.8b}, [x9], #16
1290 ld1 {v6.8b}, [x9], #8
1293 ld1 {v4.8b,v5.8b}, [x9], #16
1294 ld1 {v7.8b}, [x9], #8
1295 ld1 {v28.s}[1], [x9]
// interleave so lane0/lane1 hold consecutive row phases
1297 trn1 v1.2s, v2.2s, v4.2s
1298 trn2 v4.2s, v2.2s, v4.2s
1299 trn1 v2.2s, v3.2s, v5.2s
1300 trn2 v5.2s, v3.2s, v5.2s
1301 trn1 v3.2s, v6.2s, v7.2s
1302 trn2 v7.2s, v6.2s, v7.2s
1303 vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
1304 st1 {v2.s}[0], [x0], x1
1305 st1 {v3.s}[0], [x0], x1
1306 st1 {v2.s}[1], [x0], x1
1307 st1 {v3.s}[1], [x0], x1
// void ff_put_vp8_epel4_h4v6(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 4-wide 2D: 4-tap horizontal into a stack temp (x9), then 6-tap vertical
// with trn-interleaved temp rows, four output rows per pass.
// NOTE(review): temp allocation, coefficient loads, loop controls, x9
// reset, the v28 lane-0 value, epilogue and endfunc are on elided lines.
1315 function ff_put_vp8_epel4_h4v6_neon, export=1
1316 sub x2, x2, x3, lsl #1 // src -= 2 * srcstride
1319 movrel x7, subpel_filters, -16
1320 add x5, x7, w5, uxtw #4
1327 ld1 {v2.8b}, [x2], x3
1328 vp8_epel8_h4 v2, v2, v2
1329 st1 {v2.s}[0], [x9], #4 // to temp buffer
// second pass (vertical):
1333 add x6, x7, w6, uxtw #4
1337 ld1 {v2.8b,v3.8b}, [x9], #16
1338 ld1 {v6.8b}, [x9], #8
1341 ld1 {v4.8b,v5.8b}, [x9], #16
1342 ld1 {v7.8b}, [x9], #8
1343 ld1 {v28.s}[1], [x9]
// interleave so lane0/lane1 hold consecutive row phases
1345 trn1 v1.2s, v2.2s, v4.2s
1346 trn2 v4.2s, v2.2s, v4.2s
1347 trn1 v2.2s, v3.2s, v5.2s
1348 trn2 v5.2s, v3.2s, v5.2s
1349 trn1 v3.2s, v6.2s, v7.2s
1350 trn2 v7.2s, v6.2s, v7.2s
1351 vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
1352 st1 {v2.s}[0], [x0], x1
1353 st1 {v3.s}[0], [x0], x1
1354 st1 {v2.s}[1], [x0], x1
1355 st1 {v3.s}[1], [x0], x1
// void ff_put_vp8_epel4_h6v4(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 4-wide 2D: 6-tap horizontal into a stack temp (x9), then 4-tap vertical
// producing four output rows (packed 4x in one q-register).
// NOTE(review): src pre-adjust, temp allocation, coefficient loads, loop
// controls, the v6 operand's source, epilogue and endfunc are elided.
1363 function ff_put_vp8_epel4_h6v4_neon, export=1
1367 movrel x7, subpel_filters, -16
1368 add x5, x7, w5, uxtw #4
1375 ld1 {v2.8b,v3.8b}, [x2], x3
1376 vp8_epel8_h6 v2, v2, v3
1377 st1 {v2.s}[0], [x9], #4 // to temp buffer
// second pass (vertical):
1381 add x6, x7, w6, uxtw #4
1385 ld1 {v2.8b,v3.8b}, [x9], #16
1388 ld1 {v4.8b,v5.8b}, [x9], #16
// interleave so lane0/lane1 hold consecutive row phases
1391 trn1 v1.2s, v2.2s, v4.2s
1392 trn2 v4.2s, v2.2s, v4.2s
1393 trn1 v2.2s, v3.2s, v5.2s
1394 trn2 v5.2s, v3.2s, v5.2s
1395 vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
// lane order [0],[2],[1],[3] undoes the interleaving
1396 st1 {v1.s}[0], [x0], x1
1397 st1 {v1.s}[2], [x0], x1
1398 st1 {v1.s}[1], [x0], x1
1399 st1 {v1.s}[3], [x0], x1
// void ff_put_vp8_epel4_h4(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int mx, int y)
// 4-wide, 4-tap horizontal interpolation (computes 8, stores low 4).
// NOTE(review): src pre-adjust, coefficient load, loop control and
// endfunc are on elided lines.
1407 function ff_put_vp8_epel4_h4_neon, export=1
1410 movrel x7, subpel_filters, -16
1411 add x5, x7, w5, uxtw #4
1414 ld1 {v2.8b}, [x2], x3
1415 vp8_epel8_h4 v2, v2, v2
1416 st1 {v2.s}[0], [x0], x1 // only 4 pixels needed
// void ff_put_vp8_epel4_v4(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int my)
// 4-wide, 4-tap vertical interpolation: two column phases packed per
// d-register (lane 0 / lane 1), four output rows per pass.
// NOTE(review): src pre-adjust, coefficient load, the v6 operand's source,
// loop control and endfunc are on elided lines.
1423 function ff_put_vp8_epel4_v4_neon, export=1
1426 movrel x7, subpel_filters, -16
1427 add x6, x7, w6, uxtw #4
// first phase into lane 0 (ld1r fills both lanes; [1] overwritten below)
1430 ld1r {v2.2s}, [x2], x3
1431 ld1r {v3.2s}, [x2], x3
1432 ld1r {v4.2s}, [x2], x3
1433 ld1r {v5.2s}, [x2], x3
1435 sub x2, x2, x3, lsl #1
// second phase (one row later) into lane 1
1436 ld1 {v2.s}[1], [x2], x3
1437 ld1 {v3.s}[1], [x2], x3
1438 ld1 {v4.s}[1], [x2], x3
1439 ld1 {v5.s}[1], [x2], x3
1441 sub x2, x2, x3, lsl #1
1443 vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
// lane order [0],[2],[1],[3] undoes the interleaving
1445 st1 {v2.s}[0], [x0], x1
1446 st1 {v2.s}[2], [x0], x1
1447 st1 {v2.s}[1], [x0], x1
1448 st1 {v2.s}[3], [x0], x1
// void ff_put_vp8_epel4_h4v4(uint8_t *dst, ptrdiff_t dststride,
//                            const uint8_t *src, ptrdiff_t srcstride,
//                            int h, int mx, int my)
// 4-wide 2D: 4-tap horizontal into a stack temp (x9), then 4-tap vertical
// producing four output rows per pass.
// NOTE(review): src pre-adjust, temp allocation, coefficient loads, loop
// controls, the v3/v6 operand sources, epilogue and endfunc are elided.
1455 function ff_put_vp8_epel4_h4v4_neon, export=1
1459 movrel x7, subpel_filters, -16
1460 add x5, x7, w5, uxtw #4
1467 ld1 {v2.8b}, [x2], x3
1468 vp8_epel8_h4 v2, v2, v3
1469 st1 {v2.s}[0], [x9], #4 // to temp buffer
// second pass (vertical):
1473 add x6, x7, w6, uxtw #4
1477 ld1 {v2.8b,v3.8b}, [x9], #16
1480 ld1 {v4.8b,v5.8b}, [x9], #16
// interleave so lane0/lane1 hold consecutive row phases
1483 trn1 v1.2s, v2.2s, v4.2s
1484 trn2 v4.2s, v2.2s, v4.2s
1485 trn1 v2.2s, v3.2s, v5.2s
1486 trn2 v5.2s, v3.2s, v5.2s
1487 vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
// lane order [0],[2],[1],[3] undoes the interleaving
1488 st1 {v1.s}[0], [x0], x1
1489 st1 {v1.s}[2], [x0], x1
1490 st1 {v1.s}[1], [x0], x1
1491 st1 {v1.s}[3], [x0], x1
// void ff_put_vp8_bilin16_h(uint8_t *dst, ptrdiff_t dststride,
//                           const uint8_t *src, ptrdiff_t srcstride,
//                           int h, int mx, int y)
// 16-wide horizontal bilinear: dst = (src*(8-mx) + src[+1]*mx + 4) >> 3,
// two rows per iteration. v1 = 8-mx, v0 = mx (broadcast).
// NOTE(review): the dup of mx/(8-mx) into v0/v1, loop control and endfunc
// are on elided lines.
1501 function ff_put_vp8_bilin16_h_neon, export=1
1508 ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3
1509 ext v5.8b, v3.8b, v4.8b, #1 // src[+1], high half
1510 ext v4.8b, v2.8b, v3.8b, #1 // src[+1], low half
1511 umull v16.8h, v2.8b, v1.8b
1512 umlal v16.8h, v4.8b, v0.8b
1513 ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
1514 umull v6.8h, v3.8b, v1.8b
1515 umlal v6.8h, v5.8b, v0.8b
1516 ext v21.8b, v19.8b, v20.8b, #1
1517 ext v20.8b, v18.8b, v19.8b, #1
1518 umull v22.8h, v18.8b, v1.8b
1519 umlal v22.8h, v20.8b, v0.8b
1520 umull v24.8h, v19.8b, v1.8b
1521 umlal v24.8h, v21.8b, v0.8b
// (sum + 4) >> 3 and narrow back to u8
1522 rshrn v4.8b, v16.8h, #3
1523 rshrn2 v4.16b, v6.8h, #3
1524 rshrn v6.8b, v22.8h, #3
1525 rshrn2 v6.16b, v24.8h, #3
1526 st1 {v4.16b}, [x0], x1
1527 st1 {v6.16b}, [x0], x1
// void ff_put_vp8_bilin16_v(uint8_t *dst, ptrdiff_t dststride,
//                           const uint8_t *src, ptrdiff_t srcstride,
//                           int h, int x, int my)
// 16-wide vertical bilinear: dst = (row*(8-my) + row[+1]*my + 4) >> 3,
// two rows per iteration. v1 = 8-my, v0 = my (broadcast).
// NOTE(review): the dup of my/(8-my) into v0/v1, loop control and endfunc
// are on elided lines.
1533 function ff_put_vp8_bilin16_v_neon, export=1
1539 ld1 {v2.16b}, [x2], x3 // first row (filter history)
1542 ld1 {v4.16b}, [x2], x3
// out row 0 = blend(v2, v4)
1543 umull v6.8h, v2.8b, v1.8b
1544 umlal v6.8h, v4.8b, v0.8b
1545 umull2 v16.8h, v2.16b, v1.16b
1546 umlal2 v16.8h, v4.16b, v0.16b
1547 ld1 {v2.16b}, [x2], x3
// out row 1 = blend(v4, new v2)
1548 umull v18.8h, v4.8b, v1.8b
1549 umlal v18.8h, v2.8b, v0.8b
1550 umull2 v20.8h, v4.16b, v1.16b
1551 umlal2 v20.8h, v2.16b, v0.16b
// (sum + 4) >> 3 and narrow back to u8
1552 rshrn v4.8b, v6.8h, #3
1553 rshrn2 v4.16b, v16.8h, #3
1554 rshrn v6.8b, v18.8h, #3
1555 rshrn2 v6.16b, v20.8h, #3
1556 st1 {v4.16b}, [x0], x1
1557 st1 {v6.16b}, [x0], x1
// 16-wide horizontal+vertical bilinear prediction, two output rows per
// iteration.  Horizontal pass: (pix[x]*(8-mx) + pix[x+1]*mx + 4) >> 3 per
// row; vertical pass then blends consecutive horizontally-filtered rows
// with the my weights in v2/v3.
// NOTE(review): v0/v1 (mx weights) and v3 (8-my) setup plus loop control
// are outside this excerpt — confirm in the full file.
1563 function ff_put_vp8_bilin16_hv_neon, export=1
1568 dup v2.16b, w6 // my
// Prime: horizontally filter the first source row into v4 (16 bytes).
1572 ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3
1574 ext v7.8b, v5.8b, v6.8b, #1
1575 ext v6.8b, v4.8b, v5.8b, #1
1576 umull v16.8h, v4.8b, v1.8b
1577 umlal v16.8h, v6.8b, v0.8b
1578 umull v18.8h, v5.8b, v1.8b
1579 umlal v18.8h, v7.8b, v0.8b
1580 rshrn v4.8b, v16.8h, #3
1581 rshrn2 v4.16b, v18.8h, #3
// Loop body: horizontally filter row A (v18..v21 -> v22/v24)...
1584 ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
1585 ext v21.8b, v19.8b, v20.8b, #1
1586 ext v20.8b, v18.8b, v19.8b, #1
1587 umull v22.8h, v18.8b, v1.8b
1588 umlal v22.8h, v20.8b, v0.8b
// ...and row B (v26..v29 -> v16/v18), loads interleaved with the MACs.
1589 ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3
1590 umull v24.8h, v19.8b, v1.8b
1591 umlal v24.8h, v21.8b, v0.8b
1592 ext v29.8b, v27.8b, v28.8b, #1
1593 ext v28.8b, v26.8b, v27.8b, #1
1594 umull v16.8h, v26.8b, v1.8b
1595 umlal v16.8h, v28.8b, v0.8b
1596 umull v18.8h, v27.8b, v1.8b
1597 umlal v18.8h, v29.8b, v0.8b
// v6 = horizontally-filtered row A (16 bytes).
1598 rshrn v6.8b, v22.8h, #3
1599 rshrn2 v6.16b, v24.8h, #3
// Vertical blend: previous row (v4, weight v3 = 8-my) + row A (v6, my).
1600 umull v24.8h, v4.8b, v3.8b
1601 umlal v24.8h, v6.8b, v2.8b
1602 umull2 v30.8h, v4.16b, v3.16b
1603 umlal2 v30.8h, v6.16b, v2.16b
// v4 = horizontally-filtered row B; also the next iteration's "previous".
1604 rshrn v4.8b, v16.8h, #3
1605 rshrn2 v4.16b, v18.8h, #3
// Vertical blend: row A (v6) with row B (v4).
1606 umull v20.8h, v6.8b, v3.8b
1607 umlal v20.8h, v4.8b, v2.8b
1608 umull2 v22.8h, v6.16b, v3.16b
1609 umlal2 v22.8h, v4.16b, v2.16b
// Final rounding narrow of each blended row, stored as it completes.
1610 rshrn v24.8b, v24.8h, #3
1611 rshrn2 v24.16b, v30.8h, #3
1612 st1 {v24.16b}, [x0], x1
1613 rshrn v20.8b, v20.8h, #3
1614 rshrn2 v20.16b, v22.8h, #3
1615 st1 {v20.16b}, [x0], x1
// 8-wide horizontal bilinear prediction, two rows per iteration.
// Output byte = (pix[x]*(8-mx) + pix[x+1]*mx + 4) >> 3.
// NOTE(review): v0/v1 weight setup and loop control are outside this
// excerpt.
1621 function ff_put_vp8_bilin8_h_neon, export=1
// Row A: load 9+ bytes, build the x+1-shifted copy, weighted sum in v4.
1628 ld1 {v2.8b,v3.8b}, [x2], x3
1629 ext v3.8b, v2.8b, v3.8b, #1
1630 umull v4.8h, v2.8b, v1.8b
1631 umlal v4.8h, v3.8b, v0.8b
// Row B: same computation into v16.
1632 ld1 {v6.8b,v7.8b}, [x2], x3
1633 ext v7.8b, v6.8b, v7.8b, #1
1634 umull v16.8h, v6.8b, v1.8b
1635 umlal v16.8h, v7.8b, v0.8b
// Rounding narrow shift by 3, then store both rows.
1636 rshrn v4.8b, v4.8h, #3
1637 rshrn v16.8b, v16.8h, #3
1638 st1 {v4.8b}, [x0], x1
1639 st1 {v16.8b}, [x0], x1
// 8-wide vertical bilinear prediction, two rows per iteration.
// Output = (row[y]*(8-my) + row[y+1]*my + 4) >> 3; v2 carries the newest
// row across iterations so each source row is loaded once.
// NOTE(review): v0/v1 weight setup and loop control are outside this
// excerpt.
1645 function ff_put_vp8_bilin8_v_neon, export=1
// Prime v2 with the first source row.
1651 ld1 {v2.8b}, [x2], x3
// Blend previous (v2) with next row (v3) into v4...
1654 ld1 {v3.8b}, [x2], x3
1655 umull v4.8h, v2.8b, v1.8b
1656 umlal v4.8h, v3.8b, v0.8b
// ...then v3 with the row after (reloaded into v2, which also becomes the
// next iteration's "previous" row) into v6.
1657 ld1 {v2.8b}, [x2], x3
1658 umull v6.8h, v3.8b, v1.8b
1659 umlal v6.8h, v2.8b, v0.8b
// Rounding narrow shift by 3 and store the two finished rows.
1660 rshrn v4.8b, v4.8h, #3
1661 rshrn v6.8b, v6.8h, #3
1662 st1 {v4.8b}, [x0], x1
1663 st1 {v6.8b}, [x0], x1
// 8-wide horizontal+vertical bilinear prediction, two output rows per
// iteration.  Horizontal pass per row: (p[x]*(8-mx)+p[x+1]*mx+4)>>3;
// vertical pass blends consecutive filtered rows with my weights v2/v3.
// NOTE(review): v0/v1 (mx) and v2/v3 (my) weight setup plus loop control
// are outside this excerpt — confirm in the full file.
1669 function ff_put_vp8_bilin8_hv_neon, export=1
// Prime: horizontally filter the first source row into v22.
1678 ld1 {v4.8b,v5.8b}, [x2], x3
1679 ext v5.8b, v4.8b, v5.8b, #1
1680 umull v18.8h, v4.8b, v1.8b
1681 umlal v18.8h, v5.8b, v0.8b
1682 rshrn v22.8b, v18.8h, #3
// Loop body: horizontally filter row A into v16 (wide sum first)...
1685 ld1 {v6.8b,v7.8b}, [x2], x3
1686 ext v7.8b, v6.8b, v7.8b, #1
1687 umull v16.8h, v6.8b, v1.8b
1688 umlal v16.8h, v7.8b, v0.8b
// ...and row B into v18.
1689 ld1 {v4.8b,v5.8b}, [x2], x3
1690 ext v5.8b, v4.8b, v5.8b, #1
1691 umull v18.8h, v4.8b, v1.8b
1692 umlal v18.8h, v5.8b, v0.8b
1693 rshrn v16.8b, v16.8h, #3
// Vertical blend: previous filtered row (v22, weight v3) + row A (v16, v2).
1694 umull v20.8h, v22.8b, v3.8b
1695 umlal v20.8h, v16.8b, v2.8b
// v22 = filtered row B; also the next iteration's "previous" row.
1696 rshrn v22.8b, v18.8h, #3
// Vertical blend: row A (v16) with row B (v22).
1697 umull v24.8h, v16.8b, v3.8b
1698 umlal v24.8h, v22.8b, v2.8b
// Final rounding narrow of each blended row, stored as it completes.
1699 rshrn v20.8b, v20.8h, #3
1700 st1 {v20.8b}, [x0], x1
1701 rshrn v23.8b, v24.8h, #3
1702 st1 {v23.8b}, [x0], x1
// 4-wide horizontal bilinear prediction, two rows per iteration.
// Two 4-pixel rows are packed into the two 32-bit lanes of one d-register
// so a single umull/umlal pair filters both.
// NOTE(review): v0/v1 weight setup and loop control are outside this
// excerpt.
1708 function ff_put_vp8_bilin4_h_neon, export=1
// Load two source rows and build their x+1-shifted copies.
1715 ld1 {v2.8b}, [x2], x3
1716 ext v3.8b, v2.8b, v3.8b, #1
1717 ld1 {v6.8b}, [x2], x3
1718 ext v7.8b, v6.8b, v7.8b, #1
// Pack both rows (and both shifted rows) into single d-registers.
1719 trn1 v2.2s, v2.2s, v6.2s
1720 trn1 v3.2s, v3.2s, v7.2s
// Weighted sum, rounding narrow shift by 3.
1721 umull v4.8h, v2.8b, v1.8b
1722 umlal v4.8h, v3.8b, v0.8b
1723 rshrn v4.8b, v4.8h, #3
// Unpack: lane 0 is the first row, lane 1 the second.
1724 st1 {v4.s}[0], [x0], x1
1725 st1 {v4.s}[1], [x0], x1
// 4-wide vertical bilinear prediction, two rows per iteration; pairs of
// rows are packed into the 32-bit lanes of d-registers so one
// umull/umlal filters both output rows at once.
// NOTE(review): v0/v1 weight setup, the loop label, and the load filling
// lane 0 of v3 are outside this excerpt — confirm in the full file.
1731 function ff_put_vp8_bilin4_v_neon, export=1
// Prime: replicate the first source row into both lanes of v2.
1737 ld1r {v2.2s}, [x2], x3
// Fill lane 1 of v2/v3 with the next two rows, so v2/v3 each hold two
// vertically-adjacent rows of the blend.
1740 ld1 {v2.s}[1], [x2], x3
1741 ld1 {v3.s}[1], [x2], x3
// Blend: v4 = v2*(weight v1) + v3*(weight v0), both lanes at once.
1742 umull v4.8h, v2.8b, v1.8b
1743 umlal v4.8h, v3.8b, v0.8b
// Carry the newest loaded row into v2 for the next iteration.
1744 trn2 v2.2s, v3.2s, v2.2s
// Rounding narrow shift by 3; store both finished 4-byte rows.
1745 rshrn v4.8b, v4.8h, #3
1746 st1 {v4.s}[0], [x0], x1
1747 st1 {v4.s}[1], [x0], x1
1754 function ff_put_vp8_bilin4_hv_neon, export=1
1763 ld1 {v4.8b}, [x2], x3
1764 ext v5.8b, v4.8b, v4.8b, #1
1765 umull v18.8h, v4.8b, v1.8b
1766 umlal v18.8h, v5.8b, v0.8b
1767 rshrn v22.8b, v18.8h, #3
1770 ld1 {v6.8b}, [x2], x3
1771 ext v7.8b, v6.8b, v6.8b, #1
1772 ld1 {v4.8b}, [x2], x3
1773 ext v5.8b, v4.8b, v4.8b, #1
1774 trn1 v6.2s, v6.2s, v4.2s
1775 trn1 v7.2s, v7.2s, v5.2s
1776 umull v16.8h, v6.8b, v1.8b
1777 umlal v16.8h, v7.8b, v0.8b
1778 rshrn v16.8b, v16.8h, #3
1779 umull v20.8h, v16.8b, v2.8b
1780 trn1 v22.2s, v22.2s, v16.2s
1781 umlal v20.8h, v22.8b, v3.8b
1782 rev64 v22.2s, v16.2s
1783 rshrn v20.8b, v20.8h, #3
1784 st1 {v20.s}[0], [x0], x1
1785 st1 {v20.s}[1], [x0], x1