/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
// void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
// 4x4 inverse Walsh-Hadamard transform of the luma DC coefficients,
// scattering one 16-bit DC per 4x4 sub-block.
// In: x0 = block base, x1 = dc coefficients (read, then cleared),
//     x3 = presumably the per-sub-block store stride in bytes — TODO confirm.
// NOTE(review): excerpt is missing lines of the original (zeroing of v30,
// the rounding constant in v16, the final srshr #3, the second clearing
// store, ret/endfunc); structural closers restored, other gaps remain.
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        // first butterfly pass
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        st1             {v30.8h}, [x1],  #16    // clear dc[]; v30 presumably zeroed earlier — TODO confirm
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h
        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
        // second pass, with rounding bias applied to row 0
        add             v0.4h,  v0.4h,  v16.4h  // v16 presumably holds the +3 rounding bias — TODO confirm
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h
        // scatter the 16 DC values, one halfword per 4x4 sub-block
        st1             {v0.h}[0], [x0], x3
        st1             {v1.h}[0], [x0], x3
        st1             {v2.h}[0], [x0], x3
        st1             {v3.h}[0], [x0], x3
        st1             {v0.h}[1], [x0], x3
        st1             {v1.h}[1], [x0], x3
        st1             {v2.h}[1], [x0], x3
        st1             {v3.h}[1], [x0], x3
        st1             {v0.h}[2], [x0], x3
        st1             {v1.h}[2], [x0], x3
        st1             {v2.h}[2], [x0], x3
        st1             {v3.h}[2], [x0], x3
        st1             {v0.h}[3], [x0], x3
        st1             {v1.h}[3], [x0], x3
        st1             {v2.h}[3], [x0], x3
        st1             {v3.h}[3], [x0], x3
        ret
endfunc
// void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// 4x4 inverse DCT of one block, result added to dst.
// In: x0 = dst, x1 = coefficients (cleared on the way out), x2 = stride.
// Uses the VP8 constants 20091/0x10000 and 35468/0x10000 via v4.h[0]/[1].
// NOTE(review): excerpt is missing lines of the original (the movz of
// 20091/2 into w4 before the movk, the dup into v4, zeroing of v29, the
// first clearing store, and the sqxtun narrowing of v0-v3 before the final
// stores); ret/endfunc restored, other gaps remain.
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        movk            w4,  #35468/2, lsl #16  // w4 presumably pre-seeded with 20091/2 — TODO confirm
        // first (vertical) pass
        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v21.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h
        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h
        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h
        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7
        // second (horizontal) pass, interleaved with clearing the
        // coefficient block and loading the destination pixels
        smull           v26.4s, v1.4h,  v4.h[0]
        st1             {v29.8h}, [x1], #16     // clear block[]; v29 presumably zero — TODO confirm
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v21.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v20.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h
        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.s}[0], [x0], x2
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h
        ld1             {v27.s}[0], [x0], x2
        srshr           v0.4h,  v0.4h,  #3      // (x + 4) >> 3
        srshr           v1.4h,  v1.4h,  #3
        srshr           v2.4h,  v2.4h,  #3
        srshr           v3.4h,  v3.4h,  #3
        sub             x0,  x0,  x2,  lsl #2   // rewind dst 4 rows
        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16
        uaddw           v0.8h,  v0.8h,  v24.8b  // add to destination pixels
        uaddw           v1.8h,  v1.8h,  v25.8b
        uaddw           v2.8h,  v2.8h,  v26.8b
        uaddw           v3.8h,  v3.8h,  v27.8b
        // NOTE(review): the saturating-narrow (sqxtun) of v0-v3 back to
        // bytes belongs here in the original and is missing from the excerpt.
        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2
        ret
endfunc
// void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only IDCT+add for four chroma blocks (2x2 arrangement of 4x4 blocks).
// In: x0 = dst, x1 = blocks (DC values cleared), x2 = stride,
//     x3 reused as both the block-clear stride and the store pointer —
//     presumably set from x0 in a missing line — TODO confirm.
// NOTE(review): excerpt is missing lines of the original (the ld1r loads of
// the four DCs into v16-v19, the zeroing vector for v0, and the sqxtun2
// instructions that produce v21/v23/v25/v27 before they are stored);
// ret/endfunc restored, other gaps remain.
function ff_vp8_idct_dc_add4uv_neon, export=1
        st1             {v0.h}[0], [x1], x3     // clear the four DC coefficients
        st1             {v0.h}[0], [x1], x3
        st1             {v0.h}[0], [x1], x3
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1], v17.d[0]      // pair DCs for blocks 0/1
        ins             v18.d[1], v19.d[0]      // pair DCs for blocks 2/3
        srshr           v16.8h, v16.8h, #3      // dc >>= 3
        ld1             {v0.8b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.8b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b   // add DC to each source row
        ld1             {v2.8b}, [x0], x2
        uaddw           v0.8h,  v16.8h, v1.8b
        ld1             {v3.8b}, [x0], x2
        uaddw           v22.8h, v16.8h, v2.8b
        ld1             {v4.8b}, [x0], x2
        uaddw           v2.8h,  v16.8h, v3.8b
        ld1             {v5.8b}, [x0], x2
        uaddw           v24.8h, v18.8h, v4.8b
        ld1             {v6.8b}, [x0], x2
        uaddw           v4.8h,  v18.8h, v5.8b
        ld1             {v7.8b}, [x0], x2
        uaddw           v26.8h, v18.8h, v6.8b
        sqxtun          v20.8b, v20.8h          // saturate back to u8
        uaddw           v6.8h,  v18.8h, v7.8b
        sqxtun          v22.8b, v22.8h
        st1             {v20.8b}, [x3], x2
        st1             {v21.8b}, [x3], x2      // NOTE(review): v21 is produced by a missing sqxtun
        sqxtun          v24.8b, v24.8h
        st1             {v22.8b}, [x3], x2
        st1             {v23.8b}, [x3], x2      // NOTE(review): v23 likewise
        sqxtun          v26.8b, v26.8h
        st1             {v24.8b}, [x3], x2
        st1             {v25.8b}, [x3], x2      // NOTE(review): v25 likewise
        st1             {v26.8b}, [x3], x2
        st1             {v27.8b}, [x3], x2      // NOTE(review): v27 likewise
        ret
endfunc
// void ff_vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only IDCT+add for four horizontally adjacent luma blocks (16 pixels wide).
// In: x0 = dst, x1 = blocks (DC values cleared), x2 = stride,
//     x3 = presumably the 32-byte block-clear stride — TODO confirm.
// NOTE(review): excerpt is missing lines of the original (the ld1r loads of
// the four DCs into v16-v19 and the zeroing vector for v0);
// ret/endfunc restored, other gaps remain.
function ff_vp8_idct_dc_add4y_neon, export=1
        st1             {v0.h}[0], [x1], x3     // clear the four DC coefficients
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d  // DCs for blocks 0/1 side by side
        st1             {v0.h}[0], [x1], x3
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d  // DCs for blocks 2/3
        srshr           v16.8h, v16.8h, #3      // dc >>= 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b   // left half of each row
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h,  v18.8h, v0.16b  // right half
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h,  v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h,  v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h,  v18.8h, v3.16b
        sub             x0,  x0,  x2,  lsl #2   // rewind dst 4 rows
        sqxtun          v20.8b,  v20.8h         // saturate back to u8
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2
        ret
endfunc
// void ff_vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// DC-only IDCT+add for a single 4x4 block.
// In: x0 = dst, x1 = block (DC cleared — in a missing line), x2 = stride.
// NOTE(review): excerpt is missing lines of the original (the ld1r of the
// DC into v2, the clearing store to [x1], and the sqxtun narrowing of
// v3/v4 into v0/v1 before the stores); ret/endfunc restored.
function ff_vp8_idct_dc_add_neon, export=1
        srshr           v2.8h, v2.8h, #3        // dc >>= 3; v2 presumably broadcast-loaded from block[0] — TODO confirm
        ld1             {v0.s}[0], [x0], x2     // rows 0/1 in v0
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h, v2.8h, v0.8b     // add DC
        ld1             {v1.s}[0], [x0], x2     // rows 2/3 in v1
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h, v2.8h, v1.8b
        sub             x0,  x0,  x2, lsl #2    // rewind dst 4 rows
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc
// Core VP8 loop filter for one edge, operating on 16 pixels at a time.
// Inputs: v0-v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3, v22 = flim_E, v23 = flim_I,
// \hev_thresh = hev threshold register. Results in v1-v6.
// NOTE(review): this excerpt is missing the .if \simple / .elseif \inner /
// .else conditional-assembly selectors of the original, plus several movi
// constant setups (0x80 in v21, the multiplier 3 in v20.8h, +4/+3 in
// v22/v23, the 63 bias); as written the three filter variants below would
// all assemble sequentially. Only the unbalanced .if and the closing .endm
// have been restored here.
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
        // simple-filter limit check
        uabd            v17.16b, v3.16b,  v4.16b // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1     // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhs            v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim

        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1     // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh     // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        orr             v17.16b, v20.16b, v22.16b // hev mask

        // convert to signed value:
        eor             v3.16b, v3.16b, v21.16b  // PS0 = P0 ^ 0x80 (v21 presumably 0x80 — movi missing from excerpt)
        eor             v4.16b, v4.16b, v21.16b  // QS0 = Q0 ^ 0x80
        ssubl           v18.8h, v4.8b,  v3.8b    // QS0 - PS0
        ssubl2          v19.8h, v4.16b, v3.16b   // (widened to 16bit)
        eor             v2.16b, v2.16b, v21.16b  // PS1 = P1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b  // QS1 = Q1 ^ 0x80
        mul             v18.8h, v18.8h, v20.8h   // w = 3 * (QS0 - PS0); v20.8h presumably holds 3 — movi missing
        mul             v19.8h, v19.8h, v20.8h
        sqsub           v20.16b, v2.16b, v5.16b  // clamp(PS1-QS1)
        and             v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
        saddw           v18.8h, v18.8h, v20.8b   // w += clamp(PS1-QS1)
        saddw2          v19.8h, v19.8h, v20.16b
        sqxtn           v18.8b,  v18.8h          // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
.if !\inner && !\simple
        eor             v1.16b, v1.16b, v21.16b  // PS2 = P2 ^ 0x80
        eor             v6.16b, v6.16b, v21.16b  // QS2 = Q2 ^ 0x80
.endif
        and             v18.16b, v18.16b, v16.16b // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v7 -> Q3  (don't corrupt)
        //   v16, v19, v29 -> unused

        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);
        sqadd           v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b // P1 = PS1 ^ 0x80

        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1      // c3 >>= 1
        eor             v3.16b,  v3.16b,  v21.16b // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b // P1 = PS1 ^ 0x80

        // mbfilter: apply the hev part of the filter first
        and             v20.16b, v18.16b, v17.16b // w & hev
        sqadd           v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)

        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        sshll           v22.8h, v18.8b,  #3       // 8*w
        sshll2          v23.8h, v18.16b, #3
        saddw           v22.8h, v22.8h, v18.8b    // 9*w
        saddw2          v23.8h, v23.8h, v18.16b
        add             v16.8h, v17.8h, v22.8h    // v17.8h presumably holds the 63 bias — movi missing from excerpt
        add             v17.8h, v17.8h, v23.8h    //  9*w + 63
        add             v19.8h, v16.8h, v22.8h
        add             v20.8h, v17.8h, v23.8h    // 18*w + 63
        add             v22.8h, v19.8h, v22.8h
        add             v23.8h, v20.8h, v23.8h    // 27*w + 63
        sqshrn          v16.8b,  v16.8h, #7
        sqshrn2         v16.16b, v17.8h, #7       // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h, #7
        sqshrn2         v19.16b, v20.8h, #7       // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h, #7
        sqshrn2         v22.16b, v23.8h, #7       // clamp((27*w + 63)>>7)
        sqadd           v1.16b, v1.16b, v16.16b   // PS2 = clamp(PS2+a)
        sqsub           v6.16b, v6.16b, v16.16b   // QS2 = clamp(QS2-a)
        sqadd           v2.16b, v2.16b, v19.16b   // PS1 = clamp(PS1+a)
        sqsub           v5.16b, v5.16b, v19.16b   // QS1 = clamp(QS1-a)
        sqadd           v3.16b, v3.16b, v22.16b   // PS0 = clamp(PS0+a)
        sqsub           v4.16b, v4.16b, v22.16b   // QS0 = clamp(QS0-a)
        eor             v3.16b, v3.16b, v21.16b   // P0 = PS0 ^ 0x80
        eor             v4.16b, v4.16b, v21.16b   // Q0 = QS0 ^ 0x80
        eor             v2.16b, v2.16b, v21.16b   // P1 = PS1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b   // Q1 = QS1 ^ 0x80
        eor             v1.16b, v1.16b, v21.16b   // P2 = PS2 ^ 0x80
        eor             v6.16b, v6.16b, v21.16b   // Q2 = QS2 ^ 0x80
.endm
// Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
// x0 = dst (at the edge), x1 = stride, w2 = flim_E, w3 = flim_I,
// w4 = hev_thresh.
// NOTE(review): excerpt is missing the .if !\simple guards around the
// P3/P2/Q2/Q3 loads/stores and the plain (non-suffixed) instantiation;
// ret/endfunc/.endm restored.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  x1, lsl #1+!\simple // back up to P1 (simple) / P3
        ld1             {v0.16b}, [x0], x1 // P3
        ld1             {v1.16b}, [x0], x1 // P2
        ld1             {v2.16b}, [x0], x1 // P1
        ld1             {v3.16b}, [x0], x1 // P0
        ld1             {v4.16b}, [x0], x1 // Q0
        ld1             {v5.16b}, [x0], x1 // Q1
        ld1             {v6.16b}, [x0], x1 // Q2
        ld1             {v7.16b}, [x0]     // Q3
        dup             v23.16b, w3        // flim_I
        dup             v22.16b, w2        // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2:  dst -= stride * 6
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1

        st1             {v1.16b}, [x0], x1 // P2
        st1             {v2.16b}, [x0], x1 // P1
        st1             {v3.16b}, [x0], x1 // P0
        st1             {v4.16b}, [x0], x1 // Q0
        st1             {v5.16b}, [x0], x1 // Q1
        st1             {v6.16b}, [x0]     // Q2
        ret
endfunc
.endm

vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
// Vertical loop filter for chroma: u and v planes (8 pixels each) are
// packed into the low/high halves of the same vectors.
// x0 = u, x1 = v, x2 = stride, w3 = flim_E, w4 = flim_I, w5 = hev_thresh.
// NOTE(review): excerpt is missing the plain (non-suffixed) instantiation
// of the original; ret/endfunc/.endm restored.
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2, lsl #2 // back up 4 rows in u
        sub             x1,  x1,  x2, lsl #2 // ... and in v
        // interleave the two 8-byte rows into one 16-byte vector each
        ld1             {v0.d}[0], [x0], x2  // P3
        ld1             {v0.d}[1], [x1], x2  // P3
        ld1             {v1.d}[0], [x0], x2  // P2
        ld1             {v1.d}[1], [x1], x2  // P2
        ld1             {v2.d}[0], [x0], x2  // P1
        ld1             {v2.d}[1], [x1], x2  // P1
        ld1             {v3.d}[0], [x0], x2  // P0
        ld1             {v3.d}[1], [x1], x2  // P0
        ld1             {v4.d}[0], [x0], x2  // Q0
        ld1             {v4.d}[1], [x1], x2  // Q0
        ld1             {v5.d}[0], [x0], x2  // Q1
        ld1             {v5.d}[1], [x1], x2  // Q1
        ld1             {v6.d}[0], [x0], x2  // Q2
        ld1             {v6.d}[1], [x1], x2  // Q2
        ld1             {v7.d}[0], [x0]      // Q3
        ld1             {v7.d}[1], [x1]      // Q3

        dup             v22.16b, w3          // flim_E
        dup             v23.16b, w4          // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub             x0,  x0,  x2, lsl #2
        sub             x1,  x1,  x2, lsl #2
        sub             x0,  x0,  x2, lsl #1
        sub             x1,  x1,  x2, lsl #1

        st1             {v1.d}[0], [x0], x2  // P2
        st1             {v1.d}[1], [x1], x2  // P2
        st1             {v2.d}[0], [x0], x2  // P1
        st1             {v2.d}[1], [x1], x2  // P1
        st1             {v3.d}[0], [x0], x2  // P0
        st1             {v3.d}[1], [x1], x2  // P0
        st1             {v4.d}[0], [x0], x2  // Q0
        st1             {v4.d}[1], [x1], x2  // Q0
        st1             {v5.d}[0], [x0], x2  // Q1
        st1             {v5.d}[1], [x1], x2  // Q1
        st1             {v6.d}[0], [x0]      // Q2
        st1             {v6.d}[1], [x1]      // Q2
        ret
endfunc
.endm

vp8_v_loop_filter8uv _inner, inner=1
// Horizontal (vertical-edge) loop filter over 16 rows: load 8 columns of
// 16 rows, transpose so the edge lies along the vectors, filter, transpose
// back and store.
// x0 = dst, x1 = stride, w2 = flim_E, w3 = flim_I, w4 = hev_thresh.
// NOTE(review): excerpt is missing the initial `sub x0, x0, #4`, the final
// `st1 {v7.d}[1]` store and the plain instantiation of the original;
// ret/endfunc/.endm restored.
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w2                 // flim_E
        dup             v23.16b, w3                 // flim_I

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        ret
endfunc
.endm

vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
// Horizontal loop filter for chroma: 8 rows of u in the low halves,
// 8 rows of v in the high halves, transposed around the filter.
// x0 = u, x1 = v, x2 = stride, w3 = flim_E, w4 = flim_I, w5 = hev_thresh.
// NOTE(review): excerpt is missing the initial `sub x0/x1, #4`, the final
// v7 stores and the plain instantiation; ret/endfunc/.endm restored.
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        ld1             {v0.d}[0], [x0], x2 // load u
        ld1             {v0.d}[1], [x1], x2 // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w3         // flim_E
        dup             v23.16b, w4         // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2, lsl #3 // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3 // backup v 8 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        st1             {v0.d}[0], [x0], x2 // store u
        st1             {v0.d}[1], [x1], x2 // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        ret
endfunc
.endm

vp8_h_loop_filter8uv _inner, inner=1
// void ff_put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int y)
// Plain 16xh copy, 4 rows per iteration.
// NOTE(review): excerpt is missing the loop control of the original
// (label, `subs w4, w4, #4`, `b.gt`); ret/endfunc restored — as written
// this copies only 4 rows.
function ff_put_vp8_pixels16_neon, export=1
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        ret
endfunc
// void ff_put_vp8_pixels8(uint8_t *dst, ptrdiff_t dststride,
//                         const uint8_t *src, ptrdiff_t srcstride,
//                         int h, int x, int y)
// Plain 8xh copy; two rows packed per vector, 4 rows per iteration.
// NOTE(review): excerpt is missing the loop control of the original
// (`subs w4, w4, #4`, `b.gt`); ret/endfunc restored — as written this
// copies only 4 rows.
function ff_put_vp8_pixels8_neon, export=1
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        ret
endfunc
/* 4/6-tap 8th-pel MC */

// 6-tap horizontal filter producing 8 output pixels from \s0:\s1.
// v0.8h holds the filter coefficients (taps 0-5 in v0.h[0]..v0.h[5]).
// Clobbers v18, v21-v26.
// NOTE(review): excerpt is missing the uxtl widenings of v22-v26 (and the
// setup of v19/v21) present in the original; only .endm restored.
.macro  vp8_epel8_h6 d, s0, s1
        ext             v22.8b, \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b, \s1\().8b, #2
        ext             v24.8b, \s0\().8b, \s1\().8b, #3
        ext             v25.8b, \s0\().8b, \s1\().8b, #4
        ext             v26.8b, \s0\().8b, \s1\().8b, #5
        mul             v21.8h, v21.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7    // (sum + 64) >> 7, saturated to u8
.endm
// 6-tap horizontal filter producing 16 output pixels from \v0:\v1.
// v0.8h holds the filter coefficients. Clobbers v1-v3, v16-v23.
// NOTE(review): excerpt is missing the low-half uxtl widenings that feed
// v17-v21 in the original; only .endm restored.
.macro  vp8_epel16_h6 d0, v0, v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7  // low 8 pixels
        sqrshrun2       \d0\().16b, v22.8h, #7  // high 8 pixels
.endm
// 6-tap vertical filter producing two output rows (\d0, \d1) from seven
// consecutive source rows \s0..\s6 (widened in place to 16 bit).
// v0.8h holds the filter coefficients; v31 used as scratch.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]
        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]
        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7    // (sum + 64) >> 7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
// 4-tap horizontal filter producing 8 output pixels from \v0:\v1.
// v0.8h holds the coefficients (taps in v0.h[1]..v0.h[4]).
// NOTE(review): excerpt is missing the uxtl widenings of v20/v22/v25
// present in the original; only .endm restored.
.macro  vp8_epel8_h4 d, v0, v1
        ext             v22.8b, \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b, \v1\().8b, #2
        ext             v25.8b, \v0\().8b, \v1\().8b, #3
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7    // (sum + 64) >> 7, saturated to u8
.endm
// 4-tap vertical filter producing two output rows packed into \d0
// (first row in the low half, second in the high half) from five
// consecutive source rows \s0..\s4 (widened in place to 16 bit).
// v0.8h holds the coefficients; v21-v23 used as scratch.
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s4\().8h, \s4\().8b
        mul             v21.8h,    \s1\().8h, v0.h[2]
        mul             v23.8h,    \s2\().8h, v0.h[3]
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             v22.8h,    \s3\().8h, v0.h[3]
        mls             v21.8h,    \s0\().8h, v0.h[1]
        mls             v23.8h,    \s3\().8h, v0.h[4]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             v22.8h,    \s4\().8h, v0.h[4]
        sqadd           v21.8h,    v21.8h,    v23.8h
        sqadd           \s2\().8h, \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7   // row 0
        sqrshrun2       \d0\().16b, \s2\().8h, #7   // row 1
.endm
// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
// One 8-halfword row per subpel position 1..7 (position 0 = no filtering,
// handled by the pixels copy functions; hence the -16 movrel bias below).
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
// 16-wide, 6-tap vertical-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, x6 = y index.
// NOTE(review): excerpt is missing the filter load of v0.8h from [x6] and
// the row loop (`subs w4`, `b.ne`) of the original; ret/endfunc restored.
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst
        movrel          x17, subpel_filters, -16
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        // seven input rows, 8+8 bytes each
        ld1             {v1.1d - v2.1d},   [x2], x3
        ld1             {v3.1d - v4.1d},   [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2,  x2,  x3, lsl #2    // rewind for the next iteration
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 // left 8 columns
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 // right 8 columns
        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        ret
endfunc
// 16-wide, 6-tap horizontal-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, x5 = x index.
// NOTE(review): excerpt is missing the `sub x2, x2, #2`, the filter load of
// v0.8h from [x5] and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel16_h6_neon, export=1
        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5, lsl #4    // &subpel_filters[x - 1]
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x0], x1
        ret
endfunc
// 16-wide, 6-tap horizontal + 6-tap vertical subpel MC via an intermediate
// buffer addressed through x7.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h,
// x5 = x index, x6 = y index.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x7,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x16, x17, x5, lsl #4    // &subpel_filters[x - 1]
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16     // x7 = temp buffer, presumably set up on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        ld1             {v1.8b - v4.8b},   [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7], #32
        ld1             {v24.8b - v25.8b}, [x7]
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        trn1            v1.2d, v1.2d, v2.2d     // reassemble 16-wide rows
        trn1            v3.2d, v3.2d, v4.2d
        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        ret
endfunc
// 8-wide, 6-tap vertical-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w6 = y index.
// NOTE(review): excerpt is missing the filter load of v0.8h from [x6], the
// load of the 7th row into v28 and the row loop of the original;
// ret/endfunc restored.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
        ld1             {v4.8b}, [x2], x3
        ld1             {v5.8b}, [x2], x3
        ld1             {v6.8b}, [x2], x3
        ld1             {v7.8b}, [x2], x3
        sub             x2,  x2,  x3, lsl #2    // rewind for the next iteration
        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 // v28 = 7th row (load missing from excerpt)
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        ret
endfunc
// 8-wide, 6-tap horizontal-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w5 = x index.
// NOTE(review): excerpt is missing the `sub x2, x2, #2`, the filter load of
// v0.8h from [x5] and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel8_h6_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.8b}, [x0], x1
        ret
endfunc
// 8-wide, 6-tap horizontal + 6-tap vertical subpel MC via a temp buffer
// addressed through x7. x16 = h+5 intermediate rows for the first pass.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x7,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5, lsl #4    // &subpel_filters[x - 1]
        add             x16, x4,  #5            // h + 5 intermediate rows
        ld1             {v1.8b, v2.8b}, [x2], x3
        vp8_epel8_h6    v1,  v1,  v2
        st1             {v1.8b}, [x7], #8       // x7 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]
        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        ret
endfunc
// 8-wide, 4-tap vertical-only subpel MC, two rows per iteration.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w6 = y index.
// NOTE(review): excerpt is missing the initial `sub x2, x2, x3`, the filter
// load of v0.8h from [x6], the load of the 5th row into v6 and the row loop
// of the original; ret/endfunc restored.
function ff_put_vp8_epel8_v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
        ld1             {v4.8b}, [x2], x3
        ld1             {v5.8b}, [x2], x3
        sub             x2,  x2,  x3, lsl #1    // rewind for the next iteration
        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6  // v6 = 5th row (load missing from excerpt)
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        ret
endfunc
// 8-wide, 4-tap horizontal-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w5 = x index.
// NOTE(review): excerpt is missing the `sub x2, x2, #1`, the filter load of
// v0.8h from [x5] and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel8_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h4    v2,  v2,  v3
        st1             {v2.8b}, [x0], x1
        ret
endfunc
// 8-wide, 4-tap horizontal + 6-tap vertical subpel MC via a temp buffer
// addressed through x7. x16 = h+5 intermediate rows for the first pass.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x7,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5, lsl #4    // &subpel_filters[x - 1]
        add             x16, x4,  #5            // h + 5 intermediate rows
        ld1             {v1.8b, v2.8b}, [x2], x3
        vp8_epel8_h4    v1,  v1,  v2
        st1             {v1.8b}, [x7], #8       // x7 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]
        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        ret
endfunc
// 8-wide, 4-tap horizontal + 4-tap vertical subpel MC via a temp buffer
// addressed through x7. x16 = h+3 intermediate rows for the first pass.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x7,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel8_h4v4_neon, export=1
        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5, lsl #4    // &subpel_filters[x - 1]
        add             x16, x4,  #3            // h + 3 intermediate rows
        ld1             {v1.8b, v2.8b}, [x2], x3
        vp8_epel8_h4    v1,  v1,  v2
        st1             {v1.8b}, [x7], #8       // x7 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]
        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        ret
endfunc
// 8-wide, 6-tap horizontal + 4-tap vertical subpel MC via a temp buffer
// addressed through x7. x16 = h+3 intermediate rows for the first pass.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x7,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel8_h6v4_neon, export=1
        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5, lsl #4    // &subpel_filters[x - 1]
        add             x16, x4,  #3            // h + 3 intermediate rows
        ld1             {v1.8b, v2.8b}, [x2], x3
        vp8_epel8_h6    v1,  v1,  v2
        st1             {v1.8b}, [x7], #8       // x7 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x17, x6, lsl #4    // &subpel_filters[y - 1]
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]
        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        ret
endfunc
// 4-wide, 6-tap vertical-only subpel MC; two 4-pixel columns are packed
// into each 2s vector so two output rows are computed per call.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w6 = y index.
// NOTE(review): excerpt is missing the filter load of v0.8h from [x6], the
// ld1r of v28 and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        // rows for the first output row pair, broadcast into lane 0
        ld1r            {v2.2s}, [x2], x3
        ld1r            {v3.2s}, [x2], x3
        ld1r            {v4.2s}, [x2], x3
        ld1r            {v5.2s}, [x2], x3
        ld1r            {v6.2s}, [x2], x3
        ld1r            {v7.2s}, [x2], x3
        sub             x2,  x2,  x3, lsl #2
        // overlapping rows for the second pair into lane 1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        ld1             {v6.s}[1], [x2], x3
        ld1             {v7.s}[1], [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3, lsl #2
        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        ret
endfunc
// 4-wide, 6-tap horizontal-only subpel MC (computes 8, stores 4).
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w5 = x index.
// NOTE(review): excerpt is missing the `sub x2, x2, #2`, the filter load of
// v0.8h from [x5] and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel4_h6_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1
        ret
endfunc
// 4-wide, 6-tap horizontal + 6-tap vertical subpel MC via a temp buffer
// addressed through x9; intermediate rows are interleaved with trn1/trn2
// so the vertical pass handles two rows at once.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x9,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        // first pass (horizontal) into the temp buffer
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4     // x9 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},  [x9], #8
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},  [x9], #8
        ld1             {v28.s}[1], [x9]
        // deinterleave the packed 4-pixel rows into row pairs
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        ret
endfunc
// 4-wide, 4-tap horizontal + 6-tap vertical subpel MC via a temp buffer
// addressed through x9; same interleaved second pass as epel4_h6v6.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x9,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3, lsl #1    // start 2 rows above dst
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        // first pass (horizontal) into the temp buffer
        ld1             {v2.8b}, [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x9], #4     // x9 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},  [x9], #8
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},  [x9], #8
        ld1             {v28.s}[1], [x9]
        // deinterleave the packed 4-pixel rows into row pairs
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        ret
endfunc
// 4-wide, 6-tap horizontal + 4-tap vertical subpel MC via a temp buffer
// addressed through x9.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x9,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored.
function ff_put_vp8_epel4_h6v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        // first pass (horizontal) into the temp buffer
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4     // x9 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v4.8b, v5.8b}, [x9], #16
        // deinterleave the packed 4-pixel rows into row pairs
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        ret
endfunc
// 4-wide, 4-tap horizontal-only subpel MC.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w5 = x index.
// NOTE(review): excerpt is missing the `sub x2, x2, #1`, the filter load of
// v0.8h from [x5] and the row loop of the original; ret/endfunc restored.
function ff_put_vp8_epel4_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        ld1             {v2.8b}, [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x0], x1
        ret
endfunc
// 4-wide, 4-tap vertical-only subpel MC; two row pairs are packed per
// vector, so four output rows are stored per iteration.
// x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h, w6 = y index.
// NOTE(review): excerpt is missing the initial `sub x2, x2, x3`, the filter
// load of v0.8h from [x6], the loads feeding v6 and the row loop of the
// original; ret/endfunc restored.
function ff_put_vp8_epel4_v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1r            {v2.2s}, [x2], x3
        ld1r            {v3.2s}, [x2], x3
        ld1r            {v4.2s}, [x2], x3
        ld1r            {v5.2s}, [x2], x3
        sub             x2,  x2,  x3, lsl #1
        // overlapping rows for the second pair into lane 1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        sub             x2,  x2,  x3, lsl #1
        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6  // v6 = 5th row (load missing from excerpt)
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[2], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v2.s}[3], [x0], x1
        ret
endfunc
// 4-wide, 4-tap horizontal + 4-tap vertical subpel MC via a temp buffer
// addressed through x9.
// NOTE(review): excerpt is missing the stack allocation/teardown behind x9,
// the filter loads of v0.8h, and both pass loops of the original;
// ret/endfunc restored. The `vp8_epel8_h4 v2, v2, v3` with v3 not loaded
// here also suggests a missing load in this excerpt.
function ff_put_vp8_epel4_h4v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // &subpel_filters[x - 1]
        // first pass (horizontal) into the temp buffer
        ld1             {v2.8b}, [x2], x3
        vp8_epel8_h4    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4     // x9 = temp buffer, presumably on the stack — TODO confirm

        // second pass (vertical):
        add             x6,  x7,  w6, uxtw #4   // &subpel_filters[y - 1]
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v4.8b, v5.8b}, [x9], #16
        // deinterleave the packed 4-pixel rows into row pairs
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        ret
endfunc
// 16-wide horizontal bilinear MC: dst = (16-mx)*src[x] + mx*src[x+1],
// rounded and shifted by 3. v1 = weight for src[x], v0 = weight for
// src[x+1] — their dup setup from w5 is missing from this excerpt.
// NOTE(review): excerpt is also missing the row loop (`subs w4, w4, #2`,
// `b.gt`); ret/endfunc restored — as written this does 2 rows.
function ff_put_vp8_bilin16_h_neon, export=1
        ld1             {v2.8b, v3.8b, v4.8b}, [x2], x3 // 17+ source bytes of row 0
        ext             v5.8b,  v3.8b,  v4.8b, #1       // src[x+1], high half
        ext             v4.8b,  v2.8b,  v3.8b, #1       // src[x+1], low half
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        ld1             {v18.8b, v19.8b, v20.8b}, [x2], x3 // row 1
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v4.8b,  v16.8h, #3              // round and narrow
        rshrn2          v4.16b, v6.8h,  #3
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        ret
endfunc
// 16-wide vertical bilinear MC: dst = (8-my)*row[y] + my*row[y+1],
// rounded and shifted by 3. v1 = weight for row[y], v0 = weight for
// row[y+1] — their dup setup from w6 is missing from this excerpt.
// NOTE(review): excerpt is also missing the row loop; ret/endfunc
// restored — as written this does 2 rows.
function ff_put_vp8_bilin16_v_neon, export=1
        ld1             {v2.16b}, [x2], x3      // row 0
        ld1             {v4.16b}, [x2], x3      // row 1
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3      // row 2
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        rshrn           v4.8b,  v6.8h,  #3      // round and narrow
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        ret
endfunc
1564 function ff_put_vp8_bilin16_hv_neon, export=1
1569 dup v2.16b, w6 // my
1573 ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3
1575 ext v7.8b, v5.8b, v6.8b, #1
1576 ext v6.8b, v4.8b, v5.8b, #1
1577 umull v16.8h, v4.8b, v1.8b
1578 umlal v16.8h, v6.8b, v0.8b
1579 umull v18.8h, v5.8b, v1.8b
1580 umlal v18.8h, v7.8b, v0.8b
1581 rshrn v4.8b, v16.8h, #3
1582 rshrn2 v4.16b, v18.8h, #3
1585 ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
1586 ext v21.8b, v19.8b, v20.8b, #1
1587 ext v20.8b, v18.8b, v19.8b, #1
1588 umull v22.8h, v18.8b, v1.8b
1589 umlal v22.8h, v20.8b, v0.8b
1590 ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3
1591 umull v24.8h, v19.8b, v1.8b
1592 umlal v24.8h, v21.8b, v0.8b
1593 ext v29.8b, v27.8b, v28.8b, #1
1594 ext v28.8b, v26.8b, v27.8b, #1
1595 umull v16.8h, v26.8b, v1.8b
1596 umlal v16.8h, v28.8b, v0.8b
1597 umull v18.8h, v27.8b, v1.8b
1598 umlal v18.8h, v29.8b, v0.8b
1599 rshrn v6.8b, v22.8h, #3
1600 rshrn2 v6.16b, v24.8h, #3
1601 umull v24.8h, v4.8b, v3.8b
1602 umlal v24.8h, v6.8b, v2.8b
1603 umull2 v30.8h, v4.16b, v3.16b
1604 umlal2 v30.8h, v6.16b, v2.16b
1605 rshrn v4.8b, v16.8h, #3
1606 rshrn2 v4.16b, v18.8h, #3
1607 umull v20.8h, v6.8b, v3.8b
1608 umlal v20.8h, v4.8b, v2.8b
1609 umull2 v22.8h, v6.16b, v3.16b
1610 umlal2 v22.8h, v4.16b, v2.16b
1611 rshrn v24.8b, v24.8h, #3
1612 rshrn2 v24.16b, v30.8h, #3
1613 st1 {v24.16b}, [x0], x1
1614 rshrn v20.8b, v20.8h, #3
1615 rshrn2 v20.16b, v22.8h, #3
1616 st1 {v20.16b}, [x0], x1
1622 function ff_put_vp8_bilin8_h_neon, export=1
1629 ld1 {v2.8b,v3.8b}, [x2], x3
1630 ext v3.8b, v2.8b, v3.8b, #1
1631 umull v4.8h, v2.8b, v1.8b
1632 umlal v4.8h, v3.8b, v0.8b
1633 ld1 {v6.8b,v7.8b}, [x2], x3
1634 ext v7.8b, v6.8b, v7.8b, #1
1635 umull v16.8h, v6.8b, v1.8b
1636 umlal v16.8h, v7.8b, v0.8b
1637 rshrn v4.8b, v4.8h, #3
1638 rshrn v16.8b, v16.8h, #3
1639 st1 {v4.8b}, [x0], x1
1640 st1 {v16.8b}, [x0], x1
1646 function ff_put_vp8_bilin8_v_neon, export=1
1652 ld1 {v2.8b}, [x2], x3
1655 ld1 {v3.8b}, [x2], x3
1656 umull v4.8h, v2.8b, v1.8b
1657 umlal v4.8h, v3.8b, v0.8b
1658 ld1 {v2.8b}, [x2], x3
1659 umull v6.8h, v3.8b, v1.8b
1660 umlal v6.8h, v2.8b, v0.8b
1661 rshrn v4.8b, v4.8h, #3
1662 rshrn v6.8b, v6.8h, #3
1663 st1 {v4.8b}, [x0], x1
1664 st1 {v6.8b}, [x0], x1
1670 function ff_put_vp8_bilin8_hv_neon, export=1
1679 ld1 {v4.8b,v5.8b}, [x2], x3
1680 ext v5.8b, v4.8b, v5.8b, #1
1681 umull v18.8h, v4.8b, v1.8b
1682 umlal v18.8h, v5.8b, v0.8b
1683 rshrn v22.8b, v18.8h, #3
1686 ld1 {v6.8b,v7.8b}, [x2], x3
1687 ext v7.8b, v6.8b, v7.8b, #1
1688 umull v16.8h, v6.8b, v1.8b
1689 umlal v16.8h, v7.8b, v0.8b
1690 ld1 {v4.8b,v5.8b}, [x2], x3
1691 ext v5.8b, v4.8b, v5.8b, #1
1692 umull v18.8h, v4.8b, v1.8b
1693 umlal v18.8h, v5.8b, v0.8b
1694 rshrn v16.8b, v16.8h, #3
1695 umull v20.8h, v22.8b, v3.8b
1696 umlal v20.8h, v16.8b, v2.8b
1697 rshrn v22.8b, v18.8h, #3
1698 umull v24.8h, v16.8b, v3.8b
1699 umlal v24.8h, v22.8b, v2.8b
1700 rshrn v20.8b, v20.8h, #3
1701 st1 {v20.8b}, [x0], x1
1702 rshrn v23.8b, v24.8h, #3
1703 st1 {v23.8b}, [x0], x1
1709 function ff_put_vp8_bilin4_h_neon, export=1
1716 ld1 {v2.8b}, [x2], x3
1717 ext v3.8b, v2.8b, v3.8b, #1
1718 ld1 {v6.8b}, [x2], x3
1719 ext v7.8b, v6.8b, v7.8b, #1
1720 trn1 v2.2s, v2.2s, v6.2s
1721 trn1 v3.2s, v3.2s, v7.2s
1722 umull v4.8h, v2.8b, v1.8b
1723 umlal v4.8h, v3.8b, v0.8b
1724 rshrn v4.8b, v4.8h, #3
1725 st1 {v4.s}[0], [x0], x1
1726 st1 {v4.s}[1], [x0], x1
1732 function ff_put_vp8_bilin4_v_neon, export=1
1738 ld1r {v2.2s}, [x2], x3
1741 ld1 {v2.s}[1], [x2], x3
1742 ld1 {v3.s}[1], [x2], x3
1743 umull v4.8h, v2.8b, v1.8b
1744 umlal v4.8h, v3.8b, v0.8b
1745 trn2 v2.2s, v3.2s, v2.2s
1746 rshrn v4.8b, v4.8h, #3
1747 st1 {v4.s}[0], [x0], x1
1748 st1 {v4.s}[1], [x0], x1
1755 function ff_put_vp8_bilin4_hv_neon, export=1
1764 ld1 {v4.8b}, [x2], x3
1765 ext v5.8b, v4.8b, v4.8b, #1
1766 umull v18.8h, v4.8b, v1.8b
1767 umlal v18.8h, v5.8b, v0.8b
1768 rshrn v22.8b, v18.8h, #3
1771 ld1 {v6.8b}, [x2], x3
1772 ext v7.8b, v6.8b, v6.8b, #1
1773 ld1 {v4.8b}, [x2], x3
1774 ext v5.8b, v4.8b, v4.8b, #1
1775 trn1 v6.2s, v6.2s, v4.2s
1776 trn1 v7.2s, v7.2s, v5.2s
1777 umull v16.8h, v6.8b, v1.8b
1778 umlal v16.8h, v7.8b, v0.8b
1779 rshrn v16.8b, v16.8h, #3
1780 umull v20.8h, v16.8b, v2.8b
1781 trn1 v22.2s, v22.2s, v16.2s
1782 umlal v20.8h, v22.8b, v3.8b
1783 rev64 v22.2s, v16.2s
1784 rshrn v20.8b, v20.8h, #3
1785 st1 {v20.s}[0], [x0], x1
1786 st1 {v20.s}[1], [x0], x1