2 * VP8 NEON optimisations
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
7 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
9 * This file is part of FFmpeg.
11 * FFmpeg is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
16 * FFmpeg is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with FFmpeg; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 #include "libavutil/aarch64/asm.S"
// Inverse Walsh-Hadamard transform of the 16 luma DC coefficients:
// one 4x4 butterfly pass, a transpose, a second butterfly pass, then
// the 16 results are scattered out through x0 with stride x3.
// In: x0 = output block, x1 = dc coeffs (16 x int16), x3 = out stride.
// NOTE(review): embedded line numbers are discontinuous (31-32, 38,
// 43-45, 47, 49, 58-64 absent) — the setup that zeroes v30 (stored to
// the dc buffer below) and prepares the rounding term v16, plus the
// final shift/ret/endfunc, are not visible in this chunk; confirm
// against the full file.
29 function ff_vp8_luma_dc_wht_neon, export=1
30 ld1 {v0.4h - v3.4h}, [x1]              // load dc[0..15] as 4 rows
33 add v4.4h, v0.4h, v3.4h                // pass 1: t0 = r0 + r3
34 add v6.4h, v1.4h, v2.4h                // t1 = r1 + r2
35 st1 {v30.8h}, [x1], #16                // clear dc[0..7] (v30 presumably zeroed in a missing line — confirm)
36 sub v7.4h, v1.4h, v2.4h                // t3 = r1 - r2
37 sub v5.4h, v0.4h, v3.4h                // t2 = r0 - r3
39 add v0.4h, v4.4h, v6.4h
40 add v1.4h, v5.4h, v7.4h
41 sub v2.4h, v4.4h, v6.4h
42 sub v3.4h, v5.4h, v7.4h
46 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
48 add v0.4h, v0.4h, v16.4h               // add rounding bias (v16 set up in missing lines — confirm)
50 add v4.4h, v0.4h, v3.4h                // pass 2: same butterfly structure
51 add v6.4h, v1.4h, v2.4h
52 sub v7.4h, v1.4h, v2.4h
53 sub v5.4h, v0.4h, v3.4h
54 add v0.4h, v4.4h, v6.4h
55 add v1.4h, v5.4h, v7.4h
56 sub v2.4h, v4.4h, v6.4h
57 sub v3.4h, v5.4h, v7.4h
// Scatter one halfword per destination position, column-major over
// the transposed result; each store advances x0 by x3.
65 st1 {v0.h}[0], [x0], x3
66 st1 {v1.h}[0], [x0], x3
67 st1 {v2.h}[0], [x0], x3
68 st1 {v3.h}[0], [x0], x3
69 st1 {v0.h}[1], [x0], x3
70 st1 {v1.h}[1], [x0], x3
71 st1 {v2.h}[1], [x0], x3
72 st1 {v3.h}[1], [x0], x3
73 st1 {v0.h}[2], [x0], x3
74 st1 {v1.h}[2], [x0], x3
75 st1 {v2.h}[2], [x0], x3
76 st1 {v3.h}[2], [x0], x3
77 st1 {v0.h}[3], [x0], x3
78 st1 {v1.h}[3], [x0], x3
79 st1 {v2.h}[3], [x0], x3
80 st1 {v3.h}[3], [x0], x3
// 4x4 inverse DCT + add to predictor.
// In: x0 = dst, x1 = coeff block (int16), x2 = dst stride.
// Two 1-D passes (columns, transpose, rows) using the VP8 constants;
// the smull/sqshrn pair computes (c * 20091) >> 16 and the sqdmulh
// computes (c * 35468) >> 16 via the 35468/2 doubling-multiply trick.
// NOTE(review): the low half of w4 (presumably mov w4, #20091) and the
// dup of w4 into v4, plus the zeroing of v29 stored back to the coeff
// buffer, are in lines missing from this chunk — confirm against the
// full file.
85 function ff_vp8_idct_add_neon, export=1
86 ld1 {v0.8b - v3.8b}, [x1]              // load 16 coefficients
88 movk w4, #35468/2, lsl #16             // w4 high half = 35468/2 (low half set in a missing line)
91 smull v26.4s, v1.4h, v4.h[0]           // row1 * 20091
92 smull v27.4s, v3.4h, v4.h[0]           // row3 * 20091
93 sqdmulh v20.4h, v1.4h, v4.h[1]         // row1 * 35468 >> 16
94 sqdmulh v23.4h, v3.4h, v4.h[1]         // row3 * 35468 >> 16
95 sqshrn v21.4h, v26.4s, #16
96 sqshrn v22.4h, v27.4s, #16
97 add v21.4h, v21.4h, v1.4h              // row1 * (1 + 20091/65536)
98 add v22.4h, v22.4h, v3.4h
100 add v16.4h, v0.4h, v2.4h              // t0 = row0 + row2
101 sub v17.4h, v0.4h, v2.4h              // t1 = row0 - row2
103 add v18.4h, v21.4h, v23.4h            // t2
104 sub v19.4h, v20.4h, v22.4h            // t3
106 add v0.4h, v16.4h, v18.4h
107 add v1.4h, v17.4h, v19.4h
108 sub v3.4h, v16.4h, v18.4h
109 sub v2.4h, v17.4h, v19.4h
111 transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7
// Second pass, interleaved with loading the 4 predictor rows and
// clearing the coefficient buffer.
114 smull v26.4s, v1.4h, v4.h[0]
115 st1 {v29.8h}, [x1], #16               // clear coeffs (v29 presumably zeroed in a missing line — confirm)
116 smull v27.4s, v3.4h, v4.h[0]
118 sqdmulh v21.4h, v1.4h, v4.h[1]
119 sqdmulh v23.4h, v3.4h, v4.h[1]
120 sqshrn v20.4h, v26.4s, #16
121 sqshrn v22.4h, v27.4s, #16
122 add v20.4h, v20.4h, v1.4h
123 add v22.4h, v22.4h, v3.4h
124 add v16.4h, v0.4h, v2.4h
125 sub v17.4h, v0.4h, v2.4h
127 add v18.4h, v20.4h, v23.4h
128 ld1 {v24.d}[0], [x0], x2              // predictor rows, interleaved
129 zip1 v16.2d, v16.2d, v17.2d
130 sub v19.4h, v21.4h, v22.4h
131 ld1 {v25.d}[0], [x0], x2
132 zip1 v18.2d, v18.2d, v19.2d
133 add v0.8h, v16.8h, v18.8h
134 ld1 {v25.d}[1], [x0], x2
135 sub v1.8h, v16.8h, v18.8h
136 ld1 {v24.d}[1], [x0], x2
137 srshr v0.8h, v0.8h, #3               // (x + 4) >> 3 rounding
138 trn1 v24.4s, v24.4s, v25.4s
139 srshr v1.8h, v1.8h, #3
140 sub x0, x0, x2, lsl #2               // rewind dst 4 rows
// Reshuffle the transform output back into row order.
142 ext v1.16b, v1.16b, v1.16b, #8
143 trn1 v3.2d, v0.2d, v1.2d
144 trn2 v0.2d, v0.2d, v1.2d
145 trn1 v1.8h, v3.8h, v0.8h
146 trn2 v3.8h, v3.8h, v0.8h
147 uzp1 v0.4s, v1.4s, v3.4s
148 uzp2 v1.4s, v3.4s, v1.4s
150 uaddw v0.8h, v0.8h, v24.8b           // add predictor
151 uaddw2 v1.8h, v1.8h, v24.16b
153 sqxtun2 v0.16b, v1.8h                // saturate to u8 (low-half sqxtun in a missing line — confirm)
154 st1 {v0.s}[0], [x0], x2
155 st1 {v0.s}[1], [x0], x2
156 st1 {v0.s}[3], [x0], x2              // rows 2/3 swapped to match the trn shuffle above
157 st1 {v0.s}[2], [x0], x2
// DC-only IDCT + add for four 4x4 chroma blocks (8x8 region).
// x0 = dst, x1 = coeff blocks, x2 = stride. The DC of each block is
// rounded ((dc+4)>>3) and added to 8 rows of 8 predictor pixels.
// NOTE(review): lines loading the four DCs into v16-v19, the zeroing
// of v0 stored back to [x1], the mov of the dst pointer into x3, and
// the sqxtun of v21/v23/v25/v27 are all in gaps of the embedded
// numbering — confirm against the full file. In particular the stores
// below go through x3, which must have been set to x0 in a missing
// line before x3 stops being the coeff stride.
162 function ff_vp8_idct_dc_add4uv_neon, export=1
166 st1 {v0.h}[0], [x1], x3               // clear each block's DC slot
168 st1 {v0.h}[0], [x1], x3
170 st1 {v0.h}[0], [x1], x3
172 st1 {v0.h}[0], [x1], x3
173 ins v16.d[1], v17.d[0]                // pack DC pairs: blocks 0|1
174 ins v18.d[1], v19.d[0]                // blocks 2|3
176 srshr v16.8h, v16.8h, #3 // dc >>= 3
177 ld1 {v0.8b}, [x0], x2
178 srshr v18.8h, v18.8h, #3
179 ld1 {v1.8b}, [x0], x2
180 uaddw v20.8h, v16.8h, v0.8b           // rows 0-3 use blocks 0|1 DCs
181 ld1 {v2.8b}, [x0], x2
182 uaddw v0.8h, v16.8h, v1.8b
183 ld1 {v3.8b}, [x0], x2
184 uaddw v22.8h, v16.8h, v2.8b
185 ld1 {v4.8b}, [x0], x2
186 uaddw v2.8h, v16.8h, v3.8b
187 ld1 {v5.8b}, [x0], x2
188 uaddw v24.8h, v18.8h, v4.8b           // rows 4-7 use blocks 2|3 DCs
189 ld1 {v6.8b}, [x0], x2
190 uaddw v4.8h, v18.8h, v5.8b
191 ld1 {v7.8b}, [x0], x2
192 uaddw v26.8h, v18.8h, v6.8b
193 sqxtun v20.8b, v20.8h                 // saturate back to u8
194 uaddw v6.8h, v18.8h, v7.8b
196 sqxtun v22.8b, v22.8h
197 st1 {v20.8b}, [x3], x2
199 st1 {v21.8b}, [x3], x2                // v21/v23/v25/v27 narrowed in missing lines — confirm
200 sqxtun v24.8b, v24.8h
201 st1 {v22.8b}, [x3], x2
203 st1 {v23.8b}, [x3], x2
204 sqxtun v26.8b, v26.8h
205 st1 {v24.8b}, [x3], x2
207 st1 {v25.8b}, [x3], x2
208 st1 {v26.8b}, [x3], x2
209 st1 {v27.8b}, [x3], x2
// DC-only IDCT + add for four 4x4 luma blocks laid out side by side
// (16x4 region). x0 = dst, x1 = coeff blocks, x2 = stride.
// NOTE(review): the loads of the four DCs into v16-v19 and the
// zeroing of v0 used for the DC-clearing stores are in gaps of the
// embedded numbering — confirm against the full file.
214 function ff_vp8_idct_dc_add4y_neon, export=1
218 st1 {v0.h}[0], [x1], x3               // clear each block's DC slot
220 st1 {v0.h}[0], [x1], x3
221 zip1 v16.2d, v16.2d, v17.2d           // pack DCs of blocks 0|1
223 st1 {v0.h}[0], [x1], x3
225 st1 {v0.h}[0], [x1], x3
226 zip1 v18.2d, v18.2d, v19.2d           // pack DCs of blocks 2|3
227 srshr v16.8h, v16.8h, #3 // dc >>= 3
228 ld1 {v0.16b}, [x0], x2                // 4 predictor rows of 16 px
229 srshr v18.8h, v18.8h, #3
230 ld1 {v1.16b}, [x0], x2
231 uaddw v20.8h, v16.8h, v0.8b           // left 8 px + blocks 0|1 DC
232 ld1 {v2.16b}, [x0], x2
233 uaddw2 v0.8h, v18.8h, v0.16b          // right 8 px + blocks 2|3 DC
234 ld1 {v3.16b}, [x0], x2
235 uaddw v21.8h, v16.8h, v1.8b
236 uaddw2 v1.8h, v18.8h, v1.16b
237 uaddw v22.8h, v16.8h, v2.8b
238 uaddw2 v2.8h, v18.8h, v2.16b
239 uaddw v23.8h, v16.8h, v3.8b
240 uaddw2 v3.8h, v18.8h, v3.16b
241 sub x0, x0, x2, lsl #2                // rewind dst 4 rows
242 sqxtun v20.8b, v20.8h                 // saturate back to u8
243 sqxtun2 v20.16b, v0.8h
244 sqxtun v21.8b, v21.8h
245 sqxtun2 v21.16b, v1.8h
246 sqxtun v22.8b, v22.8h
247 st1 {v20.16b}, [x0], x2
248 sqxtun2 v22.16b, v2.8h
249 st1 {v21.16b}, [x0], x2
250 sqxtun v23.8b, v23.8h
251 st1 {v22.16b}, [x0], x2
252 sqxtun2 v23.16b, v3.8h
253 st1 {v23.16b}, [x0], x2
// DC-only IDCT + add for a single 4x4 block.
// x0 = dst, x1 = coeff block, x2 = stride.
// NOTE(review): the load of the DC into v2, the clearing of the coeff
// slot, the sqxtun narrowing of v3/v4 back into v0/v1, and ret/endfunc
// are in gaps of the embedded numbering — confirm against the full
// file (without the narrowing, the v0/v1 stores below would write
// stale data).
258 function ff_vp8_idct_dc_add_neon, export=1
262 srshr v2.8h, v2.8h, #3                // dc = (dc + 4) >> 3
263 ld1 {v0.s}[0], [x0], x2               // 4 predictor rows, 4 px each
264 ld1 {v0.s}[1], [x0], x2
265 uaddw v3.8h, v2.8h, v0.8b             // rows 0-1 + dc
266 ld1 {v1.s}[0], [x0], x2
267 ld1 {v1.s}[1], [x0], x2
268 uaddw v4.8h, v2.8h, v1.8b             // rows 2-3 + dc
271 sub x0, x0, x2, lsl #2                // rewind dst 4 rows
272 st1 {v0.s}[0], [x0], x2
273 st1 {v0.s}[1], [x0], x2
274 st1 {v1.s}[0], [x0], x2
275 st1 {v1.s}[1], [x0], x2
// Core VP8 loop filter, operating on 16 pixels at a time.
// Register contract (established by callers): v0..v7 = P3..P0,Q0..Q3;
// v22 = flim_E, v23 = flim_I (both dup'ed by the caller); \hev_thresh
// is a w register dup'ed below. Computes the filter mask and the
// high-edge-variance (hev) mask, then applies the 4-tap common filter
// and, for the non-inner non-simple case, the 6-tap "mbfilter".
// NOTE(review): the embedded numbering has many gaps (286, 292, 294,
// 323-330, 334-335, 342, 344-348, 356, 358, 361-369, 375-376, 387,
// 406, 415-426, 455-456) — the .if \simple/.if \inner conditional
// directives that select between the three filter variants, the movi
// setup of v21 (0x80 sign-flip mask), v20 (the x3 multiplier) and the
// +4/+3 constants in v22/v23, and the closing .endif/.endm are not
// visible here. The three filter tails below (is4tap, !is4tap, and
// mbfilter) would be mutually exclusive under those directives.
285 .macro vp8_loop_filter, inner=0, simple=0, hev_thresh
// simple limit: |P0-Q0|*2 + |P1-Q1|/2 <= flim
287 uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
288 uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
289 uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
290 ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
291 uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
293 cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
295 // calculate hev and normal_limit:
296 uabd v20.16b, v2.16b, v3.16b // abs(P1-P0)
297 uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0)
298 uabd v18.16b, v0.16b, v1.16b // abs(P3-P2)
299 uabd v19.16b, v1.16b, v2.16b // abs(P2-P1)
300 cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I
301 cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I
302 cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I
303 cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I
304 and v16.16b, v17.16b, v16.16b
305 uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2)
306 and v16.16b, v16.16b, v19.16b
307 uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1)
308 and v16.16b, v16.16b, v18.16b
309 cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I
310 cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I
311 uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0)
312 uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1)
313 and v16.16b, v16.16b, v18.16b
314 uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
315 and v16.16b, v16.16b, v19.16b
316 ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2
317 dup v23.16b, \hev_thresh // hev_thresh
318 uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
319 cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh
320 cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
321 cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh
322 and v16.16b, v16.16b, v19.16b // v16 = normal_limit mask
324 orr v17.16b, v20.16b, v22.16b // v17 = hev mask
331 // convert to signed value:
332 eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80 (v21 = 0x80, set in a missing line — confirm)
333 eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
336 ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
337 ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
338 eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
339 eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
340 mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)  (v20 = 3, set in a missing line — confirm)
341 mul v19.8h, v19.8h, v20.8h
343 sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
347 and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
349 saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
350 saddw2 v19.8h, v19.8h, v20.16b
351 sqxtn v18.8b, v18.8h // narrow result back into v18
352 sqxtn2 v18.16b, v19.8h
353 .if !\inner && !\simple
354 eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
355 eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
357 and v18.16b, v18.16b, v16.16b // w &= normal_limit
359 // registers used at this point..
360 // v0 -> P3 (don't corrupt)
362 // v7 -> Q3 (don't corrupt)
368 // v16, v19, v29 -> unused
370 // filter_common: is4tap==1
371 // c1 = clamp(w + 4) >> 3;
372 // c2 = clamp(w + 3) >> 3;
373 // Q0 = s2u(QS0 - c1);
374 // P0 = s2u(PS0 + c2);
// is4tap tail (simple filter path; v22 = +4, v23 = +3 constants are
// set up in missing lines — confirm):
377 sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
378 sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
379 sshr v19.16b, v19.16b, #3 // c1 >>= 3
380 sshr v20.16b, v20.16b, #3 // c2 >>= 3
381 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
382 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
383 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
384 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
385 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
386 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
388 // the !is4tap case of filter_common, only used for inner blocks
389 // c3 = ((c1&~hev) + 1) >> 1;
390 // Q1 = s2u(QS1 - c3);
391 // P1 = s2u(PS1 + c3);
392 sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
393 sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
394 sshr v19.16b, v19.16b, #3 // c1 >>= 3
395 sshr v20.16b, v20.16b, #3 // c2 >>= 3
396 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
397 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
398 bic v19.16b, v19.16b, v17.16b // c1 & ~hev
399 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
400 srshr v19.16b, v19.16b, #1 // c3 >>= 1
401 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
402 sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
403 sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
404 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
405 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
// mbfilter prelude: apply filter_common with w&hev, then run the
// wider 6-tap filter on w&~hev below.
407 and v20.16b, v18.16b, v17.16b // w & hev
408 sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
409 sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
410 sshr v19.16b, v19.16b, #3 // c1 >>= 3
411 sshr v20.16b, v20.16b, #3 // c2 >>= 3
412 bic v18.16b, v18.16b, v17.16b // w &= ~hev
413 sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
414 sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
417 // a = clamp((27*w + 63) >> 7);
418 // Q0 = s2u(QS0 - a);
419 // P0 = s2u(PS0 + a);
420 // a = clamp((18*w + 63) >> 7);
421 // Q1 = s2u(QS1 - a);
422 // P1 = s2u(PS1 + a);
423 // a = clamp((9*w + 63) >> 7);
424 // Q2 = s2u(QS2 - a);
425 // P2 = s2u(PS2 + a);
// Build 9w, 18w, 27w incrementally from w*8 + w (v17 holds the +63
// bias, set up in missing lines — confirm):
427 sshll v22.8h, v18.8b, #3
428 sshll2 v23.8h, v18.16b, #3
429 saddw v22.8h, v22.8h, v18.8b
430 saddw2 v23.8h, v23.8h, v18.16b
431 add v16.8h, v17.8h, v22.8h
432 add v17.8h, v17.8h, v23.8h // 9*w + 63
433 add v19.8h, v16.8h, v22.8h
434 add v20.8h, v17.8h, v23.8h // 18*w + 63
435 add v22.8h, v19.8h, v22.8h
436 add v23.8h, v20.8h, v23.8h // 27*w + 63
437 sqshrn v16.8b, v16.8h, #7
438 sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
439 sqshrn v19.8b, v19.8h, #7
440 sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
441 sqshrn v22.8b, v22.8h, #7
442 sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
443 sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
444 sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
445 sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
446 sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
447 sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
448 sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
449 eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
450 eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
451 eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
452 eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
453 eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
454 eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
// Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
// Emits ff_vp8_v_loop_filter16{,\name}_neon: x0 = dst (points at the
// edge), x1 = stride, w2 = flim_E, w3 = flim_I, w4 = hev_thresh.
// Loads P3..Q3 as whole rows, runs vp8_loop_filter, writes back the
// modified rows.
// NOTE(review): embedded numbering gaps (461-463, 466, 471, 475, 477,
// 482, 484-485, 487, 492, 494-500) — the .if !\simple guards that
// skip P3/P2/Q2/Q3 handling for the simple filter, ret/endfunc/.endm,
// and the base instantiation without suffix are not visible here.
458 .macro vp8_v_loop_filter16 name, inner=0, simple=0
459 function ff_vp8_v_loop_filter16\name\()_neon, export=1
460 sub x0, x0, x1, lsl #1+!\simple       // back up 2 (simple) or 4 rows
464 ld1 {v0.16b}, [x0], x1 // P3
465 ld1 {v1.16b}, [x0], x1 // P2
467 ld1 {v2.16b}, [x0], x1 // P1
468 ld1 {v3.16b}, [x0], x1 // P0
469 ld1 {v4.16b}, [x0], x1 // Q0
470 ld1 {v5.16b}, [x0], x1 // Q1
472 ld1 {v6.16b}, [x0], x1 // Q2
473 ld1 {v7.16b}, [x0] // Q3
474 dup v23.16b, w3 // flim_I
476 dup v22.16b, w2 // flim_E
478 vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
480 // back up to P2: dst -= stride * 6
481 sub x0, x0, x1, lsl #2
483 sub x0, x0, x1, lsl #1
486 st1 {v1.16b}, [x0], x1 // P2
488 st1 {v2.16b}, [x0], x1 // P1
489 st1 {v3.16b}, [x0], x1 // P0
490 st1 {v4.16b}, [x0], x1 // Q0
491 st1 {v5.16b}, [x0], x1 // Q1
493 st1 {v6.16b}, [x0] // Q2
// Instantiations (the unsuffixed full-filter variant is in a missing
// line — confirm):
501 vp8_v_loop_filter16 _inner, inner=1
502 vp8_v_loop_filter16 _simple, simple=1
// Vertical loop filter for chroma: filters the u (x0) and v (x1)
// planes together by packing 8 pixels of each into one 16-byte
// register. x2 = stride, w3 = flim_E, w4 = flim_I, w5 = hev_thresh.
// NOTE(review): embedded numbering gaps (508, 525, 528, 530, 536-538,
// 551-556) — ret/endfunc/.endm and the unsuffixed instantiation are
// not visible here.
504 .macro vp8_v_loop_filter8uv name, inner=0
505 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
506 sub x0, x0, x2, lsl #2                // back up 4 rows on each plane
507 sub x1, x1, x2, lsl #2
509 ld1 {v0.d}[0], [x0], x2 // P3
510 ld1 {v0.d}[1], [x1], x2 // P3
511 ld1 {v1.d}[0], [x0], x2 // P2
512 ld1 {v1.d}[1], [x1], x2 // P2
513 ld1 {v2.d}[0], [x0], x2 // P1
514 ld1 {v2.d}[1], [x1], x2 // P1
515 ld1 {v3.d}[0], [x0], x2 // P0
516 ld1 {v3.d}[1], [x1], x2 // P0
517 ld1 {v4.d}[0], [x0], x2 // Q0
518 ld1 {v4.d}[1], [x1], x2 // Q0
519 ld1 {v5.d}[0], [x0], x2 // Q1
520 ld1 {v5.d}[1], [x1], x2 // Q1
521 ld1 {v6.d}[0], [x0], x2 // Q2
522 ld1 {v6.d}[1], [x1], x2 // Q2
523 ld1 {v7.d}[0], [x0] // Q3
524 ld1 {v7.d}[1], [x1] // Q3
526 dup v22.16b, w3 // flim_E
527 dup v23.16b, w4 // flim_I
529 vp8_loop_filter inner=\inner, hev_thresh=w5
531 // back up to P2: u,v -= stride * 6
532 sub x0, x0, x2, lsl #2
533 sub x1, x1, x2, lsl #2
534 sub x0, x0, x2, lsl #1
535 sub x1, x1, x2, lsl #1
539 st1 {v1.d}[0], [x0], x2 // P2
540 st1 {v1.d}[1], [x1], x2 // P2
541 st1 {v2.d}[0], [x0], x2 // P1
542 st1 {v2.d}[1], [x1], x2 // P1
543 st1 {v3.d}[0], [x0], x2 // P0
544 st1 {v3.d}[1], [x1], x2 // P0
545 st1 {v4.d}[0], [x0], x2 // Q0
546 st1 {v4.d}[1], [x1], x2 // Q0
547 st1 {v5.d}[0], [x0], x2 // Q1
548 st1 {v5.d}[1], [x1], x2 // Q1
549 st1 {v6.d}[0], [x0] // Q2
550 st1 {v6.d}[1], [x1] // Q2
557 vp8_v_loop_filter8uv _inner, inner=1
// Horizontal (vertical-edge) loop filter over 16 rows: loads 8 bytes
// from each of 16 rows, transposes so columns become registers
// P3..Q3, filters, transposes back and stores. x0 = dst, x1 = stride,
// w2 = flim_E, w3 = flim_I, w4 = hev_thresh.
// NOTE(review): embedded numbering gaps (561-563, 580, 582, 584,
// 586-587, 589, 591, 593-594, 610-616) — the initial x0 adjustment
// (back up 4 columns), the final st1 of v7.d[1], ret/endfunc/.endm and
// the unsuffixed instantiation are not visible here.
559 .macro vp8_h_loop_filter16 name, inner=0, simple=0
560 function ff_vp8_h_loop_filter16\name\()_neon, export=1
564 ld1 {v0.d}[0], [x0], x1
565 ld1 {v1.d}[0], [x0], x1
566 ld1 {v2.d}[0], [x0], x1
567 ld1 {v3.d}[0], [x0], x1
568 ld1 {v4.d}[0], [x0], x1
569 ld1 {v5.d}[0], [x0], x1
570 ld1 {v6.d}[0], [x0], x1
571 ld1 {v7.d}[0], [x0], x1
572 ld1 {v0.d}[1], [x0], x1
573 ld1 {v1.d}[1], [x0], x1
574 ld1 {v2.d}[1], [x0], x1
575 ld1 {v3.d}[1], [x0], x1
576 ld1 {v4.d}[1], [x0], x1
577 ld1 {v5.d}[1], [x0], x1
578 ld1 {v6.d}[1], [x0], x1
579 ld1 {v7.d}[1], [x0], x1
581 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
583 dup v22.16b, w2 // flim_E
585 dup v23.16b, w3 // flim_I
588 vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
590 sub x0, x0, x1, lsl #4 // backup 16 rows
592 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
595 st1 {v0.d}[0], [x0], x1
596 st1 {v1.d}[0], [x0], x1
597 st1 {v2.d}[0], [x0], x1
598 st1 {v3.d}[0], [x0], x1
599 st1 {v4.d}[0], [x0], x1
600 st1 {v5.d}[0], [x0], x1
601 st1 {v6.d}[0], [x0], x1
602 st1 {v7.d}[0], [x0], x1
603 st1 {v0.d}[1], [x0], x1
604 st1 {v1.d}[1], [x0], x1
605 st1 {v2.d}[1], [x0], x1
606 st1 {v3.d}[1], [x0], x1
607 st1 {v4.d}[1], [x0], x1
608 st1 {v5.d}[1], [x0], x1
609 st1 {v6.d}[1], [x0], x1
617 vp8_h_loop_filter16 _inner, inner=1
618 vp8_h_loop_filter16 _simple, simple=1
// Horizontal loop filter for chroma: loads 8 bytes per row from the
// u (x0) and v (x1) planes into the two halves of each register,
// transposes, filters, transposes back and stores. x2 = stride,
// w3 = flim_E, w4 = flim_I, w5 = hev_thresh.
// NOTE(review): embedded numbering gaps (622-625, 642, 644, 647, 649,
// 652, 654-655, 670-678) — the initial pointer adjustment, the final
// stores of v7, ret/endfunc/.endm and the unsuffixed instantiation are
// not visible here. The "load u/v" comments on the stores below are
// from the original and really mean "store".
620 .macro vp8_h_loop_filter8uv name, inner=0
621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
626 ld1 {v0.d}[0], [x0], x2 // load u
627 ld1 {v0.d}[1], [x1], x2 // load v
628 ld1 {v1.d}[0], [x0], x2
629 ld1 {v1.d}[1], [x1], x2
630 ld1 {v2.d}[0], [x0], x2
631 ld1 {v2.d}[1], [x1], x2
632 ld1 {v3.d}[0], [x0], x2
633 ld1 {v3.d}[1], [x1], x2
634 ld1 {v4.d}[0], [x0], x2
635 ld1 {v4.d}[1], [x1], x2
636 ld1 {v5.d}[0], [x0], x2
637 ld1 {v5.d}[1], [x1], x2
638 ld1 {v6.d}[0], [x0], x2
639 ld1 {v6.d}[1], [x1], x2
640 ld1 {v7.d}[0], [x0], x2
641 ld1 {v7.d}[1], [x1], x2
643 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
645 dup v22.16b, w3 // flim_E
646 dup v23.16b, w4 // flim_I
648 vp8_loop_filter inner=\inner, hev_thresh=w5
650 sub x0, x0, x2, lsl #3 // backup u 8 rows
651 sub x1, x1, x2, lsl #3 // backup v 8 rows
653 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
656 st1 {v0.d}[0], [x0], x2 // load u
657 st1 {v0.d}[1], [x1], x2 // load v
658 st1 {v1.d}[0], [x0], x2
659 st1 {v1.d}[1], [x1], x2
660 st1 {v2.d}[0], [x0], x2
661 st1 {v2.d}[1], [x1], x2
662 st1 {v3.d}[0], [x0], x2
663 st1 {v3.d}[1], [x1], x2
664 st1 {v4.d}[0], [x0], x2
665 st1 {v4.d}[1], [x1], x2
666 st1 {v5.d}[0], [x0], x2
667 st1 {v5.d}[1], [x1], x2
668 st1 {v6.d}[0], [x0], x2
669 st1 {v6.d}[1], [x1], x2
679 vp8_h_loop_filter8uv _inner, inner=1
// Plain 16xH copy: dst = x0/x1 (ptr/stride), src = x2/x3, h = w4.
// Copies 4 rows per iteration.
// NOTE(review): the loop label, the h-counter decrement/branch and
// ret/endfunc fall in gaps of the embedded numbering (683-684,
// 693-695) — confirm against the full file.
682 function ff_put_vp8_pixels16_neon, export=1
685 ld1 {v0.16b}, [x2], x3
686 ld1 {v1.16b}, [x2], x3
687 ld1 {v2.16b}, [x2], x3
688 ld1 {v3.16b}, [x2], x3
689 st1 {v0.16b}, [x0], x1
690 st1 {v1.16b}, [x0], x1
691 st1 {v2.16b}, [x0], x1
692 st1 {v3.16b}, [x0], x1
// Plain 8xH copy: dst = x0/x1 (ptr/stride), src = x2/x3, h = w4.
// Packs two 8-byte rows per q register, 4 rows per iteration.
// NOTE(review): loop label, counter/branch and ret/endfunc are in
// numbering gaps (698-699, 708-710) — confirm against the full file.
697 function ff_put_vp8_pixels8_neon, export=1
700 ld1 {v0.8b}, [x2], x3
701 ld1 {v0.d}[1], [x2], x3
702 ld1 {v1.8b}, [x2], x3
703 ld1 {v1.d}[1], [x2], x3
704 st1 {v0.8b}, [x0], x1
705 st1 {v0.d}[1], [x0], x1
706 st1 {v1.8b}, [x0], x1
707 st1 {v1.d}[1], [x0], x1
712 /* 4/6-tap 8th-pel MC */
// 6-tap horizontal filter on 8 pixels: \d = u8 result,
// \s0/\s1 = 16 consecutive source bytes. v0.h[0..5] holds the filter
// coefficients; taps 1 and 4 are subtracted (mls) per the VP8 filter
// sign convention, and the result is rounded/narrowed with >> 7.
// NOTE(review): the uxtl lines widening the ext results v22..v26 into
// the v19..v21/v25/v26 16-bit operands used below fall in numbering
// gaps (718, 720, 722, 724, 726) — confirm against the full file.
714 .macro vp8_epel8_h6 d, s0, s1
715 ext v22.8b, \s0\().8b, \s1\().8b, #1  // src+1 .. src+5 windows
716 uxtl v18.8h, \s0\().8b
717 ext v23.8b, \s0\().8b, \s1\().8b, #2
719 ext v24.8b, \s0\().8b, \s1\().8b, #3
721 ext v25.8b, \s0\().8b, \s1\().8b, #4
723 ext v26.8b, \s0\().8b, \s1\().8b, #5
725 mul v21.8h, v21.8h, v0.h[2]           // tap 2 (+)
727 mul v22.8h, v22.8h, v0.h[3]           // tap 3 (+)
728 mls v21.8h, v19.8h, v0.h[1]           // tap 1 (-)
729 mls v22.8h, v25.8h, v0.h[4]           // tap 4 (-)
730 mla v21.8h, v18.8h, v0.h[0]           // tap 0 (+)
731 mla v22.8h, v26.8h, v0.h[5]           // tap 5 (+)
732 sqadd v22.8h, v21.8h, v22.8h          // combine halves with saturation
733 sqrshrun \d\().8b, v22.8h, #7         // round, >>7, saturate to u8
// 6-tap horizontal filter on 16 pixels: \d0 = 16 u8 results,
// \v0/\v1 = 32 consecutive source bytes. Processes the low half in
// v18/v19 and the high half in v3/v22, then narrows both with
// rounding >> 7. v0.h[0..5] holds the filter coefficients.
// NOTE(review): several uxtl widenings (of v17/v18/v19/v20/v21, lines
// 739, 742, 745-746, 748-750) fall in numbering gaps — confirm
// against the full file.
736 .macro vp8_epel16_h6 d0, v0, v1
737 ext v22.16b, \v0\().16b, \v1\().16b, #3
738 ext v23.16b, \v0\().16b, \v1\().16b, #4
740 uxtl2 v22.8h, v22.16b
741 ext v3.16b, \v0\().16b, \v1\().16b, #2
743 uxtl2 v23.8h, v23.16b
744 ext v16.16b, \v0\().16b, \v1\().16b, #1
747 ext v2.16b, \v0\().16b, \v1\().16b, #5
751 uxtl2 v16.8h, v16.16b
752 mul v19.8h, v19.8h, v0.h[3]           // taps applied per VP8 sign convention:
753 mul v18.8h, v18.8h, v0.h[2]           // +, -, +, +, -, + for h[0..5]
754 mul v3.8h, v3.8h, v0.h[2]
755 mul v22.8h, v22.8h, v0.h[3]
756 mls v19.8h, v20.8h, v0.h[4]
757 uxtl v20.8h, \v0\().8b
758 uxtl2 v1.8h, \v0\().16b
759 mls v18.8h, v17.8h, v0.h[1]
760 mls v3.8h, v16.8h, v0.h[1]
761 mls v22.8h, v23.8h, v0.h[4]
762 mla v18.8h, v20.8h, v0.h[0]
763 mla v19.8h, v21.8h, v0.h[5]
764 mla v3.8h, v1.8h, v0.h[0]
765 mla v22.8h, v2.8h, v0.h[5]
766 sqadd v19.8h, v18.8h, v19.8h
767 sqadd v22.8h, v3.8h, v22.8h
768 sqrshrun \d0\().8b, v19.8h, #7        // low 8 results
769 sqrshrun2 \d0\().16b, v22.8h, #7      // high 8 results
// 6-tap vertical filter producing one row of 8 u8 pixels.
// \s0..\s5 = six consecutive source rows as u8 (widened in place to
// 16-bit — the inputs are clobbered); v0.h[0..5] = coefficients.
772 .macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
773 uxtl \s2\().8h, \s2\().8b
774 uxtl \s3\().8h, \s3\().8b
775 uxtl \s1\().8h, \s1\().8b
776 uxtl \s4\().8h, \s4\().8b
777 uxtl \s0\().8h, \s0\().8b
778 uxtl \s5\().8h, \s5\().8b
779 mul \s2\().8h, \s2\().8h, v0.h[2]     // + tap 2
780 mul \s3\().8h, \s3\().8h, v0.h[3]     // + tap 3
781 mls \s2\().8h, \s1\().8h, v0.h[1]     // - tap 1
782 mls \s3\().8h, \s4\().8h, v0.h[4]     // - tap 4
783 mla \s2\().8h, \s0\().8h, v0.h[0]     // + tap 0
784 mla \s3\().8h, \s5\().8h, v0.h[5]     // + tap 5
785 sqadd \s3\().8h, \s2\().8h, \s3\().8h
786 sqrshrun \d0\().8b, \s3\().8h, #7     // round, >>7, saturate to u8
// 6-tap vertical filter producing TWO output rows at once.
// \s0..\s6 = seven consecutive source rows as u8 (clobbered);
// \d0 = filtered row from \s0..\s5, \d1 = filtered row from
// \s1..\s6. v31 is used as scratch for the first row's accumulator.
789 .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
790 uxtl \s0\().8h, \s0\().8b
791 uxtl \s3\().8h, \s3\().8b
792 uxtl \s6\().8h, \s6\().8b
793 uxtl \s1\().8h, \s1\().8b
794 uxtl \s4\().8h, \s4\().8b
795 uxtl \s2\().8h, \s2\().8b
796 uxtl \s5\().8h, \s5\().8b
797 mul \s0\().8h, \s0\().8h, v0.h[0]     // row 0 accumulators: v31 + \s6
798 mul v31.8h , \s3\().8h, v0.h[3]
799 mul \s3\().8h, \s3\().8h, v0.h[2]
800 mul \s6\().8h, \s6\().8h, v0.h[5]
802 mls \s0\().8h, \s1\().8h, v0.h[1]
803 mls v31.8h , \s4\().8h, v0.h[4]
804 mls \s3\().8h, \s2\().8h, v0.h[1]
805 mls \s6\().8h, \s5\().8h, v0.h[4]
807 mla \s0\().8h, \s2\().8h, v0.h[2]
808 mla v31.8h , \s5\().8h, v0.h[5]
809 mla \s3\().8h, \s1\().8h, v0.h[0]
810 mla \s6\().8h, \s4\().8h, v0.h[3]
811 sqadd v31.8h , \s0\().8h, v31.8h
812 sqadd \s6\().8h, \s3\().8h, \s6\().8h
813 sqrshrun \d0\().8b, v31.8h, #7        // first output row
814 sqrshrun \d1\().8b, \s6\().8h, #7     // second output row
// 4-tap horizontal filter on 8 pixels: coefficients v0.h[1..4],
// source windows src+0..src+3 built with ext.
// NOTE(review): the uxtl widenings of v22/v23/v25 into the 16-bit
// operands (v20/v22/v25) fall in numbering gaps (821, 823-824) —
// confirm against the full file.
817 .macro vp8_epel8_h4 d, v0, v1
818 ext v22.8b, \v0\().8b, \v1\().8b, #1
819 uxtl v19.8h, \v0\().8b
820 ext v23.8b, \v0\().8b, \v1\().8b, #2
822 ext v25.8b, \v0\().8b, \v1\().8b, #3
825 mul v20.8h, v20.8h, v0.h[2]           // + tap 2
826 mul v22.8h, v22.8h, v0.h[3]           // + tap 3
827 mls v20.8h, v19.8h, v0.h[1]           // - tap 1
828 mls v22.8h, v25.8h, v0.h[4]           // - tap 4
829 sqadd v22.8h, v20.8h, v22.8h
830 sqrshrun \d\().8b, v22.8h, #7         // round, >>7, saturate to u8
// 4-tap vertical filter producing two rows at once, packed into the
// low/high halves of \d0 (16 bytes). \s0..\s4 = five consecutive
// source rows as u8 (clobbered); coefficients v0.h[1..4].
833 .macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
834 uxtl \s0\().8h, \s0\().8b
835 uxtl \s1\().8h, \s1\().8b
836 uxtl \s2\().8h, \s2\().8b
837 uxtl \s3\().8h, \s3\().8b
838 uxtl \s4\().8h, \s4\().8b
839 mul v21.8h, \s1\().8h, v0.h[2]        // row 0: taps on \s0..\s3
840 mul v23.8h, \s2\().8h, v0.h[3]
841 mul \s2\().8h, \s2\().8h, v0.h[2]     // row 1: taps on \s1..\s4
842 mul v22.8h, \s3\().8h, v0.h[3]
843 mls v21.8h, \s0\().8h, v0.h[1]
844 mls v23.8h, \s3\().8h, v0.h[4]
845 mls \s2\().8h, \s1\().8h, v0.h[1]
846 mls v22.8h, \s4\().8h, v0.h[4]
847 sqadd v21.8h, v21.8h, v23.8h
848 sqadd \s2\().8h, \s2\().8h, v22.8h
849 sqrshrun \d0\().8b, v21.8h, #7        // row 0 into low half
850 sqrshrun2 \d0\().16b, \s2\().8h, #7   // row 1 into high half
854 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
855 // arithmetic can be used to apply filters
// VP8 six-tap subpel filter coefficients for fractional positions
// 1..7 (position 0 is the full-pel case and has no entry, which is
// why callers use "movrel ..., subpel_filters, -16" before indexing
// by position*16). Each row is padded to 8 halfwords (16 bytes).
// Taps 1 and 4 are stored positive and applied with mls.
// NOTE(review): the closing endconst falls in a numbering gap —
// confirm against the full file.
856 const subpel_filters, align=4
857 .short 0, 6, 123, 12, 1, 0, 0, 0
858 .short 2, 11, 108, 36, 8, 1, 0, 0
859 .short 0, 9, 93, 50, 6, 0, 0, 0
860 .short 3, 16, 77, 77, 16, 3, 0, 0
861 .short 0, 6, 50, 93, 9, 0, 0, 0
862 .short 1, 8, 36, 108, 11, 2, 0, 0
863 .short 0, 1, 12, 123, 6, 0, 0, 0
// 16-wide 6-tap vertical subpel MC.
// x0/x1 = dst/stride, x2/x3 = src/stride, x6 = my (filter index).
// Processes two output rows per iteration via vp8_epel8_v6_y2 on the
// left and right 8-pixel halves.
// NOTE(review): the ld1 of the filter row into v0, the loop label,
// h-counter decrement/branch and ret/endfunc fall in numbering gaps
// (868-870, 873-874, 883, 886, 889-892) — confirm against the full
// file.
866 function ff_put_vp8_epel16_v6_neon, export=1
867 sub x2, x2, x3, lsl #1                // back up src 2 rows (filter context)
871 movrel x17, subpel_filters, -16
872 add x6, x17, x6, lsl #4 // y
875 ld1 {v1.1d - v2.1d}, [x2], x3         // 7 rows of 16 px, split 8|8
876 ld1 {v3.1d - v4.1d}, [x2], x3
877 ld1 {v16.1d - v17.1d}, [x2], x3
878 ld1 {v18.1d - v19.1d}, [x2], x3
879 ld1 {v20.1d - v21.1d}, [x2], x3
880 ld1 {v22.1d - v23.1d}, [x2], x3
881 ld1 {v24.1d - v25.1d}, [x2]
882 sub x2, x2, x3, lsl #2                // rewind for the next 2-row step
884 vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
885 vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
887 st1 {v1.1d - v2.1d}, [x0], x1
888 st1 {v3.1d - v4.1d}, [x0], x1
// 16-wide 6-tap horizontal subpel MC.
// x0/x1 = dst/stride, x2/x3 = src/stride, x5 = mx (filter index).
// NOTE(review): the src back-up by 2, the ld1 of the filter into v0,
// the loop label, counter/branch and ret/endfunc fall in numbering
// gaps (896-898, 902-903, 907-911) — confirm against the full file.
895 function ff_put_vp8_epel16_h6_neon, export=1
899 // first pass (horizontal):
900 movrel x17, subpel_filters, -16
901 add x5, x17, x5, lsl #4 // x
904 ld1 {v1.16b, v2.16b}, [x2], x3        // 32 src bytes per row
905 vp8_epel16_h6 v1, v1, v2
906 st1 {v1.16b}, [x0], x1
// 16-wide 6-tap horizontal + 6-tap vertical subpel MC, two passes
// through a temp buffer addressed by x7. x5 = mx, x6 = my.
// NOTE(review): the temp-buffer allocation and x7 setup, the filter
// loads into v0 for each pass, both loop labels/counters/branches,
// and the stack teardown/ret/endfunc fall in numbering gaps (916-917,
// 920, 922-928, 932-935, 937, 939-942, 946-947, 951, 953-957) —
// confirm against the full file.
914 function ff_put_vp8_epel16_h6v6_neon, export=1
915 sub x2, x2, x3, lsl #1                // back up src 2 rows for the vertical context
918 // first pass (horizontal):
919 movrel x17, subpel_filters, -16
921 add x16, x17, x5, lsl #4 // x
929 ld1 {v1.16b, v2.16b}, [x2], x3
930 vp8_epel16_h6 v1, v1, v2
931 st1 {v1.16b}, [x7], #16               // append row to temp buffer
936 // second pass (vertical):
938 add x6, x17, x6, lsl #4 // y
943 ld1 {v1.8b - v4.8b}, [x7], #32        // 6 temp rows, split 8|8
944 ld1 {v16.8b - v19.8b}, [x7], #32
945 ld1 {v20.8b - v23.8b}, [x7]
948 vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22   // left half
949 vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23   // right half
950 trn1 v2.2d, v5.2d, v2.2d              // recombine into one 16-byte row
952 st1 {v2.16b}, [x0], x1
// 8-wide 6-tap vertical subpel MC. x6 = my (filter index).
// NOTE(review): the filter load into v0, the loads of the 7th row
// into v28, the loop label/counter/branch and ret/endfunc fall in
// numbering gaps (962, 965-966, 973-974, 976, 978, 981-984) —
// confirm against the full file (v28 below is otherwise unset).
960 function ff_put_vp8_epel8_v6_neon, export=1
961 sub x2, x2, x3, lsl #1                // back up src 2 rows
963 movrel x7, subpel_filters, -16
964 add x6, x7, w6, uxtw #4               // filter row for my
967 ld1 {v2.8b}, [x2], x3                 // 6 of the 7 context rows
968 ld1 {v3.8b}, [x2], x3
969 ld1 {v4.8b}, [x2], x3
970 ld1 {v5.8b}, [x2], x3
971 ld1 {v6.8b}, [x2], x3
972 ld1 {v7.8b}, [x2], x3
975 sub x2, x2, x3, lsl #2                // rewind for the next 2-row step
977 vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
979 st1 {v2.8b}, [x0], x1
980 st1 {v3.8b}, [x0], x1
// 8-wide 6-tap horizontal subpel MC. x5 = mx (filter index).
// NOTE(review): the src back-up by 2, the filter load into v0, the
// loop label/counter/branch and ret/endfunc fall in numbering gaps
// (988-989, 992-993, 995, 997, 999-1002) — confirm against the full
// file.
987 function ff_put_vp8_epel8_h6_neon, export=1
990 movrel x7, subpel_filters, -16
991 add x5, x7, w5, uxtw #4               // filter row for mx
994 ld1 {v2.8b, v3.8b}, [x2], x3          // 16 src bytes per row
996 vp8_epel8_h6 v2, v2, v3
998 st1 {v2.8b}, [x0], x1
// 8-wide 6-tap horizontal + 6-tap vertical subpel MC, two passes
// through a temp buffer addressed by x7. x5 = mx, x6 = my,
// x16 = h + 5 rows to filter horizontally (vertical context).
// NOTE(review): the temp-buffer/stack setup, the filter loads into
// v0 for each pass, both loop labels/counters/branches, and the
// teardown/ret/endfunc fall in numbering gaps (1007-1009, 1012,
// 1014-1016, 1018-1019, 1021, 1023, 1025-1027, 1029, 1031-1034,
// 1037-1039, 1041, 1044-1048) — confirm against the full file.
1005 function ff_put_vp8_epel8_h6v6_neon, export=1
1006 sub x2, x2, x3, lsl #1               // back up src 2 rows
1010 // first pass (horizontal):
1011 movrel x17, subpel_filters, -16
1013 add x5, x17, x5, lsl #4 // x
1017 add x16, x4, #5 // h
1020 ld1 {v1.8b, v2.8b}, [x2], x3
1022 vp8_epel8_h6 v1, v1, v2
1024 st1 {v1.8b}, [x7], #8                // append row to temp buffer
1028 // second pass (vertical):
1030 add x6, x17, x6, lsl #4 // y
1035 ld1 {v1.8b - v4.8b}, [x7], #32       // 7 temp rows
1036 ld1 {v5.8b - v7.8b}, [x7]
1040 vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
1042 st1 {v1.8b}, [x0], x1
1043 st1 {v2.8b}, [x0], x1
// 8-wide 4-tap vertical subpel MC; two output rows per iteration,
// packed in v2 by vp8_epel8_v4_y2. x6 = my (filter index).
// NOTE(review): the src back-up by 1 row, the filter load into v0,
// the load of the 5th row into v6, the loop label/counter/branch and
// ret/endfunc fall in numbering gaps (1052-1053, 1056-1057, 1062,
// 1064, 1066, 1069-1072) — confirm against the full file (v6 below
// is otherwise unset).
1051 function ff_put_vp8_epel8_v4_neon, export=1
1054 movrel x7, subpel_filters, -16
1055 add x6, x7, w6, uxtw #4              // filter row for my
1058 ld1 {v2.8b}, [x2], x3                // 4 of the 5 context rows
1059 ld1 {v3.8b}, [x2], x3
1060 ld1 {v4.8b}, [x2], x3
1061 ld1 {v5.8b}, [x2], x3
1063 sub x2, x2, x3, lsl #1               // rewind for the next 2-row step
1065 vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
1067 st1 {v2.d}[0], [x0], x1              // packed rows 0 and 1
1068 st1 {v2.d}[1], [x0], x1
// 8-wide 4-tap horizontal subpel MC. x5 = mx (filter index).
// NOTE(review): the filter load into v0, the loop label/counter/
// branch and ret/endfunc fall in numbering gaps (1076-1077,
// 1080-1081, 1083, 1085, 1087-1090) — confirm against the full file.
1075 function ff_put_vp8_epel8_h4_neon, export=1
1078 movrel x7, subpel_filters, -16
1079 add x5, x7, w5, uxtw #4              // filter row for mx
1082 ld1 {v2.8b,v3.8b}, [x2], x3          // 16 src bytes per row
1084 vp8_epel8_h4 v2, v2, v3
1086 st1 {v2.8b}, [x0], x1
// 8-wide 4-tap horizontal + 6-tap vertical subpel MC, two passes
// through a temp buffer addressed by x7. x5 = mx, x6 = my,
// x16 = h + 5 rows for the first pass (6-tap vertical context).
// NOTE(review): temp-buffer/stack setup, per-pass filter loads into
// v0, loop labels/counters/branches and teardown/ret/endfunc fall in
// numbering gaps — confirm against the full file.
1093 function ff_put_vp8_epel8_h4v6_neon, export=1
1094 sub x2, x2, x3, lsl #1               // back up src 2 rows
1098 // first pass (horizontal):
1099 movrel x17, subpel_filters, -16
1101 add x5, x17, x5, lsl #4 // x
1105 add x16, x4, #5 // h
1108 ld1 {v1.8b, v2.8b}, [x2], x3
1110 vp8_epel8_h4 v1, v1, v2
1112 st1 {v1.8b}, [x7], #8                // append row to temp buffer
1116 // second pass (vertical):
1118 add x6, x17, x6, lsl #4 // y
1123 ld1 {v1.8b - v4.8b}, [x7], #32       // 7 temp rows
1124 ld1 {v5.8b - v7.8b}, [x7]
1128 vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7
1130 st1 {v1.8b}, [x0], x1
1131 st1 {v2.8b}, [x0], x1
// 8-wide 4-tap horizontal + 4-tap vertical subpel MC, two passes
// through a temp buffer addressed by x7. x5 = mx, x6 = my,
// x16 = h + 3 rows for the first pass (4-tap vertical context).
// NOTE(review): the src back-up, temp-buffer/stack setup, per-pass
// filter loads into v0, loop labels/counters/branches and
// teardown/ret/endfunc fall in numbering gaps — confirm against the
// full file.
1139 function ff_put_vp8_epel8_h4v4_neon, export=1
1145 // first pass (horizontal):
1146 movrel x17, subpel_filters, -16
1148 add x5, x17, x5, lsl #4 // x
1152 add x16, x4, #3 // h
1155 ld1 {v1.8b, v2.8b}, [x2], x3
1157 vp8_epel8_h4 v1, v1, v2
1159 st1 {v1.8b}, [x7], #8                // append row to temp buffer
1163 // second pass (vertical):
1165 add x6, x17, x6, lsl #4 // y
1170 ld1 {v1.8b - v2.8b}, [x7], #16       // 5 temp rows
1171 ld1 {v3.8b - v5.8b}, [x7]
1173 vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
1175 st1 {v1.d}[0], [x0], x1              // packed rows 0 and 1
1176 st1 {v1.d}[1], [x0], x1
// 8-wide 6-tap horizontal + 4-tap vertical subpel MC, two passes
// through a temp buffer addressed by x7. x5 = mx, x6 = my,
// x16 = h + 3 rows for the first pass (4-tap vertical context).
// NOTE(review): the src back-up, temp-buffer/stack setup, per-pass
// filter loads into v0, loop labels/counters/branches and
// teardown/ret/endfunc fall in numbering gaps — confirm against the
// full file.
1184 function ff_put_vp8_epel8_h6v4_neon, export=1
1190 // first pass (horizontal):
1191 movrel x17, subpel_filters, -16
1193 add x5, x17, x5, lsl #4 // x
1197 add x16, x4, #3 // h
1200 ld1 {v1.8b, v2.8b}, [x2], x3
1202 vp8_epel8_h6 v1, v1, v2
1204 st1 {v1.8b}, [x7], #8                // append row to temp buffer
1208 // second pass (vertical):
1210 add x6, x17, x6, lsl #4 // y
1215 ld1 {v1.8b - v2.8b}, [x7], #16       // 5 temp rows
1216 ld1 {v3.8b - v5.8b}, [x7]
1218 vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
1220 st1 {v1.d}[0], [x0], x1              // packed rows 0 and 1
1221 st1 {v1.d}[1], [x0], x1