/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
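
// ff_vp8_idct_add_neon below implements the 4x4 inverse transform and adds
// the result to the prediction in dst. As a rough scalar sketch of one pass
// (variable names here are illustrative, not part of this file):
//
//   t0 = in[0] + in[2]
//   t1 = in[0] - in[2]
//   t2 = MUL(in[1], 35468) - in[3] - MUL(in[3], 20091)
//   t3 = in[1] + MUL(in[1], 20091) + MUL(in[3], 35468)
//   out[0] = t0 + t3;  out[1] = t1 + t2
//   out[3] = t0 - t3;  out[2] = t1 - t2
//
// with MUL(a, c) = (a * c) >> 16. The multiply by 20091 is done below with
// smull+sqshrn+add; the multiply by 35468 uses sqdmulh on 35468/2, since
// sqdmulh doubles the product and 35468 itself does not fit in a signed
// 16-bit lane.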
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4

        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        sqshrn          v21.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h

        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        movi            v29.8h, #0
        smull           v26.4s, v1.4h,  v4.h[0]
        st1             {v29.8h}, [x1], #16
        smull           v27.4s, v3.4h,  v4.h[0]
        st1             {v29.8h}, [x1]
        sqdmulh         v21.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        sqshrn          v20.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.d}[0], [x0], x2
        zip1            v16.2d, v16.2d, v17.2d
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.d}[0], [x0], x2
        zip1            v18.2d, v18.2d, v19.2d
        add             v0.8h,  v16.8h, v18.8h
        ld1             {v25.d}[1], [x0], x2
        sub             v1.8h,  v16.8h, v18.8h
        ld1             {v24.d}[1], [x0], x2
        srshr           v0.8h,  v0.8h,  #3
        trn1            v24.4s, v24.4s, v25.4s
        srshr           v1.8h,  v1.8h,  #3
        sub             x0,  x0,  x2,  lsl #2

        ext             v1.16b, v1.16b, v1.16b, #8
        trn1            v3.2d,  v0.2d,  v1.2d
        trn2            v0.2d,  v0.2d,  v1.2d
        trn1            v1.8h,  v3.8h,  v0.8h
        trn2            v3.8h,  v3.8h,  v0.8h
        uzp1            v0.4s,  v1.4s,  v3.4s
        uzp2            v1.4s,  v3.4s,  v1.4s

        uaddw           v0.8h,  v0.8h,  v24.8b
        uaddw2          v1.8h,  v1.8h,  v24.16b
        sqxtun          v0.8b,  v0.8h
        sqxtun2         v0.16b, v1.8h

        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[3], [x0], x2
        st1             {v0.s}[2], [x0], x2

        ret
endfunc
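
// The DC-only variants add the single (rounded) DC coefficient of each 4x4
// block to the destination. ff_vp8_idct_dc_add4y_neon processes a row of
// four adjacent luma blocks at once: the four DC values sit 32 bytes apart
// in the coefficient buffer (hence the #32 stride in x3), are zeroed as
// they are read, and are then broadcast across 16-pixel rows.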
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b, #0
        mov             x3,  #32
        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d
        srshr           v16.8h, v16.8h, #3          // dc >>= 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h,  v18.8h, v0.16b
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h,  v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h,  v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h,  v18.8h, v3.16b
        sub             x0,  x0,  x2,  lsl #2
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,  #0
        ld1r            {v2.8h}, [x1]
        strh            w3,  [x1]
        srshr           v2.8h,  v2.8h,  #3
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h

        sub             x0,  x0,  x2,  lsl #2
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2

        ret
endfunc
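
// VP8 loop filter: each filtered edge reads up to four pixels on either
// side of the edge (P3..P0 before it, Q0..Q3 after it). In rough scalar
// terms (a paraphrase of the conditions evaluated below, not code from
// this file):
//
//   filter_enable = abs(P0-Q0)*2 + abs(P1-Q1)/2 <= flim_E
//   interior_ok   = abs(P3-P2) <= flim_I && abs(P2-P1) <= flim_I &&
//                   abs(P1-P0) <= flim_I && abs(Q1-Q0) <= flim_I &&
//                   abs(Q2-Q1) <= flim_I && abs(Q3-Q2) <= flim_I
//   hev           = abs(P1-P0) > hev_thresh || abs(Q1-Q0) > hev_thresh
//
// The macro below evaluates these per byte lane, 16 edges at a time, and
// applies the common, inner or macroblock-edge filter where they hold.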
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b  // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b  // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1      // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b  // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b  // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b  // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b  // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b  // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b  // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b  // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b  // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1      // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh      // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor             v3.16b, v3.16b, v21.16b   // PS0 = P0 ^ 0x80
        eor             v4.16b, v4.16b, v21.16b   // QS0 = Q0 ^ 0x80

        movi            v20.8h, #3
        ssubl           v18.8h, v4.8b,  v3.8b     // QS0 - PS0
        ssubl2          v19.8h, v4.16b, v3.16b    //   (widened to 16 bit)
        eor             v2.16b, v2.16b, v21.16b   // PS1 = P1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b   // QS1 = Q1 ^ 0x80
        mul             v18.8h, v18.8h, v20.8h    // w = 3 * (QS0 - PS0)
        mul             v19.8h, v19.8h, v20.8h

        sqsub           v20.16b, v2.16b, v5.16b   // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
    .if \inner
        and             v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw           v18.8h,  v18.8h, v20.8b   // w += clamp(PS1-QS1)
        saddw2          v19.8h,  v19.8h, v20.16b
        sqxtn           v18.8b,  v18.8h           // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
        eor             v1.16b, v1.16b, v21.16b   // PS2 = P2 ^ 0x80
        eor             v6.16b, v6.16b, v21.16b   // QS2 = Q2 ^ 0x80
    .endif
        and             v18.16b, v18.16b, v16.16b // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused

        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);
    .if \simple
        sqadd           v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1      // c3 >>= 1
        eor             v3.16b,  v3.16b,  v21.16b // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b // P1 = PS1 ^ 0x80
    .else
        and             v20.16b, v18.16b, v17.16b // w & hev
        sqadd           v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3      // c1 >>= 3
        sshr            v20.16b, v20.16b, #3      // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h,  #63
        sshll           v22.8h,  v18.8b,  #3
        sshll2          v23.8h,  v18.16b, #3
        saddw           v22.8h,  v22.8h,  v18.8b
        saddw2          v23.8h,  v23.8h,  v18.16b
        add             v16.8h,  v17.8h,  v22.8h
        add             v17.8h,  v17.8h,  v23.8h  //  9*w + 63
        add             v19.8h,  v16.8h,  v22.8h
        add             v20.8h,  v17.8h,  v23.8h  // 18*w + 63
        add             v22.8h,  v19.8h,  v22.8h
        add             v23.8h,  v20.8h,  v23.8h  // 27*w + 63
        sqshrn          v16.8b,  v16.8h,  #7
        sqshrn2         v16.16b, v17.8h,  #7      // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h,  #7
        sqshrn2         v19.16b, v20.8h,  #7      // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h,  #7
        sqshrn2         v22.16b, v23.8h,  #7      // clamp((27*w + 63)>>7)
        sqadd           v1.16b,  v1.16b,  v16.16b // PS2 = clamp(PS2+a)
        sqsub           v6.16b,  v6.16b,  v16.16b // QS2 = clamp(QS2-a)
        sqadd           v2.16b,  v2.16b,  v19.16b // PS1 = clamp(PS1+a)
        sqsub           v5.16b,  v5.16b,  v19.16b // QS1 = clamp(QS1-a)
        sqadd           v3.16b,  v3.16b,  v22.16b // PS0 = clamp(PS0+a)
        sqsub           v4.16b,  v4.16b,  v22.16b // QS0 = clamp(QS0-a)
        eor             v3.16b,  v3.16b,  v21.16b // P0 = PS0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b // Q0 = QS0 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b // P1 = PS1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b // Q1 = QS1 ^ 0x80
        eor             v1.16b,  v1.16b,  v21.16b // P2 = PS2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b // Q2 = QS2 ^ 0x80
    .endif
.endm
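
// The filter macro is expanded three ways: with no arguments for
// macroblock edges (full filter), inner=1 for inner block edges, and
// simple=1 for the simple loop filter (P1..Q1 only); the functions below
// select the variant through these macro arguments.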
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b}, [x0], x1 // P3
        ld1             {v1.16b}, [x0], x1 // P2
    .endif
        ld1             {v2.16b}, [x0], x1 // P1
        ld1             {v3.16b}, [x0], x1 // P0
        ld1             {v4.16b}, [x0], x1 // Q0
        ld1             {v5.16b}, [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b}, [x0], x1 // Q2
        ld1             {v7.16b}, [x0]     // Q3
        dup             v23.16b, w3        // flim_I
    .endif
        dup             v22.16b, w2        // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2: dst -= stride * 6
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b}, [x0], x1 // P2
    .endif
        st1             {v2.16b}, [x0], x1 // P1
        st1             {v3.16b}, [x0], x1 // P0
        st1             {v4.16b}, [x0], x1 // Q0
        st1             {v5.16b}, [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b}, [x0]     // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1             {v0.d}[0], [x0], x2 // P3
        ld1             {v0.d}[1], [x1], x2 // P3
        ld1             {v1.d}[0], [x0], x2 // P2
        ld1             {v1.d}[1], [x1], x2 // P2
        ld1             {v2.d}[0], [x0], x2 // P1
        ld1             {v2.d}[1], [x1], x2 // P1
        ld1             {v3.d}[0], [x0], x2 // P0
        ld1             {v3.d}[1], [x1], x2 // P0
        ld1             {v4.d}[0], [x0], x2 // Q0
        ld1             {v4.d}[1], [x1], x2 // Q0
        ld1             {v5.d}[0], [x0], x2 // Q1
        ld1             {v5.d}[1], [x1], x2 // Q1
        ld1             {v6.d}[0], [x0], x2 // Q2
        ld1             {v6.d}[1], [x1], x2 // Q2
        ld1             {v7.d}[0], [x0]     // Q3
        ld1             {v7.d}[1], [x1]     // Q3

        dup             v22.16b, w3 // flim_E
        dup             v23.16b, w4 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2: u,v -= stride * 6
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        sub             x0,  x0,  x2,  lsl #1
        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:
        st1             {v1.d}[0], [x0], x2 // P2
        st1             {v1.d}[1], [x1], x2 // P2
        st1             {v2.d}[0], [x0], x2 // P1
        st1             {v2.d}[1], [x1], x2 // P1
        st1             {v3.d}[0], [x0], x2 // P0
        st1             {v3.d}[1], [x1], x2 // P0
        st1             {v4.d}[0], [x0], x2 // Q0
        st1             {v4.d}[1], [x1], x2 // Q0
        st1             {v5.d}[0], [x0], x2 // Q1
        st1             {v5.d}[1], [x1], x2 // Q1
        st1             {v6.d}[0], [x0]     // Q2
        st1             {v6.d}[1], [x1]     // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
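
// The horizontal (vertical-edge) filters work on pixel columns, so they
// load 8-byte slices of 16 rows, transpose them with transpose_8x16B so
// that each column ends up in one register, run the same vp8_loop_filter
// macro, then transpose back and store.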
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w2 // flim_E
        dup             v23.16b, w3 // flim_I

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1,  lsl #4 // backup 16 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4
        sub             x1,  x1,  #4

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2 // load u
        ld1             {v0.d}[1], [x1], x2 // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w3 // flim_E
        dup             v23.16b, w4 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2,  lsl #3 // backup u 8 rows
        sub             x1,  x1,  x2,  lsl #3 // backup v 8 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x2 // store u
        st1             {v0.d}[1], [x1], x2 // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b
        ret
endfunc

function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

/* 4/6-tap 8th-pel MC */
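
// A rough scalar model of one 6-tap filtered pixel (illustrative only):
//
//   dst[x] = clip_uint8((  f[0]*src[x-2] - f[1]*src[x-1] + f[2]*src[x]
//                        + f[3]*src[x+1] - f[4]*src[x+2] + f[5]*src[x+3]
//                        + 64) >> 7)
//
// The subpel_filters table below stores all taps as positive values; the
// negative taps are applied with mls, and sqrshrun performs the rounded
// shift. The combined h+v functions filter horizontally into a temporary
// buffer on the stack, then run the vertical filter over that buffer.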
.macro  vp8_epel8_h6    d,   s0,  s1
        ext             v22.8b, \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b, \s1\().8b, #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b, \s1\().8b, #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b, \s1\().8b, #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b, \s1\().8b, #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             \s3\().8h, \s3\().8h, v0.h[3]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             \s3\().8h, \s4\().8h, v0.h[4]
        mla             \s2\().8h, \s0\().8h, v0.h[0]
        mla             \s3\().8h, \s5\().8h, v0.h[5]
        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
        sqrshrun        \d0\().8b, \s3\().8h, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm

.macro  vp8_epel8_h4    d,   v0,  v1
        ext             v22.8b, \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b, \v1\().8b, #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b, \v1\().8b, #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s4\().8h, \s4\().8b
        mul             v21.8h,    \s1\().8h, v0.h[2]
        mul             v23.8h,    \s2\().8h, v0.h[3]
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             v22.8h,    \s3\().8h, v0.h[3]
        mls             v21.8h,    \s0\().8h, v0.h[1]
        mls             v23.8h,    \s3\().8h, v0.h[4]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             v22.8h,    \s4\().8h, v0.h[4]
        sqadd           v21.8h,    v21.8h, v23.8h
        sqadd           \s2\().8h, \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h, #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm

// note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so 16-bit
// arithmetic can be used to apply the filters
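// (each row of taps below sums to 128, e.g. 2 - 11 + 108 + 36 - 8 + 1 = 128,
// so a uniform input of 255 peaks at 128 * 255 = 0x7f80 before the final
// rounded shift, within the range of a signed 16-bit lane)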
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
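
// mx/my are in the range 1..7 here (0 means no filtering in that
// direction) and each row above is 8 halfwords = 16 bytes, which is why
// the functions below use "movrel x17, subpel_filters, -16" and then add
// mx (or my) << 4: row 0 of the table corresponds to mx == 1.
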
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17, subpel_filters, -16
        add             x6,  x17,  x6,  lsl #4 // y
        ld1             {v0.8h}, [x6]
1:
        ld1             {v1.1d - v2.1d},   [x2], x3
        ld1             {v3.1d - v4.1d},   [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2,  x2,  x3,  lsl #2

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2
        sxtw            x5,  w5

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17,  x5,  lsl #4 // x
        ld1             {v0.8h}, [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b
        ret
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sub             sp,  sp,  #336+16

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x16, x17,  x5,  lsl #4 // x
        ld1             {v0.8h}, [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4,  #5  // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b},   [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7]
        sub             x7,  x7,  #48

        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
        trn1            v2.2d, v5.2d, v2.2d

        st1             {v2.16b}, [x0], x1
        subs            x4,  x4,  #1
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sub             sp,  sp,  #168+16

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5,  lsl #4 // x
        sxtw            x4,  w4
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5  // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1
        sub             sp,  sp,  #168+16

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5,  lsl #4 // x
        sxtw            x4,  w4
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5  // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1
        sub             sp,  sp,  #168+16

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5,  lsl #4 // x
        sxtw            x4,  w4
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3  // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2
        sub             sp,  sp,  #168+16

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5,  lsl #4 // x
        sxtw            x4,  w4
        ld1             {v0.8h}, [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3  // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h}, [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc