/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
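
@ Inverse Walsh-Hadamard transform of the luma DC coefficients.
@ Per the vp8dsp prototypes:
@ void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@   r0 = block: the transformed values are scattered to coefficient 0 of
@        each 4x4 subblock, so the stores step by r3 (expected to be
@        32 = sizeof(int16_t[16])).
@   r1 = dc: 16 input coefficients, cleared on the way out (q15 stores).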
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vst1.16         {q15},    [r1,:128]!
        vst1.16         {q15},    [r1,:128]
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3
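
@ DC-only variant: only dc[0] is non-zero, so all 16 outputs share one
@ value. Same prototype as ff_vp8_luma_dc_wht above.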
function ff_vp8_luma_dc_wht_dc_neon, export=1
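
@ 4x4 inverse DCT, adding the result to the existing prediction:
@ void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], int stride)
@   r0 = dst, r1 = block (cleared after being read), r2 = stride.
@ d4[0] and d4[1] are expected to hold the VP8 transform multipliers
@ 20091 and 35468/2 (Q16): vmull+vshrn #16 gives x*20091 >> 16, and
@ the doubling in vqdmulh gives x*35468 >> 16.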
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vld1.32         {d23[]},  [r0,:32], r2
        vld1.32         {d21[]},  [r0,:32], r2
        sub             r0,  r0,  r2,  lsl #2
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
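
@ DC-only inverse transform: adds (dc + 4) >> 3 to all 16 pixels.
@ void ff_vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride)
@   r0 = dst, r1 = block, r2 = stride.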
function ff_vp8_idct_dc_add_neon, export=1
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        sub             r0,  r0,  r2,  lsl #2
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
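
@ DC-only add for four 4x4 chroma blocks laid out 2x2 (an 8x8 area).
@ void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride)
@ r1 steps through the four blocks' DC slots 32 bytes apart, clearing
@ each; r3 is then reused as the destination pointer for the result rows.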
function ff_vp8_idct_dc_add4uv_neon, export=1
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vld1.8          {d1},     [r0,:64], r2
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64], r2
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64], r2
        vld1.8          {d6},     [r0,:64], r2
        vld1.8          {d7},     [r0,:64], r2
        vst1.8          {d20},    [r3,:64], r2
        vst1.8          {d21},    [r3,:64], r2
        vst1.8          {d22},    [r3,:64], r2
        vst1.8          {d23},    [r3,:64], r2
        vst1.8          {d24},    [r3,:64], r2
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2
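
@ Same operation for four horizontally adjacent 4x4 luma blocks
@ (a 16x4 strip).
@ void ff_vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride)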
function ff_vp8_idct_dc_add4y_neon, export=1
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q2},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128], r2
        sub             r0,  r0,  r2,  lsl #2
        vst1.8          {q10},    [r0,:128], r2
        vst1.8          {q11},    [r0,:128], r2
        vst1.8          {q12},    [r0,:128], r2
        vst1.8          {q13},    [r0,:128], r2
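
@ Loop filter core, shared by all edge types. Expected register layout:
@   q0 -> P3,  q1 -> P2,  q2 -> P1,  q3 -> P0
@   q4 -> Q0,  q5 -> Q1,  q6 -> Q2,  q7 -> Q3
@   q14 -> flim_E,  q15 -> flim_I,  r12 -> hev_thresh
@ inner=1 selects the inner-edge filter, simple=1 the simple filter.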
.macro  vp8_loop_filter, inner=0, simple=0
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim

        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16 bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
  .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80

        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point:
        @   q0 -> P3  (don't corrupt)
        @   q7 -> Q3  (don't corrupt)
        @   q8, q11, q12 -> unused

        @ filter_common: is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80

        @ the !is4tap case of filter_common, only used for inner blocks:
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80

        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   a = clamp((18*w + 63) >> 7);
        @   a = clamp((9*w + 63) >> 7);
        vshll.s8        q14, d20, #3            @ 8 * w
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20           @ 9 * w
        vaddw.s8        q15, q15, d21
        vadd.s16        q9,  q9,  q15           @ 9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
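
@ Vertical (horizontal-edge) filter across a full 16-pixel macroblock edge.
@ void ff_vp8_v_loop_filter16(uint8_t *dst, int stride,
@                             int flim_E, int flim_I, int hev_thresh)
@   r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I,
@   hev_thresh read from the stack at [sp, #64].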
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             r0,  r0,  r1,  lsl #1+!\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2: dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #1

        vst1.8          {q1},     [r0,:128], r1 @ P2
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
        vst1.8          {q6},     [r0,:128]     @ Q2

vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
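
@ Same edge type for chroma: U and V are filtered together, U rows in the
@ low halves (even d registers) and V rows in the high halves (odd d
@ registers) of q0-q7.
@ void ff_vp8_v_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
@                              int flim_E, int flim_I, int hev_thresh)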
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2: u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

vp8_v_loop_filter8uv _inner, inner=1
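
@ Horizontal (vertical-edge) filter: 16 rows are loaded, transposed so the
@ edge pixels land in q0-q7 as above, filtered, transposed back and stored.
@ void ff_vp8_h_loop_filter16(uint8_t *dst, int stride,
@                             int flim_E, int flim_I, int hev_thresh)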
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ldr             r12, [sp, #64]          @ hev_thresh

        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
        vdup.8          q15, r3                 @ flim_I

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ back up 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1

vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
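
@ Vertical-edge filter for the chroma pair, again with U and V sharing
@ each q register after the transpose.
@ void ff_vp8_h_loop_filter8uv(uint8_t *dstU, uint8_t *dstV, int stride,
@                              int flim_E, int flim_I, int hev_thresh)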
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        ldr             r12, [sp, #64]          @ flim_I

        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ back up u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ back up v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2

vp8_h_loop_filter8uv _inner, inner=1
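
@ Full-pel copies. All MC functions here follow the common prototype:
@ void ff_put_vp8_*(uint8_t *dst, int dststride,
@                   uint8_t *src, int srcstride, int h, int mx, int my)
@   r0 = dst, r1 = dststride, r2 = src, r3 = srcstride; h, mx, my on the
@   stack. mx/my are unused in the plain copies.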
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
/* 4/6-tap 8th-pel MC */
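
@ q0 holds one row of subpel_filters as eight .16 lanes. The table stores
@ absolute tap values; taps 1 and 4 of the 6-tap filter are negative, so
@ the macros below combine vmul/vmla for taps 0, 2, 3 and 5 with vmls for
@ taps 1 and 4, then round, saturate and narrow with vqrshrun #7 (the
@ filters have 7-bit precision).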
.macro  vp8_epel8_h6 d, a, b
        vext.8          d27, \a,  \b,  #1
        vext.8          d28, \a,  \b,  #2
        vext.8          d29, \a,  \b,  #3
        vext.8          d30, \a,  \b,  #4
        vext.8          d31, \a,  \b,  #5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7

.macro  vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vext.8          q3,  \q0, \q1, #2
        vext.8          q8,  \q0, \q1, #1
        vext.8          q2,  \q0, \q1, #5
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7

.macro  vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7

.macro  vp8_epel8_h4 d, a, b
        vext.8          d28, \a,  \b,  #1
        vext.8          d29, \a,  \b,  #2
        vext.8          d30, \a,  \b,  #3
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7

.macro  vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
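
@ mx/my run from 1 to 7 in eighth-pel units (0 means full-pel copy);
@ "add r4, lr, r4, lsl #4" with lr = subpel_filters-16 selects the
@ matching 16-byte filter row. The combined h*v* variants filter
@ horizontally into a scratch buffer on the stack, then run the vertical
@ filter over that buffer.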
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
function ff_put_vp8_epel16_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel8_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel8_v4_neon, export=1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel8_h4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
function ff_put_vp8_epel8_h4v4_neon, export=1
        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel8_h6v4_neon, export=1
        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
function ff_put_vp8_epel4_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
function ff_put_vp8_epel4_h6v4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
function ff_put_vp8_epel4_h4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
function ff_put_vp8_epel4_v4_neon, export=1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
function ff_put_vp8_epel4_h4v4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
@ note: the worst-case sum of all 6-tap filter values * 255 is 0x7f80, so
@ 16-bit arithmetic can be used to apply the filters
const   subpel_filters, align=4
        .short          0,   6, 123,  12,   1,   0,   0,   0
        .short          2,  11, 108,  36,   8,   1,   0,   0
        .short          0,   9,  93,  50,   6,   0,   0,   0
        .short          3,  16,  77,  77,  16,   3,   0,   0
        .short          0,   6,  50,  93,   9,   0,   0,   0
        .short          1,   8,  36, 108,  11,   2,   0,   0
        .short          0,   1,  12, 123,   6,   0,   0,   0
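
@ Rows correspond to mx (or my) = 1..7; the all-zero position 0 row is
@ omitted, hence the subpel_filters-16 base used above.

@ Bilinear MC: each output is (a*(8-frac) + b*frac + 4) >> 3. The setup
@ code is expected to leave frac in d0 and 8-frac in d1 (and my/8-my in
@ d2/d3 for the second pass of the hv variants).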
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        vld1.8          {d2-d4},  [r2], r1
        vext.8          q2,  q1,  q2,  #1
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        vld1.8          {q1},     [r2], r1
        vld1.8          {q2},     [r2], r1
        vld1.8          {q1},     [r2], r1
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        ldr             r3,  [sp, #8]           @ my
        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        vld1.8          {q1},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        vld1.8          {d2},     [r2], r1
        vld1.8          {d3},     [r2], r1
        vld1.8          {d2},     [r2], r1
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        ldr             r3,  [sp, #8]           @ my
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d22, q9,  #3
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        vld1.32         {d2[]},   [r2], r1
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r1
        vld1.32         {d3[1]},  [r2], r1
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        ldr             r3,  [sp, #8]           @ my
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vrshrn.u16      d22, q9,  #3
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vmlal.u8        q10, d22, d3
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1