2 * VP8 NEON optimisations
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ ff_vp8_luma_dc_wht_neon
@ Visible here: 16 halfword coefficients are loaded from [r1] into q0-q1,
@ the same 32 bytes are then overwritten with q15, and 16 halfwords taken
@ from d0-d3 are scattered to [r0], advancing r0 by r3 bytes per store.
@ NOTE(review): the transform arithmetic and the setup of q15 and r3 are
@ not shown in this listing - q15 is presumably zero; confirm.
26 function ff_vp8_luma_dc_wht_neon, export=1
27 vld1.16 {q0-q1}, [r1,:128]
@ overwrite the 32-byte input block at [r1] with q15 (two 16-byte stores)
32 vst1.16 {q15}, [r1,:128]!
35 vst1.16 {q15}, [r1,:128]
@ scatter one halfword per store: lane 0 of d0..d3, then lane 1, lane 2, lane 3
59 vst1.16 {d0[0]}, [r0,:16], r3
60 vst1.16 {d1[0]}, [r0,:16], r3
61 vst1.16 {d2[0]}, [r0,:16], r3
62 vst1.16 {d3[0]}, [r0,:16], r3
63 vst1.16 {d0[1]}, [r0,:16], r3
64 vst1.16 {d1[1]}, [r0,:16], r3
65 vst1.16 {d2[1]}, [r0,:16], r3
66 vst1.16 {d3[1]}, [r0,:16], r3
67 vst1.16 {d0[2]}, [r0,:16], r3
68 vst1.16 {d1[2]}, [r0,:16], r3
69 vst1.16 {d2[2]}, [r0,:16], r3
70 vst1.16 {d3[2]}, [r0,:16], r3
71 vst1.16 {d0[3]}, [r0,:16], r3
72 vst1.16 {d1[3]}, [r0,:16], r3
73 vst1.16 {d2[3]}, [r0,:16], r3
74 vst1.16 {d3[3]}, [r0,:16], r3
@ NOTE(review): only the entry line of this routine appears in this listing;
@ its instructions are not shown, so nothing further is documented here.
79 function ff_vp8_luma_dc_wht_dc_neon, export=1
@ ff_vp8_idct_add_neon
@ Visible here: 16 halfword coefficients are loaded from [r1], two similar
@ multiply/add passes are run over them, the coefficient block at [r1] is
@ overwritten with q15, four 4-byte rows are read from [r0] (stride r2),
@ and four 4-byte rows are written back to the same place.
@ NOTE(review): the constants in d4[0] / d4[1], the setup of q15, the
@ transposes and the final add/narrow are not shown in this listing.
91 function ff_vp8_idct_add_neon, export=1
92 vld1.16 {q0-q1}, [r1,:128]
@ first pass: scale d1 and d3 by the two constants held in d4
97 vmull.s16 q12, d1, d4[0]
98 vmull.s16 q13, d3, d4[0]
99 vqdmulh.s16 d20, d1, d4[1]
100 vqdmulh.s16 d23, d3, d4[1]
101 vshrn.s32 d21, q12, #16
102 vshrn.s32 d22, q13, #16
@ (x * c) >> 16 plus x itself
103 vadd.s16 d21, d21, d1
104 vadd.s16 d22, d22, d3
108 vadd.s16 d18, d21, d23
109 vsub.s16 d19, d20, d22
@ second pass: same arithmetic on d1 and d2, interleaved with the stores
@ that overwrite the coefficient block at [r1] with q15
119 vmull.s16 q12, d1, d4[0]
120 vst1.16 {q15}, [r1,:128]!
121 vmull.s16 q13, d2, d4[0]
122 vst1.16 {q15}, [r1,:128]
123 vqdmulh.s16 d21, d1, d4[1]
124 vqdmulh.s16 d23, d2, d4[1]
125 vshrn.s32 d20, q12, #16
126 vshrn.s32 d22, q13, #16
127 vadd.i16 d20, d20, d1
128 vadd.i16 d22, d22, d2
@ arithmetic interleaved with loading four 4-byte rows from [r0]; each
@ load replicates the 32-bit word into both lanes of its register
132 vadd.i16 d18, d20, d23
133 vld1.32 {d20[]}, [r0,:32], r2
134 vsub.i16 d19, d21, d22
135 vld1.32 {d22[]}, [r0,:32], r2
137 vld1.32 {d23[]}, [r0,:32], r2
139 vld1.32 {d21[]}, [r0,:32], r2
@ rewind r0 by the four rows just read
144 sub r0, r0, r2, lsl #2
@ write four rows; note the lane order d0[0], d0[1], d1[1], d1[0]
@ NOTE(review): presumably matches the layout left by transposes not shown here
156 vst1.32 {d0[0]}, [r0,:32], r2
157 vst1.32 {d0[1]}, [r0,:32], r2
158 vst1.32 {d1[1]}, [r0,:32], r2
159 vst1.32 {d1[0]}, [r0,:32], r2
@ ff_vp8_idct_dc_add_neon
@ Visible here: four 4-byte rows are read from [r0] (stride r2) into d0/d1,
@ r0 is rewound by four rows, and four rows are written back.
@ NOTE(review): the DC load and the add/narrow step are not shown in this listing.
164 function ff_vp8_idct_dc_add_neon, export=1
@ rows 0 and 1 are replicated into both lanes of d0 and d1; rows 2 and 3
@ then replace lane 1, giving d0 = {row0,row2} and d1 = {row1,row3}
170 vld1.32 {d0[]}, [r0,:32], r2
171 vld1.32 {d1[]}, [r0,:32], r2
172 vld1.32 {d0[1]}, [r0,:32], r2
173 vld1.32 {d1[1]}, [r0,:32], r2
@ rewind r0 by the four rows just read
176 sub r0, r0, r2, lsl #2
@ store in row order 0,1,2,3 (matching the lane layout used for the loads)
179 vst1.32 {d0[0]}, [r0,:32], r2
180 vst1.32 {d1[0]}, [r0,:32], r2
181 vst1.32 {d0[1]}, [r0,:32], r2
182 vst1.32 {d1[1]}, [r0,:32], r2
@ ff_vp8_idct_dc_add4uv_neon
@ Visible here: four DC halfwords are read from [r1] (each replicated across
@ one d register) and each source slot is overwritten with d0[0]; eight
@ 8-byte rows are read from [r0] (stride r2) and eight rows are written
@ through r3 (stride r2).
@ NOTE(review): the setup of d0 and r3, the add/narrow arithmetic and the
@ re-pointing of r3 at the destination are not shown in this listing -
@ d0 is presumably zero and r3 presumably first holds the distance between
@ coefficient blocks; confirm.
186 function ff_vp8_idct_dc_add4uv_neon, export=1
@ load one DC per block, then overwrite it and step r1 by r3 to the next block
189 vld1.16 {d16[]}, [r1,:16]
190 vst1.16 {d0[0]}, [r1,:16], r3
191 vld1.16 {d17[]}, [r1,:16]
192 vst1.16 {d0[0]}, [r1,:16], r3
193 vld1.16 {d18[]}, [r1,:16]
194 vst1.16 {d0[0]}, [r1,:16], r3
195 vld1.16 {d19[]}, [r1,:16]
196 vst1.16 {d0[0]}, [r1,:16], r3
198 vrshr.s16 q8, q8, #3 @ dc >>= 3
@ eight rows of eight pixels
199 vld1.8 {d0}, [r0,:64], r2
201 vld1.8 {d1}, [r0,:64], r2
203 vld1.8 {d2}, [r0,:64], r2
205 vld1.8 {d3}, [r0,:64], r2
207 vld1.8 {d4}, [r0,:64], r2
209 vld1.8 {d5}, [r0,:64], r2
211 vld1.8 {d6}, [r0,:64], r2
213 vld1.8 {d7}, [r0,:64], r2
@ write the eight result rows through r3
219 vst1.8 {d20}, [r3,:64], r2
221 vst1.8 {d21}, [r3,:64], r2
223 vst1.8 {d22}, [r3,:64], r2
225 vst1.8 {d23}, [r3,:64], r2
227 vst1.8 {d24}, [r3,:64], r2
229 vst1.8 {d25}, [r3,:64], r2
230 vst1.8 {d26}, [r3,:64], r2
231 vst1.8 {d27}, [r3,:64], r2
@ ff_vp8_idct_dc_add4y_neon
@ Visible here: four DC halfwords are read from [r1] (each replicated across
@ one d register) and each source slot is overwritten with d0[0]; four
@ 16-byte rows are read from [r0] (stride r2), r0 is rewound by four rows,
@ and q10-q13 are written back as four rows.
@ NOTE(review): the setup of d0 and r3 and the add/narrow arithmetic are
@ not shown in this listing - d0 is presumably zero; confirm.
236 function ff_vp8_idct_dc_add4y_neon, export=1
@ load one DC per block, then overwrite it and step r1 by r3 to the next block
239 vld1.16 {d16[]}, [r1,:16]
240 vst1.16 {d0[0]}, [r1,:16], r3
241 vld1.16 {d17[]}, [r1,:16]
242 vst1.16 {d0[0]}, [r1,:16], r3
243 vld1.16 {d18[]}, [r1,:16]
244 vst1.16 {d0[0]}, [r1,:16], r3
245 vld1.16 {d19[]}, [r1,:16]
246 vst1.16 {d0[0]}, [r1,:16], r3
247 vrshr.s16 q8, q8, #3 @ dc >>= 3
@ four rows of sixteen pixels
248 vld1.8 {q0}, [r0,:128], r2
250 vld1.8 {q1}, [r0,:128], r2
252 vld1.8 {q2}, [r0,:128], r2
254 vld1.8 {q3}, [r0,:128], r2
@ rewind r0 by the four rows just read
261 sub r0, r0, r2, lsl #2
267 vst1.8 {q10}, [r0,:128], r2
269 vst1.8 {q11}, [r0,:128], r2
271 vst1.8 {q12}, [r0,:128], r2
273 vst1.8 {q13}, [r0,:128], r2
@ vp8_loop_filter: core loop-filter arithmetic on eight pixel vectors.
@ Per the inline comments, q0-q7 hold P3,P2,P1,P0,Q0,Q1,Q2,Q3 on entry,
@ q14 holds flim_E, q15 holds flim_I and r12 holds hev_thresh.
@ q1-q6 (or a subset, depending on the variant) are rewritten with the
@ filtered pixels; q0 and q7 are left alone.
@ Parameters: inner and simple (both default 0) select the filter variant.
@ NOTE(review): only one of the conditional-assembly directives that choose
@ between the variants is visible in this listing, so the alternative
@ instruction sequences below appear back to back.
284 .macro vp8_loop_filter, inner=0, simple=0
@ edge-limit test using flim (q14) only
286 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
287 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
288 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
289 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
290 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
292 vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
294 @ calculate hev and normal_limit:
295 vabd.u8 q12, q2, q3 @ abs(P1-P0)
296 vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
297 vabd.u8 q10, q0, q1 @ abs(P3-P2)
298 vabd.u8 q11, q1, q2 @ abs(P2-P1)
299 vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
300 vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
301 vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
302 vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
304 vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
306 vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
308 vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
309 vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
310 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
311 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
313 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
315 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
@ q15 (flim_I) is no longer needed and is reused for hev_thresh
316 vdup.8 q15, r12 @ hev_thresh
317 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
318 vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
319 vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
320 vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
330 @ convert to signed value:
@ NOTE(review): q13 is used as the 0x80 mask and q12 as the multiplier 3
@ below; the instructions that load them are not shown in this listing.
331 veor q3, q3, q13 @ PS0 = P0 ^ 0x80
332 veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
335 vsubl.s8 q10, d8, d6 @ QS0 - PS0
336 vsubl.s8 q11, d9, d7 @ (widened to 16bit)
337 veor q2, q2, q13 @ PS1 = P1 ^ 0x80
338 veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
339 vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
340 vmul.i16 q11, q11, q12
342 vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
346 vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
348 vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
349 vaddw.s8 q11, q11, d25
350 vqmovn.s16 d20, q10 @ narrow result back into q10
@ P2 and Q2 are only converted when neither inner nor simple is set
352 .if !\inner && !\simple
353 veor q1, q1, q13 @ PS2 = P2 ^ 0x80
354 veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
356 vand q10, q10, q8 @ w &= normal_limit
358 @ registers used at this point..
359 @ q0 -> P3 (don't corrupt)
361 @ q7 -> Q3 (don't corrupt)
367 @ q8, q11, q12 -> unused
369 @ filter_common: is4tap==1
370 @ c1 = clamp(w + 4) >> 3;
371 @ c2 = clamp(w + 3) >> 3;
372 @ Q0 = s2u(QS0 - c1);
373 @ P0 = s2u(PS0 + c2);
@ NOTE(review): q14 and q15 are used as the constants 4 and 3 from here on,
@ per the comments; the instructions that load them are not shown.
376 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
377 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
378 vshr.s8 q11, q11, #3 @ c1 >>= 3
379 vshr.s8 q12, q12, #3 @ c2 >>= 3
380 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
381 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
382 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
383 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
384 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
385 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
387 @ the !is4tap case of filter_common, only used for inner blocks
388 @ c3 = ((c1&~hev) + 1) >> 1;
389 @ Q1 = s2u(QS1 - c3);
390 @ P1 = s2u(PS1 + c3);
391 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
392 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
393 vshr.s8 q11, q11, #3 @ c1 >>= 3
394 vshr.s8 q12, q12, #3 @ c2 >>= 3
395 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
396 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
397 vbic q11, q11, q9 @ c1 & ~hev
398 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
399 vrshr.s8 q11, q11, #1 @ c3 >>= 1
400 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
401 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
402 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
403 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
404 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
@ NOTE(review): presumably the remaining variant (neither inner nor simple)
@ starts here: the 4-tap step is applied where hev is set, then the wider
@ 27/18/9 filter below is applied to w with the hev lanes cleared.
406 vand q12, q10, q9 @ w & hev
407 vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
408 vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
409 vshr.s8 q11, q11, #3 @ c1 >>= 3
410 vshr.s8 q12, q12, #3 @ c2 >>= 3
411 vbic q10, q10, q9 @ w &= ~hev
412 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
413 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
416 @ a = clamp((27*w + 63) >> 7);
419 @ a = clamp((18*w + 63) >> 7);
422 @ a = clamp((9*w + 63) >> 7);
@ 9*w is built as (w << 3) + w, widened to 16 bit
426 vshll.s8 q14, d20, #3
427 vshll.s8 q15, d21, #3
428 vaddw.s8 q14, q14, d20
429 vaddw.s8 q15, q15, d21
@ NOTE(review): q9 presumably holds the rounding constant 63 at this point
@ (its load, and the matching add into q8, are not shown in this listing)
431 vadd.s16 q9, q9, q15 @ 9*w + 63
432 vadd.s16 q11, q8, q14
433 vadd.s16 q12, q9, q15 @ 18*w + 63
434 vadd.s16 q14, q11, q14
435 vadd.s16 q15, q12, q15 @ 27*w + 63
436 vqshrn.s16 d16, q8, #7
437 vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
438 vqshrn.s16 d22, q11, #7
439 vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
440 vqshrn.s16 d28, q14, #7
441 vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
442 vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
443 vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
444 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
445 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
446 vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
447 vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
@ convert all six modified vectors back to unsigned
448 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
449 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
450 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
451 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
452 veor q1, q1, q13 @ P2 = PS2 ^ 0x80
453 veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
@ NOTE(review): only the header of this macro appears in this listing; its
@ body is not shown, so its register usage is not documented here.
457 .macro transpose8x16matrix
@ vp8_v_loop_filter16: generates ff_vp8_v_loop_filter16<name>_neon, which
@ filters across a horizontal edge of a 16-pixel-wide block.
@ Visible register use: r0 = pixel pointer (128-bit aligned rows), r1 = row
@ stride, r2 = flim_E, r3 = flim_I, hev_thresh read from the stack.
@ Parameters: name (symbol infix), inner and simple (forwarded to
@ vp8_loop_filter).
@ NOTE(review): the register save that explains the #64 stack offset and
@ the conditional directives around the P3/P2/Q2/Q3 accesses are not shown
@ in this listing.
474 .macro vp8_v_loop_filter16 name, inner=0, simple=0
475 function ff_vp8_v_loop_filter16\name\()_neon, export=1
@ shift amount is 1 + !simple: back up 4 rows normally, 2 rows when simple
477 sub r0, r0, r1, lsl #1+!\simple
481 ldr r12, [sp, #64] @ hev_thresh
482 vld1.8 {q0}, [r0,:128], r1 @ P3
483 vld1.8 {q1}, [r0,:128], r1 @ P2
485 vld1.8 {q2}, [r0,:128], r1 @ P1
486 vld1.8 {q3}, [r0,:128], r1 @ P0
487 vld1.8 {q4}, [r0,:128], r1 @ Q0
488 vld1.8 {q5}, [r0,:128], r1 @ Q1
490 vld1.8 {q6}, [r0,:128], r1 @ Q2
491 vld1.8 {q7}, [r0,:128] @ Q3
492 vdup.8 q15, r3 @ flim_I
494 vdup.8 q14, r2 @ flim_E
496 vp8_loop_filter inner=\inner, simple=\simple
498 @ back up to P2: dst -= stride * 6
499 sub r0, r0, r1, lsl #2
501 sub r0, r0, r1, lsl #1
@ write back the six rows the filter may have changed
504 vst1.8 {q1}, [r0,:128], r1 @ P2
506 vst1.8 {q2}, [r0,:128], r1 @ P1
507 vst1.8 {q3}, [r0,:128], r1 @ P0
508 vst1.8 {q4}, [r0,:128], r1 @ Q0
509 vst1.8 {q5}, [r0,:128], r1 @ Q1
511 vst1.8 {q6}, [r0,:128] @ Q2
@ instantiate the inner-edge and simple-filter variants
520 vp8_v_loop_filter16 _inner, inner=1
521 vp8_v_loop_filter16 _simple, simple=1
@ vp8_v_loop_filter8uv: generates ff_vp8_v_loop_filter8uv<name>_neon, which
@ filters across a horizontal edge of two 8-pixel-wide planes at once.
@ Visible register use: r0 and r1 = the two plane pointers (u and v per the
@ comment below), r2 = row stride, r3 = flim_E; flim_I and hev_thresh are
@ read from the stack.
@ Rows from r0 go to the even d registers and rows from r1 to the odd ones,
@ so each q register holds the same row of both planes and one pass of
@ vp8_loop_filter handles both.
@ NOTE(review): the register save that explains the #64/#68 stack offsets
@ is not shown in this listing.
523 .macro vp8_v_loop_filter8uv name, inner=0
524 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
@ start four rows above the edge in both planes
526 sub r0, r0, r2, lsl #2
527 sub r1, r1, r2, lsl #2
528 ldr r12, [sp, #64] @ flim_I
531 vld1.8 {d0}, [r0,:64], r2 @ P3
532 vld1.8 {d1}, [r1,:64], r2 @ P3
533 vld1.8 {d2}, [r0,:64], r2 @ P2
534 vld1.8 {d3}, [r1,:64], r2 @ P2
535 vld1.8 {d4}, [r0,:64], r2 @ P1
536 vld1.8 {d5}, [r1,:64], r2 @ P1
537 vld1.8 {d6}, [r0,:64], r2 @ P0
538 vld1.8 {d7}, [r1,:64], r2 @ P0
539 vld1.8 {d8}, [r0,:64], r2 @ Q0
540 vld1.8 {d9}, [r1,:64], r2 @ Q0
541 vld1.8 {d10}, [r0,:64], r2 @ Q1
542 vld1.8 {d11}, [r1,:64], r2 @ Q1
543 vld1.8 {d12}, [r0,:64], r2 @ Q2
544 vld1.8 {d13}, [r1,:64], r2 @ Q2
545 vld1.8 {d14}, [r0,:64] @ Q3
546 vld1.8 {d15}, [r1,:64] @ Q3
548 vdup.8 q14, r3 @ flim_E
549 vdup.8 q15, r12 @ flim_I
@ r12 is reloaded because vp8_loop_filter reads hev_thresh from it
550 ldr r12, [sp, #68] @ hev_thresh
552 vp8_loop_filter inner=\inner
554 @ back up to P2: u,v -= stride * 6
555 sub r0, r0, r2, lsl #2
556 sub r1, r1, r2, lsl #2
557 sub r0, r0, r2, lsl #1
558 sub r1, r1, r2, lsl #1
561 vst1.8 {d2}, [r0,:64], r2 @ P2
562 vst1.8 {d3}, [r1,:64], r2 @ P2
563 vst1.8 {d4}, [r0,:64], r2 @ P1
564 vst1.8 {d5}, [r1,:64], r2 @ P1
565 vst1.8 {d6}, [r0,:64], r2 @ P0
566 vst1.8 {d7}, [r1,:64], r2 @ P0
567 vst1.8 {d8}, [r0,:64], r2 @ Q0
568 vst1.8 {d9}, [r1,:64], r2 @ Q0
569 vst1.8 {d10}, [r0,:64], r2 @ Q1
570 vst1.8 {d11}, [r1,:64], r2 @ Q1
571 vst1.8 {d12}, [r0,:64] @ Q2
572 vst1.8 {d13}, [r1,:64] @ Q2
@ instantiate the inner-edge variant
580 vp8_v_loop_filter8uv _inner, inner=1
@ vp8_h_loop_filter16: generates ff_vp8_h_loop_filter16<name>_neon, which
@ filters across a vertical edge over 16 rows.
@ Visible register use: r0 = pixel pointer, r1 = row stride, r2 = flim_E,
@ r3 = flim_I, hev_thresh read from the stack.
@ Sixteen 8-byte rows are read (rows 0-7 into the even d registers, rows
@ 8-15 into the odd ones), filtered, and written back.
@ NOTE(review): the initial pointer adjustment, the transposes between
@ row and column layout, the register save behind the #64 offset and the
@ store of the last row are not shown in this listing.
582 .macro vp8_h_loop_filter16 name, inner=0, simple=0
583 function ff_vp8_h_loop_filter16\name\()_neon, export=1
587 ldr r12, [sp, #64] @ hev_thresh
591 vld1.8 {d0}, [r0], r1 @ load first 8-line src data
592 vld1.8 {d2}, [r0], r1
593 vld1.8 {d4}, [r0], r1
594 vld1.8 {d6}, [r0], r1
595 vld1.8 {d8}, [r0], r1
596 vld1.8 {d10}, [r0], r1
597 vld1.8 {d12}, [r0], r1
598 vld1.8 {d14}, [r0], r1
599 vld1.8 {d1}, [r0], r1 @ load second 8-line src data
600 vld1.8 {d3}, [r0], r1
601 vld1.8 {d5}, [r0], r1
602 vld1.8 {d7}, [r0], r1
603 vld1.8 {d9}, [r0], r1
604 vld1.8 {d11}, [r0], r1
605 vld1.8 {d13}, [r0], r1
606 vld1.8 {d15}, [r0], r1
610 vdup.8 q14, r2 @ flim_E
612 vdup.8 q15, r3 @ flim_I
615 vp8_loop_filter inner=\inner, simple=\simple
617 sub r0, r0, r1, lsl #4 @ backup 16 rows
@ write the rows back in the same register order they were loaded
622 vst1.8 {d0}, [r0], r1
623 vst1.8 {d2}, [r0], r1
624 vst1.8 {d4}, [r0], r1
625 vst1.8 {d6}, [r0], r1
626 vst1.8 {d8}, [r0], r1
627 vst1.8 {d10}, [r0], r1
628 vst1.8 {d12}, [r0], r1
629 vst1.8 {d14}, [r0], r1
630 vst1.8 {d1}, [r0], r1
631 vst1.8 {d3}, [r0], r1
632 vst1.8 {d5}, [r0], r1
633 vst1.8 {d7}, [r0], r1
634 vst1.8 {d9}, [r0], r1
635 vst1.8 {d11}, [r0], r1
636 vst1.8 {d13}, [r0], r1
@ instantiate the inner-edge and simple-filter variants
645 vp8_h_loop_filter16 _inner, inner=1
646 vp8_h_loop_filter16 _simple, simple=1
@ vp8_h_loop_filter8uv: generates ff_vp8_h_loop_filter8uv<name>_neon, which
@ filters across a vertical edge of the u and v planes at once.
@ Visible register use: r0 = u pointer, r1 = v pointer, r2 = row stride,
@ r3 = flim_E; flim_I and hev_thresh are read from the stack.
@ Eight 8-byte rows are read from each plane (u into the even d registers,
@ v into the odd ones), filtered, and written back.
@ NOTE(review): the initial pointer adjustment, the transposes between
@ row and column layout, the register save behind the #64/#68 offsets and
@ the stores of the last row are not shown in this listing.
648 .macro vp8_h_loop_filter8uv name, inner=0
649 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
653 ldr r12, [sp, #64] @ flim_I
656 vld1.8 {d0}, [r0], r2 @ load u
657 vld1.8 {d1}, [r1], r2 @ load v
658 vld1.8 {d2}, [r0], r2
659 vld1.8 {d3}, [r1], r2
660 vld1.8 {d4}, [r0], r2
661 vld1.8 {d5}, [r1], r2
662 vld1.8 {d6}, [r0], r2
663 vld1.8 {d7}, [r1], r2
664 vld1.8 {d8}, [r0], r2
665 vld1.8 {d9}, [r1], r2
666 vld1.8 {d10}, [r0], r2
667 vld1.8 {d11}, [r1], r2
668 vld1.8 {d12}, [r0], r2
669 vld1.8 {d13}, [r1], r2
670 vld1.8 {d14}, [r0], r2
671 vld1.8 {d15}, [r1], r2
675 vdup.8 q14, r3 @ flim_E
676 vdup.8 q15, r12 @ flim_I
@ r12 is reloaded because vp8_loop_filter reads hev_thresh from it
677 ldr r12, [sp, #68] @ hev_thresh
679 vp8_loop_filter inner=\inner
681 sub r0, r0, r2, lsl #3 @ backup u 8 rows
682 sub r1, r1, r2, lsl #3 @ backup v 8 rows
@ write the rows back in the same register order they were loaded
687 vst1.8 {d0}, [r0], r2
688 vst1.8 {d1}, [r1], r2
689 vst1.8 {d2}, [r0], r2
690 vst1.8 {d3}, [r1], r2
691 vst1.8 {d4}, [r0], r2
692 vst1.8 {d5}, [r1], r2
693 vst1.8 {d6}, [r0], r2
694 vst1.8 {d7}, [r1], r2
695 vst1.8 {d8}, [r0], r2
696 vst1.8 {d9}, [r1], r2
697 vst1.8 {d10}, [r0], r2
698 vst1.8 {d11}, [r1], r2
699 vst1.8 {d12}, [r0], r2
700 vst1.8 {d13}, [r1], r2
@ instantiate the inner-edge variant
710 vp8_h_loop_filter8uv _inner, inner=1
@ ff_put_vp8_pixels16_neon: copies 16-byte-wide rows.
@ Visible register use: r0 = destination (128-bit aligned rows), r1 =
@ destination stride, r2 = source (no alignment hint), r3 = source stride,
@ h read from the stack into r12.
@ Four rows are copied per group of instructions.
@ NOTE(review): the loop label, counter update and branch are not shown in
@ this listing.
712 function ff_put_vp8_pixels16_neon, export=1
713 ldr r12, [sp, #0] @ h
716 vld1.8 {q0}, [r2], r3
717 vld1.8 {q1}, [r2], r3
718 vld1.8 {q2}, [r2], r3
719 vld1.8 {q3}, [r2], r3
720 vst1.8 {q0}, [r0,:128], r1
721 vst1.8 {q1}, [r0,:128], r1
722 vst1.8 {q2}, [r0,:128], r1
723 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_pixels8_neon: copies 8-byte-wide rows.
@ Visible register use: r0 = destination (64-bit aligned rows), r1 =
@ destination stride, r2 = source (no alignment hint), r3 = source stride,
@ h read from the stack into r12.
@ Four rows are copied per group of instructions.
@ NOTE(review): the loop label, counter update and branch are not shown in
@ this listing.
728 function ff_put_vp8_pixels8_neon, export=1
729 ldr r12, [sp, #0] @ h
732 vld1.8 {d0}, [r2], r3
733 vld1.8 {d1}, [r2], r3
734 vld1.8 {d2}, [r2], r3
735 vld1.8 {d3}, [r2], r3
736 vst1.8 {d0}, [r0,:64], r1
737 vst1.8 {d1}, [r0,:64], r1
738 vst1.8 {d2}, [r0,:64], r1
739 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_pixels4_neon
@ NOTE(review): only the entry line and the load of h appear in this
@ listing; the copy itself is not shown.
744 function ff_put_vp8_pixels4_neon, export=1
745 ldr r12, [sp, #0] @ h
761 /* 4/6-tap 8th-pel MC */
@ vp8_epel8_h6 d, a, b: horizontal 6-tap filter producing 8 output bytes.
@ a and b are two adjacent 8-byte source registers; d receives the result.
@ The six taps are read from d0[0], d0[1], d0[2], d0[3], d1[0], d1[1].
@ Scratch registers written in the visible code: d27-d31, q10, q11.
@ NOTE(review): the widening of the byte vectors into q8-q13 is not shown
@ in this listing.
763 .macro vp8_epel8_h6 d, a, b
@ byte windows of the pair a:b starting at offsets 1..5
764 vext.8 d27, \a, \b, #1
766 vext.8 d28, \a, \b, #2
768 vext.8 d29, \a, \b, #3
770 vext.8 d30, \a, \b, #4
772 vext.8 d31, \a, \b, #5
@ two partial sums: q10 takes taps 0..2, q11 takes taps 3..5;
@ taps 1 and 4 are subtracted, the others are added
774 vmul.u16 q10, q10, d0[2]
776 vmul.u16 q11, q11, d0[3]
777 vmls.u16 q10, q9, d0[1]
778 vmls.u16 q11, q12, d1[0]
779 vmla.u16 q10, q8, d0[0]
780 vmla.u16 q11, q13, d1[1]
@ saturating sum, then rounding shift by 7 with unsigned saturation to 8 bit
781 vqadd.s16 q11, q10, q11
782 vqrshrun.s16 \d, q11, #7
@ vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1: horizontal 6-tap filter
@ producing 16 output bytes (low 8 in the d0 argument, high 8 in d1).
@ The q0/q1 arguments name the source as two q registers; they are the
@ ones fed to vext below. The six taps are read from the literal registers
@ d0[0..3] and d1[0..1], which are distinct from the macro arguments of the
@ same name.
@ NOTE(review): the widening moves that fill q9-q13 and q1 (and use the
@ s0/s1 arguments) are not shown in this listing.
785 .macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
@ byte windows of the 32-byte pair at offsets 3, 4, 2, 1 and 5
786 vext.8 q14, \q0, \q1, #3
787 vext.8 q15, \q0, \q1, #4
790 vext.8 q3, \q0, \q1, #2
793 vext.8 q8, \q0, \q1, #1
796 vext.8 q2, \q0, \q1, #5
@ four partial sums: q10/q11 for the low 8 pixels, q3/q14 for the high 8
801 vmul.u16 q11, q11, d0[3]
802 vmul.u16 q10, q10, d0[2]
803 vmul.u16 q3, q3, d0[2]
804 vmul.u16 q14, q14, d0[3]
805 vmls.u16 q11, q12, d1[0]
808 vmls.u16 q10, q9, d0[1]
809 vmls.u16 q3, q8, d0[1]
810 vmls.u16 q14, q15, d1[0]
811 vmla.u16 q10, q12, d0[0]
812 vmla.u16 q11, q13, d1[1]
813 vmla.u16 q3, q1, d0[0]
814 vmla.u16 q14, q2, d1[1]
@ saturating sums, then rounding shift by 7 with unsigned saturation to 8 bit
815 vqadd.s16 q11, q10, q11
816 vqadd.s16 q14, q3, q14
817 vqrshrun.s16 \d0, q11, #7
818 vqrshrun.s16 \d1, q14, #7
@ vp8_epel8_v6 d0, s0..s5: vertical 6-tap filter producing one 8-byte row
@ in the d0 argument from six source rows s0..s5.
@ The taps are read from the literal registers d0[0..3] and d1[0..1].
@ NOTE(review): the widening of s0..s5 into q8-q13 is not shown in this
@ listing.
821 .macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
@ two partial sums; taps 1 and 4 are subtracted, the others are added
828 vmul.u16 q10, q10, d0[2]
829 vmul.u16 q11, q11, d0[3]
830 vmls.u16 q10, q9, d0[1]
831 vmls.u16 q11, q12, d1[0]
832 vmla.u16 q10, q8, d0[0]
833 vmla.u16 q11, q13, d1[1]
@ saturating sum, then rounding shift by 7 with unsigned saturation to 8 bit
834 vqadd.s16 q11, q10, q11
835 vqrshrun.s16 \d0, q11, #7
@ vp8_epel8_v6_y2 d0, d1, s0..s6: vertical 6-tap filter producing two
@ 8-byte rows (d0 and d1 arguments) from seven source rows s0..s6.
@ The taps are read from the literal registers d0[0..3] and d1[0..1].
@ NOTE(review): the widening of s0..s6 into q8-q14 is not shown in this
@ listing, so the row-to-register mapping is not documented here.
838 .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
@ q10 + q15 accumulate the first output row, q11 + q14 the second;
@ q15 must be computed from q11 before q11 is overwritten on the next line
846 vmul.u16 q10, q10, d0[0]
847 vmul.u16 q15, q11, d0[3]
848 vmul.u16 q11, q11, d0[2]
849 vmul.u16 q14, q14, d1[1]
850 vmls.u16 q10, q9, d0[1]
851 vmls.u16 q15, q12, d1[0]
852 vmls.u16 q11, q8, d0[1]
853 vmls.u16 q14, q13, d1[0]
854 vmla.u16 q10, q8, d0[2]
855 vmla.u16 q15, q13, d1[1]
856 vmla.u16 q11, q9, d0[0]
857 vmla.u16 q14, q12, d0[3]
@ saturating sums, then rounding shift by 7 with unsigned saturation to 8 bit
858 vqadd.s16 q15, q10, q15
859 vqadd.s16 q14, q11, q14
860 vqrshrun.s16 \d0, q15, #7
861 vqrshrun.s16 \d1, q14, #7
@ vp8_epel8_h4 d, a, b: horizontal 4-tap filter producing 8 output bytes.
@ a and b are two adjacent 8-byte source registers; d receives the result.
@ Only taps d0[1], d0[2], d0[3] and d1[0] are used.
@ Output bytes 0-4 depend only on a; b contributes to output bytes 5-7.
@ NOTE(review): the widening of the byte vectors into q9-q12 is not shown
@ in this listing.
864 .macro vp8_epel8_h4 d, a, b
@ byte windows of the pair a:b starting at offsets 1..3
865 vext.8 d28, \a, \b, #1
867 vext.8 d29, \a, \b, #2
869 vext.8 d30, \a, \b, #3
@ two partial sums; taps d0[1] and d1[0] are subtracted
872 vmul.u16 q10, q10, d0[2]
873 vmul.u16 q11, q11, d0[3]
874 vmls.u16 q10, q9, d0[1]
875 vmls.u16 q11, q12, d1[0]
@ saturating sum, then rounding shift by 7 with unsigned saturation to 8 bit
876 vqadd.s16 q11, q10, q11
877 vqrshrun.s16 \d, q11, #7
@ vp8_epel8_v4_y2 d0, d1, s0..s4: vertical 4-tap filter producing two
@ 8-byte rows (d0 and d1 arguments) from five source rows s0..s4.
@ Only taps d0[1], d0[2], d0[3] and d1[0] (literal registers) are used.
@ NOTE(review): the widening of s0..s4 into q9-q13 is not shown in this
@ listing.
880 .macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
@ q8 + q14 accumulate the first output row, q11 + q15 the second;
@ q14 must be computed from q11 before q11 is overwritten on the next line
886 vmul.u16 q8, q10, d0[2]
887 vmul.u16 q14, q11, d0[3]
888 vmul.u16 q11, q11, d0[2]
889 vmul.u16 q15, q12, d0[3]
890 vmls.u16 q8, q9, d0[1]
891 vmls.u16 q14, q12, d1[0]
892 vmls.u16 q11, q10, d0[1]
893 vmls.u16 q15, q13, d1[0]
@ saturating sums, then rounding shift by 7 with unsigned saturation to 8 bit
894 vqadd.s16 q8, q8, q14
895 vqadd.s16 q11, q11, q15
896 vqrshrun.s16 \d0, q8, #7
897 vqrshrun.s16 \d1, q11, #7
@ ff_put_vp8_epel16_v6_neon: 16-wide vertical 6-tap interpolation.
@ Visible register use: r0 = destination (128-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; my and h are read
@ from the stack. Two output rows are produced per group of instructions.
@ NOTE(review): the register saves behind the #72/#80 stack offsets and
@ the loop label, counter and branch are not shown in this listing.
900 function ff_put_vp8_epel16_v6_neon, export=1
@ start two rows above the first output row
901 sub r2, r2, r3, lsl #1
905 ldr r4, [sp, #80] @ my
@ filter row address = subpel_filters + (my - 1) * 16 (rows are 16 bytes)
906 movrel lr, subpel_filters-16
907 ldr r12, [sp, #72] @ h
908 add r4, lr, r4, lsl #4
909 vld1.16 {q0}, [r4,:128]
@ seven source rows; six loads advance r2, the rewind below leaves a net
@ advance of two rows
911 vld1.8 {d2-d3}, [r2], r3
912 vld1.8 {d4-d5}, [r2], r3
913 vld1.8 {d6-d7}, [r2], r3
914 vld1.8 {d8-d9}, [r2], r3
915 vld1.8 {d10-d11},[r2], r3
916 vld1.8 {d12-d13},[r2], r3
917 vld1.8 {d14-d15},[r2]
918 sub r2, r2, r3, lsl #2
@ left 8 columns use the even d registers, right 8 columns the odd ones
920 vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
921 vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
923 vst1.8 {d2-d3}, [r0,:128], r1
924 vst1.8 {d4-d5}, [r0,:128], r1
@ ff_put_vp8_epel16_h6_neon: 16-wide horizontal 6-tap interpolation.
@ Visible register use: r0 = destination (128-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; mx and h are read
@ from the stack. One output row is produced per group of instructions.
@ NOTE(review): the source pointer adjustment, the register save behind
@ the #8/#12 stack offsets and the loop control are not shown in this
@ listing.
932 function ff_put_vp8_epel16_h6_neon, export=1
936 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16 (rows are 16 bytes)
937 movrel lr, subpel_filters-16
938 ldr r12, [sp, #8] @ h
939 add r4, lr, r4, lsl #4
940 vld1.16 {q0}, [r4,:128]
@ 24 source bytes per row
942 vld1.8 {d2-d4}, [r2], r3
944 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
946 vst1.8 {d2-d3}, [r0,:128], r1
@ ff_put_vp8_epel16_h6v6_neon: 16-wide 6-tap interpolation in both
@ directions, done as a horizontal pass into a buffer addressed through lr
@ followed by a vertical pass from that buffer to [r0].
@ NOTE(review): the buffer reservation, the points where lr is re-pointed
@ at the buffer (it is also used for the filter table address), the
@ register saves and both loops are not shown in this listing. The 336+16
@ term in the second-pass stack offsets is presumably the buffer size plus
@ alignment slack; confirm.
953 function ff_put_vp8_epel16_h6v6_neon, export=1
@ start two rows above the first output row
954 sub r2, r2, r3, lsl #1
959 @ first pass (horizontal):
960 ldr r4, [sp, #28] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
961 movrel lr, subpel_filters-16
962 ldr r12, [sp, #24] @ h
963 add r4, lr, r4, lsl #4
965 vld1.16 {q0}, [r4,:128]
970 vld1.8 {d2,d3,d4}, [r2], r3
972 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
@ append the filtered row to the intermediate buffer
974 vst1.8 {d2-d3}, [lr,:128]!
978 @ second pass (vertical):
979 ldr r4, [sp, #336+16+32] @ my
@ filter row address = subpel_filters + (my - 1) * 16
980 movrel lr, subpel_filters-16
981 ldr r12, [sp, #336+16+24] @ h
982 add r4, lr, r4, lsl #4
984 vld1.16 {q0}, [r4,:128]
@ six 16-byte rows from the buffer; only the first two loads advance lr
987 vld1.8 {d2-d5}, [lr,:128]!
988 vld1.8 {d6-d9}, [lr,:128]!
989 vld1.8 {d28-d31},[lr,:128]
@ left 8 columns use the even d registers, right 8 columns the odd ones
992 vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
993 vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
995 vst1.8 {d2-d3}, [r0,:128], r1
@ ff_put_vp8_epel8_v6_neon: 8-wide vertical 6-tap interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; my and h are read
@ from the stack. Two output rows are produced per group of instructions.
@ NOTE(review): the load of the seventh row into d28, the register save
@ behind the #8/#16 stack offsets and the loop control are not shown in
@ this listing.
1004 function ff_put_vp8_epel8_v6_neon, export=1
@ start two rows above the first output row
1005 sub r2, r2, r3, lsl #1
1008 ldr r4, [sp, #16] @ my
@ filter row address = subpel_filters + (my - 1) * 16 (rows are 16 bytes)
1009 movrel lr, subpel_filters-16
1010 ldr r12, [sp, #8] @ h
1011 add r4, lr, r4, lsl #4
1012 vld1.16 {q0}, [r4,:128]
1014 vld1.8 {d2}, [r2], r3
1015 vld1.8 {d3}, [r2], r3
1016 vld1.8 {d4}, [r2], r3
1017 vld1.8 {d5}, [r2], r3
1018 vld1.8 {d6}, [r2], r3
1019 vld1.8 {d7}, [r2], r3
@ six advancing loads minus four rows: net advance of two rows
1022 sub r2, r2, r3, lsl #2
1024 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
1026 vst1.8 {d2}, [r0,:64], r1
1027 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h6_neon: 8-wide horizontal 6-tap interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; mx and h are read
@ from the stack. One output row is produced per group of instructions.
@ NOTE(review): the source pointer adjustment, the register save behind
@ the #8/#12 stack offsets and the loop control are not shown in this
@ listing.
1034 function ff_put_vp8_epel8_h6_neon, export=1
1038 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16 (rows are 16 bytes)
1039 movrel lr, subpel_filters-16
1040 ldr r12, [sp, #8] @ h
1041 add r4, lr, r4, lsl #4
1042 vld1.16 {q0}, [r4,:128]
@ 16 source bytes per row
1044 vld1.8 {d2,d3}, [r2], r3
1046 vp8_epel8_h6 d2, d2, d3
1048 vst1.8 {d2}, [r0,:64], r1
@ ff_put_vp8_epel8_h6v6_neon: 8-wide interpolation, 6-tap horizontally then
@ 6-tap vertically, using a buffer addressed through lr between the passes.
@ NOTE(review): the buffer reservation, the points where lr is re-pointed
@ at the buffer (it is also used for the filter table address), the
@ register saves and both loops are not shown in this listing. The 168+16
@ term in the second-pass stack offsets is presumably the buffer size plus
@ alignment slack; confirm.
1055 function ff_put_vp8_epel8_h6v6_neon, export=1
@ start two rows above the first output row
1056 sub r2, r2, r3, lsl #1
1060 @ first pass (horizontal):
1061 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1062 movrel lr, subpel_filters-16
1063 ldr r12, [sp, #8] @ h
1064 add r4, lr, r4, lsl #4
1066 vld1.16 {q0}, [r4,:128]
1071 vld1.8 {d2,d3}, [r2], r3
1073 vp8_epel8_h6 d2, d2, d3
@ append the filtered row to the intermediate buffer
1075 vst1.8 {d2}, [lr,:64]!
1079 @ second pass (vertical):
1080 ldr r4, [sp, #168+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1081 movrel lr, subpel_filters-16
1082 ldr r12, [sp, #168+16+8] @ h
1083 add r4, lr, r4, lsl #4
1085 vld1.16 {q0}, [r4,:128]
@ seven 8-byte rows from the buffer (4 + 2 + 1)
1088 vld1.8 {d2-d5}, [lr,:128]!
1089 vld1.8 {d6-d7}, [lr,:128]!
1090 vld1.8 {d30}, [lr,:64]
1093 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1095 vst1.8 {d2}, [r0,:64], r1
1096 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_v4_neon: 8-wide vertical 4-tap interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; my and h are read
@ from the stack. Two output rows are produced per group of instructions.
@ NOTE(review): the initial source adjustment, the load of the fifth row
@ into d6, the register save behind the #8/#16 stack offsets and the loop
@ control are not shown in this listing.
1104 function ff_put_vp8_epel8_v4_neon, export=1
1108 ldr r4, [sp, #16] @ my
@ filter row address = subpel_filters + (my - 1) * 16 (rows are 16 bytes)
1109 movrel lr, subpel_filters-16
1110 ldr r12, [sp, #8] @ h
1111 add r4, lr, r4, lsl #4
1112 vld1.16 {q0}, [r4,:128]
1114 vld1.8 {d2}, [r2], r3
1115 vld1.8 {d3}, [r2], r3
1116 vld1.8 {d4}, [r2], r3
1117 vld1.8 {d5}, [r2], r3
@ four advancing loads minus two rows: net advance of two rows
1119 sub r2, r2, r3, lsl #1
1121 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1123 vst1.8 {d2}, [r0,:64], r1
1124 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h4_neon: 8-wide horizontal 4-tap interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; mx and h are read
@ from the stack. One output row is produced per group of instructions.
@ NOTE(review): the source pointer adjustment, the register save behind
@ the #8/#12 stack offsets and the loop control are not shown in this
@ listing.
1131 function ff_put_vp8_epel8_h4_neon, export=1
1135 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16 (rows are 16 bytes)
1136 movrel lr, subpel_filters-16
1137 ldr r12, [sp, #8] @ h
1138 add r4, lr, r4, lsl #4
1139 vld1.16 {q0}, [r4,:128]
@ 16 source bytes per row
1141 vld1.8 {d2,d3}, [r2], r3
1143 vp8_epel8_h4 d2, d2, d3
1145 vst1.8 {d2}, [r0,:64], r1
@ ff_put_vp8_epel8_h4v4_neon: 8-wide interpolation, 4-tap horizontally then
@ 4-tap vertically, using a buffer addressed through lr between the passes.
@ NOTE(review): the source adjustment, the buffer reservation, the points
@ where lr is re-pointed at the buffer (it is also used for the filter
@ table address), the register saves and both loops are not shown in this
@ listing. The 168+16 term in the second-pass stack offsets is presumably
@ the buffer size plus alignment slack; confirm.
1152 function ff_put_vp8_epel8_h4v4_neon, export=1
1157 @ first pass (horizontal):
1158 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1159 movrel lr, subpel_filters-16
1160 ldr r12, [sp, #8] @ h
1161 add r4, lr, r4, lsl #4
1163 vld1.16 {q0}, [r4,:128]
1168 vld1.8 {d2,d3}, [r2], r3
1170 vp8_epel8_h4 d2, d2, d3
@ append the filtered row to the intermediate buffer
1172 vst1.8 {d2}, [lr,:64]!
1176 @ second pass (vertical):
1177 ldr r4, [sp, #168+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1178 movrel lr, subpel_filters-16
1179 ldr r12, [sp, #168+16+8] @ h
1180 add r4, lr, r4, lsl #4
1182 vld1.16 {q0}, [r4,:128]
@ five 8-byte rows from the buffer (4 + 1)
1185 vld1.8 {d2-d5}, [lr,:128]!
1186 vld1.8 {d6}, [lr,:64]
1189 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1191 vst1.8 {d2}, [r0,:64], r1
1192 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h6v4_neon: 8-wide interpolation, 6-tap horizontally then
@ 4-tap vertically, using a buffer addressed through lr between the passes.
@ NOTE(review): the source adjustment, the buffer reservation, the points
@ where lr is re-pointed at the buffer (it is also used for the filter
@ table address), the register saves and both loops are not shown in this
@ listing. The 168+16 term in the second-pass stack offsets is presumably
@ the buffer size plus alignment slack; confirm.
1200 function ff_put_vp8_epel8_h6v4_neon, export=1
1205 @ first pass (horizontal):
1206 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1207 movrel lr, subpel_filters-16
1208 ldr r12, [sp, #8] @ h
1209 add r4, lr, r4, lsl #4
1211 vld1.16 {q0}, [r4,:128]
1216 vld1.8 {d2,d3}, [r2], r3
1218 vp8_epel8_h6 d2, d2, d3
@ append the filtered row to the intermediate buffer
1220 vst1.8 {d2}, [lr,:64]!
1224 @ second pass (vertical):
1225 ldr r4, [sp, #168+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1226 movrel lr, subpel_filters-16
1227 ldr r12, [sp, #168+16+8] @ h
1228 add r4, lr, r4, lsl #4
1230 vld1.16 {q0}, [r4,:128]
@ five 8-byte rows from the buffer (4 + 1)
1233 vld1.8 {d2-d5}, [lr,:128]!
1234 vld1.8 {d6}, [lr,:64]
1237 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1239 vst1.8 {d2}, [r0,:64], r1
1240 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h4v6_neon: 8-wide interpolation, 4-tap horizontally then
@ 6-tap vertically, using a buffer addressed through lr between the passes.
@ NOTE(review): the buffer reservation, the points where lr is re-pointed
@ at the buffer (it is also used for the filter table address), the
@ register saves and both loops are not shown in this listing. The 168+16
@ term in the second-pass stack offsets is presumably the buffer size plus
@ alignment slack; confirm.
1248 function ff_put_vp8_epel8_h4v6_neon, export=1
@ start two rows above the first output row
1249 sub r2, r2, r3, lsl #1
1253 @ first pass (horizontal):
1254 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1255 movrel lr, subpel_filters-16
1256 ldr r12, [sp, #8] @ h
1257 add r4, lr, r4, lsl #4
1259 vld1.16 {q0}, [r4,:128]
1264 vld1.8 {d2,d3}, [r2], r3
1266 vp8_epel8_h4 d2, d2, d3
@ append the filtered row to the intermediate buffer
1268 vst1.8 {d2}, [lr,:64]!
1272 @ second pass (vertical):
1273 ldr r4, [sp, #168+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1274 movrel lr, subpel_filters-16
1275 ldr r12, [sp, #168+16+8] @ h
1276 add r4, lr, r4, lsl #4
1278 vld1.16 {q0}, [r4,:128]
@ seven 8-byte rows from the buffer (4 + 2 + 1)
1281 vld1.8 {d2-d5}, [lr,:128]!
1282 vld1.8 {d6-d7}, [lr,:128]!
1283 vld1.8 {d30}, [lr,:64]
1286 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1288 vst1.8 {d2}, [r0,:64], r1
1289 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel4_v6_neon: 4-wide vertical 6-tap interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; my and h are read
@ from the stack. Four output rows are produced per group of instructions
@ by packing two 4-byte rows into each 8-byte register.
@ NOTE(review): the register save behind the #8/#16 stack offsets and the
@ loop control are not shown in this listing.
1297 function ff_put_vp8_epel4_v6_neon, export=1
@ start two rows above the first output row
1298 sub r2, r2, r3, lsl #1
1301 ldr r4, [sp, #16] @ my
@ filter row address = subpel_filters + (my - 1) * 16 (rows are 16 bytes)
1302 movrel lr, subpel_filters-16
1303 ldr r12, [sp, #8] @ h
1304 add r4, lr, r4, lsl #4
1305 vld1.16 {q0}, [r4,:128]
@ rows n..n+6, each replicated into both 32-bit lanes
1307 vld1.32 {d2[]}, [r2], r3
1308 vld1.32 {d3[]}, [r2], r3
1309 vld1.32 {d4[]}, [r2], r3
1310 vld1.32 {d5[]}, [r2], r3
1311 vld1.32 {d6[]}, [r2], r3
1312 vld1.32 {d7[]}, [r2], r3
1313 vld1.32 {d28[]}, [r2]
@ six advancing loads minus four rows: r2 now points at row n+2
1314 sub r2, r2, r3, lsl #2
@ rows n+2..n+8 replace lane 1, so each register holds rows k and k+2
1315 vld1.32 {d2[1]}, [r2], r3
1316 vld1.32 {d3[1]}, [r2], r3
1317 vld1.32 {d4[1]}, [r2], r3
1318 vld1.32 {d5[1]}, [r2], r3
1319 vld1.32 {d6[1]}, [r2], r3
1320 vld1.32 {d7[1]}, [r2], r3
1321 vld1.32 {d28[1]}, [r2]
@ net advance of four rows for the whole group
1322 sub r2, r2, r3, lsl #2
1324 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
@ lane 0 holds output rows 0 and 1, lane 1 holds output rows 2 and 3
1326 vst1.32 {d2[0]}, [r0,:32], r1
1327 vst1.32 {d3[0]}, [r0,:32], r1
1328 vst1.32 {d2[1]}, [r0,:32], r1
1329 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h6_neon: 4-wide horizontal 6-tap interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; mx and h are read
@ from the stack. The 8-wide macro is reused and only the low four result
@ bytes are stored.
@ NOTE(review): the source pointer adjustment, the register save behind
@ the #8/#12 stack offsets and the loop control are not shown in this
@ listing.
1336 function ff_put_vp8_epel4_h6_neon, export=1
1340 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16 (rows are 16 bytes)
1341 movrel lr, subpel_filters-16
1342 ldr r12, [sp, #8] @ h
1343 add r4, lr, r4, lsl #4
1344 vld1.16 {q0}, [r4,:128]
1346 vld1.8 {q1}, [r2], r3
1347 vp8_epel8_h6 d2, d2, d3
1348 vst1.32 {d2[0]}, [r0,:32], r1
@ ff_put_vp8_epel4_h6v6_neon: 4-wide interpolation, 6-tap horizontally then
@ 6-tap vertically, using a buffer of 4-byte rows addressed through lr.
@ Four output rows are stored per group of second-pass instructions.
@ NOTE(review): the buffer reservation, the points where lr is re-pointed
@ or stepped back between the two groups of loads, the register saves and
@ both loops are not shown in this listing. The 52+16 term in the
@ second-pass stack offsets is presumably the buffer size plus alignment
@ slack; confirm.
1355 function ff_put_vp8_epel4_h6v6_neon, export=1
@ start two rows above the first output row
1356 sub r2, r2, r3, lsl #1
1360 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1361 movrel lr, subpel_filters-16
1362 ldr r12, [sp, #8] @ h
1363 add r4, lr, r4, lsl #4
1365 vld1.16 {q0}, [r4,:128]
1370 vld1.8 {q1}, [r2], r3
1371 vp8_epel8_h6 d2, d2, d3
@ keep only the low four filtered bytes and append them to the buffer
1372 vst1.32 {d2[0]}, [lr,:32]!
1376 ldr r4, [sp, #52+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1377 movrel lr, subpel_filters-16
1378 ldr r12, [sp, #52+16+8] @ h
1379 add r4, lr, r4, lsl #4
1381 vld1.16 {q0}, [r4,:128]
@ first group: 16 + 8 + 4 buffer bytes into d2-d3, d6 and d28
1384 vld1.8 {d2-d3}, [lr,:128]!
1385 vld1.8 {d6}, [lr,:64]!
1386 vld1.32 {d28[]}, [lr,:32]
@ second group: 16 + 8 + 4 buffer bytes into d4-d5, d7 and lane 1 of d28
1388 vld1.8 {d4-d5}, [lr]!
1389 vld1.8 {d7}, [lr,:64]!
1390 vld1.32 {d28[1]}, [lr,:32]
@ note the interleaved source order d2, d4, d3, d5
1394 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1395 vst1.32 {d2[0]}, [r0,:32], r1
1396 vst1.32 {d3[0]}, [r0,:32], r1
1397 vst1.32 {d2[1]}, [r0,:32], r1
1398 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h4v6_neon: 4-wide interpolation, 4-tap horizontally then
@ 6-tap vertically, using a buffer of 4-byte rows addressed through lr.
@ Four output rows are stored per group of second-pass instructions.
@ NOTE(review): the buffer reservation, the points where lr is re-pointed
@ or stepped back between the two groups of loads, the register saves and
@ both loops are not shown in this listing. The 52+16 term in the
@ second-pass stack offsets is presumably the buffer size plus alignment
@ slack; confirm.
1406 function ff_put_vp8_epel4_h4v6_neon, export=1
@ start two rows above the first output row
1407 sub r2, r2, r3, lsl #1
1411 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1412 movrel lr, subpel_filters-16
1413 ldr r12, [sp, #8] @ h
1414 add r4, lr, r4, lsl #4
1416 vld1.16 {q0}, [r4,:128]
1421 vld1.8 {d2}, [r2], r3
@ d2 is passed as both sources: only the low four result bytes are stored,
@ and those depend on the first source register alone
1422 vp8_epel8_h4 d2, d2, d2
1423 vst1.32 {d2[0]}, [lr,:32]!
1427 ldr r4, [sp, #52+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1428 movrel lr, subpel_filters-16
1429 ldr r12, [sp, #52+16+8] @ h
1430 add r4, lr, r4, lsl #4
1432 vld1.16 {q0}, [r4,:128]
@ first group: 16 + 8 + 4 buffer bytes into d2-d3, d6 and d28
1435 vld1.8 {d2-d3}, [lr,:128]!
1436 vld1.8 {d6}, [lr,:64]!
1437 vld1.32 {d28[]}, [lr,:32]
@ second group: 16 + 8 + 4 buffer bytes into d4-d5, d7 and lane 1 of d28
1439 vld1.8 {d4-d5}, [lr]!
1440 vld1.8 {d7}, [lr,:64]!
1441 vld1.32 {d28[1]}, [lr,:32]
@ note the interleaved source order d2, d4, d3, d5
1445 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1446 vst1.32 {d2[0]}, [r0,:32], r1
1447 vst1.32 {d3[0]}, [r0,:32], r1
1448 vst1.32 {d2[1]}, [r0,:32], r1
1449 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h6v4_neon: 4-wide interpolation, 6-tap horizontally then
@ 4-tap vertically, using a buffer of 4-byte rows addressed through lr.
@ Four output rows are stored per group of second-pass instructions.
@ NOTE(review): the source adjustment, the buffer reservation, the points
@ where lr is re-pointed or stepped back between the two groups of loads,
@ the register saves and both loops are not shown in this listing. The
@ 44+16 term in the second-pass stack offsets is presumably the buffer size
@ plus alignment slack; confirm.
1457 function ff_put_vp8_epel4_h6v4_neon, export=1
1462 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1463 movrel lr, subpel_filters-16
1464 ldr r12, [sp, #8] @ h
1465 add r4, lr, r4, lsl #4
1467 vld1.16 {q0}, [r4,:128]
1472 vld1.8 {q1}, [r2], r3
1473 vp8_epel8_h6 d2, d2, d3
@ keep only the low four filtered bytes and append them to the buffer
1474 vst1.32 {d2[0]}, [lr,:32]!
1478 ldr r4, [sp, #44+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1479 movrel lr, subpel_filters-16
1480 ldr r12, [sp, #44+16+8] @ h
1481 add r4, lr, r4, lsl #4
1483 vld1.16 {q0}, [r4,:128]
@ first group: 16 + 4 buffer bytes into d2-d3 and d6
1486 vld1.8 {d2-d3}, [lr,:128]!
1487 vld1.32 {d6[]}, [lr,:32]
@ second group: 16 + 4 buffer bytes into d4-d5 and lane 1 of d6
1489 vld1.8 {d4-d5}, [lr]!
1490 vld1.32 {d6[1]}, [lr,:32]
@ note the interleaved source order d2, d4, d3, d5
1493 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1494 vst1.32 {d2[0]}, [r0,:32], r1
1495 vst1.32 {d3[0]}, [r0,:32], r1
1496 vst1.32 {d2[1]}, [r0,:32], r1
1497 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h4_neon: 4-wide horizontal 4-tap interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; mx and h are read
@ from the stack. The 8-wide macro is reused and only the low four result
@ bytes are stored.
@ NOTE(review): the source pointer adjustment, the register save behind
@ the #8/#12 stack offsets and the loop control are not shown in this
@ listing.
1505 function ff_put_vp8_epel4_h4_neon, export=1
1509 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16 (rows are 16 bytes)
1510 movrel lr, subpel_filters-16
1511 ldr r12, [sp, #8] @ h
1512 add r4, lr, r4, lsl #4
1513 vld1.16 {q0}, [r4,:128]
1515 vld1.8 {d2}, [r2], r3
@ d2 is passed as both sources: the four stored bytes depend on the first
@ source register alone
1516 vp8_epel8_h4 d2, d2, d2
1517 vst1.32 {d2[0]}, [r0,:32], r1
@ ff_put_vp8_epel4_v4_neon: 4-wide vertical 4-tap interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r1 =
@ destination stride, r2 = source, r3 = source stride; my and h are read
@ from the stack. Four output rows are produced per group of instructions
@ by packing two 4-byte rows into each 8-byte register.
@ NOTE(review): the initial source adjustment, the register save behind
@ the #8/#16 stack offsets and the loop control are not shown in this
@ listing.
1524 function ff_put_vp8_epel4_v4_neon, export=1
1528 ldr r4, [sp, #16] @ my
@ filter row address = subpel_filters + (my - 1) * 16 (rows are 16 bytes)
1529 movrel lr, subpel_filters-16
1530 ldr r12, [sp, #8] @ h
1531 add r4, lr, r4, lsl #4
1532 vld1.16 {q0}, [r4,:128]
@ rows n..n+4, each replicated into both 32-bit lanes
1534 vld1.32 {d2[]}, [r2], r3
1535 vld1.32 {d3[]}, [r2], r3
1536 vld1.32 {d4[]}, [r2], r3
1537 vld1.32 {d5[]}, [r2], r3
1538 vld1.32 {d6[]}, [r2]
@ four advancing loads minus two rows: r2 now points at row n+2
1539 sub r2, r2, r3, lsl #1
@ rows n+2..n+6 replace lane 1, so each register holds rows k and k+2
1540 vld1.32 {d2[1]}, [r2], r3
1541 vld1.32 {d3[1]}, [r2], r3
1542 vld1.32 {d4[1]}, [r2], r3
1543 vld1.32 {d5[1]}, [r2], r3
1544 vld1.32 {d6[1]}, [r2]
@ net advance of four rows for the whole group
1545 sub r2, r2, r3, lsl #1
1547 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
@ lane 0 holds output rows 0 and 1, lane 1 holds output rows 2 and 3
1549 vst1.32 {d2[0]}, [r0,:32], r1
1550 vst1.32 {d3[0]}, [r0,:32], r1
1551 vst1.32 {d2[1]}, [r0,:32], r1
1552 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h4v4_neon: 4-wide interpolation, 4-tap horizontally then
@ 4-tap vertically, using a buffer of 4-byte rows addressed through lr.
@ Four output rows are stored per group of second-pass instructions.
@ NOTE(review): the source adjustment, the buffer reservation, the points
@ where lr is re-pointed or stepped back between the two groups of loads,
@ the register saves and both loops are not shown in this listing. The
@ 44+16 term in the second-pass stack offsets is presumably the buffer size
@ plus alignment slack; confirm.
1559 function ff_put_vp8_epel4_h4v4_neon, export=1
1564 ldr r4, [sp, #12] @ mx
@ filter row address = subpel_filters + (mx - 1) * 16
1565 movrel lr, subpel_filters-16
1566 ldr r12, [sp, #8] @ h
1567 add r4, lr, r4, lsl #4
1569 vld1.16 {q0}, [r4,:128]
1574 vld1.8 {d2}, [r2], r3
@ NOTE(review): d3 is passed as the second source although only d2 is
@ loaded in the visible code. The four bytes stored below depend on the
@ first source register alone, so the contents of d3 do not reach the
@ output; confirm this is intended.
1575 vp8_epel8_h4 d2, d2, d3
1576 vst1.32 {d2[0]}, [lr,:32]!
1580 ldr r4, [sp, #44+16+16] @ my
@ filter row address = subpel_filters + (my - 1) * 16
1581 movrel lr, subpel_filters-16
1582 ldr r12, [sp, #44+16+8] @ h
1583 add r4, lr, r4, lsl #4
1585 vld1.16 {q0}, [r4,:128]
@ first group: 16 + 4 buffer bytes into d2-d3 and d6
1588 vld1.8 {d2-d3}, [lr,:128]!
1589 vld1.32 {d6[]}, [lr,:32]
@ second group: 16 + 4 buffer bytes into d4-d5 and lane 1 of d6
1591 vld1.8 {d4-d5}, [lr]!
1592 vld1.32 {d6[1]}, [lr,:32]
@ note the interleaved source order d2, d4, d3, d5
1595 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1596 vst1.32 {d2[0]}, [r0,:32], r1
1597 vst1.32 {d3[0]}, [r0,:32], r1
1598 vst1.32 {d2[1]}, [r0,:32], r1
1599 vst1.32 {d3[1]}, [r0,:32], r1
1607 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1608 @ arithmetic can be used to apply filters
@ subpel_filters: seven rows of eight halfwords (16 bytes per row).
@ The first six entries of a row are the filter tap magnitudes; the last
@ two are zero padding so a whole row can be loaded as one q register.
@ The filter macros in this file subtract the second and fifth taps and
@ add the others. Callers address the table as subpel_filters-16 plus
@ index*16, so index 1 selects the first row.
1609 const subpel_filters, align=4
1610 .short 0, 6, 123, 12, 1, 0, 0, 0
1611 .short 2, 11, 108, 36, 8, 1, 0, 0
1612 .short 0, 9, 93, 50, 6, 0, 0, 0
1613 .short 3, 16, 77, 77, 16, 3, 0, 0
1614 .short 0, 6, 50, 93, 9, 0, 0, 0
1615 .short 1, 8, 36, 108, 11, 2, 0, 0
1616 .short 0, 1, 12, 123, 6, 0, 0, 0
@ ff_put_vp8_bilin16_h_neon: 16-wide horizontal bilinear interpolation.
@ Visible register use: r0 = destination (128-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx is read from the stack.
@ Two output rows are produced per group of instructions.
@ NOTE(review): the setup of the weights in d0/d1, the multiplies for the
@ first row (q8, q3) and the loop control are not shown in this listing -
@ d0 presumably holds mx and d1 holds 8 - mx; confirm.
1621 function ff_put_vp8_bilin16_h_neon, export=1
1622 ldr r3, [sp, #4] @ mx
@ first row: 24 source bytes, q2 = the same row shifted by one pixel
1629 vld1.8 {d2-d4}, [r2], r1
1630 vext.8 q2, q1, q2, #1
@ second row, same layout in q9/q10
1633 vld1.8 {d18-d20},[r2], r1
1636 vext.8 q10, q9, q10, #1
@ pixel * d1 + next pixel * d0, widened to 16 bit
1637 vmull.u8 q11, d18, d1
1638 vmlal.u8 q11, d20, d0
1639 vmull.u8 q12, d19, d1
1640 vmlal.u8 q12, d21, d0
@ rounding shift by 3 and narrow back to bytes
1641 vrshrn.u16 d4, q8, #3
1642 vrshrn.u16 d5, q3, #3
1643 vrshrn.u16 d6, q11, #3
1644 vrshrn.u16 d7, q12, #3
1645 vst1.8 {q2}, [r0,:128], r1
1646 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_bilin16_v_neon: 16-wide vertical bilinear interpolation.
@ Visible register use: r0 = destination (128-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; my is read from the stack.
@ Two output rows are produced per group of instructions.
@ NOTE(review): the setup of the weights in d0/d1, most of the multiplies
@ (q3, q8, q9) and the loop control are not shown in this listing - d0
@ presumably holds my and d1 holds 8 - my; confirm.
1652 function ff_put_vp8_bilin16_v_neon, export=1
1653 ldr r3, [sp, #8] @ my
@ q1 and q2 alternate as the upper and lower source row
1658 vld1.8 {q1}, [r2], r1
1661 vld1.8 {q2}, [r2], r1
1666 vld1.8 {q1}, [r2], r1
@ row in q2 weighted by d1 plus the following row in q1 weighted by d0
1669 vmull.u8 q10, d5, d1
1670 vmlal.u8 q10, d3, d0
@ rounding shift by 3 and narrow back to bytes
1671 vrshrn.u16 d4, q3, #3
1672 vrshrn.u16 d5, q8, #3
1673 vrshrn.u16 d6, q9, #3
1674 vrshrn.u16 d7, q10, #3
1675 vst1.8 {q2}, [r0,:128], r1
1676 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_bilin16_hv_neon: 16-wide bilinear interpolation in both
@ directions.
@ Visible register use: r0 = destination (128-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx and my are read from the
@ stack (both through r3). Horizontal weights are taken from d0/d1 and
@ vertical weights from d2/d3. Two output rows are produced per group.
@ NOTE(review): the weight setup, the horizontal multiplies for the very
@ first row and the loop control are not shown in this listing.
1682 function ff_put_vp8_bilin16_hv_neon, export=1
1683 ldr r3, [sp, #4] @ mx
1687 ldr r3, [sp, #8] @ my
@ first source row, horizontally filtered into q2 (d4, d5)
1693 vld1.8 {d4-d6}, [r2], r1
1694 vext.8 q3, q2, q3, #1
1699 vrshrn.u16 d4, q8, #3
1700 vrshrn.u16 d5, q9, #3
@ next two source rows: horizontal filter into q11/q12 and q8/q9
1703 vld1.8 {d18-d20},[r2], r1
1704 vext.8 q10, q9, q10, #1
1705 vmull.u8 q11, d18, d1
1706 vmlal.u8 q11, d20, d0
1707 vld1.8 {d26-d28},[r2], r1
1708 vmull.u8 q12, d19, d1
1709 vmlal.u8 q12, d21, d0
1710 vext.8 q14, q13, q14, #1
1711 vmull.u8 q8, d26, d1
1712 vmlal.u8 q8, d28, d0
1713 vmull.u8 q9, d27, d1
1714 vmlal.u8 q9, d29, d0
@ q3 = horizontally filtered middle row
1715 vrshrn.u16 d6, q11, #3
1716 vrshrn.u16 d7, q12, #3
@ vertical blend of previous row (q2) and middle row (q3)
1717 vmull.u8 q12, d4, d3
1718 vmlal.u8 q12, d6, d2
1719 vmull.u8 q15, d5, d3
1720 vmlal.u8 q15, d7, d2
@ q2 = horizontally filtered last row; it is also the previous row for
@ the next group
1721 vrshrn.u16 d4, q8, #3
1722 vrshrn.u16 d5, q9, #3
@ vertical blend of middle row (q3) and last row (q2)
1723 vmull.u8 q10, d6, d3
1724 vmlal.u8 q10, d4, d2
1725 vmull.u8 q11, d7, d3
1726 vmlal.u8 q11, d5, d2
1727 vrshrn.u16 d24, q12, #3
1728 vrshrn.u16 d25, q15, #3
1729 vst1.8 {q12}, [r0,:128], r1
1730 vrshrn.u16 d20, q10, #3
1731 vrshrn.u16 d21, q11, #3
1732 vst1.8 {q10}, [r0,:128], r1
@ ff_put_vp8_bilin8_h_neon: 8-wide horizontal bilinear interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx is read from the stack.
@ Two output rows are produced per group of instructions.
@ NOTE(review): the weight setup, the multiplies that fill q2 and q8 and
@ the loop control are not shown in this listing.
1738 function ff_put_vp8_bilin8_h_neon, export=1
1739 ldr r3, [sp, #4] @ mx
@ 16 source bytes per row; the high half becomes the row shifted by one pixel
1746 vld1.8 {q1}, [r2], r1
1747 vext.8 d3, d2, d3, #1
1750 vld1.8 {q3}, [r2], r1
1751 vext.8 d7, d6, d7, #1
@ rounding shift by 3 and narrow back to bytes
1754 vrshrn.u16 d4, q2, #3
1755 vrshrn.u16 d16, q8, #3
1756 vst1.8 {d4}, [r0,:64], r1
1757 vst1.8 {d16}, [r0,:64], r1
@ ff_put_vp8_bilin8_v_neon: 8-wide vertical bilinear interpolation.
@ Visible register use: r0 = destination (64-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; my is read from the stack.
@ Two output rows are produced per group of instructions.
@ NOTE(review): the weight setup, the multiplies that fill q2 and q3 and
@ the loop control are not shown in this listing.
1763 function ff_put_vp8_bilin8_v_neon, export=1
1764 ldr r3, [sp, #8] @ my
@ d2 and d3 alternate as the upper and lower source row
1769 vld1.8 {d2}, [r2], r1
1772 vld1.8 {d3}, [r2], r1
1775 vld1.8 {d2}, [r2], r1
@ rounding shift by 3 and narrow back to bytes
1778 vrshrn.u16 d4, q2, #3
1779 vrshrn.u16 d6, q3, #3
1780 vst1.8 {d4}, [r0,:64], r1
1781 vst1.8 {d6}, [r0,:64], r1
@ ff_put_vp8_bilin8_hv_neon: 8-wide bilinear interpolation in both
@ directions.
@ Visible register use: r0 = destination (64-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx and my are read from the
@ stack (both through r3). Vertical weights are taken from d2/d3. Two
@ output rows are produced per group of instructions.
@ NOTE(review): the weight setup, the horizontal multiplies that fill q8
@ and q9 and the loop control are not shown in this listing.
1787 function ff_put_vp8_bilin8_hv_neon, export=1
1788 ldr r3, [sp, #4] @ mx
1792 ldr r3, [sp, #8] @ my
@ first source row; d22 = its horizontally filtered result
1798 vld1.8 {q2}, [r2], r1
1799 vext.8 d5, d4, d5, #1
1802 vrshrn.u16 d22, q9, #3
@ next two source rows, each paired with itself shifted by one pixel
1805 vld1.8 {q3}, [r2], r1
1806 vext.8 d7, d6, d7, #1
1809 vld1.8 {q2}, [r2], r1
1810 vext.8 d5, d4, d5, #1
@ d16 = filtered middle row; blend previous row (d22) with it
1813 vrshrn.u16 d16, q8, #3
1814 vmull.u8 q10, d22, d3
1815 vmlal.u8 q10, d16, d2
@ d22 = filtered last row; blend middle row (d16) with it
1816 vrshrn.u16 d22, q9, #3
1817 vmull.u8 q12, d16, d3
1818 vmlal.u8 q12, d22, d2
1819 vrshrn.u16 d20, q10, #3
1820 vst1.8 {d20}, [r0,:64], r1
1821 vrshrn.u16 d23, q12, #3
1822 vst1.8 {d23}, [r0,:64], r1
@ ff_put_vp8_bilin4_h_neon: 4-wide horizontal bilinear interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx is read from the stack.
@ Two 4-byte output rows are packed into d4 and stored per group.
@ NOTE(review): the weight setup, the multiplies that fill q2 and the loop
@ control are not shown in this listing.
1828 function ff_put_vp8_bilin4_h_neon, export=1
1829 ldr r3, [sp, #4] @ mx
@ NOTE(review): vext reads d3 / d7 as the upper source although only d2 /
@ d6 are loaded in the visible code; the shifted-in top byte is presumably
@ outside the four pixels that get stored - confirm.
1836 vld1.8 {d2}, [r2], r1
1837 vext.8 d3, d2, d3, #1
1838 vld1.8 {d6}, [r2], r1
1839 vext.8 d7, d6, d7, #1
@ rounding shift by 3 and narrow back to bytes
1843 vrshrn.u16 d4, q2, #3
1844 vst1.32 {d4[0]}, [r0,:32], r1
1845 vst1.32 {d4[1]}, [r0,:32], r1
@ ff_put_vp8_bilin4_v_neon: 4-wide vertical bilinear interpolation.
@ Visible register use: r0 = destination (32-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; my is read from the stack.
@ Two 4-byte output rows are packed into d4 and stored per group.
@ NOTE(review): the weight setup, the multiplies that fill q2 and the loop
@ control are not shown in this listing.
1851 function ff_put_vp8_bilin4_v_neon, export=1
1852 ldr r3, [sp, #8] @ my
@ row n replicated into d2
1857 vld1.32 {d2[]}, [r2], r1
@ row n+1 replicated into d3 (r2 not advanced), then again into lane 1 of
@ d2, and row n+2 into lane 1 of d3: d2 = {n, n+1}, d3 = {n+1, n+2}
1859 vld1.32 {d3[]}, [r2]
1860 vld1.32 {d2[1]}, [r2], r1
1861 vld1.32 {d3[1]}, [r2], r1
@ rounding shift by 3 and narrow back to bytes
1865 vrshrn.u16 d4, q2, #3
1866 vst1.32 {d4[0]}, [r0,:32], r1
1867 vst1.32 {d4[1]}, [r0,:32], r1
@ ff_put_vp8_bilin4_hv_neon: 4-wide bilinear interpolation in both
@ directions.
@ Visible register use: r0 = destination (32-bit aligned rows), r2 =
@ source, r1 = step applied to both pointers; mx and my are read from the
@ stack (both through r3). Vertical weights are taken from d2/d3. Two
@ 4-byte output rows are packed into d20 and stored per group.
@ NOTE(review): the weight setup, the horizontal multiplies that fill q8
@ and q9, the lane shuffling between d16 and d22 and the loop control are
@ not shown in this listing.
1874 function ff_put_vp8_bilin4_hv_neon, export=1
1875 ldr r3, [sp, #4] @ mx
1879 ldr r3, [sp, #8] @ my
@ first source row and the same row rotated by one byte
1885 vld1.8 {d4}, [r2], r1
1886 vext.8 d5, d4, d4, #1
1889 vrshrn.u16 d22, q9, #3
@ next two source rows, each paired with itself rotated by one byte
1892 vld1.8 {d6}, [r2], r1
1893 vext.8 d7, d6, d6, #1
1894 vld1.8 {d4}, [r2], r1
1895 vext.8 d5, d4, d4, #1
@ vertical blend: d16 weighted by d2 plus d22 weighted by d3
1899 vrshrn.u16 d16, q8, #3
1900 vmull.u8 q10, d16, d2
1902 vmlal.u8 q10, d22, d3
@ rounding shift by 3 and narrow back to bytes
1904 vrshrn.u16 d20, q10, #3
1905 vst1.32 {d20[0]}, [r0,:32], r1
1906 vst1.32 {d20[1]}, [r0,:32], r1