/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

function ff_vp8_luma_dc_wht_neon, export=1
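@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@ (prototype per vp8dsp.h; r0 = block, r1 = dc).  Inverse 4x4
@ Walsh-Hadamard transform of the 16 luma DC values: the dc buffer is
@ cleared by storing q15 (zero) back over it, and each result is written
@ to block[i][j][0], one int16 every 32 bytes, which is the stride r3 is
@ expected to carry for the element stores below.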
        vld1.16         {q0-q1},  [r1,:128]
        vst1.16         {q15},    [r1,:128]!
        vst1.16         {q15},    [r1,:128]

        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
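@ d4 holds the packed transform constants {20091, 35468/2}: per the VP8
@ spec, 20091 = (sqrt(2)*cos(pi/8) - 1) * 65536 and
@ 35468 = sqrt(2)*sin(pi/8) * 65536, so for a 16-bit coefficient x
@   x*sqrt(2)*cos(pi/8) ~= x + ((x * 20091) >> 16)    (vmull + vshrn + vadd)
@   x*sqrt(2)*sin(pi/8) ~= (2*x*(35468/2)) >> 16      (vqdmulh)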
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22

        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vld1.32         {d23[]},  [r0,:32], r2
        vld1.32         {d21[]},  [r0,:32], r2

        sub             r0,  r0,  r2,  lsl #2

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

function ff_vp8_idct_dc_add_neon, export=1
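@ DC-only special case: dc = (block[0] + 4) >> 3, then every pixel of
@ the 4x4 block becomes clip_uint8(dst[y][x] + dc).  Rows 0/2 travel in
@ d0 and rows 1/3 in d1, so the add and clip run on full registers.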
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2

        sub             r0,  r0,  r2,  lsl #2

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2

function ff_vp8_idct_dc_add4uv_neon, export=1
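@ Four DC-only 4x4 blocks at once: each vld1.16 {dN[]} below reads one
@ block's DC coefficient and the paired vst1.16 writes a zero back from
@ d0 (expected to be cleared beforehand), with r3 carrying the 32-byte
@ stride from one block's coefficients to the next.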
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3

        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vld1.8          {d1},     [r0,:64], r2
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64], r2
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64], r2
        vld1.8          {d6},     [r0,:64], r2
        vld1.8          {d7},     [r0,:64], r2

        vst1.8          {d20},    [r3,:64], r2
        vst1.8          {d21},    [r3,:64], r2
        vst1.8          {d22},    [r3,:64], r2
        vst1.8          {d23},    [r3,:64], r2
        vst1.8          {d24},    [r3,:64], r2
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

function ff_vp8_idct_dc_add4y_neon, export=1
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3

        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q2},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128], r2

        sub             r0,  r0,  r2,  lsl #2

        vst1.8          {q10},    [r0,:128], r2
        vst1.8          {q11},    [r0,:128], r2
        vst1.8          {q12},    [r0,:128], r2
        vst1.8          {q13},    [r0,:128], r2

.macro  vp8_loop_filter, inner=0, simple=0
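@ Shared edge filter.  Callers load eight pixel rows (or transposed
@ columns) into q0..q7 as P3 P2 P1 P0 Q0 Q1 Q2 Q3 and set q14 = flim_E,
@ q15 = flim_I (full/inner filter only) and r12 = hev_thresh.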
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim

        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16 bits)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12
        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
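@ q10 now holds the common filter value
@   w = clamp(3*(QS0-PS0) + (hev ? clamp(PS1-QS1) : 0))
@ computed in 16 bits and narrowed back to 8 with saturation.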
  .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q7 -> Q3  (don't corrupt)
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80

        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80

        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @   a = clamp((27*w + 63) >> 7);
        @   a = clamp((18*w + 63) >> 7);
        @   a = clamp(( 9*w + 63) >> 7);
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             r0,  r0,  r1,  lsl #1+!\simple
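@ r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I; hev_thresh lives on
@ the stack, read at [sp, #64] past the callee-saved NEON registers.
@ The simple filter only touches P1..Q1, so dst rewinds 2 rows; the
@ full and inner filters need P3..Q3, hence 4: the lsl #1+!\simple.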
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
        sub             r0,  r0,  r1,  lsl #1

        vst1.8          {q1},     [r0,:128], r1 @ P2
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
        vst1.8          {q6},     [r0,:128]     @ Q2

vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
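@ Chroma pair: U (r0) and V (r1) are filtered together, U rows in the
@ even d registers and V rows in the odd ones, so a single pass of
@ vp8_loop_filter covers both 8-pixel-wide planes.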
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        ldr             r12, [sp, #64]          @ hev_thresh
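@ Vertical-edge variant: r0 enters the loads pointing 4 pixels before
@ the edge; 16 rows of 8 pixels are gathered, transposed so each
@ edge-relative column occupies one register, filtered, transposed back
@ and stored.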
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2                 @ flim_E
        vdup.8          q15, r3                 @ flim_I

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ back up 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1

vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        ldr             r12, [sp, #64]          @ flim_I

        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ back up u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ back up v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2

vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

/* 4/6-tap 8th-pel MC */
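@ mx/my select one of seven eighth-pel positions; the filter row is
@ fetched as subpel_filters + (mx-1)*16 (position 0 is plain put_pixels
@ above).  With F[] = that row, each 6-tap output is
@   dst[x] = clip_uint8((F[0]*src[x-2] - F[1]*src[x-1] + F[2]*src[x]
@                      + F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3]
@                      + 64) >> 7)
@ the table stores tap magnitudes, taps 1 and 4 entering via vmls.  The
@ 4-tap variants apply only taps 1-4 (rows with F[0] == F[5] == 0).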
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vext.8          d28, \a,  \b,  #2
        vext.8          d29, \a,  \b,  #3
        vext.8          d30, \a,  \b,  #4
        vext.8          d31, \a,  \b,  #5

        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
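@ Taps 0-2 accumulate in q10 and taps 3-5 in q11; each half stays in
@ 16-bit range (see the note above subpel_filters) and the halves are
@ merged with a saturating add before the rounding narrow by #7.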
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vext.8          q3,  \q0, \q1, #2
        vext.8          q8,  \q0, \q1, #1
        vext.8          q2,  \q0, \q1, #5

        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7

.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4,  s5,  s6
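@ Vertical 6-tap producing two rows per call from seven source rows:
@   \d0 = filter6(\s0..\s5),  \d1 = filter6(\s1..\s6)
@ with filter6() the same kernel as the horizontal macros above.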
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vext.8          d29, \a,  \b,  #2
        vext.8          d30, \a,  \b,  #3

        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1

function ff_put_vp8_epel16_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d4},  [r2], r3
        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
        vst1.8          {d2-d3},  [r0,:128], r1

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
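@ Two-pass 16-wide 6-tap MC: the horizontal pass filters h+5 rows into
@ a temporary buffer on the stack (addressed through lr), and the
@ vertical pass filters that buffer into dst; src was rewound two rows
@ (and two columns) so the temporary covers the top vertical taps.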
        @ first pass (horizontal):
        ldr             r4,  [sp, #64+8+4]      @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #64+8+0]      @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3,d4}, [r2], r3
        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
        vst1.8          {d2-d3},  [lr,:128]!

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+64+8+8]  @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+64+8+0]  @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d10-d13},[lr,:128]!
        vld1.8          {d14-d15},[lr,:128]

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel8_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.8          {d2},     [r0,:64], r1

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel8_v4_neon, export=1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel8_h4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.8          {d2},     [r0,:64], r1

function ff_put_vp8_epel8_h4v4_neon, export=1
        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel8_h6v4_neon, export=1
        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2,d3},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.8          {d2},     [lr,:64]!

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
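@ 4-pixel columns ride two rows per d register: the first group of lane
@ loads fills rows N..N+6, src rewinds four rows, and the second group
@ fills lane 1 with rows N+2..N+8, so one vp8_epel8_v6_y2 call yields
@ four output rows spread across the lanes of d2/d3.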
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

function ff_put_vp8_epel4_h6_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]

        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

function ff_put_vp8_epel4_h6v4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

function ff_put_vp8_epel4_h4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1

function ff_put_vp8_epel4_v4_neon, export=1
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

function ff_put_vp8_epel4_h4v4_neon, export=1
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]

        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]

        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short      0,   6, 123,  12,   1,   0,   0,   0
        .short      2,  11, 108,  36,   8,   1,   0,   0
        .short      0,   9,  93,  50,   6,   0,   0,   0
        .short      3,  16,  77,  77,  16,   3,   0,   0
        .short      0,   6,  50,  93,   9,   0,   0,   0
        .short      1,   8,  36, 108,  11,   2,   0,   0
        .short      0,   1,  12, 123,   6,   0,   0,   0

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
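@ 3-bit bilinear MC with mx in d0 and 8-mx in d1: each output pixel is
@   dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ via vmull/vmlal into 16 bits and a rounding narrow by #3.  The _v
@ variants blend vertically adjacent rows with my the same way; the _hv
@ variants blend horizontally, round to 8 bits, then blend vertically.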
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my

        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        ldr             r12, [sp, #8]           @ my
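@ Intermediate rows from the horizontal blend are rounded back to 8 bits
@ before the vertical blend (my in d2, 8-my in d3 here), matching the
@ two-pass rounding of the scalar bilinear code.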
        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3

        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx

        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my

        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        ldr             r12, [sp, #8]           @ my

        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d22, q9,  #3

        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx

        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        ldr             r12, [sp, #8]           @ my

        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vrshrn.u16      d22, q9,  #3

        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vmlal.u8        q10, d22, d3
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1