2 * VP8 NEON optimisations
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "libavutil/arm/asm.S"
@ Inverse Walsh-Hadamard transform of the 4x4 luma DC coefficients.
@ r0 = output (stored as 16-bit lanes, stride r3), r1 = input coeff block.
@ NOTE(review): this view is truncated — the WHT butterfly arithmetic between
@ the load and the stores is not visible here; comments cover only shown lines.
27 function ff_vp8_luma_dc_wht_neon, export=1
28 vld1.16 {q0-q1}, [r1,:128]              @ load all 16 input coefficients
33 vst1.16 {q15}, [r1,:128]!               @ clear input block (q15 presumably
36 vst1.16 {q15}, [r1,:128]                @ zeroed in lines not visible — TODO confirm)
@ scatter one 16-bit result per destination block, advancing by r3 each time
60 vst1.16 {d0[0]}, [r0,:16], r3
61 vst1.16 {d1[0]}, [r0,:16], r3
62 vst1.16 {d2[0]}, [r0,:16], r3
63 vst1.16 {d3[0]}, [r0,:16], r3
64 vst1.16 {d0[1]}, [r0,:16], r3
65 vst1.16 {d1[1]}, [r0,:16], r3
66 vst1.16 {d2[1]}, [r0,:16], r3
67 vst1.16 {d3[1]}, [r0,:16], r3
68 vst1.16 {d0[2]}, [r0,:16], r3
69 vst1.16 {d1[2]}, [r0,:16], r3
70 vst1.16 {d2[2]}, [r0,:16], r3
71 vst1.16 {d3[2]}, [r0,:16], r3
72 vst1.16 {d0[3]}, [r0,:16], r3
73 vst1.16 {d1[3]}, [r0,:16], r3
74 vst1.16 {d2[3]}, [r0,:16], r3
75 vst1.16 {d3[3]}, [r0,:16], r3
@ 4x4 inverse DCT and add to destination.
@ r0 = dst (advanced by stride r2), r1 = 16-bit coefficient block (cleared).
@ Uses fixed-point multipliers held in d4 (loaded in lines not visible —
@ TODO confirm; d4[0]/d4[1] are the two VP8 idct constants).
80 function ff_vp8_idct_add_neon, export=1
81 vld1.16 {q0-q1}, [r1,:128]              @ load 16 coefficients
@ first (vertical) pass: multiply odd rows by the two transform constants
86 vmull.s16 q12, d1, d4[0]
87 vmull.s16 q13, d3, d4[0]
88 vqdmulh.s16 d20, d1, d4[1]
89 vqdmulh.s16 d23, d3, d4[1]
90 vshrn.s32 d21, q12, #16                 @ narrow 32-bit products back to 16
91 vshrn.s32 d22, q13, #16
97 vadd.s16 d18, d21, d23                  @ butterfly combine
98 vsub.s16 d19, d20, d22
@ second (horizontal) pass; coefficient block cleared in parallel
108 vmull.s16 q12, d1, d4[0]
109 vst1.16 {q15}, [r1,:128]!              @ zero input block (q15 presumably 0
110 vmull.s16 q13, d2, d4[0]               @ — set in lines not visible)
111 vst1.16 {q15}, [r1,:128]
112 vqdmulh.s16 d21, d1, d4[1]
113 vqdmulh.s16 d23, d2, d4[1]
114 vshrn.s32 d20, q12, #16
115 vshrn.s32 d22, q13, #16
116 vadd.i16 d20, d20, d1                  @ (x*c)>>16 + x == x*(c+65536)>>16
117 vadd.i16 d22, d22, d2
121 vadd.i16 d18, d20, d23
122 vld1.32 {d20[]}, [r0,:32], r2          @ load 4 dst rows, interleaved with
123 vsub.i16 d19, d21, d22                 @ the remaining butterfly adds
124 vld1.32 {d22[]}, [r0,:32], r2
126 vld1.32 {d23[]}, [r0,:32], r2
128 vld1.32 {d21[]}, [r0,:32], r2
133 sub r0, r0, r2, lsl #2                 @ rewind dst to the first row
@ store the 4 reconstructed rows (row order matches the load order above)
145 vst1.32 {d0[0]}, [r0,:32], r2
146 vst1.32 {d0[1]}, [r0,:32], r2
147 vst1.32 {d1[1]}, [r0,:32], r2
148 vst1.32 {d1[0]}, [r0,:32], r2
@ DC-only 4x4 inverse transform and add: add a single rounded DC value to a
@ 4x4 block of dst. r0 = dst, r2 = stride (DC setup lines not visible here).
153 function ff_vp8_idct_dc_add_neon, export=1
159 vld1.32 {d0[]}, [r0,:32], r2           @ load 4 rows of dst, 4 bytes each
160 vld1.32 {d1[]}, [r0,:32], r2
161 vld1.32 {d0[1]}, [r0,:32], r2
162 vld1.32 {d1[1]}, [r0,:32], r2
165 sub r0, r0, r2, lsl #2                 @ rewind dst to the first row
@ write back the rows after the DC add (done in lines not visible)
168 vst1.32 {d0[0]}, [r0,:32], r2
169 vst1.32 {d1[0]}, [r0,:32], r2
170 vst1.32 {d0[1]}, [r0,:32], r2
171 vst1.32 {d1[1]}, [r0,:32], r2
@ DC-only inverse transform for four 4x4 chroma blocks (8x8 region).
@ r0 = dst (stride r2), r1 = coeff blocks; DC of each block is read and the
@ coefficient slot cleared in the same pass. r3 is reused as store pointer
@ (set up in lines not visible — TODO confirm).
175 function ff_vp8_idct_dc_add4uv_neon, export=1
@ gather the 4 DC values into q8 and zero them in the coeff buffer
@ (d0 presumably holds zero, set in lines not visible)
178 vld1.16 {d16[]}, [r1,:16]
179 vst1.16 {d0[0]}, [r1,:16], r3
180 vld1.16 {d17[]}, [r1,:16]
181 vst1.16 {d0[0]}, [r1,:16], r3
182 vld1.16 {d18[]}, [r1,:16]
183 vst1.16 {d0[0]}, [r1,:16], r3
184 vld1.16 {d19[]}, [r1,:16]
185 vst1.16 {d0[0]}, [r1,:16], r3
187 vrshr.s16 q8, q8, #3 @ dc >>= 3        @ rounded DC, as in the scalar idct
@ load 8 rows of the 8-pixel-wide destination
188 vld1.8 {d0}, [r0,:64], r2
190 vld1.8 {d1}, [r0,:64], r2
192 vld1.8 {d2}, [r0,:64], r2
194 vld1.8 {d3}, [r0,:64], r2
196 vld1.8 {d4}, [r0,:64], r2
198 vld1.8 {d5}, [r0,:64], r2
200 vld1.8 {d6}, [r0,:64], r2
202 vld1.8 {d7}, [r0,:64], r2
@ store the DC-adjusted rows (adds interleaved in lines not visible)
208 vst1.8 {d20}, [r3,:64], r2
210 vst1.8 {d21}, [r3,:64], r2
212 vst1.8 {d22}, [r3,:64], r2
214 vst1.8 {d23}, [r3,:64], r2
216 vst1.8 {d24}, [r3,:64], r2
218 vst1.8 {d25}, [r3,:64], r2
219 vst1.8 {d26}, [r3,:64], r2
220 vst1.8 {d27}, [r3,:64], r2
@ DC-only inverse transform for four horizontally adjacent 4x4 luma blocks
@ (16x4 region). Same DC gather/clear scheme as the 4uv variant above.
225 function ff_vp8_idct_dc_add4y_neon, export=1
@ gather 4 DCs into q8 and clear them (d0 presumably zero — set in lines
@ not visible)
228 vld1.16 {d16[]}, [r1,:16]
229 vst1.16 {d0[0]}, [r1,:16], r3
230 vld1.16 {d17[]}, [r1,:16]
231 vst1.16 {d0[0]}, [r1,:16], r3
232 vld1.16 {d18[]}, [r1,:16]
233 vst1.16 {d0[0]}, [r1,:16], r3
234 vld1.16 {d19[]}, [r1,:16]
235 vst1.16 {d0[0]}, [r1,:16], r3
236 vrshr.s16 q8, q8, #3 @ dc >>= 3
@ load 4 rows of 16 pixels
237 vld1.8 {q0}, [r0,:128], r2
239 vld1.8 {q1}, [r0,:128], r2
241 vld1.8 {q2}, [r0,:128], r2
243 vld1.8 {q3}, [r0,:128], r2
250 sub r0, r0, r2, lsl #2                 @ rewind dst to the first row
@ store adjusted rows (DC adds performed in lines not visible)
256 vst1.8 {q10}, [r0,:128], r2
258 vst1.8 {q11}, [r0,:128], r2
260 vst1.8 {q12}, [r0,:128], r2
262 vst1.8 {q13}, [r0,:128], r2
@ Core VP8 loop filter, shared by all H/V, 16/8uv, normal/inner/simple
@ variants. Register contract on entry:
@   q0..q7  = P3 P2 P1 P0 Q0 Q1 Q2 Q3 (two 8-pixel rows per q register)
@   q14     = flim_E, q15 = flim_I (simple: only flim_E), r12 = hev_thresh
@ On exit q2..q5 (and q1/q6 for the full filter) hold the filtered pixels.
@ Mirrors the reference filter in RFC 6386 §15.2/15.3.
273 .macro vp8_loop_filter, inner=0, simple=0
@ simple-filter edge test: |P0-Q0|*2 + |P1-Q1|/2 <= flim
275 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
276 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
277 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
278 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
279 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
281 vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
283 @ calculate hev and normal_limit:
284 vabd.u8 q12, q2, q3 @ abs(P1-P0)
285 vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
286 vabd.u8 q10, q0, q1 @ abs(P3-P2)
287 vabd.u8 q11, q1, q2 @ abs(P2-P1)
288 vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
289 vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
290 vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
291 vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
293 vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
295 vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
297 vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
298 vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
299 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
300 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
302 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
304 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
305 vdup.8 q15, r12 @ hev_thresh
306 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
307 vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
308 vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
309 vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
319 @ convert to signed value:
@ pixels are biased by 0x80 so unsigned bytes become signed for filtering
320 veor q3, q3, q13 @ PS0 = P0 ^ 0x80
321 veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
324 vsubl.s8 q10, d8, d6 @ QS0 - PS0
325 vsubl.s8 q11, d9, d7 @ (widened to 16 bits)
326 veor q2, q2, q13 @ PS1 = P1 ^ 0x80
327 veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
328 vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
329 vmul.i16 q11, q11, q12
331 vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
335 vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
337 vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
338 vaddw.s8 q11, q11, d25
339 vqmovn.s16 d20, q10 @ narrow result back into q10
341 .if !\inner && !\simple
342 veor q1, q1, q13 @ PS2 = P2 ^ 0x80
343 veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
345 vand q10, q10, q8 @ w &= normal_limit
347 @ registers used at this point..
348 @ q0 -> P3 (don't corrupt)
350 @ q7 -> Q3 (don't corrupt)
356 @ q8, q11, q12 -> unused
358 @ filter_common: is4tap==1
359 @ c1 = clamp(w + 4) >> 3;
360 @ c2 = clamp(w + 3) >> 3;
361 @ Q0 = s2u(QS0 - c1);
362 @ P0 = s2u(PS0 + c2);
@ simple-filter path: only P0/Q0 are modified
365 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
366 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
367 vshr.s8 q11, q11, #3 @ c1 >>= 3
368 vshr.s8 q12, q12, #3 @ c2 >>= 3
369 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
370 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
371 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
372 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
373 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
374 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
376 @ the !is4tap case of filter_common, only used for inner blocks
377 @ c3 = ((c1&~hev) + 1) >> 1;
378 @ Q1 = s2u(QS1 - c3);
379 @ P1 = s2u(PS1 + c3);
380 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
381 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
382 vshr.s8 q11, q11, #3 @ c1 >>= 3
383 vshr.s8 q12, q12, #3 @ c2 >>= 3
384 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
385 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
386 vbic q11, q11, q9 @ c1 & ~hev
387 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
388 vrshr.s8 q11, q11, #1 @ c3 >>= 1
389 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
390 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
391 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
392 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
393 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
@ full (mbedge) filter: hev pixels get the 4-tap treatment first
395 vand q12, q10, q9 @ w & hev
396 vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
397 vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
398 vshr.s8 q11, q11, #3 @ c1 >>= 3
399 vshr.s8 q12, q12, #3 @ c2 >>= 3
400 vbic q10, q10, q9 @ w &= ~hev
401 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
402 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
@ non-hev pixels: three graduated corrections applied to P2/P1/P0 Q0/Q1/Q2
405 @ a = clamp((27*w + 63) >> 7);
408 @ a = clamp((18*w + 63) >> 7);
411 @ a = clamp((9*w + 63) >> 7);
415 vshll.s8 q14, d20, #3                   @ 8*w
416 vshll.s8 q15, d21, #3
417 vaddw.s8 q14, q14, d20                  @ 9*w
418 vaddw.s8 q15, q15, d21
420 vadd.s16 q9, q9, q15 @ 9*w + 63
421 vadd.s16 q11, q8, q14
422 vadd.s16 q12, q9, q15 @ 18*w + 63
423 vadd.s16 q14, q11, q14
424 vadd.s16 q15, q12, q15 @ 27*w + 63
425 vqshrn.s16 d16, q8, #7
426 vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
427 vqshrn.s16 d22, q11, #7
428 vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
429 vqshrn.s16 d28, q14, #7
430 vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
431 vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
432 vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
433 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
434 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
435 vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
436 vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
@ undo the 0x80 bias on every modified plane
437 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
438 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
439 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
440 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
441 veor q1, q1, q13 @ P2 = PS2 ^ 0x80
442 veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
@ Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
@ r0 = dst (pointing at the edge), r1 = stride, r2 = flim_E, r3 = flim_I,
@ [sp,#64] = hev_thresh. Instantiated below for normal/inner/simple.
446 .macro vp8_v_loop_filter16 name, inner=0, simple=0
447 function ff_vp8_v_loop_filter16\name\()_neon, export=1
449 sub r0, r0, r1, lsl #1+!\simple         @ back up 2 (simple) or 4 rows
453 ldr r12, [sp, #64] @ hev_thresh
454 vld1.8 {q0}, [r0,:128], r1 @ P3
455 vld1.8 {q1}, [r0,:128], r1 @ P2
457 vld1.8 {q2}, [r0,:128], r1 @ P1
458 vld1.8 {q3}, [r0,:128], r1 @ P0
459 vld1.8 {q4}, [r0,:128], r1 @ Q0
460 vld1.8 {q5}, [r0,:128], r1 @ Q1
462 vld1.8 {q6}, [r0,:128], r1 @ Q2
463 vld1.8 {q7}, [r0,:128] @ Q3
464 vdup.8 q15, r3 @ flim_I
466 vdup.8 q14, r2 @ flim_E
468 vp8_loop_filter inner=\inner, simple=\simple
470 @ back up to P2: dst -= stride * 6
471 sub r0, r0, r1, lsl #2
473 sub r0, r0, r1, lsl #1
@ write back only the planes the filter may have changed
476 vst1.8 {q1}, [r0,:128], r1 @ P2
478 vst1.8 {q2}, [r0,:128], r1 @ P1
479 vst1.8 {q3}, [r0,:128], r1 @ P0
480 vst1.8 {q4}, [r0,:128], r1 @ Q0
481 vst1.8 {q5}, [r0,:128], r1 @ Q1
483 vst1.8 {q6}, [r0,:128] @ Q2
@ instantiate the inner and simple variants (plain variant presumably
@ instantiated in a line not visible here)
492 vp8_v_loop_filter16 _inner, inner=1
493 vp8_v_loop_filter16 _simple, simple=1
@ Vertical loop filter for the two 8-pixel-wide chroma planes at once:
@ u rows go in the low d register of each q, v rows in the high one.
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E, [sp,#64] = flim_I,
@ [sp,#68] = hev_thresh.
495 .macro vp8_v_loop_filter8uv name, inner=0
496 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
498 sub r0, r0, r2, lsl #2                  @ back up 4 rows on both planes
499 sub r1, r1, r2, lsl #2
500 ldr r12, [sp, #64] @ flim_I
@ interleave u (even d regs) and v (odd d regs) loads
503 vld1.8 {d0}, [r0,:64], r2 @ P3
504 vld1.8 {d1}, [r1,:64], r2 @ P3
505 vld1.8 {d2}, [r0,:64], r2 @ P2
506 vld1.8 {d3}, [r1,:64], r2 @ P2
507 vld1.8 {d4}, [r0,:64], r2 @ P1
508 vld1.8 {d5}, [r1,:64], r2 @ P1
509 vld1.8 {d6}, [r0,:64], r2 @ P0
510 vld1.8 {d7}, [r1,:64], r2 @ P0
511 vld1.8 {d8}, [r0,:64], r2 @ Q0
512 vld1.8 {d9}, [r1,:64], r2 @ Q0
513 vld1.8 {d10}, [r0,:64], r2 @ Q1
514 vld1.8 {d11}, [r1,:64], r2 @ Q1
515 vld1.8 {d12}, [r0,:64], r2 @ Q2
516 vld1.8 {d13}, [r1,:64], r2 @ Q2
517 vld1.8 {d14}, [r0,:64] @ Q3
518 vld1.8 {d15}, [r1,:64] @ Q3
520 vdup.8 q14, r3 @ flim_E
521 vdup.8 q15, r12 @ flim_I
522 ldr r12, [sp, #68] @ hev_thresh
524 vp8_loop_filter inner=\inner
526 @ back up to P2: u,v -= stride * 6
527 sub r0, r0, r2, lsl #2
528 sub r1, r1, r2, lsl #2
529 sub r0, r0, r2, lsl #1
530 sub r1, r1, r2, lsl #1
@ write back modified planes to both u and v
533 vst1.8 {d2}, [r0,:64], r2 @ P2
534 vst1.8 {d3}, [r1,:64], r2 @ P2
535 vst1.8 {d4}, [r0,:64], r2 @ P1
536 vst1.8 {d5}, [r1,:64], r2 @ P1
537 vst1.8 {d6}, [r0,:64], r2 @ P0
538 vst1.8 {d7}, [r1,:64], r2 @ P0
539 vst1.8 {d8}, [r0,:64], r2 @ Q0
540 vst1.8 {d9}, [r1,:64], r2 @ Q0
541 vst1.8 {d10}, [r0,:64], r2 @ Q1
542 vst1.8 {d11}, [r1,:64], r2 @ Q1
543 vst1.8 {d12}, [r0,:64] @ Q2
544 vst1.8 {d13}, [r1,:64] @ Q2
@ inner variant instantiation (plain variant presumably in a line not visible)
552 vp8_v_loop_filter8uv _inner, inner=1
@ Horizontal (vertical-edge) loop filter over 16 rows: load 16x8 pixels,
@ transpose so columns become the P3..Q3 planes, filter, transpose back and
@ store. r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I,
@ [sp,#64] = hev_thresh.
554 .macro vp8_h_loop_filter16 name, inner=0, simple=0
555 function ff_vp8_h_loop_filter16\name\()_neon, export=1
559 ldr r12, [sp, #64] @ hev_thresh
563 vld1.8 {d0}, [r0], r1 @ load first 8-line src data
564 vld1.8 {d2}, [r0], r1
565 vld1.8 {d4}, [r0], r1
566 vld1.8 {d6}, [r0], r1
567 vld1.8 {d8}, [r0], r1
568 vld1.8 {d10}, [r0], r1
569 vld1.8 {d12}, [r0], r1
570 vld1.8 {d14}, [r0], r1
571 vld1.8 {d1}, [r0], r1 @ load second 8-line src data
572 vld1.8 {d3}, [r0], r1
573 vld1.8 {d5}, [r0], r1
574 vld1.8 {d7}, [r0], r1
575 vld1.8 {d9}, [r0], r1
576 vld1.8 {d11}, [r0], r1
577 vld1.8 {d13}, [r0], r1
578 vld1.8 {d15}, [r0], r1
580 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
582 vdup.8 q14, r2 @ flim_E
584 vdup.8 q15, r3 @ flim_I
587 vp8_loop_filter inner=\inner, simple=\simple
589 sub r0, r0, r1, lsl #4 @ backup 16 rows
591 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
@ store rows back in the same order they were loaded
594 vst1.8 {d0}, [r0], r1
595 vst1.8 {d2}, [r0], r1
596 vst1.8 {d4}, [r0], r1
597 vst1.8 {d6}, [r0], r1
598 vst1.8 {d8}, [r0], r1
599 vst1.8 {d10}, [r0], r1
600 vst1.8 {d12}, [r0], r1
601 vst1.8 {d14}, [r0], r1
602 vst1.8 {d1}, [r0], r1
603 vst1.8 {d3}, [r0], r1
604 vst1.8 {d5}, [r0], r1
605 vst1.8 {d7}, [r0], r1
606 vst1.8 {d9}, [r0], r1
607 vst1.8 {d11}, [r0], r1
608 vst1.8 {d13}, [r0], r1
@ variant instantiations
617 vp8_h_loop_filter16 _inner, inner=1
618 vp8_h_loop_filter16 _simple, simple=1
@ Horizontal loop filter for u and v planes together: 8 rows of u plus
@ 8 rows of v are transposed into the P3..Q3 planes, filtered, and
@ transposed back. r0 = u, r1 = v, r2 = stride, r3 = flim_E,
@ [sp,#64] = flim_I, [sp,#68] = hev_thresh.
620 .macro vp8_h_loop_filter8uv name, inner=0
621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
625 ldr r12, [sp, #64] @ flim_I
@ u rows fill the even d registers, v rows the odd ones
628 vld1.8 {d0}, [r0], r2 @ load u
629 vld1.8 {d1}, [r1], r2 @ load v
630 vld1.8 {d2}, [r0], r2
631 vld1.8 {d3}, [r1], r2
632 vld1.8 {d4}, [r0], r2
633 vld1.8 {d5}, [r1], r2
634 vld1.8 {d6}, [r0], r2
635 vld1.8 {d7}, [r1], r2
636 vld1.8 {d8}, [r0], r2
637 vld1.8 {d9}, [r1], r2
638 vld1.8 {d10}, [r0], r2
639 vld1.8 {d11}, [r1], r2
640 vld1.8 {d12}, [r0], r2
641 vld1.8 {d13}, [r1], r2
642 vld1.8 {d14}, [r0], r2
643 vld1.8 {d15}, [r1], r2
645 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
647 vdup.8 q14, r3 @ flim_E
648 vdup.8 q15, r12 @ flim_I
649 ldr r12, [sp, #68] @ hev_thresh
651 vp8_loop_filter inner=\inner
653 sub r0, r0, r2, lsl #3 @ backup u 8 rows
654 sub r1, r1, r2, lsl #3 @ backup v 8 rows
656 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
@ store filtered rows back to both planes
659 vst1.8 {d0}, [r0], r2
660 vst1.8 {d1}, [r1], r2
661 vst1.8 {d2}, [r0], r2
662 vst1.8 {d3}, [r1], r2
663 vst1.8 {d4}, [r0], r2
664 vst1.8 {d5}, [r1], r2
665 vst1.8 {d6}, [r0], r2
666 vst1.8 {d7}, [r1], r2
667 vst1.8 {d8}, [r0], r2
668 vst1.8 {d9}, [r1], r2
669 vst1.8 {d10}, [r0], r2
670 vst1.8 {d11}, [r1], r2
671 vst1.8 {d12}, [r0], r2
672 vst1.8 {d13}, [r1], r2
@ inner variant instantiation
682 vp8_h_loop_filter8uv _inner, inner=1
@ Plain 16xh pixel copy: r0 = dst (stride r1), r2 = src (stride r3),
@ [sp] = h. Copies 4 rows per iteration (loop branch not visible here).
684 function ff_put_vp8_pixels16_neon, export=1
685 ldr r12, [sp, #0] @ h
688 vld1.8 {q0}, [r2], r3
689 vld1.8 {q1}, [r2], r3
690 vld1.8 {q2}, [r2], r3
691 vld1.8 {q3}, [r2], r3
692 vst1.8 {q0}, [r0,:128], r1
693 vst1.8 {q1}, [r0,:128], r1
694 vst1.8 {q2}, [r0,:128], r1
695 vst1.8 {q3}, [r0,:128], r1
@ Plain 8xh pixel copy: r0 = dst (stride r1), r2 = src (stride r3),
@ [sp] = h. Copies 4 rows per iteration (loop branch not visible here).
700 function ff_put_vp8_pixels8_neon, export=1
701 ldr r12, [sp, #0] @ h
704 vld1.8 {d0}, [r2], r3
705 vld1.8 {d1}, [r2], r3
706 vld1.8 {d2}, [r2], r3
707 vld1.8 {d3}, [r2], r3
708 vst1.8 {d0}, [r0,:64], r1
709 vst1.8 {d1}, [r0,:64], r1
710 vst1.8 {d2}, [r0,:64], r1
711 vst1.8 {d3}, [r0,:64], r1
716 /* 4/6-tap 8th-pel MC */
@ 6-tap horizontal filter on one 8-pixel row. \a/\b hold 13 consecutive
@ source bytes; taps come from d0/d1 (loaded from subpel_filters by the
@ caller). Result is rounded, clamped to u8 and written to \d.
@ Clobbers d27-d31, q8-q13 (widening done in lines not visible).
718 .macro vp8_epel8_h6 d, a, b
719 vext.8 d27, \a, \b, #1                  @ src+1 .. src+5 sliding windows
721 vext.8 d28, \a, \b, #2
723 vext.8 d29, \a, \b, #3
725 vext.8 d30, \a, \b, #4
727 vext.8 d31, \a, \b, #5
@ accumulate positive taps, subtract negative taps (filter is 0..5)
729 vmul.u16 q10, q10, d0[2]
731 vmul.u16 q11, q11, d0[3]
732 vmls.u16 q10, q9, d0[1]
733 vmls.u16 q11, q12, d1[0]
734 vmla.u16 q10, q8, d0[0]
735 vmla.u16 q11, q13, d1[1]
736 vqadd.s16 q11, q10, q11                 @ saturating combine of both halves
737 vqrshrun.s16 \d, q11, #7                @ (sum + 64) >> 7, clamp to u8
@ 6-tap horizontal filter on one 16-pixel row (two 8-pixel halves done in
@ parallel). \q0/\q1 hold the source bytes; taps in d0/d1. Results go to
@ \d0 and \d1. Clobbers q2-q3, q8-q15.
740 .macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
741 vext.8 q14, \q0, \q1, #3                @ sliding source windows
742 vext.8 q15, \q0, \q1, #4
745 vext.8 q3, \q0, \q1, #2
748 vext.8 q8, \q0, \q1, #1
751 vext.8 q2, \q0, \q1, #5
@ apply the six taps to both halves (widening done in lines not visible)
756 vmul.u16 q11, q11, d0[3]
757 vmul.u16 q10, q10, d0[2]
758 vmul.u16 q3, q3, d0[2]
759 vmul.u16 q14, q14, d0[3]
760 vmls.u16 q11, q12, d1[0]
763 vmls.u16 q10, q9, d0[1]
764 vmls.u16 q3, q8, d0[1]
765 vmls.u16 q14, q15, d1[0]
766 vmla.u16 q10, q12, d0[0]
767 vmla.u16 q11, q13, d1[1]
768 vmla.u16 q3, q1, d0[0]
769 vmla.u16 q14, q2, d1[1]
770 vqadd.s16 q11, q10, q11                 @ combine partial sums per half
771 vqadd.s16 q14, q3, q14
772 vqrshrun.s16 \d0, q11, #7               @ round, shift and clamp to u8
773 vqrshrun.s16 \d1, q14, #7
@ 6-tap vertical filter producing one 8-pixel output row from six input
@ rows \s0..\s5. Taps in d0/d1; result in \d0. Clobbers q8-q13
@ (widening of the source rows happens in lines not visible).
776 .macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
783 vmul.u16 q10, q10, d0[2]
784 vmul.u16 q11, q11, d0[3]
785 vmls.u16 q10, q9, d0[1]
786 vmls.u16 q11, q12, d1[0]
787 vmla.u16 q10, q8, d0[0]
788 vmla.u16 q11, q13, d1[1]
789 vqadd.s16 q11, q10, q11                 @ combine partial sums
790 vqrshrun.s16 \d0, q11, #7               @ (sum + 64) >> 7, clamp to u8
@ 6-tap vertical filter producing TWO 8-pixel output rows (\d0, \d1) from
@ seven input rows \s0..\s6; the two outputs share five source rows.
@ Taps in d0/d1. Clobbers q8-q15 (widening in lines not visible).
793 .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
801 vmul.u16 q10, q10, d0[0]
802 vmul.u16 q15, q11, d0[3]
803 vmul.u16 q11, q11, d0[2]
804 vmul.u16 q14, q14, d1[1]
805 vmls.u16 q10, q9, d0[1]
806 vmls.u16 q15, q12, d1[0]
807 vmls.u16 q11, q8, d0[1]
808 vmls.u16 q14, q13, d1[0]
809 vmla.u16 q10, q8, d0[2]
810 vmla.u16 q15, q13, d1[1]
811 vmla.u16 q11, q9, d0[0]
812 vmla.u16 q14, q12, d0[3]
813 vqadd.s16 q15, q10, q15                 @ combine partial sums, row 0
814 vqadd.s16 q14, q11, q14                 @ combine partial sums, row 1
815 vqrshrun.s16 \d0, q15, #7               @ round/clamp both rows to u8
816 vqrshrun.s16 \d1, q14, #7
@ 4-tap horizontal filter on one 8-pixel row. \a/\b hold the source bytes;
@ taps d0[1]..d1[0]. Result in \d. Clobbers d28-d30, q9-q12
@ (widening in lines not visible).
819 .macro vp8_epel8_h4 d, a, b
820 vext.8 d28, \a, \b, #1                  @ sliding source windows
822 vext.8 d29, \a, \b, #2
824 vext.8 d30, \a, \b, #3
827 vmul.u16 q10, q10, d0[2]
828 vmul.u16 q11, q11, d0[3]
829 vmls.u16 q10, q9, d0[1]
830 vmls.u16 q11, q12, d1[0]
831 vqadd.s16 q11, q10, q11                 @ combine partial sums
832 vqrshrun.s16 \d, q11, #7                @ (sum + 64) >> 7, clamp to u8
@ 4-tap vertical filter producing TWO 8-pixel output rows (\d0, \d1) from
@ five input rows \s0..\s4. Taps d0[1]..d1[0]. Clobbers q8-q15
@ (widening in lines not visible).
835 .macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
841 vmul.u16 q8, q10, d0[2]
842 vmul.u16 q14, q11, d0[3]
843 vmul.u16 q11, q11, d0[2]
844 vmul.u16 q15, q12, d0[3]
845 vmls.u16 q8, q9, d0[1]
846 vmls.u16 q14, q12, d1[0]
847 vmls.u16 q11, q10, d0[1]
848 vmls.u16 q15, q13, d1[0]
849 vqadd.s16 q8, q8, q14                   @ combine partial sums, row 0
850 vqadd.s16 q11, q11, q15                 @ combine partial sums, row 1
851 vqrshrun.s16 \d0, q8, #7                @ round/clamp both rows to u8
852 vqrshrun.s16 \d1, q11, #7
@ 16-wide, 6-tap vertical subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#72] = h, [sp,#80] = my (filter index into
@ subpel_filters; each entry is 16 bytes, hence the lsl #4).
855 function ff_put_vp8_epel16_v6_neon, export=1
856 sub r2, r2, r3, lsl #1                  @ src -= 2*stride (filter history)
860 ldr r4, [sp, #80] @ my
861 movrel lr, subpel_filters-16
862 ldr r12, [sp, #72] @ h
863 add r4, lr, r4, lsl #4                  @ &subpel_filters[my-1]
864 vld1.16 {q0}, [r4,:128]                 @ load the 8 filter taps into q0
@ load 7 source rows; produces 2 output rows per iteration
866 vld1.8 {d2-d3}, [r2], r3
867 vld1.8 {d4-d5}, [r2], r3
868 vld1.8 {d6-d7}, [r2], r3
869 vld1.8 {d8-d9}, [r2], r3
870 vld1.8 {d10-d11},[r2], r3
871 vld1.8 {d12-d13},[r2], r3
872 vld1.8 {d14-d15},[r2]
873 sub r2, r2, r3, lsl #2                  @ rewind so rows are reused next pass
875 vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
876 vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
878 vst1.8 {d2-d3}, [r0,:128], r1
879 vst1.8 {d4-d5}, [r0,:128], r1
@ 16-wide, 6-tap horizontal subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#12] = mx.
887 function ff_put_vp8_epel16_h6_neon, export=1
891 ldr r4, [sp, #12] @ mx
892 movrel lr, subpel_filters-16
893 ldr r12, [sp, #8] @ h
894 add r4, lr, r4, lsl #4                  @ &subpel_filters[mx-1]
895 vld1.16 {q0}, [r4,:128]                 @ filter taps -> q0
897 vld1.8 {d2-d4}, [r2], r3                @ 24 source bytes (16 + filter edge)
899 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
901 vst1.8 {d2-d3}, [r0,:128], r1
@ 16-wide, 6-tap horizontal + 6-tap vertical subpel MC, two passes via a
@ stack temp buffer addressed through lr. Stack offsets account for the
@ temp allocation (done in lines not visible — TODO confirm sizes).
908 function ff_put_vp8_epel16_h6v6_neon, export=1
909 sub r2, r2, r3, lsl #1                  @ src -= 2*stride for vertical history
914 @ first pass (horizontal):
915 ldr r4, [sp, #28] @ mx
916 movrel lr, subpel_filters-16
917 ldr r12, [sp, #24] @ h
918 add r4, lr, r4, lsl #4
920 vld1.16 {q0}, [r4,:128]                 @ horizontal taps
925 vld1.8 {d2,d3,d4}, [r2], r3
927 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
929 vst1.8 {d2-d3}, [lr,:128]!              @ intermediate rows to temp buffer
933 @ second pass (vertical):
934 ldr r4, [sp, #336+16+32] @ my
935 movrel lr, subpel_filters-16
936 ldr r12, [sp, #336+16+24] @ h
937 add r4, lr, r4, lsl #4
939 vld1.16 {q0}, [r4,:128]                 @ vertical taps
942 vld1.8 {d2-d5}, [lr,:128]!              @ read back 6 intermediate rows
943 vld1.8 {d6-d9}, [lr,:128]!
944 vld1.8 {d28-d31},[lr,:128]
947 vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
948 vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
950 vst1.8 {d2-d3}, [r0,:128], r1
@ 8-wide, 6-tap vertical subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#16] = my.
959 function ff_put_vp8_epel8_v6_neon, export=1
960 sub r2, r2, r3, lsl #1                  @ src -= 2*stride (filter history)
963 ldr r4, [sp, #16] @ my
964 movrel lr, subpel_filters-16
965 ldr r12, [sp, #8] @ h
966 add r4, lr, r4, lsl #4
967 vld1.16 {q0}, [r4,:128]                 @ filter taps -> q0
969 vld1.8 {d2}, [r2], r3                   @ 6 history rows + 1 new row
970 vld1.8 {d3}, [r2], r3
971 vld1.8 {d4}, [r2], r3
972 vld1.8 {d5}, [r2], r3
973 vld1.8 {d6}, [r2], r3
974 vld1.8 {d7}, [r2], r3
977 sub r2, r2, r3, lsl #2                  @ rewind for the next 2-row pass
979 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
981 vst1.8 {d2}, [r0,:64], r1
982 vst1.8 {d3}, [r0,:64], r1
@ 8-wide, 6-tap horizontal subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#12] = mx.
989 function ff_put_vp8_epel8_h6_neon, export=1
993 ldr r4, [sp, #12] @ mx
994 movrel lr, subpel_filters-16
995 ldr r12, [sp, #8] @ h
996 add r4, lr, r4, lsl #4
997 vld1.16 {q0}, [r4,:128]                 @ filter taps -> q0
999 vld1.8 {d2,d3}, [r2], r3                @ 16 source bytes (8 + filter edge)
1001 vp8_epel8_h6 d2, d2, d3
1003 vst1.8 {d2}, [r0,:64], r1
@ 8-wide, 6-tap horizontal + 6-tap vertical subpel MC, two passes through
@ a stack temp buffer addressed via lr (allocation in lines not visible).
1010 function ff_put_vp8_epel8_h6v6_neon, export=1
1011 sub r2, r2, r3, lsl #1                 @ src -= 2*stride for vertical history
1015 @ first pass (horizontal):
1016 ldr r4, [sp, #12] @ mx
1017 movrel lr, subpel_filters-16
1018 ldr r12, [sp, #8] @ h
1019 add r4, lr, r4, lsl #4
1021 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1026 vld1.8 {d2,d3}, [r2], r3
1028 vp8_epel8_h6 d2, d2, d3
1030 vst1.8 {d2}, [lr,:64]!                 @ intermediate row to temp buffer
1034 @ second pass (vertical):
1035 ldr r4, [sp, #168+16+16] @ my
1036 movrel lr, subpel_filters-16
1037 ldr r12, [sp, #168+16+8] @ h
1038 add r4, lr, r4, lsl #4
1040 vld1.16 {q0}, [r4,:128]                @ vertical taps
1043 vld1.8 {d2-d5}, [lr,:128]!             @ read back 7 intermediate rows
1044 vld1.8 {d6-d7}, [lr,:128]!
1045 vld1.8 {d30}, [lr,:64]
1048 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1050 vst1.8 {d2}, [r0,:64], r1
1051 vst1.8 {d3}, [r0,:64], r1
@ 8-wide, 4-tap vertical subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#16] = my.
1059 function ff_put_vp8_epel8_v4_neon, export=1
1063 ldr r4, [sp, #16] @ my
1064 movrel lr, subpel_filters-16
1065 ldr r12, [sp, #8] @ h
1066 add r4, lr, r4, lsl #4
1067 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
1069 vld1.8 {d2}, [r2], r3                  @ 4 history rows + 1 new row
1070 vld1.8 {d3}, [r2], r3
1071 vld1.8 {d4}, [r2], r3
1072 vld1.8 {d5}, [r2], r3
1074 sub r2, r2, r3, lsl #1                 @ rewind for the next 2-row pass
1076 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1078 vst1.8 {d2}, [r0,:64], r1
1079 vst1.8 {d3}, [r0,:64], r1
@ 8-wide, 4-tap horizontal subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#12] = mx.
1086 function ff_put_vp8_epel8_h4_neon, export=1
1090 ldr r4, [sp, #12] @ mx
1091 movrel lr, subpel_filters-16
1092 ldr r12, [sp, #8] @ h
1093 add r4, lr, r4, lsl #4
1094 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
1096 vld1.8 {d2,d3}, [r2], r3               @ 16 source bytes (8 + filter edge)
1098 vp8_epel8_h4 d2, d2, d3
1100 vst1.8 {d2}, [r0,:64], r1
@ 8-wide, 4-tap horizontal + 4-tap vertical subpel MC, two passes via a
@ stack temp buffer addressed through lr (allocation in lines not visible).
1107 function ff_put_vp8_epel8_h4v4_neon, export=1
1112 @ first pass (horizontal):
1113 ldr r4, [sp, #12] @ mx
1114 movrel lr, subpel_filters-16
1115 ldr r12, [sp, #8] @ h
1116 add r4, lr, r4, lsl #4
1118 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1123 vld1.8 {d2,d3}, [r2], r3
1125 vp8_epel8_h4 d2, d2, d3
1127 vst1.8 {d2}, [lr,:64]!                 @ intermediate row to temp buffer
1131 @ second pass (vertical):
1132 ldr r4, [sp, #168+16+16] @ my
1133 movrel lr, subpel_filters-16
1134 ldr r12, [sp, #168+16+8] @ h
1135 add r4, lr, r4, lsl #4
1137 vld1.16 {q0}, [r4,:128]                @ vertical taps
1140 vld1.8 {d2-d5}, [lr,:128]!             @ read back 5 intermediate rows
1141 vld1.8 {d6}, [lr,:64]
1144 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1146 vst1.8 {d2}, [r0,:64], r1
1147 vst1.8 {d3}, [r0,:64], r1
@ 8-wide, 6-tap horizontal + 4-tap vertical subpel MC, two passes via a
@ stack temp buffer addressed through lr (allocation in lines not visible).
1155 function ff_put_vp8_epel8_h6v4_neon, export=1
1160 @ first pass (horizontal):
1161 ldr r4, [sp, #12] @ mx
1162 movrel lr, subpel_filters-16
1163 ldr r12, [sp, #8] @ h
1164 add r4, lr, r4, lsl #4
1166 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1171 vld1.8 {d2,d3}, [r2], r3
1173 vp8_epel8_h6 d2, d2, d3
1175 vst1.8 {d2}, [lr,:64]!                 @ intermediate row to temp buffer
1179 @ second pass (vertical):
1180 ldr r4, [sp, #168+16+16] @ my
1181 movrel lr, subpel_filters-16
1182 ldr r12, [sp, #168+16+8] @ h
1183 add r4, lr, r4, lsl #4
1185 vld1.16 {q0}, [r4,:128]                @ vertical taps
1188 vld1.8 {d2-d5}, [lr,:128]!             @ read back 5 intermediate rows
1189 vld1.8 {d6}, [lr,:64]
1192 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1194 vst1.8 {d2}, [r0,:64], r1
1195 vst1.8 {d3}, [r0,:64], r1
@ 8-wide, 4-tap horizontal + 6-tap vertical subpel MC, two passes via a
@ stack temp buffer addressed through lr (allocation in lines not visible).
1203 function ff_put_vp8_epel8_h4v6_neon, export=1
1204 sub r2, r2, r3, lsl #1                 @ src -= 2*stride for vertical history
1208 @ first pass (horizontal):
1209 ldr r4, [sp, #12] @ mx
1210 movrel lr, subpel_filters-16
1211 ldr r12, [sp, #8] @ h
1212 add r4, lr, r4, lsl #4
1214 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1219 vld1.8 {d2,d3}, [r2], r3
1221 vp8_epel8_h4 d2, d2, d3
1223 vst1.8 {d2}, [lr,:64]!                 @ intermediate row to temp buffer
1227 @ second pass (vertical):
1228 ldr r4, [sp, #168+16+16] @ my
1229 movrel lr, subpel_filters-16
1230 ldr r12, [sp, #168+16+8] @ h
1231 add r4, lr, r4, lsl #4
1233 vld1.16 {q0}, [r4,:128]                @ vertical taps
1236 vld1.8 {d2-d5}, [lr,:128]!             @ read back 7 intermediate rows
1237 vld1.8 {d6-d7}, [lr,:128]!
1238 vld1.8 {d30}, [lr,:64]
1241 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1243 vst1.8 {d2}, [r0,:64], r1
1244 vst1.8 {d3}, [r0,:64], r1
@ 4-wide, 6-tap vertical subpel MC. Two 4-pixel columns of rows are packed
@ per d register (lane 0 = first pass of rows, lane 1 = next pass) so the
@ 8-wide filter macro processes 2 output rows' worth at once.
@ r0 = dst (stride r1), r2 = src (stride r3), [sp,#8] = h, [sp,#16] = my.
1254 function ff_put_vp8_epel4_v6_neon, export=1
1255 sub r2, r2, r3, lsl #1                 @ src -= 2*stride (filter history)
1258 ldr r4, [sp, #16] @ my
1259 movrel lr, subpel_filters-16
1260 ldr r12, [sp, #8] @ h
1261 add r4, lr, r4, lsl #4
1262 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
@ first 7 rows into lane 0 of d2..d7/d28
1264 vld1.32 {d2[]}, [r2], r3
1265 vld1.32 {d3[]}, [r2], r3
1266 vld1.32 {d4[]}, [r2], r3
1267 vld1.32 {d5[]}, [r2], r3
1268 vld1.32 {d6[]}, [r2], r3
1269 vld1.32 {d7[]}, [r2], r3
1270 vld1.32 {d28[]}, [r2]
1271 sub r2, r2, r3, lsl #2                 @ rewind 4 rows
@ next 7 rows into lane 1
1272 vld1.32 {d2[1]}, [r2], r3
1273 vld1.32 {d3[1]}, [r2], r3
1274 vld1.32 {d4[1]}, [r2], r3
1275 vld1.32 {d5[1]}, [r2], r3
1276 vld1.32 {d6[1]}, [r2], r3
1277 vld1.32 {d7[1]}, [r2], r3
1278 vld1.32 {d28[1]}, [r2]
1279 sub r2, r2, r3, lsl #2
1281 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
@ store 4 output rows, one 32-bit lane each
1283 vst1.32 {d2[0]}, [r0,:32], r1
1284 vst1.32 {d3[0]}, [r0,:32], r1
1285 vst1.32 {d2[1]}, [r0,:32], r1
1286 vst1.32 {d3[1]}, [r0,:32], r1
@ 4-wide, 6-tap horizontal subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#12] = mx. Only d2[0] of each filtered
@ row is stored (4 output pixels).
1293 function ff_put_vp8_epel4_h6_neon, export=1
1297 ldr r4, [sp, #12] @ mx
1298 movrel lr, subpel_filters-16
1299 ldr r12, [sp, #8] @ h
1300 add r4, lr, r4, lsl #4
1301 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
1303 vld1.8 {q1}, [r2], r3                  @ 16 bytes (4 + filter edge, padded)
1304 vp8_epel8_h6 d2, d2, d3
1305 vst1.32 {d2[0]}, [r0,:32], r1
@ 4-wide, 6-tap horizontal + 6-tap vertical subpel MC. First pass writes
@ 4-pixel rows into a stack temp buffer; second pass loads them packed two
@ rows per d register (lanes 0/1) to reuse the 8-wide vertical macro.
1312 function ff_put_vp8_epel4_h6v6_neon, export=1
1313 sub r2, r2, r3, lsl #1                 @ src -= 2*stride for vertical history
1317 ldr r4, [sp, #12] @ mx
1318 movrel lr, subpel_filters-16
1319 ldr r12, [sp, #8] @ h
1320 add r4, lr, r4, lsl #4
1322 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1327 vld1.8 {q1}, [r2], r3
1328 vp8_epel8_h6 d2, d2, d3
1329 vst1.32 {d2[0]}, [lr,:32]!             @ 4-pixel row to temp buffer
1333 ldr r4, [sp, #52+16+16] @ my
1334 movrel lr, subpel_filters-16
1335 ldr r12, [sp, #52+16+8] @ h
1336 add r4, lr, r4, lsl #4
1338 vld1.16 {q0}, [r4,:128]                @ vertical taps
@ load temp rows: first group into lane 0, second group into lane 1
1341 vld1.8 {d2-d3}, [lr,:128]!
1342 vld1.8 {d6}, [lr,:64]!
1343 vld1.32 {d28[]}, [lr,:32]
1345 vld1.8 {d4-d5}, [lr]!
1346 vld1.8 {d7}, [lr,:64]!
1347 vld1.32 {d28[1]}, [lr,:32]
1351 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1352 vst1.32 {d2[0]}, [r0,:32], r1
1353 vst1.32 {d3[0]}, [r0,:32], r1
1354 vst1.32 {d2[1]}, [r0,:32], r1
1355 vst1.32 {d3[1]}, [r0,:32], r1
@ 4-wide, 4-tap horizontal + 6-tap vertical subpel MC. Same temp-buffer /
@ lane-packing scheme as the h6v6 variant above.
1363 function ff_put_vp8_epel4_h4v6_neon, export=1
1364 sub r2, r2, r3, lsl #1                 @ src -= 2*stride for vertical history
1368 ldr r4, [sp, #12] @ mx
1369 movrel lr, subpel_filters-16
1370 ldr r12, [sp, #8] @ h
1371 add r4, lr, r4, lsl #4
1373 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1378 vld1.8 {d2}, [r2], r3
1379 vp8_epel8_h4 d2, d2, d2
1380 vst1.32 {d2[0]}, [lr,:32]!             @ 4-pixel row to temp buffer
1384 ldr r4, [sp, #52+16+16] @ my
1385 movrel lr, subpel_filters-16
1386 ldr r12, [sp, #52+16+8] @ h
1387 add r4, lr, r4, lsl #4
1389 vld1.16 {q0}, [r4,:128]                @ vertical taps
@ load temp rows: first group into lane 0, second group into lane 1
1392 vld1.8 {d2-d3}, [lr,:128]!
1393 vld1.8 {d6}, [lr,:64]!
1394 vld1.32 {d28[]}, [lr,:32]
1396 vld1.8 {d4-d5}, [lr]!
1397 vld1.8 {d7}, [lr,:64]!
1398 vld1.32 {d28[1]}, [lr,:32]
1402 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1403 vst1.32 {d2[0]}, [r0,:32], r1
1404 vst1.32 {d3[0]}, [r0,:32], r1
1405 vst1.32 {d2[1]}, [r0,:32], r1
1406 vst1.32 {d3[1]}, [r0,:32], r1
@ 4-wide, 6-tap horizontal + 4-tap vertical subpel MC, temp-buffer /
@ lane-packing scheme as above (4-tap second pass needs fewer temp rows).
1414 function ff_put_vp8_epel4_h6v4_neon, export=1
1419 ldr r4, [sp, #12] @ mx
1420 movrel lr, subpel_filters-16
1421 ldr r12, [sp, #8] @ h
1422 add r4, lr, r4, lsl #4
1424 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1429 vld1.8 {q1}, [r2], r3
1430 vp8_epel8_h6 d2, d2, d3
1431 vst1.32 {d2[0]}, [lr,:32]!             @ 4-pixel row to temp buffer
1435 ldr r4, [sp, #44+16+16] @ my
1436 movrel lr, subpel_filters-16
1437 ldr r12, [sp, #44+16+8] @ h
1438 add r4, lr, r4, lsl #4
1440 vld1.16 {q0}, [r4,:128]                @ vertical taps
@ load temp rows: first group into lane 0, second group into lane 1
1443 vld1.8 {d2-d3}, [lr,:128]!
1444 vld1.32 {d6[]}, [lr,:32]
1446 vld1.8 {d4-d5}, [lr]!
1447 vld1.32 {d6[1]}, [lr,:32]
1450 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1451 vst1.32 {d2[0]}, [r0,:32], r1
1452 vst1.32 {d3[0]}, [r0,:32], r1
1453 vst1.32 {d2[1]}, [r0,:32], r1
1454 vst1.32 {d3[1]}, [r0,:32], r1
@ 4-wide, 4-tap horizontal subpel MC. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#12] = mx.
1462 function ff_put_vp8_epel4_h4_neon, export=1
1466 ldr r4, [sp, #12] @ mx
1467 movrel lr, subpel_filters-16
1468 ldr r12, [sp, #8] @ h
1469 add r4, lr, r4, lsl #4
1470 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
1472 vld1.8 {d2}, [r2], r3                  @ 8 bytes cover 4 output + filter edge
1473 vp8_epel8_h4 d2, d2, d2
1474 vst1.32 {d2[0]}, [r0,:32], r1
@ 4-wide, 4-tap vertical subpel MC, two 4-row passes packed into the two
@ 32-bit lanes of each d register. r0 = dst (stride r1), r2 = src
@ (stride r3), [sp,#8] = h, [sp,#16] = my.
1481 function ff_put_vp8_epel4_v4_neon, export=1
1485 ldr r4, [sp, #16] @ my
1486 movrel lr, subpel_filters-16
1487 ldr r12, [sp, #8] @ h
1488 add r4, lr, r4, lsl #4
1489 vld1.16 {q0}, [r4,:128]                @ filter taps -> q0
@ first 5 rows into lane 0
1491 vld1.32 {d2[]}, [r2], r3
1492 vld1.32 {d3[]}, [r2], r3
1493 vld1.32 {d4[]}, [r2], r3
1494 vld1.32 {d5[]}, [r2], r3
1495 vld1.32 {d6[]}, [r2]
1496 sub r2, r2, r3, lsl #1                 @ rewind 2 rows
@ next 5 rows into lane 1
1497 vld1.32 {d2[1]}, [r2], r3
1498 vld1.32 {d3[1]}, [r2], r3
1499 vld1.32 {d4[1]}, [r2], r3
1500 vld1.32 {d5[1]}, [r2], r3
1501 vld1.32 {d6[1]}, [r2]
1502 sub r2, r2, r3, lsl #1
1504 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
@ store 4 output rows, one 32-bit lane each
1506 vst1.32 {d2[0]}, [r0,:32], r1
1507 vst1.32 {d3[0]}, [r0,:32], r1
1508 vst1.32 {d2[1]}, [r0,:32], r1
1509 vst1.32 {d3[1]}, [r0,:32], r1
@ 4-wide, 4-tap horizontal + 4-tap vertical subpel MC, temp-buffer /
@ lane-packing scheme as in the other epel4 two-pass variants.
1516 function ff_put_vp8_epel4_h4v4_neon, export=1
1521 ldr r4, [sp, #12] @ mx
1522 movrel lr, subpel_filters-16
1523 ldr r12, [sp, #8] @ h
1524 add r4, lr, r4, lsl #4
1526 vld1.16 {q0}, [r4,:128]                @ horizontal taps
1531 vld1.8 {d2}, [r2], r3
1532 vp8_epel8_h4 d2, d2, d3
1533 vst1.32 {d2[0]}, [lr,:32]!             @ 4-pixel row to temp buffer
1537 ldr r4, [sp, #44+16+16] @ my
1538 movrel lr, subpel_filters-16
1539 ldr r12, [sp, #44+16+8] @ h
1540 add r4, lr, r4, lsl #4
1542 vld1.16 {q0}, [r4,:128]                @ vertical taps
@ load temp rows: first group into lane 0, second group into lane 1
1545 vld1.8 {d2-d3}, [lr,:128]!
1546 vld1.32 {d6[]}, [lr,:32]
1548 vld1.8 {d4-d5}, [lr]!
1549 vld1.32 {d6[1]}, [lr,:32]
1552 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1553 vst1.32 {d2[0]}, [r0,:32], r1
1554 vst1.32 {d3[0]}, [r0,:32], r1
1555 vst1.32 {d2[1]}, [r0,:32], r1
1556 vst1.32 {d3[1]}, [r0,:32], r1
1564 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1565 @ arithmetic can be used to apply filters
@ VP8 six-tap subpel filter table (RFC 6386 §6.1), one 16-byte row per
@ fractional position 1..7; callers index it with (mx|my) << 4 relative to
@ subpel_filters-16, so position 0 (no filter) has no row. The last two
@ shorts of each row are padding to reach 16 bytes.
1566 const subpel_filters, align=4
1567 .short 0, 6, 123, 12, 1, 0, 0, 0
1568 .short 2, 11, 108, 36, 8, 1, 0, 0
1569 .short 0, 9, 93, 50, 6, 0, 0, 0
1570 .short 3, 16, 77, 77, 16, 3, 0, 0
1571 .short 0, 6, 50, 93, 9, 0, 0, 0
1572 .short 1, 8, 36, 108, 11, 2, 0, 0
1573 .short 0, 1, 12, 123, 6, 0, 0, 0
@ 16-wide horizontal bilinear MC: dst = (src*(8-mx) + src[+1]*mx + 4) >> 3.
@ r0 = dst (stride r1), r2 = src (stride r3), [sp,#4] = mx; weights are
@ broadcast to d0/d1 in lines not visible. Two rows per iteration.
1578 function ff_put_vp8_bilin16_h_neon, export=1
1579 ldr r12, [sp, #4] @ mx
1586 vld1.8 {d2-d4}, [r2], r3               @ row 0: 17+ bytes for the +1 offset
1587 vext.8 q2, q1, q2, #1                  @ src shifted by one pixel
1590 vld1.8 {d18-d20},[r2], r3              @ row 1
1593 vext.8 q10, q9, q10, #1
1594 vmull.u8 q11, d18, d1                  @ row 1: src*(8-mx) + src[+1]*mx
1595 vmlal.u8 q11, d20, d0
1596 vmull.u8 q12, d19, d1
1597 vmlal.u8 q12, d21, d0
@ round and narrow: (acc + 4) >> 3 (row 0 MACs are in lines not visible)
1598 vrshrn.u16 d4, q8, #3
1599 vrshrn.u16 d5, q3, #3
1600 vrshrn.u16 d6, q11, #3
1601 vrshrn.u16 d7, q12, #3
1602 vst1.8 {q2}, [r0,:128], r1
1603 vst1.8 {q3}, [r0,:128], r1
@ 16-wide vertical bilinear MC: dst = (row*(8-my) + next_row*my + 4) >> 3.
@ r0 = dst (stride r1), r2 = src (stride r3), [sp,#8] = my. Two output
@ rows per iteration; q1 carries the last row into the next iteration.
1609 function ff_put_vp8_bilin16_v_neon, export=1
1610 ldr r12, [sp, #8] @ my
1615 vld1.8 {q1}, [r2], r3                  @ first source row
1618 vld1.8 {q2}, [r2], r3
1623 vld1.8 {q1}, [r2], r3
1626 vmull.u8 q10, d5, d1                   @ blend rows (other MACs not visible)
1627 vmlal.u8 q10, d3, d0
@ round and narrow: (acc + 4) >> 3
1628 vrshrn.u16 d4, q3, #3
1629 vrshrn.u16 d5, q8, #3
1630 vrshrn.u16 d6, q9, #3
1631 vrshrn.u16 d7, q10, #3
1632 vst1.8 {q2}, [r0,:128], r1
1633 vst1.8 {q3}, [r0,:128], r1
@ 16-wide horizontal+vertical bilinear MC: horizontal blend with mx
@ (weights d0/d1), then vertical blend of successive horizontal results
@ with my (weights d2/d3). r0 = dst (stride r1), r2 = src (stride r3),
@ [sp,#4] = mx, [sp,#8] = my. Two output rows per iteration; q2 carries
@ the previous horizontal result across iterations.
1639 function ff_put_vp8_bilin16_hv_neon, export=1
1640 ldr r12, [sp, #4] @ mx
1644 ldr r12, [sp, #8] @ my
1650 vld1.8 {d4-d6}, [r2], r3               @ prime: first horizontally-filtered row
1651 vext.8 q3, q2, q3, #1
1656 vrshrn.u16 d4, q8, #3
1657 vrshrn.u16 d5, q9, #3
@ per-iteration: two more horizontally-filtered rows (q3 then q2)
1660 vld1.8 {d18-d20},[r2], r3
1661 vext.8 q10, q9, q10, #1
1662 vmull.u8 q11, d18, d1
1663 vmlal.u8 q11, d20, d0
1664 vld1.8 {d26-d28},[r2], r3
1665 vmull.u8 q12, d19, d1
1666 vmlal.u8 q12, d21, d0
1667 vext.8 q14, q13, q14, #1
1668 vmull.u8 q8, d26, d1
1669 vmlal.u8 q8, d28, d0
1670 vmull.u8 q9, d27, d1
1671 vmlal.u8 q9, d29, d0
1672 vrshrn.u16 d6, q11, #3
1673 vrshrn.u16 d7, q12, #3
@ vertical blend: prev(q2)*(8-my) + cur(q3)*my
1674 vmull.u8 q12, d4, d3
1675 vmlal.u8 q12, d6, d2
1676 vmull.u8 q15, d5, d3
1677 vmlal.u8 q15, d7, d2
1678 vrshrn.u16 d4, q8, #3
1679 vrshrn.u16 d5, q9, #3
1680 vmull.u8 q10, d6, d3
1681 vmlal.u8 q10, d4, d2
1682 vmull.u8 q11, d7, d3
1683 vmlal.u8 q11, d5, d2
1684 vrshrn.u16 d24, q12, #3
1685 vrshrn.u16 d25, q15, #3
1686 vst1.8 {q12}, [r0,:128], r1
1687 vrshrn.u16 d20, q10, #3
1688 vrshrn.u16 d21, q11, #3
1689 vst1.8 {q10}, [r0,:128], r1
@ 8-wide horizontal bilinear MC. r0 = dst (stride r1), r2 = src (stride
@ r3), [sp,#4] = mx. Two rows per iteration (MACs in lines not visible).
1695 function ff_put_vp8_bilin8_h_neon, export=1
1696 ldr r12, [sp, #4] @ mx
1703 vld1.8 {q1}, [r2], r3                  @ row 0, 16 bytes for the +1 offset
1704 vext.8 d3, d2, d3, #1                  @ src shifted by one pixel
1707 vld1.8 {q3}, [r2], r3                  @ row 1
1708 vext.8 d7, d6, d7, #1
1711 vrshrn.u16 d4, q2, #3                  @ (acc + 4) >> 3
1712 vrshrn.u16 d16, q8, #3
1713 vst1.8 {d4}, [r0,:64], r1
1714 vst1.8 {d16}, [r0,:64], r1
@ 8-wide vertical bilinear MC. r0 = dst (stride r1), r2 = src (stride r3),
@ [sp,#8] = my. Two output rows per iteration; d2 carries the last row
@ forward (blend MACs in lines not visible).
1720 function ff_put_vp8_bilin8_v_neon, export=1
1721 ldr r12, [sp, #8] @ my
1726 vld1.8 {d2}, [r2], r3                  @ first source row
1729 vld1.8 {d3}, [r2], r3
1732 vld1.8 {d2}, [r2], r3
1735 vrshrn.u16 d4, q2, #3                  @ (acc + 4) >> 3
1736 vrshrn.u16 d6, q3, #3
1737 vst1.8 {d4}, [r0,:64], r1
1738 vst1.8 {d6}, [r0,:64], r1
@ 8-wide horizontal+vertical bilinear MC: horizontal blend (mx, d0/d1)
@ then vertical blend (my, d2/d3) of successive horizontal results.
@ r0 = dst (stride r1), r2 = src (stride r3), [sp,#4] = mx, [sp,#8] = my.
@ d22 carries the previous horizontal result across iterations.
1744 function ff_put_vp8_bilin8_hv_neon, export=1
1745 ldr r12, [sp, #4] @ mx
1749 ldr r12, [sp, #8] @ my
1755 vld1.8 {q2}, [r2], r3                  @ prime: first horizontal result
1756 vext.8 d5, d4, d5, #1
1759 vrshrn.u16 d22, q9, #3
@ per-iteration: two more horizontally-filtered rows
1762 vld1.8 {q3}, [r2], r3
1763 vext.8 d7, d6, d7, #1
1766 vld1.8 {q2}, [r2], r3
1767 vext.8 d5, d4, d5, #1
1770 vrshrn.u16 d16, q8, #3
@ vertical blend of prev/current horizontal rows
1771 vmull.u8 q10, d22, d3
1772 vmlal.u8 q10, d16, d2
1773 vrshrn.u16 d22, q9, #3
1774 vmull.u8 q12, d16, d3
1775 vmlal.u8 q12, d22, d2
1776 vrshrn.u16 d20, q10, #3
1777 vst1.8 {d20}, [r0,:64], r1
1778 vrshrn.u16 d23, q12, #3
1779 vst1.8 {d23}, [r0,:64], r1
@ 4-wide horizontal bilinear MC, two rows packed per iteration (results in
@ the two 32-bit lanes of d4). r0 = dst (stride r1), r2 = src (stride r3),
@ [sp,#4] = mx. Blend MACs are in lines not visible.
1785 function ff_put_vp8_bilin4_h_neon, export=1
1786 ldr r12, [sp, #4] @ mx
1793 vld1.8 {d2}, [r2], r3                  @ row 0
1794 vext.8 d3, d2, d3, #1                  @ src shifted by one pixel
1795 vld1.8 {d6}, [r2], r3                  @ row 1
1796 vext.8 d7, d6, d7, #1
1800 vrshrn.u16 d4, q2, #3                  @ (acc + 4) >> 3
1801 vst1.32 {d4[0]}, [r0,:32], r1
1802 vst1.32 {d4[1]}, [r0,:32], r1
@ 4-wide vertical bilinear MC, two output rows per iteration packed into
@ the two 32-bit lanes of d4. r0 = dst (stride r1), r2 = src (stride r3),
@ [sp,#8] = my. Blend MACs are in lines not visible.
1808 function ff_put_vp8_bilin4_v_neon, export=1
1809 ldr r12, [sp, #8] @ my
1814 vld1.32 {d2[]}, [r2], r3               @ rows packed into 32-bit lanes
1816 vld1.32 {d3[]}, [r2]
1817 vld1.32 {d2[1]}, [r2], r3
1818 vld1.32 {d3[1]}, [r2], r3
1822 vrshrn.u16 d4, q2, #3                  @ (acc + 4) >> 3
1823 vst1.32 {d4[0]}, [r0,:32], r1
1824 vst1.32 {d4[1]}, [r0,:32], r1
1831 function ff_put_vp8_bilin4_hv_neon, export=1
1832 ldr r12, [sp, #4] @ mx
1836 ldr r12, [sp, #8] @ my
1842 vld1.8 {d4}, [r2], r3
1843 vext.8 d5, d4, d4, #1
1846 vrshrn.u16 d22, q9, #3
1849 vld1.8 {d6}, [r2], r3
1850 vext.8 d7, d6, d6, #1
1851 vld1.8 {d4}, [r2], r3
1852 vext.8 d5, d4, d4, #1
1856 vrshrn.u16 d16, q8, #3
1857 vmull.u8 q10, d16, d2
1859 vmlal.u8 q10, d22, d3
1861 vrshrn.u16 d20, q10, #3
1862 vst1.32 {d20[0]}, [r0,:32], r1
1863 vst1.32 {d20[1]}, [r0,:32], r1