2 * VP8 NEON optimisations
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@ r1 = dc (16 coefficients, loaded into q0-q1), r0 = scatter base,
@ r3 = scatter stride between the 16 sub-blocks' DC slots.
@ After the inverse Walsh-Hadamard transform (elided in this extract),
@ the dc buffer is overwritten with q15 -- presumably zero, the
@ instruction setting q15 is not visible; verify against full source --
@ and one 16-bit result is scattered into each of the 16 sub-blocks.
@ NOTE(review): interior lines are elided here (embedded original line
@ numbers jump 27 -> 32 -> 35 -> 59); the transform itself is missing,
@ as is the closing endfunc.
26 function ff_vp8_luma_dc_wht_neon, export=1
27 vld1.16 {q0-q1}, [r1,:128]
32 vst1.16 {q15}, [r1,:128]!          @ clear first half of dc[]
35 vst1.16 {q15}, [r1,:128]           @ clear second half of dc[]
@ scatter the 16 transformed DC values, one per sub-block
59 vst1.16 {d0[0]}, [r0,:16], r3
60 vst1.16 {d1[0]}, [r0,:16], r3
61 vst1.16 {d2[0]}, [r0,:16], r3
62 vst1.16 {d3[0]}, [r0,:16], r3
63 vst1.16 {d0[1]}, [r0,:16], r3
64 vst1.16 {d1[1]}, [r0,:16], r3
65 vst1.16 {d2[1]}, [r0,:16], r3
66 vst1.16 {d3[1]}, [r0,:16], r3
67 vst1.16 {d0[2]}, [r0,:16], r3
68 vst1.16 {d1[2]}, [r0,:16], r3
69 vst1.16 {d2[2]}, [r0,:16], r3
70 vst1.16 {d3[2]}, [r0,:16], r3
71 vst1.16 {d0[3]}, [r0,:16], r3
72 vst1.16 {d1[3]}, [r0,:16], r3
73 vst1.16 {d2[3]}, [r0,:16], r3
74 vst1.16 {d3[3]}, [r0,:16], r3
@ void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ r0 = dst, r1 = block, r2 = stride.  Two-pass 4x4 inverse DCT.
@ d4[0]/d4[1] presumably hold the VP8 cosine multipliers (their load is
@ elided in this extract) -- TODO confirm against full source.
@ Between the passes the coefficient buffer is cleared with q15 stores
@ (q15 presumably zero; its setup is elided), then the result is added
@ to four 4-byte dst rows.
@ NOTE(review): the transposes, the butterfly adds/subs, the final
@ add-and-saturate to the loaded pixels, and endfunc are all elided
@ (embedded original line numbers jump).
79 function ff_vp8_idct_add_neon, export=1
80 vld1.16 {q0-q1}, [r1,:128]
@ first pass (columns)
85 vmull.s16 q12, d1, d4[0]
86 vmull.s16 q13, d3, d4[0]
87 vqdmulh.s16 d20, d1, d4[1]
88 vqdmulh.s16 d23, d3, d4[1]
89 vshrn.s32 d21, q12, #16
90 vshrn.s32 d22, q13, #16
96 vadd.s16 d18, d21, d23
97 vsub.s16 d19, d20, d22
@ second pass (rows), interleaved with clearing the coefficient buffer
107 vmull.s16 q12, d1, d4[0]
108 vst1.16 {q15}, [r1,:128]!
109 vmull.s16 q13, d2, d4[0]
110 vst1.16 {q15}, [r1,:128]
111 vqdmulh.s16 d21, d1, d4[1]
112 vqdmulh.s16 d23, d2, d4[1]
113 vshrn.s32 d20, q12, #16
114 vshrn.s32 d22, q13, #16
115 vadd.i16 d20, d20, d1
116 vadd.i16 d22, d22, d2
120 vadd.i16 d18, d20, d23
121 vld1.32 {d20[]}, [r0,:32], r2      @ load dst rows for the final add
122 vsub.i16 d19, d21, d22
123 vld1.32 {d22[]}, [r0,:32], r2
125 vld1.32 {d23[]}, [r0,:32], r2
127 vld1.32 {d21[]}, [r0,:32], r2
132 sub r0, r0, r2, lsl #2             @ rewind dst to the first row
144 vst1.32 {d0[0]}, [r0,:32], r2
145 vst1.32 {d0[1]}, [r0,:32], r2
146 vst1.32 {d1[1]}, [r0,:32], r2
147 vst1.32 {d1[0]}, [r0,:32], r2
@ void ff_vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ r0 = dst, r2 = stride.  Adds a single rounded DC value to a 4x4
@ block: four 32-bit dst rows are loaded, the widened DC add and
@ saturating narrow happen between the load and store groups (those
@ lines are elided in this extract), then the rows are written back.
@ The DC computation from block[0], and the clearing of block[0], are
@ also not visible here.
152 function ff_vp8_idct_dc_add_neon, export=1
158 vld1.32 {d0[]}, [r0,:32], r2
159 vld1.32 {d1[]}, [r0,:32], r2
160 vld1.32 {d0[1]}, [r0,:32], r2
161 vld1.32 {d1[1]}, [r0,:32], r2
164 sub r0, r0, r2, lsl #2             @ rewind dst to the first row
167 vst1.32 {d0[0]}, [r0,:32], r2
168 vst1.32 {d1[0]}, [r0,:32], r2
169 vst1.32 {d0[1]}, [r0,:32], r2
170 vst1.32 {d1[1]}, [r0,:32], r2
@ void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
@ r0 = dst, r1 = block, r2 = stride; r3 initially steps between the
@ four sub-blocks' DC slots (its setup, presumably #32, is elided --
@ TODO confirm), and is later reused as the store pointer (its
@ re-initialisation is elided too).  Loads the four DC values into
@ d16-d19 while clearing them (d0[0] presumably zero here; its setup
@ is not visible), rounds dc >>= 3, then adds the widened DC to eight
@ 8-byte rows.  The vaddw/vqmovun add lines between the loads and
@ stores are elided (embedded original line numbers jump).
174 function ff_vp8_idct_dc_add4uv_neon, export=1
177 vld1.16 {d16[]}, [r1,:16]          @ dc of sub-block 0
178 vst1.16 {d0[0]}, [r1,:16], r3      @ ... and clear it
179 vld1.16 {d17[]}, [r1,:16]
180 vst1.16 {d0[0]}, [r1,:16], r3
181 vld1.16 {d18[]}, [r1,:16]
182 vst1.16 {d0[0]}, [r1,:16], r3
183 vld1.16 {d19[]}, [r1,:16]
184 vst1.16 {d0[0]}, [r1,:16], r3
186 vrshr.s16 q8, q8, #3 @ dc >>= 3
187 vld1.8 {d0}, [r0,:64], r2
189 vld1.8 {d1}, [r0,:64], r2
191 vld1.8 {d2}, [r0,:64], r2
193 vld1.8 {d3}, [r0,:64], r2
195 vld1.8 {d4}, [r0,:64], r2
197 vld1.8 {d5}, [r0,:64], r2
199 vld1.8 {d6}, [r0,:64], r2
201 vld1.8 {d7}, [r0,:64], r2
@ store the dc-adjusted rows (r3 now points back at the first row)
207 vst1.8 {d20}, [r3,:64], r2
209 vst1.8 {d21}, [r3,:64], r2
211 vst1.8 {d22}, [r3,:64], r2
213 vst1.8 {d23}, [r3,:64], r2
215 vst1.8 {d24}, [r3,:64], r2
217 vst1.8 {d25}, [r3,:64], r2
218 vst1.8 {d26}, [r3,:64], r2
219 vst1.8 {d27}, [r3,:64], r2
@ void ff_vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
@ Same scheme as the 4uv variant above but for four horizontally
@ adjacent luma sub-blocks: four rows of 16 pixels each get one of the
@ four rounded DC values added.  r0 = dst, r1 = block, r2 = stride,
@ r3 = step between DC slots (setup elided).  The widened add /
@ saturating-narrow lines between the loads and stores are elided
@ (embedded original line numbers jump), as is endfunc.
224 function ff_vp8_idct_dc_add4y_neon, export=1
227 vld1.16 {d16[]}, [r1,:16]          @ dc of sub-block 0
228 vst1.16 {d0[0]}, [r1,:16], r3      @ ... and clear it
229 vld1.16 {d17[]}, [r1,:16]
230 vst1.16 {d0[0]}, [r1,:16], r3
231 vld1.16 {d18[]}, [r1,:16]
232 vst1.16 {d0[0]}, [r1,:16], r3
233 vld1.16 {d19[]}, [r1,:16]
234 vst1.16 {d0[0]}, [r1,:16], r3
235 vrshr.s16 q8, q8, #3 @ dc >>= 3
236 vld1.8 {q0}, [r0,:128], r2
238 vld1.8 {q1}, [r0,:128], r2
240 vld1.8 {q2}, [r0,:128], r2
242 vld1.8 {q3}, [r0,:128], r2
249 sub r0, r0, r2, lsl #2             @ rewind dst to the first row
255 vst1.8 {q10}, [r0,:128], r2
257 vst1.8 {q11}, [r0,:128], r2
259 vst1.8 {q12}, [r0,:128], r2
261 vst1.8 {q13}, [r0,:128], r2
@ vp8_loop_filter: core of the VP8 loop filter, shared by all the
@ horizontal/vertical, luma/chroma variants below.
@ Register convention (established by the callers):
@   q0..q7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3 (16 pixels per row/column pair)
@   q14 = flim_E, q15 = flim_I (vdup'd by the caller), r12 = hev_thresh
@ \inner selects the inner-edge (4-tap only) filter, \simple the
@ simple-loop-filter variant.
@ NOTE(review): the .if/.else/.endif conditional-assembly directives
@ selecting between the variants are largely elided in this extract
@ (embedded original line numbers jump), so mutually exclusive code
@ paths appear back to back below -- only one of them is assembled for
@ a given (inner, simple) combination in the full source.
272 .macro vp8_loop_filter, inner=0, simple=0
@ (simple path) edge-strength test only
274 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
275 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
276 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
277 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
278 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
280 vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
282 @ calculate hev and normal_limit:
283 vabd.u8 q12, q2, q3 @ abs(P1-P0)
284 vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
285 vabd.u8 q10, q0, q1 @ abs(P3-P2)
286 vabd.u8 q11, q1, q2 @ abs(P2-P1)
287 vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
288 vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
289 vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
290 vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
292 vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
294 vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
296 vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
297 vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
298 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
299 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
301 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
303 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
304 vdup.8 q15, r12 @ hev_thresh
305 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
306 vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
307 vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
308 vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
318 @ convert to signed value:
319 veor q3, q3, q13 @ PS0 = P0 ^ 0x80
320 veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
323 vsubl.s8 q10, d8, d6 @ QS0 - PS0
324 vsubl.s8 q11, d9, d7 @ (widened to 16bit)
325 veor q2, q2, q13 @ PS1 = P1 ^ 0x80
326 veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
327 vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
328 vmul.i16 q11, q11, q12
330 vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
334 vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
336 vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
337 vaddw.s8 q11, q11, d25
338 vqmovn.s16 d20, q10 @ narrow result back into q10
340 .if !\inner && !\simple
341 veor q1, q1, q13 @ PS2 = P2 ^ 0x80
342 veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
344 vand q10, q10, q8 @ w &= normal_limit
346 @ registers used at this point..
347 @ q0 -> P3 (don't corrupt)
349 @ q7 -> Q3 (don't corrupt)
355 @ q8, q11, q12 -> unused
357 @ filter_common: is4tap==1
358 @ c1 = clamp(w + 4) >> 3;
359 @ c2 = clamp(w + 3) >> 3;
360 @ Q0 = s2u(QS0 - c1);
361 @ P0 = s2u(PS0 + c2);
@ (simple variant of filter_common; q14/q15 presumably hold the +4/+3
@ constants here -- their setup is elided)
364 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
365 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
366 vshr.s8 q11, q11, #3 @ c1 >>= 3
367 vshr.s8 q12, q12, #3 @ c2 >>= 3
368 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
369 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
370 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
371 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
372 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
373 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
375 @ the !is4tap case of filter_common, only used for inner blocks
376 @ c3 = ((c1&~hev) + 1) >> 1;
377 @ Q1 = s2u(QS1 - c3);
378 @ P1 = s2u(PS1 + c3);
379 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
380 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
381 vshr.s8 q11, q11, #3 @ c1 >>= 3
382 vshr.s8 q12, q12, #3 @ c2 >>= 3
383 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
384 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
385 vbic q11, q11, q9 @ c1 & ~hev
386 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
387 vrshr.s8 q11, q11, #1 @ c3 >>= 1
388 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
389 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
390 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
391 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
392 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
@ (outer-edge path: apply filter_common with the hev-masked w, then
@ the 6-tap filter_mbedge below with the hev-complement)
394 vand q12, q10, q9 @ w & hev
395 vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
396 vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
397 vshr.s8 q11, q11, #3 @ c1 >>= 3
398 vshr.s8 q12, q12, #3 @ c2 >>= 3
399 vbic q10, q10, q9 @ w &= ~hev
400 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
401 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
404 @ a = clamp((27*w + 63) >> 7);
407 @ a = clamp((18*w + 63) >> 7);
410 @ a = clamp((9*w + 63) >> 7);
@ build 9*w (= 8*w + w) once, then accumulate it to get 18*w and 27*w;
@ the "+ 63" bias setup (presumably in q8/q9) is elided in this extract
414 vshll.s8 q14, d20, #3
415 vshll.s8 q15, d21, #3
416 vaddw.s8 q14, q14, d20
417 vaddw.s8 q15, q15, d21
419 vadd.s16 q9, q9, q15 @ 9*w + 63
420 vadd.s16 q11, q8, q14
421 vadd.s16 q12, q9, q15 @ 18*w + 63
422 vadd.s16 q14, q11, q14
423 vadd.s16 q15, q12, q15 @ 27*w + 63
424 vqshrn.s16 d16, q8, #7
425 vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
426 vqshrn.s16 d22, q11, #7
427 vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
428 vqshrn.s16 d28, q14, #7
429 vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
430 vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
431 vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
432 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
433 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
434 vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
435 vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
436 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
437 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
438 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
439 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
440 veor q1, q1, q13 @ P2 = PS2 ^ 0x80
441 veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
@ transpose8x16matrix: in-register transpose used by the horizontal
@ loop filters; its entire body (and .endm) is elided in this extract.
.macro transpose8x16matrix
@ vp8_v_loop_filter16: vertical (horizontal-edge) 16-pixel luma loop
@ filter.  C prototype (per variant):
@   void ff_vp8_v_loop_filter16[\name](uint8_t *dst, ptrdiff_t stride,
@        int flim_E, int flim_I[, int hev_thresh])
@ r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I, hev_thresh on the
@ stack.  Backs dst up above the edge, loads P3..Q3 rows into q0..q7,
@ dups the limits into q14/q15, runs vp8_loop_filter, then stores the
@ filtered P2..Q2 rows back.
@ NOTE(review): the vpush/vpop prologue/epilogue, the .if guards that
@ skip loads/stores in the simple variant, bx lr / endfunc and .endm
@ are elided in this extract (embedded original line numbers jump).
.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
sub r0, r0, r1, lsl #1+!\simple        @ dst -= 4*stride (2*stride if simple)
ldr r12, [sp, #64] @ hev_thresh
vld1.8 {q0}, [r0,:128], r1 @ P3
vld1.8 {q1}, [r0,:128], r1 @ P2
vld1.8 {q2}, [r0,:128], r1 @ P1
vld1.8 {q3}, [r0,:128], r1 @ P0
vld1.8 {q4}, [r0,:128], r1 @ Q0
vld1.8 {q5}, [r0,:128], r1 @ Q1
vld1.8 {q6}, [r0,:128], r1 @ Q2
vld1.8 {q7}, [r0,:128] @ Q3
vdup.8 q15, r3 @ flim_I
vdup.8 q14, r2 @ flim_E
vp8_loop_filter inner=\inner, simple=\simple
@ back up to P2: dst -= stride * 6
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
vst1.8 {q1}, [r0,:128], r1 @ P2
vst1.8 {q2}, [r0,:128], r1 @ P1
vst1.8 {q3}, [r0,:128], r1 @ P0
vst1.8 {q4}, [r0,:128], r1 @ Q0
vst1.8 {q5}, [r0,:128], r1 @ Q1
vst1.8 {q6}, [r0,:128] @ Q2
@ instantiate the inner and simple variants (the plain variant's
@ instantiation line is elided in this extract)
vp8_v_loop_filter16 _inner, inner=1
vp8_v_loop_filter16 _simple, simple=1
@ vp8_v_loop_filter8uv: vertical loop filter for the two 8-pixel-wide
@ chroma planes, filtered together (u rows in the low d registers,
@ v rows in the high ones, giving the same 16-wide layout the core
@ macro expects).  C prototype (per variant):
@   void ff_vp8_v_loop_filter8uv[\name](uint8_t *u, uint8_t *v,
@        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E; flim_I and hev_thresh on
@ the stack.
@ NOTE(review): vpush/vpop, bx lr / endfunc, .endm and the plain
@ variant's instantiation line are elided in this extract.
.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
sub r0, r0, r2, lsl #2                 @ u -= 4*stride
sub r1, r1, r2, lsl #2                 @ v -= 4*stride
ldr r12, [sp, #64] @ flim_I
vld1.8 {d0}, [r0,:64], r2 @ P3
vld1.8 {d1}, [r1,:64], r2 @ P3
vld1.8 {d2}, [r0,:64], r2 @ P2
vld1.8 {d3}, [r1,:64], r2 @ P2
vld1.8 {d4}, [r0,:64], r2 @ P1
vld1.8 {d5}, [r1,:64], r2 @ P1
vld1.8 {d6}, [r0,:64], r2 @ P0
vld1.8 {d7}, [r1,:64], r2 @ P0
vld1.8 {d8}, [r0,:64], r2 @ Q0
vld1.8 {d9}, [r1,:64], r2 @ Q0
vld1.8 {d10}, [r0,:64], r2 @ Q1
vld1.8 {d11}, [r1,:64], r2 @ Q1
vld1.8 {d12}, [r0,:64], r2 @ Q2
vld1.8 {d13}, [r1,:64], r2 @ Q2
vld1.8 {d14}, [r0,:64] @ Q3
vld1.8 {d15}, [r1,:64] @ Q3
vdup.8 q14, r3 @ flim_E
vdup.8 q15, r12 @ flim_I
ldr r12, [sp, #68] @ hev_thresh
vp8_loop_filter inner=\inner
@ back up to P2: u,v -= stride * 6
sub r0, r0, r2, lsl #2
sub r1, r1, r2, lsl #2
sub r0, r0, r2, lsl #1
sub r1, r1, r2, lsl #1
vst1.8 {d2}, [r0,:64], r2 @ P2
vst1.8 {d3}, [r1,:64], r2 @ P2
vst1.8 {d4}, [r0,:64], r2 @ P1
vst1.8 {d5}, [r1,:64], r2 @ P1
vst1.8 {d6}, [r0,:64], r2 @ P0
vst1.8 {d7}, [r1,:64], r2 @ P0
vst1.8 {d8}, [r0,:64], r2 @ Q0
vst1.8 {d9}, [r1,:64], r2 @ Q0
vst1.8 {d10}, [r0,:64], r2 @ Q1
vst1.8 {d11}, [r1,:64], r2 @ Q1
vst1.8 {d12}, [r0,:64] @ Q2
vst1.8 {d13}, [r1,:64] @ Q2
vp8_v_loop_filter8uv _inner, inner=1
@ vp8_h_loop_filter16: horizontal (vertical-edge) 16-pixel luma loop
@ filter.  Same C prototype as the vertical variant; r0 = dst,
@ r1 = stride, r2 = flim_E, r3 = flim_I, hev_thresh on the stack.
@ Loads 16 rows of 8 bytes straddling the edge; the in-register
@ transpose (presumably transpose8x16matrix, both before and after the
@ filter) is elided in this extract -- TODO confirm, as are the
@ initial "sub r0, r0, #4"-style column adjustment, vpush/vpop, the
@ last store, endfunc and .endm.
.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
ldr r12, [sp, #64] @ hev_thresh
vld1.8 {d0}, [r0], r1 @ load first 8-line src data
vld1.8 {d2}, [r0], r1
vld1.8 {d4}, [r0], r1
vld1.8 {d6}, [r0], r1
vld1.8 {d8}, [r0], r1
vld1.8 {d10}, [r0], r1
vld1.8 {d12}, [r0], r1
vld1.8 {d14}, [r0], r1
vld1.8 {d1}, [r0], r1 @ load second 8-line src data
vld1.8 {d3}, [r0], r1
vld1.8 {d5}, [r0], r1
vld1.8 {d7}, [r0], r1
vld1.8 {d9}, [r0], r1
vld1.8 {d11}, [r0], r1
vld1.8 {d13}, [r0], r1
vld1.8 {d15}, [r0], r1
vdup.8 q14, r2 @ flim_E
vdup.8 q15, r3 @ flim_I
vp8_loop_filter inner=\inner, simple=\simple
sub r0, r0, r1, lsl #4 @ backup 16 rows
vst1.8 {d0}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d7}, [r0], r1
vst1.8 {d9}, [r0], r1
vst1.8 {d11}, [r0], r1
vst1.8 {d13}, [r0], r1
vp8_h_loop_filter16 _inner, inner=1
vp8_h_loop_filter16 _simple, simple=1
@ vp8_h_loop_filter8uv: horizontal loop filter for the chroma planes,
@ u and v filtered together (u in even rows / v in odd rows of the
@ register file).  C prototype (per variant):
@   void ff_vp8_h_loop_filter8uv[\name](uint8_t *u, uint8_t *v,
@        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E; flim_I / hev_thresh on the
@ stack.  As with the 16-pixel variant, the column adjustment and the
@ transposes around vp8_loop_filter are elided in this extract, as are
@ vpush/vpop, the last two stores, endfunc, .endm and the plain
@ variant's instantiation line.
.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
ldr r12, [sp, #64] @ flim_I
vld1.8 {d0}, [r0], r2 @ load u
vld1.8 {d1}, [r1], r2 @ load v
vld1.8 {d2}, [r0], r2
vld1.8 {d3}, [r1], r2
vld1.8 {d4}, [r0], r2
vld1.8 {d5}, [r1], r2
vld1.8 {d6}, [r0], r2
vld1.8 {d7}, [r1], r2
vld1.8 {d8}, [r0], r2
vld1.8 {d9}, [r1], r2
vld1.8 {d10}, [r0], r2
vld1.8 {d11}, [r1], r2
vld1.8 {d12}, [r0], r2
vld1.8 {d13}, [r1], r2
vld1.8 {d14}, [r0], r2
vld1.8 {d15}, [r1], r2
vdup.8 q14, r3 @ flim_E
vdup.8 q15, r12 @ flim_I
ldr r12, [sp, #68] @ hev_thresh
vp8_loop_filter inner=\inner
sub r0, r0, r2, lsl #3 @ backup u 8 rows
sub r1, r1, r2, lsl #3 @ backup v 8 rows
vst1.8 {d0}, [r0], r2
vst1.8 {d1}, [r1], r2
vst1.8 {d2}, [r0], r2
vst1.8 {d3}, [r1], r2
vst1.8 {d4}, [r0], r2
vst1.8 {d5}, [r1], r2
vst1.8 {d6}, [r0], r2
vst1.8 {d7}, [r1], r2
vst1.8 {d8}, [r0], r2
vst1.8 {d9}, [r1], r2
vst1.8 {d10}, [r0], r2
vst1.8 {d11}, [r1], r2
vst1.8 {d12}, [r0], r2
vst1.8 {d13}, [r1], r2
vp8_h_loop_filter8uv _inner, inner=1
@ void ff_put_vp8_pixels16(uint8_t *dst, ptrdiff_t dst_stride,
@                          const uint8_t *src, ptrdiff_t src_stride,
@                          int h, int mx, int my)
@ Plain 16-wide copy, four rows per iteration.  dst is 16-byte aligned
@ (:128 qualifier); src may be unaligned.  The loop label, the
@ "subs r12, #4" counter update, the branch and endfunc are elided in
@ this extract (embedded original line numbers jump).
700 function ff_put_vp8_pixels16_neon, export=1
701 ldr r12, [sp, #0] @ h
704 vld1.8 {q0}, [r2], r3
705 vld1.8 {q1}, [r2], r3
706 vld1.8 {q2}, [r2], r3
707 vld1.8 {q3}, [r2], r3
708 vst1.8 {q0}, [r0,:128], r1
709 vst1.8 {q1}, [r0,:128], r1
710 vst1.8 {q2}, [r0,:128], r1
711 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_pixels8: same, 8 bytes wide, dst 8-byte aligned.
716 function ff_put_vp8_pixels8_neon, export=1
717 ldr r12, [sp, #0] @ h
720 vld1.8 {d0}, [r2], r3
721 vld1.8 {d1}, [r2], r3
722 vld1.8 {d2}, [r2], r3
723 vld1.8 {d3}, [r2], r3
724 vst1.8 {d0}, [r0,:64], r1
725 vst1.8 {d1}, [r0,:64], r1
726 vst1.8 {d2}, [r0,:64], r1
727 vst1.8 {d3}, [r0,:64], r1
732 /* 4/6-tap 8th-pel MC */
@ 4/6-tap 8th-pel MC filter kernels.  All of these expect the 16-bit
@ filter coefficients in d0-d1 (one subpel_filters row, loaded by the
@ callers) and accumulate in 16-bit lanes; results are narrowed back
@ to u8 with vqrshrun #7.
@ NOTE(review): the widening (vmovl/vaddl-style) setup lines at the
@ top of each macro, and the closing .endm lines, are elided in this
@ extract (embedded original line numbers jump); only the
@ multiply/accumulate tails are visible below.
@
@ vp8_epel8_h6: 6-tap horizontal filter of 8 pixels; \a:\b supply the
@ 13 source bytes, result in \d.
.macro vp8_epel8_h6 d, a, b
vext.8 d27, \a, \b, #1
vext.8 d28, \a, \b, #2
vext.8 d29, \a, \b, #3
vext.8 d30, \a, \b, #4
vext.8 d31, \a, \b, #5
vmul.u16 q10, q10, d0[2]
vmul.u16 q11, q11, d0[3]
vmls.u16 q10, q9, d0[1]
vmls.u16 q11, q12, d1[0]
vmla.u16 q10, q8, d0[0]
vmla.u16 q11, q13, d1[1]
vqadd.s16 q11, q10, q11
vqrshrun.s16 \d, q11, #7
@ vp8_epel16_h6: 16-pixel-wide 6-tap horizontal filter; two 8-pixel
@ results \d0/\d1 from source registers \q0:\q1.
.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
vext.8 q14, \q0, \q1, #3
vext.8 q15, \q0, \q1, #4
vext.8 q3, \q0, \q1, #2
vext.8 q8, \q0, \q1, #1
vext.8 q2, \q0, \q1, #5
vmul.u16 q11, q11, d0[3]
vmul.u16 q10, q10, d0[2]
vmul.u16 q3, q3, d0[2]
vmul.u16 q14, q14, d0[3]
vmls.u16 q11, q12, d1[0]
vmls.u16 q10, q9, d0[1]
vmls.u16 q3, q8, d0[1]
vmls.u16 q14, q15, d1[0]
vmla.u16 q10, q12, d0[0]
vmla.u16 q11, q13, d1[1]
vmla.u16 q3, q1, d0[0]
vmla.u16 q14, q2, d1[1]
vqadd.s16 q11, q10, q11
vqadd.s16 q14, q3, q14
vqrshrun.s16 \d0, q11, #7
vqrshrun.s16 \d1, q14, #7
@ vp8_epel8_v6: 6-tap vertical filter; one output row \d0 from six
@ source rows \s0..\s5.
.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
vmul.u16 q10, q10, d0[2]
vmul.u16 q11, q11, d0[3]
vmls.u16 q10, q9, d0[1]
vmls.u16 q11, q12, d1[0]
vmla.u16 q10, q8, d0[0]
vmla.u16 q11, q13, d1[1]
vqadd.s16 q11, q10, q11
vqrshrun.s16 \d0, q11, #7
@ vp8_epel8_v6_y2: 6-tap vertical filter producing two output rows
@ \d0/\d1 from seven source rows \s0..\s6 (shared widened inputs).
.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
vmul.u16 q10, q10, d0[0]
vmul.u16 q15, q11, d0[3]
vmul.u16 q11, q11, d0[2]
vmul.u16 q14, q14, d1[1]
vmls.u16 q10, q9, d0[1]
vmls.u16 q15, q12, d1[0]
vmls.u16 q11, q8, d0[1]
vmls.u16 q14, q13, d1[0]
vmla.u16 q10, q8, d0[2]
vmla.u16 q15, q13, d1[1]
vmla.u16 q11, q9, d0[0]
vmla.u16 q14, q12, d0[3]
vqadd.s16 q15, q10, q15
vqadd.s16 q14, q11, q14
vqrshrun.s16 \d0, q15, #7
vqrshrun.s16 \d1, q14, #7
@ vp8_epel8_h4: 4-tap horizontal filter (taps d0[1]..d1[0]).
.macro vp8_epel8_h4 d, a, b
vext.8 d28, \a, \b, #1
vext.8 d29, \a, \b, #2
vext.8 d30, \a, \b, #3
vmul.u16 q10, q10, d0[2]
vmul.u16 q11, q11, d0[3]
vmls.u16 q10, q9, d0[1]
vmls.u16 q11, q12, d1[0]
vqadd.s16 q11, q10, q11
vqrshrun.s16 \d, q11, #7
@ vp8_epel8_v4_y2: 4-tap vertical filter, two output rows \d0/\d1 from
@ five source rows \s0..\s4.
.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
vmul.u16 q8, q10, d0[2]
vmul.u16 q14, q11, d0[3]
vmul.u16 q11, q11, d0[2]
vmul.u16 q15, q12, d0[3]
vmls.u16 q8, q9, d0[1]
vmls.u16 q14, q12, d1[0]
vmls.u16 q11, q10, d0[1]
vmls.u16 q15, q13, d1[0]
vqadd.s16 q8, q8, q14
vqadd.s16 q11, q11, q15
vqrshrun.s16 \d0, q8, #7
vqrshrun.s16 \d1, q11, #7
@ ff_put_vp8_epel* wrappers: r0 = dst, r1 = dst stride, r2 = src,
@ r3 = src stride; h, mx, my live on the stack (the exact offsets
@ depend on each function's elided push/vpush prologue).  Each wrapper
@ indexes subpel_filters by (mx or my) * 16 -- the "-16" bias makes
@ index 1 map to the table's first row -- and loads the taps into q0
@ for the vp8_epel* macros above.
@ NOTE(review): prologues/epilogues, loop labels, counter updates,
@ branches and endfunc are elided throughout this extract; the h+v
@ functions also allocate a stack temp buffer whose setup is elided.
@
@ ff_put_vp8_epel16_v6: 16-wide 6-tap vertical filter.
871 function ff_put_vp8_epel16_v6_neon, export=1
872 sub r2, r2, r3, lsl #1             @ src -= 2*src_stride (filter history)
876 ldr r4, [sp, #80] @ my
877 movrel lr, subpel_filters-16
878 ldr r12, [sp, #72] @ h
879 add r4, lr, r4, lsl #4
880 vld1.16 {q0}, [r4,:128]            @ filter taps for this my
882 vld1.8 {d2-d3}, [r2], r3
883 vld1.8 {d4-d5}, [r2], r3
884 vld1.8 {d6-d7}, [r2], r3
885 vld1.8 {d8-d9}, [r2], r3
886 vld1.8 {d10-d11},[r2], r3
887 vld1.8 {d12-d13},[r2], r3
888 vld1.8 {d14-d15},[r2]
889 sub r2, r2, r3, lsl #2             @ rewind: next iteration reuses 5 rows
891 vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
892 vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
894 vst1.8 {d2-d3}, [r0,:128], r1
895 vst1.8 {d4-d5}, [r0,:128], r1
@ ff_put_vp8_epel16_h6: 16-wide 6-tap horizontal filter.
903 function ff_put_vp8_epel16_h6_neon, export=1
907 ldr r4, [sp, #12] @ mx
908 movrel lr, subpel_filters-16
909 ldr r12, [sp, #8] @ h
910 add r4, lr, r4, lsl #4
911 vld1.16 {q0}, [r4,:128]
913 vld1.8 {d2-d4}, [r2], r3           @ 24 source bytes per row
915 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
917 vst1.8 {d2-d3}, [r0,:128], r1
@ ff_put_vp8_epel16_h6v6: horizontal pass into a stack temp buffer,
@ then vertical pass out of it (buffer allocation elided).
924 function ff_put_vp8_epel16_h6v6_neon, export=1
925 sub r2, r2, r3, lsl #1
930 @ first pass (horizontal):
931 ldr r4, [sp, #28] @ mx
932 movrel lr, subpel_filters-16
933 ldr r12, [sp, #24] @ h
934 add r4, lr, r4, lsl #4
936 vld1.16 {q0}, [r4,:128]
941 vld1.8 {d2,d3,d4}, [r2], r3
943 vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
945 vst1.8 {d2-d3}, [lr,:128]!         @ lr = temp buffer write pointer
949 @ second pass (vertical):
950 ldr r4, [sp, #336+16+32] @ my
951 movrel lr, subpel_filters-16
952 ldr r12, [sp, #336+16+24] @ h
953 add r4, lr, r4, lsl #4
955 vld1.16 {q0}, [r4,:128]
958 vld1.8 {d2-d5}, [lr,:128]!         @ lr now reads the temp buffer
959 vld1.8 {d6-d9}, [lr,:128]!
960 vld1.8 {d28-d31},[lr,:128]
963 vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
964 vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
966 vst1.8 {d2-d3}, [r0,:128], r1
@ 8-wide epel wrappers (v6, h6, h6v6, v4) -- same calling convention
@ and same elisions as the 16-wide group above.
@
@ ff_put_vp8_epel8_v6: 6-tap vertical, two rows per iteration.
975 function ff_put_vp8_epel8_v6_neon, export=1
976 sub r2, r2, r3, lsl #1             @ src -= 2*src_stride
979 ldr r4, [sp, #16] @ my
980 movrel lr, subpel_filters-16
981 ldr r12, [sp, #8] @ h
982 add r4, lr, r4, lsl #4
983 vld1.16 {q0}, [r4,:128]
985 vld1.8 {d2}, [r2], r3
986 vld1.8 {d3}, [r2], r3
987 vld1.8 {d4}, [r2], r3
988 vld1.8 {d5}, [r2], r3
989 vld1.8 {d6}, [r2], r3
990 vld1.8 {d7}, [r2], r3
993 sub r2, r2, r3, lsl #2             @ rewind for row overlap
@ NOTE(review): the load of the 7th row into d28 is elided here
995 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
997 vst1.8 {d2}, [r0,:64], r1
998 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h6: 6-tap horizontal.
1005 function ff_put_vp8_epel8_h6_neon, export=1
1009 ldr r4, [sp, #12] @ mx
1010 movrel lr, subpel_filters-16
1011 ldr r12, [sp, #8] @ h
1012 add r4, lr, r4, lsl #4
1013 vld1.16 {q0}, [r4,:128]
1015 vld1.8 {d2,d3}, [r2], r3
1017 vp8_epel8_h6 d2, d2, d3
1019 vst1.8 {d2}, [r0,:64], r1
@ ff_put_vp8_epel8_h6v6: horizontal pass to stack temp, then vertical.
1026 function ff_put_vp8_epel8_h6v6_neon, export=1
1027 sub r2, r2, r3, lsl #1
1031 @ first pass (horizontal):
1032 ldr r4, [sp, #12] @ mx
1033 movrel lr, subpel_filters-16
1034 ldr r12, [sp, #8] @ h
1035 add r4, lr, r4, lsl #4
1037 vld1.16 {q0}, [r4,:128]
1042 vld1.8 {d2,d3}, [r2], r3
1044 vp8_epel8_h6 d2, d2, d3
1046 vst1.8 {d2}, [lr,:64]!             @ lr = temp buffer write pointer
1050 @ second pass (vertical):
1051 ldr r4, [sp, #168+16+16] @ my
1052 movrel lr, subpel_filters-16
1053 ldr r12, [sp, #168+16+8] @ h
1054 add r4, lr, r4, lsl #4
1056 vld1.16 {q0}, [r4,:128]
1059 vld1.8 {d2-d5}, [lr,:128]!         @ read back 7 temp rows
1060 vld1.8 {d6-d7}, [lr,:128]!
1061 vld1.8 {d30}, [lr,:64]
1064 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1066 vst1.8 {d2}, [r0,:64], r1
1067 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_v4: 4-tap vertical, two rows per iteration.
1075 function ff_put_vp8_epel8_v4_neon, export=1
1079 ldr r4, [sp, #16] @ my
1080 movrel lr, subpel_filters-16
1081 ldr r12, [sp, #8] @ h
1082 add r4, lr, r4, lsl #4
1083 vld1.16 {q0}, [r4,:128]
1085 vld1.8 {d2}, [r2], r3
1086 vld1.8 {d3}, [r2], r3
1087 vld1.8 {d4}, [r2], r3
1088 vld1.8 {d5}, [r2], r3
1090 sub r2, r2, r3, lsl #1             @ rewind for row overlap
@ NOTE(review): the load of the 5th row into d6 is elided here
1092 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1094 vst1.8 {d2}, [r0,:64], r1
1095 vst1.8 {d3}, [r0,:64], r1
@ Remaining 8-wide epel wrappers (h4, h4v4, h6v4, h4v6).  Same calling
@ convention and elisions as above; the h+v variants run the first
@ pass into a stack temp buffer (allocation elided) and the second
@ pass out of it.
@
@ ff_put_vp8_epel8_h4: 4-tap horizontal.
1102 function ff_put_vp8_epel8_h4_neon, export=1
1106 ldr r4, [sp, #12] @ mx
1107 movrel lr, subpel_filters-16
1108 ldr r12, [sp, #8] @ h
1109 add r4, lr, r4, lsl #4
1110 vld1.16 {q0}, [r4,:128]
1112 vld1.8 {d2,d3}, [r2], r3
1114 vp8_epel8_h4 d2, d2, d3
1116 vst1.8 {d2}, [r0,:64], r1
@ ff_put_vp8_epel8_h4v4: 4-tap horizontal, then 4-tap vertical.
1123 function ff_put_vp8_epel8_h4v4_neon, export=1
1128 @ first pass (horizontal):
1129 ldr r4, [sp, #12] @ mx
1130 movrel lr, subpel_filters-16
1131 ldr r12, [sp, #8] @ h
1132 add r4, lr, r4, lsl #4
1134 vld1.16 {q0}, [r4,:128]
1139 vld1.8 {d2,d3}, [r2], r3
1141 vp8_epel8_h4 d2, d2, d3
1143 vst1.8 {d2}, [lr,:64]!             @ lr = temp buffer write pointer
1147 @ second pass (vertical):
1148 ldr r4, [sp, #168+16+16] @ my
1149 movrel lr, subpel_filters-16
1150 ldr r12, [sp, #168+16+8] @ h
1151 add r4, lr, r4, lsl #4
1153 vld1.16 {q0}, [r4,:128]
1156 vld1.8 {d2-d5}, [lr,:128]!
1157 vld1.8 {d6}, [lr,:64]
1160 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1162 vst1.8 {d2}, [r0,:64], r1
1163 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h6v4: 6-tap horizontal, then 4-tap vertical.
1171 function ff_put_vp8_epel8_h6v4_neon, export=1
1176 @ first pass (horizontal):
1177 ldr r4, [sp, #12] @ mx
1178 movrel lr, subpel_filters-16
1179 ldr r12, [sp, #8] @ h
1180 add r4, lr, r4, lsl #4
1182 vld1.16 {q0}, [r4,:128]
1187 vld1.8 {d2,d3}, [r2], r3
1189 vp8_epel8_h6 d2, d2, d3
1191 vst1.8 {d2}, [lr,:64]!
1195 @ second pass (vertical):
1196 ldr r4, [sp, #168+16+16] @ my
1197 movrel lr, subpel_filters-16
1198 ldr r12, [sp, #168+16+8] @ h
1199 add r4, lr, r4, lsl #4
1201 vld1.16 {q0}, [r4,:128]
1204 vld1.8 {d2-d5}, [lr,:128]!
1205 vld1.8 {d6}, [lr,:64]
1208 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1210 vst1.8 {d2}, [r0,:64], r1
1211 vst1.8 {d3}, [r0,:64], r1
@ ff_put_vp8_epel8_h4v6: 4-tap horizontal, then 6-tap vertical.
1219 function ff_put_vp8_epel8_h4v6_neon, export=1
1220 sub r2, r2, r3, lsl #1             @ src -= 2*src_stride for the v6 pass
1224 @ first pass (horizontal):
1225 ldr r4, [sp, #12] @ mx
1226 movrel lr, subpel_filters-16
1227 ldr r12, [sp, #8] @ h
1228 add r4, lr, r4, lsl #4
1230 vld1.16 {q0}, [r4,:128]
1235 vld1.8 {d2,d3}, [r2], r3
1237 vp8_epel8_h4 d2, d2, d3
1239 vst1.8 {d2}, [lr,:64]!
1243 @ second pass (vertical):
1244 ldr r4, [sp, #168+16+16] @ my
1245 movrel lr, subpel_filters-16
1246 ldr r12, [sp, #168+16+8] @ h
1247 add r4, lr, r4, lsl #4
1249 vld1.16 {q0}, [r4,:128]
1252 vld1.8 {d2-d5}, [lr,:128]!
1253 vld1.8 {d6-d7}, [lr,:128]!
1254 vld1.8 {d30}, [lr,:64]
1257 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
1259 vst1.8 {d2}, [r0,:64], r1
1260 vst1.8 {d3}, [r0,:64], r1
@ 4-wide epel wrappers.  Two 4-pixel rows are packed into the two
@ 32-bit lanes of each d register so the 8-wide kernels can be reused.
@ Same calling convention and elisions as the wider variants.
@
@ ff_put_vp8_epel4_v6: 6-tap vertical, four rows per iteration.
1270 function ff_put_vp8_epel4_v6_neon, export=1
1271 sub r2, r2, r3, lsl #1             @ src -= 2*src_stride
1274 ldr r4, [sp, #16] @ my
1275 movrel lr, subpel_filters-16
1276 ldr r12, [sp, #8] @ h
1277 add r4, lr, r4, lsl #4
1278 vld1.16 {q0}, [r4,:128]
@ lane 0 of each register: rows 0..6
1280 vld1.32 {d2[]}, [r2], r3
1281 vld1.32 {d3[]}, [r2], r3
1282 vld1.32 {d4[]}, [r2], r3
1283 vld1.32 {d5[]}, [r2], r3
1284 vld1.32 {d6[]}, [r2], r3
1285 vld1.32 {d7[]}, [r2], r3
1286 vld1.32 {d28[]}, [r2]
1287 sub r2, r2, r3, lsl #2             @ rewind, then refill lane 1 with rows 2..8
1288 vld1.32 {d2[1]}, [r2], r3
1289 vld1.32 {d3[1]}, [r2], r3
1290 vld1.32 {d4[1]}, [r2], r3
1291 vld1.32 {d5[1]}, [r2], r3
1292 vld1.32 {d6[1]}, [r2], r3
1293 vld1.32 {d7[1]}, [r2], r3
1294 vld1.32 {d28[1]}, [r2]
1295 sub r2, r2, r3, lsl #2
1297 vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
1299 vst1.32 {d2[0]}, [r0,:32], r1
1300 vst1.32 {d3[0]}, [r0,:32], r1
1301 vst1.32 {d2[1]}, [r0,:32], r1
1302 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h6: 6-tap horizontal.
1309 function ff_put_vp8_epel4_h6_neon, export=1
1313 ldr r4, [sp, #12] @ mx
1314 movrel lr, subpel_filters-16
1315 ldr r12, [sp, #8] @ h
1316 add r4, lr, r4, lsl #4
1317 vld1.16 {q0}, [r4,:128]
1319 vld1.8 {q1}, [r2], r3
1320 vp8_epel8_h6 d2, d2, d3
1321 vst1.32 {d2[0]}, [r0,:32], r1      @ only the low 4 pixels are kept
@ ff_put_vp8_epel4_h6v6: 6-tap horizontal to stack temp, then 6-tap
@ vertical (temp buffer allocation elided).
1328 function ff_put_vp8_epel4_h6v6_neon, export=1
1329 sub r2, r2, r3, lsl #1
1333 ldr r4, [sp, #12] @ mx
1334 movrel lr, subpel_filters-16
1335 ldr r12, [sp, #8] @ h
1336 add r4, lr, r4, lsl #4
1338 vld1.16 {q0}, [r4,:128]
1343 vld1.8 {q1}, [r2], r3
1344 vp8_epel8_h6 d2, d2, d3
1345 vst1.32 {d2[0]}, [lr,:32]!         @ lr = temp buffer write pointer
@ second pass: interleave temp rows into both 32-bit lanes
1349 ldr r4, [sp, #52+16+16] @ my
1350 movrel lr, subpel_filters-16
1351 ldr r12, [sp, #52+16+8] @ h
1352 add r4, lr, r4, lsl #4
1354 vld1.16 {q0}, [r4,:128]
1357 vld1.8 {d2-d3}, [lr,:128]!
1358 vld1.8 {d6}, [lr,:64]!
1359 vld1.32 {d28[]}, [lr,:32]
1361 vld1.8 {d4-d5}, [lr]!
1362 vld1.8 {d7}, [lr,:64]!
1363 vld1.32 {d28[1]}, [lr,:32]
1367 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1368 vst1.32 {d2[0]}, [r0,:32], r1
1369 vst1.32 {d3[0]}, [r0,:32], r1
1370 vst1.32 {d2[1]}, [r0,:32], r1
1371 vst1.32 {d3[1]}, [r0,:32], r1
@ More 4-wide epel wrappers (h4v6, h6v4, h4, v4) -- same lane-packing
@ scheme and elisions as the group above.
@
@ ff_put_vp8_epel4_h4v6: 4-tap horizontal, then 6-tap vertical.
1379 function ff_put_vp8_epel4_h4v6_neon, export=1
1380 sub r2, r2, r3, lsl #1             @ src -= 2*src_stride
1384 ldr r4, [sp, #12] @ mx
1385 movrel lr, subpel_filters-16
1386 ldr r12, [sp, #8] @ h
1387 add r4, lr, r4, lsl #4
1389 vld1.16 {q0}, [r4,:128]
1394 vld1.8 {d2}, [r2], r3
1395 vp8_epel8_h4 d2, d2, d2
1396 vst1.32 {d2[0]}, [lr,:32]!         @ lr = temp buffer write pointer
@ second pass (vertical):
1400 ldr r4, [sp, #52+16+16] @ my
1401 movrel lr, subpel_filters-16
1402 ldr r12, [sp, #52+16+8] @ h
1403 add r4, lr, r4, lsl #4
1405 vld1.16 {q0}, [r4,:128]
1408 vld1.8 {d2-d3}, [lr,:128]!
1409 vld1.8 {d6}, [lr,:64]!
1410 vld1.32 {d28[]}, [lr,:32]
1412 vld1.8 {d4-d5}, [lr]!
1413 vld1.8 {d7}, [lr,:64]!
1414 vld1.32 {d28[1]}, [lr,:32]
1418 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1419 vst1.32 {d2[0]}, [r0,:32], r1
1420 vst1.32 {d3[0]}, [r0,:32], r1
1421 vst1.32 {d2[1]}, [r0,:32], r1
1422 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h6v4: 6-tap horizontal, then 4-tap vertical.
1430 function ff_put_vp8_epel4_h6v4_neon, export=1
1435 ldr r4, [sp, #12] @ mx
1436 movrel lr, subpel_filters-16
1437 ldr r12, [sp, #8] @ h
1438 add r4, lr, r4, lsl #4
1440 vld1.16 {q0}, [r4,:128]
1445 vld1.8 {q1}, [r2], r3
1446 vp8_epel8_h6 d2, d2, d3
1447 vst1.32 {d2[0]}, [lr,:32]!
@ second pass (vertical):
1451 ldr r4, [sp, #44+16+16] @ my
1452 movrel lr, subpel_filters-16
1453 ldr r12, [sp, #44+16+8] @ h
1454 add r4, lr, r4, lsl #4
1456 vld1.16 {q0}, [r4,:128]
1459 vld1.8 {d2-d3}, [lr,:128]!
1460 vld1.32 {d6[]}, [lr,:32]
1462 vld1.8 {d4-d5}, [lr]!
1463 vld1.32 {d6[1]}, [lr,:32]
1466 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1467 vst1.32 {d2[0]}, [r0,:32], r1
1468 vst1.32 {d3[0]}, [r0,:32], r1
1469 vst1.32 {d2[1]}, [r0,:32], r1
1470 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h4: 4-tap horizontal.
1478 function ff_put_vp8_epel4_h4_neon, export=1
1482 ldr r4, [sp, #12] @ mx
1483 movrel lr, subpel_filters-16
1484 ldr r12, [sp, #8] @ h
1485 add r4, lr, r4, lsl #4
1486 vld1.16 {q0}, [r4,:128]
1488 vld1.8 {d2}, [r2], r3
1489 vp8_epel8_h4 d2, d2, d2
1490 vst1.32 {d2[0]}, [r0,:32], r1
@ ff_put_vp8_epel4_v4: 4-tap vertical, four rows per iteration.
1497 function ff_put_vp8_epel4_v4_neon, export=1
1501 ldr r4, [sp, #16] @ my
1502 movrel lr, subpel_filters-16
1503 ldr r12, [sp, #8] @ h
1504 add r4, lr, r4, lsl #4
1505 vld1.16 {q0}, [r4,:128]
@ lane 0: rows 0..4, then lane 1: rows 2..6
1507 vld1.32 {d2[]}, [r2], r3
1508 vld1.32 {d3[]}, [r2], r3
1509 vld1.32 {d4[]}, [r2], r3
1510 vld1.32 {d5[]}, [r2], r3
1511 vld1.32 {d6[]}, [r2]
1512 sub r2, r2, r3, lsl #1
1513 vld1.32 {d2[1]}, [r2], r3
1514 vld1.32 {d3[1]}, [r2], r3
1515 vld1.32 {d4[1]}, [r2], r3
1516 vld1.32 {d5[1]}, [r2], r3
1517 vld1.32 {d6[1]}, [r2]
1518 sub r2, r2, r3, lsl #1
1520 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1522 vst1.32 {d2[0]}, [r0,:32], r1
1523 vst1.32 {d3[0]}, [r0,:32], r1
1524 vst1.32 {d2[1]}, [r0,:32], r1
1525 vst1.32 {d3[1]}, [r0,:32], r1
@ ff_put_vp8_epel4_h4v4: 4-tap horizontal pass into a stack temp
@ buffer, then 4-tap vertical pass.  Same convention and elisions as
@ the other 4-wide h+v wrappers above.
1532 function ff_put_vp8_epel4_h4v4_neon, export=1
1537 ldr r4, [sp, #12] @ mx
1538 movrel lr, subpel_filters-16
1539 ldr r12, [sp, #8] @ h
1540 add r4, lr, r4, lsl #4
1542 vld1.16 {q0}, [r4,:128]
1547 vld1.8 {d2}, [r2], r3
1548 vp8_epel8_h4 d2, d2, d3
1549 vst1.32 {d2[0]}, [lr,:32]!         @ lr = temp buffer write pointer
@ second pass (vertical):
1553 ldr r4, [sp, #44+16+16] @ my
1554 movrel lr, subpel_filters-16
1555 ldr r12, [sp, #44+16+8] @ h
1556 add r4, lr, r4, lsl #4
1558 vld1.16 {q0}, [r4,:128]
1561 vld1.8 {d2-d3}, [lr,:128]!
1562 vld1.32 {d6[]}, [lr,:32]
1564 vld1.8 {d4-d5}, [lr]!
1565 vld1.32 {d6[1]}, [lr,:32]
1568 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1569 vst1.32 {d2[0]}, [r0,:32], r1
1570 vst1.32 {d3[0]}, [r0,:32], r1
1571 vst1.32 {d2[1]}, [r0,:32], r1
1572 vst1.32 {d3[1]}, [r0,:32], r1
1580 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1581 @ arithmetic can be used to apply filters
@ 6-tap subpel filter coefficients, one row per 1/8-pel position 1..7
@ (position 0 is full-pel; the callers use "subpel_filters-16" plus
@ mx<<4, so mx=1 maps to the first row).  Each row is zero-padded to
@ 16 bytes so one row loads as a single vld1.16 {q0}.
1582 const subpel_filters, align=4
1583 .short 0, 6, 123, 12, 1, 0, 0, 0
1584 .short 2, 11, 108, 36, 8, 1, 0, 0
1585 .short 0, 9, 93, 50, 6, 0, 0, 0
1586 .short 3, 16, 77, 77, 16, 3, 0, 0
1587 .short 0, 6, 50, 93, 9, 0, 0, 0
1588 .short 1, 8, 36, 108, 11, 2, 0, 0
1589 .short 0, 1, 12, 123, 6, 0, 0, 0
@ Bilinear MC, 16 pixels wide.  r0 = dst, r1 = stride, r2 = src;
@ mx/my on the stack.  d0/d1 (and d2/d3 in the hv variant) presumably
@ hold the bilinear weights mx and 8-mx / my and 8-my -- the vdup
@ setup lines are elided in this extract; TODO confirm.  Each output
@ pixel is (a*(8-f) + b*f + 4) >> 3, matching the vrshrn #3 narrowing.
@ NOTE(review): loop labels, counter updates, some vmull/vmlal lines
@ and endfunc are elided throughout (embedded line numbers jump).
@
@ ff_put_vp8_bilin16_h: horizontal-only blend of each pixel with its
@ right neighbour (via the vext #1 shift).
1594 function ff_put_vp8_bilin16_h_neon, export=1
1595 ldr r3, [sp, #4] @ mx
1602 vld1.8 {d2-d4}, [r2], r1           @ 17+ source bytes for row 0
1603 vext.8 q2, q1, q2, #1              @ the "right neighbour" row
1606 vld1.8 {d18-d20},[r2], r1          @ row 1
1609 vext.8 q10, q9, q10, #1
1610 vmull.u8 q11, d18, d1
1611 vmlal.u8 q11, d20, d0
1612 vmull.u8 q12, d19, d1
1613 vmlal.u8 q12, d21, d0
1614 vrshrn.u16 d4, q8, #3              @ (sum + 4) >> 3
1615 vrshrn.u16 d5, q3, #3
1616 vrshrn.u16 d6, q11, #3
1617 vrshrn.u16 d7, q12, #3
1618 vst1.8 {q2}, [r0,:128], r1
1619 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_bilin16_v: vertical-only blend of each row with the next.
1625 function ff_put_vp8_bilin16_v_neon, export=1
1626 ldr r3, [sp, #8] @ my
1631 vld1.8 {q1}, [r2], r1
1634 vld1.8 {q2}, [r2], r1
1639 vld1.8 {q1}, [r2], r1              @ row reused as "previous" next iteration
1642 vmull.u8 q10, d5, d1
1643 vmlal.u8 q10, d3, d0
1644 vrshrn.u16 d4, q3, #3
1645 vrshrn.u16 d5, q8, #3
1646 vrshrn.u16 d6, q9, #3
1647 vrshrn.u16 d7, q10, #3
1648 vst1.8 {q2}, [r0,:128], r1
1649 vst1.8 {q3}, [r0,:128], r1
@ ff_put_vp8_bilin16_hv: horizontal blend per row, then vertical blend
@ of consecutive horizontally-filtered rows.
1655 function ff_put_vp8_bilin16_hv_neon, export=1
1656 ldr r3, [sp, #4] @ mx
1660 ldr r3, [sp, #8] @ my
1666 vld1.8 {d4-d6}, [r2], r1
1667 vext.8 q3, q2, q3, #1
1672 vrshrn.u16 d4, q8, #3              @ q2 = h-filtered previous row
1673 vrshrn.u16 d5, q9, #3
1676 vld1.8 {d18-d20},[r2], r1
1677 vext.8 q10, q9, q10, #1
1678 vmull.u8 q11, d18, d1
1679 vmlal.u8 q11, d20, d0
1680 vld1.8 {d26-d28},[r2], r1
1681 vmull.u8 q12, d19, d1
1682 vmlal.u8 q12, d21, d0
1683 vext.8 q14, q13, q14, #1
1684 vmull.u8 q8, d26, d1
1685 vmlal.u8 q8, d28, d0
1686 vmull.u8 q9, d27, d1
1687 vmlal.u8 q9, d29, d0
1688 vrshrn.u16 d6, q11, #3             @ q3 = h-filtered row n
1689 vrshrn.u16 d7, q12, #3
1690 vmull.u8 q12, d4, d3               @ vertical blend: prev row * (8-my)
1691 vmlal.u8 q12, d6, d2
1692 vmull.u8 q15, d5, d3
1693 vmlal.u8 q15, d7, d2
1694 vrshrn.u16 d4, q8, #3              @ q2 = h-filtered row n+1
1695 vrshrn.u16 d5, q9, #3
1696 vmull.u8 q10, d6, d3
1697 vmlal.u8 q10, d4, d2
1698 vmull.u8 q11, d7, d3
1699 vmlal.u8 q11, d5, d2
1700 vrshrn.u16 d24, q12, #3
1701 vrshrn.u16 d25, q15, #3
1702 vst1.8 {q12}, [r0,:128], r1
1703 vrshrn.u16 d20, q10, #3
1704 vrshrn.u16 d21, q11, #3
1705 vst1.8 {q10}, [r0,:128], r1
@ Bilinear MC, 8 pixels wide.  Same convention and elisions as the
@ 16-wide group: weights presumably in d0/d1 (and d2/d3 for hv), the
@ vdup setup, some vmull/vmlal lines, loop control and endfunc are
@ elided in this extract.
@
@ ff_put_vp8_bilin8_h: horizontal blend, two rows per iteration.
1711 function ff_put_vp8_bilin8_h_neon, export=1
1712 ldr r3, [sp, #4] @ mx
1719 vld1.8 {q1}, [r2], r1
1720 vext.8 d3, d2, d3, #1              @ right-neighbour pixels
1723 vld1.8 {q3}, [r2], r1
1724 vext.8 d7, d6, d7, #1
1727 vrshrn.u16 d4, q2, #3
1728 vrshrn.u16 d16, q8, #3
1729 vst1.8 {d4}, [r0,:64], r1
1730 vst1.8 {d16}, [r0,:64], r1
@ ff_put_vp8_bilin8_v: vertical blend, two rows per iteration.
1736 function ff_put_vp8_bilin8_v_neon, export=1
1737 ldr r3, [sp, #8] @ my
1742 vld1.8 {d2}, [r2], r1
1745 vld1.8 {d3}, [r2], r1
1748 vld1.8 {d2}, [r2], r1              @ becomes "previous" next iteration
1751 vrshrn.u16 d4, q2, #3
1752 vrshrn.u16 d6, q3, #3
1753 vst1.8 {d4}, [r0,:64], r1
1754 vst1.8 {d6}, [r0,:64], r1
@ ff_put_vp8_bilin8_hv: horizontal then vertical blend.
1760 function ff_put_vp8_bilin8_hv_neon, export=1
1761 ldr r3, [sp, #4] @ mx
1765 ldr r3, [sp, #8] @ my
1771 vld1.8 {q2}, [r2], r1
1772 vext.8 d5, d4, d5, #1
1775 vrshrn.u16 d22, q9, #3             @ d22 = h-filtered previous row
1778 vld1.8 {q3}, [r2], r1
1779 vext.8 d7, d6, d7, #1
1782 vld1.8 {q2}, [r2], r1
1783 vext.8 d5, d4, d5, #1
1786 vrshrn.u16 d16, q8, #3             @ d16 = h-filtered row n
1787 vmull.u8 q10, d22, d3              @ vertical blend prev/cur
1788 vmlal.u8 q10, d16, d2
1789 vrshrn.u16 d22, q9, #3             @ d22 = h-filtered row n+1
1790 vmull.u8 q12, d16, d3
1791 vmlal.u8 q12, d22, d2
1792 vrshrn.u16 d20, q10, #3
1793 vst1.8 {d20}, [r0,:64], r1
1794 vrshrn.u16 d23, q12, #3
1795 vst1.8 {d23}, [r0,:64], r1
@ Bilinear MC, 4 pixels wide.  Two 4-pixel rows are processed per
@ iteration, packed into the two 32-bit lanes of a d register.  Same
@ elisions as the wider bilin groups (weight setup, some vmull/vmlal
@ lines, loop control, endfunc).
@
@ ff_put_vp8_bilin4_h: horizontal blend.
1801 function ff_put_vp8_bilin4_h_neon, export=1
1802 ldr r3, [sp, #4] @ mx
1809 vld1.8 {d2}, [r2], r1
1810 vext.8 d3, d2, d3, #1              @ right-neighbour pixels
1811 vld1.8 {d6}, [r2], r1
1812 vext.8 d7, d6, d7, #1
1816 vrshrn.u16 d4, q2, #3
1817 vst1.32 {d4[0]}, [r0,:32], r1
1818 vst1.32 {d4[1]}, [r0,:32], r1
@ ff_put_vp8_bilin4_v: vertical blend, rows interleaved across lanes.
1824 function ff_put_vp8_bilin4_v_neon, export=1
1825 ldr r3, [sp, #8] @ my
1830 vld1.32 {d2[]}, [r2], r1
1832 vld1.32 {d3[]}, [r2]
1833 vld1.32 {d2[1]}, [r2], r1
1834 vld1.32 {d3[1]}, [r2], r1
1838 vrshrn.u16 d4, q2, #3
1839 vst1.32 {d4[0]}, [r0,:32], r1
1840 vst1.32 {d4[1]}, [r0,:32], r1
@ ff_put_vp8_bilin4_hv: horizontal then vertical blend.
1847 function ff_put_vp8_bilin4_hv_neon, export=1
1848 ldr r3, [sp, #4] @ mx
1852 ldr r3, [sp, #8] @ my
1858 vld1.8 {d4}, [r2], r1
1859 vext.8 d5, d4, d4, #1
1862 vrshrn.u16 d22, q9, #3             @ d22 = h-filtered previous row
1865 vld1.8 {d6}, [r2], r1
1866 vext.8 d7, d6, d6, #1
1867 vld1.8 {d4}, [r2], r1
1868 vext.8 d5, d4, d4, #1
1872 vrshrn.u16 d16, q8, #3             @ h-filtered current rows
1873 vmull.u8 q10, d16, d2              @ vertical blend
1875 vmlal.u8 q10, d22, d3
1877 vrshrn.u16 d20, q10, #3
1878 vst1.32 {d20[0]}, [r0,:32], r1
1879 vst1.32 {d20[1]}, [r0,:32], r1