/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

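@ ff_vp8_luma_dc_wht transforms the 4x4 block of luma DC coefficients and
@ scatters the results to the DC position of each of the 16 luma blocks,
@ which sit 16 int16_t (32 bytes) apart -- hence the #32 store stride.
@ A scalar sketch of the same transform, following the C reference
@ vp8_luma_dc_wht_c() in libavcodec/vp8dsp.c:
@
@     for (i = 0; i < 4; i++) {                 // columns
@         t0 = dc[0*4+i] + dc[3*4+i];
@         t1 = dc[1*4+i] + dc[2*4+i];
@         t2 = dc[1*4+i] - dc[2*4+i];
@         t3 = dc[0*4+i] - dc[3*4+i];
@         dc[0*4+i] = t0 + t1;
@         dc[1*4+i] = t3 + t2;
@         dc[2*4+i] = t0 - t1;
@         dc[3*4+i] = t3 - t2;
@     }
@     for (i = 0; i < 4; i++) {                 // rows
@         t0 = dc[i*4+0] + dc[i*4+3] + 3;       // +3 is the rounding bias;
@         t1 = dc[i*4+1] + dc[i*4+2];           // the NEON version adds it
@         t2 = dc[i*4+1] - dc[i*4+2];           // once to d0 after the
@         t3 = dc[i*4+0] - dc[i*4+3] + 3;       // transpose
@         block[i][0][0] = (t0 + t1) >> 3;
@         block[i][1][0] = (t3 + t2) >> 3;
@         block[i][2][0] = (t0 - t1) >> 3;
@         block[i][3][0] = (t3 - t2) >> 3;
@     }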
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc

function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]
        mov             r3,  #0
        add             r2,  r2,  #3
        strh            r3,  [r1]
        asr             r2,  r2,  #3
    .rept 16
        strh            r2,  [r0], #32
    .endr
        bx              lr
endfunc

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3
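        @ As 16-bit scalars, d4[0] = 20091 and d4[1] = 35468/2; these form
        @ VP8's IDCT rotation: 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 (the
        @ input is added back in after the multiply below) and
        @ 35468/65536 ~= sqrt(2)*sin(pi/8).  vqdmulh.s16 computes
        @ (a*b*2) >> 16, hence the pre-halved constant.  In C terms
        @ (cf. libavcodec/vp8dsp.c):
        @     #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
        @     #define MUL_35468(a)  (((a) * 35468) >> 16)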

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
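
@ Scalar equivalent of the DC-only case above (cf. vp8_idct_dc_add_c()
@ in libavcodec/vp8dsp.c); vrshr.s16 #3 supplies the +4 rounding, and
@ the 4x4 block is handled as two d registers holding two rows each:
@
@     int dc = (block[0] + 4) >> 3;
@     block[0] = 0;
@     for (y = 0; y < 4; y++, dst += stride)
@         for (x = 0; x < 4; x++)
@             dst[x] = av_clip_uint8(dst[x] + dc);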

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
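
@ Scalar sketch of the filtering above once the masks are known (clamp()
@ saturates to int8_t, s2u(x) = x ^ 0x80 converts back to unsigned; cf.
@ filter_common()/filter_mbedge() in libavcodec/vp8dsp.c):
@
@     w  = clamp(3*(QS0 - PS0) + clamp(PS1 - QS1)) & normal_limit;
@     // 4-tap adjustment of P0/Q0 (filter_common); on macroblock
@     // edges it runs on w & hev:
@     c1 = clamp(w + 4) >> 3;   Q0 = s2u(QS0 - c1);
@     c2 = clamp(w + 3) >> 3;   P0 = s2u(PS0 + c2);
@     // inner edges also adjust P1/Q1 where hev is false:
@     c3 = ((c1 & ~hev) + 1) >> 1;
@     Q1 = s2u(QS1 - c3);  P1 = s2u(PS1 + c3);
@     // macroblock edges instead run the wide filter on w & ~hev:
@     a = clamp((27*w + 63) >> 7);  Q0 = s2u(QS0 - a);  P0 = s2u(PS0 + a);
@     a = clamp((18*w + 63) >> 7);  Q1 = s2u(QS1 - a);  P1 = s2u(PS1 + a);
@     a = clamp(( 9*w + 63) >> 7);  Q2 = s2u(QS2 - a);  P2 = s2u(PS2 + a);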

@ Transpose the 16x8 byte matrix held in q0-q7 (sixteen 8-pixel rows in
@ d0-d15) by swapping progressively smaller sub-blocks: 32-bit, then
@ 16-bit, then 8-bit.  Applying the macro a second time restores the
@ original layout, so the same macro is used to transpose back before
@ storing in the horizontal loop filters.
.macro transpose8x16matrix
        vtrn.32         q0,   q4
        vtrn.32         q1,   q5
        vtrn.32         q2,   q6
        vtrn.32         q3,   q7

        vtrn.16         q0,   q2
        vtrn.16         q1,   q3
        vtrn.16         q4,   q6
        vtrn.16         q5,   q7

        vtrn.8          q0,   q1
        vtrn.8          q2,   q3
        vtrn.8          q4,   q5
        vtrn.8          q6,   q7
.endm

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8 rows of src
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8 rows of src
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ back up 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ back up u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ back up v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,       [r2], r3
        ldr             r5,       [r2], r3
        ldr             r6,       [r2], r3
        ldr             lr,       [r2], r3
        str             r4,       [r0], r1
        str             r5,       [r0], r1
        str             r6,       [r0], r1
        str             lr,       [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

/* 4/6-tap 8th-pel MC */
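
/* The mx/my fractional position (1..7) indexes subpel_filters, defined
 * later in this file with eight int16 entries (16 bytes) per row; that
 * is what "add r4, lr, r4, lsl #4" plus the -16 bias in
 * "movrel lr, subpel_filters-16" compute below.  With F[] the selected
 * row, the 6-tap horizontal filter produces, per output pixel,
 *
 *     dst[x] = clip_uint8((  F[0]*src[x-2] - F[1]*src[x-1] + F[2]*src[x]
 *                          + F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3]
 *                          + 64) >> 7)
 *
 * (cf. FILTER_6TAP() in libavcodec/vp8dsp.c; the coefficients are stored
 * as magnitudes, the fixed signs come from the vmla/vmls pairing, and
 * vqrshrun.s16 #7 supplies the +64 rounding and the clip).  The 4-tap
 * variants drop the F[0]/F[5] taps.  The two-pass h?v? functions first
 * filter h + 5 rows (h + 3 when the second pass is 4-tap) horizontally
 * into a 16-byte-aligned stack buffer, e.g. 16*(16+5) = 336 bytes for
 * epel16, then filter that buffer vertically.
 */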

.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}
1508
1509         ldr             r4,  [sp, #12]          @ mx
1510         movrel          lr,  subpel_filters-16
1511         ldr             r12, [sp, #8]           @ h
1512         add             r4,  lr,  r4, lsl #4
1513         vld1.16         {q0},     [r4,:128]
1514 1:
1515         vld1.8          {d2},     [r2], r3
1516         vp8_epel8_h4    d2,  d2,  d2
1517         vst1.32         {d2[0]},  [r0,:32], r1
1518         subs            r12, r12, #1
1519         bne             1b
1520
1521         pop             {r4,pc}
1522 endfunc
1523
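     @ The 4-wide vertical filters pack two rows into each d register
     @ (row n in lane 0, row n+2 in lane 1) so that a single pass of the
     @ vp8_epel8_v4_y2 macro yields four output rows.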
1524 function ff_put_vp8_epel4_v4_neon, export=1
1525         sub             r2,  r2,  r3
1526         push            {r4,lr}
1527
1528         ldr             r4,  [sp, #16]          @ my
1529         movrel          lr,  subpel_filters-16
1530         ldr             r12, [sp, #8]           @ h
1531         add             r4,  lr,  r4, lsl #4
1532         vld1.16         {q0},     [r4,:128]
1533 1:
1534         vld1.32         {d2[]},   [r2], r3
1535         vld1.32         {d3[]},   [r2], r3
1536         vld1.32         {d4[]},   [r2], r3
1537         vld1.32         {d5[]},   [r2], r3
1538         vld1.32         {d6[]},   [r2]
1539         sub             r2,  r2,  r3,  lsl #1
1540         vld1.32         {d2[1]},  [r2], r3
1541         vld1.32         {d3[1]},  [r2], r3
1542         vld1.32         {d4[1]},  [r2], r3
1543         vld1.32         {d5[1]},  [r2], r3
1544         vld1.32         {d6[1]},  [r2]
1545         sub             r2,  r2,  r3,  lsl #1
1546
1547         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1548
1549         vst1.32         {d2[0]},  [r0,:32], r1
1550         vst1.32         {d3[0]},  [r0,:32], r1
1551         vst1.32         {d2[1]},  [r0,:32], r1
1552         vst1.32         {d3[1]},  [r0,:32], r1
1553         subs            r12, r12, #4
1554         bne             1b
1555
1556         pop             {r4,pc}
1557 endfunc
1558
1559 function ff_put_vp8_epel4_h4v4_neon, export=1
1560         sub             r2,  r2,  r3
1561         sub             r2,  r2,  #1
1562         push            {r4,lr}
1563
1564         ldr             r4,  [sp, #12]          @ mx
1565         movrel          lr,  subpel_filters-16
1566         ldr             r12, [sp, #8]           @ h
1567         add             r4,  lr,  r4, lsl #4
1568         sub             sp,  sp,  #44+16
1569         vld1.16         {q0},     [r4,:128]
1570         add             lr,  sp,  #15
1571         add             r12, r12, #3
1572         bic             lr,  lr,  #15
1573 1:
1574         vld1.8          {d2},     [r2], r3
1575         vp8_epel8_h4    d2,  d2,  d3
1576         vst1.32         {d2[0]},  [lr,:32]!
1577         subs            r12, r12, #1
1578         bne             1b
1579
1580         ldr             r4,  [sp, #44+16+16]    @ my
1581         movrel          lr,  subpel_filters-16
1582         ldr             r12, [sp, #44+16+8]     @ h
1583         add             r4,  lr,  r4, lsl #4
1584         add             lr,  sp,  #15
1585         vld1.16         {q0},     [r4,:128]
1586         bic             lr,  lr,  #15
1587 2:
1588         vld1.8          {d2-d3},  [lr,:128]!
1589         vld1.32         {d6[]},   [lr,:32]
1590         sub             lr,  lr,  #8
1591         vld1.8          {d4-d5},  [lr]!
1592         vld1.32         {d6[1]},  [lr,:32]
1593         sub             lr,  lr,  #8
1594         vtrn.32         q1,  q2
1595         vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
1596         vst1.32         {d2[0]},  [r0,:32], r1
1597         vst1.32         {d3[0]},  [r0,:32], r1
1598         vst1.32         {d2[1]},  [r0,:32], r1
1599         vst1.32         {d3[1]},  [r0,:32], r1
1600         subs            r12, r12, #4
1601         bne             2b
1602
1603         add             sp,  sp,  #44+16
1604         pop             {r4,pc}
1605 endfunc
1606
1607 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so 16-bit
1608 @ arithmetic can be used to apply the filters
1609 const   subpel_filters, align=4
1610         .short     0,   6, 123,  12,   1,   0,   0,   0
1611         .short     2,  11, 108,  36,   8,   1,   0,   0
1612         .short     0,   9,  93,  50,   6,   0,   0,   0
1613         .short     3,  16,  77,  77,  16,   3,   0,   0
1614         .short     0,   6,  50,  93,   9,   0,   0,   0
1615         .short     1,   8,  36, 108,  11,   2,   0,   0
1616         .short     0,   1,  12, 123,   6,   0,   0,   0
1617 endconst
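     @ Worked example of the bound above: the signed taps of each row sum to
     @ 128 (e.g. -6 + 123 + 12 - 1 = 128), so an all-255 input stays within
     @ 255 * 128 = 32640 = 0x7f80 before the final (sum + 64) >> 7 rounding
     @ shift.  The table stores tap magnitudes only (the filter macros
     @ subtract the negative taps), and mx/my run from 1 to 7, which is why
     @ the loads above address it as subpel_filters-16 plus mx (or my) << 4.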
1618
1619 /* Bilinear MC */
1620
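     /* Each bilinear stage below computes, per pixel,
      *     dst = (a * (8 - f) + b * f + 4) >> 3
      * with f = mx (b is the pixel to the right) or f = my (b is the pixel
      * below); vrshrn.u16 #3 supplies the +4 rounding.  A rough scalar
      * sketch of the horizontal case (illustrative only, not part of the
      * build; cf. the generic C in libavcodec/vp8dsp.c):
      *
      *     for (int y = 0; y < h; y++, dst += stride, src += stride)
      *         for (int x = 0; x < width; x++)
      *             dst[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3;
      */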
1621 function ff_put_vp8_bilin16_h_neon, export=1
1622         ldr             r3,  [sp, #4]           @ mx
1623         rsb             r12, r3,  #8
1624         vdup.8          d0,  r3
1625         vdup.8          d1,  r12
1626         ldr             r12, [sp]               @ h
1627 1:
1628         subs            r12, r12, #2
1629         vld1.8          {d2-d4},  [r2], r1
1630         vext.8          q2,  q1,  q2,  #1
1631         vmull.u8        q8,  d2,  d1
1632         vmlal.u8        q8,  d4,  d0
1633         vld1.8          {d18-d20},[r2], r1
1634         vmull.u8        q3,  d3,  d1
1635         vmlal.u8        q3,  d5,  d0
1636         vext.8          q10, q9,  q10, #1
1637         vmull.u8        q11, d18, d1
1638         vmlal.u8        q11, d20, d0
1639         vmull.u8        q12, d19, d1
1640         vmlal.u8        q12, d21, d0
1641         vrshrn.u16      d4,  q8,  #3
1642         vrshrn.u16      d5,  q3,  #3
1643         vrshrn.u16      d6,  q11, #3
1644         vrshrn.u16      d7,  q12, #3
1645         vst1.8          {q2},     [r0,:128], r1
1646         vst1.8          {q3},     [r0,:128], r1
1647         bgt             1b
1648
1649         bx              lr
1650 endfunc
1651
1652 function ff_put_vp8_bilin16_v_neon, export=1
1653         ldr             r3,  [sp, #8]           @ my
1654         rsb             r12, r3,  #8
1655         vdup.8          d0,  r3
1656         vdup.8          d1,  r12
1657         ldr             r12, [sp]               @ h
1658         vld1.8          {q1},     [r2], r1
1659 1:
1660         subs            r12, r12, #2
1661         vld1.8          {q2},     [r2], r1
1662         vmull.u8        q3,  d2,  d1
1663         vmlal.u8        q3,  d4,  d0
1664         vmull.u8        q8,  d3,  d1
1665         vmlal.u8        q8,  d5,  d0
1666         vld1.8          {q1},     [r2], r1
1667         vmull.u8        q9,  d4,  d1
1668         vmlal.u8        q9,  d2,  d0
1669         vmull.u8        q10, d5,  d1
1670         vmlal.u8        q10, d3,  d0
1671         vrshrn.u16      d4,  q3,  #3
1672         vrshrn.u16      d5,  q8,  #3
1673         vrshrn.u16      d6,  q9,  #3
1674         vrshrn.u16      d7,  q10, #3
1675         vst1.8          {q2},     [r0,:128], r1
1676         vst1.8          {q3},     [r0,:128], r1
1677         bgt             1b
1678
1679         bx              lr
1680 endfunc
1681
1682 function ff_put_vp8_bilin16_hv_neon, export=1
1683         ldr             r3,  [sp, #4]           @ mx
1684         rsb             r12, r3,  #8
1685         vdup.8          d0,  r3
1686         vdup.8          d1,  r12
1687         ldr             r3,  [sp, #8]           @ my
1688         rsb             r12, r3,  #8
1689         vdup.8          d2,  r3
1690         vdup.8          d3,  r12
1691         ldr             r12, [sp]               @ h
1692
1693         vld1.8          {d4-d6},  [r2], r1
1694         vext.8          q3,  q2,  q3,  #1
1695         vmull.u8        q8,  d4,  d1
1696         vmlal.u8        q8,  d6,  d0
1697         vmull.u8        q9,  d5,  d1
1698         vmlal.u8        q9,  d7,  d0
1699         vrshrn.u16      d4,  q8,  #3
1700         vrshrn.u16      d5,  q9,  #3
1701 1:
1702         subs            r12, r12, #2
1703         vld1.8          {d18-d20},[r2], r1
1704         vext.8          q10, q9,  q10, #1
1705         vmull.u8        q11, d18, d1
1706         vmlal.u8        q11, d20, d0
1707         vld1.8          {d26-d28},[r2], r1
1708         vmull.u8        q12, d19, d1
1709         vmlal.u8        q12, d21, d0
1710         vext.8          q14, q13, q14, #1
1711         vmull.u8        q8,  d26, d1
1712         vmlal.u8        q8,  d28, d0
1713         vmull.u8        q9,  d27, d1
1714         vmlal.u8        q9,  d29, d0
1715         vrshrn.u16      d6,  q11, #3
1716         vrshrn.u16      d7,  q12, #3
1717         vmull.u8        q12, d4,  d3
1718         vmlal.u8        q12, d6,  d2
1719         vmull.u8        q15, d5,  d3
1720         vmlal.u8        q15, d7,  d2
1721         vrshrn.u16      d4,  q8,  #3
1722         vrshrn.u16      d5,  q9,  #3
1723         vmull.u8        q10, d6,  d3
1724         vmlal.u8        q10, d4,  d2
1725         vmull.u8        q11, d7,  d3
1726         vmlal.u8        q11, d5,  d2
1727         vrshrn.u16      d24, q12, #3
1728         vrshrn.u16      d25, q15, #3
1729         vst1.8          {q12},    [r0,:128], r1
1730         vrshrn.u16      d20, q10, #3
1731         vrshrn.u16      d21, q11, #3
1732         vst1.8          {q10},    [r0,:128], r1
1733         bgt             1b
1734
1735         bx              lr
1736 endfunc
1737
1738 function ff_put_vp8_bilin8_h_neon, export=1
1739         ldr             r3,  [sp, #4]           @ mx
1740         rsb             r12, r3,  #8
1741         vdup.8          d0,  r3
1742         vdup.8          d1,  r12
1743         ldr             r12, [sp]               @ h
1744 1:
1745         subs            r12, r12, #2
1746         vld1.8          {q1},     [r2], r1
1747         vext.8          d3,  d2,  d3,  #1
1748         vmull.u8        q2,  d2,  d1
1749         vmlal.u8        q2,  d3,  d0
1750         vld1.8          {q3},     [r2], r1
1751         vext.8          d7,  d6,  d7,  #1
1752         vmull.u8        q8,  d6,  d1
1753         vmlal.u8        q8,  d7,  d0
1754         vrshrn.u16      d4,  q2,  #3
1755         vrshrn.u16      d16, q8,  #3
1756         vst1.8          {d4},     [r0,:64], r1
1757         vst1.8          {d16},    [r0,:64], r1
1758         bgt             1b
1759
1760         bx              lr
1761 endfunc
1762
1763 function ff_put_vp8_bilin8_v_neon, export=1
1764         ldr             r3,  [sp, #8]           @ my
1765         rsb             r12, r3,  #8
1766         vdup.8          d0,  r3
1767         vdup.8          d1,  r12
1768         ldr             r12, [sp]               @ h
1769         vld1.8          {d2},     [r2], r1
1770 1:
1771         subs            r12, r12, #2
1772         vld1.8          {d3},     [r2], r1
1773         vmull.u8        q2,  d2,  d1
1774         vmlal.u8        q2,  d3,  d0
1775         vld1.8          {d2},     [r2], r1
1776         vmull.u8        q3,  d3,  d1
1777         vmlal.u8        q3,  d2,  d0
1778         vrshrn.u16      d4,  q2,  #3
1779         vrshrn.u16      d6,  q3,  #3
1780         vst1.8          {d4},     [r0,:64], r1
1781         vst1.8          {d6},     [r0,:64], r1
1782         bgt             1b
1783
1784         bx              lr
1785 endfunc
1786
1787 function ff_put_vp8_bilin8_hv_neon, export=1
1788         ldr             r3,  [sp, #4]           @ mx
1789         rsb             r12, r3,  #8
1790         vdup.8          d0,  r3
1791         vdup.8          d1,  r12
1792         ldr             r3,  [sp, #8]           @ my
1793         rsb             r12, r3,  #8
1794         vdup.8          d2,  r3
1795         vdup.8          d3,  r12
1796         ldr             r12, [sp]               @ h
1797
1798         vld1.8          {q2},     [r2], r1
1799         vext.8          d5,  d4,  d5,  #1
1800         vmull.u8        q9,  d4,  d1
1801         vmlal.u8        q9,  d5,  d0
1802         vrshrn.u16      d22, q9,  #3
1803 1:
1804         subs            r12, r12, #2
1805         vld1.8          {q3},     [r2], r1
1806         vext.8          d7,  d6,  d7,  #1
1807         vmull.u8        q8,  d6,  d1
1808         vmlal.u8        q8,  d7,  d0
1809         vld1.8          {q2},     [r2], r1
1810         vext.8          d5,  d4,  d5,  #1
1811         vmull.u8        q9,  d4,  d1
1812         vmlal.u8        q9,  d5,  d0
1813         vrshrn.u16      d16, q8,  #3
1814         vmull.u8        q10, d22, d3
1815         vmlal.u8        q10, d16, d2
1816         vrshrn.u16      d22, q9,  #3
1817         vmull.u8        q12, d16, d3
1818         vmlal.u8        q12, d22, d2
1819         vrshrn.u16      d20, q10, #3
1820         vst1.8          {d20},    [r0,:64], r1
1821         vrshrn.u16      d23, q12, #3
1822         vst1.8          {d23},    [r0,:64], r1
1823         bgt             1b
1824
1825         bx              lr
1826 endfunc
1827
1828 function ff_put_vp8_bilin4_h_neon, export=1
1829         ldr             r3,  [sp, #4]           @ mx
1830         rsb             r12, r3,  #8
1831         vdup.8          d0,  r3
1832         vdup.8          d1,  r12
1833         ldr             r12, [sp]               @ h
1834 1:
1835         subs            r12, r12, #2
1836         vld1.8          {d2},     [r2], r1
1837         vext.8          d3,  d2,  d3,  #1
1838         vld1.8          {d6},     [r2], r1
1839         vext.8          d7,  d6,  d7,  #1
1840         vtrn.32         q1,  q3
1841         vmull.u8        q2,  d2,  d1
1842         vmlal.u8        q2,  d3,  d0
1843         vrshrn.u16      d4,  q2,  #3
1844         vst1.32         {d4[0]},  [r0,:32], r1
1845         vst1.32         {d4[1]},  [r0,:32], r1
1846         bgt             1b
1847
1848         bx              lr
1849 endfunc
1850
1851 function ff_put_vp8_bilin4_v_neon, export=1
1852         ldr             r3,  [sp, #8]           @ my
1853         rsb             r12, r3,  #8
1854         vdup.8          d0,  r3
1855         vdup.8          d1,  r12
1856         ldr             r12, [sp]               @ h
1857         vld1.32         {d2[]},   [r2], r1
1858 1:
1859         vld1.32         {d3[]},   [r2]
1860         vld1.32         {d2[1]},  [r2], r1
1861         vld1.32         {d3[1]},  [r2], r1
1862         vmull.u8        q2,  d2,  d1
1863         vmlal.u8        q2,  d3,  d0
1864         vtrn.32         d3,  d2
1865         vrshrn.u16      d4,  q2,  #3
1866         vst1.32         {d4[0]},  [r0,:32], r1
1867         vst1.32         {d4[1]},  [r0,:32], r1
1868         subs            r12, r12, #2
1869         bgt             1b
1870
1871         bx              lr
1872 endfunc
1873
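     @ As in the 8-wide case, d22 carries the previous horizontally filtered
     @ row between iterations; here the vtrn.32/vrev64.32 pair also rotates
     @ the newest filtered row back into lane 0 of d22 for the next pass.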
1874 function ff_put_vp8_bilin4_hv_neon, export=1
1875         ldr             r3,  [sp, #4]           @ mx
1876         rsb             r12, r3,  #8
1877         vdup.8          d0,  r3
1878         vdup.8          d1,  r12
1879         ldr             r3,  [sp, #8]           @ my
1880         rsb             r12, r3,  #8
1881         vdup.8          d2,  r3
1882         vdup.8          d3,  r12
1883         ldr             r12, [sp]               @ h
1884
1885         vld1.8          {d4},     [r2], r1
1886         vext.8          d5,  d4,  d4,  #1
1887         vmull.u8        q9,  d4,  d1
1888         vmlal.u8        q9,  d5,  d0
1889         vrshrn.u16      d22, q9,  #3
1890 1:
1891         subs            r12, r12, #2
1892         vld1.8          {d6},     [r2], r1
1893         vext.8          d7,  d6,  d6,  #1
1894         vld1.8          {d4},     [r2], r1
1895         vext.8          d5,  d4,  d4,  #1
1896         vtrn.32         q3,  q2
1897         vmull.u8        q8,  d6,  d1
1898         vmlal.u8        q8,  d7,  d0
1899         vrshrn.u16      d16, q8,  #3
1900         vmull.u8        q10, d16, d2
1901         vtrn.32         d22, d16
1902         vmlal.u8        q10, d22, d3
1903         vrev64.32       d22, d16
1904         vrshrn.u16      d20, q10, #3
1905         vst1.32         {d20[0]}, [r0,:32], r1
1906         vst1.32         {d20[1]}, [r0,:32], r1
1907         bgt             1b
1908
1909         bx              lr
1910 endfunc