/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
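
@ For reference, a rough scalar equivalent of the function above: the
@ inverse 4x4 Walsh-Hadamard transform on the luma DC coefficients, with a
@ +3 bias and >>3 on the second pass, scattering the results to the DC slot
@ of each of the 16 luma subblocks (32 bytes apart, hence r3 = #32). This
@ mirrors the C version in libavcodec/vp8dsp.c; names are illustrative:
@
@   void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@   {
@       for (int i = 0; i < 4; i++) {           /* first pass (columns)   */
@           int t0 = dc[0*4 + i] + dc[3*4 + i];
@           int t1 = dc[1*4 + i] + dc[2*4 + i];
@           int t2 = dc[1*4 + i] - dc[2*4 + i];
@           int t3 = dc[0*4 + i] - dc[3*4 + i];
@           dc[0*4 + i] = t0 + t1;
@           dc[1*4 + i] = t3 + t2;
@           dc[2*4 + i] = t0 - t1;
@           dc[3*4 + i] = t3 - t2;
@       }
@       for (int i = 0; i < 4; i++) {           /* second pass (rows)     */
@           int t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3;   /* +3: rounding     */
@           int t1 = dc[i*4 + 1] + dc[i*4 + 2];
@           int t2 = dc[i*4 + 1] - dc[i*4 + 2];
@           int t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3;
@           block[i][0][0] = (t0 + t1) >> 3;
@           block[i][1][0] = (t3 + t2) >> 3;
@           block[i][2][0] = (t0 - t1) >> 3;
@           block[i][3][0] = (t3 - t2) >> 3;
@       }
@       /* dc[] is zeroed; the NEON code does this early via the q15 stores */
@   }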

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2           @ 35468 exceeds s16 range; halved, vqdmulh doubles it
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
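
@ A scalar sketch of the fixed-point math above (cf. vp8dsp.c): 20091/65536
@ and 35468/65536 approximate sqrt(2)*cos(pi/8) - 1 and sqrt(2)*sin(pi/8).
@   #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))   /* vshrn + vadd     */
@   #define MUL_35468(a) (((a) * 35468) >> 16)           /* vqdmulh, 35468/2 */
@ Each 1-D pass computes, per column i:
@   t0 = in[0][i] + in[2][i];
@   t1 = in[0][i] - in[2][i];
@   t2 = MUL_35468(in[1][i]) - MUL_20091(in[3][i]);
@   t3 = MUL_20091(in[1][i]) + MUL_35468(in[3][i]);
@   out[i] = { t0 + t3, t1 + t2, t1 - t2, t0 - t3 };
@ with the final pass adding +4 and shifting right by 3 (the vrshr above)
@ before the saturating add to the destination pixels.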

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
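
@ DC-only path: with a single nonzero coefficient the whole 4x4 IDCT
@ collapses to one rounded value added to every pixel. In pseudo-C:
@   int dc = (block[0] + 4) >> 3;          /* the vrshr.s16 #3 above */
@   block[0] = 0;
@   for (int y = 0; y < 4; y++)
@       for (int x = 0; x < 4; x++)
@           dst[y*stride + x] = av_clip_uint8(dst[y*stride + x] + dc);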

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
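
@ Both 4-block variants above rely on the coefficient layout used by the
@ VP8 decoder: consecutive blocks' coefficients sit 32 bytes (one 16-entry
@ int16_t block) apart, hence the r3 = #32 stride when gathering and
@ clearing the DC values. The 4uv version covers a 2x2 arrangement of 4x4
@ blocks (8x8 pixels, q8/q9 each holding a pair of DCs per row half); the
@ 4y version covers four horizontally adjacent luma blocks, so it can
@ load and store whole 16-byte rows.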

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16 bits)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
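
@ Simplified scalar summary of the common filter step above (cf.
@ filter_common() in libavcodec/vp8dsp.c; s2u() is the ^0x80 trick mapping
@ unsigned pixels to signed, clamp() saturates to [-128,127], i.e. the
@ vqadd/vqsub/vqmovn used above):
@   w  = 3 * (QS0 - PS0);
@   w += clamp(PS1 - QS1);          @ masked by hev in the inner case
@   w  = clamp(w) & normal_limit;
@   c1 = clamp(w + 4) >> 3;
@   c2 = clamp(w + 3) >> 3;
@   Q0 = s2u(QS0 - c1);  P0 = s2u(PS0 + c2);
@ Inner blocks additionally apply c3 = ((c1 & ~hev) + 1) >> 1 to P1/Q1,
@ while macroblock edges instead refilter P0..P2/Q0..Q2 with the 27/18/9
@ weights shown in the filter_mbedge comment.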

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple @ back up 4 rows (2 if simple)

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
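
@ The three instantiations above expand to ff_vp8_v_loop_filter16_neon,
@ ff_vp8_v_loop_filter16_inner_neon and ff_vp8_v_loop_filter16_simple_neon;
@ the inner/simple flags select the reduced filters at assembly time, so no
@ runtime branching is needed.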

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ back up 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ back up u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ back up v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm
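
@ Scalar sketch of the 6-tap filter this macro vectorizes. The
@ subpel_filters table (referenced via movrel below) stores coefficient
@ magnitudes; taps 1 and 4 are subtracted (the vmls above), the rest added,
@ and vqrshrun #7 performs the (x + 64) >> 7 rounding with an unsigned
@ saturating narrow:
@   dst[x] = clip_uint8(( F[0]*src[x-2] - F[1]*src[x-1] + F[2]*src[x]
@                       + F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3]
@                       + 64) >> 7);
@ The sum is split across two 16-bit accumulators (q10/q11) that are joined
@ with a saturating vqadd before narrowing.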

.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
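
@ 4-tap counterpart of the sketch above: only the middle four coefficients
@ of the table row are used, again with taps 1 and 4 negative:
@   dst[x] = clip_uint8((-F[1]*src[x-s] + F[2]*src[x] + F[3]*src[x+s]
@                        - F[4]*src[x+2*s] + 64) >> 7)    @ s = 1 or stride
@ The *_y2 variants produce two output rows per invocation, sharing the
@ widened source rows between both filter evaluations.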

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
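
@ Note on the filter-table addressing used by all the epel functions: mx/my
@ arrive here in 1..7 (0 would be a full-pel copy, handled by the
@ put_vp8_pixels functions), and each subpel_filters row is 16 bytes, so
@ "subpel_filters-16" plus (mx << 4) addresses row mx-1 without an extra
@ subtract.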

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
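
@ Scratch-buffer sizing for the two-pass functions: the horizontal pass
@ filters h+5 rows (a 6-tap vertical pass reads two rows above and three
@ below), so the 16-pixel case above needs 21 rows * 16 bytes = 336, plus
@ 16 bytes so the add/bic #15 pair can align the pointer. The 8- and
@ 4-pixel variants below follow the same pattern (168+16, 52+16, 44+16),
@ with the v4 versions needing only h+3 rows.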

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

.ltorg                                          @ dump the literal pool while it is in range
1253
1254 function ff_put_vp8_epel4_v6_neon, export=1
1255         sub             r2,  r2,  r3,  lsl #1
1256         push            {r4,lr}
1257
1258         ldr             r4,  [sp, #16]          @ my
1259         movrel          lr,  subpel_filters-16
1260         ldr             r12, [sp, #8]           @ h
1261         add             r4,  lr,  r4, lsl #4
1262         vld1.16         {q0},     [r4,:128]
1263 1:
1264         vld1.32         {d2[]},   [r2], r3
1265         vld1.32         {d3[]},   [r2], r3
1266         vld1.32         {d4[]},   [r2], r3
1267         vld1.32         {d5[]},   [r2], r3
1268         vld1.32         {d6[]},   [r2], r3
1269         vld1.32         {d7[]},   [r2], r3
1270         vld1.32         {d28[]},  [r2]
1271         sub             r2,  r2,  r3,  lsl #2
1272         vld1.32         {d2[1]},  [r2], r3
1273         vld1.32         {d3[1]},  [r2], r3
1274         vld1.32         {d4[1]},  [r2], r3
1275         vld1.32         {d5[1]},  [r2], r3
1276         vld1.32         {d6[1]},  [r2], r3
1277         vld1.32         {d7[1]},  [r2], r3
1278         vld1.32         {d28[1]}, [r2]
1279         sub             r2,  r2,  r3,  lsl #2
1280
1281         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1282
1283         vst1.32         {d2[0]},  [r0,:32], r1
1284         vst1.32         {d3[0]},  [r0,:32], r1
1285         vst1.32         {d2[1]},  [r0,:32], r1
1286         vst1.32         {d3[1]},  [r0,:32], r1
1287         subs            r12, r12, #4
1288         bne             1b
1289
1290         pop             {r4,pc}
1291 endfunc
1292
1293 function ff_put_vp8_epel4_h6_neon, export=1
1294         sub             r2,  r2,  #2
1295         push            {r4,lr}
1296
1297         ldr             r4,  [sp, #12]          @ mx
1298         movrel          lr,  subpel_filters-16
1299         ldr             r12, [sp, #8]           @ h
1300         add             r4,  lr,  r4, lsl #4
1301         vld1.16         {q0},     [r4,:128]
1302 1:
1303         vld1.8          {q1},     [r2], r3
1304         vp8_epel8_h6    d2,  d2,  d3
1305         vst1.32         {d2[0]},  [r0,:32], r1
1306         subs            r12, r12, #1
1307         bne             1b
1308
1309         pop             {r4,pc}
1310 endfunc
1311
1312 function ff_put_vp8_epel4_h6v6_neon, export=1
1313         sub             r2,  r2,  r3,  lsl #1
1314         sub             r2,  r2,  #2
1315         push            {r4,lr}
1316
1317         ldr             r4,  [sp, #12]          @ mx
1318         movrel          lr,  subpel_filters-16
1319         ldr             r12, [sp, #8]           @ h
1320         add             r4,  lr,  r4, lsl #4
1321         sub             sp,  sp,  #52+16
1322         vld1.16         {q0},     [r4,:128]
1323         add             lr,  sp,  #15
1324         add             r12, r12, #5
1325         bic             lr,  lr,  #15
1326 1:
1327         vld1.8          {q1},     [r2], r3
1328         vp8_epel8_h6    d2,  d2,  d3
1329         vst1.32         {d2[0]},  [lr,:32]!
1330         subs            r12, r12, #1
1331         bne             1b
1332
1333         ldr             r4,  [sp, #52+16+16]    @ my
1334         movrel          lr,  subpel_filters-16
1335         ldr             r12, [sp, #52+16+8]     @ h
1336         add             r4,  lr,  r4, lsl #4
1337         add             lr,  sp,  #15
1338         vld1.16         {q0},     [r4,:128]
1339         bic             lr,  lr,  #15
1340 2:
1341         vld1.8          {d2-d3},  [lr,:128]!
1342         vld1.8          {d6},     [lr,:64]!
1343         vld1.32         {d28[]},  [lr,:32]
1344         sub             lr,  lr,  #16
1345         vld1.8          {d4-d5},  [lr]!
1346         vld1.8          {d7},     [lr,:64]!
1347         vld1.32         {d28[1]}, [lr,:32]
1348         sub             lr,  lr,  #16
1349         vtrn.32         q1,  q2
1350         vtrn.32         d6,  d7
1351         vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1352         vst1.32         {d2[0]},  [r0,:32], r1
1353         vst1.32         {d3[0]},  [r0,:32], r1
1354         vst1.32         {d2[1]},  [r0,:32], r1
1355         vst1.32         {d3[1]},  [r0,:32], r1
1356         subs            r12, r12, #4
1357         bne             2b
1358
1359         add             sp,  sp,  #52+16
1360         pop             {r4,pc}
1361 endfunc
1362
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

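@ 6-tap horizontal, 4-tap vertical: the shorter vertical filter needs
@ only h+3 intermediate rows and one row of context above the block, so
@ the stack buffer shrinks to 44+16 bytes and each second-pass iteration
@ rewinds by 8 bytes instead of 16.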
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

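@ Horizontal-only 4-tap filter: no intermediate buffer is needed, each
@ source row is filtered and stored straight to the destination.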
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

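@ Vertical-only 4-tap filter: two overlapping five-row columns are loaded
@ into the two 32-bit lanes of d2-d6, so one vp8_epel8_v4_y2 call yields
@ four output rows per iteration.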
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

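@ 4-tap in both directions: the smallest of the two-pass variants, again
@ buffering h+3 intermediate rows on the stack.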
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
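@ e.g. with the negative taps subtracted as the epel macros do, the
@ half-pel row below applies as 3 - 16 + 77 + 77 - 16 + 3 = 128, and every
@ row sums to 128 the same way, so the filtered value for a flat 255
@ input is 128 * 255 = 32640 = 0x7f80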
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

/* Bilinear MC */

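@ Bilinear interpolation: each output pixel is a weighted average of two
@ neighbours, out = (a * (8 - mx) + b * mx + 4) >> 3, implemented as
@ vmull/vmlal into 16 bits followed by a rounding narrowing shift by 3.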
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

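@ Vertical 16-pixel bilinear: the last source row loaded is kept in q1
@ across iterations, so two output rows cost only two new source rows.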
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

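@ Combined h+v bilinear: rows are filtered horizontally as they are
@ loaded, and q2 carries the previous filtered row into the vertical
@ blend of the next iteration.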
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

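@ 4-pixel bilinear: two rows are packed into one q register with vtrn.32
@ so a single multiply-accumulate pair covers both output rows.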
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r3
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

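@ Combined 4-pixel h+v bilinear: d22 holds the previous horizontally
@ filtered row; vtrn.32 d22, d16 pairs it with the current rows for the
@ vertical blend, and vrev64.32 moves the newest row back into lane 0
@ of d22 for the next iteration.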
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc