/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
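
@ Roughly the same computation in C (cf. the C reference in
@ libavcodec/vp8dsp.c; variable names here are illustrative).  The
@ inverse WHT of the 4x4 DC block is done in two passes with a
@ transpose in between; 3 is added to the first column so the final
@ >>3 rounds, and each result lands in the DC slot of one of the 16
@ luma subblocks, 32 bytes (16 int16) apart -- hence "mov r3, #32":
@
@   for (i = 0; i < 4; i++) {           // first (column) pass
@       t0 = dc[0*4+i] + dc[3*4+i];
@       t1 = dc[1*4+i] + dc[2*4+i];
@       t2 = dc[1*4+i] - dc[2*4+i];
@       t3 = dc[0*4+i] - dc[3*4+i];
@       dc[0*4+i] = t0 + t1;  dc[1*4+i] = t3 + t2;
@       dc[2*4+i] = t0 - t1;  dc[3*4+i] = t3 - t2;
@   }
@   for (i = 0; i < 4; i++) {           // second (row) pass
@       t0 = dc[i*4+0] + dc[i*4+3] + 3; // +3 for rounding
@       t1 = dc[i*4+1] + dc[i*4+2];
@       t2 = dc[i*4+1] - dc[i*4+2];
@       t3 = dc[i*4+0] - dc[i*4+3] + 3;
@       block[i][0][0] = (t0 + t1) >> 3;
@       block[i][1][0] = (t3 + t2) >> 3;
@       block[i][2][0] = (t0 - t1) >> 3;
@       block[i][3][0] = (t3 - t2) >> 3;
@   }
@   memset(dc, 0, 16 * sizeof(int16_t)); // the two q15 stores above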

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
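
@ Rough C equivalent of the transform above (cf. the C reference in
@ libavcodec/vp8dsp.c; names are illustrative).  The two multipliers
@ come from the constant load at the top: 20091 is applied as
@ vmull + vshrn #16 + vadd, i.e. ((x*20091)>>16)+x, while 35468 is
@ stored halved because vqdmulh computes (2*a*b)>>16:
@
@   #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
@   #define MUL_35468(a) (((a) * 35468) >> 16)
@   for (i = 0; i < 4; i++) {           // columns; after the vtrn
@       t0 = block[0*4+i] + block[2*4+i];   // transpose, the same
@       t1 = block[0*4+i] - block[2*4+i];   // code handles the rows
@       t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
@       t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);
@       tmp[i*4+0] = t0 + t3;  tmp[i*4+1] = t1 + t2;
@       tmp[i*4+2] = t1 - t2;  tmp[i*4+3] = t0 - t3;
@   }
@   // after the second pass, per pixel (vrshr #3, vaddw, vqmovun):
@   dst[x] = av_clip_uint8(dst[x] + ((t + 4) >> 3));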

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
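
@ Rough C equivalent (illustrative): a DC-only 4x4 block, with
@ vrshr #3 providing the rounding shift and vqmovun the final clip:
@
@   int dc = (block[0] + 4) >> 3;
@   block[0] = 0;
@   for (y = 0; y < 4; y++, dst += stride)
@       for (x = 0; x < 4; x++)
@           dst[x] = av_clip_uint8(dst[x] + dc);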

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
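
@ Both add4 variants above are equivalent to four DC-only adds on 4x4
@ subblocks whose coefficient blocks lie 32 bytes apart (the
@ "mov r3, #32" strides).  Roughly, in illustrative C (cf. the C
@ versions in libavcodec/vp8dsp.c):
@
@   add4y:  four blocks side by side in a 16x4 strip:
@       for (i = 0; i < 4; i++)
@           idct_dc_add(dst + 4 * i, block + 16 * i, stride);
@   add4uv: four blocks in a 2x2 layout within an 8x8 area:
@       idct_dc_add(dst,                  block +  0, stride);
@       idct_dc_add(dst + 4,              block + 16, stride);
@       idct_dc_add(dst + 4 * stride,     block + 32, stride);
@       idct_dc_add(dst + 4 * stride + 4, block + 48, stride);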

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
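
@ Summary of the macro's decision logic, roughly in C (per pixel;
@ names are illustrative):
@
@   simple:       filter = 2*|P0-Q0| + |P1-Q1|/2 <= flim_E
@   normal_limit: filter = 2*|P0-Q0| + |P1-Q1|/2 <= flim_E
@                       && |P3-P2|, |P2-P1|, |P1-P0|,
@                          |Q1-Q0|, |Q2-Q1|, |Q3-Q2| all <= flim_I
@   hev          = |P1-P0| > hev_thresh || |Q1-Q0| > hev_thresh
@
@ Pixels are biased by 0x80 so unsigned bytes can be processed with
@ signed saturating arithmetic; s2u() above denotes the ^0x80
@ conversion back to unsigned.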

.macro transpose8x16matrix
        vtrn.32         q0,   q4
        vtrn.32         q1,   q5
        vtrn.32         q2,   q6
        vtrn.32         q3,   q7

        vtrn.16         q0,   q2
        vtrn.16         q1,   q3
        vtrn.16         q4,   q6
        vtrn.16         q5,   q7

        vtrn.8          q0,   q1
        vtrn.8          q2,   q3
        vtrn.8          q4,   q5
        vtrn.8          q6,   q7
.endm
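
@ The three vtrn passes above effectively perform two 8x8 byte
@ transposes in parallel (the low and high d halves of q0-q7).  The
@ horizontal loop filters below load 16 rows of 8 pixels into d0-d15,
@ so after the transpose each q register holds one filter tap
@ position (P3..Q3) for all 16 rows.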

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
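
@ Illustrative C prototype for the functions generated above
@ (argument types assumed; arguments arrive in r0-r3 plus one stack
@ slot, loaded with "ldr r12, [sp, #64]" after the 64-byte vpush):
@
@   void ff_vp8_v_loop_filter16_neon(uint8_t *dst, int stride,
@                                    int flim_E, int flim_I,
@                                    int hev_thresh);
@
@ The _simple variant only uses (dst, stride, flim_E).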

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
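
@ Same idea for the chroma version, which filters the u and v planes
@ in one call (illustrative prototype, types assumed; flim_I and
@ hev_thresh come from the stack at #64/#68 after the vpush):
@
@   void ff_vp8_v_loop_filter8uv_neon(uint8_t *u, uint8_t *v,
@                                     int stride, int flim_E,
@                                     int flim_I, int hev_thresh);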

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc
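
@ The pixels functions are plain block copies; roughly (illustrative,
@ h is assumed to be a multiple of 4 since each loop iteration
@ handles four rows):
@
@   for (y = 0; y < h; y++)
@       memcpy(dst + y * dststride, src + y * srcstride, 16);  // or 8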

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm
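
@ Per output pixel the h6/v6 macros compute the VP8 six-tap filter,
@ with taps 1 and 4 stored as magnitudes and applied via vmls.
@ Roughly, F[] being one row of the subpel_filters table (the callers
@ below index 16-byte entries with "add r4, lr, r4, lsl #4"; the -16
@ bias on subpel_filters exists because position 0 is a plain copy
@ handled by the pixels functions above, so the table starts at 1):
@
@   dst[x] = av_clip_uint8((F[0]*src[x-2] - F[1]*src[x-1] +
@                           F[2]*src[x]   + F[3]*src[x+1] -
@                           F[4]*src[x+2] + F[5]*src[x+3] + 64) >> 7);
@
@ The two halves of the sum are combined with a saturating add
@ (vqadd.s16) before the rounding narrow (vqrshrun #7).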

.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
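
@ The h4/v4 variants apply the same scheme for filter rows whose
@ outer taps are zero (F[0] == F[5] == 0 in the table), so only the
@ four middle taps are multiplied in.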

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
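
@ The two-pass (h6v6) scheme above: the horizontal pass filters h+5
@ rows ("add r12, r12, #5" -- the extra rows the 6-tap vertical
@ filter needs) into a 16-byte-aligned scratch buffer on the stack
@ (336 = 16 bytes x 21 rows, enough for h = 16, plus 16 bytes of
@ slack consumed by the "bic lr, lr, #15" alignment).  The vertical
@ pass then reads it back, rewinding 48 bytes (3 rows) per iteration.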
974
975 function ff_put_vp8_epel8_v6_neon, export=1
976         sub             r2,  r2,  r3,  lsl #1
977         push            {r4,lr}
978
979         ldr             r4,  [sp, #16]          @ my
980         movrel          lr,  subpel_filters-16
981         ldr             r12, [sp, #8]           @ h
982         add             r4,  lr,  r4, lsl #4
983         vld1.16         {q0},     [r4,:128]
984 1:
985         vld1.8          {d2},  [r2], r3
986         vld1.8          {d3},  [r2], r3
987         vld1.8          {d4},  [r2], r3
988         vld1.8          {d5},  [r2], r3
989         vld1.8          {d6},  [r2], r3
990         vld1.8          {d7},  [r2], r3
991         vld1.8          {d28}, [r2]
992
993         sub             r2,  r2,  r3,  lsl #2
994
995         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
996
997         vst1.8          {d2}, [r0,:64], r1
998         vst1.8          {d3}, [r0,:64], r1
999         subs            r12, r12, #2
1000         bne             1b
1001
1002         pop             {r4,pc}
1003 endfunc
1004
1005 function ff_put_vp8_epel8_h6_neon, export=1
1006         sub             r2,  r2,  #2
1007         push            {r4,lr}
1008
1009         ldr             r4,  [sp, #12]          @ mx
1010         movrel          lr,  subpel_filters-16
1011         ldr             r12, [sp, #8]           @ h
1012         add             r4,  lr,  r4, lsl #4
1013         vld1.16         {q0},     [r4,:128]
1014 1:
1015         vld1.8          {d2,d3}, [r2], r3
1016
1017         vp8_epel8_h6    d2,  d2,  d3
1018
1019         vst1.8          {d2}, [r0,:64], r1
1020         subs            r12, r12, #1
1021         bne             1b
1022
1023         pop             {r4,pc}
1024 endfunc
1025
1026 function ff_put_vp8_epel8_h6v6_neon, export=1
1027         sub             r2,  r2,  r3,  lsl #1
1028         sub             r2,  r2,  #2
1029         push            {r4,lr}
1030
1031         @ first pass (horizontal):
1032         ldr             r4,  [sp, #12]          @ mx
1033         movrel          lr,  subpel_filters-16
1034         ldr             r12, [sp, #8]           @ h
1035         add             r4,  lr,  r4, lsl #4
1036         sub             sp,  sp,  #168+16
1037         vld1.16         {q0},     [r4,:128]
1038         add             lr,  sp,  #15
1039         add             r12, r12, #5
1040         bic             lr,  lr,  #15
1041 1:
1042         vld1.8          {d2,d3}, [r2], r3
1043
1044         vp8_epel8_h6    d2,  d2,  d3
1045
1046         vst1.8          {d2}, [lr,:64]!
1047         subs            r12, r12, #1
1048         bne             1b
1049
1050         @ second pass (vertical):
1051         ldr             r4,  [sp, #168+16+16]   @ my
1052         movrel          lr,  subpel_filters-16
1053         ldr             r12, [sp, #168+16+8]    @ h
1054         add             r4,  lr,  r4, lsl #4
1055         add             lr,  sp,  #15
1056         vld1.16         {q0},     [r4,:128]
1057         bic             lr,  lr,  #15
1058 2:
1059         vld1.8          {d2-d5},  [lr,:128]!
1060         vld1.8          {d6-d7},  [lr,:128]!
1061         vld1.8          {d30},    [lr,:64]
1062         sub             lr,  lr,  #32
1063
1064         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1065
1066         vst1.8          {d2}, [r0,:64], r1
1067         vst1.8          {d3}, [r0,:64], r1
1068         subs            r12, r12, #2
1069         bne             2b
1070
1071         add             sp,  sp,  #168+16
1072         pop             {r4,pc}
1073 endfunc
1074
1075 function ff_put_vp8_epel8_v4_neon, export=1
1076         sub             r2,  r2,  r3
1077         push            {r4,lr}
1078
1079         ldr             r4,  [sp, #16]          @ my
1080         movrel          lr,  subpel_filters-16
1081         ldr             r12, [sp, #8]           @ h
1082         add             r4,  lr,  r4, lsl #4
1083         vld1.16         {q0},     [r4,:128]
1084 1:
1085         vld1.8          {d2},     [r2], r3
1086         vld1.8          {d3},     [r2], r3
1087         vld1.8          {d4},     [r2], r3
1088         vld1.8          {d5},     [r2], r3
1089         vld1.8          {d6},     [r2]
1090         sub             r2,  r2,  r3,  lsl #1
1091
1092         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1093
1094         vst1.8          {d2}, [r0,:64], r1
1095         vst1.8          {d3}, [r0,:64], r1
1096         subs            r12, r12, #2
1097         bne             1b
1098
1099         pop             {r4,pc}
1100 endfunc
1101
1102 function ff_put_vp8_epel8_h4_neon, export=1
1103         sub             r2,  r2,  #1
1104         push            {r4,lr}
1105
1106         ldr             r4,  [sp, #12]          @ mx
1107         movrel          lr,  subpel_filters-16
1108         ldr             r12, [sp, #8]           @ h
1109         add             r4,  lr,  r4, lsl #4
1110         vld1.16         {q0},     [r4,:128]
1111 1:
1112         vld1.8          {d2,d3}, [r2], r3
1113
1114         vp8_epel8_h4    d2,  d2,  d3
1115
1116         vst1.8          {d2}, [r0,:64], r1
1117         subs            r12, r12, #1
1118         bne             1b
1119
1120         pop             {r4,pc}
1121 endfunc
1122
1123 function ff_put_vp8_epel8_h4v4_neon, export=1
1124         sub             r2,  r2,  r3
1125         sub             r2,  r2,  #1
1126         push            {r4,lr}
1127
1128         @ first pass (horizontal):
1129         ldr             r4,  [sp, #12]          @ mx
1130         movrel          lr,  subpel_filters-16
1131         ldr             r12, [sp, #8]           @ h
1132         add             r4,  lr,  r4, lsl #4
1133         sub             sp,  sp,  #168+16
1134         vld1.16         {q0},     [r4,:128]
1135         add             lr,  sp,  #15
1136         add             r12, r12, #3
1137         bic             lr,  lr,  #15
1138 1:
1139         vld1.8          {d2,d3}, [r2], r3
1140
1141         vp8_epel8_h4    d2,  d2,  d3
1142
1143         vst1.8          {d2}, [lr,:64]!
1144         subs            r12, r12, #1
1145         bne             1b
1146
1147         @ second pass (vertical):
1148         ldr             r4,  [sp, #168+16+16]   @ my
1149         movrel          lr,  subpel_filters-16
1150         ldr             r12, [sp, #168+16+8]    @ h
1151         add             r4,  lr,  r4, lsl #4
1152         add             lr,  sp,  #15
1153         vld1.16         {q0},     [r4,:128]
1154         bic             lr,  lr,  #15
1155 2:
1156         vld1.8          {d2-d5},  [lr,:128]!
1157         vld1.8          {d6},     [lr,:64]
1158         sub             lr,  lr,  #16
1159
1160         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1161
1162         vst1.8          {d2},     [r0,:64], r1
1163         vst1.8          {d3},     [r0,:64], r1
1164         subs            r12, r12, #2
1165         bne             2b
1166
1167         add             sp,  sp,  #168+16
1168         pop             {r4,pc}
1169 endfunc
1170
1171 function ff_put_vp8_epel8_h6v4_neon, export=1
1172         sub             r2,  r2,  r3
1173         sub             r2,  r2,  #2
1174         push            {r4,lr}
1175
1176         @ first pass (horizontal):
1177         ldr             r4,  [sp, #12]          @ mx
1178         movrel          lr,  subpel_filters-16
1179         ldr             r12, [sp, #8]           @ h
1180         add             r4,  lr,  r4, lsl #4
1181         sub             sp,  sp,  #168+16
1182         vld1.16         {q0},     [r4,:128]
1183         add             lr,  sp,  #15
1184         add             r12, r12, #3
1185         bic             lr,  lr,  #15
1186 1:
1187         vld1.8          {d2,d3}, [r2], r3
1188
1189         vp8_epel8_h6    d2,  d2,  d3
1190
1191         vst1.8          {d2}, [lr,:64]!
1192         subs            r12, r12, #1
1193         bne             1b
1194
1195         @ second pass (vertical):
1196         ldr             r4,  [sp, #168+16+16]   @ my
1197         movrel          lr,  subpel_filters-16
1198         ldr             r12, [sp, #168+16+8]    @ h
1199         add             r4,  lr,  r4, lsl #4
1200         add             lr,  sp,  #15
1201         vld1.16         {q0},     [r4,:128]
1202         bic             lr,  lr,  #15
1203 2:
1204         vld1.8          {d2-d5},  [lr,:128]!
1205         vld1.8          {d6},     [lr,:64]
1206         sub             lr,  lr,  #16
1207
1208         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1209
1210         vst1.8          {d2},     [r0,:64], r1
1211         vst1.8          {d3},     [r0,:64], r1
1212         subs            r12, r12, #2
1213         bne             2b
1214
1215         add             sp,  sp,  #168+16
1216         pop             {r4,pc}
1217 endfunc
1218
1219 function ff_put_vp8_epel8_h4v6_neon, export=1
1220         sub             r2,  r2,  r3,  lsl #1
1221         sub             r2,  r2,  #1
1222         push            {r4,lr}
1223
1224         @ first pass (horizontal):
1225         ldr             r4,  [sp, #12]          @ mx
1226         movrel          lr,  subpel_filters-16
1227         ldr             r12, [sp, #8]           @ h
1228         add             r4,  lr,  r4, lsl #4
1229         sub             sp,  sp,  #168+16
1230         vld1.16         {q0},     [r4,:128]
1231         add             lr,  sp,  #15
1232         add             r12, r12, #5
1233         bic             lr,  lr,  #15
1234 1:
1235         vld1.8          {d2,d3}, [r2], r3
1236
1237         vp8_epel8_h4    d2,  d2,  d3
1238
1239         vst1.8          {d2}, [lr,:64]!
1240         subs            r12, r12, #1
1241         bne             1b
1242
1243         @ second pass (vertical):
1244         ldr             r4,  [sp, #168+16+16]   @ my
1245         movrel          lr,  subpel_filters-16
1246         ldr             r12, [sp, #168+16+8]    @ h
1247         add             r4,  lr,  r4, lsl #4
1248         add             lr,  sp,  #15
1249         vld1.16         {q0},     [r4,:128]
1250         bic             lr,  lr,  #15
1251 2:
1252         vld1.8          {d2-d5},  [lr,:128]!
1253         vld1.8          {d6-d7},  [lr,:128]!
1254         vld1.8          {d30},    [lr,:64]
1255         sub             lr,  lr,  #32
1256
1257         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1258
1259         vst1.8          {d2}, [r0,:64], r1
1260         vst1.8          {d3}, [r0,:64], r1
1261         subs            r12, r12, #2
1262         bne             2b
1263
1264         add             sp,  sp,  #168+16
1265         pop             {r4,pc}
1266 endfunc
1267
1268 .ltorg
1269
1270 function ff_put_vp8_epel4_v6_neon, export=1
1271         sub             r2,  r2,  r3,  lsl #1
1272         push            {r4,lr}
1273
1274         ldr             r4,  [sp, #16]          @ my
1275         movrel          lr,  subpel_filters-16
1276         ldr             r12, [sp, #8]           @ h
1277         add             r4,  lr,  r4, lsl #4
1278         vld1.16         {q0},     [r4,:128]
1279 1:
1280         vld1.32         {d2[]},   [r2], r3
1281         vld1.32         {d3[]},   [r2], r3
1282         vld1.32         {d4[]},   [r2], r3
1283         vld1.32         {d5[]},   [r2], r3
1284         vld1.32         {d6[]},   [r2], r3
1285         vld1.32         {d7[]},   [r2], r3
1286         vld1.32         {d28[]},  [r2]
1287         sub             r2,  r2,  r3,  lsl #2
1288         vld1.32         {d2[1]},  [r2], r3
1289         vld1.32         {d3[1]},  [r2], r3
1290         vld1.32         {d4[1]},  [r2], r3
1291         vld1.32         {d5[1]},  [r2], r3
1292         vld1.32         {d6[1]},  [r2], r3
1293         vld1.32         {d7[1]},  [r2], r3
1294         vld1.32         {d28[1]}, [r2]
1295         sub             r2,  r2,  r3,  lsl #2
1296
1297         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1298
1299         vst1.32         {d2[0]},  [r0,:32], r1
1300         vst1.32         {d3[0]},  [r0,:32], r1
1301         vst1.32         {d2[1]},  [r0,:32], r1
1302         vst1.32         {d3[1]},  [r0,:32], r1
1303         subs            r12, r12, #4
1304         bne             1b
1305
1306         pop             {r4,pc}
1307 endfunc
1308
1309 function ff_put_vp8_epel4_h6_neon, export=1
1310         sub             r2,  r2,  #2
1311         push            {r4,lr}
1312
1313         ldr             r4,  [sp, #12]          @ mx
1314         movrel          lr,  subpel_filters-16
1315         ldr             r12, [sp, #8]           @ h
1316         add             r4,  lr,  r4, lsl #4
1317         vld1.16         {q0},     [r4,:128]
1318 1:
1319         vld1.8          {q1},     [r2], r3
1320         vp8_epel8_h6    d2,  d2,  d3
1321         vst1.32         {d2[0]},  [r0,:32], r1
1322         subs            r12, r12, #1
1323         bne             1b
1324
1325         pop             {r4,pc}
1326 endfunc
1327
1328 function ff_put_vp8_epel4_h6v6_neon, export=1
1329         sub             r2,  r2,  r3,  lsl #1
1330         sub             r2,  r2,  #2
1331         push            {r4,lr}
1332
1333         ldr             r4,  [sp, #12]          @ mx
1334         movrel          lr,  subpel_filters-16
1335         ldr             r12, [sp, #8]           @ h
1336         add             r4,  lr,  r4, lsl #4
1337         sub             sp,  sp,  #52+16
1338         vld1.16         {q0},     [r4,:128]
1339         add             lr,  sp,  #15
1340         add             r12, r12, #5
1341         bic             lr,  lr,  #15
1342 1:
1343         vld1.8          {q1},     [r2], r3
1344         vp8_epel8_h6    d2,  d2,  d3
1345         vst1.32         {d2[0]},  [lr,:32]!
1346         subs            r12, r12, #1
1347         bne             1b
1348
1349         ldr             r4,  [sp, #52+16+16]    @ my
1350         movrel          lr,  subpel_filters-16
1351         ldr             r12, [sp, #52+16+8]     @ h
1352         add             r4,  lr,  r4, lsl #4
1353         add             lr,  sp,  #15
1354         vld1.16         {q0},     [r4,:128]
1355         bic             lr,  lr,  #15
1356 2:
1357         vld1.8          {d2-d3},  [lr,:128]!
1358         vld1.8          {d6},     [lr,:64]!
1359         vld1.32         {d28[]},  [lr,:32]
1360         sub             lr,  lr,  #16
1361         vld1.8          {d4-d5},  [lr]!
1362         vld1.8          {d7},     [lr,:64]!
1363         vld1.32         {d28[1]}, [lr,:32]
1364         sub             lr,  lr,  #16
1365         vtrn.32         q1,  q2
1366         vtrn.32         d6,  d7
1367         vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
1368         vst1.32         {d2[0]},  [r0,:32], r1
1369         vst1.32         {d3[0]},  [r0,:32], r1
1370         vst1.32         {d2[1]},  [r0,:32], r1
1371         vst1.32         {d3[1]},  [r0,:32], r1
1372         subs            r12, r12, #4
1373         bne             2b
1374
1375         add             sp,  sp,  #52+16
1376         pop             {r4,pc}
1377 endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
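
/* With no vertical component the function above is a single pass from
 * src straight to dst.  A scalar sketch of the 4-tap horizontal filter
 * (tap signs as in FILTER_4TAP from libavcodec/vp8dsp.c; only taps
 * 1..4 of a subpel_filters row are used; names are illustrative and
 * clip8() is the helper from the earlier sketch):
 *
 *     static void put_epel4_h4(uint8_t *dst, ptrdiff_t dststride,
 *                              const uint8_t *src,
 *                              ptrdiff_t srcstride, int h,
 *                              const int16_t *f)
 *     {
 *         int x, y;
 *         for (y = 0; y < h; y++, dst += dststride, src += srcstride)
 *             for (x = 0; x < 4; x++)  // src already backed up by 1
 *                 dst[x] = clip8((-f[1] * src[x - 1] + f[2] * src[x] +
 *                                  f[3] * src[x + 1] -
 *                                  f[4] * src[x + 2] + 64) >> 7);
 *     }
 */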

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
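
/* mx/my select a row of this table and run from 1 to 7 in the
 * functions above (a zero phase takes an unfiltered copy path
 * elsewhere in the DSP setup).  Each row is padded with two zero taps
 * to eight .shorts (16 bytes), which is why the lookups read
 * "movrel lr, subpel_filters-16" plus "add r4, lr, r4, lsl #4":
 * fetching a filter row is a single shift-add.  In C terms, with a
 * hypothetical declaration mirroring the table:
 *
 *     static const int16_t subpel_filters[7][8];
 *     const int16_t *row = subpel_filters[mx - 1]; // base-16 + mx*16
 */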

/* Bilinear MC */

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
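
/* Per pixel this is the standard VP8 bilinear blend: vext.8 supplies
 * the src[x+1] vector and vrshrn.u16 #3 is the (+4)>>3 rounding.  Note
 * the asm also steps src with r1, i.e. it relies on dst and src
 * sharing a stride.  A scalar sketch with illustrative names:
 *
 *     static void put_bilin16_h(uint8_t *dst, const uint8_t *src,
 *                               ptrdiff_t stride, int h, int mx)
 *     {
 *         int x, y;
 *         for (y = 0; y < h; y++, dst += stride, src += stride)
 *             for (x = 0; x < 16; x++)
 *                 dst[x] = (src[x] * (8 - mx) +
 *                           src[x + 1] * mx + 4) >> 3;
 *     }
 */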

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r1
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
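
/* The vertical variant blends adjacent rows with the same weights, and
 * the loop is rotated so every source row is loaded exactly once (q1
 * and q2 trade roles within each iteration).  A sketch of that
 * rotation, assuming even h as the two-rows-per-pass loop above does:
 *
 *     #include <string.h>
 *
 *     static void put_bilin16_v(uint8_t *dst, const uint8_t *src,
 *                               ptrdiff_t stride, int h, int my)
 *     {
 *         uint8_t a[16], b[16], c[16];
 *         int x, y;
 *
 *         memcpy(a, src, 16); src += stride;   // preloaded row
 *         for (y = 0; y < h; y += 2, dst += 2 * stride) {
 *             memcpy(b, src, 16); src += stride;
 *             memcpy(c, src, 16); src += stride;
 *             for (x = 0; x < 16; x++) {
 *                 dst[x] = (a[x] * (8 - my) + b[x] * my + 4) >> 3;
 *                 dst[stride + x] =
 *                     (b[x] * (8 - my) + c[x] * my + 4) >> 3;
 *             }
 *             memcpy(a, c, 16);                // carry the bottom row
 *         }
 *     }
 */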

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
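
/* The h+v case filters horizontally, rounds to 8 bits (each vrshrn #3
 * above), then blends vertically between consecutive rounded rows, so
 * only one intermediate row is carried across loop iterations.  A
 * sketch under the same equal-stride assumption:
 *
 *     static void put_bilin16_hv(uint8_t *dst, const uint8_t *src,
 *                                ptrdiff_t stride, int h,
 *                                int mx, int my)
 *     {
 *         uint8_t tmp[16], cur[16];
 *         int x, y;
 *
 *         for (x = 0; x < 16; x++)             // first filtered row
 *             tmp[x] = (src[x] * (8 - mx) +
 *                       src[x + 1] * mx + 4) >> 3;
 *         src += stride;
 *
 *         for (y = 0; y < h; y++, src += stride, dst += stride) {
 *             for (x = 0; x < 16; x++)         // next filtered row
 *                 cur[x] = (src[x] * (8 - mx) +
 *                           src[x + 1] * mx + 4) >> 3;
 *             for (x = 0; x < 16; x++)         // vertical blend
 *                 dst[x] = (tmp[x] * (8 - my) +
 *                           cur[x] * my + 4) >> 3;
 *             memcpy(tmp, cur, 16);
 *         }
 *     }
 */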

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
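
/* The 4-pixel-wide variants pack two consecutive rows into one
 * d-register (the vtrn.32 above) so a single vmull/vmlal pair filters
 * two output rows at once.  In scalar form that is just a two-row
 * unrolling:
 *
 *     static void put_bilin4_h(uint8_t *dst, const uint8_t *src,
 *                              ptrdiff_t stride, int h, int mx)
 *     {
 *         int x, y;
 *         for (y = 0; y < h; y += 2,
 *              dst += 2 * stride, src += 2 * stride)
 *             for (x = 0; x < 4; x++) {        // rows y and y+1
 *                 dst[x] = (src[x] * (8 - mx) +
 *                           src[x + 1] * mx + 4) >> 3;
 *                 dst[stride + x] =
 *                     (src[stride + x] * (8 - mx) +
 *                      src[stride + x + 1] * mx + 4) >> 3;
 *             }
 *     }
 */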

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r1
        vld1.32         {d3[1]},  [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc