/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

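/* Transpose helpers: an 8x8 byte transpose is built from vtrn passes
 * at 32-, 16- and 8-bit granularity, each pass swapping progressively
 * smaller sub-blocks until single bytes are in place. */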
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
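/* Bilinear interpolation with the standard H.264 chroma weights
 *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
 *   dst = (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6
 * Below, r7 = D, ip = B, r6 = C and r4 = A (computed as
 * 64 - 8*x - 8*y + x*y); vrshrn #6 supplies the +32 rounding,
 * and the muls sets the flags for the x == 0 || y == 0 cases. */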
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */

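/* Common entry check for the loop filter functions: alpha in r2,
 * beta in r3, pointer to the four tc0 values at [sp].  Returns early
 * when alpha or beta is zero, or when all four tc0 bytes are negative
 * (tc0 < 0 disables filtering in H.264).  The tc0 bytes are left in
 * d24[0] for the filter macros below. */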
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

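/* The luma filter needs the callee-saved d8-d15; spill them to a
 * 16-byte aligned scratch area so the :128 alignment hints on the
 * stack accesses below hold. */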
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

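/* Normal-strength luma filter, 16 pixels per invocation.  Expected
 * register layout (as loaded by the callers below): q10=p2, q9=p1,
 * q8=p0, q0=q0, q1=q1, q2=q2.  Filtering applies where
 * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; the p0/q0
 * delta is clipped to +/-tc (tc0 plus one for each of p1/q1 that is
 * also filtered), and p1/q1 are adjusted only where |p2-p0| resp.
 * |q2-q0| is also below beta. */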
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */

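/* All qpel interpolation below uses the H.264 6-tap half-pel filter
 * (1, -5, 20, 20, -5, 1) with (sum + 16) >> 5 rounding.
 * lowpass_const packs the constants 5 and 20 into d6[0]/d6[1] so
 * vmla.i16/vmls.i16 can use them as scalar operands. */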
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

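/* Same 6-tap kernel applied to the 16-bit intermediates of the hv
 * (centre) case.  The *20 and *5 products are built from shifts,
 * x*20 = (x<<4)+(x<<2) and x*5 = (x<<2)+x, and the final narrowing
 * rounds with >>10, i.e. both passes' >>5 combined. */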
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256
        ldrd            r0,  [r0, #64]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

@ Biweighted prediction

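@ out = saturate((w0*src0 + w1*src1 + rounding/offset term) >>
@ (log2_denom+1)).  biweight_func preloads the combined rounding and
@ offset term into q8 and the negated shift count into q9 (vshl.s16
@ by a negative amount shifts right), then branches on the two weight
@ signs so the loops can use unsigned vmlal/vmlsl throughout.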
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm

        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

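@ out = saturate(((w * src) + (offset << log2_denom) + rounding) >>
@ log2_denom).  The shifted offset is preloaded into q8; weight_func
@ negates a negative weight and selects the add/sub (or the halving
@ vhadd/vhsub variants when log2_denom > 1) combination so the
@ unsigned vmull can be used in all cases.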
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm

        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4