]> git.sesse.net Git - ffmpeg/blob - libavcodec/arm/h264dsp_neon.S
ARM: remove unnecessary .fpu neon directives
[ffmpeg] / libavcodec / arm / h264dsp_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "asm.S"
22
23         .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24         vtrn.32         \r0, \r4
25         vtrn.32         \r1, \r5
26         vtrn.32         \r2, \r6
27         vtrn.32         \r3, \r7
28         vtrn.16         \r0, \r2
29         vtrn.16         \r1, \r3
30         vtrn.16         \r4, \r6
31         vtrn.16         \r5, \r7
32         vtrn.8          \r0, \r1
33         vtrn.8          \r2, \r3
34         vtrn.8          \r4, \r5
35         vtrn.8          \r6, \r7
36         .endm
37
38         .macro transpose_4x4 r0 r1 r2 r3
39         vtrn.16         \r0, \r2
40         vtrn.16         \r1, \r3
41         vtrn.8          \r0, \r1
42         vtrn.8          \r2, \r3
43         .endm
44
45         .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46         vswp            \r0, \r4
47         vswp            \r1, \r5
48         vswp            \r2, \r6
49         vswp            \r3, \r7
50         .endm
51
52         .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53         vtrn.32         \r0, \r2
54         vtrn.32         \r1, \r3
55         vtrn.32         \r4, \r6
56         vtrn.32         \r5, \r7
57         vtrn.16         \r0, \r1
58         vtrn.16         \r2, \r3
59         vtrn.16         \r4, \r5
60         vtrn.16         \r6, \r7
61         .endm
62
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64         .macro  h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
66         push            {r4-r7, lr}
67         ldrd            r4,  [sp, #20]
68 .ifc \type,avg
69         mov             lr,  r0
70 .endif
71         pld             [r1]
72         pld             [r1, r2]
73
74         muls            r7,  r4,  r5
75         rsb             r6,  r7,  r5,  lsl #3
76         rsb             ip,  r7,  r4,  lsl #3
77         sub             r4,  r7,  r4,  lsl #3
78         sub             r4,  r4,  r5,  lsl #3
79         add             r4,  r4,  #64
80
81         beq             2f
82
83         add             r5,  r1,  r2
84
85         vdup.8          d0,  r4
86         lsl             r4,  r2,  #1
87         vdup.8          d1,  ip
88         vld1.64         {d4, d5}, [r1], r4
89         vdup.8          d2,  r6
90         vld1.64         {d6, d7}, [r5], r4
91         vdup.8          d3,  r7
92
93         vext.8          d5,  d4,  d5,  #1
94         vext.8          d7,  d6,  d7,  #1
95
96 1:      pld             [r5]
97         vmull.u8        q8,  d4,  d0
98         vmlal.u8        q8,  d5,  d1
99         vld1.64         {d4, d5}, [r1], r4
100         vmlal.u8        q8,  d6,  d2
101         vext.8          d5,  d4,  d5,  #1
102         vmlal.u8        q8,  d7,  d3
103         vmull.u8        q9,  d6,  d0
104         subs            r3,  r3,  #2
105         vmlal.u8        q9,  d7,  d1
106         vmlal.u8        q9,  d4,  d2
107         vmlal.u8        q9,  d5,  d3
108         vrshrn.u16      d16, q8,  #6
109         vld1.64         {d6, d7}, [r5], r4
110         pld             [r1]
111         vrshrn.u16      d17, q9,  #6
112 .ifc \type,avg
113         vld1.64         {d20}, [lr,:64], r2
114         vld1.64         {d21}, [lr,:64], r2
115         vrhadd.u8       q8,  q8,  q10
116 .endif
117         vext.8          d7,  d6,  d7,  #1
118         vst1.64         {d16}, [r0,:64], r2
119         vst1.64         {d17}, [r0,:64], r2
120         bgt             1b
121
122         pop             {r4-r7, pc}
123
124 2:      tst             r6,  r6
125         add             ip,  ip,  r6
126         vdup.8          d0,  r4
127         vdup.8          d1,  ip
128
129         beq             4f
130
131         add             r5,  r1,  r2
132         lsl             r4,  r2,  #1
133         vld1.64         {d4}, [r1], r4
134         vld1.64         {d6}, [r5], r4
135
136 3:      pld             [r5]
137         vmull.u8        q8,  d4,  d0
138         vmlal.u8        q8,  d6,  d1
139         vld1.64         {d4}, [r1], r4
140         vmull.u8        q9,  d6,  d0
141         vmlal.u8        q9,  d4,  d1
142         vld1.64         {d6}, [r5], r4
143         vrshrn.u16      d16, q8,  #6
144         vrshrn.u16      d17, q9,  #6
145 .ifc \type,avg
146         vld1.64         {d20}, [lr,:64], r2
147         vld1.64         {d21}, [lr,:64], r2
148         vrhadd.u8       q8,  q8,  q10
149 .endif
150         subs            r3,  r3,  #2
151         pld             [r1]
152         vst1.64         {d16}, [r0,:64], r2
153         vst1.64         {d17}, [r0,:64], r2
154         bgt             3b
155
156         pop             {r4-r7, pc}
157
158 4:      vld1.64         {d4, d5}, [r1], r2
159         vld1.64         {d6, d7}, [r1], r2
160         vext.8          d5,  d4,  d5,  #1
161         vext.8          d7,  d6,  d7,  #1
162
163 5:      pld             [r1]
164         subs            r3,  r3,  #2
165         vmull.u8        q8,  d4,  d0
166         vmlal.u8        q8,  d5,  d1
167         vld1.64         {d4, d5}, [r1], r2
168         vmull.u8        q9,  d6,  d0
169         vmlal.u8        q9,  d7,  d1
170         pld             [r1]
171         vext.8          d5,  d4,  d5,  #1
172         vrshrn.u16      d16, q8,  #6
173         vrshrn.u16      d17, q9,  #6
174 .ifc \type,avg
175         vld1.64         {d20}, [lr,:64], r2
176         vld1.64         {d21}, [lr,:64], r2
177         vrhadd.u8       q8,  q8,  q10
178 .endif
179         vld1.64         {d6, d7}, [r1], r2
180         vext.8          d7,  d6,  d7,  #1
181         vst1.64         {d16}, [r0,:64], r2
182         vst1.64         {d17}, [r0,:64], r2
183         bgt             5b
184
185         pop             {r4-r7, pc}
186         .endfunc
187         .endm
188
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
190         .macro  h264_chroma_mc4 type
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
192         push            {r4-r7, lr}
193         ldrd            r4,  [sp, #20]
194 .ifc \type,avg
195         mov             lr,  r0
196 .endif
197         pld             [r1]
198         pld             [r1, r2]
199
200         muls            r7,  r4,  r5
201         rsb             r6,  r7,  r5,  lsl #3
202         rsb             ip,  r7,  r4,  lsl #3
203         sub             r4,  r7,  r4,  lsl #3
204         sub             r4,  r4,  r5,  lsl #3
205         add             r4,  r4,  #64
206
207         beq             2f
208
209         add             r5,  r1,  r2
210
211         vdup.8          d0,  r4
212         lsl             r4,  r2,  #1
213         vdup.8          d1,  ip
214         vld1.64         {d4},     [r1], r4
215         vdup.8          d2,  r6
216         vld1.64         {d6},     [r5], r4
217         vdup.8          d3,  r7
218
219         vext.8          d5,  d4,  d5,  #1
220         vext.8          d7,  d6,  d7,  #1
221         vtrn.32         d4,  d5
222         vtrn.32         d6,  d7
223
224         vtrn.32         d0,  d1
225         vtrn.32         d2,  d3
226
227 1:      pld             [r5]
228         vmull.u8        q8,  d4,  d0
229         vmlal.u8        q8,  d6,  d2
230         vld1.64         {d4},     [r1], r4
231         vext.8          d5,  d4,  d5,  #1
232         vtrn.32         d4,  d5
233         vmull.u8        q9,  d6,  d0
234         vmlal.u8        q9,  d4,  d2
235         vld1.64         {d6},     [r5], r4
236         vadd.i16        d16, d16, d17
237         vadd.i16        d17, d18, d19
238         vrshrn.u16      d16, q8,  #6
239         subs            r3,  r3,  #2
240         pld             [r1]
241 .ifc \type,avg
242         vld1.32         {d20[0]}, [lr,:32], r2
243         vld1.32         {d20[1]}, [lr,:32], r2
244         vrhadd.u8       d16, d16, d20
245 .endif
246         vext.8          d7,  d6,  d7,  #1
247         vtrn.32         d6,  d7
248         vst1.32         {d16[0]}, [r0,:32], r2
249         vst1.32         {d16[1]}, [r0,:32], r2
250         bgt             1b
251
252         pop             {r4-r7, pc}
253
254 2:      tst             r6,  r6
255         add             ip,  ip,  r6
256         vdup.8          d0,  r4
257         vdup.8          d1,  ip
258         vtrn.32         d0,  d1
259
260         beq             4f
261
262         vext.32         d1,  d0,  d1,  #1
263         add             r5,  r1,  r2
264         lsl             r4,  r2,  #1
265         vld1.32         {d4[0]},  [r1], r4
266         vld1.32         {d4[1]},  [r5], r4
267
268 3:      pld             [r5]
269         vmull.u8        q8,  d4,  d0
270         vld1.32         {d4[0]},  [r1], r4
271         vmull.u8        q9,  d4,  d1
272         vld1.32         {d4[1]},  [r5], r4
273         vadd.i16        d16, d16, d17
274         vadd.i16        d17, d18, d19
275         vrshrn.u16      d16, q8,  #6
276 .ifc \type,avg
277         vld1.32         {d20[0]}, [lr,:32], r2
278         vld1.32         {d20[1]}, [lr,:32], r2
279         vrhadd.u8       d16, d16, d20
280 .endif
281         subs            r3,  r3,  #2
282         pld             [r1]
283         vst1.32         {d16[0]}, [r0,:32], r2
284         vst1.32         {d16[1]}, [r0,:32], r2
285         bgt             3b
286
287         pop             {r4-r7, pc}
288
289 4:      vld1.64         {d4},     [r1], r2
290         vld1.64         {d6},     [r1], r2
291         vext.8          d5,  d4,  d5,  #1
292         vext.8          d7,  d6,  d7,  #1
293         vtrn.32         d4,  d5
294         vtrn.32         d6,  d7
295
296 5:      vmull.u8        q8,  d4,  d0
297         vmull.u8        q9,  d6,  d0
298         subs            r3,  r3,  #2
299         vld1.64         {d4},     [r1], r2
300         vext.8          d5,  d4,  d5,  #1
301         vtrn.32         d4,  d5
302         vadd.i16        d16, d16, d17
303         vadd.i16        d17, d18, d19
304         pld             [r1]
305         vrshrn.u16      d16, q8,  #6
306 .ifc \type,avg
307         vld1.32         {d20[0]}, [lr,:32], r2
308         vld1.32         {d20[1]}, [lr,:32], r2
309         vrhadd.u8       d16, d16, d20
310 .endif
311         vld1.64         {d6},     [r1], r2
312         vext.8          d7,  d6,  d7,  #1
313         vtrn.32         d6,  d7
314         pld             [r1]
315         vst1.32         {d16[0]}, [r0,:32], r2
316         vst1.32         {d16[1]}, [r0,:32], r2
317         bgt             5b
318
319         pop             {r4-r7, pc}
320         .endfunc
321         .endm
322
323         .text
324         .align
325
326         h264_chroma_mc8 put
327         h264_chroma_mc8 avg
328         h264_chroma_mc4 put
329         h264_chroma_mc4 avg
330
331         /* H.264 loop filter */
332
333         .macro h264_loop_filter_start
334         ldr             ip,  [sp]
335         tst             r2,  r2
336         ldr             ip,  [ip]
337         tstne           r3,  r3
338         vmov.32         d24[0], ip
339         and             ip,  ip,  ip, lsl #16
340         bxeq            lr
341         ands            ip,  ip,  ip, lsl #8
342         bxlt            lr
343         .endm
344
345         .macro align_push_regs
346         and             ip,  sp,  #15
347         add             ip,  ip,  #32
348         sub             sp,  sp,  ip
349         vst1.64         {d12-d15}, [sp,:128]
350         sub             sp,  sp,  #32
351         vst1.64         {d8-d11},  [sp,:128]
352         .endm
353
354         .macro align_pop_regs
355         vld1.64         {d8-d11},  [sp,:128]!
356         vld1.64         {d12-d15}, [sp,:128], ip
357         .endm
358
359         .macro h264_loop_filter_luma
360         vdup.8          q11, r2         @ alpha
361         vmovl.u8        q12, d24
362         vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
363         vmovl.u16       q12, d24
364         vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
365         vsli.16         q12, q12, #8
366         vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
367         vsli.32         q12, q12, #16
368         vclt.u8         q6,  q6,  q11   @ < alpha
369         vdup.8          q11, r3         @ beta
370         vclt.s8         q7,  q12, #0
371         vclt.u8         q14, q14, q11   @ < beta
372         vclt.u8         q15, q15, q11   @ < beta
373         vbic            q6,  q6,  q7
374         vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
375         vand            q6,  q6,  q14
376         vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
377         vclt.u8         q4,  q4,  q11   @ < beta
378         vand            q6,  q6,  q15
379         vclt.u8         q5,  q5,  q11   @ < beta
380         vand            q4,  q4,  q6
381         vand            q5,  q5,  q6
382         vand            q12, q12, q6
383         vrhadd.u8       q14, q8,  q0
384         vsub.i8         q6,  q12, q4
385         vqadd.u8        q7,  q9,  q12
386         vhadd.u8        q10, q10, q14
387         vsub.i8         q6,  q6,  q5
388         vhadd.u8        q14, q2,  q14
389         vmin.u8         q7,  q7,  q10
390         vqsub.u8        q11, q9,  q12
391         vqadd.u8        q2,  q1,  q12
392         vmax.u8         q7,  q7,  q11
393         vqsub.u8        q11, q1,  q12
394         vmin.u8         q14, q2,  q14
395         vmovl.u8        q2,  d0
396         vmax.u8         q14, q14, q11
397         vmovl.u8        q10, d1
398         vsubw.u8        q2,  q2,  d16
399         vsubw.u8        q10, q10, d17
400         vshl.i16        q2,  q2,  #2
401         vshl.i16        q10, q10, #2
402         vaddw.u8        q2,  q2,  d18
403         vaddw.u8        q10, q10, d19
404         vsubw.u8        q2,  q2,  d2
405         vsubw.u8        q10, q10, d3
406         vrshrn.i16      d4,  q2,  #3
407         vrshrn.i16      d5,  q10, #3
408         vbsl            q4,  q7,  q9
409         vbsl            q5,  q14, q1
410         vneg.s8         q7,  q6
411         vmovl.u8        q14, d16
412         vmin.s8         q2,  q2,  q6
413         vmovl.u8        q6,  d17
414         vmax.s8         q2,  q2,  q7
415         vmovl.u8        q11, d0
416         vmovl.u8        q12, d1
417         vaddw.s8        q14, q14, d4
418         vaddw.s8        q6,  q6,  d5
419         vsubw.s8        q11, q11, d4
420         vsubw.s8        q12, q12, d5
421         vqmovun.s16     d16, q14
422         vqmovun.s16     d17, q6
423         vqmovun.s16     d0,  q11
424         vqmovun.s16     d1,  q12
425         .endm
426
427 function ff_h264_v_loop_filter_luma_neon, export=1
428         h264_loop_filter_start
429
430         vld1.64         {d0, d1},  [r0,:128], r1
431         vld1.64         {d2, d3},  [r0,:128], r1
432         vld1.64         {d4, d5},  [r0,:128], r1
433         sub             r0,  r0,  r1, lsl #2
434         sub             r0,  r0,  r1, lsl #1
435         vld1.64         {d20,d21}, [r0,:128], r1
436         vld1.64         {d18,d19}, [r0,:128], r1
437         vld1.64         {d16,d17}, [r0,:128], r1
438
439         align_push_regs
440
441         h264_loop_filter_luma
442
443         sub             r0,  r0,  r1, lsl #1
444         vst1.64         {d8, d9},  [r0,:128], r1
445         vst1.64         {d16,d17}, [r0,:128], r1
446         vst1.64         {d0, d1},  [r0,:128], r1
447         vst1.64         {d10,d11}, [r0,:128]
448
449         align_pop_regs
450         bx              lr
451         .endfunc
452
453 function ff_h264_h_loop_filter_luma_neon, export=1
454         h264_loop_filter_start
455
456         sub             r0,  r0,  #4
457         vld1.64         {d6},  [r0], r1
458         vld1.64         {d20}, [r0], r1
459         vld1.64         {d18}, [r0], r1
460         vld1.64         {d16}, [r0], r1
461         vld1.64         {d0},  [r0], r1
462         vld1.64         {d2},  [r0], r1
463         vld1.64         {d4},  [r0], r1
464         vld1.64         {d26}, [r0], r1
465         vld1.64         {d7},  [r0], r1
466         vld1.64         {d21}, [r0], r1
467         vld1.64         {d19}, [r0], r1
468         vld1.64         {d17}, [r0], r1
469         vld1.64         {d1},  [r0], r1
470         vld1.64         {d3},  [r0], r1
471         vld1.64         {d5},  [r0], r1
472         vld1.64         {d27}, [r0], r1
473
474         transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
475
476         align_push_regs
477
478         h264_loop_filter_luma
479
480         transpose_4x4   q4, q8, q0, q5
481
482         sub             r0,  r0,  r1, lsl #4
483         add             r0,  r0,  #2
484         vst1.32         {d8[0]},  [r0], r1
485         vst1.32         {d16[0]}, [r0], r1
486         vst1.32         {d0[0]},  [r0], r1
487         vst1.32         {d10[0]}, [r0], r1
488         vst1.32         {d8[1]},  [r0], r1
489         vst1.32         {d16[1]}, [r0], r1
490         vst1.32         {d0[1]},  [r0], r1
491         vst1.32         {d10[1]}, [r0], r1
492         vst1.32         {d9[0]},  [r0], r1
493         vst1.32         {d17[0]}, [r0], r1
494         vst1.32         {d1[0]},  [r0], r1
495         vst1.32         {d11[0]}, [r0], r1
496         vst1.32         {d9[1]},  [r0], r1
497         vst1.32         {d17[1]}, [r0], r1
498         vst1.32         {d1[1]},  [r0], r1
499         vst1.32         {d11[1]}, [r0], r1
500
501         align_pop_regs
502         bx              lr
503         .endfunc
504
505         .macro h264_loop_filter_chroma
506         vdup.8          d22, r2         @ alpha
507         vmovl.u8        q12, d24
508         vabd.u8         d26, d16, d0    @ abs(p0 - q0)
509         vmovl.u8        q2,  d0
510         vabd.u8         d28, d18, d16   @ abs(p1 - p0)
511         vsubw.u8        q2,  q2,  d16
512         vsli.16         d24, d24, #8
513         vshl.i16        q2,  q2,  #2
514         vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
515         vaddw.u8        q2,  q2,  d18
516         vclt.u8         d26, d26, d22   @ < alpha
517         vsubw.u8        q2,  q2,  d2
518         vdup.8          d22, r3         @ beta
519         vclt.s8         d25, d24, #0
520         vrshrn.i16      d4,  q2,  #3
521         vclt.u8         d28, d28, d22   @ < beta
522         vbic            d26, d26, d25
523         vclt.u8         d30, d30, d22   @ < beta
524         vand            d26, d26, d28
525         vneg.s8         d25, d24
526         vand            d26, d26, d30
527         vmin.s8         d4,  d4,  d24
528         vmovl.u8        q14, d16
529         vand            d4,  d4,  d26
530         vmax.s8         d4,  d4,  d25
531         vmovl.u8        q11, d0
532         vaddw.s8        q14, q14, d4
533         vsubw.s8        q11, q11, d4
534         vqmovun.s16     d16, q14
535         vqmovun.s16     d0,  q11
536         .endm
537
538 function ff_h264_v_loop_filter_chroma_neon, export=1
539         h264_loop_filter_start
540
541         sub             r0,  r0,  r1, lsl #1
542         vld1.64         {d18}, [r0,:64], r1
543         vld1.64         {d16}, [r0,:64], r1
544         vld1.64         {d0},  [r0,:64], r1
545         vld1.64         {d2},  [r0,:64]
546
547         h264_loop_filter_chroma
548
549         sub             r0,  r0,  r1, lsl #1
550         vst1.64         {d16}, [r0,:64], r1
551         vst1.64         {d0},  [r0,:64], r1
552
553         bx              lr
554         .endfunc
555
556 function ff_h264_h_loop_filter_chroma_neon, export=1
557         h264_loop_filter_start
558
559         sub             r0,  r0,  #2
560         vld1.32         {d18[0]}, [r0], r1
561         vld1.32         {d16[0]}, [r0], r1
562         vld1.32         {d0[0]},  [r0], r1
563         vld1.32         {d2[0]},  [r0], r1
564         vld1.32         {d18[1]}, [r0], r1
565         vld1.32         {d16[1]}, [r0], r1
566         vld1.32         {d0[1]},  [r0], r1
567         vld1.32         {d2[1]},  [r0], r1
568
569         vtrn.16         d18, d0
570         vtrn.16         d16, d2
571         vtrn.8          d18, d16
572         vtrn.8          d0,  d2
573
574         h264_loop_filter_chroma
575
576         vtrn.16         d18, d0
577         vtrn.16         d16, d2
578         vtrn.8          d18, d16
579         vtrn.8          d0,  d2
580
581         sub             r0,  r0,  r1, lsl #3
582         vst1.32         {d18[0]}, [r0], r1
583         vst1.32         {d16[0]}, [r0], r1
584         vst1.32         {d0[0]},  [r0], r1
585         vst1.32         {d2[0]},  [r0], r1
586         vst1.32         {d18[1]}, [r0], r1
587         vst1.32         {d16[1]}, [r0], r1
588         vst1.32         {d0[1]},  [r0], r1
589         vst1.32         {d2[1]},  [r0], r1
590
591         bx              lr
592         .endfunc
593
594         /* H.264 qpel MC */
595
596         .macro  lowpass_const r
597         movw            \r,  #5
598         movt            \r,  #20
599         vmov.32         d6[0], \r
600         .endm
601
602         .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
603 .if \narrow
604         t0 .req q0
605         t1 .req q8
606 .else
607         t0 .req \d0
608         t1 .req \d1
609 .endif
610         vext.8          d2,  \r0, \r1, #2
611         vext.8          d3,  \r0, \r1, #3
612         vaddl.u8        q1,  d2,  d3
613         vext.8          d4,  \r0, \r1, #1
614         vext.8          d5,  \r0, \r1, #4
615         vaddl.u8        q2,  d4,  d5
616         vext.8          d30, \r0, \r1, #5
617         vaddl.u8        t0,  \r0, d30
618         vext.8          d18, \r2, \r3, #2
619         vmla.i16        t0,  q1,  d6[1]
620         vext.8          d19, \r2, \r3, #3
621         vaddl.u8        q9,  d18, d19
622         vext.8          d20, \r2, \r3, #1
623         vmls.i16        t0,  q2,  d6[0]
624         vext.8          d21, \r2, \r3, #4
625         vaddl.u8        q10, d20, d21
626         vext.8          d31, \r2, \r3, #5
627         vaddl.u8        t1,  \r2, d31
628         vmla.i16        t1,  q9,  d6[1]
629         vmls.i16        t1,  q10, d6[0]
630 .if \narrow
631         vqrshrun.s16    \d0, t0,  #5
632         vqrshrun.s16    \d1, t1,  #5
633 .endif
634         .unreq  t0
635         .unreq  t1
636         .endm
637
638         .macro  lowpass_8_1 r0, r1, d0, narrow=1
639 .if \narrow
640         t0 .req q0
641 .else
642         t0 .req \d0
643 .endif
644         vext.8          d2,  \r0, \r1, #2
645         vext.8          d3,  \r0, \r1, #3
646         vaddl.u8        q1,  d2,  d3
647         vext.8          d4,  \r0, \r1, #1
648         vext.8          d5,  \r0, \r1, #4
649         vaddl.u8        q2,  d4,  d5
650         vext.8          d30, \r0, \r1, #5
651         vaddl.u8        t0,  \r0, d30
652         vmla.i16        t0,  q1,  d6[1]
653         vmls.i16        t0,  q2,  d6[0]
654 .if \narrow
655         vqrshrun.s16    \d0, t0,  #5
656 .endif
657         .unreq  t0
658         .endm
659
660         .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
661         vext.16         q1,  \r0, \r1, #2
662         vext.16         q0,  \r0, \r1, #3
663         vaddl.s16       q9,  d2,  d0
664         vext.16         q2,  \r0, \r1, #1
665         vaddl.s16       q1,  d3,  d1
666         vext.16         q3,  \r0, \r1, #4
667         vaddl.s16       q10, d4,  d6
668         vext.16         \r1, \r0, \r1, #5
669         vaddl.s16       q2,  d5,  d7
670         vaddl.s16       q0,  \h0, \h1
671         vaddl.s16       q8,  \l0, \l1
672
673         vshl.i32        q3,  q9,  #4
674         vshl.i32        q9,  q9,  #2
675         vshl.i32        q15, q10, #2
676         vadd.i32        q9,  q9,  q3
677         vadd.i32        q10, q10, q15
678
679         vshl.i32        q3,  q1,  #4
680         vshl.i32        q1,  q1,  #2
681         vshl.i32        q15, q2,  #2
682         vadd.i32        q1,  q1,  q3
683         vadd.i32        q2,  q2,  q15
684
685         vadd.i32        q9,  q9,  q8
686         vsub.i32        q9,  q9,  q10
687
688         vadd.i32        q1,  q1,  q0
689         vsub.i32        q1,  q1,  q2
690
691         vrshrn.s32      d18, q9,  #10
692         vrshrn.s32      d19, q1,  #10
693
694         vqmovun.s16     \d,  q9
695         .endm
696
697 function put_h264_qpel16_h_lowpass_neon_packed
698         mov             r4,  lr
699         mov             ip,  #16
700         mov             r3,  #8
701         bl              put_h264_qpel8_h_lowpass_neon
702         sub             r1,  r1,  r2, lsl #4
703         add             r1,  r1,  #8
704         mov             ip,  #16
705         mov             lr,  r4
706         b               put_h264_qpel8_h_lowpass_neon
707         .endfunc
708
709 function put_h264_qpel16_h_lowpass_neon
710         push            {lr}
711         mov             ip,  #16
712         bl              put_h264_qpel8_h_lowpass_neon
713         sub             r0,  r0,  r3, lsl #4
714         sub             r1,  r1,  r2, lsl #4
715         add             r0,  r0,  #8
716         add             r1,  r1,  #8
717         mov             ip,  #16
718         pop             {lr}
719         .endfunc
720
721 function put_h264_qpel8_h_lowpass_neon
722 1:      vld1.64         {d0, d1},  [r1], r2
723         vld1.64         {d16,d17}, [r1], r2
724         subs            ip,  ip,  #2
725         lowpass_8       d0,  d1,  d16, d17, d0,  d16
726         vst1.64         {d0},     [r0,:64], r3
727         vst1.64         {d16},    [r0,:64], r3
728         bne             1b
729         bx              lr
730         .endfunc
731
732 function put_h264_qpel16_h_lowpass_l2_neon
733         push            {lr}
734         mov             ip,  #16
735         bl              put_h264_qpel8_h_lowpass_l2_neon
736         sub             r0,  r0,  r2, lsl #4
737         sub             r1,  r1,  r2, lsl #4
738         sub             r3,  r3,  r2, lsl #4
739         add             r0,  r0,  #8
740         add             r1,  r1,  #8
741         add             r3,  r3,  #8
742         mov             ip,  #16
743         pop             {lr}
744         .endfunc
745
746 function put_h264_qpel8_h_lowpass_l2_neon
747 1:      vld1.64         {d0, d1},  [r1], r2
748         vld1.64         {d16,d17}, [r1], r2
749         vld1.64         {d28},     [r3], r2
750         vld1.64         {d29},     [r3], r2
751         subs            ip,  ip,  #2
752         lowpass_8       d0,  d1,  d16, d17, d0,  d1
753         vrhadd.u8       q0,  q0,  q14
754         vst1.64         {d0},      [r0,:64], r2
755         vst1.64         {d1},      [r0,:64], r2
756         bne             1b
757         bx              lr
758         .endfunc
759
760 function put_h264_qpel16_v_lowpass_neon_packed
761         mov             r4,  lr
762         mov             r2,  #8
763         bl              put_h264_qpel8_v_lowpass_neon
764         sub             r1,  r1,  r3, lsl #2
765         bl              put_h264_qpel8_v_lowpass_neon
766         sub             r1,  r1,  r3, lsl #4
767         sub             r1,  r1,  r3, lsl #2
768         add             r1,  r1,  #8
769         bl              put_h264_qpel8_v_lowpass_neon
770         sub             r1,  r1,  r3, lsl #2
771         mov             lr,  r4
772         b               put_h264_qpel8_v_lowpass_neon
773         .endfunc
774
775 function put_h264_qpel16_v_lowpass_neon
776         mov             r4,  lr
777         bl              put_h264_qpel8_v_lowpass_neon
778         sub             r1,  r1,  r3, lsl #2
779         bl              put_h264_qpel8_v_lowpass_neon
780         sub             r0,  r0,  r2, lsl #4
781         add             r0,  r0,  #8
782         sub             r1,  r1,  r3, lsl #4
783         sub             r1,  r1,  r3, lsl #2
784         add             r1,  r1,  #8
785         bl              put_h264_qpel8_v_lowpass_neon
786         sub             r1,  r1,  r3, lsl #2
787         mov             lr,  r4
788         .endfunc
789
790 function put_h264_qpel8_v_lowpass_neon
791         vld1.64         {d8},  [r1], r3
792         vld1.64         {d10}, [r1], r3
793         vld1.64         {d12}, [r1], r3
794         vld1.64         {d14}, [r1], r3
795         vld1.64         {d22}, [r1], r3
796         vld1.64         {d24}, [r1], r3
797         vld1.64         {d26}, [r1], r3
798         vld1.64         {d28}, [r1], r3
799         vld1.64         {d9},  [r1], r3
800         vld1.64         {d11}, [r1], r3
801         vld1.64         {d13}, [r1], r3
802         vld1.64         {d15}, [r1], r3
803         vld1.64         {d23}, [r1]
804
805         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
806         lowpass_8       d8,  d9,  d10, d11, d8,  d10
807         lowpass_8       d12, d13, d14, d15, d12, d14
808         lowpass_8       d22, d23, d24, d25, d22, d24
809         lowpass_8       d26, d27, d28, d29, d26, d28
810         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
811
812         vst1.64         {d8},  [r0,:64], r2
813         vst1.64         {d10}, [r0,:64], r2
814         vst1.64         {d12}, [r0,:64], r2
815         vst1.64         {d14}, [r0,:64], r2
816         vst1.64         {d22}, [r0,:64], r2
817         vst1.64         {d24}, [r0,:64], r2
818         vst1.64         {d26}, [r0,:64], r2
819         vst1.64         {d28}, [r0,:64], r2
820
821         bx              lr
822         .endfunc
823
824 function put_h264_qpel16_v_lowpass_l2_neon
825         mov             r4,  lr
826         bl              put_h264_qpel8_v_lowpass_l2_neon
827         sub             r1,  r1,  r3, lsl #2
828         bl              put_h264_qpel8_v_lowpass_l2_neon
829         sub             r0,  r0,  r3, lsl #4
830         sub             ip,  ip,  r2, lsl #4
831         add             r0,  r0,  #8
832         add             ip,  ip,  #8
833         sub             r1,  r1,  r3, lsl #4
834         sub             r1,  r1,  r3, lsl #2
835         add             r1,  r1,  #8
836         bl              put_h264_qpel8_v_lowpass_l2_neon
837         sub             r1,  r1,  r3, lsl #2
838         mov             lr,  r4
839         .endfunc
840
841 function put_h264_qpel8_v_lowpass_l2_neon
842         vld1.64         {d8},  [r1], r3
843         vld1.64         {d10}, [r1], r3
844         vld1.64         {d12}, [r1], r3
845         vld1.64         {d14}, [r1], r3
846         vld1.64         {d22}, [r1], r3
847         vld1.64         {d24}, [r1], r3
848         vld1.64         {d26}, [r1], r3
849         vld1.64         {d28}, [r1], r3
850         vld1.64         {d9},  [r1], r3
851         vld1.64         {d11}, [r1], r3
852         vld1.64         {d13}, [r1], r3
853         vld1.64         {d15}, [r1], r3
854         vld1.64         {d23}, [r1]
855
856         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
857         lowpass_8       d8,  d9,  d10, d11, d8,  d9
858         lowpass_8       d12, d13, d14, d15, d12, d13
859         lowpass_8       d22, d23, d24, d25, d22, d23
860         lowpass_8       d26, d27, d28, d29, d26, d27
861         transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
862
863         vld1.64         {d0},  [ip], r2
864         vld1.64         {d1},  [ip], r2
865         vld1.64         {d2},  [ip], r2
866         vld1.64         {d3},  [ip], r2
867         vld1.64         {d4},  [ip], r2
868         vrhadd.u8       q0,  q0,  q4
869         vld1.64         {d5},  [ip], r2
870         vrhadd.u8       q1,  q1,  q6
871         vld1.64         {d10}, [ip], r2
872         vrhadd.u8       q2,  q2,  q11
873         vld1.64         {d11}, [ip], r2
874
875         vst1.64         {d0},  [r0,:64], r3
876         vst1.64         {d1},  [r0,:64], r3
877         vrhadd.u8       q5,  q5,  q13
878         vst1.64         {d2},  [r0,:64], r3
879         vst1.64         {d3},  [r0,:64], r3
880         vst1.64         {d4},  [r0,:64], r3
881         vst1.64         {d5},  [r0,:64], r3
882         vst1.64         {d10}, [r0,:64], r3
883         vst1.64         {d11}, [r0,:64], r3
884
885         bx              lr
886         .endfunc
887
888 function put_h264_qpel8_hv_lowpass_neon_top
889         lowpass_const   ip
890         mov             ip,  #12
891 1:      vld1.64         {d0, d1},  [r1], r3
892         vld1.64         {d16,d17}, [r1], r3
893         subs            ip,  ip,  #2
894         lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
895         vst1.64         {d22-d25}, [r4,:128]!
896         bne             1b
897
898         vld1.64         {d0, d1},  [r1]
899         lowpass_8_1     d0,  d1,  q12, narrow=0
900
901         mov             ip,  #-16
902         add             r4,  r4,  ip
903         vld1.64         {d30,d31}, [r4,:128], ip
904         vld1.64         {d20,d21}, [r4,:128], ip
905         vld1.64         {d18,d19}, [r4,:128], ip
906         vld1.64         {d16,d17}, [r4,:128], ip
907         vld1.64         {d14,d15}, [r4,:128], ip
908         vld1.64         {d12,d13}, [r4,:128], ip
909         vld1.64         {d10,d11}, [r4,:128], ip
910         vld1.64         {d8, d9},  [r4,:128], ip
911         vld1.64         {d6, d7},  [r4,:128], ip
912         vld1.64         {d4, d5},  [r4,:128], ip
913         vld1.64         {d2, d3},  [r4,:128], ip
914         vld1.64         {d0, d1},  [r4,:128]
915
916         swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
917         transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
918
919         swap4           d17, d19, d21, d31, d24, d26, d28, d22
920         transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
921
922         vst1.64         {d30,d31}, [r4,:128]!
923         vst1.64         {d6, d7},  [r4,:128]!
924         vst1.64         {d20,d21}, [r4,:128]!
925         vst1.64         {d4, d5},  [r4,:128]!
926         vst1.64         {d18,d19}, [r4,:128]!
927         vst1.64         {d2, d3},  [r4,:128]!
928         vst1.64         {d16,d17}, [r4,:128]!
929         vst1.64         {d0, d1},  [r4,:128]
930
931         lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
932         lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
933         lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
934         lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
935
936         vld1.64         {d16,d17}, [r4,:128], ip
937         vld1.64         {d30,d31}, [r4,:128], ip
938         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
939         vld1.64         {d16,d17}, [r4,:128], ip
940         vld1.64         {d30,d31}, [r4,:128], ip
941         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
942         vld1.64         {d16,d17}, [r4,:128], ip
943         vld1.64         {d30,d31}, [r4,:128], ip
944         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
945         vld1.64         {d16,d17}, [r4,:128], ip
946         vld1.64         {d30,d31}, [r4,:128]
947         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
948
949         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
950
951         bx              lr
952         .endfunc
953
954 function put_h264_qpel8_hv_lowpass_neon
955         mov             r10, lr
956         bl              put_h264_qpel8_hv_lowpass_neon_top
957         vst1.64         {d12},     [r0,:64], r2
958         vst1.64         {d13},     [r0,:64], r2
959         vst1.64         {d14},     [r0,:64], r2
960         vst1.64         {d15},     [r0,:64], r2
961         vst1.64         {d8},      [r0,:64], r2
962         vst1.64         {d9},      [r0,:64], r2
963         vst1.64         {d10},     [r0,:64], r2
964         vst1.64         {d11},     [r0,:64], r2
965
966         mov             lr,  r10
967         bx              lr
968         .endfunc
969
970 function put_h264_qpel8_hv_lowpass_l2_neon
971         mov             r10, lr
972         bl              put_h264_qpel8_hv_lowpass_neon_top
973
974         vld1.64         {d0, d1},  [r2,:128]!
975         vld1.64         {d2, d3},  [r2,:128]!
976         vrhadd.u8       q0,  q0,  q6
977         vld1.64         {d4, d5},  [r2,:128]!
978         vrhadd.u8       q1,  q1,  q7
979         vld1.64         {d6, d7},  [r2,:128]!
980         vrhadd.u8       q2,  q2,  q4
981
982         vst1.64         {d0},      [r0,:64], r3
983         vrhadd.u8       q3,  q3,  q5
984         vst1.64         {d1},      [r0,:64], r3
985         vst1.64         {d2},      [r0,:64], r3
986         vst1.64         {d3},      [r0,:64], r3
987         vst1.64         {d4},      [r0,:64], r3
988         vst1.64         {d5},      [r0,:64], r3
989         vst1.64         {d6},      [r0,:64], r3
990         vst1.64         {d7},      [r0,:64], r3
991
992         mov             lr,  r10
993         bx              lr
994         .endfunc
995
996 function put_h264_qpel16_hv_lowpass_neon
997         mov             r9,  lr
998         bl              put_h264_qpel8_hv_lowpass_neon
999         sub             r1,  r1,  r3, lsl #2
1000         bl              put_h264_qpel8_hv_lowpass_neon
1001         sub             r1,  r1,  r3, lsl #4
1002         sub             r1,  r1,  r3, lsl #2
1003         add             r1,  r1,  #8
1004         sub             r0,  r0,  r2, lsl #4
1005         add             r0,  r0,  #8
1006         bl              put_h264_qpel8_hv_lowpass_neon
1007         sub             r1,  r1,  r3, lsl #2
1008         mov             lr,  r9
1009         b               put_h264_qpel8_hv_lowpass_neon
1010         .endfunc
1011
1012 function put_h264_qpel16_hv_lowpass_l2_neon
1013         mov             r9,  lr
1014         sub             r2,  r4,  #256
1015         bl              put_h264_qpel8_hv_lowpass_l2_neon
1016         sub             r1,  r1,  r3, lsl #2
1017         bl              put_h264_qpel8_hv_lowpass_l2_neon
1018         sub             r1,  r1,  r3, lsl #4
1019         sub             r1,  r1,  r3, lsl #2
1020         add             r1,  r1,  #8
1021         sub             r0,  r0,  r3, lsl #4
1022         add             r0,  r0,  #8
1023         bl              put_h264_qpel8_hv_lowpass_l2_neon
1024         sub             r1,  r1,  r3, lsl #2
1025         mov             lr,  r9
1026         b               put_h264_qpel8_hv_lowpass_l2_neon
1027         .endfunc
1028
1029 function ff_put_h264_qpel8_mc10_neon, export=1
1030         lowpass_const   r3
1031         mov             r3,  r1
1032         sub             r1,  r1,  #2
1033         mov             ip,  #8
1034         b               put_h264_qpel8_h_lowpass_l2_neon
1035         .endfunc
1036
1037 function ff_put_h264_qpel8_mc20_neon, export=1
1038         lowpass_const   r3
1039         sub             r1,  r1,  #2
1040         mov             r3,  r2
1041         mov             ip,  #8
1042         b               put_h264_qpel8_h_lowpass_neon
1043         .endfunc
1044
1045 function ff_put_h264_qpel8_mc30_neon, export=1
1046         lowpass_const   r3
1047         add             r3,  r1,  #1
1048         sub             r1,  r1,  #2
1049         mov             ip,  #8
1050         b               put_h264_qpel8_h_lowpass_l2_neon
1051         .endfunc
1052
1053 function ff_put_h264_qpel8_mc01_neon, export=1
1054         push            {lr}
1055         mov             ip,  r1
1056 put_h264_qpel8_mc01:
1057         lowpass_const   r3
1058         mov             r3,  r2
1059         sub             r1,  r1,  r2, lsl #1
1060         vpush           {d8-d15}
1061         bl              put_h264_qpel8_v_lowpass_l2_neon
1062         vpop            {d8-d15}
1063         pop             {pc}
1064         .endfunc
1065
1066 function ff_put_h264_qpel8_mc11_neon, export=1
1067         push            {r0, r1, r2, lr}
1068 put_h264_qpel8_mc11:
1069         lowpass_const   r3
1070         sub             sp,  sp,  #64
1071         mov             r0,  sp
1072         sub             r1,  r1,  #2
1073         mov             r3,  #8
1074         mov             ip,  #8
1075         vpush           {d8-d15}
1076         bl              put_h264_qpel8_h_lowpass_neon
1077         ldrd            r0,  [sp, #128]
1078         mov             r3,  r2
1079         add             ip,  sp,  #64
1080         sub             r1,  r1,  r2, lsl #1
1081         mov             r2,  #8
1082         bl              put_h264_qpel8_v_lowpass_l2_neon
1083         vpop            {d8-d15}
1084         add             sp,  sp,  #76
1085         pop             {pc}
1086         .endfunc
1087
1088 function ff_put_h264_qpel8_mc21_neon, export=1
1089         push            {r0, r1, r4, r10, r11, lr}
1090 put_h264_qpel8_mc21:
1091         lowpass_const   r3
1092         mov             r11, sp
1093         bic             sp,  sp,  #15
1094         sub             sp,  sp,  #(8*8+16*12)
1095         sub             r1,  r1,  #2
1096         mov             r3,  #8
1097         mov             r0,  sp
1098         mov             ip,  #8
1099         vpush           {d8-d15}
1100         bl              put_h264_qpel8_h_lowpass_neon
1101         mov             r4,  r0
1102         ldrd            r0,  [r11]
1103         sub             r1,  r1,  r2, lsl #1
1104         sub             r1,  r1,  #2
1105         mov             r3,  r2
1106         sub             r2,  r4,  #64
1107         bl              put_h264_qpel8_hv_lowpass_l2_neon
1108         vpop            {d8-d15}
1109         add             sp,  r11,  #8
1110         pop             {r4, r10, r11, pc}
1111         .endfunc
1112
1113 function ff_put_h264_qpel8_mc31_neon, export=1
1114         add             r1,  r1,  #1
1115         push            {r0, r1, r2, lr}
1116         sub             r1,  r1,  #1
1117         b               put_h264_qpel8_mc11
1118         .endfunc
1119
1120 function ff_put_h264_qpel8_mc02_neon, export=1
1121         push            {lr}
1122         lowpass_const   r3
1123         sub             r1,  r1,  r2, lsl #1
1124         mov             r3,  r2
1125         vpush           {d8-d15}
1126         bl              put_h264_qpel8_v_lowpass_neon
1127         vpop            {d8-d15}
1128         pop             {pc}
1129         .endfunc
1130
1131 function ff_put_h264_qpel8_mc12_neon, export=1
1132         push            {r0, r1, r4, r10, r11, lr}
1133 put_h264_qpel8_mc12:
1134         lowpass_const   r3
1135         mov             r11, sp
1136         bic             sp,  sp,  #15
1137         sub             sp,  sp,  #(8*8+16*12)
1138         sub             r1,  r1,  r2, lsl #1
1139         mov             r3,  r2
1140         mov             r2,  #8
1141         mov             r0,  sp
1142         vpush           {d8-d15}
1143         bl              put_h264_qpel8_v_lowpass_neon
1144         mov             r4,  r0
1145         ldrd            r0,  [r11]
1146         sub             r1,  r1,  r3, lsl #1
1147         sub             r1,  r1,  #2
1148         sub             r2,  r4,  #64
1149         bl              put_h264_qpel8_hv_lowpass_l2_neon
1150         vpop            {d8-d15}
1151         add             sp,  r11,  #8
1152         pop             {r4, r10, r11, pc}
1153         .endfunc
1154
1155 function ff_put_h264_qpel8_mc22_neon, export=1
1156         push            {r4, r10, r11, lr}
1157         mov             r11, sp
1158         bic             sp,  sp,  #15
1159         sub             r1,  r1,  r2, lsl #1
1160         sub             r1,  r1,  #2
1161         mov             r3,  r2
1162         sub             sp,  sp,  #(16*12)
1163         mov             r4,  sp
1164         vpush           {d8-d15}
1165         bl              put_h264_qpel8_hv_lowpass_neon
1166         vpop            {d8-d15}
1167         mov             sp,  r11
1168         pop             {r4, r10, r11, pc}
1169         .endfunc
1170
1171 function ff_put_h264_qpel8_mc32_neon, export=1
1172         push            {r0, r1, r4, r10, r11, lr}
1173         add             r1,  r1,  #1
1174         b               put_h264_qpel8_mc12
1175         .endfunc
1176
1177 function ff_put_h264_qpel8_mc03_neon, export=1
1178         push            {lr}
1179         add             ip,  r1,  r2
1180         b               put_h264_qpel8_mc01
1181         .endfunc
1182
1183 function ff_put_h264_qpel8_mc13_neon, export=1
1184         push            {r0, r1, r2, lr}
1185         add             r1,  r1,  r2
1186         b               put_h264_qpel8_mc11
1187         .endfunc
1188
1189 function ff_put_h264_qpel8_mc23_neon, export=1
1190         push            {r0, r1, r4, r10, r11, lr}
1191         add             r1,  r1,  r2
1192         b               put_h264_qpel8_mc21
1193         .endfunc
1194
1195 function ff_put_h264_qpel8_mc33_neon, export=1
1196         add             r1,  r1,  #1
1197         push            {r0, r1, r2, lr}
1198         add             r1,  r1,  r2
1199         sub             r1,  r1,  #1
1200         b               put_h264_qpel8_mc11
1201         .endfunc
1202
1203 function ff_put_h264_qpel16_mc10_neon, export=1
1204         lowpass_const   r3
1205         mov             r3,  r1
1206         sub             r1,  r1,  #2
1207         b               put_h264_qpel16_h_lowpass_l2_neon
1208         .endfunc
1209
1210 function ff_put_h264_qpel16_mc20_neon, export=1
1211         lowpass_const   r3
1212         sub             r1,  r1,  #2
1213         mov             r3,  r2
1214         b               put_h264_qpel16_h_lowpass_neon
1215         .endfunc
1216
1217 function ff_put_h264_qpel16_mc30_neon, export=1
1218         lowpass_const   r3
1219         add             r3,  r1,  #1
1220         sub             r1,  r1,  #2
1221         b               put_h264_qpel16_h_lowpass_l2_neon
1222         .endfunc
1223
1224 function ff_put_h264_qpel16_mc01_neon, export=1
1225         push            {r4, lr}
1226         mov             ip,  r1
1227 put_h264_qpel16_mc01:
1228         lowpass_const   r3
1229         mov             r3,  r2
1230         sub             r1,  r1,  r2, lsl #1
1231         vpush           {d8-d15}
1232         bl              put_h264_qpel16_v_lowpass_l2_neon
1233         vpop            {d8-d15}
1234         pop             {r4, pc}
1235         .endfunc
1236
1237 function ff_put_h264_qpel16_mc11_neon, export=1
1238         push            {r0, r1, r4, lr}
1239 put_h264_qpel16_mc11:
1240         lowpass_const   r3
1241         sub             sp,  sp,  #256
1242         mov             r0,  sp
1243         sub             r1,  r1,  #2
1244         mov             r3,  #16
1245         vpush           {d8-d15}
1246         bl              put_h264_qpel16_h_lowpass_neon
1247         add             r0,  sp,  #256
1248         ldrd            r0,  [r0, #64]
1249         mov             r3,  r2
1250         add             ip,  sp,  #64
1251         sub             r1,  r1,  r2, lsl #1
1252         mov             r2,  #16
1253         bl              put_h264_qpel16_v_lowpass_l2_neon
1254         vpop            {d8-d15}
1255         add             sp,  sp,  #(256+8)
1256         pop             {r4, pc}
1257         .endfunc
1258
1259 function ff_put_h264_qpel16_mc21_neon, export=1
1260         push            {r0, r1, r4-r5, r9-r11, lr}
1261 put_h264_qpel16_mc21:
1262         lowpass_const   r3
1263         mov             r11, sp
1264         bic             sp,  sp,  #15
1265         sub             sp,  sp,  #(16*16+16*12)
1266         sub             r1,  r1,  #2
1267         mov             r0,  sp
1268         vpush           {d8-d15}
1269         bl              put_h264_qpel16_h_lowpass_neon_packed
1270         mov             r4,  r0
1271         ldrd            r0,  [r11]
1272         sub             r1,  r1,  r2, lsl #1
1273         sub             r1,  r1,  #2
1274         mov             r3,  r2
1275         bl              put_h264_qpel16_hv_lowpass_l2_neon
1276         vpop            {d8-d15}
1277         add             sp,  r11,  #8
1278         pop             {r4-r5, r9-r11, pc}
1279         .endfunc
1280
1281 function ff_put_h264_qpel16_mc31_neon, export=1
1282         add             r1,  r1,  #1
1283         push            {r0, r1, r4, lr}
1284         sub             r1,  r1,  #1
1285         b               put_h264_qpel16_mc11
1286         .endfunc
1287
1288 function ff_put_h264_qpel16_mc02_neon, export=1
1289         push            {r4, lr}
1290         lowpass_const   r3
1291         sub             r1,  r1,  r2, lsl #1
1292         mov             r3,  r2
1293         vpush           {d8-d15}
1294         bl              put_h264_qpel16_v_lowpass_neon
1295         vpop            {d8-d15}
1296         pop             {r4, pc}
1297         .endfunc
1298
1299 function ff_put_h264_qpel16_mc12_neon, export=1
1300         push            {r0, r1, r4-r5, r9-r11, lr}
1301 put_h264_qpel16_mc12:
1302         lowpass_const   r3
1303         mov             r11, sp
1304         bic             sp,  sp,  #15
1305         sub             sp,  sp,  #(16*16+16*12)
1306         sub             r1,  r1,  r2, lsl #1
1307         mov             r0,  sp
1308         mov             r3,  r2
1309         vpush           {d8-d15}
1310         bl              put_h264_qpel16_v_lowpass_neon_packed
1311         mov             r4,  r0
1312         ldrd            r0,  [r11]
1313         sub             r1,  r1,  r3, lsl #1
1314         sub             r1,  r1,  #2
1315         mov             r2,  r3
1316         bl              put_h264_qpel16_hv_lowpass_l2_neon
1317         vpop            {d8-d15}
1318         add             sp,  r11,  #8
1319         pop             {r4-r5, r9-r11, pc}
1320         .endfunc
1321
1322 function ff_put_h264_qpel16_mc22_neon, export=1
1323         push            {r4, r9-r11, lr}
1324         lowpass_const   r3
1325         mov             r11, sp
1326         bic             sp,  sp,  #15
1327         sub             r1,  r1,  r2, lsl #1
1328         sub             r1,  r1,  #2
1329         mov             r3,  r2
1330         sub             sp,  sp,  #(16*12)
1331         mov             r4,  sp
1332         vpush           {d8-d15}
1333         bl              put_h264_qpel16_hv_lowpass_neon
1334         vpop            {d8-d15}
1335         mov             sp,  r11
1336         pop             {r4, r9-r11, pc}
1337         .endfunc
1338
1339 function ff_put_h264_qpel16_mc32_neon, export=1
1340         push            {r0, r1, r4-r5, r9-r11, lr}
1341         add             r1,  r1,  #1
1342         b               put_h264_qpel16_mc12
1343         .endfunc
1344
1345 function ff_put_h264_qpel16_mc03_neon, export=1
1346         push            {r4, lr}
1347         add             ip,  r1,  r2
1348         b               put_h264_qpel16_mc01
1349         .endfunc
1350
1351 function ff_put_h264_qpel16_mc13_neon, export=1
1352         push            {r0, r1, r4, lr}
1353         add             r1,  r1,  r2
1354         b               put_h264_qpel16_mc11
1355         .endfunc
1356
1357 function ff_put_h264_qpel16_mc23_neon, export=1
1358         push            {r0, r1, r4-r5, r9-r11, lr}
1359         add             r1,  r1,  r2
1360         b               put_h264_qpel16_mc21
1361         .endfunc
1362
1363 function ff_put_h264_qpel16_mc33_neon, export=1
1364         add             r1,  r1,  #1
1365         push            {r0, r1, r4, lr}
1366         add             r1,  r1,  r2
1367         sub             r1,  r1,  #1
1368         b               put_h264_qpel16_mc11
1369         .endfunc
1370
1371 @ Biweighted prediction
1372
1373         .macro  biweight_16 macs, macd
1374         vdup.8          d0,  r4
1375         vdup.8          d1,  r5
1376         vmov            q2,  q8
1377         vmov            q3,  q8
1378 1:      subs            ip,  ip,  #2
1379         vld1.8          {d20-d21},[r0,:128], r2
1380         \macd           q2,  d0,  d20
1381         pld             [r0]
1382         \macd           q3,  d0,  d21
1383         vld1.8          {d22-d23},[r1,:128], r2
1384         \macs           q2,  d1,  d22
1385         pld             [r1]
1386         \macs           q3,  d1,  d23
1387         vmov            q12, q8
1388         vld1.8          {d28-d29},[r0,:128], r2
1389         vmov            q13, q8
1390         \macd           q12, d0,  d28
1391         pld             [r0]
1392         \macd           q13, d0,  d29
1393         vld1.8          {d30-d31},[r1,:128], r2
1394         \macs           q12, d1,  d30
1395         pld             [r1]
1396         \macs           q13, d1,  d31
1397         vshl.s16        q2,  q2,  q9
1398         vshl.s16        q3,  q3,  q9
1399         vqmovun.s16     d4,  q2
1400         vqmovun.s16     d5,  q3
1401         vshl.s16        q12, q12, q9
1402         vshl.s16        q13, q13, q9
1403         vqmovun.s16     d24, q12
1404         vqmovun.s16     d25, q13
1405         vmov            q3,  q8
1406         vst1.8          {d4- d5}, [r6,:128], r2
1407         vmov            q2,  q8
1408         vst1.8          {d24-d25},[r6,:128], r2
1409         bne             1b
1410         pop             {r4-r6, pc}
1411         .endm
1412
1413         .macro  biweight_8 macs, macd
1414         vdup.8          d0,  r4
1415         vdup.8          d1,  r5
1416         vmov            q1,  q8
1417         vmov            q10, q8
1418 1:      subs            ip,  ip,  #2
1419         vld1.8          {d4},[r0,:64], r2
1420         \macd           q1,  d0,  d4
1421         pld             [r0]
1422         vld1.8          {d5},[r1,:64], r2
1423         \macs           q1,  d1,  d5
1424         pld             [r1]
1425         vld1.8          {d6},[r0,:64], r2
1426         \macd           q10, d0,  d6
1427         pld             [r0]
1428         vld1.8          {d7},[r1,:64], r2
1429         \macs           q10, d1,  d7
1430         pld             [r1]
1431         vshl.s16        q1,  q1,  q9
1432         vqmovun.s16     d2,  q1
1433         vshl.s16        q10, q10, q9
1434         vqmovun.s16     d4,  q10
1435         vmov            q10, q8
1436         vst1.8          {d2},[r6,:64], r2
1437         vmov            q1,  q8
1438         vst1.8          {d4},[r6,:64], r2
1439         bne             1b
1440         pop             {r4-r6, pc}
1441         .endm
1442
1443         .macro  biweight_4 macs, macd
1444         vdup.8          d0,  r4
1445         vdup.8          d1,  r5
1446         vmov            q1,  q8
1447         vmov            q10, q8
1448 1:      subs            ip,  ip,  #4
1449         vld1.32         {d4[0]},[r0,:32], r2
1450         vld1.32         {d4[1]},[r0,:32], r2
1451         \macd           q1,  d0,  d4
1452         pld             [r0]
1453         vld1.32         {d5[0]},[r1,:32], r2
1454         vld1.32         {d5[1]},[r1,:32], r2
1455         \macs           q1,  d1,  d5
1456         pld             [r1]
1457         blt             2f
1458         vld1.32         {d6[0]},[r0,:32], r2
1459         vld1.32         {d6[1]},[r0,:32], r2
1460         \macd           q10, d0,  d6
1461         pld             [r0]
1462         vld1.32         {d7[0]},[r1,:32], r2
1463         vld1.32         {d7[1]},[r1,:32], r2
1464         \macs           q10, d1,  d7
1465         pld             [r1]
1466         vshl.s16        q1,  q1,  q9
1467         vqmovun.s16     d2,  q1
1468         vshl.s16        q10, q10, q9
1469         vqmovun.s16     d4,  q10
1470         vmov            q10, q8
1471         vst1.32         {d2[0]},[r6,:32], r2
1472         vst1.32         {d2[1]},[r6,:32], r2
1473         vmov            q1,  q8
1474         vst1.32         {d4[0]},[r6,:32], r2
1475         vst1.32         {d4[1]},[r6,:32], r2
1476         bne             1b
1477         pop             {r4-r6, pc}
1478 2:      vshl.s16        q1,  q1,  q9
1479         vqmovun.s16     d2,  q1
1480         vst1.32         {d2[0]},[r6,:32], r2
1481         vst1.32         {d2[1]},[r6,:32], r2
1482         pop             {r4-r6, pc}
1483         .endm
1484
1485         .macro  biweight_func w
1486 function biweight_h264_pixels_\w\()_neon
1487         push            {r4-r6, lr}
1488         add             r4,  sp,  #16
1489         ldm             r4,  {r4-r6}
1490         lsr             lr,  r4,  #31
1491         add             r6,  r6,  #1
1492         eors            lr,  lr,  r5,  lsr #30
1493         orr             r6,  r6,  #1
1494         vdup.16         q9,  r3
1495         lsl             r6,  r6,  r3
1496         vmvn            q9,  q9
1497         vdup.16         q8,  r6
1498         mov             r6,  r0
1499         beq             10f
1500         subs            lr,  lr,  #1
1501         beq             20f
1502         subs            lr,  lr,  #1
1503         beq             30f
1504         b               40f
1505 10:     biweight_\w     vmlal.u8, vmlal.u8
1506 20:     rsb             r4,  r4,  #0
1507         biweight_\w     vmlal.u8, vmlsl.u8
1508 30:     rsb             r4,  r4,  #0
1509         rsb             r5,  r5,  #0
1510         biweight_\w     vmlsl.u8, vmlsl.u8
1511 40:     rsb             r5,  r5,  #0
1512         biweight_\w     vmlsl.u8, vmlal.u8
1513         .endfunc
1514         .endm
1515
1516         .macro  biweight_entry w, h, b=1
1517 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1518         mov             ip,  #\h
1519 .if \b
1520         b               biweight_h264_pixels_\w\()_neon
1521 .endif
1522         .endfunc
1523         .endm
1524
1525         biweight_entry  16, 8
1526         biweight_entry  16, 16, b=0
1527         biweight_func   16
1528
1529         biweight_entry  8,  16
1530         biweight_entry  8,  4
1531         biweight_entry  8,  8,  b=0
1532         biweight_func   8
1533
1534         biweight_entry  4,  8
1535         biweight_entry  4,  2
1536         biweight_entry  4,  4,  b=0
1537         biweight_func   4
1538
1539 @ Weighted prediction
1540
1541         .macro  weight_16 add
1542         vdup.8          d0,  r3
1543 1:      subs            ip,  ip,  #2
1544         vld1.8          {d20-d21},[r0,:128], r1
1545         vmull.u8        q2,  d0,  d20
1546         pld             [r0]
1547         vmull.u8        q3,  d0,  d21
1548         vld1.8          {d28-d29},[r0,:128], r1
1549         vmull.u8        q12, d0,  d28
1550         pld             [r0]
1551         vmull.u8        q13, d0,  d29
1552         \add            q2,  q8,  q2
1553         vrshl.s16       q2,  q2,  q9
1554         \add            q3,  q8,  q3
1555         vrshl.s16       q3,  q3,  q9
1556         vqmovun.s16     d4,  q2
1557         vqmovun.s16     d5,  q3
1558         \add            q12, q8,  q12
1559         vrshl.s16       q12, q12, q9
1560         \add            q13, q8,  q13
1561         vrshl.s16       q13, q13, q9
1562         vqmovun.s16     d24, q12
1563         vqmovun.s16     d25, q13
1564         vst1.8          {d4- d5}, [r4,:128], r1
1565         vst1.8          {d24-d25},[r4,:128], r1
1566         bne             1b
1567         pop             {r4, pc}
1568         .endm
1569
1570         .macro  weight_8 add
1571         vdup.8          d0,  r3
1572 1:      subs            ip,  ip,  #2
1573         vld1.8          {d4},[r0,:64], r1
1574         vmull.u8        q1,  d0,  d4
1575         pld             [r0]
1576         vld1.8          {d6},[r0,:64], r1
1577         vmull.u8        q10, d0,  d6
1578         \add            q1,  q8,  q1
1579         pld             [r0]
1580         vrshl.s16       q1,  q1,  q9
1581         vqmovun.s16     d2,  q1
1582         \add            q10, q8,  q10
1583         vrshl.s16       q10, q10, q9
1584         vqmovun.s16     d4,  q10
1585         vst1.8          {d2},[r4,:64], r1
1586         vst1.8          {d4},[r4,:64], r1
1587         bne             1b
1588         pop             {r4, pc}
1589         .endm
1590
1591         .macro  weight_4 add
1592         vdup.8          d0,  r3
1593         vmov            q1,  q8
1594         vmov            q10, q8
1595 1:      subs            ip,  ip,  #4
1596         vld1.32         {d4[0]},[r0,:32], r1
1597         vld1.32         {d4[1]},[r0,:32], r1
1598         vmull.u8        q1,  d0,  d4
1599         pld             [r0]
1600         blt             2f
1601         vld1.32         {d6[0]},[r0,:32], r1
1602         vld1.32         {d6[1]},[r0,:32], r1
1603         vmull.u8        q10, d0,  d6
1604         pld             [r0]
1605         \add            q1,  q8,  q1
1606         vrshl.s16       q1,  q1,  q9
1607         vqmovun.s16     d2,  q1
1608         \add            q10, q8,  q10
1609         vrshl.s16       q10, q10, q9
1610         vqmovun.s16     d4,  q10
1611         vmov            q10, q8
1612         vst1.32         {d2[0]},[r4,:32], r1
1613         vst1.32         {d2[1]},[r4,:32], r1
1614         vmov            q1,  q8
1615         vst1.32         {d4[0]},[r4,:32], r1
1616         vst1.32         {d4[1]},[r4,:32], r1
1617         bne             1b
1618         pop             {r4, pc}
1619 2:      \add            q1,  q8,  q1
1620         vrshl.s16       q1,  q1,  q9
1621         vqmovun.s16     d2,  q1
1622         vst1.32         {d2[0]},[r4,:32], r1
1623         vst1.32         {d2[1]},[r4,:32], r1
1624         pop             {r4, pc}
1625         .endm
1626
1627         .macro  weight_func w
1628 function weight_h264_pixels_\w\()_neon
1629         push            {r4, lr}
1630         ldr             r4,  [sp, #8]
1631         cmp             r2,  #1
1632         lsl             r4,  r4,  r2
1633         vdup.16         q8,  r4
1634         mov             r4,  r0
1635         ble             20f
1636         rsb             lr,  r2,  #1
1637         vdup.16         q9,  lr
1638         cmp             r3,  #0
1639         blt             10f
1640         weight_\w       vhadd.s16
1641 10:     rsb             r3,  r3,  #0
1642         weight_\w       vhsub.s16
1643 20:     rsb             lr,  r2,  #0
1644         vdup.16         q9,  lr
1645         cmp             r3,  #0
1646         blt             10f
1647         weight_\w       vadd.s16
1648 10:     rsb             r3,  r3,  #0
1649         weight_\w       vsub.s16
1650         .endfunc
1651         .endm
1652
1653         .macro  weight_entry w, h, b=1
1654 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1655         mov             ip,  #\h
1656 .if \b
1657         b               weight_h264_pixels_\w\()_neon
1658 .endif
1659         .endfunc
1660         .endm
1661
1662         weight_entry    16, 8
1663         weight_entry    16, 16, b=0
1664         weight_func     16
1665
1666         weight_entry    8,  16
1667         weight_entry    8,  4
1668         weight_entry    8,  8,  b=0
1669         weight_func     8
1670
1671         weight_entry    4,  8
1672         weight_entry    4,  2
1673         weight_entry    4,  4,  b=0
1674         weight_func     4