/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

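@ Transpose an 8x8 block of bytes held in eight d registers, using the
@ usual vtrn.32 / vtrn.16 / vtrn.8 butterfly stages.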
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
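/* Reference C for the bilinear interpolation below (illustration only,
 * not part of the build); A-D are the standard H.264 chroma MC weights,
 * matching the d0-d3 lanes set up from r4, ip, r6, r7:
 *
 *     A = (8 - x) * (8 - y);  B = x * (8 - y);
 *     C = (8 - x) * y;        D = x * y;
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < 8; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] +
 *                       32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 */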
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
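/* Same bilinear filter as chroma_mc8 on a 4-pixel-wide block; vtrn.32
 * interleaves the two horizontal taps (and the A/B, C/D weights) into
 * single d registers so each vmull covers both. */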
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

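/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */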
        .macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
.ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
.endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vrhadd.u8       d16, d16, d18
.endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
.ifc \type,put
        ldrh            r5,  [r1], r2
        strh            r5,  [r0], r2
        ldrh            r6,  [r1], r2
        strh            r6,  [r0], r2
.else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
.endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

        /* H.264 loop filter */
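/* The loop filter functions below take (pix, stride, alpha, beta, tc0)
 * and implement the standard H.264 in-loop deblocking filter: a line of
 * pixels straddling the edge is filtered only when (sketch, for
 * illustration only)
 *
 *     abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta
 */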

        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

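@ Filters two 8-pixel edges per invocation (one q register per pixel
@ line).  The delta applied to p0/q0 below corresponds to the spec
@ formula (illustrative form, not the build):
@     delta = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)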
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

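@ Chroma version of the filter above: one p1,p0|q0,q1 line per edge
@ (no p2/q2 taps), built around the same (q0 - p0) * 4 + (p1 - q1)
@ delta term.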
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */
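/* The qpel functions build on the 6-tap FIR (1, -5, 20, 20, -5, 1) from
 * the H.264 spec.  Reference C for one half-pel output (illustration
 * only, not part of the build):
 *
 *     dst[i] = av_clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] +
 *                             20*src[i+1] - 5*src[i+2] + src[i+3] +
 *                             16) >> 5);
 */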

        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

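@ lowpass_const packs the filter constants into d6: movw/movt set
@ \r = (20 << 16) | 5, so d6[0] = 5 and d6[1] = 20 as 16-bit lanes for
@ the vmla.i16/vmls.i16 by-scalar multiplies.  lowpass_8 below runs the
@ 6-tap filter horizontally over two 8-pixel rows at once.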
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

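@ lowpass_8.16: the same 6-tap kernel on 16-bit intermediates from a
@ first filter pass.  The 20*x and 5*x products are built from shifts
@ (16x + 4x and 4x + x), and the final vrshrn #10 folds the 5-bit
@ scaling of both passes into a single rounding shift.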
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

        .macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
.ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
.endif
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc
        .endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

        .macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
.ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
.endif
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc
        .endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

        .macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3
.endif

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc
        .endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

        .macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
.endif

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
        .endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc

        .macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3
.endif
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc
        .endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

        .macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
.ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3
.endif
        vst1.64         {d0},      [r0,:64], r3
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc
        .endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

        .macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
        .endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
        .endfunc
        .endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

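@ ff_*_h264_qpel8/16_mcXY: X and Y are the quarter-pel fractional
@ offsets of the motion vector.  Half-pel positions use the 6-tap
@ filter directly; quarter-pel positions average the filtered result
@ with a neighbouring reference via the _l2 helpers.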
1225         .macro h264_qpel8 type
1226 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1227         lowpass_const   r3
1228         mov             r3,  r1
1229         sub             r1,  r1,  #2
1230         mov             ip,  #8
1231         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1232         .endfunc
1233
1234 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1235         lowpass_const   r3
1236         sub             r1,  r1,  #2
1237         mov             r3,  r2
1238         mov             ip,  #8
1239         b               \type\()_h264_qpel8_h_lowpass_neon
1240         .endfunc
1241
1242 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1243         lowpass_const   r3
1244         add             r3,  r1,  #1
1245         sub             r1,  r1,  #2
1246         mov             ip,  #8
1247         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1248         .endfunc
1249
1250 function ff_\type\()_h264_qpel8_mc01_neon, export=1
1251         push            {lr}
1252         mov             ip,  r1
1253 \type\()_h264_qpel8_mc01:
1254         lowpass_const   r3
1255         mov             r3,  r2
1256         sub             r1,  r1,  r2, lsl #1
1257         vpush           {d8-d15}
1258         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1259         vpop            {d8-d15}
1260         pop             {pc}
1261         .endfunc
1262
1263 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1264         push            {r0, r1, r11, lr}
1265 \type\()_h264_qpel8_mc11:
1266         lowpass_const   r3
1267         mov             r11, sp
1268         bic             sp,  sp,  #15
1269         sub             sp,  sp,  #64
1270         mov             r0,  sp
1271         sub             r1,  r1,  #2
1272         mov             r3,  #8
1273         mov             ip,  #8
1274         vpush           {d8-d15}
1275         bl              put_h264_qpel8_h_lowpass_neon
1276         ldrd            r0,  [r11]
1277         mov             r3,  r2
1278         add             ip,  sp,  #64
1279         sub             r1,  r1,  r2, lsl #1
1280         mov             r2,  #8
1281         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1282         vpop            {d8-d15}
1283         add             sp,  r11, #8
1284         pop             {r11, pc}
1285         .endfunc
1286
1287 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1288         push            {r0, r1, r4, r10, r11, lr}
1289 \type\()_h264_qpel8_mc21:
1290         lowpass_const   r3
1291         mov             r11, sp
1292         bic             sp,  sp,  #15
1293         sub             sp,  sp,  #(8*8+16*12)
1294         sub             r1,  r1,  #2
1295         mov             r3,  #8
1296         mov             r0,  sp
1297         mov             ip,  #8
1298         vpush           {d8-d15}
1299         bl              put_h264_qpel8_h_lowpass_neon
1300         mov             r4,  r0
1301         ldrd            r0,  [r11]
1302         sub             r1,  r1,  r2, lsl #1
1303         sub             r1,  r1,  #2
1304         mov             r3,  r2
1305         sub             r2,  r4,  #64
1306         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1307         vpop            {d8-d15}
1308         add             sp,  r11,  #8
1309         pop             {r4, r10, r11, pc}
1310         .endfunc
1311
1312 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1313         add             r1,  r1,  #1
1314         push            {r0, r1, r11, lr}
1315         sub             r1,  r1,  #1
1316         b               \type\()_h264_qpel8_mc11
1317         .endfunc
1318
1319 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1320         push            {lr}
1321         lowpass_const   r3
1322         sub             r1,  r1,  r2, lsl #1
1323         mov             r3,  r2
1324         vpush           {d8-d15}
1325         bl              \type\()_h264_qpel8_v_lowpass_neon
1326         vpop            {d8-d15}
1327         pop             {pc}
1328         .endfunc
1329
1330 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1331         push            {r0, r1, r4, r10, r11, lr}
1332 \type\()_h264_qpel8_mc12:
1333         lowpass_const   r3
1334         mov             r11, sp
1335         bic             sp,  sp,  #15
1336         sub             sp,  sp,  #(8*8+16*12)
1337         sub             r1,  r1,  r2, lsl #1
1338         mov             r3,  r2
1339         mov             r2,  #8
1340         mov             r0,  sp
1341         vpush           {d8-d15}
1342         bl              put_h264_qpel8_v_lowpass_neon
1343         mov             r4,  r0
1344         ldrd            r0,  [r11]
1345         sub             r1,  r1,  r3, lsl #1
1346         sub             r1,  r1,  #2
1347         sub             r2,  r4,  #64
1348         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1349         vpop            {d8-d15}
1350         add             sp,  r11,  #8
1351         pop             {r4, r10, r11, pc}
1352         .endfunc
1353
1354 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1355         push            {r4, r10, r11, lr}
1356         mov             r11, sp
1357         bic             sp,  sp,  #15
1358         sub             r1,  r1,  r2, lsl #1
1359         sub             r1,  r1,  #2
1360         mov             r3,  r2
1361         sub             sp,  sp,  #(16*12)
1362         mov             r4,  sp
1363         vpush           {d8-d15}
1364         bl              \type\()_h264_qpel8_hv_lowpass_neon
1365         vpop            {d8-d15}
1366         mov             sp,  r11
1367         pop             {r4, r10, r11, pc}
1368         .endfunc
1369
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1371         push            {r0, r1, r4, r10, r11, lr}
1372         add             r1,  r1,  #1
1373         b               \type\()_h264_qpel8_mc12
1374         .endfunc
1375
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1377         push            {lr}
1378         add             ip,  r1,  r2
1379         b               \type\()_h264_qpel8_mc01
1380         .endfunc
1381
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1383         push            {r0, r1, r11, lr}
1384         add             r1,  r1,  r2
1385         b               \type\()_h264_qpel8_mc11
1386         .endfunc
1387
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1389         push            {r0, r1, r4, r10, r11, lr}
1390         add             r1,  r1,  r2
1391         b               \type\()_h264_qpel8_mc21
1392         .endfunc
1393
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1395         add             r1,  r1,  #1
1396         push            {r0, r1, r11, lr}
1397         add             r1,  r1,  r2
1398         sub             r1,  r1,  #1
1399         b               \type\()_h264_qpel8_mc11
1400         .endfunc
1401         .endm
1402
1403         h264_qpel8 put
1404         h264_qpel8 avg
1405
1406         .macro h264_qpel16 type
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1408         lowpass_const   r3
1409         mov             r3,  r1
1410         sub             r1,  r1,  #2
1411         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1412         .endfunc
1413
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1415         lowpass_const   r3
1416         sub             r1,  r1,  #2
1417         mov             r3,  r2
1418         b               \type\()_h264_qpel16_h_lowpass_neon
1419         .endfunc
1420
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1422         lowpass_const   r3
1423         add             r3,  r1,  #1
1424         sub             r1,  r1,  #2
1425         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1426         .endfunc
1427
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1429         push            {r4, lr}
1430         mov             ip,  r1
1431 \type\()_h264_qpel16_mc01:
1432         lowpass_const   r3
1433         mov             r3,  r2
1434         sub             r1,  r1,  r2, lsl #1
1435         vpush           {d8-d15}
1436         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1437         vpop            {d8-d15}
1438         pop             {r4, pc}
1439         .endfunc
1440
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1442         push            {r0, r1, r4, r11, lr}
1443 \type\()_h264_qpel16_mc11:
1444         lowpass_const   r3
1445         mov             r11, sp
1446         bic             sp,  sp,  #15
1447         sub             sp,  sp,  #256
1448         mov             r0,  sp
1449         sub             r1,  r1,  #2
1450         mov             r3,  #16
1451         vpush           {d8-d15}
1452         bl              put_h264_qpel16_h_lowpass_neon
1453         ldrd            r0,  [r11]
1454         mov             r3,  r2
1455         add             ip,  sp,  #64
1456         sub             r1,  r1,  r2, lsl #1
1457         mov             r2,  #16
1458         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1459         vpop            {d8-d15}
1460         add             sp,  r11, #8
1461         pop             {r4, r11, pc}
1462         .endfunc
1463
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
        .endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

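@ mc12/mc32: same scheme as mc21, but the packed first pass is the
@ vertical lowpass.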
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
        .endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               \type\()_h264_qpel16_mc01
        .endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
        .endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
        .endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
        .endfunc
        .endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction

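/* biweight_h264_pixels_WxH(uint8_t *dst, uint8_t *src, int stride,
 *                          int log2_denom, int weightd, int weights,
 *                          int offset)
 *
 * dst = clip8((dst * weightd + src * weights + rounding) >> (log2_denom + 1))
 */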
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

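@ biweight_4 handles four rows per iteration; 4x2 blocks drive ip
@ negative on the first subs and take the early exit at 2: after two
@ rows.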
        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
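        @ lr encodes the sign combination of weightd (r4) and weights (r5);
        @ negative weights are negated below and the matching vmlal/vmlsl
        @ pair selected.  q8 = ((offset + 1) | 1) << log2_denom and
        @ q9 = ~log2_denom = -(log2_denom + 1), so the vshl.s16 in the
        @ block macros is a right shift by log2_denom + 1.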
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm

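@ Size-specific entry points: ip = row count.  The b=0 variants omit
@ the branch and fall straight through into biweight_func, which is
@ instantiated immediately after them.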
        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

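/* weight_h264_pixels_WxH(uint8_t *block, int stride, int log2_denom,
 *                        int weight, int offset)
 *
 * In place: block = clip8((block * weight + (offset << log2_denom) +
 * rounding) >> log2_denom)
 */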
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

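@ weight_4: four rows per iteration, with the same 4x2 early exit at 2:
@ as biweight_4 above.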
        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

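@ weight_func dispatches on log2_denom (r2) and the sign of the weight
@ (r3).  For log2_denom > 1 the vhadd/vhsub halving add folds one bit
@ of the shift into the offset add (q9 = 1 - log2_denom), presumably to
@ keep the 16-bit intermediates in range; log2_denom <= 1 uses plain
@ vadd/vsub with q9 = -log2_denom.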
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm

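@ Same entry-point scheme as the biweight functions above: ip = row
@ count, b=0 falls through into weight_func.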
        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4