]> git.sesse.net Git - ffmpeg/blob - libavcodec/arm/h264dsp_neon.S
Merge remote-tracking branch 'qatar/master'
[ffmpeg] / libavcodec / arm / h264dsp_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "asm.S"
22
/*
 * transpose_8x8 r0..r7
 *
 * In-place byte transpose of an 8x8 block held in eight registers, done as
 * three rounds of VTRN: 32-bit elements between rN and rN+4, 16-bit elements
 * between rN and rN+2, then 8-bit elements between neighbouring registers.
 * This file invokes it with d registers (one 8x8 block) and with q registers;
 * in the q form VTRN only pairs elements inside the same 64-bit half, so the
 * low and high halves are transposed as two independent 8x8 blocks.
 * Clobbers nothing besides its eight operands.
 */
23         .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24         vtrn.32         \r0, \r4
25         vtrn.32         \r1, \r5
26         vtrn.32         \r2, \r6
27         vtrn.32         \r3, \r7
28         vtrn.16         \r0, \r2
29         vtrn.16         \r1, \r3
30         vtrn.16         \r4, \r6
31         vtrn.16         \r5, \r7
32         vtrn.8          \r0, \r1
33         vtrn.8          \r2, \r3
34         vtrn.8          \r4, \r5
35         vtrn.8          \r6, \r7
36         .endm
37
/*
 * transpose_4x4 r0..r3
 *
 * In-place byte transpose of 4x4 blocks spread over four registers: a 16-bit
 * VTRN between rN and rN+2 followed by an 8-bit VTRN between neighbours.
 * Every aligned 32-bit group of the operands is handled as its own 4x4 block.
 */
38         .macro transpose_4x4 r0 r1 r2 r3
39         vtrn.16         \r0, \r2
40         vtrn.16         \r1, \r3
41         vtrn.8          \r0, \r1
42         vtrn.8          \r2, \r3
43         .endm
44
/*
 * swap4 r0..r7
 *
 * Exchanges the contents of r0..r3 with r4..r7 pairwise (r0<->r4, r1<->r5,
 * r2<->r6, r3<->r7).  No other register is touched.
 */
45         .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46         vswp            \r0, \r4
47         vswp            \r1, \r5
48         vswp            \r2, \r6
49         vswp            \r3, \r7
50         .endm
51
/*
 * transpose16_4x4 r0..r7
 *
 * In-place transpose of 4x4 blocks of 16-bit elements.  The operands form
 * two independent groups, r0..r3 and r4..r7; each group gets a 32-bit VTRN
 * between rN and rN+2 followed by a 16-bit VTRN between neighbours.
 */
52         .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53         vtrn.32         \r0, \r2
54         vtrn.32         \r1, \r3
55         vtrn.32         \r4, \r6
56         vtrn.32         \r5, \r7
57         vtrn.16         \r0, \r1
58         vtrn.16         \r2, \r3
59         vtrn.16         \r4, \r5
60         vtrn.16         \r6, \r7
61         .endm
62
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/*
 * h264_chroma_mc8 type
 *
 * Emits ff_<type>_h264_chroma_mc8_neon, an 8-pixel-wide bilinear chroma
 * interpolation.  type is "put" (store the result) or "avg" (rounding
 * average of the result with the bytes already in dst).
 *
 * In:    r0 = dst, r1 = src, r2 = stride, r3 = h, [sp] = x, [sp+4] = y
 * Uses:  r4-r7 and lr (saved/restored), ip, d0-d7, q8-q10, flags
 *
 * Weights computed from x and y:
 *   A = (8-x)*(8-y) -> r4    B = x*(8-y) -> ip
 *   C = (8-x)*y     -> r6    D = x*y     -> r7
 * Each output pixel is (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6.
 * Every loop produces two rows per iteration and exits once h - 2*n <= 0.
 * Three code paths exist: x*y != 0 (label 1), x == 0 with y != 0 (label 3),
 * and y == 0 (label 5, which also covers x == y == 0 as a weight-64 copy).
 */
64         .macro  h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
66         push            {r4-r7, lr}
/* Five registers were pushed, so x and y now sit at sp+20 and sp+24.
 * The second destination register is written out explicitly (UAL form);
 * the encoding is the same as the single-register shorthand. */
67         ldrd            r4,  r5,  [sp, #20]
68 .ifc \type,avg
/* avg reads dst through lr while r0 is used for the stores. */
69         mov             lr,  r0
70 .endif
71         pld             [r1]
72         pld             [r1, r2]
73
/* r7 = D = x*y, with Z set when the product is zero.  The A/T prefixes are
 * defined outside this file (presumably ARM-only / Thumb-only selection). */
74 A       muls            r7,  r4,  r5
75 T       mul             r7,  r4,  r5
76 T       cmp             r7,  #0
/* None of the following five instructions updates the flags. */
77         rsb             r6,  r7,  r5,  lsl #3
78         rsb             ip,  r7,  r4,  lsl #3
79         sub             r4,  r7,  r4,  lsl #3
80         sub             r4,  r4,  r5,  lsl #3
81         add             r4,  r4,  #64
82
/* x*y == 0: at most one direction needs filtering. */
83         beq             2f
84
/* Full 2-D case.  r1 walks the even rows, r5 the odd rows, both stepping by
 * 2*stride (r4 is reused for that step once A has been broadcast to d0). */
85         add             r5,  r1,  r2
86
87         vdup.8          d0,  r4
88         lsl             r4,  r2,  #1
89         vdup.8          d1,  ip
90         vld1.64         {d4, d5}, [r1], r4
91         vdup.8          d2,  r6
92         vld1.64         {d6, d7}, [r5], r4
93         vdup.8          d3,  r7
94
/* d5 / d7 become the rows shifted left by one pixel (s[1..8]). */
95         vext.8          d5,  d4,  d5,  #1
96         vext.8          d7,  d6,  d7,  #1
97
/* q8 accumulates the output row built from rows n and n+1, q9 the one built
 * from rows n+1 and n+2; the loads for the next iteration are interleaved. */
98 1:      pld             [r5]
99         vmull.u8        q8,  d4,  d0
100         vmlal.u8        q8,  d5,  d1
101         vld1.64         {d4, d5}, [r1], r4
102         vmlal.u8        q8,  d6,  d2
103         vext.8          d5,  d4,  d5,  #1
104         vmlal.u8        q8,  d7,  d3
105         vmull.u8        q9,  d6,  d0
106         subs            r3,  r3,  #2
107         vmlal.u8        q9,  d7,  d1
108         vmlal.u8        q9,  d4,  d2
109         vmlal.u8        q9,  d5,  d3
110         vrshrn.u16      d16, q8,  #6
111         vld1.64         {d6, d7}, [r5], r4
112         pld             [r1]
113         vrshrn.u16      d17, q9,  #6
114 .ifc \type,avg
115         vld1.64         {d20}, [lr,:64], r2
116         vld1.64         {d21}, [lr,:64], r2
117         vrhadd.u8       q8,  q8,  q10
118 .endif
119         vext.8          d7,  d6,  d7,  #1
120         vst1.64         {d16}, [r0,:64], r2
121         vst1.64         {d17}, [r0,:64], r2
122         bgt             1b
123
124         pop             {r4-r7, pc}
125
/* x*y == 0.  With D == 0, r6 = 8*y and ip = 8*x, so their sum is the single
 * remaining second tap; the flags from tst survive until the beq below. */
126 2:      tst             r6,  r6
127         add             ip,  ip,  r6
128         vdup.8          d0,  r4
129         vdup.8          d1,  ip
130
/* y == 0: horizontal-only path. */
131         beq             4f
132
/* Vertical-only path (x == 0): two taps, one row apart. */
133         add             r5,  r1,  r2
134         lsl             r4,  r2,  #1
135         vld1.64         {d4}, [r1], r4
136         vld1.64         {d6}, [r5], r4
137
138 3:      pld             [r5]
139         vmull.u8        q8,  d4,  d0
140         vmlal.u8        q8,  d6,  d1
141         vld1.64         {d4}, [r1], r4
142         vmull.u8        q9,  d6,  d0
143         vmlal.u8        q9,  d4,  d1
144         vld1.64         {d6}, [r5], r4
145         vrshrn.u16      d16, q8,  #6
146         vrshrn.u16      d17, q9,  #6
147 .ifc \type,avg
148         vld1.64         {d20}, [lr,:64], r2
149         vld1.64         {d21}, [lr,:64], r2
150         vrhadd.u8       q8,  q8,  q10
151 .endif
152         subs            r3,  r3,  #2
153         pld             [r1]
154         vst1.64         {d16}, [r0,:64], r2
155         vst1.64         {d17}, [r0,:64], r2
156         bgt             3b
157
158         pop             {r4-r7, pc}
159
/* Horizontal-only path (y == 0): two taps, one pixel apart, rows are
 * independent so r1 simply advances one stride per load. */
160 4:      vld1.64         {d4, d5}, [r1], r2
161         vld1.64         {d6, d7}, [r1], r2
162         vext.8          d5,  d4,  d5,  #1
163         vext.8          d7,  d6,  d7,  #1
164
165 5:      pld             [r1]
166         subs            r3,  r3,  #2
167         vmull.u8        q8,  d4,  d0
168         vmlal.u8        q8,  d5,  d1
169         vld1.64         {d4, d5}, [r1], r2
170         vmull.u8        q9,  d6,  d0
171         vmlal.u8        q9,  d7,  d1
172         pld             [r1]
173         vext.8          d5,  d4,  d5,  #1
174         vrshrn.u16      d16, q8,  #6
175         vrshrn.u16      d17, q9,  #6
176 .ifc \type,avg
177         vld1.64         {d20}, [lr,:64], r2
178         vld1.64         {d21}, [lr,:64], r2
179         vrhadd.u8       q8,  q8,  q10
180 .endif
181         vld1.64         {d6, d7}, [r1], r2
182         vext.8          d7,  d6,  d7,  #1
183         vst1.64         {d16}, [r0,:64], r2
184         vst1.64         {d17}, [r0,:64], r2
185         bgt             5b
186
187         pop             {r4-r7, pc}
188 endfunc
189         .endm
190
191 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/*
 * h264_chroma_mc4 type
 *
 * Emits ff_<type>_h264_chroma_mc4_neon, the 4-pixel-wide bilinear chroma
 * interpolation.  type is "put" (store) or "avg" (rounding average with the
 * bytes already in dst).
 *
 * In:    r0 = dst, r1 = src, r2 = stride, r3 = h, [sp] = x, [sp+4] = y
 * Uses:  r4-r7 and lr (saved/restored), ip, d0-d7, q8-q10, flags
 *
 * Weights: A = (8-x)*(8-y) -> r4, B = x*(8-y) -> ip, C = (8-x)*y -> r6,
 * D = x*y -> r7.  Because only four pixels are needed per row, two operands
 * are packed per d register with vtrn.32 (pixels | pixels shifted by one,
 * and likewise weight pairs); the two 4x16-bit partial sums are then added
 * before the rounding narrow by 6.  Two rows are produced per iteration and
 * each loop exits once h - 2*n <= 0.
 */
192         .macro  h264_chroma_mc4 type
193 function ff_\type\()_h264_chroma_mc4_neon, export=1
194         push            {r4-r7, lr}
/* Five registers were pushed, so x and y now sit at sp+20 and sp+24.
 * The second destination register is written out explicitly (UAL form);
 * the encoding is the same as the single-register shorthand. */
195         ldrd            r4,  r5,  [sp, #20]
196 .ifc \type,avg
/* avg reads dst through lr while r0 is used for the stores. */
197         mov             lr,  r0
198 .endif
199         pld             [r1]
200         pld             [r1, r2]
201
/* r7 = D = x*y, with Z set when the product is zero.  The A/T prefixes are
 * defined outside this file (presumably ARM-only / Thumb-only selection). */
202 A       muls            r7,  r4,  r5
203 T       mul             r7,  r4,  r5
204 T       cmp             r7,  #0
/* None of the following five instructions updates the flags. */
205         rsb             r6,  r7,  r5,  lsl #3
206         rsb             ip,  r7,  r4,  lsl #3
207         sub             r4,  r7,  r4,  lsl #3
208         sub             r4,  r4,  r5,  lsl #3
209         add             r4,  r4,  #64
210
/* x*y == 0: at most one direction needs filtering. */
211         beq             2f
212
/* Full 2-D case.  r1 walks the even rows, r5 the odd rows, step 2*stride. */
213         add             r5,  r1,  r2
214
215         vdup.8          d0,  r4
216         lsl             r4,  r2,  #1
217         vdup.8          d1,  ip
218         vld1.64         {d4},     [r1], r4
219         vdup.8          d2,  r6
220         vld1.64         {d6},     [r5], r4
221         vdup.8          d3,  r7
222
/* After vext + vtrn.32 the low word of d4 / d6 holds s[0..3] and the high
 * word holds s[1..4]; the stale bytes pulled in from d5 / d7 land only in
 * d5 / d7, which are never multiplied. */
223         vext.8          d5,  d4,  d5,  #1
224         vext.8          d7,  d6,  d7,  #1
225         vtrn.32         d4,  d5
226         vtrn.32         d6,  d7
227
/* d0 = A | B and d2 = C | D, matching the pixel packing above. */
228         vtrn.32         d0,  d1
229         vtrn.32         d2,  d3
230
231 1:      pld             [r5]
232         vmull.u8        q8,  d4,  d0
233         vmlal.u8        q8,  d6,  d2
234         vld1.64         {d4},     [r1], r4
235         vext.8          d5,  d4,  d5,  #1
236         vtrn.32         d4,  d5
237         vmull.u8        q9,  d6,  d0
238         vmlal.u8        q9,  d4,  d2
239         vld1.64         {d6},     [r5], r4
/* Fold the two halves of each accumulator: d16 = first row, d17 = second. */
240         vadd.i16        d16, d16, d17
241         vadd.i16        d17, d18, d19
242         vrshrn.u16      d16, q8,  #6
243         subs            r3,  r3,  #2
244         pld             [r1]
245 .ifc \type,avg
246         vld1.32         {d20[0]}, [lr,:32], r2
247         vld1.32         {d20[1]}, [lr,:32], r2
248         vrhadd.u8       d16, d16, d20
249 .endif
250         vext.8          d7,  d6,  d7,  #1
251         vtrn.32         d6,  d7
252         vst1.32         {d16[0]}, [r0,:32], r2
253         vst1.32         {d16[1]}, [r0,:32], r2
254         bgt             1b
255
256         pop             {r4-r7, pc}
257
/* x*y == 0.  With D == 0, r6 = 8*y and ip = 8*x, so their sum is the single
 * remaining second tap.  d0 becomes first-tap | second-tap; the flags from
 * tst survive until the beq below. */
258 2:      tst             r6,  r6
259         add             ip,  ip,  r6
260         vdup.8          d0,  r4
261         vdup.8          d1,  ip
262         vtrn.32         d0,  d1
263
/* y == 0: horizontal-only path. */
264         beq             4f
265
/* Vertical-only path (x == 0).  d1 gets the two taps in swapped order so
 * that the same d4 (two consecutive rows) can feed both output rows. */
266         vext.32         d1,  d0,  d1,  #1
267         add             r5,  r1,  r2
268         lsl             r4,  r2,  #1
269         vld1.32         {d4[0]},  [r1], r4
270         vld1.32         {d4[1]},  [r5], r4
271
272 3:      pld             [r5]
273         vmull.u8        q8,  d4,  d0
274         vld1.32         {d4[0]},  [r1], r4
275         vmull.u8        q9,  d4,  d1
276         vld1.32         {d4[1]},  [r5], r4
277         vadd.i16        d16, d16, d17
278         vadd.i16        d17, d18, d19
279         vrshrn.u16      d16, q8,  #6
280 .ifc \type,avg
281         vld1.32         {d20[0]}, [lr,:32], r2
282         vld1.32         {d20[1]}, [lr,:32], r2
283         vrhadd.u8       d16, d16, d20
284 .endif
285         subs            r3,  r3,  #2
286         pld             [r1]
287         vst1.32         {d16[0]}, [r0,:32], r2
288         vst1.32         {d16[1]}, [r0,:32], r2
289         bgt             3b
290
291         pop             {r4-r7, pc}
292
/* Horizontal-only path (y == 0): rows are independent, r1 advances one
 * stride per load. */
293 4:      vld1.64         {d4},     [r1], r2
294         vld1.64         {d6},     [r1], r2
295         vext.8          d5,  d4,  d5,  #1
296         vext.8          d7,  d6,  d7,  #1
297         vtrn.32         d4,  d5
298         vtrn.32         d6,  d7
299
300 5:      vmull.u8        q8,  d4,  d0
301         vmull.u8        q9,  d6,  d0
302         subs            r3,  r3,  #2
303         vld1.64         {d4},     [r1], r2
304         vext.8          d5,  d4,  d5,  #1
305         vtrn.32         d4,  d5
306         vadd.i16        d16, d16, d17
307         vadd.i16        d17, d18, d19
308         pld             [r1]
309         vrshrn.u16      d16, q8,  #6
310 .ifc \type,avg
311         vld1.32         {d20[0]}, [lr,:32], r2
312         vld1.32         {d20[1]}, [lr,:32], r2
313         vrhadd.u8       d16, d16, d20
314 .endif
315         vld1.64         {d6},     [r1], r2
316         vext.8          d7,  d6,  d7,  #1
317         vtrn.32         d6,  d7
318         pld             [r1]
319         vst1.32         {d16[0]}, [r0,:32], r2
320         vst1.32         {d16[1]}, [r0,:32], r2
321         bgt             5b
322
323         pop             {r4-r7, pc}
324 endfunc
325         .endm
326
/*
 * h264_chroma_mc2 type
 *
 * Emits ff_<type>_h264_chroma_mc2_neon, the 2-pixel-wide chroma
 * interpolation; type is "put" or "avg".  The register use matches the
 * mc8/mc4 variants in this file: r0 = dst, r1 = src, r2 = stride, r3 = h,
 * with x and y read from the stack (sp+16 and sp+20 after four registers
 * have been pushed).
 *
 * Uses:  r4-r6 and lr (saved/restored), r12, q0-q3, q8, d18, flags
 *
 * When x and y are both zero no filtering is done (label 2): "put" copies
 * two bytes per row, "avg" forms the rounding average of src and dst.
 * Otherwise the four bilinear weights (8-x)(8-y), x(8-y), (8-x)y and xy are
 * computed and two rows are produced per iteration.
 */
327         .macro  h264_chroma_mc2 type
328 function ff_\type\()_h264_chroma_mc2_neon, export=1
329         push            {r4-r6, lr}
330         ldr             r4,  [sp, #16]
331         ldr             lr,  [sp, #20]
332         pld             [r1]
333         pld             [r1, r2]
/* Z is set only when x == 0 and y == 0. */
334         orrs            r5,  r4,  lr
335         beq             2f
336
/* r5 = x*y, r6 = (8-x)*y, r12 = x*(8-y), r4 = (8-x)*(8-y). */
337         mul             r5,  r4,  lr
338         rsb             r6,  r5,  lr,  lsl #3
339         rsb             r12, r5,  r4,  lsl #3
340         sub             r4,  r5,  r4,  lsl #3
341         sub             r4,  r4,  lr,  lsl #3
342         add             r4,  r4,  #64
343         vdup.8          d0,  r4
344         vdup.8          d2,  r12
345         vdup.8          d1,  r6
346         vdup.8          d3,  r5
/* Interleave the weights at 16-bit granularity to match the pixel layout
 * produced by the vext/vtrn.16 pair inside the loop. */
347         vtrn.16         q0,  q1
348 1:
/* Gather rows n, n+1 into d4 and rows n+1, n+2 into d5.  The third load does
 * not post-increment, so r1 advances by two rows per iteration. */
349         vld1.32         {d4[0]},  [r1], r2
350         vld1.32         {d4[1]},  [r1], r2
351         vrev64.32       d5,  d4
352         vld1.32         {d5[1]},  [r1]
353         vext.8          q3,  q2,  q2,  #1
354         vtrn.16         q2,  q3
355         vmull.u8        q8,  d4,  d0
356         vmlal.u8        q8,  d5,  d1
357 .ifc \type,avg
/* Fetch the two existing dst rows, then step r0 back for the stores. */
358         vld1.16         {d18[0]}, [r0,:16], r2
359         vld1.16         {d18[1]}, [r0,:16]
360         sub             r0,  r0,  r2
361 .endif
/* Add the partial sums, then round and narrow by 6 bits. */
362         vtrn.32         d16, d17
363         vadd.i16        d16, d16, d17
364         vrshrn.u16      d16, q8,  #6
365 .ifc \type,avg
366         vrhadd.u8       d16, d16, d18
367 .endif
368         vst1.16         {d16[0]}, [r0,:16], r2
369         vst1.16         {d16[1]}, [r0,:16], r2
370         subs            r3,  r3,  #2
371         bgt             1b
372         pop             {r4-r6, pc}
373 2:
/* x == y == 0: no interpolation, two rows per iteration. */
374 .ifc \type,put
/* ldrh_post / strh_post are helper macros defined outside this file
 * (presumably halfword load/store with post-increment by the last operand). */
375         ldrh_post       r5,  r1,  r2
376         strh_post       r5,  r0,  r2
377         ldrh_post       r6,  r1,  r2
378         strh_post       r6,  r0,  r2
379 .else
380         vld1.16         {d16[0]}, [r1], r2
381         vld1.16         {d16[1]}, [r1], r2
382         vld1.16         {d18[0]}, [r0,:16], r2
383         vld1.16         {d18[1]}, [r0,:16]
384         sub             r0,  r0,  r2
385         vrhadd.u8       d16, d16, d18
386         vst1.16         {d16[0]}, [r0,:16], r2
387         vst1.16         {d16[1]}, [r0,:16], r2
388 .endif
389         subs            r3,  r3,  #2
390         bgt             2b
391         pop             {r4-r6, pc}
392 endfunc
393 .endm
394
/* Code section.  Instantiate the put and avg flavours of the 8-, 4- and
 * 2-pixel-wide chroma motion compensation functions defined by the macros
 * above (six exported functions in total). */
395         .text
396         .align
397
398         h264_chroma_mc8 put
399         h264_chroma_mc8 avg
400         h264_chroma_mc4 put
401         h264_chroma_mc4 avg
402         h264_chroma_mc2 put
403         h264_chroma_mc2 avg
404
405         /* H.264 loop filter */
406
/*
 * h264_loop_filter_start
 *
 * Common entry check for the loop filter functions.  It must be the first
 * thing in the function: it reads the first stack argument at [sp] and
 * returns straight through lr, so nothing may have been pushed yet.
 *
 * In:    r2 = alpha, r3 = beta, [sp] = pointer to four signed bytes
 *        (the per-4-pixel clipping values, presumably tc0)
 * Out:   d24[0] = the four bytes, packed in one 32-bit lane
 * Uses:  ip, flags
 *
 * Returns to the caller immediately when alpha == 0 or beta == 0, and also
 * when all four clipping bytes are negative (nothing to filter).
 */
407         .macro h264_loop_filter_start
408         ldr             ip,  [sp]
409         tst             r2,  r2
410         ldr             ip,  [ip]
/* Z ends up set when alpha == 0, or when alpha != 0 and beta == 0. */
411         it              ne
412         tstne           r3,  r3
413         vmov.32         d24[0], ip
/* Non-flag-setting AND: the alpha/beta result in Z is still intact. */
414         and             ip,  ip,  ip, lsl #16
415         it              eq
416         bxeq            lr
/* Bit 31 of ip is now the AND of the four sign bits.  Test N alone (mi):
 * ANDS and TST leave V untouched, so a signed "lt" test (N != V) would
 * depend on whatever V value the caller happened to leave behind. */
417         ands            ip,  ip,  ip, lsl #8
418         it              mi
419         bxmi            lr
420         .endm
421
/*
 * h264_loop_filter_luma
 *
 * Luma deblocking filter for 16 pixel positions across one edge.
 *
 * In:    q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
 *        r2 = alpha, r3 = beta, d24[0] = four clipping bytes
 * Out:   q4 = filtered p1, q8 = filtered p0, q0 = filtered q0,
 *        q5 = filtered q1
 * Uses:  q2, q4-q7, q10-q12, q14, q15 as scratch.  q4-q7 overlap d8-d15,
 *        which is why the functions using this macro vpush/vpop them.
 *
 * The four clipping bytes are first widened so that each one covers four
 * consecutive byte lanes.  A lane is filtered only if |p0-q0| < alpha,
 * |p1-p0| < beta, |q1-q0| < beta and its clipping value is not negative.
 */
422         .macro h264_loop_filter_luma
423         vdup.8          q11, r2         @ alpha
424         vmovl.u8        q12, d24
425         vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
426         vmovl.u16       q12, d24
427         vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
428         vsli.16         q12, q12, #8
429         vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
430         vsli.32         q12, q12, #16
431         vclt.u8         q6,  q6,  q11   @ < alpha
432         vdup.8          q11, r3         @ beta
433         vclt.s8         q7,  q12, #0
434         vclt.u8         q14, q14, q11   @ < beta
435         vclt.u8         q15, q15, q11   @ < beta
436         vbic            q6,  q6,  q7
437         vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
438         vand            q6,  q6,  q14
439         vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
440         vclt.u8         q4,  q4,  q11   @ < beta
441         vand            q6,  q6,  q15
442         vclt.u8         q5,  q5,  q11   @ < beta
/* q6 = overall filter mask; q4 / q5 = masks for modifying p1 / q1;
 * q12 = clipping value, zeroed in lanes that are not filtered. */
443         vand            q4,  q4,  q6
444         vand            q5,  q5,  q6
445         vand            q12, q12, q6
446         vrhadd.u8       q14, q8,  q0
/* Subtracting the all-ones masks adds one to the clip bound per side whose
 * p2 / q2 test passed; q6 ends up as the bound for the p0/q0 delta. */
447         vsub.i8         q6,  q12, q4
448         vqadd.u8        q7,  q9,  q12
449         vhadd.u8        q10, q10, q14
450         vsub.i8         q6,  q6,  q5
451         vhadd.u8        q14, q2,  q14
/* Candidate p1 (q7) and q1 (q14): clamped to within the clipping value of
 * the original p1 / q1. */
452         vmin.u8         q7,  q7,  q10
453         vqsub.u8        q11, q9,  q12
454         vqadd.u8        q2,  q1,  q12
455         vmax.u8         q7,  q7,  q11
456         vqsub.u8        q11, q1,  q12
457         vmin.u8         q14, q2,  q14
/* Delta for p0/q0 in 16-bit precision: ((q0 - p0)*4 + p1 - q1 + 4) >> 3. */
458         vmovl.u8        q2,  d0
459         vmax.u8         q14, q14, q11
460         vmovl.u8        q10, d1
461         vsubw.u8        q2,  q2,  d16
462         vsubw.u8        q10, q10, d17
463         vshl.i16        q2,  q2,  #2
464         vshl.i16        q10, q10, #2
465         vaddw.u8        q2,  q2,  d18
466         vaddw.u8        q10, q10, d19
467         vsubw.u8        q2,  q2,  d2
468         vsubw.u8        q10, q10, d3
469         vrshrn.i16      d4,  q2,  #3
470         vrshrn.i16      d5,  q10, #3
/* Select the new p1 / q1 where their masks are set, else keep the input. */
471         vbsl            q4,  q7,  q9
472         vbsl            q5,  q14, q1
/* Clamp the delta to [-bound, +bound], then apply it: p0 += delta and
 * q0 -= delta, with unsigned saturation on the way back to bytes. */
473         vneg.s8         q7,  q6
474         vmovl.u8        q14, d16
475         vmin.s8         q2,  q2,  q6
476         vmovl.u8        q6,  d17
477         vmax.s8         q2,  q2,  q7
478         vmovl.u8        q11, d0
479         vmovl.u8        q12, d1
480         vaddw.s8        q14, q14, d4
481         vaddw.s8        q6,  q6,  d5
482         vsubw.s8        q11, q11, d4
483         vsubw.s8        q12, q12, d5
484         vqmovun.s16     d16, q14
485         vqmovun.s16     d17, q6
486         vqmovun.s16     d0,  q11
487         vqmovun.s16     d1,  q12
488         .endm
489
/*
 * ff_h264_v_loop_filter_luma_neon
 *
 * Filters a horizontal luma edge, 16 pixels wide (rows above and below the
 * edge are the filter taps).
 *
 * In:    r0 = address of the first row below the edge (q0 row), accessed
 *             with a 128-bit alignment hint
 *        r1 = stride, r2 = alpha, r3 = beta, [sp] = clipping byte pointer
 * Uses:  ip, q0-q2, q4-q15 (d8-d15 are saved and restored), flags
 */
490 function ff_h264_v_loop_filter_luma_neon, export=1
491         h264_loop_filter_start
492
/* Rows q0, q1, q2 starting at r0 ... */
493         vld1.64         {d0, d1},  [r0,:128], r1
494         vld1.64         {d2, d3},  [r0,:128], r1
495         vld1.64         {d4, d5},  [r0,:128], r1
/* ... then rewind six rows (three read plus three above the edge) and load
 * p2, p1, p0.  r0 is back at the q0 row afterwards. */
496         sub             r0,  r0,  r1, lsl #2
497         sub             r0,  r0,  r1, lsl #1
498         vld1.64         {d20,d21}, [r0,:128], r1
499         vld1.64         {d18,d19}, [r0,:128], r1
500         vld1.64         {d16,d17}, [r0,:128], r1
501
/* The filter macro uses q4-q7 (d8-d15) as scratch and output. */
502         vpush           {d8-d15}
503
504         h264_loop_filter_luma
505
/* Store p1, p0, q0, q1 starting two rows above the edge. */
506         sub             r0,  r0,  r1, lsl #1
507         vst1.64         {d8, d9},  [r0,:128], r1
508         vst1.64         {d16,d17}, [r0,:128], r1
509         vst1.64         {d0, d1},  [r0,:128], r1
510         vst1.64         {d10,d11}, [r0,:128]
511
512         vpop            {d8-d15}
513         bx              lr
514 endfunc
515
/*
 * ff_h264_h_loop_filter_luma_neon
 *
 * Filters a vertical luma edge, 16 rows tall.  Eight bytes straddling the
 * edge are read from each row, transposed so the filter can work on whole
 * registers, and the four modified columns are transposed back and stored.
 *
 * In:    r0 = address of the first pixel to the right of the edge
 *        r1 = stride, r2 = alpha, r3 = beta, [sp] = clipping byte pointer
 * Uses:  ip, q0-q15 (d8-d15 are saved and restored), flags
 */
516 function ff_h264_h_loop_filter_luma_neon, export=1
517         h264_loop_filter_start
518
/* Start four pixels left of the edge; rows 0-7 fill the low halves of the
 * q registers and rows 8-15 the high halves. */
519         sub             r0,  r0,  #4
520         vld1.64         {d6},  [r0], r1
521         vld1.64         {d20}, [r0], r1
522         vld1.64         {d18}, [r0], r1
523         vld1.64         {d16}, [r0], r1
524         vld1.64         {d0},  [r0], r1
525         vld1.64         {d2},  [r0], r1
526         vld1.64         {d4},  [r0], r1
527         vld1.64         {d26}, [r0], r1
528         vld1.64         {d7},  [r0], r1
529         vld1.64         {d21}, [r0], r1
530         vld1.64         {d19}, [r0], r1
531         vld1.64         {d17}, [r0], r1
532         vld1.64         {d1},  [r0], r1
533         vld1.64         {d3},  [r0], r1
534         vld1.64         {d5},  [r0], r1
535         vld1.64         {d27}, [r0], r1
536
/* After the transpose the registers hold columns: q3 = p3, q10 = p2,
 * q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2, q13 = q3. */
537         transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
538
/* The filter macro uses q4-q7 (d8-d15) as scratch and output. */
539         vpush           {d8-d15}
540
541         h264_loop_filter_luma
542
/* Turn the four result columns (p1, p0, q0, q1) back into per-row words. */
543         transpose_4x4   q4, q8, q0, q5
544
/* Rewind the 16 rows read above and move to two pixels left of the edge
 * (r0 was at edge-4), then write four bytes per row. */
545         sub             r0,  r0,  r1, lsl #4
546         add             r0,  r0,  #2
547         vst1.32         {d8[0]},  [r0], r1
548         vst1.32         {d16[0]}, [r0], r1
549         vst1.32         {d0[0]},  [r0], r1
550         vst1.32         {d10[0]}, [r0], r1
551         vst1.32         {d8[1]},  [r0], r1
552         vst1.32         {d16[1]}, [r0], r1
553         vst1.32         {d0[1]},  [r0], r1
554         vst1.32         {d10[1]}, [r0], r1
555         vst1.32         {d9[0]},  [r0], r1
556         vst1.32         {d17[0]}, [r0], r1
557         vst1.32         {d1[0]},  [r0], r1
558         vst1.32         {d11[0]}, [r0], r1
559         vst1.32         {d9[1]},  [r0], r1
560         vst1.32         {d17[1]}, [r0], r1
561         vst1.32         {d1[1]},  [r0], r1
562         vst1.32         {d11[1]}, [r0], r1
563
564         vpop            {d8-d15}
565         bx              lr
566 endfunc
567
/*
 * h264_loop_filter_chroma
 *
 * Chroma deblocking filter for 8 pixel positions across one edge.  Only p0
 * and q0 are modified.
 *
 * In:    d18 = p1, d16 = p0, d0 = q0, d2 = q1
 *        r2 = alpha, r3 = beta, d24[0] = four clipping bytes
 * Out:   d16 = filtered p0, d0 = filtered q0
 * Uses:  q2, q11-q15 as scratch (no callee-saved NEON registers)
 *
 * The four clipping bytes are widened so that each one covers two
 * consecutive byte lanes.  The delta ((q0 - p0)*4 + p1 - q1 + 4) >> 3 is
 * clamped to [-clip, +clip] and zeroed in lanes that fail any of
 * |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta.
 */
568         .macro h264_loop_filter_chroma
569         vdup.8          d22, r2         @ alpha
570         vmovl.u8        q12, d24
571         vabd.u8         d26, d16, d0    @ abs(p0 - q0)
572         vmovl.u8        q2,  d0
573         vabd.u8         d28, d18, d16   @ abs(p1 - p0)
574         vsubw.u8        q2,  q2,  d16
575         vsli.16         d24, d24, #8
576         vshl.i16        q2,  q2,  #2
577         vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
578         vaddw.u8        q2,  q2,  d18
579         vclt.u8         d26, d26, d22   @ < alpha
580         vsubw.u8        q2,  q2,  d2
581         vdup.8          d22, r3         @ beta
582         vrshrn.i16      d4,  q2,  #3
583         vclt.u8         d28, d28, d22   @ < beta
584         vclt.u8         d30, d30, d22   @ < beta
/* Clamp the delta to [-clip, +clip] and combine the three masks. */
585         vmin.s8         d4,  d4,  d24
586         vneg.s8         d25, d24
587         vand            d26, d26, d28
588         vmax.s8         d4,  d4,  d25
589         vand            d26, d26, d30
590         vmovl.u8        q11, d0
591         vand            d4,  d4,  d26
592         vmovl.u8        q14, d16
/* p0 += delta, q0 -= delta, saturated back to unsigned bytes. */
593         vaddw.s8        q14, q14, d4
594         vsubw.s8        q11, q11, d4
595         vqmovun.s16     d16, q14
596         vqmovun.s16     d0,  q11
597         .endm
598
/*
 * ff_h264_v_loop_filter_chroma_neon
 *
 * Filters a horizontal chroma edge, 8 pixels wide.
 *
 * In:    r0 = address of the first row below the edge (q0 row), accessed
 *             with a 64-bit alignment hint
 *        r1 = stride, r2 = alpha, r3 = beta, [sp] = clipping byte pointer
 * Uses:  ip, d0, d2, q2, q8, q9, q11-q15, flags
 */
599 function ff_h264_v_loop_filter_chroma_neon, export=1
600         h264_loop_filter_start
601
/* Load p1, p0, q0, q1 starting two rows above the edge; the last load does
 * not post-increment, so r0 ends on the q1 row. */
602         sub             r0,  r0,  r1, lsl #1
603         vld1.64         {d18}, [r0,:64], r1
604         vld1.64         {d16}, [r0,:64], r1
605         vld1.64         {d0},  [r0,:64], r1
606         vld1.64         {d2},  [r0,:64]
607
608         h264_loop_filter_chroma
609
/* Step back from the q1 row to the p0 row and store p0, q0. */
610         sub             r0,  r0,  r1, lsl #1
611         vst1.64         {d16}, [r0,:64], r1
612         vst1.64         {d0},  [r0,:64], r1
613
614         bx              lr
615 endfunc
616
/*
 * ff_h264_h_loop_filter_chroma_neon
 *
 * Filters a vertical chroma edge, 8 rows tall.  Four bytes straddling the
 * edge are read per row, transposed into column registers, filtered, then
 * transposed back and written out.
 *
 * In:    r0 = address of the first pixel to the right of the edge
 *        r1 = stride, r2 = alpha, r3 = beta, [sp] = clipping byte pointer
 * Uses:  ip, d0, d2, q2, q8, q9, q11-q15, flags
 */
617 function ff_h264_h_loop_filter_chroma_neon, export=1
618         h264_loop_filter_start
619
/* Start two pixels left of the edge and read one 32-bit word per row. */
620         sub             r0,  r0,  #2
621         vld1.32         {d18[0]}, [r0], r1
622         vld1.32         {d16[0]}, [r0], r1
623         vld1.32         {d0[0]},  [r0], r1
624         vld1.32         {d2[0]},  [r0], r1
625         vld1.32         {d18[1]}, [r0], r1
626         vld1.32         {d16[1]}, [r0], r1
627         vld1.32         {d0[1]},  [r0], r1
628         vld1.32         {d2[1]},  [r0], r1
629
/* 4x4 byte transposes: d18 = p1, d16 = p0, d0 = q0, d2 = q1 columns. */
630         vtrn.16         d18, d0
631         vtrn.16         d16, d2
632         vtrn.8          d18, d16
633         vtrn.8          d0,  d2
634
635         h264_loop_filter_chroma
636
/* Transpose back to row order. */
637         vtrn.16         d18, d0
638         vtrn.16         d16, d2
639         vtrn.8          d18, d16
640         vtrn.8          d0,  d2
641
/* Rewind the eight rows and store all four bytes per row; the p1 and q1
 * bytes are written back with the values that were loaded. */
642         sub             r0,  r0,  r1, lsl #3
643         vst1.32         {d18[0]}, [r0], r1
644         vst1.32         {d16[0]}, [r0], r1
645         vst1.32         {d0[0]},  [r0], r1
646         vst1.32         {d2[0]},  [r0], r1
647         vst1.32         {d18[1]}, [r0], r1
648         vst1.32         {d16[1]}, [r0], r1
649         vst1.32         {d0[1]},  [r0], r1
650         vst1.32         {d2[1]},  [r0], r1
651
652         bx              lr
653 endfunc
654
655         /* H.264 qpel MC */
656
/*
 * lowpass_const r
 *
 * Loads the two multipliers used by the lowpass_8 macros into d6: the core
 * register \r is set to (20 << 16) | 5 and copied to the low 32-bit lane,
 * so the 16-bit scalars are d6[0] = 5 and d6[1] = 20.
 * Clobbers \r and the low half of d6.
 */
657         .macro  lowpass_const r
658         movw            \r,  #5
659         movt            \r,  #20
660         vmov.32         d6[0], \r
661         .endm
662
/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Applies the 6-tap filter (1, -5, 20, 20, -5, 1) to two rows at once,
 * eight outputs per row.  Row one is the 16 consecutive bytes in \r0:\r1,
 * row two those in \r2:\r3 (all d registers).  For each output i:
 *   s[i] + s[i+5] + 20*(s[i+2] + s[i+3]) - 5*(s[i+1] + s[i+4])
 *
 * narrow=1: the sums are rounded, shifted right by 5 and saturated to
 *           unsigned bytes in d registers \d0 (row one) and \d1 (row two).
 * narrow=0: \d0 and \d1 are q registers that receive the raw 16-bit sums.
 *
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrow=1.
 */
663         .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
664 .if \narrow
665         t0 .req q0
666         t1 .req q8
667 .else
668         t0 .req \d0
669         t1 .req \d1
670 .endif
/* Row one: q1 = s[2]+s[3], q2 = s[1]+s[4], t0 = s[0]+s[5]; the work for
 * row two is interleaved with the multiply-accumulates. */
671         vext.8          d2,  \r0, \r1, #2
672         vext.8          d3,  \r0, \r1, #3
673         vaddl.u8        q1,  d2,  d3
674         vext.8          d4,  \r0, \r1, #1
675         vext.8          d5,  \r0, \r1, #4
676         vaddl.u8        q2,  d4,  d5
677         vext.8          d30, \r0, \r1, #5
678         vaddl.u8        t0,  \r0, d30
679         vext.8          d18, \r2, \r3, #2
680         vmla.i16        t0,  q1,  d6[1]
681         vext.8          d19, \r2, \r3, #3
682         vaddl.u8        q9,  d18, d19
683         vext.8          d20, \r2, \r3, #1
684         vmls.i16        t0,  q2,  d6[0]
685         vext.8          d21, \r2, \r3, #4
686         vaddl.u8        q10, d20, d21
687         vext.8          d31, \r2, \r3, #5
688         vaddl.u8        t1,  \r2, d31
689         vmla.i16        t1,  q9,  d6[1]
690         vmls.i16        t1,  q10, d6[0]
691 .if \narrow
692         vqrshrun.s16    \d0, t0,  #5
693         vqrshrun.s16    \d1, t1,  #5
694 .endif
/* Release the aliases so the next expansion can redefine them. */
695         .unreq  t0
696         .unreq  t1
697         .endm
698
/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row version of lowpass_8: applies the 6-tap filter
 * (1, -5, 20, 20, -5, 1) to the 16 consecutive bytes in \r0:\r1 and
 * produces eight outputs.
 *
 * narrow=1: result is rounded, shifted right by 5 and saturated to unsigned
 *           bytes in the d register \d0.
 * narrow=0: \d0 is a q register that receives the raw 16-bit sums.
 *
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers q1, q2, d30, plus q0 when narrow=1.
 */
699         .macro  lowpass_8_1 r0, r1, d0, narrow=1
700 .if \narrow
701         t0 .req q0
702 .else
703         t0 .req \d0
704 .endif
/* q1 = s[2]+s[3], q2 = s[1]+s[4], t0 = s[0]+s[5] + 20*q1 - 5*q2. */
705         vext.8          d2,  \r0, \r1, #2
706         vext.8          d3,  \r0, \r1, #3
707         vaddl.u8        q1,  d2,  d3
708         vext.8          d4,  \r0, \r1, #1
709         vext.8          d5,  \r0, \r1, #4
710         vaddl.u8        q2,  d4,  d5
711         vext.8          d30, \r0, \r1, #5
712         vaddl.u8        t0,  \r0, d30
713         vmla.i16        t0,  q1,  d6[1]
714         vmls.i16        t0,  q2,  d6[0]
715 .if \narrow
716         vqrshrun.s16    \d0, t0,  #5
717 .endif
718         .unreq  t0
719         .endm
720
/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * Second-pass 6-tap filter (1, -5, 20, 20, -5, 1) over signed 16-bit
 * intermediates, accumulated in 32 bits.  \r0:\r1 are two q registers
 * holding 16 consecutive 16-bit samples; \l0/\h0 are the d halves holding
 * samples 0-3 / 4-7 and \l1/\h1 those holding samples 5-8 / 9-12 once \r1
 * has been rewritten by the final vext below (the callers are outside this
 * view; presumably they pass the halves of \r0 and \r1).
 *
 * The multiplications are done with shifts: 20*x = (x << 4) + (x << 2) and
 * 5*x = x + (x << 2).  The sum is rounded and shifted right by 10, narrowed
 * to 16 bits and saturated to eight unsigned bytes in d register \d.
 *
 * Clobbers q0-q3, q8-q10, q15 and \r1.  Note that q3 is overwritten, so the
 * d6 constants from lowpass_const do not survive this macro.
 */
721         .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
/* q9 / q1 = s[2]+s[3] (low / high four outputs),
 * q10 / q2 = s[1]+s[4], q8 / q0 = s[0]+s[5]. */
722         vext.16         q1,  \r0, \r1, #2
723         vext.16         q0,  \r0, \r1, #3
724         vaddl.s16       q9,  d2,  d0
725         vext.16         q2,  \r0, \r1, #1
726         vaddl.s16       q1,  d3,  d1
727         vext.16         q3,  \r0, \r1, #4
728         vaddl.s16       q10, d4,  d6
729         vext.16         \r1, \r0, \r1, #5
730         vaddl.s16       q2,  d5,  d7
731         vaddl.s16       q0,  \h0, \h1
732         vaddl.s16       q8,  \l0, \l1
733
/* Low four outputs: q9 *= 20, q10 *= 5. */
734         vshl.i32        q3,  q9,  #4
735         vshl.i32        q9,  q9,  #2
736         vshl.i32        q15, q10, #2
737         vadd.i32        q9,  q9,  q3
738         vadd.i32        q10, q10, q15
739
/* High four outputs: q1 *= 20, q2 *= 5. */
740         vshl.i32        q3,  q1,  #4
741         vshl.i32        q1,  q1,  #2
742         vshl.i32        q15, q2,  #2
743         vadd.i32        q1,  q1,  q3
744         vadd.i32        q2,  q2,  q15
745
746         vadd.i32        q9,  q9,  q8
747         vsub.i32        q9,  q9,  q10
748
749         vadd.i32        q1,  q1,  q0
750         vsub.i32        q1,  q1,  q2
751
752         vrshrn.s32      d18, q9,  #10
753         vrshrn.s32      d19, q1,  #10
754
755         vqmovun.s16     \d,  q9
756         .endm
757
/*
 * put_h264_qpel16_h_lowpass_neon_packed
 *
 * Horizontal lowpass of a 16x16 block written to a packed destination: the
 * destination stride is forced to 8 and r0 is not rewound between the two
 * passes, so the left 8x16 half is followed directly by the right 8x16 half.
 *
 * In:    r0 = dst, r1 = src, r2 = source stride
 * Uses:  r3, r4 (holds the return address; NOT preserved), ip, plus
 *        whatever put_h264_qpel8_h_lowpass_neon clobbers
 * Not exported; internal calling convention.
 */
758 function put_h264_qpel16_h_lowpass_neon_packed
759         mov             r4,  lr
760         mov             ip,  #16
761         mov             r3,  #8
762         bl              put_h264_qpel8_h_lowpass_neon
/* Back to the first source row, eight pixels to the right. */
763         sub             r1,  r1,  r2, lsl #4
764         add             r1,  r1,  #8
765         mov             ip,  #16
/* Tail call: the helper returns directly to our caller. */
766         mov             lr,  r4
767         b               put_h264_qpel8_h_lowpass_neon
768 endfunc
769
/*
 * h264_qpel_h_lowpass type
 *
 * Emits two internal (non-exported) functions for type = put or avg:
 *
 *   <type>_h264_qpel16_h_lowpass_neon
 *       In: r0 = dst, r1 = src, r2 = source stride, r3 = destination stride
 *       Filters the left 8 columns of 16 rows through the qpel8 helper,
 *       rewinds both pointers by 16 rows, moves 8 pixels right and then
 *       FALLS THROUGH into the qpel8 function below for the right half.
 *       The two functions must therefore stay adjacent.
 *
 *   <type>_h264_qpel8_h_lowpass_neon
 *       In: as above, plus ip = number of rows (consumed two at a time)
 *       Horizontal 6-tap lowpass, 8 pixels wide; for avg the result is
 *       averaged (rounding) with the bytes already in dst.
 *       Requires the lowpass_8 constants in d6.
 *       dst is accessed with a 64-bit alignment hint.
 */
770         .macro h264_qpel_h_lowpass type
771 function \type\()_h264_qpel16_h_lowpass_neon
772         push            {lr}
773         mov             ip,  #16
774         bl              \type\()_h264_qpel8_h_lowpass_neon
775         sub             r0,  r0,  r3, lsl #4
776         sub             r1,  r1,  r2, lsl #4
777         add             r0,  r0,  #8
778         add             r1,  r1,  #8
779         mov             ip,  #16
/* Restore the original return address and fall through. */
780         pop             {lr}
781 endfunc
782
783 function \type\()_h264_qpel8_h_lowpass_neon
/* Two source rows of 16 bytes each per iteration. */
784 1:      vld1.64         {d0, d1},  [r1], r2
785         vld1.64         {d16,d17}, [r1], r2
/* The NEON instructions below do not touch the flags tested by bne. */
786         subs            ip,  ip,  #2
787         lowpass_8       d0,  d1,  d16, d17, d0,  d16
788 .ifc \type,avg
/* Read the two dst rows, then step r0 back one row for the stores. */
789         vld1.8          {d2},     [r0,:64], r3
790         vrhadd.u8       d0,  d0,  d2
791         vld1.8          {d3},     [r0,:64]
792         vrhadd.u8       d16, d16, d3
793         sub             r0,  r0,  r3
794 .endif
795         vst1.64         {d0},     [r0,:64], r3
796         vst1.64         {d16},    [r0,:64], r3
797         bne             1b
798         bx              lr
799 endfunc
800         .endm
801
802         h264_qpel_h_lowpass put
803         h264_qpel_h_lowpass avg
804
/*
 * h264_qpel_h_lowpass_l2 type
 *
 * Like h264_qpel_h_lowpass, but the filtered row is additionally averaged
 * (rounding) with a second source before being stored.  Emits two internal
 * (non-exported) functions for type = put or avg:
 *
 *   <type>_h264_qpel16_h_lowpass_l2_neon
 *       In: r0 = dst, r1 = src, r3 = second source, r2 = stride used for
 *           all three pointers
 *       Runs the qpel8 helper on the left half, rewinds the three pointers
 *       by 16 rows, moves 8 pixels right and FALLS THROUGH into the qpel8
 *       function below for the right half.
 *
 *   <type>_h264_qpel8_h_lowpass_l2_neon
 *       In: as above, plus ip = number of rows (consumed two at a time)
 *       Requires the lowpass_8 constants in d6.
 *       dst is accessed with a 64-bit alignment hint.
 */
805         .macro h264_qpel_h_lowpass_l2 type
806 function \type\()_h264_qpel16_h_lowpass_l2_neon
807         push            {lr}
808         mov             ip,  #16
809         bl              \type\()_h264_qpel8_h_lowpass_l2_neon
810         sub             r0,  r0,  r2, lsl #4
811         sub             r1,  r1,  r2, lsl #4
812         sub             r3,  r3,  r2, lsl #4
813         add             r0,  r0,  #8
814         add             r1,  r1,  #8
815         add             r3,  r3,  #8
816         mov             ip,  #16
/* Restore the original return address and fall through. */
817         pop             {lr}
818 endfunc
819
820 function \type\()_h264_qpel8_h_lowpass_l2_neon
/* Two rows from src (16 bytes each) and two rows from the second source
 * (8 bytes each, gathered in q14). */
821 1:      vld1.64         {d0, d1},  [r1], r2
822         vld1.64         {d16,d17}, [r1], r2
823         vld1.64         {d28},     [r3], r2
824         vld1.64         {d29},     [r3], r2
825         subs            ip,  ip,  #2
/* Both filtered rows land in q0 (d0, d1) so one vrhadd covers them. */
826         lowpass_8       d0,  d1,  d16, d17, d0,  d1
827         vrhadd.u8       q0,  q0,  q14
828 .ifc \type,avg
/* Read the two dst rows, then step r0 back one row for the stores. */
829         vld1.8          {d2},      [r0,:64], r2
830         vrhadd.u8       d0,  d0,  d2
831         vld1.8          {d3},      [r0,:64]
832         vrhadd.u8       d1,  d1,  d3
833         sub             r0,  r0,  r2
834 .endif
835         vst1.64         {d0},      [r0,:64], r2
836         vst1.64         {d1},      [r0,:64], r2
837         bne             1b
838         bx              lr
839 endfunc
840         .endm
841
842         h264_qpel_h_lowpass_l2 put
843         h264_qpel_h_lowpass_l2 avg
844
/*
 * put_h264_qpel16_v_lowpass_neon_packed
 *
 * Vertical lowpass of a 16x16 block as four 8x8 calls, written to a packed
 * destination: the destination stride is forced to 8 and r0 is never
 * rewound, so the four 8x8 results are stored back to back in the order
 * top-left, bottom-left, top-right, bottom-right.
 *
 * In:    r0 = dst, r1 = src, r3 = source stride
 * Uses:  r2, r4 (holds the return address; NOT preserved), plus whatever
 *        put_h264_qpel8_v_lowpass_neon clobbers
 * Not exported; internal calling convention.
 *
 * The 8x8 helper performs twelve post-incremented row loads, so it leaves r1
 * twelve rows further down; subtracting four rows positions r1 eight rows
 * below the previous call's starting row.
 */
845 function put_h264_qpel16_v_lowpass_neon_packed
846         mov             r4,  lr
847         mov             r2,  #8
848         bl              put_h264_qpel8_v_lowpass_neon
849         sub             r1,  r1,  r3, lsl #2
850         bl              put_h264_qpel8_v_lowpass_neon
/* r1 is now 20 rows below the start: go back to the top, 8 pixels right. */
851         sub             r1,  r1,  r3, lsl #4
852         sub             r1,  r1,  r3, lsl #2
853         add             r1,  r1,  #8
854         bl              put_h264_qpel8_v_lowpass_neon
855         sub             r1,  r1,  r3, lsl #2
/* Tail call: the helper returns directly to our caller. */
856         mov             lr,  r4
857         b               put_h264_qpel8_v_lowpass_neon
858 endfunc
859
860         .macro h264_qpel_v_lowpass type
            @ Vertical lowpass without a second source. The type argument is
            @ put (overwrite the destination) or avg (rounding average with
            @ the bytes already in the destination).
            @ Register roles in both functions:
            @   r0 = destination, r2 = destination stride,
            @   r1 = source,      r3 = source stride
861 function \type\()_h264_qpel16_v_lowpass_neon
    @ 16x16 version: three calls to the 8x8 routine, then execution falls
    @ through into that routine for the fourth block.
    @ r4 holds the return address meanwhile, so r4 is clobbered.
862         mov             r4,  lr
863         bl              \type\()_h264_qpel8_v_lowpass_neon
            @ the 8x8 routine advanced r1 by 12 rows; go back 4 so the
            @ second block starts 8 rows below the first
864         sub             r1,  r1,  r3, lsl #2
865         bl              \type\()_h264_qpel8_v_lowpass_neon
            @ rewind destination (16 rows) and source (20 rows) to the top
            @ and move both 8 bytes right for the second column of blocks
866         sub             r0,  r0,  r2, lsl #4
867         add             r0,  r0,  #8
868         sub             r1,  r1,  r3, lsl #4
869         sub             r1,  r1,  r3, lsl #2
870         add             r1,  r1,  #8
871         bl              \type\()_h264_qpel8_v_lowpass_neon
872         sub             r1,  r1,  r3, lsl #2
            @ restore lr; there is deliberately no branch before endfunc,
            @ the fourth block is done by running into the function below
873         mov             lr,  r4
874 endfunc
875
876 function \type\()_h264_qpel8_v_lowpass_neon
    @ 8x8 block. Reads 13 source rows of 8 bytes each; the entry points in
    @ this file move r1 up by two rows before calling.
    @ On return r1 has advanced by 12 rows and r0 by 8 rows.
    @ Uses d8-d15 and d22-d29 without saving them; the entry points in
    @ this file wrap the call in vpush/vpop {d8-d15}.
877         vld1.64         {d8},  [r1], r3
878         vld1.64         {d10}, [r1], r3
879         vld1.64         {d12}, [r1], r3
880         vld1.64         {d14}, [r1], r3
881         vld1.64         {d22}, [r1], r3
882         vld1.64         {d24}, [r1], r3
883         vld1.64         {d26}, [r1], r3
884         vld1.64         {d28}, [r1], r3
885         vld1.64         {d9},  [r1], r3
886         vld1.64         {d11}, [r1], r3
887         vld1.64         {d13}, [r1], r3
888         vld1.64         {d15}, [r1], r3
889         vld1.64         {d23}, [r1]
890
            @ Transpose, filter, transpose back. NOTE(review): lowpass_8 is
            @ defined outside this part of the file; the transposes are
            @ presumably there so that a row-oriented filter macro can be
            @ used on columns. The filtered rows end up in
            @ d8, d10, d12, d14, d22, d24, d26, d28.
891         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
892         lowpass_8       d8,  d9,  d10, d11, d8,  d10
893         lowpass_8       d12, d13, d14, d15, d12, d14
894         lowpass_8       d22, d23, d24, d25, d22, d24
895         lowpass_8       d26, d27, d28, d29, d26, d28
896         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
897
898 .ifc \type,avg
            @ avg only: rounding-average each result row with the row
            @ already present in the destination
899         vld1.8          {d9},  [r0,:64], r2
900         vrhadd.u8       d8,  d8,  d9
901         vld1.8          {d11}, [r0,:64], r2
902         vrhadd.u8       d10, d10, d11
903         vld1.8          {d13}, [r0,:64], r2
904         vrhadd.u8       d12, d12, d13
905         vld1.8          {d15}, [r0,:64], r2
906         vrhadd.u8       d14, d14, d15
907         vld1.8          {d23}, [r0,:64], r2
908         vrhadd.u8       d22, d22, d23
909         vld1.8          {d25}, [r0,:64], r2
910         vrhadd.u8       d24, d24, d25
911         vld1.8          {d27}, [r0,:64], r2
912         vrhadd.u8       d26, d26, d27
913         vld1.8          {d29}, [r0,:64], r2
914         vrhadd.u8       d28, d28, d29
            @ undo the 8 row advance of the loads before storing
915         sub             r0,  r0,  r2,  lsl #3
916 .endif
917
            @ store the 8 rows; the :64 qualifier requires an 8-byte
            @ aligned destination address for every row
918         vst1.64         {d8},  [r0,:64], r2
919         vst1.64         {d10}, [r0,:64], r2
920         vst1.64         {d12}, [r0,:64], r2
921         vst1.64         {d14}, [r0,:64], r2
922         vst1.64         {d22}, [r0,:64], r2
923         vst1.64         {d24}, [r0,:64], r2
924         vst1.64         {d26}, [r0,:64], r2
925         vst1.64         {d28}, [r0,:64], r2
926
927         bx              lr
928 endfunc
929         .endm
930
931         h264_qpel_v_lowpass put
932         h264_qpel_v_lowpass avg
933
934         .macro h264_qpel_v_lowpass_l2 type
            @ Vertical lowpass whose result is rounding-averaged with a
            @ second 8-byte-wide source before being written. The type
            @ argument is put or avg (avg additionally averages with the
            @ bytes already in the destination).
            @ Register roles in both functions:
            @   r0 = destination, r1 = filter source,
            @   r3 = stride used for both r0 and r1,
            @   ip = second source, r2 = stride of the second source
935 function \type\()_h264_qpel16_v_lowpass_l2_neon
    @ 16x16 version: three calls to the 8x8 routine, then falls through
    @ into it for the fourth block. r4 holds the return address.
936         mov             r4,  lr
937         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
            @ r1 was advanced 12 rows; back up 4 to continue 8 rows down
938         sub             r1,  r1,  r3, lsl #2
939         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
            @ rewind r0 and ip by 16 rows and r1 by 20 rows, then move all
            @ three pointers 8 bytes right for the second column of blocks
940         sub             r0,  r0,  r3, lsl #4
941         sub             ip,  ip,  r2, lsl #4
942         add             r0,  r0,  #8
943         add             ip,  ip,  #8
944         sub             r1,  r1,  r3, lsl #4
945         sub             r1,  r1,  r3, lsl #2
946         add             r1,  r1,  #8
947         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
948         sub             r1,  r1,  r3, lsl #2
            @ no branch follows: the fourth block runs by falling through
949         mov             lr,  r4
950 endfunc
951
952 function \type\()_h264_qpel8_v_lowpass_l2_neon
    @ 8x8 block. Reads 13 rows of 8 bytes from r1 and 8 rows from ip.
    @ On return r1 has advanced 12 rows, ip 8 rows and r0 8 rows.
    @ Uses d0-d5, d8-d15, d22-d29 (and d16-d17 for avg) without saving.
953         vld1.64         {d8},  [r1], r3
954         vld1.64         {d10}, [r1], r3
955         vld1.64         {d12}, [r1], r3
956         vld1.64         {d14}, [r1], r3
957         vld1.64         {d22}, [r1], r3
958         vld1.64         {d24}, [r1], r3
959         vld1.64         {d26}, [r1], r3
960         vld1.64         {d28}, [r1], r3
961         vld1.64         {d9},  [r1], r3
962         vld1.64         {d11}, [r1], r3
963         vld1.64         {d13}, [r1], r3
964         vld1.64         {d15}, [r1], r3
965         vld1.64         {d23}, [r1]
966
            @ Transpose, filter, transpose back (lowpass_8 is defined
            @ outside this part of the file). Here the results are placed
            @ in adjacent d registers so that the rows pair up as
            @ q4, q6, q11, q13 for the 128-bit averages below.
967         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
968         lowpass_8       d8,  d9,  d10, d11, d8,  d9
969         lowpass_8       d12, d13, d14, d15, d12, d13
970         lowpass_8       d22, d23, d24, d25, d22, d23
971         lowpass_8       d26, d27, d28, d29, d26, d27
972         transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
973
            @ load the 8 rows of the second source and average them with
            @ the filtered rows, two rows per vrhadd; d10/d11 (q5) are
            @ free again at this point and take the last two rows
974         vld1.64         {d0},  [ip], r2
975         vld1.64         {d1},  [ip], r2
976         vld1.64         {d2},  [ip], r2
977         vld1.64         {d3},  [ip], r2
978         vld1.64         {d4},  [ip], r2
979         vrhadd.u8       q0,  q0,  q4
980         vld1.64         {d5},  [ip], r2
981         vrhadd.u8       q1,  q1,  q6
982         vld1.64         {d10}, [ip], r2
983         vrhadd.u8       q2,  q2,  q11
984         vld1.64         {d11}, [ip], r2
985         vrhadd.u8       q5,  q5,  q13
986
987 .ifc \type,avg
            @ avg only: average once more with the current destination rows
988         vld1.8          {d16}, [r0,:64], r3
989         vrhadd.u8       d0,  d0,  d16
990         vld1.8          {d17}, [r0,:64], r3
991         vrhadd.u8       d1,  d1,  d17
992         vld1.8          {d16}, [r0,:64], r3
993         vrhadd.u8       d2,  d2,  d16
994         vld1.8          {d17}, [r0,:64], r3
995         vrhadd.u8       d3,  d3,  d17
996         vld1.8          {d16}, [r0,:64], r3
997         vrhadd.u8       d4,  d4,  d16
998         vld1.8          {d17}, [r0,:64], r3
999         vrhadd.u8       d5,  d5,  d17
1000         vld1.8          {d16}, [r0,:64], r3
1001         vrhadd.u8       d10, d10, d16
1002         vld1.8          {d17}, [r0,:64], r3
1003         vrhadd.u8       d11, d11, d17
             @ undo the 8 row advance of the loads before storing
1004         sub             r0,  r0,  r3,  lsl #3
1005 .endif
1006
             @ store the 8 result rows (8-byte aligned destination required)
1007         vst1.64         {d0},  [r0,:64], r3
1008         vst1.64         {d1},  [r0,:64], r3
1009         vst1.64         {d2},  [r0,:64], r3
1010         vst1.64         {d3},  [r0,:64], r3
1011         vst1.64         {d4},  [r0,:64], r3
1012         vst1.64         {d5},  [r0,:64], r3
1013         vst1.64         {d10}, [r0,:64], r3
1014         vst1.64         {d11}, [r0,:64], r3
1015
1016         bx              lr
1017 endfunc
1018         .endm
1019
1020         h264_qpel_v_lowpass_l2 put
1021         h264_qpel_v_lowpass_l2 avg
1022
1023 function put_h264_qpel8_hv_lowpass_neon_top
     @ Shared core of the 8x8 two-pass (hv) lowpass. It computes the block
     @ into registers and leaves the store to the caller.
     @ In:  r1 = source, r3 = source stride
     @      r4 = 16-byte aligned scratch area; 12 rows of 16 bytes
     @           (16*12 bytes) are written to it
     @ Out: the 8 result rows in d12, d13, d14, d15, d8, d9, d10, d11
     @      (callers store them in exactly that order)
     @      r4 is back at its entry value, r1 has advanced by 12 rows
     @ Clobbers ip and the NEON registers q0-q15.
     @ NOTE(review): lowpass_const, lowpass_8, lowpass_8_1, lowpass_8.16
     @ and transpose16_4x4 are defined outside this part of the file.
1024         lowpass_const   ip
             @ first pass: 12 source rows, two per iteration. Each row is
             @ 16 bytes; narrow=0 keeps the results at 16 bits per sample
             @ in q11/q12, which are appended to the scratch area.
1025         mov             ip,  #12
1026 1:      vld1.64         {d0, d1},  [r1], r3
1027         vld1.64         {d16,d17}, [r1], r3
1028         subs            ip,  ip,  #2
1029         lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
1030         vst1.64         {d22-d25}, [r4,:128]!
1031         bne             1b
1032
             @ 13th row: filtered on its own, result kept in q12
1033         vld1.64         {d0, d1},  [r1]
1034         lowpass_8_1     d0,  d1,  q12, narrow=0
1035
             @ read the 12 stored rows back, walking r4 downwards in
             @ 16-byte steps; r4 ends at the start of the scratch area
1036         mov             ip,  #-16
1037         add             r4,  r4,  ip
1038         vld1.64         {d30,d31}, [r4,:128], ip
1039         vld1.64         {d20,d21}, [r4,:128], ip
1040         vld1.64         {d18,d19}, [r4,:128], ip
1041         vld1.64         {d16,d17}, [r4,:128], ip
1042         vld1.64         {d14,d15}, [r4,:128], ip
1043         vld1.64         {d12,d13}, [r4,:128], ip
1044         vld1.64         {d10,d11}, [r4,:128], ip
1045         vld1.64         {d8, d9},  [r4,:128], ip
1046         vld1.64         {d6, d7},  [r4,:128], ip
1047         vld1.64         {d4, d5},  [r4,:128], ip
1048         vld1.64         {d2, d3},  [r4,:128], ip
1049         vld1.64         {d0, d1},  [r4,:128]
1050
             @ rearrange the 16-bit intermediate rows with swap4 and
             @ transpose16_4x4, first q0-q7, then the remaining rows
             @ together with the 13th row held in q12
1051         swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
1052         transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
1053
1054         swap4           d17, d19, d21, d31, d24, d26, d28, d22
1055         transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
1056
             @ not enough registers for the whole second pass: park eight
             @ q registers in the scratch area (r4 ends 7 rows in)
1057         vst1.64         {d30,d31}, [r4,:128]!
1058         vst1.64         {d6, d7},  [r4,:128]!
1059         vst1.64         {d20,d21}, [r4,:128]!
1060         vst1.64         {d4, d5},  [r4,:128]!
1061         vst1.64         {d18,d19}, [r4,:128]!
1062         vst1.64         {d2, d3},  [r4,:128]!
1063         vst1.64         {d16,d17}, [r4,:128]!
1064         vst1.64         {d0, d1},  [r4,:128]
1065
             @ second pass on the registers still live: results go to
             @ d8, d9, d10, d11
1066         lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
1067         lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
1068         lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
1069         lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
1070
             @ second pass on the parked registers, reloaded in pairs in
             @ reverse order (ip is still -16): results go to d12-d15
1071         vld1.64         {d16,d17}, [r4,:128], ip
1072         vld1.64         {d30,d31}, [r4,:128], ip
1073         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
1074         vld1.64         {d16,d17}, [r4,:128], ip
1075         vld1.64         {d30,d31}, [r4,:128], ip
1076         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
1077         vld1.64         {d16,d17}, [r4,:128], ip
1078         vld1.64         {d30,d31}, [r4,:128], ip
1079         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
1080         vld1.64         {d16,d17}, [r4,:128], ip
1081         vld1.64         {d30,d31}, [r4,:128]
1082         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
1083
             @ final 8x8 byte transpose of the eight result registers
1084         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
1085
1086         bx              lr
1087 endfunc
1088
1089         .macro h264_qpel8_hv_lowpass type
1090 function \type\()_h264_qpel8_hv_lowpass_neon
     @ 8x8 two-pass lowpass written to the destination. The type argument
     @ is put (plain store) or avg (rounding average with the
     @ destination first).
     @ In:  r0 = destination, r2 = destination stride,
     @      r1 = source, r3 = source stride, r4 = scratch area
     @      (r1, r3, r4 are consumed by the _top routine)
     @ On return r0 has advanced by 8 rows.
     @ Clobbers r10, which holds the return address across the call.
1091         mov             r10, lr
1092         bl              put_h264_qpel8_hv_lowpass_neon_top
1093 .ifc \type,avg
             @ avg only: average every result row with the destination row
1094         vld1.8          {d0},      [r0,:64], r2
1095         vrhadd.u8       d12, d12, d0
1096         vld1.8          {d1},      [r0,:64], r2
1097         vrhadd.u8       d13, d13, d1
1098         vld1.8          {d2},      [r0,:64], r2
1099         vrhadd.u8       d14, d14, d2
1100         vld1.8          {d3},      [r0,:64], r2
1101         vrhadd.u8       d15, d15, d3
1102         vld1.8          {d4},      [r0,:64], r2
1103         vrhadd.u8       d8,  d8,  d4
1104         vld1.8          {d5},      [r0,:64], r2
1105         vrhadd.u8       d9,  d9,  d5
1106         vld1.8          {d6},      [r0,:64], r2
1107         vrhadd.u8       d10, d10, d6
1108         vld1.8          {d7},      [r0,:64], r2
1109         vrhadd.u8       d11, d11, d7
             @ rewind r0 by the 8 rows just read
1110         sub             r0,  r0,  r2,  lsl #3
1111 .endif
1112
             @ rows come back from _top in the order d12-d15, d8-d11
1113         vst1.64         {d12},     [r0,:64], r2
1114         vst1.64         {d13},     [r0,:64], r2
1115         vst1.64         {d14},     [r0,:64], r2
1116         vst1.64         {d15},     [r0,:64], r2
1117         vst1.64         {d8},      [r0,:64], r2
1118         vst1.64         {d9},      [r0,:64], r2
1119         vst1.64         {d10},     [r0,:64], r2
1120         vst1.64         {d11},     [r0,:64], r2
1121
1122         mov             lr,  r10
1123         bx              lr
1124 endfunc
1125         .endm
1126
1127         h264_qpel8_hv_lowpass put
1128         h264_qpel8_hv_lowpass avg
1129
1130         .macro h264_qpel8_hv_lowpass_l2 type
1131 function \type\()_h264_qpel8_hv_lowpass_l2_neon
     @ 8x8 two-pass lowpass, rounding-averaged with a packed 8x8 block
     @ read from r2, then stored (put) or averaged with the destination
     @ and stored (avg).
     @ In:  r0 = destination, r3 = stride (destination, and source for
     @      the _top routine), r1 = source, r4 = scratch area,
     @      r2 = 16-byte aligned block of 64 contiguous bytes
     @ On return r0 has advanced by 8 rows and r2 by 64 bytes.
     @ Clobbers r10, which holds the return address across the call.
1132         mov             r10, lr
1133         bl              put_h264_qpel8_hv_lowpass_neon_top
1134
             @ _top returns the rows in d12-d15, d8-d11, i.e. as the row
             @ pairs q6, q7, q4, q5; average them with the block at r2
1135         vld1.64         {d0, d1},  [r2,:128]!
1136         vld1.64         {d2, d3},  [r2,:128]!
1137         vrhadd.u8       q0,  q0,  q6
1138         vld1.64         {d4, d5},  [r2,:128]!
1139         vrhadd.u8       q1,  q1,  q7
1140         vld1.64         {d6, d7},  [r2,:128]!
1141         vrhadd.u8       q2,  q2,  q4
1142         vrhadd.u8       q3,  q3,  q5
1143 .ifc \type,avg
             @ avg only: average once more with the destination rows
1144         vld1.8          {d16},     [r0,:64], r3
1145         vrhadd.u8       d0,  d0,  d16
1146         vld1.8          {d17},     [r0,:64], r3
1147         vrhadd.u8       d1,  d1,  d17
1148         vld1.8          {d18},     [r0,:64], r3
1149         vrhadd.u8       d2,  d2,  d18
1150         vld1.8          {d19},     [r0,:64], r3
1151         vrhadd.u8       d3,  d3,  d19
1152         vld1.8          {d20},     [r0,:64], r3
1153         vrhadd.u8       d4,  d4,  d20
1154         vld1.8          {d21},     [r0,:64], r3
1155         vrhadd.u8       d5,  d5,  d21
1156         vld1.8          {d22},     [r0,:64], r3
1157         vrhadd.u8       d6,  d6,  d22
1158         vld1.8          {d23},     [r0,:64], r3
1159         vrhadd.u8       d7,  d7,  d23
             @ rewind r0 by the 8 rows just read
1160         sub             r0,  r0,  r3,  lsl #3
1161 .endif
1162         vst1.64         {d0},      [r0,:64], r3
1163         vst1.64         {d1},      [r0,:64], r3
1164         vst1.64         {d2},      [r0,:64], r3
1165         vst1.64         {d3},      [r0,:64], r3
1166         vst1.64         {d4},      [r0,:64], r3
1167         vst1.64         {d5},      [r0,:64], r3
1168         vst1.64         {d6},      [r0,:64], r3
1169         vst1.64         {d7},      [r0,:64], r3
1170
1171         mov             lr,  r10
1172         bx              lr
1173 endfunc
1174         .endm
1175
1176         h264_qpel8_hv_lowpass_l2 put
1177         h264_qpel8_hv_lowpass_l2 avg
1178
1179         .macro h264_qpel16_hv type
             @ 16x16 two-pass lowpass wrappers: each runs the matching 8x8
             @ routine four times (left column top to bottom, then right
             @ column). r9 holds the return address, since the 8x8
             @ routines use r10 for theirs.
1180 function \type\()_h264_qpel16_hv_lowpass_neon
     @ In: r0 = destination, r2 = destination stride,
     @     r1 = source, r3 = source stride, r4 = scratch area
1181         mov             r9,  lr
1182         bl              \type\()_h264_qpel8_hv_lowpass_neon
             @ r1 came back 12 rows down; back up 4 to continue 8 rows down
1183         sub             r1,  r1,  r3, lsl #2
1184         bl              \type\()_h264_qpel8_hv_lowpass_neon
             @ rewind source (20 rows) and destination (16 rows) to the top
             @ and move both 8 bytes right
1185         sub             r1,  r1,  r3, lsl #4
1186         sub             r1,  r1,  r3, lsl #2
1187         add             r1,  r1,  #8
1188         sub             r0,  r0,  r2, lsl #4
1189         add             r0,  r0,  #8
1190         bl              \type\()_h264_qpel8_hv_lowpass_neon
1191         sub             r1,  r1,  r3, lsl #2
1192         mov             lr,  r9
             @ tail call for the fourth block
1193         b               \type\()_h264_qpel8_hv_lowpass_neon
1194 endfunc
1195
1196 function \type\()_h264_qpel16_hv_lowpass_l2_neon
     @ In: r0 = destination, r1 = source, r3 = stride for both,
     @     r4 = scratch area. The second source is taken to start 256
     @     bytes below r4 and is read sequentially, 64 bytes per 8x8
     @     block, so it must be laid out as four packed 8x8 blocks in the
     @     order used here (as the _packed routines produce).
1197         mov             r9,  lr
1198         sub             r2,  r4,  #256
1199         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1200         sub             r1,  r1,  r3, lsl #2
1201         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
             @ rewind source and destination to the top, move 8 bytes right;
             @ r2 is not touched, it simply keeps advancing
1202         sub             r1,  r1,  r3, lsl #4
1203         sub             r1,  r1,  r3, lsl #2
1204         add             r1,  r1,  #8
1205         sub             r0,  r0,  r3, lsl #4
1206         add             r0,  r0,  #8
1207         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1208         sub             r1,  r1,  r3, lsl #2
1209         mov             lr,  r9
             @ tail call for the fourth block
1210         b               \type\()_h264_qpel8_hv_lowpass_l2_neon
1211 endfunc
1212         .endm
1213
1214         h264_qpel16_hv put
1215         h264_qpel16_hv avg
1216
1217         .macro h264_qpel8 type
             @ Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
             @ X, Y in 0..3 (mc00 is not generated here). The type argument
             @ is put or avg.
             @ NOTE(review): on entry the code treats r0 as destination,
             @ r1 as source and r2 as the stride; presumably these are the
             @ first three C arguments - confirm against the C prototype.
             @ Several entry points only differ in pointer offsets: they
             @ push their registers, adjust r1 or ip and branch to a shared
             @ label inside a sibling function. The push list must
             @ therefore match the pop list of the function owning the
             @ label.
             @ Lines prefixed A or T are presumably the ARM-mode and
             @ Thumb-mode alternatives selected by asm.S; either way sp is
             @ rounded down to a multiple of 16.
1218 function ff_\type\()_h264_qpel8_mc10_neon, export=1
     @ horizontal lowpass averaged with the source at offset 0:
     @ r3 = unmodified source (second input), r1 = source - 2, ip = 8
1219         lowpass_const   r3
1220         mov             r3,  r1
1221         sub             r1,  r1,  #2
1222         mov             ip,  #8
1223         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1224 endfunc
1225
1226 function ff_\type\()_h264_qpel8_mc20_neon, export=1
     @ plain horizontal lowpass: r1 = source - 2, r3 = r2, ip = 8
1227         lowpass_const   r3
1228         sub             r1,  r1,  #2
1229         mov             r3,  r2
1230         mov             ip,  #8
1231         b               \type\()_h264_qpel8_h_lowpass_neon
1232 endfunc
1233
1234 function ff_\type\()_h264_qpel8_mc30_neon, export=1
     @ as mc10, but the second input is the source at offset +1
1235         lowpass_const   r3
1236         add             r3,  r1,  #1
1237         sub             r1,  r1,  #2
1238         mov             ip,  #8
1239         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1240 endfunc
1241
1242 function ff_\type\()_h264_qpel8_mc01_neon, export=1
     @ vertical lowpass averaged with the source row at offset 0
1243         push            {lr}
1244         mov             ip,  r1
1245 \type\()_h264_qpel8_mc01:
     @ shared with mc03; expects lr pushed and ip = second input
1246         lowpass_const   r3
1247         mov             r3,  r2
             @ the vertical filter starts two rows above the block
1248         sub             r1,  r1,  r2, lsl #1
             @ the lowpass routine uses d8-d15; preserve them for the caller
1249         vpush           {d8-d15}
1250         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1251         vpop            {d8-d15}
1252         pop             {pc}
1253 endfunc
1254
1255 function ff_\type\()_h264_qpel8_mc11_neon, export=1
     @ horizontal lowpass into a stack temporary, then vertical lowpass
     @ averaged with that temporary. The pushed r0/r1 are the destination
     @ and the source to use for the vertical pass.
1256         push            {r0, r1, r11, lr}
1257 \type\()_h264_qpel8_mc11:
     @ shared with mc31, mc13, mc33; r1 = source for the horizontal pass
1258         lowpass_const   r3
             @ r11 remembers the unaligned sp (pointing at the saved r0)
1259         mov             r11, sp
1260 A       bic             sp,  sp,  #15
1261 T       bic             r0,  r11, #15
1262 T       mov             sp,  r0
             @ 64 byte (8x8) temporary on the aligned stack
1263         sub             sp,  sp,  #64
1264         mov             r0,  sp
1265         sub             r1,  r1,  #2
1266         mov             r3,  #8
1267         mov             ip,  #8
1268         vpush           {d8-d15}
1269         bl              put_h264_qpel8_h_lowpass_neon
             @ reload the saved r0/r1 and step r11 past them, so that the
             @ final pop only sees r11 and the return address
1270         ldrd            r0,  [r11], #8
1271         mov             r3,  r2
             @ the temporary sits just above the 64 bytes taken by vpush
1272         add             ip,  sp,  #64
1273         sub             r1,  r1,  r2, lsl #1
             @ stride of the temporary
1274         mov             r2,  #8
1275         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1276         vpop            {d8-d15}
1277         mov             sp,  r11
1278         pop             {r11, pc}
1279 endfunc
1280
1281 function ff_\type\()_h264_qpel8_mc21_neon, export=1
     @ horizontal lowpass into a stack temporary, then the two-pass (hv)
     @ lowpass averaged with that temporary
1282         push            {r0, r1, r4, r10, r11, lr}
1283 \type\()_h264_qpel8_mc21:
     @ shared with mc23; r1 = source for the horizontal pass
1284         lowpass_const   r3
1285         mov             r11, sp
1286 A       bic             sp,  sp,  #15
1287 T       bic             r0,  r11, #15
1288 T       mov             sp,  r0
             @ 8*8 bytes for the temporary plus 16*12 bytes of hv scratch
1289         sub             sp,  sp,  #(8*8+16*12)
1290         sub             r1,  r1,  #2
1291         mov             r3,  #8
1292         mov             r0,  sp
1293         mov             ip,  #8
1294         vpush           {d8-d15}
1295         bl              put_h264_qpel8_h_lowpass_neon
             @ NOTE(review): relies on r0 being left just past the 64 byte
             @ temporary by the h pass (not visible here), which makes r4
             @ the hv scratch area and r4 - 64 the temporary
1296         mov             r4,  r0
1297         ldrd            r0,  [r11], #8
             @ hv source starts 2 rows up and 2 bytes left of the block
1298         sub             r1,  r1,  r2, lsl #1
1299         sub             r1,  r1,  #2
1300         mov             r3,  r2
1301         sub             r2,  r4,  #64
1302         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1303         vpop            {d8-d15}
1304         mov             sp,  r11
1305         pop             {r4, r10, r11, pc}
1306 endfunc
1307
1308 function ff_\type\()_h264_qpel8_mc31_neon, export=1
     @ as mc11, but the saved source (used by the vertical pass) is
     @ source + 1; the horizontal pass still uses the original r1
1309         add             r1,  r1,  #1
1310         push            {r0, r1, r11, lr}
1311         sub             r1,  r1,  #1
1312         b               \type\()_h264_qpel8_mc11
1313 endfunc
1314
1315 function ff_\type\()_h264_qpel8_mc02_neon, export=1
     @ plain vertical lowpass, starting two rows above the block
1316         push            {lr}
1317         lowpass_const   r3
1318         sub             r1,  r1,  r2, lsl #1
1319         mov             r3,  r2
1320         vpush           {d8-d15}
1321         bl              \type\()_h264_qpel8_v_lowpass_neon
1322         vpop            {d8-d15}
1323         pop             {pc}
1324 endfunc
1325
1326 function ff_\type\()_h264_qpel8_mc12_neon, export=1
     @ vertical lowpass into a stack temporary, then the two-pass (hv)
     @ lowpass averaged with that temporary
1327         push            {r0, r1, r4, r10, r11, lr}
1328 \type\()_h264_qpel8_mc12:
     @ shared with mc32; r1 = source for the vertical pass
1329         lowpass_const   r3
1330         mov             r11, sp
1331 A       bic             sp,  sp,  #15
1332 T       bic             r0,  r11, #15
1333 T       mov             sp,  r0
             @ 8*8 bytes for the temporary plus 16*12 bytes of hv scratch
1334         sub             sp,  sp,  #(8*8+16*12)
1335         sub             r1,  r1,  r2, lsl #1
1336         mov             r3,  r2
             @ the temporary is written with a stride of 8
1337         mov             r2,  #8
1338         mov             r0,  sp
1339         vpush           {d8-d15}
1340         bl              put_h264_qpel8_v_lowpass_neon
             @ the v pass advanced r0 by 8 rows of 8 bytes, so r4 is the
             @ hv scratch area and r4 - 64 the temporary
1341         mov             r4,  r0
1342         ldrd            r0,  [r11], #8
             @ r3 still holds the picture stride here
1343         sub             r1,  r1,  r3, lsl #1
1344         sub             r1,  r1,  #2
1345         sub             r2,  r4,  #64
1346         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1347         vpop            {d8-d15}
1348         mov             sp,  r11
1349         pop             {r4, r10, r11, pc}
1350 endfunc
1351
1352 function ff_\type\()_h264_qpel8_mc22_neon, export=1
     @ plain two-pass (hv) lowpass; only the 16*12 byte scratch area is
     @ needed, handed over in r4
1353         push            {r4, r10, r11, lr}
1354         mov             r11, sp
1355 A       bic             sp,  sp,  #15
1356 T       bic             r4,  r11, #15
1357 T       mov             sp,  r4
1358         sub             r1,  r1,  r2, lsl #1
1359         sub             r1,  r1,  #2
1360         mov             r3,  r2
1361         sub             sp,  sp,  #(16*12)
1362         mov             r4,  sp
1363         vpush           {d8-d15}
1364         bl              \type\()_h264_qpel8_hv_lowpass_neon
1365         vpop            {d8-d15}
1366         mov             sp,  r11
1367         pop             {r4, r10, r11, pc}
1368 endfunc
1369
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
     @ as mc12, but the vertical pass reads from source + 1 (the saved
     @ r1 used for the hv pass is the original source)
1371         push            {r0, r1, r4, r10, r11, lr}
1372         add             r1,  r1,  #1
1373         b               \type\()_h264_qpel8_mc12
1374 endfunc
1375
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
     @ as mc01, but the second input is the source one row further down
1377         push            {lr}
1378         add             ip,  r1,  r2
1379         b               \type\()_h264_qpel8_mc01
1380 endfunc
1381
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
     @ as mc11, but the horizontal pass reads one row further down
1383         push            {r0, r1, r11, lr}
1384         add             r1,  r1,  r2
1385         b               \type\()_h264_qpel8_mc11
1386 endfunc
1387
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
     @ as mc21, but the horizontal pass reads one row further down
1389         push            {r0, r1, r4, r10, r11, lr}
1390         add             r1,  r1,  r2
1391         b               \type\()_h264_qpel8_mc21
1392 endfunc
1393
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
     @ as mc11 with both offsets: the saved source is source + 1 and the
     @ horizontal pass reads from the original source one row down
1395         add             r1,  r1,  #1
1396         push            {r0, r1, r11, lr}
1397         add             r1,  r1,  r2
1398         sub             r1,  r1,  #1
1399         b               \type\()_h264_qpel8_mc11
1400 endfunc
1401         .endm
1402
1403         h264_qpel8 put
1404         h264_qpel8 avg
1405
1406         .macro h264_qpel16 type
             @ Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon
             @ for X, Y in 0..3 (mc00 is not generated here). The type
             @ argument is put or avg. Structure mirrors the 8x8 set: r0 is
             @ treated as destination, r1 as source, r2 as stride, and
             @ several entry points push their registers, adjust a pointer
             @ and branch to a shared label inside a sibling function, so
             @ push lists must match the pop list at that label.
             @ r4 is saved wherever a 16x16 helper keeps its return address
             @ in it; r9 and r10 likewise for the hv helpers.
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
     @ horizontal lowpass averaged with the source at offset 0
1408         lowpass_const   r3
1409         mov             r3,  r1
1410         sub             r1,  r1,  #2
1411         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1412 endfunc
1413
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
     @ plain horizontal lowpass
1415         lowpass_const   r3
1416         sub             r1,  r1,  #2
1417         mov             r3,  r2
1418         b               \type\()_h264_qpel16_h_lowpass_neon
1419 endfunc
1420
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
     @ horizontal lowpass averaged with the source at offset +1
1422         lowpass_const   r3
1423         add             r3,  r1,  #1
1424         sub             r1,  r1,  #2
1425         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1426 endfunc
1427
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
     @ vertical lowpass averaged with the source row at offset 0
1429         push            {r4, lr}
1430         mov             ip,  r1
1431 \type\()_h264_qpel16_mc01:
     @ shared with mc03; expects {r4, lr} pushed and ip = second input
1432         lowpass_const   r3
1433         mov             r3,  r2
             @ the vertical filter starts two rows above the block
1434         sub             r1,  r1,  r2, lsl #1
1435         vpush           {d8-d15}
1436         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1437         vpop            {d8-d15}
1438         pop             {r4, pc}
1439 endfunc
1440
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
     @ horizontal lowpass into a 16x16 stack temporary, then vertical
     @ lowpass averaged with that temporary
1442         push            {r0, r1, r4, r11, lr}
1443 \type\()_h264_qpel16_mc11:
     @ shared with mc31, mc13, mc33; r1 = source for the horizontal pass
1444         lowpass_const   r3
             @ r11 remembers the unaligned sp (pointing at the saved r0)
1445         mov             r11, sp
1446 A       bic             sp,  sp,  #15
1447 T       bic             r0,  r11, #15
1448 T       mov             sp,  r0
             @ 256 byte (16x16) temporary on the aligned stack
1449         sub             sp,  sp,  #256
1450         mov             r0,  sp
1451         sub             r1,  r1,  #2
1452         mov             r3,  #16
1453         vpush           {d8-d15}
1454         bl              put_h264_qpel16_h_lowpass_neon
             @ reload the saved r0/r1 and step r11 past them
1455         ldrd            r0,  [r11], #8
1456         mov             r3,  r2
             @ skip the 64 bytes taken by vpush to reach the temporary
1457         add             ip,  sp,  #64
1458         sub             r1,  r1,  r2, lsl #1
             @ stride of the temporary
1459         mov             r2,  #16
1460         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1461         vpop            {d8-d15}
1462         mov             sp,  r11
1463         pop             {r4, r11, pc}
1464 endfunc
1465
1466 function ff_\type\()_h264_qpel16_mc21_neon, export=1
     @ packed horizontal lowpass into a stack temporary, then the two-pass
     @ (hv) lowpass averaged with that temporary
1467         push            {r0, r1, r4-r5, r9-r11, lr}
1468 \type\()_h264_qpel16_mc21:
     @ shared with mc23; r1 = source for the horizontal pass
1469         lowpass_const   r3
1470         mov             r11, sp
1471 A       bic             sp,  sp,  #15
1472 T       bic             r0,  r11, #15
1473 T       mov             sp,  r0
             @ 16*16 bytes for the temporary plus 16*12 bytes of hv scratch
1474         sub             sp,  sp,  #(16*16+16*12)
1475         sub             r1,  r1,  #2
1476         mov             r0,  sp
1477         vpush           {d8-d15}
1478         bl              put_h264_qpel16_h_lowpass_neon_packed
             @ NOTE(review): relies on r0 being left just past the 256 byte
             @ temporary by the packed h pass (not visible here); the hv
             @ helper then finds the temporary at r4 - 256
1479         mov             r4,  r0
1480         ldrd            r0,  [r11], #8
             @ hv source starts 2 rows up and 2 bytes left of the block
1481         sub             r1,  r1,  r2, lsl #1
1482         sub             r1,  r1,  #2
1483         mov             r3,  r2
1484         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1485         vpop            {d8-d15}
1486         mov             sp,  r11
1487         pop             {r4-r5, r9-r11, pc}
1488 endfunc
1489
1490 function ff_\type\()_h264_qpel16_mc31_neon, export=1
     @ as mc11, but the saved source (used by the vertical pass) is
     @ source + 1; the horizontal pass still uses the original r1
1491         add             r1,  r1,  #1
1492         push            {r0, r1, r4, r11, lr}
1493         sub             r1,  r1,  #1
1494         b               \type\()_h264_qpel16_mc11
1495 endfunc
1496
1497 function ff_\type\()_h264_qpel16_mc02_neon, export=1
     @ plain vertical lowpass, starting two rows above the block
1498         push            {r4, lr}
1499         lowpass_const   r3
1500         sub             r1,  r1,  r2, lsl #1
1501         mov             r3,  r2
1502         vpush           {d8-d15}
1503         bl              \type\()_h264_qpel16_v_lowpass_neon
1504         vpop            {d8-d15}
1505         pop             {r4, pc}
1506 endfunc
1507
1508 function ff_\type\()_h264_qpel16_mc12_neon, export=1
     @ packed vertical lowpass into a stack temporary, then the two-pass
     @ (hv) lowpass averaged with that temporary
1509         push            {r0, r1, r4-r5, r9-r11, lr}
1510 \type\()_h264_qpel16_mc12:
     @ shared with mc32; r1 = source for the vertical pass
1511         lowpass_const   r3
1512         mov             r11, sp
1513 A       bic             sp,  sp,  #15
1514 T       bic             r0,  r11, #15
1515 T       mov             sp,  r0
             @ 16*16 bytes for the temporary plus 16*12 bytes of hv scratch
1516         sub             sp,  sp,  #(16*16+16*12)
1517         sub             r1,  r1,  r2, lsl #1
1518         mov             r0,  sp
1519         mov             r3,  r2
1520         vpush           {d8-d15}
1521         bl              put_h264_qpel16_v_lowpass_neon_packed
             @ the packed v pass wrote four 64 byte blocks, so r0 is now
             @ 256 bytes past the start of the temporary
1522         mov             r4,  r0
1523         ldrd            r0,  [r11], #8
             @ r3 still holds the picture stride; the packed routine set
             @ r2 to 8, so r2 is restored from r3 below
1524         sub             r1,  r1,  r3, lsl #1
1525         sub             r1,  r1,  #2
1526         mov             r2,  r3
1527         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1528         vpop            {d8-d15}
1529         mov             sp,  r11
1530         pop             {r4-r5, r9-r11, pc}
1531 endfunc
1532
1533 function ff_\type\()_h264_qpel16_mc22_neon, export=1
     @ plain two-pass (hv) lowpass; only the 16*12 byte scratch area is
     @ needed, handed over in r4
1534         push            {r4, r9-r11, lr}
1535         lowpass_const   r3
1536         mov             r11, sp
1537 A       bic             sp,  sp,  #15
1538 T       bic             r4,  r11, #15
1539 T       mov             sp,  r4
1540         sub             r1,  r1,  r2, lsl #1
1541         sub             r1,  r1,  #2
1542         mov             r3,  r2
1543         sub             sp,  sp,  #(16*12)
1544         mov             r4,  sp
1545         vpush           {d8-d15}
1546         bl              \type\()_h264_qpel16_hv_lowpass_neon
1547         vpop            {d8-d15}
1548         mov             sp,  r11
1549         pop             {r4, r9-r11, pc}
1550 endfunc
1551
1552 function ff_\type\()_h264_qpel16_mc32_neon, export=1
     @ as mc12, but the vertical pass reads from source + 1 (the saved
     @ r1 used for the hv pass is the original source)
1553         push            {r0, r1, r4-r5, r9-r11, lr}
1554         add             r1,  r1,  #1
1555         b               \type\()_h264_qpel16_mc12
1556 endfunc
1557
1558 function ff_\type\()_h264_qpel16_mc03_neon, export=1
     @ as mc01, but the second input is the source one row further down
1559         push            {r4, lr}
1560         add             ip,  r1,  r2
1561         b               \type\()_h264_qpel16_mc01
1562 endfunc
1563
1564 function ff_\type\()_h264_qpel16_mc13_neon, export=1
     @ as mc11, but the horizontal pass reads one row further down
1565         push            {r0, r1, r4, r11, lr}
1566         add             r1,  r1,  r2
1567         b               \type\()_h264_qpel16_mc11
1568 endfunc
1569
1570 function ff_\type\()_h264_qpel16_mc23_neon, export=1
     @ as mc21, but the horizontal pass reads one row further down
1571         push            {r0, r1, r4-r5, r9-r11, lr}
1572         add             r1,  r1,  r2
1573         b               \type\()_h264_qpel16_mc21
1574 endfunc
1575
1576 function ff_\type\()_h264_qpel16_mc33_neon, export=1
     @ as mc11 with both offsets: the saved source is source + 1 and the
     @ horizontal pass reads from the original source one row down
1577         add             r1,  r1,  #1
1578         push            {r0, r1, r4, r11, lr}
1579         add             r1,  r1,  r2
1580         sub             r1,  r1,  #1
1581         b               \type\()_h264_qpel16_mc11
1582 endfunc
1583         .endm
1584
1585         h264_qpel16 put
1586         h264_qpel16 avg
1587
1588 @ Biweighted prediction
1589
1590         .macro  biweight_16 macs, macd
             @ Loop body and function tail for 16 byte wide rows; it ends
             @ with pop {r4-r6, pc} and so returns from the function that
             @ expands it.
             @ macd / macs: widening multiply-accumulate (or subtract)
             @ instructions applied to the rows read from r0 and r1
             @ respectively.
             @ Expects: r0, r1 = the two 16-byte aligned inputs, r2 =
             @ stride, r3 = row count (two rows per iteration), r4, r5 =
             @ weights for the r0 and r1 rows, r6 = output pointer,
             @ q8 = initial accumulator value, q9 = per-lane shift counts
             @ for vshl.s16 (q8, q9 and r6 are set up by biweight_func).
1591         vdup.8          d0,  r4
1592         vdup.8          d1,  r5
1593         vmov            q2,  q8
1594         vmov            q3,  q8
1595 1:      subs            r3,  r3,  #2
             @ first row: accumulate both weighted inputs into q2/q3
1596         vld1.8          {d20-d21},[r0,:128], r2
1597         \macd           q2,  d0,  d20
1598         pld             [r0]
1599         \macd           q3,  d0,  d21
1600         vld1.8          {d22-d23},[r1,:128], r2
1601         \macs           q2,  d1,  d22
1602         pld             [r1]
1603         \macs           q3,  d1,  d23
             @ second row: same into q12/q13
1604         vmov            q12, q8
1605         vld1.8          {d28-d29},[r0,:128], r2
1606         vmov            q13, q8
1607         \macd           q12, d0,  d28
1608         pld             [r0]
1609         \macd           q13, d0,  d29
1610         vld1.8          {d30-d31},[r1,:128], r2
1611         \macs           q12, d1,  d30
1612         pld             [r1]
1613         \macs           q13, d1,  d31
             @ shift by q9, then narrow to bytes with unsigned saturation
1614         vshl.s16        q2,  q2,  q9
1615         vshl.s16        q3,  q3,  q9
1616         vqmovun.s16     d4,  q2
1617         vqmovun.s16     d5,  q3
1618         vshl.s16        q12, q12, q9
1619         vshl.s16        q13, q13, q9
1620         vqmovun.s16     d24, q12
1621         vqmovun.s16     d25, q13
             @ re-seed the accumulators for the next iteration, interleaved
             @ with the two row stores
1622         vmov            q3,  q8
1623         vst1.8          {d4- d5}, [r6,:128], r2
1624         vmov            q2,  q8
1625         vst1.8          {d24-d25},[r6,:128], r2
             @ flags are still those of the subs at the top of the loop
1626         bne             1b
1627         pop             {r4-r6, pc}
1628         .endm
1629
1630         .macro  biweight_8 macs, macd
             @ Same scheme as biweight_16 for 8 byte wide rows: two rows
             @ per iteration, accumulated in q1 and q10, and the macro ends
             @ by returning with pop {r4-r6, pc}.
             @ Expects r0, r1 = 8-byte aligned inputs, r2 = stride, r3 =
             @ row count, r4, r5 = weights, r6 = output pointer, q8 =
             @ initial accumulator value, q9 = shift counts for vshl.s16.
1631         vdup.8          d0,  r4
1632         vdup.8          d1,  r5
1633         vmov            q1,  q8
1634         vmov            q10, q8
1635 1:      subs            r3,  r3,  #2
             @ first row into q1
1636         vld1.8          {d4},[r0,:64], r2
1637         \macd           q1,  d0,  d4
1638         pld             [r0]
1639         vld1.8          {d5},[r1,:64], r2
1640         \macs           q1,  d1,  d5
1641         pld             [r1]
             @ second row into q10
1642         vld1.8          {d6},[r0,:64], r2
1643         \macd           q10, d0,  d6
1644         pld             [r0]
1645         vld1.8          {d7},[r1,:64], r2
1646         \macs           q10, d1,  d7
1647         pld             [r1]
             @ shift by q9, then narrow to bytes with unsigned saturation
1648         vshl.s16        q1,  q1,  q9
1649         vqmovun.s16     d2,  q1
1650         vshl.s16        q10, q10, q9
1651         vqmovun.s16     d4,  q10
             @ re-seed the accumulators around the two row stores; the store
             @ of d2 precedes the vmov that overwrites q1 (d2/d3)
1652         vmov            q10, q8
1653         vst1.8          {d2},[r6,:64], r2
1654         vmov            q1,  q8
1655         vst1.8          {d4},[r6,:64], r2
             @ flags are still those of the subs at the top of the loop
1656         bne             1b
1657         pop             {r4-r6, pc}
1658         .endm
1659
1660         .macro  biweight_4 macs, macd
             @ Same scheme as biweight_8 for 4 byte wide rows. Two rows are
             @ packed into one d register, so an iteration handles four
             @ rows (r3 is decremented by 4). If r3 goes negative after the
             @ first pair, only that pair is finished, at label 2.
             @ Both exits return with pop {r4-r6, pc}.
             @ Expects r0, r1 = 4-byte aligned inputs, r2 = stride, r3 =
             @ row count, r4, r5 = weights, r6 = output pointer, q8 =
             @ initial accumulator value, q9 = shift counts for vshl.s16.
1661         vdup.8          d0,  r4
1662         vdup.8          d1,  r5
1663         vmov            q1,  q8
1664         vmov            q10, q8
1665 1:      subs            r3,  r3,  #4
             @ rows 0 and 1 share d4 (from r0) and d5 (from r1)
1666         vld1.32         {d4[0]},[r0,:32], r2
1667         vld1.32         {d4[1]},[r0,:32], r2
1668         \macd           q1,  d0,  d4
1669         pld             [r0]
1670         vld1.32         {d5[0]},[r1,:32], r2
1671         vld1.32         {d5[1]},[r1,:32], r2
1672         \macs           q1,  d1,  d5
1673         pld             [r1]
             @ fewer than 4 rows were left: finish the pair at 2 and return
1674         blt             2f
             @ rows 2 and 3 share d6 (from r0) and d7 (from r1)
1675         vld1.32         {d6[0]},[r0,:32], r2
1676         vld1.32         {d6[1]},[r0,:32], r2
1677         \macd           q10, d0,  d6
1678         pld             [r0]
1679         vld1.32         {d7[0]},[r1,:32], r2
1680         vld1.32         {d7[1]},[r1,:32], r2
1681         \macs           q10, d1,  d7
1682         pld             [r1]
             @ shift by q9, then narrow to bytes with unsigned saturation
1683         vshl.s16        q1,  q1,  q9
1684         vqmovun.s16     d2,  q1
1685         vshl.s16        q10, q10, q9
1686         vqmovun.s16     d4,  q10
             @ re-seed the accumulators around the four row stores; d2 is
             @ fully stored before the vmov that overwrites q1
1687         vmov            q10, q8
1688         vst1.32         {d2[0]},[r6,:32], r2
1689         vst1.32         {d2[1]},[r6,:32], r2
1690         vmov            q1,  q8
1691         vst1.32         {d4[0]},[r6,:32], r2
1692         vst1.32         {d4[1]},[r6,:32], r2
             @ flags are still those of the subs at the top of the loop
1693         bne             1b
1694         pop             {r4-r6, pc}
             @ tail for a final pair of rows
1695 2:      vshl.s16        q1,  q1,  q9
1696         vqmovun.s16     d2,  q1
1697         vst1.32         {d2[0]},[r6,:32], r2
1698         vst1.32         {d2[1]},[r6,:32], r2
1699         pop             {r4-r6, pc}
1700         .endm
1701
1702         .macro  biweight_func w
             @ Defines the exported function ff_biweight_h264_pixels_<w>_neon.
             @ It sets up the registers the biweight_<w> loop expects and then
             @ jumps to one of four expansions of that loop, chosen from the
             @ top bits of the two weights.  Every expansion ends in its own
             @ pop {r4-r6, pc}, so nothing falls through between labels 10-40.
             @ Four words are read from the stack: the first into r12 (shift
             @ amount), the next three into r4, r5 (weights) and r6 (offset).
             @ r0 is copied to r6 and used as the write pointer by the loop.
             @ NOTE(review): presumably r0 = dst, r1 = src, r2 = stride and
             @ r3 = height; confirm against the C prototype.
1703 function ff_biweight_h264_pixels_\w\()_neon, export=1
1704         push            {r4-r6, lr}                @ 16 bytes, hence the sp offsets below
1705         ldr             r12, [sp, #16]             @ first stack argument
1706         add             r4,  sp,  #20
1707         ldm             r4,  {r4-r6}               @ next three stack arguments
1708         lsr             lr,  r4,  #31              @ lr = top bit of r4
1709         add             r6,  r6,  #1
1710         eors            lr,  lr,  r5,  lsr #30     @ mix in top two bits of r5; Z flag is used by beq 10f
1711         orr             r6,  r6,  #1               @ r6 = (r6 + 1) | 1
1712         vdup.16         q9,  r12
1713         lsl             r6,  r6,  r12
1714         vmvn            q9,  q9                    @ q9 lanes = -(r12 + 1): the loop shifts right by r12 + 1
1715         vdup.16         q8,  r6                    @ q8 = accumulator seed (low 16 bits of r6)
1716         mov             r6,  r0                    @ r6 = write pointer
             @ None of the instructions since eors touch the flags.
             @ lr is 0 when the top bits of r4 and r5 are clear, 1 when only
             @ r4 has its top bit set, 2 when both weights have their top bits
             @ set and 3 when only r5 does (assuming the top two bits of r5
             @ agree, as they do for sign-extended weights of small magnitude).
1717         beq             10f
1718         subs            lr,  lr,  #1
1719         beq             20f
1720         subs            lr,  lr,  #1
1721         beq             30f
1722         b               40f
             @ In the non-zero cases the weight with the top bit set is negated
             @ and the matching accumulate op becomes a multiply-subtract.
1723 10:     biweight_\w     vmlal.u8, vmlal.u8
1724 20:     rsb             r4,  r4,  #0
1725         biweight_\w     vmlal.u8, vmlsl.u8
1726 30:     rsb             r4,  r4,  #0
1727         rsb             r5,  r5,  #0
1728         biweight_\w     vmlsl.u8, vmlsl.u8
1729 40:     rsb             r5,  r5,  #0
1730         biweight_\w     vmlsl.u8, vmlal.u8
1731 endfunc
1732         .endm
1733
1734         biweight_func   16                         @ defines ff_biweight_h264_pixels_16_neon
1735         biweight_func   8                          @ defines ff_biweight_h264_pixels_8_neon
1736         biweight_func   4                          @ defines ff_biweight_h264_pixels_4_neon
1737
1738 @ Weighted prediction
1739
1740         .macro  weight_16 add
             @ Loop body for the 16-pixel-wide weight function: scales two
             @ 16-byte rows per iteration and writes them through r4.
             @   add  the add or subtract mnemonic used to combine q8 with the
             @        product; weight_func passes vhadd, vhsub, vadd or vsub
             @ Register state expected on entry (prepared by weight_func):
             @   r0   read pointer, advanced by r1 after every row
             @   r2   row counter, decremented by 2 per iteration
             @   r4   write pointer, advanced by r1 after every row
             @   r12  multiplier; only the low byte is used
             @   q8   value combined with every product, q9 = shift counts
             @ Ends with pop {r4, pc}, i.e. it returns from the function.
             @ NOTE(review): r2 has to be a positive even number, otherwise
             @ the counter never hits zero and the loop does not terminate.
1741         vdup.8          d0,  r12                   @ d0 = low byte of r12 in all 8 lanes
1742 1:      subs            r2,  r2,  #2               @ flags stay live until the bne below
1743         vld1.8          {d20-d21},[r0,:128], r1    @ row 0 (128-bit aligned access)
1744         vmull.u8        q2,  d0,  d20              @ widening multiply, low half of row 0
1745         pld             [r0]
1746         vmull.u8        q3,  d0,  d21              @ high half of row 0
1747         vld1.8          {d28-d29},[r0,:128], r1    @ row 1
1748         vmull.u8        q12, d0,  d28
1749         pld             [r0]
1750         vmull.u8        q13, d0,  d29
1751         \add            q2,  q8,  q2               @ q8 combined with the product (q8 is the first operand)
1752         vrshl.s16       q2,  q2,  q9               @ rounding shift by the counts in q9
1753         \add            q3,  q8,  q3
1754         vrshl.s16       q3,  q3,  q9
1755         vqmovun.s16     d4,  q2                    @ saturate to unsigned 8 bit
1756         vqmovun.s16     d5,  q3
1757         \add            q12, q8,  q12
1758         vrshl.s16       q12, q12, q9
1759         \add            q13, q8,  q13
1760         vrshl.s16       q13, q13, q9
1761         vqmovun.s16     d24, q12
1762         vqmovun.s16     d25, q13
1763         vst1.8          {d4- d5}, [r4,:128], r1
1764         vst1.8          {d24-d25},[r4,:128], r1
1765         bne             1b                         @ loop until the row counter is zero
1766         pop             {r4, pc}
1767         .endm
1768
1769         .macro  weight_8 add
             @ Loop body for the 8-pixel-wide weight function: scales two
             @ 8-byte rows per iteration and writes them through r4.
             @   add  the add or subtract mnemonic used to combine q8 with the
             @        product; weight_func passes vhadd, vhsub, vadd or vsub
             @ Register state expected on entry (prepared by weight_func):
             @   r0   read pointer, advanced by r1 after every row
             @   r2   row counter, decremented by 2 per iteration
             @   r4   write pointer, advanced by r1 after every row
             @   r12  multiplier; only the low byte is used
             @   q8   value combined with every product, q9 = shift counts
             @ Ends with pop {r4, pc}, i.e. it returns from the function.
             @ NOTE(review): r2 has to be a positive even number, otherwise
             @ the counter never hits zero and the loop does not terminate.
1770         vdup.8          d0,  r12                   @ d0 = low byte of r12 in all 8 lanes
1771 1:      subs            r2,  r2,  #2               @ flags stay live until the bne below
1772         vld1.8          {d4},[r0,:64], r1          @ row 0 (64-bit aligned access)
1773         vmull.u8        q1,  d0,  d4               @ widening multiply to 16 bit
1774         pld             [r0]
1775         vld1.8          {d6},[r0,:64], r1          @ row 1
1776         vmull.u8        q10, d0,  d6
1777         \add            q1,  q8,  q1               @ q8 combined with the product (q8 is the first operand)
1778         pld             [r0]
1779         vrshl.s16       q1,  q1,  q9               @ rounding shift by the counts in q9
1780         vqmovun.s16     d2,  q1                    @ saturate to unsigned 8 bit
1781         \add            q10, q8,  q10
1782         vrshl.s16       q10, q10, q9
1783         vqmovun.s16     d4,  q10
1784         vst1.8          {d2},[r4,:64], r1
1785         vst1.8          {d4},[r4,:64], r1
1786         bne             1b                         @ loop until the row counter is zero
1787         pop             {r4, pc}
1788         .endm
1789
1790         .macro  weight_4 add
             @ Loop body for the 4-pixel-wide weight function.  Two 4-byte
             @ rows are packed into one d register (32-bit lanes 0 and 1), so
             @ a full iteration handles four rows.  A trailing group of only
             @ two rows is finished at label 2.
             @   add  the add or subtract mnemonic used to combine q8 with the
             @        product; weight_func passes vhadd, vhsub, vadd or vsub
             @ Register state expected on entry (prepared by weight_func):
             @   r0   read pointer, advanced by r1 after every row
             @   r2   row counter, decremented by 4 per iteration
             @   r4   write pointer, advanced by r1 after every row
             @   r12  multiplier; only the low byte is used
             @   q8   value combined with every product, q9 = shift counts
             @ Both exits end with pop {r4, pc}, i.e. they return from the
             @ function.
             @ q1 and q10 need no initialisation: vmull.u8 overwrites all of
             @ their lanes before they are read.
             @ NOTE(review): the exit tests only work for a row count that is
             @ a positive multiple of 2.
1791         vdup.8          d0,  r12                   @ d0 = low byte of r12 in all 8 lanes
1794 1:      subs            r2,  r2,  #4               @ flags are consumed by blt and bne below
1795         vld1.32         {d4[0]},[r0,:32], r1       @ row 0
1796         vld1.32         {d4[1]},[r0,:32], r1       @ row 1
1797         vmull.u8        q1,  d0,  d4               @ widening multiply, rows 0-1
1798         pld             [r0]
1799         blt             2f                         @ counter went negative: only two rows were left
1800         vld1.32         {d6[0]},[r0,:32], r1       @ row 2
1801         vld1.32         {d6[1]},[r0,:32], r1       @ row 3
1802         vmull.u8        q10, d0,  d6               @ widening multiply, rows 2-3
1803         pld             [r0]
1804         \add            q1,  q8,  q1               @ q8 combined with the product (q8 is the first operand)
1805         vrshl.s16       q1,  q1,  q9               @ rounding shift by the counts in q9
1806         vqmovun.s16     d2,  q1                    @ saturate to unsigned 8 bit
1807         \add            q10, q8,  q10
1808         vrshl.s16       q10, q10, q9
1809         vqmovun.s16     d4,  q10
1811         vst1.32         {d2[0]},[r4,:32], r1
1812         vst1.32         {d2[1]},[r4,:32], r1
1814         vst1.32         {d4[0]},[r4,:32], r1
1815         vst1.32         {d4[1]},[r4,:32], r1
1816         bne             1b                         @ loop until the row counter is zero
1817         pop             {r4, pc}
1818 2:      \add            q1,  q8,  q1               @ tail: finish the last two rows only
1819         vrshl.s16       q1,  q1,  q9
1820         vqmovun.s16     d2,  q1
1821         vst1.32         {d2[0]},[r4,:32], r1
1822         vst1.32         {d2[1]},[r4,:32], r1
1823         pop             {r4, pc}
1824         .endm
1825
1826         .macro  weight_func w
             @ Defines the exported function ff_weight_h264_pixels_<w>_neon.
             @ It sets up the registers the weight_<w> loop expects and then
             @ runs one of four expansions of that loop, chosen by the shift
             @ amount in r3 and the sign of the multiplier in r12.  Every
             @ expansion ends in its own pop {r4, pc}, so nothing falls
             @ through from one expansion into the next label.
             @ Two words are read from the stack: the first into r12 (the
             @ multiplier) and the second into r4, which is shifted left by
             @ r3 and broadcast into q8.  r0 is then copied to r4 and used as
             @ the write pointer by the loop.
             @ NOTE(review): presumably r0 = block, r1 = stride, r2 = height
             @ and r3 = log2_denom; confirm against the C prototype.
1827 function ff_weight_h264_pixels_\w\()_neon, export=1
1828         push            {r4, lr}                   @ 8 bytes, hence the sp offsets below
1829         ldr             r12, [sp, #8]              @ first stack argument
1830         ldr             r4,  [sp, #12]             @ second stack argument
1831         cmp             r3,  #1                    @ result is consumed by ble 20f below
1832         lsl             r4,  r4,  r3
1833         vdup.16         q8,  r4                    @ q8 = low 16 bits of (r4 << r3) in every lane
1834         mov             r4,  r0                    @ r4 = write pointer
1835         ble             20f                        @ r3 <= 1: use the non-halving variants
             @ r3 > 1: the halving add or subtract already divides by two, so
             @ the remaining rounding right shift is by r3 - 1.
1836         rsb             lr,  r3,  #1
1837         vdup.16         q9,  lr                    @ q9 lanes = 1 - r3
1838         cmp             r12, #0
1839         blt             10f                        @ nearest following 10: label
1840         weight_\w       vhadd.s16
1841 10:     rsb             r12, r12, #0               @ negative multiplier: negate it and subtract instead
1842         weight_\w       vhsub.s16
1843 20:     rsb             lr,  r3,  #0
1844         vdup.16         q9,  lr                    @ q9 lanes = -r3
1845         cmp             r12, #0
1846         blt             10f                        @ nearest following 10: label
1847         weight_\w       vadd.s16
1848 10:     rsb             r12, r12, #0               @ negative multiplier: negate it and subtract instead
1849         weight_\w       vsub.s16
1850 endfunc
1851         .endm
1852
1853         weight_func     16                         @ defines ff_weight_h264_pixels_16_neon
1854         weight_func     8                          @ defines ff_weight_h264_pixels_8_neon
1855         weight_func     4                          @ defines ff_weight_h264_pixels_4_neon