/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
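@ 2-D bilinear chroma interpolation.  With A = (8-x)*(8-y), B = x*(8-y),
@ C = (8-x)*y and D = x*y, each output pixel is
@     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride]
@               + D*src[i+stride+1] + 32) >> 6
@ The muls/rsb/sub sequence below derives A (r4), B (ip), C (r6) and
@ D (r7) from x and y; when x*y == 0 the filter degenerates to a 1-D
@ filter (or a plain copy), handled on the separate paths after label 2:.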
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
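@ Same bilinear filter for 4-pixel-wide blocks: vtrn.32 packs two
@ 4-byte rows into one d register so each vmull.u8 still performs a
@ full 8-byte multiply, and the row halves are folded with vadd.i16.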
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
        .endm

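/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 2-pixel-wide variant; the x == y == 0 case at label 2: is a plain
@ copy (put) or rounding average (avg).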
        .macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
.ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
.endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vrhadd.u8       d16, d16, d18
.endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
.ifc \type,put
        ldrh            r5,  [r1], r2
        strh            r5,  [r0], r2
        ldrh            r6,  [r1], r2
        strh            r6,  [r0], r2
.else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
.endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

        /* H.264 loop filter */

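@ Common prologue for the loop filter entry points: loads the four
@ tc0 bytes into d24[0] and takes the early exits: return if alpha
@ or beta is zero, or if all four tc0 values are negative (nothing
@ to filter on this edge).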
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

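@ Spill d8-d15 to a 16-byte-aligned block below sp; the alignment
@ adjustment is kept in ip so align_pop_regs can restore sp exactly.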
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

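@ Normal (bS < 4) luma filter on rows p2 p1 p0 q0 q1 q2 held in
@ q10 q9 q8 q0 q1 q2.  A pixel is filtered when
@     |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
@ and the core update is
@     delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
@     p0' = clip(p0 + delta),  q0' = clip(q0 - delta)
@ with p1/q1 also adjusted (and tc widened) where |p2-p0| < beta or
@ |q2-q0| < beta.  Outputs: p1' in q4, p0' in q8, q0' in q0, q1' in q5.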
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
endfunc

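@ Chroma filter: only p0/q0 are updated, using the same alpha/beta
@ tests and the same delta as the luma filter, clipped to the
@ per-edge tc values replicated across d24 by the vsli.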
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vclt.u8         d30, d30, d22   @ < beta
        vmin.s8         d4,  d4,  d24
        vneg.s8         d25, d24
        vand            d26, d26, d28
        vmax.s8         d4,  d4,  d25
        vand            d26, d26, d30
        vmovl.u8        q11, d0
        vand            d4,  d4,  d26
        vmovl.u8        q14, d16
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
endfunc

        /* H.264 qpel MC */

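@ The qpel half-pel interpolation uses the 6-tap (1, -5, 20, 20, -5, 1)
@ kernel.  lowpass_const puts the constants 5 and 20 into the first two
@ 16-bit lanes of d6, where the vmla.i16/vmls.i16 scalar forms read them.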
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

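@ Second (vertical) pass over 16-bit intermediates from the first
@ pass: the multiplies become shift-and-add (20x = 16x + 4x,
@ 5x = 4x + x) in 32 bits, and the result is narrowed with a single
@ combined rounding shift of #10 (5 bits per pass).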
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

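@ The 16-wide and 16-tall functions below are built from the 8-wide
@ primitives, rewinding r0/r1 between calls to cover each half of the
@ block.  The _packed variants store rows contiguously (stride 8) into
@ a scratch buffer for a later combining pass.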
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

        .macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
.ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
.endif
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
        .endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

        .macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
.ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
.endif
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
        .endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

        .macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3
.endif

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
endfunc
        .endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

        .macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
.endif

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
endfunc
        .endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

        .macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3
.endif
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
        .endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

        .macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
.ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3
.endif
        vst1.64         {d0},      [r0,:64], r3
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
        .endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

        .macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
        .endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

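@ ff_{put,avg}_h264_qpel8_mcXY_neon: X and Y are the quarter-pel x/y
@ offsets of the motion vector.  mc20/mc02 are the pure half-pel
@ cases; mc10/mc30 (mc01/mc03) average the half-pel result with the
@ nearest full-pel column (row); mc22 is the 2-D half-pel case; the
@ remaining diagonals average two half-pel planes via the _l2 helpers.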
1223         .macro h264_qpel8 type
1224 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1225         lowpass_const   r3
1226         mov             r3,  r1
1227         sub             r1,  r1,  #2
1228         mov             ip,  #8
1229         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1230 endfunc
1231
1232 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1233         lowpass_const   r3
1234         sub             r1,  r1,  #2
1235         mov             r3,  r2
1236         mov             ip,  #8
1237         b               \type\()_h264_qpel8_h_lowpass_neon
1238 endfunc
1239
1240 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1241         lowpass_const   r3
1242         add             r3,  r1,  #1
1243         sub             r1,  r1,  #2
1244         mov             ip,  #8
1245         b               \type\()_h264_qpel8_h_lowpass_l2_neon
1246 endfunc
1247
1248 function ff_\type\()_h264_qpel8_mc01_neon, export=1
1249         push            {lr}
1250         mov             ip,  r1
1251 \type\()_h264_qpel8_mc01:
1252         lowpass_const   r3
1253         mov             r3,  r2
1254         sub             r1,  r1,  r2, lsl #1
1255         vpush           {d8-d15}
1256         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1257         vpop            {d8-d15}
1258         pop             {pc}
1259 endfunc
1260
1261 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1262         push            {r0, r1, r11, lr}
1263 \type\()_h264_qpel8_mc11:
1264         lowpass_const   r3
1265         mov             r11, sp
1266         bic             sp,  sp,  #15
1267         sub             sp,  sp,  #64
1268         mov             r0,  sp
1269         sub             r1,  r1,  #2
1270         mov             r3,  #8
1271         mov             ip,  #8
1272         vpush           {d8-d15}
1273         bl              put_h264_qpel8_h_lowpass_neon
1274         ldrd            r0,  [r11]
1275         mov             r3,  r2
1276         add             ip,  sp,  #64
1277         sub             r1,  r1,  r2, lsl #1
1278         mov             r2,  #8
1279         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1280         vpop            {d8-d15}
1281         add             sp,  r11, #8
1282         pop             {r11, pc}
1283 endfunc
1284
1285 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1286         push            {r0, r1, r4, r10, r11, lr}
1287 \type\()_h264_qpel8_mc21:
1288         lowpass_const   r3
1289         mov             r11, sp
1290         bic             sp,  sp,  #15
1291         sub             sp,  sp,  #(8*8+16*12)
1292         sub             r1,  r1,  #2
1293         mov             r3,  #8
1294         mov             r0,  sp
1295         mov             ip,  #8
1296         vpush           {d8-d15}
1297         bl              put_h264_qpel8_h_lowpass_neon
1298         mov             r4,  r0
1299         ldrd            r0,  [r11]
1300         sub             r1,  r1,  r2, lsl #1
1301         sub             r1,  r1,  #2
1302         mov             r3,  r2
1303         sub             r2,  r4,  #64
1304         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1305         vpop            {d8-d15}
1306         add             sp,  r11,  #8
1307         pop             {r4, r10, r11, pc}
1308 endfunc
1309
1310 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1311         add             r1,  r1,  #1
1312         push            {r0, r1, r11, lr}
1313         sub             r1,  r1,  #1
1314         b               \type\()_h264_qpel8_mc11
1315 endfunc
1316
1317 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1318         push            {lr}
1319         lowpass_const   r3
1320         sub             r1,  r1,  r2, lsl #1
1321         mov             r3,  r2
1322         vpush           {d8-d15}
1323         bl              \type\()_h264_qpel8_v_lowpass_neon
1324         vpop            {d8-d15}
1325         pop             {pc}
1326 endfunc
1327
1328 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1329         push            {r0, r1, r4, r10, r11, lr}
1330 \type\()_h264_qpel8_mc12:
1331         lowpass_const   r3
1332         mov             r11, sp
1333         bic             sp,  sp,  #15
1334         sub             sp,  sp,  #(8*8+16*12)
1335         sub             r1,  r1,  r2, lsl #1
1336         mov             r3,  r2
1337         mov             r2,  #8
1338         mov             r0,  sp
1339         vpush           {d8-d15}
1340         bl              put_h264_qpel8_v_lowpass_neon
1341         mov             r4,  r0
1342         ldrd            r0,  [r11]
1343         sub             r1,  r1,  r3, lsl #1
1344         sub             r1,  r1,  #2
1345         sub             r2,  r4,  #64
1346         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1347         vpop            {d8-d15}
1348         add             sp,  r11,  #8
1349         pop             {r4, r10, r11, pc}
1350 endfunc
1351
1352 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1353         push            {r4, r10, r11, lr}
1354         mov             r11, sp
1355         bic             sp,  sp,  #15
1356         sub             r1,  r1,  r2, lsl #1
1357         sub             r1,  r1,  #2
1358         mov             r3,  r2
1359         sub             sp,  sp,  #(16*12)
1360         mov             r4,  sp
1361         vpush           {d8-d15}
1362         bl              \type\()_h264_qpel8_hv_lowpass_neon
1363         vpop            {d8-d15}
1364         mov             sp,  r11
1365         pop             {r4, r10, r11, pc}
1366 endfunc
1367
1368 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1369         push            {r0, r1, r4, r10, r11, lr}
1370         add             r1,  r1,  #1
1371         b               \type\()_h264_qpel8_mc12
1372 endfunc
1373
1374 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1375         push            {lr}
1376         add             ip,  r1,  r2
1377         b               \type\()_h264_qpel8_mc01
1378 endfunc
1379
1380 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1381         push            {r0, r1, r11, lr}
1382         add             r1,  r1,  r2
1383         b               \type\()_h264_qpel8_mc11
1384 endfunc
1385
1386 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1387         push            {r0, r1, r4, r10, r11, lr}
1388         add             r1,  r1,  r2
1389         b               \type\()_h264_qpel8_mc21
1390 endfunc
1391
1392 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1393         add             r1,  r1,  #1
1394         push            {r0, r1, r11, lr}
1395         add             r1,  r1,  r2
1396         sub             r1,  r1,  #1
1397         b               \type\()_h264_qpel8_mc11
1398 endfunc
1399         .endm
1400
1401         h264_qpel8 put
1402         h264_qpel8 avg
1403
1404         .macro h264_qpel16 type
1405 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1406         lowpass_const   r3
1407         mov             r3,  r1
1408         sub             r1,  r1,  #2
1409         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1410 endfunc
1411
1412 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1413         lowpass_const   r3
1414         sub             r1,  r1,  #2
1415         mov             r3,  r2
1416         b               \type\()_h264_qpel16_h_lowpass_neon
1417 endfunc
1418
1419 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1420         lowpass_const   r3
1421         add             r3,  r1,  #1
1422         sub             r1,  r1,  #2
1423         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1424 endfunc
1425
1426 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1427         push            {r4, lr}
1428         mov             ip,  r1
1429 \type\()_h264_qpel16_mc01:
1430         lowpass_const   r3
1431         mov             r3,  r2
1432         sub             r1,  r1,  r2, lsl #1
1433         vpush           {d8-d15}
1434         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1435         vpop            {d8-d15}
1436         pop             {r4, pc}
1437 endfunc
1438
1439 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1440         push            {r0, r1, r4, r11, lr}
1441 \type\()_h264_qpel16_mc11:
1442         lowpass_const   r3
1443         mov             r11, sp
1444         bic             sp,  sp,  #15
1445         sub             sp,  sp,  #256
1446         mov             r0,  sp
1447         sub             r1,  r1,  #2
1448         mov             r3,  #16
1449         vpush           {d8-d15}
1450         bl              put_h264_qpel16_h_lowpass_neon
1451         ldrd            r0,  [r11]
1452         mov             r3,  r2
1453         add             ip,  sp,  #64
1454         sub             r1,  r1,  r2, lsl #1
1455         mov             r2,  #16
1456         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1457         vpop            {d8-d15}
1458         add             sp,  r11, #8
1459         pop             {r4, r11, pc}
1460 endfunc
1461
1462 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1463         push            {r0, r1, r4-r5, r9-r11, lr}
1464 \type\()_h264_qpel16_mc21:
1465         lowpass_const   r3
1466         mov             r11, sp
1467         bic             sp,  sp,  #15
1468         sub             sp,  sp,  #(16*16+16*12)
1469         sub             r1,  r1,  #2
1470         mov             r0,  sp
1471         vpush           {d8-d15}
1472         bl              put_h264_qpel16_h_lowpass_neon_packed
1473         mov             r4,  r0
1474         ldrd            r0,  [r11]
1475         sub             r1,  r1,  r2, lsl #1
1476         sub             r1,  r1,  #2
1477         mov             r3,  r2
1478         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1479         vpop            {d8-d15}
1480         add             sp,  r11,  #8
1481         pop             {r4-r5, r9-r11, pc}
1482 endfunc
1483
1484 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1485         add             r1,  r1,  #1
1486         push            {r0, r1, r4, r11, lr}
1487         sub             r1,  r1,  #1
1488         b               \type\()_h264_qpel16_mc11
1489 endfunc
1490
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1    @ back up two rows for the 6-tap filter
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
endfunc

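@ mc22: the centre half-pel position needs only the single 2-D hv
@ filter pass; r4 passes a 16*12-byte stack area as row buffer.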
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp                 @ r4 -> row buffer for the hv pass
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1            @ halfV is taken from src + 1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2            @ second source: one row down
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2            @ halfH is taken one row down
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2            @ halfH is taken one row down
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1            @ halfV from src + 1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            @ halfH from src + stride
        b               \type\()_h264_qpel16_mc11
endfunc
        .endm

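@ Instantiate the qpel16 entry points for both put and avg.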
        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction
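@ ff_biweight_h264_pixels_WxH_neon(uint8_t *dst, uint8_t *src,
@                                  int stride, int log2_denom,
@                                  int weightd, int weights, int offset)
@ Per pixel this computes, with the same rounding as the C reference:
@   dst[x] = clip_uint8((dst[x]*weightd + src[x]*weights +
@                        (((offset + 1) | 1) << log2_denom))
@                       >> (log2_denom + 1))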

        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4                 @ d0 = |weightd|
        vdup.8          d1,  r5                 @ d1 = |weights|
        vmov            q2,  q8                 @ accumulators start at the rounding term
        vmov            q3,  q8
1:      subs            ip,  ip,  #2            @ two rows per iteration
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9            @ >> (log2_denom + 1)
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ narrow with unsigned saturation
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

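@ 4-pixel-wide variant: four rows per iteration; when fewer than four
@ rows remain (the 4x2 case), "blt 2f" finishes with the two-row tail
@ at label 2.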
        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

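@ Shared per-width worker.  Loads weightd/weights/offset from the
@ stack, folds the rounding term (((offset + 1) | 1) << log2_denom)
@ into q8 and puts ~log2_denom = -(log2_denom + 1) into q9, so a
@ vshl.s16 by q9 is the final arithmetic shift right.  Since the
@ pixels are unsigned there is no mixed u8 x s8 widening multiply-
@ accumulate; instead the branches at 10-40 pick vmlal/vmlsl per the
@ signs of the two weights, which are replaced by their magnitudes.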
        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}            @ r4 = weightd, r5 = weights, r6 = offset
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30  @ lr encodes the two weight signs
        orr             r6,  r6,  #1            @ r6 = (offset + 1) | 1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9                 @ q9 = -(log2_denom + 1)
        vdup.16         q8,  r6                 @ q8 = rounding term
        mov             r6,  r0
        beq             10f                     @ both weights non-negative
        subs            lr,  lr,  #1
        beq             20f                     @ weightd < 0
        subs            lr,  lr,  #1
        beq             30f                     @ both weights negative
        b               40f                     @ weights < 0
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
endfunc
        .endm

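@ Entry stubs: set ip to the block height, then branch to the shared
@ width worker.  The b=0 variant omits the branch and therefore must
@ be instantiated immediately before its biweight_func so that it
@ falls through into it.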
        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction
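@ ff_weight_h264_pixels_WxH_neon(uint8_t *block, int stride,
@                                int log2_denom, int weight, int offset)
@ Per pixel this computes, matching the C reference (for log2_denom > 0;
@ the rounding term disappears when it is 0):
@   block[x] = clip_uint8((block[x]*weight + (offset << log2_denom) +
@                          (1 << (log2_denom - 1))) >> log2_denom)
@ with the rounding supplied by vrshl rather than an explicit addition.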

        .macro  weight_16 add
        vdup.8          d0,  r3                 @ d0 = |weight|
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2            @ add offset (halving when log2_denom > 1)
        vrshl.s16       q2,  q2,  q9            @ rounding shift right
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

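@ 4-pixel-wide variant, with the same two-row tail handling as
@ biweight_4 above.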
        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

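@ Shared per-width worker: q8 = offset << log2_denom.  For
@ log2_denom > 1 the offset is added with a halving vhadd/vhsub and
@ the rounding shift is by log2_denom - 1 (q9 = 1 - log2_denom),
@ which keeps the 16-bit intermediates from overflowing; otherwise a
@ plain vadd/vsub and a shift by log2_denom (q9 = -log2_denom) are
@ used.  A negative weight is negated and accumulated with subtract.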
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ r4 = offset
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4                 @ q8 = offset << log2_denom
        mov             r4,  r0
        ble             20f                     @ log2_denom <= 1: plain add path
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr                 @ q9 = 1 - log2_denom
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0            @ negative weight: use its magnitude
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr                 @ q9 = -log2_denom
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
endfunc
        .endm

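@ Entry stubs, mirroring biweight_entry: ip = height; the b=0 variant
@ falls through into the weight_func that follows it.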
        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4