/* libavcodec/arm/h264dsp_neon.S — H.264 loop filter and qpel MC, NEON */
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with Libav; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22 #include "neon.S"
23
24         /* H.264 loop filter */
25
@ Common entry sequence for all four loop-filter functions.
@ Loads the packed 4-byte tc0 array (5th C argument, passed on the
@ stack as a pointer) into d24[0], and returns straight to the caller
@ when no filtering is needed: alpha (r2) == 0, beta (r3) == 0, or all
@ four tc0 bytes negative.
.macro  h264_loop_filter_start
        ldr             r12, [sp]               @ r12 = tc0 pointer (5th argument)
        tst             r2,  r2                 @ alpha == 0 ?
        ldr             r12, [r12]              @ r12 = four packed tc0 bytes
        it              ne
        tstne           r3,  r3                 @ ... or beta == 0 ?
        vmov.32         d24[0], r12             @ keep tc0 for the filter macros
        and             r12, r12, r12, lsl #16
        it              eq
        bxeq            lr                      @ bail out: alpha or beta is zero
        ands            r12, r12, r12, lsl #8   @ sign set => every tc0 byte < 0
        it              lt
        bxlt            lr                      @ bail out: nothing to filter
.endm
40
@ Core luma deblocking filter for 16 pixels along one edge.
@ In:  q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2 lines,
@      r2 = alpha, r3 = beta, d24[0] = four packed tc0 bytes.
@ Out: q4 = p1', q8 = p0', q0 = q0', q5 = q1' (callers store these).
@ Clobbers q6, q7, q11-q15.
.macro  h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16   @ q12 = tc0, each byte spread over 4 pixels
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0    @ tc0 < 0 -> filtering disabled there
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15   @ q6 = main filter-enable mask
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6    @ q4 = p1-update mask
        vand            q5,  q5,  q6    @ q5 = q1-update mask
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4    @ tc = tc0 + 1 per enabled side mask
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5    @ (masks are -1, so subtract adds 1)
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11   @ q7 = clipped p1 candidate
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11   @ q14 = clipped q1 candidate
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3    @ delta = ((q0-p0)*4 + p1 - q1 + 4) >> 3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9    @ q4 = p1' where enabled, else old p1
        vbsl            q5,  q14, q1    @ q5 = q1' where enabled, else old q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6    @ clamp delta to [-tc, tc]
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14        @ p0' = clip(p0 + delta)
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11        @ q0' = clip(q0 - delta)
        vqmovun.s16     d1,  q12
.endm
108
@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Filters a horizontal luma edge (vertical filtering direction) of
@ 16 pixels.  pix (r0) points at the q0 line; p lines are above it.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.8          {d0, d1},  [r0,:128], r1        @ q0 line
        vld1.8          {d2, d3},  [r0,:128], r1        @ q1 line
        vld1.8          {d4, d5},  [r0,:128], r1        @ q2 line
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1            @ rewind to the p2 line
        vld1.8          {d20,d21}, [r0,:128], r1        @ p2 line
        vld1.8          {d18,d19}, [r0,:128], r1        @ p1 line
        vld1.8          {d16,d17}, [r0,:128], r1        @ p0 line

        vpush           {d8-d15}                        @ q4-q7 are callee-saved

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1            @ back to the p1 line
        vst1.8          {d8, d9},  [r0,:128], r1        @ p1'
        vst1.8          {d16,d17}, [r0,:128], r1        @ p0'
        vst1.8          {d0, d1},  [r0,:128], r1        @ q0'
        vst1.8          {d10,d11}, [r0,:128]            @ q1'

        vpop            {d8-d15}
        bx              lr
endfunc
134
@ void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Filters a vertical luma edge (horizontal filtering direction):
@ loads 16 rows of 8 pixels straddling the edge, transposes so the
@ core filter works on lines, then transposes the four modified middle
@ columns (p1,p0,q0,q1) back and stores them as 32-bit lanes.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4            @ start 4 pixels left of the edge
        vld1.8          {d6},  [r0], r1
        vld1.8          {d20}, [r0], r1
        vld1.8          {d18}, [r0], r1
        vld1.8          {d16}, [r0], r1
        vld1.8          {d0},  [r0], r1
        vld1.8          {d2},  [r0], r1
        vld1.8          {d4},  [r0], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d7},  [r0], r1
        vld1.8          {d21}, [r0], r1
        vld1.8          {d19}, [r0], r1
        vld1.8          {d17}, [r0], r1
        vld1.8          {d1},  [r0], r1
        vld1.8          {d3},  [r0], r1
        vld1.8          {d5},  [r0], r1
        vld1.8          {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        vpush           {d8-d15}                @ q4-q7 are callee-saved

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5          @ only p1',p0',q0',q1' change

        sub             r0,  r0,  r1, lsl #4    @ rewind 16 rows
        add             r0,  r0,  #2            @ point at the p1 column
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        vpop            {d8-d15}
        bx              lr
endfunc
186
@ Core chroma deblocking filter for 8 pixels along one edge.
@ In:  d18 = p1, d16 = p0, d0 = q0, d2 = q1,
@      r2 = alpha, r3 = beta, d24[0] = four packed tc0 bytes
@      (each tc0 byte is spread over two pixels by the vsli below).
@ Out: d16 = p0', d0 = q0'.  Clobbers q2, q11-q15.
.macro  h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8    @ replicate each tc0 byte pairwise
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vrshrn.i16      d4,  q2,  #3    @ delta = ((q0-p0)*4 + p1 - q1 + 4) >> 3
        vclt.u8         d28, d28, d22   @ < beta
        vclt.u8         d30, d30, d22   @ < beta
        vmin.s8         d4,  d4,  d24   @ clamp delta to [-tc, tc]
        vneg.s8         d25, d24
        vand            d26, d26, d28
        vmax.s8         d4,  d4,  d25
        vand            d26, d26, d30   @ d26 = filter-enable mask
        vmovl.u8        q11, d0
        vand            d4,  d4,  d26   @ zero delta where filtering is off
        vmovl.u8        q14, d16
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14        @ p0' = clip(p0 + delta)
        vqmovun.s16     d0,  q11        @ q0' = clip(q0 - delta)
.endm
217
@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Filters a horizontal chroma edge of 8 pixels; pix points at q0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1    @ rewind to the p1 line
        vld1.8          {d18}, [r0,:64], r1     @ p1
        vld1.8          {d16}, [r0,:64], r1     @ p0
        vld1.8          {d0},  [r0,:64], r1     @ q0
        vld1.8          {d2},  [r0,:64]         @ q1

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1    @ back to the p0 line
        vst1.8          {d16}, [r0,:64], r1     @ p0'
        vst1.8          {d0},  [r0,:64], r1     @ q0'

        bx              lr
endfunc
235
@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Filters a vertical chroma edge: gathers 8 rows of 4 pixels around the
@ edge, transposes 4x8 -> 8x4, filters, transposes back and stores.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2            @ start 2 pixels left of the edge
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        @ in-register transpose: columns -> p1/p0/q0/q1 lines
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        @ transpose back to row order for the stores
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3    @ rewind 8 rows
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
endfunc
273
274         /* H.264 qpel MC */
275
@ Load the 6-tap qpel filter constants into d6 via scratch register \r:
@ d6[0] = 5, d6[1] = 20 (16-bit lanes), consumed by the lowpass macros
@ through vmla.i16/vmls.i16 with d6[1]/d6[0].
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20        @ \r = (20 << 16) | 5
        vmov.32         d6[0], \r
.endm
281
@ Horizontal 6-tap lowpass (1,-5,20,20,-5,1) over two 8-pixel runs.
@ \r0:\r1 and \r2:\r3 each hold 13+ consecutive source bytes.
@ narrow=1: rounded, clipped 8-bit results in d-regs \d0 and \d1.
@ narrow=0: raw 16-bit sums left in q-regs \d0 and \d1 for a later
@           vertical pass.  Clobbers q1, q2, q9, q10, d30, d31
@           (and q0, q8 when narrow=1).
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2       @ pixels at offset +2
        vext.8          d3,  \r0, \r1, #3       @ pixels at offset +3
        vaddl.u8        q1,  d2,  d3            @ center taps (x20)
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            @ inner taps (x-5)
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           @ outer taps (x1)
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * center
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * inner
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5            @ (sum + 16) >> 5, clipped to u8
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm
317
@ Same 6-tap horizontal lowpass as lowpass_8, but for a single
@ 8-pixel run in \r0:\r1.  narrow=1 yields clipped bytes in \d0;
@ narrow=0 leaves the raw 16-bit sums in q-reg \d0.
@ Clobbers q1, q2, d30 (and q0 when narrow=1).
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            @ center taps
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            @ inner taps
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           @ outer taps
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * center
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * inner
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm
339
@ 6-tap lowpass on 16-bit intermediate data (second pass of the 2-D
@ hv filter).  \r0:\r1 hold 13+ consecutive 16-bit values; \l0,\h0 and
@ \l1,\h1 name their d-register halves for the widening adds.
@ The x20 and x5 tap weights are built with shifts (20x = 16x + 4x,
@ 5x = 4x + x); the final rounded >>10 undoes both filter passes'
@ scaling.  Result: 8 clipped bytes in \d.
@ Clobbers q0-q3, q8-q10, q15, and overwrites \r1.
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            @ center taps, low half
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            @ center taps, high half
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            @ inner taps, low half
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7            @ inner taps, high half
        vaddl.s16       q0,  \h0, \h1           @ outer taps, high half
        vaddl.s16       q8,  \l0, \l1           @ outer taps, low half

        @ low half: q9 = 20*center, q10 = 5*inner
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        @ high half: q1 = 20*center, q2 = 5*inner
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8            @ + outer
        vsub.i32        q9,  q9,  q10           @ - 5*inner

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10           @ round and narrow to 16 bit
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9                 @ clip to unsigned 8 bit
.endm
376
@ 16x16 horizontal lowpass writing packed 8-byte-stride rows (left
@ 8x16 half then right 8x16 half) — used to fill a scratch buffer for
@ the combined MC positions.  In: r1 = src, r2 = src stride, dst setup
@ as expected by put_h264_qpel8_h_lowpass_neon; r4 scratches lr.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16                @ 16 rows for the first half
        mov             r3,  #8                 @ packed destination stride
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    @ back to the top, ...
        add             r1,  r1,  #8            @ ... right 8-pixel half
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon   @ tail call, 2nd half
endfunc
388
@ Horizontal qpel lowpass, \type = put or avg.
@ qpel8:  r0 = dst (stride r3), r1 = src - 2 (stride r2), r12 = row
@         count; "avg" additionally rounds-averages with existing dst.
@ qpel16: does the left 8x16 half, rewinds the pointers to the
@         top-right, then falls through into qpel8 for the right half.
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4    @ rewind dst 16 rows
        sub             r1,  r1,  r2, lsl #4    @ rewind src 16 rows
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
endfunc                                         @ falls through to qpel8 below

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2            @ two rows per iteration
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2            @ average with existing dst
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
423
@ Horizontal qpel lowpass averaged with a second ("l2") source — the
@ half-pel blend used for the quarter-pel positions.  \type = put/avg.
@ qpel8:  r0 = dst, r1 = src - 2, r3 = l2 src, r2 = common stride,
@         r12 = row count.  qpel16 handles the left half then falls
@         through into qpel8 for the right half.
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4    @ rewind all three pointers
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8            @ move to the right half
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
endfunc                                         @ falls through to qpel8 below

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2     @ l2 rows
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2            @ two rows per iteration
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           @ blend with the l2 source
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2            @ average with existing dst
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
463
@ 16x16 vertical lowpass writing packed rows (dst stride r2 = 8),
@ covering the four 8x8 quadrants via four qpel8 calls.
@ In: r1 = src, r3 = src stride; r4 scratches lr.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8                 @ packed destination stride
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ down to the lower-left block
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    @ back to the top, ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ... right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon   @ tail call, 4th block
endfunc
478
@ Vertical qpel lowpass, \type = put or avg.
@ qpel8:  r0 = dst (stride r2), r1 = src - 2*stride (stride r3).
@         Loads 13 rows, transposes so columns become rows, reuses the
@         horizontal lowpass_8, transposes back, stores 8 rows.
@         Uses d8-d15 (VFP callee-saved) — public entry points
@         vpush/vpop those around these calls (see the mc wrappers).
@ qpel16: four 8x8 quadrants; the last one runs via fall-through.
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4    @ dst to the top-right block
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4    @ src back to the top, ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ... right half
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc                                         @ falls through: 4th quadrant

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3         @ 13 source rows
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10    @ filter along columns
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2     @ average with existing dst
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
552
@ Vertical qpel lowpass blended with a second ("l2") source,
@ \type = put or avg.
@ qpel8:  r0 = dst (stride r3), r1 = src - 2*stride (stride r3),
@         r12 = l2 src (stride r2).  Same transpose/filter/transpose
@         scheme as the plain vertical version, then a rounding
@         average with the l2 rows (and with dst for "avg").
@         Uses d8-d15 — public entry points vpush/vpop around this.
@ qpel16: four 8x8 quadrants; the last one runs via fall-through.
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4    @ dst to the top-right block
        sub             r12, r12, r2, lsl #4    @ l2 src likewise
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4    @ src back to the top, ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ... right half
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc                                         @ falls through: 4th quadrant

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3         @ 13 source rows
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9     @ filter along columns
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.8          {d0},  [r12], r2        @ l2 source rows
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4            @ blend with the l2 source
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3     @ average with existing dst
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3   @ rewind dst 8 rows
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
641
@ Shared first stage of the 8x8 2-D (horizontal+vertical) lowpass.
@ Horizontally filters 13 source rows into the 16-bit scratch buffer
@ at r4 (no narrowing), then transposes the 16-bit data in-register /
@ via the buffer and applies the vertical 6-tap pass (lowpass_8.16).
@ In:  r1 = src (pre-offset by the caller), r3 = src stride,
@      r4 = 16-byte-aligned scratch buffer.
@ Out: the 8x8 result bytes in d12-d15 then d8-d11 (row order used by
@      the callers' stores).  Clobbers q0-q15, r12; r4 is advanced.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12                @ 12 rows in the loop, 13th after
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!   @ two rows of 16-bit sums
        bne             1b

        vld1.8          {d0, d1},  [r1]         @ 13th row
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             r12, #-16
        add             r4,  r4,  r12
        vld1.8          {d30,d31}, [r4,:128], r12      @ reload rows, bottom-up
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        @ transpose the 16-bit data so the vertical pass runs along rows
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.8          {d30,d31}, [r4,:128]!   @ spill transposed halves back
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.8          {d16,d17}, [r4,:128], r12      @ remaining rows from
        vld1.8          {d30,d31}, [r4,:128], r12      @ the scratch buffer
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc
707
@ 8x8 2-D (hv) lowpass, \type = put or avg: runs the shared top half,
@ then stores the result rows (d12-d15, d8-d11) to r0 with stride r2,
@ averaging with the existing destination for "avg".  r10 saves lr
@ across the nested bl.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2 @ average with existing dst
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
748
@ 8x8 2-D (hv) lowpass blended with a second ("l2") source,
@ \type = put or avg: runs the shared top half, rounding-averages the
@ result with the 16-byte-aligned l2 buffer at r2, then stores to r0
@ with stride r3 (averaging with dst too for "avg").  r10 saves lr.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.8          {d0, d1},  [r2,:128]!   @ l2 rows, two at a time
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6            @ blend with the filter result
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3 @ average with existing dst
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3   @ rewind dst 8 rows
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
797
798 .macro  h264_qpel16_hv  type
799 function \type\()_h264_qpel16_hv_lowpass_neon
800         mov             r9,  lr
801         bl              \type\()_h264_qpel8_hv_lowpass_neon
802         sub             r1,  r1,  r3, lsl #2
803         bl              \type\()_h264_qpel8_hv_lowpass_neon
804         sub             r1,  r1,  r3, lsl #4
805         sub             r1,  r1,  r3, lsl #2
806         add             r1,  r1,  #8
807         sub             r0,  r0,  r2, lsl #4
808         add             r0,  r0,  #8
809         bl              \type\()_h264_qpel8_hv_lowpass_neon
810         sub             r1,  r1,  r3, lsl #2
811         mov             lr,  r9
812         b               \type\()_h264_qpel8_hv_lowpass_neon
813 endfunc
814
@ As above, but each 8x8 tile is additionally averaged with a second,
@ pre-filtered source ("l2" variant).  r4 appears to point just past a
@ 16x16 (256-byte) temp block: r2 = r4 - 256 rebases it for the 8x8
@ helper (cf. the mc21/mc12 callers which set r4 from the packed h/v
@ lowpass output) — note r0 is stepped by r3 here since dst and src
@ share the same stride in this path.
815 function \type\()_h264_qpel16_hv_lowpass_l2_neon
816         mov             r9,  lr                 @ preserve return address
817         sub             r2,  r4,  #256          @ r2 = start of the 16x16 temp
818         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
819         sub             r1,  r1,  r3, lsl #2    @ src: undo 4-row over-advance
820         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
821         sub             r1,  r1,  r3, lsl #4    @ src back to top (16+4 rows)
822         sub             r1,  r1,  r3, lsl #2
823         add             r1,  r1,  #8            @ right half
824         sub             r0,  r0,  r3, lsl #4    @ dst back to top
825         add             r0,  r0,  #8
826         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
827         sub             r1,  r1,  r3, lsl #2
828         mov             lr,  r9                 @ restore lr; tail-call last tile
829         b               \type\()_h264_qpel8_hv_lowpass_l2_neon
830 endfunc
831 .endm
832
@ Instantiate the 16x16 H+V lowpass helpers for "put" and "avg".
833         h264_qpel16_hv put
834         h264_qpel16_hv avg
835
@ Quarter-pel luma motion compensation, 8x8 blocks.  \type = put|avg.
@ Entry (standard qpel MC signature — confirm against the C prototype):
@ r0 = dst, r1 = src, r2 = stride (r1/r0 are stepped by r2 below).
@ "mcXY" names the quarter-pel position: X/4 right, Y/4 down.
836 .macro  h264_qpel8      type
@ (1/4, 0): horizontal 6-tap half-pel filter averaged with the source.
837 function ff_\type\()_h264_qpel8_mc10_neon, export=1
838         lowpass_const   r3              @ load filter constants (r3 scratch)
839         mov             r3,  r1         @ r3 = second source for the average
840         sub             r1,  r1,  #2    @ 2-pixel lead-in for the 6-tap filter
841         mov             r12, #8         @ height
842         b               \type\()_h264_qpel8_h_lowpass_l2_neon
843 endfunc
844
@ (2/4, 0): pure horizontal half-pel filter.
845 function ff_\type\()_h264_qpel8_mc20_neon, export=1
846         lowpass_const   r3
847         sub             r1,  r1,  #2
848         mov             r3,  r2         @ dst stride = src stride
849         mov             r12, #8         @ height
850         b               \type\()_h264_qpel8_h_lowpass_neon
851 endfunc
852
@ (3/4, 0): half-pel filter averaged with the pixel one to the right.
853 function ff_\type\()_h264_qpel8_mc30_neon, export=1
854         lowpass_const   r3
855         add             r3,  r1,  #1    @ average against src+1
856         sub             r1,  r1,  #2
857         mov             r12, #8
858         b               \type\()_h264_qpel8_h_lowpass_l2_neon
859 endfunc
860
@ (0, 1/4): vertical half-pel filter averaged with the source row.
@ r12 = second source for the average (src here; mc03 enters the shared
@ label below with r12 = src + stride instead).
861 function ff_\type\()_h264_qpel8_mc01_neon, export=1
862         push            {lr}
863         mov             r12, r1
864 \type\()_h264_qpel8_mc01:                       @ shared tail, entered by mc03
865         lowpass_const   r3
866         mov             r3,  r2                 @ dst stride = src stride
867         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in for the 6-tap filter
868         vpush           {d8-d15}                @ d8-d15 are callee-saved (AAPCS)
869         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
870         vpop            {d8-d15}
871         pop             {pc}
872 endfunc
873
@ (1/4, 1/4): average of the horizontally-filtered and the vertically-
@ filtered block.  H-filters into a 64-byte stack temp first, then runs
@ the vertical filter averaged against that temp.  The pushed r0/r1 are
@ reloaded below; mc31/mc13/mc33 re-enter at the shared label with the
@ saved r1 adjusted so the two filters sample different positions.
@ "A"/"T" lines assemble only in ARM resp. Thumb mode (macros from asm.S).
874 function ff_\type\()_h264_qpel8_mc11_neon, export=1
875         push            {r0, r1, r11, lr}
876 \type\()_h264_qpel8_mc11:
877         lowpass_const   r3
878         mov             r11, sp                 @ r11 = frame anchor (old sp)
879 A       bic             sp,  sp,  #15           @ 16-byte-align the stack
880 T       bic             r0,  r11, #15
881 T       mov             sp,  r0
882         sub             sp,  sp,  #64           @ 8x8 byte temp for the h filter
883         mov             r0,  sp
884         sub             r1,  r1,  #2            @ 6-tap lead-in
885         mov             r3,  #8                 @ temp row stride
886         mov             r12, #8                 @ height
887         vpush           {d8-d15}
888         bl              put_h264_qpel8_h_lowpass_neon
889         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
890         mov             r3,  r2                 @ dst stride
891         add             r12, sp,  #64           @ temp (skip the 64B vpush area)
892         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in for the v filter
893         mov             r2,  #8                 @ temp stride for the average
894         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
895         vpop            {d8-d15}
896         mov             sp,  r11                @ drop the temp frame
897         pop             {r11, pc}
898 endfunc
899
@ (2/4, 1/4): average of the horizontal half-pel filter and the full
@ H+V (centre) filter.  H-filters into a stack temp, then the hv_lowpass
@ _l2 helper averages its own output with that temp.  Shared label is
@ re-entered by mc23 with src advanced one row.
900 function ff_\type\()_h264_qpel8_mc21_neon, export=1
901         push            {r0, r1, r4, r10, r11, lr}
902 \type\()_h264_qpel8_mc21:
903         lowpass_const   r3
904         mov             r11, sp                 @ frame anchor
905 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
906 T       bic             r0,  r11, #15
907 T       mov             sp,  r0
908         sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temp + hv scratch rows
909         sub             r1,  r1,  #2            @ 6-tap lead-in
910         mov             r3,  #8                 @ temp row stride
911         mov             r0,  sp
912         mov             r12, #8                 @ height
913         vpush           {d8-d15}
914         bl              put_h264_qpel8_h_lowpass_neon
915         mov             r4,  r0                 @ r4 = end of h-filtered temp
916         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
917         sub             r1,  r1,  r2, lsl #1    @ 2-row + ...
918         sub             r1,  r1,  #2            @ ... 2-pixel lead-in (H and V taps)
919         mov             r3,  r2                 @ src stride
920         sub             r2,  r4,  #64           @ r2 = start of the 8x8 temp
921         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
922         vpop            {d8-d15}
923         mov             sp,  r11
924         pop             {r4, r10, r11, pc}
925 endfunc
926
@ (3/4, 1/4): same as mc11, but the +1/-1 around the push makes the SAVED
@ r1 point one pixel right, so the vertical filter (which reloads r1 from
@ the stack) samples src+1 while the horizontal filter runs on src.
927 function ff_\type\()_h264_qpel8_mc31_neon, export=1
928         add             r1,  r1,  #1
929         push            {r0, r1, r11, lr}       @ saved r1 = src + 1
930         sub             r1,  r1,  #1            @ current r1 = src again
931         b               \type\()_h264_qpel8_mc11
932 endfunc
933
@ (0, 2/4): pure vertical half-pel filter.
934 function ff_\type\()_h264_qpel8_mc02_neon, export=1
935         push            {lr}
936         lowpass_const   r3
937         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in for the 6-tap filter
938         mov             r3,  r2                 @ dst stride = src stride
939         vpush           {d8-d15}                @ callee-saved VFP/NEON regs
940         bl              \type\()_h264_qpel8_v_lowpass_neon
941         vpop            {d8-d15}
942         pop             {pc}
943 endfunc
944
@ (1/4, 2/4): average of the vertical half-pel filter and the centre H+V
@ filter.  Mirrors mc21 but the stack temp is filled by the VERTICAL
@ lowpass.  Shared label re-entered by mc32 with src advanced one pixel.
945 function ff_\type\()_h264_qpel8_mc12_neon, export=1
946         push            {r0, r1, r4, r10, r11, lr}
947 \type\()_h264_qpel8_mc12:
948         lowpass_const   r3
949         mov             r11, sp                 @ frame anchor
950 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
951 T       bic             r0,  r11, #15
952 T       mov             sp,  r0
953         sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temp + hv scratch rows
954         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
955         mov             r3,  r2                 @ src stride
956         mov             r2,  #8                 @ temp row stride
957         mov             r0,  sp
958         vpush           {d8-d15}
959         bl              put_h264_qpel8_v_lowpass_neon
960         mov             r4,  r0                 @ r4 = end of v-filtered temp
961         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
962         sub             r1,  r1,  r3, lsl #1    @ 2-row + ...
963         sub             r1,  r1,  #2            @ ... 2-pixel lead-in
964         sub             r2,  r4,  #64           @ r2 = start of the 8x8 temp
965         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
966         vpop            {d8-d15}
967         mov             sp,  r11
968         pop             {r4, r10, r11, pc}
969 endfunc
970
@ (2/4, 2/4): pure centre filter — 6-tap in both directions, no average.
971 function ff_\type\()_h264_qpel8_mc22_neon, export=1
972         push            {r4, r10, r11, lr}
973         mov             r11, sp                 @ frame anchor
974 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
975 T       bic             r4,  r11, #15
976 T       mov             sp,  r4
977         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
978         sub             r1,  r1,  #2            @ 2-pixel lead-in
979         mov             r3,  r2                 @ src stride
980         sub             sp,  sp,  #(16*12)      @ intermediate 16-bit row scratch
981         mov             r4,  sp                 @ r4 = scratch for hv_lowpass
982         vpush           {d8-d15}
983         bl              \type\()_h264_qpel8_hv_lowpass_neon
984         vpop            {d8-d15}
985         mov             sp,  r11
986         pop             {r4, r10, r11, pc}
987 endfunc
988
@ Remaining quarter-pel positions, all thin wrappers that adjust src
@ (and/or the stack-saved copy of it) before jumping to a shared label.
@ (3/4, 2/4): mc12 with the current src shifted one pixel right.
989 function ff_\type\()_h264_qpel8_mc32_neon, export=1
990         push            {r0, r1, r4, r10, r11, lr}  @ saved r1 = src
991         add             r1,  r1,  #1                @ filter temp from src + 1
992         b               \type\()_h264_qpel8_mc12
993 endfunc
994
@ (0, 3/4): mc01 averaging against the row BELOW instead (r12 = src + stride).
995 function ff_\type\()_h264_qpel8_mc03_neon, export=1
996         push            {lr}
997         add             r12, r1,  r2
998         b               \type\()_h264_qpel8_mc01
999 endfunc
1000
@ (1/4, 3/4): mc11 with the horizontal filter run one row down.
1001 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1002         push            {r0, r1, r11, lr}           @ saved r1 = src (v filter)
1003         add             r1,  r1,  r2                @ h filter on src + stride
1004         b               \type\()_h264_qpel8_mc11
1005 endfunc
1006
@ (2/4, 3/4): mc21 with src advanced one row.
1007 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1008         push            {r0, r1, r4, r10, r11, lr}
1009         add             r1,  r1,  r2
1010         b               \type\()_h264_qpel8_mc21
1011 endfunc
1012
@ (3/4, 3/4): mc11 with the saved r1 = src+1 (v filter on the right
@ column) and the current r1 = src + stride (h filter one row down).
1013 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1014         add             r1,  r1,  #1
1015         push            {r0, r1, r11, lr}           @ saved r1 = src + 1
1016         add             r1,  r1,  r2
1017         sub             r1,  r1,  #1                @ current r1 = src + stride
1018         b               \type\()_h264_qpel8_mc11
1019 endfunc
1020 .endm
1021
@ Emit the full set of 8x8 quarter-pel entry points for put and avg.
1022         h264_qpel8 put
1023         h264_qpel8 avg
1024
@ Quarter-pel luma MC, 16x16 blocks — same structure as the 8x8 macro
@ above (r0 = dst, r1 = src, r2 = stride); the 16-wide lowpass helpers
@ carry the height internally, so no r12 setup here.
1025 .macro  h264_qpel16     type
@ (1/4, 0): horizontal half-pel filter averaged with the source.
1026 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1027         lowpass_const   r3              @ load filter constants (r3 scratch)
1028         mov             r3,  r1         @ second source for the average
1029         sub             r1,  r1,  #2    @ 6-tap lead-in
1030         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1031 endfunc
1032
@ (2/4, 0): pure horizontal half-pel filter.
1033 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1034         lowpass_const   r3
1035         sub             r1,  r1,  #2
1036         mov             r3,  r2         @ dst stride = src stride
1037         b               \type\()_h264_qpel16_h_lowpass_neon
1038 endfunc
1039
@ (3/4, 0): half-pel filter averaged with src + 1.
1040 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1041         lowpass_const   r3
1042         add             r3,  r1,  #1
1043         sub             r1,  r1,  #2
1044         b               \type\()_h264_qpel16_h_lowpass_l2_neon
1045 endfunc
1046
@ (0, 1/4): vertical half-pel filter averaged with the source row.
@ r12 = averaging source (src; mc03 enters the shared label with
@ r12 = src + stride).
1047 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1048         push            {r4, lr}
1049         mov             r12, r1
1050 \type\()_h264_qpel16_mc01:                      @ shared tail, entered by mc03
1051         lowpass_const   r3
1052         mov             r3,  r2                 @ dst stride = src stride
1053         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
1054         vpush           {d8-d15}                @ callee-saved VFP/NEON regs
1055         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1056         vpop            {d8-d15}
1057         pop             {r4, pc}
1058 endfunc
1059
@ (1/4, 1/4): h-filter into a 256-byte (16x16) stack temp, then the
@ vertical filter averaged against that temp.  Shared label re-entered
@ by mc31/mc13/mc33 with the saved r1 adjusted (see those wrappers).
@ "A"/"T" lines: ARM resp. Thumb variants (macros from asm.S).
1060 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1061         push            {r0, r1, r4, r11, lr}
1062 \type\()_h264_qpel16_mc11:
1063         lowpass_const   r3
1064         mov             r11, sp                 @ frame anchor (old sp)
1065 A       bic             sp,  sp,  #15           @ 16-byte-align the stack
1066 T       bic             r0,  r11, #15
1067 T       mov             sp,  r0
1068         sub             sp,  sp,  #256          @ 16x16 byte temp
1069         mov             r0,  sp
1070         sub             r1,  r1,  #2            @ 6-tap lead-in
1071         mov             r3,  #16                @ temp row stride
1072         vpush           {d8-d15}
1073         bl              put_h264_qpel16_h_lowpass_neon
1074         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
1075         mov             r3,  r2                 @ dst stride
1076         add             r12, sp,  #64           @ temp start (skip 64B vpush area)
1077         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in for the v filter
1078         mov             r2,  #16                @ temp stride for the average
1079         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1080         vpop            {d8-d15}
1081         mov             sp,  r11                @ drop the temp frame
1082         pop             {r4, r11, pc}
1083 endfunc
1084
@ (2/4, 1/4): packed h-filter into a stack temp, then the centre H+V
@ filter averaged against it.  r4 = packed-lowpass output pointer, which
@ hv_lowpass_l2 rebases (r4 - 256) to find the temp.  Shared label
@ re-entered by mc23 with src advanced one row.
1085 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1086         push            {r0, r1, r4-r5, r9-r11, lr}
1087 \type\()_h264_qpel16_mc21:
1088         lowpass_const   r3
1089         mov             r11, sp                 @ frame anchor
1090 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
1091 T       bic             r0,  r11, #15
1092 T       mov             sp,  r0
1093         sub             sp,  sp,  #(16*16+16*12) @ 16x16 temp + hv scratch
1094         sub             r1,  r1,  #2            @ 6-tap lead-in
1095         mov             r0,  sp
1096         vpush           {d8-d15}
1097         bl              put_h264_qpel16_h_lowpass_neon_packed
1098         mov             r4,  r0                 @ r4 = past the packed temp
1099         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
1100         sub             r1,  r1,  r2, lsl #1    @ 2-row + ...
1101         sub             r1,  r1,  #2            @ ... 2-pixel lead-in
1102         mov             r3,  r2                 @ src stride
1103         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1104         vpop            {d8-d15}
1105         mov             sp,  r11
1106         pop             {r4-r5, r9-r11, pc}
1107 endfunc
1108
@ (3/4, 1/4): mc11 with the SAVED r1 = src + 1, so the vertical filter
@ (which reloads r1 from the stack) samples one pixel to the right.
1109 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1110         add             r1,  r1,  #1
1111         push            {r0, r1, r4, r11, lr}   @ saved r1 = src + 1
1112         sub             r1,  r1,  #1            @ current r1 = src again
1113         b               \type\()_h264_qpel16_mc11
1114 endfunc
1115
@ (0, 2/4): pure vertical half-pel filter.
1116 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1117         push            {r4, lr}
1118         lowpass_const   r3
1119         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
1120         mov             r3,  r2                 @ dst stride = src stride
1121         vpush           {d8-d15}                @ callee-saved VFP/NEON regs
1122         bl              \type\()_h264_qpel16_v_lowpass_neon
1123         vpop            {d8-d15}
1124         pop             {r4, pc}
1125 endfunc
1126
@ (1/4, 2/4): packed VERTICAL filter into a stack temp, then the centre
@ H+V filter averaged against it (mirror of mc21).  Shared label
@ re-entered by mc32 with src advanced one pixel.
1127 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1128         push            {r0, r1, r4-r5, r9-r11, lr}
1129 \type\()_h264_qpel16_mc12:
1130         lowpass_const   r3
1131         mov             r11, sp                 @ frame anchor
1132 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
1133 T       bic             r0,  r11, #15
1134 T       mov             sp,  r0
1135         sub             sp,  sp,  #(16*16+16*12) @ 16x16 temp + hv scratch
1136         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
1137         mov             r0,  sp
1138         mov             r3,  r2                 @ src stride
1139         vpush           {d8-d15}
1140         bl              put_h264_qpel16_v_lowpass_neon_packed
1141         mov             r4,  r0                 @ r4 = past the packed temp
1142         ldrd            r0,  r1,  [r11], #8     @ reload saved dst/src
1143         sub             r1,  r1,  r3, lsl #1    @ 2-row + ...
1144         sub             r1,  r1,  #2            @ ... 2-pixel lead-in
1145         mov             r2,  r3                 @ dst stride = src stride
1146         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1147         vpop            {d8-d15}
1148         mov             sp,  r11
1149         pop             {r4-r5, r9-r11, pc}
1150 endfunc
1151
@ (2/4, 2/4): pure centre filter, 6-tap both directions.
1152 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1153         push            {r4, r9-r11, lr}
1154         lowpass_const   r3
1155         mov             r11, sp                 @ frame anchor
1156 A       bic             sp,  sp,  #15           @ align stack (ARM / Thumb forms)
1157 T       bic             r4,  r11, #15
1158 T       mov             sp,  r4
1159         sub             r1,  r1,  r2, lsl #1    @ 2-row lead-in
1160         sub             r1,  r1,  #2            @ 2-pixel lead-in
1161         mov             r3,  r2                 @ src stride
1162         sub             sp,  sp,  #(16*12)      @ intermediate 16-bit row scratch
1163         mov             r4,  sp                 @ r4 = scratch for hv_lowpass
1164         vpush           {d8-d15}
1165         bl              \type\()_h264_qpel16_hv_lowpass_neon
1166         vpop            {d8-d15}
1167         mov             sp,  r11
1168         pop             {r4, r9-r11, pc}
1169 endfunc
1170
@ Remaining 16x16 quarter-pel positions — thin wrappers adjusting src
@ (and/or its stack-saved copy) before jumping to a shared label.
@ (3/4, 2/4): mc12 with the current src shifted one pixel right.
1171 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1172         push            {r0, r1, r4-r5, r9-r11, lr} @ saved r1 = src
1173         add             r1,  r1,  #1                @ filter temp from src + 1
1174         b               \type\()_h264_qpel16_mc12
1175 endfunc
1176
@ (0, 3/4): mc01 averaging against the row below (r12 = src + stride).
1177 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1178         push            {r4, lr}
1179         add             r12, r1,  r2
1180         b               \type\()_h264_qpel16_mc01
1181 endfunc
1182
@ (1/4, 3/4): mc11 with the horizontal filter run one row down.
1183 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1184         push            {r0, r1, r4, r11, lr}       @ saved r1 = src (v filter)
1185         add             r1,  r1,  r2                @ h filter on src + stride
1186         b               \type\()_h264_qpel16_mc11
1187 endfunc
1188
@ (2/4, 3/4): mc21 with src advanced one row.
1189 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1190         push            {r0, r1, r4-r5, r9-r11, lr}
1191         add             r1,  r1,  r2
1192         b               \type\()_h264_qpel16_mc21
1193 endfunc
1194
@ (3/4, 3/4): saved r1 = src+1 (v filter on the right column), current
@ r1 = src + stride (h filter one row down).
1195 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1196         add             r1,  r1,  #1
1197         push            {r0, r1, r4, r11, lr}       @ saved r1 = src + 1
1198         add             r1,  r1,  r2
1199         sub             r1,  r1,  #1                @ current r1 = src + stride
1200         b               \type\()_h264_qpel16_mc11
1201 endfunc
1202 .endm
1203
@ Emit the full set of 16x16 quarter-pel entry points for put and avg.
1204         h264_qpel16 put
1205         h264_qpel16 avg
1206
1207 @ Biweighted prediction
1208
@ Bi-weighted prediction, 16-pixel rows, two rows per iteration.
@ In (set up by biweight_func): r0 = dst/src0, r1 = src1, r2 = stride,
@ r3 = height (even), r4/r5 = |weight0|/|weight1|, r6 = dst write ptr,
@ q8 = rounding bias ((offset+1)|1) << log2_denom,
@ q9 = -(log2_denom+1) so vshl.s16 performs the arithmetic right shift.
@ \macd/\macs are vmlal.u8/vmlsl.u8 chosen by the weights' signs, so the
@ unsigned-byte multiplies accumulate with the correct sign.
1209 .macro  biweight_16     macs, macd
1210         vdup.8          d0,  r4         @ d0 = |weight0|
1211         vdup.8          d1,  r5         @ d1 = |weight1|
1212         vmov            q2,  q8         @ accumulators start at the bias
1213         vmov            q3,  q8
1214 1:      subs            r3,  r3,  #2    @ two rows per pass
1215         vld1.8          {d20-d21},[r0,:128], r2
1216         \macd           q2,  d0,  d20   @ += / -= weight0 * src0
1217         pld             [r0]
1218         \macd           q3,  d0,  d21
1219         vld1.8          {d22-d23},[r1,:128], r2
1220         \macs           q2,  d1,  d22   @ += / -= weight1 * src1
1221         pld             [r1]
1222         \macs           q3,  d1,  d23
1223         vmov            q12, q8         @ second row's accumulators
1224         vld1.8          {d28-d29},[r0,:128], r2
1225         vmov            q13, q8
1226         \macd           q12, d0,  d28
1227         pld             [r0]
1228         \macd           q13, d0,  d29
1229         vld1.8          {d30-d31},[r1,:128], r2
1230         \macs           q12, d1,  d30
1231         pld             [r1]
1232         \macs           q13, d1,  d31
1233         vshl.s16        q2,  q2,  q9    @ >> (log2_denom + 1), arithmetic
1234         vshl.s16        q3,  q3,  q9
1235         vqmovun.s16     d4,  q2         @ saturate to u8
1236         vqmovun.s16     d5,  q3
1237         vshl.s16        q12, q12, q9
1238         vshl.s16        q13, q13, q9
1239         vqmovun.s16     d24, q12
1240         vqmovun.s16     d25, q13
1241         vmov            q3,  q8         @ re-seed for the next iteration
1242         vst1.8          {d4- d5}, [r6,:128], r2
1243         vmov            q2,  q8
1244         vst1.8          {d24-d25},[r6,:128], r2
1245         bne             1b
1246         pop             {r4-r6, pc}     @ return from the enclosing function
1247 .endm
1248
@ Bi-weighted prediction, 8-pixel rows, two rows per iteration.
@ Same register contract as biweight_16 above.
1249 .macro  biweight_8      macs, macd
1250         vdup.8          d0,  r4         @ d0 = |weight0|
1251         vdup.8          d1,  r5         @ d1 = |weight1|
1252         vmov            q1,  q8         @ accumulators start at the bias
1253         vmov            q10, q8
1254 1:      subs            r3,  r3,  #2    @ two rows per pass
1255         vld1.8          {d4},[r0,:64], r2
1256         \macd           q1,  d0,  d4    @ += / -= weight0 * src0
1257         pld             [r0]
1258         vld1.8          {d5},[r1,:64], r2
1259         \macs           q1,  d1,  d5    @ += / -= weight1 * src1
1260         pld             [r1]
1261         vld1.8          {d6},[r0,:64], r2
1262         \macd           q10, d0,  d6
1263         pld             [r0]
1264         vld1.8          {d7},[r1,:64], r2
1265         \macs           q10, d1,  d7
1266         pld             [r1]
1267         vshl.s16        q1,  q1,  q9    @ >> (log2_denom + 1)
1268         vqmovun.s16     d2,  q1         @ saturate to u8
1269         vshl.s16        q10, q10, q9
1270         vqmovun.s16     d4,  q10
1271         vmov            q10, q8         @ re-seed accumulators
1272         vst1.8          {d2},[r6,:64], r2
1273         vmov            q1,  q8
1274         vst1.8          {d4},[r6,:64], r2
1275         bne             1b
1276         pop             {r4-r6, pc}     @ return from the enclosing function
1277 .endm
1278
@ Bi-weighted prediction, 4-pixel rows, four rows per iteration
@ (pairs of rows packed into one d register).  When fewer than four rows
@ remain (blt 2f after the first pair), the tail at 2: flushes just the
@ first two rows.  Same register contract as biweight_16.
1280 .macro  biweight_4      macs, macd
1281         vdup.8          d0,  r4         @ d0 = |weight0|
1282         vdup.8          d1,  r5         @ d1 = |weight1|
1283         vmov            q1,  q8         @ accumulators start at the bias
1284         vmov            q10, q8
1285 1:      subs            r3,  r3,  #4    @ four rows per pass
1286         vld1.32         {d4[0]},[r0,:32], r2
1287         vld1.32         {d4[1]},[r0,:32], r2    @ two src0 rows in d4
1288         \macd           q1,  d0,  d4
1289         pld             [r0]
1290         vld1.32         {d5[0]},[r1,:32], r2
1291         vld1.32         {d5[1]},[r1,:32], r2    @ two src1 rows in d5
1292         \macs           q1,  d1,  d5
1293         pld             [r1]
1294         blt             2f              @ only two rows left: short tail
1295         vld1.32         {d6[0]},[r0,:32], r2
1296         vld1.32         {d6[1]},[r0,:32], r2
1297         \macd           q10, d0,  d6
1298         pld             [r0]
1299         vld1.32         {d7[0]},[r1,:32], r2
1300         vld1.32         {d7[1]},[r1,:32], r2
1301         \macs           q10, d1,  d7
1302         pld             [r1]
1303         vshl.s16        q1,  q1,  q9    @ >> (log2_denom + 1)
1304         vqmovun.s16     d2,  q1         @ saturate to u8
1305         vshl.s16        q10, q10, q9
1306         vqmovun.s16     d4,  q10
1307         vmov            q10, q8         @ re-seed accumulators
1308         vst1.32         {d2[0]},[r6,:32], r2
1309         vst1.32         {d2[1]},[r6,:32], r2
1310         vmov            q1,  q8
1311         vst1.32         {d4[0]},[r6,:32], r2
1312         vst1.32         {d4[1]},[r6,:32], r2
1313         bne             1b
1314         pop             {r4-r6, pc}
1315 2:      vshl.s16        q1,  q1,  q9    @ tail: flush the pending two rows
1316         vqmovun.s16     d2,  q1
1317         vst1.32         {d2[0]},[r6,:32], r2
1318         vst1.32         {d2[1]},[r6,:32], r2
1319         pop             {r4-r6, pc}
1320 .endm
1320
@ Entry point for bi-weighted prediction of \w-wide blocks.
@ Stack args past r0-r3 appear to be: [sp,#16] = log2_denom,
@ [sp,#20..#28] = weight0, weight1, offset (r4-r6) — confirm against the
@ C prototype in h264dsp.h.  Precomputes:
@   q8 = ((offset + 1) | 1) << log2_denom   (rounding bias)
@   q9 = ~log2_denom = -(log2_denom + 1)    (vshl.s16 shift count)
@ then dispatches on the weights' sign bits (lr = 2-bit selector built
@ from bit 31 of r4 and bits 31:30 of r5) to the biweight_\w expansion
@ using the matching vmlal/vmlsl pair, negating weights as needed.
@ Each expansion ends with its own pop; the cases do not fall through.
1321 .macro  biweight_func   w
1322 function ff_biweight_h264_pixels_\w\()_neon, export=1
1323         push            {r4-r6, lr}
1324         ldr             r12, [sp, #16]          @ r12 = log2_denom
1325         add             r4,  sp,  #20
1326         ldm             r4,  {r4-r6}            @ r4/r5 = weights, r6 = offset
1327         lsr             lr,  r4,  #31           @ sign bit of weight0
1328         add             r6,  r6,  #1            @ offset + 1 ...
1329         eors            lr,  lr,  r5,  lsr #30  @ mix in weight1's sign; set Z
1330         orr             r6,  r6,  #1            @ ... | 1 (forces rounding bit)
1331         vdup.16         q9,  r12
1332         lsl             r6,  r6,  r12           @ bias << log2_denom
1333         vmvn            q9,  q9                 @ q9 = -(log2_denom + 1)
1334         vdup.16         q8,  r6
1335         mov             r6,  r0                 @ r6 = dst write pointer
1336         beq             10f                     @ both weights non-negative
1337         subs            lr,  lr,  #1
1338         beq             20f
1339         subs            lr,  lr,  #1
1340         beq             30f
1341         b               40f
1342 10:     biweight_\w     vmlal.u8, vmlal.u8      @ +w0, +w1
1343 20:     rsb             r4,  r4,  #0            @ negate weight0
1344         biweight_\w     vmlal.u8, vmlsl.u8
1345 30:     rsb             r4,  r4,  #0            @ negate both weights
1346         rsb             r5,  r5,  #0
1347         biweight_\w     vmlsl.u8, vmlsl.u8
1348 40:     rsb             r5,  r5,  #0            @ negate weight1
1349         biweight_\w     vmlsl.u8, vmlal.u8
1350 endfunc
1351 .endm
1352
@ Emit ff_biweight_h264_pixels_{16,8,4}_neon.
1353         biweight_func   16
1354         biweight_func   8
1355         biweight_func   4
1356
1357 @ Weighted prediction
1358
@ Uni-weighted prediction, 16-pixel rows, two rows per iteration.
@ In (set up by weight_func): r0 = src/dst base, r1 = stride,
@ r2 = height (even), r12 = |weight|, r4 = dst write ptr,
@ q8 = offset << log2_denom (bias), q9 = negated shift for vrshl.
@ \add is vadd/vsub or vhadd/vhsub per the weight's sign and the
@ log2_denom path chosen by weight_func.
1359 .macro  weight_16       add
1360         vdup.8          d0,  r12        @ d0 = |weight|
1361 1:      subs            r2,  r2,  #2    @ two rows per pass
1362         vld1.8          {d20-d21},[r0,:128], r1
1363         vmull.u8        q2,  d0,  d20   @ widen: weight * pixel
1364         pld             [r0]
1365         vmull.u8        q3,  d0,  d21
1366         vld1.8          {d28-d29},[r0,:128], r1
1367         vmull.u8        q12, d0,  d28
1368         pld             [r0]
1369         vmull.u8        q13, d0,  d29
1370         \add            q2,  q8,  q2    @ bias +/- product
1371         vrshl.s16       q2,  q2,  q9    @ rounding shift right by log2_denom
1372         \add            q3,  q8,  q3
1373         vrshl.s16       q3,  q3,  q9
1374         vqmovun.s16     d4,  q2         @ saturate to u8
1375         vqmovun.s16     d5,  q3
1376         \add            q12, q8,  q12
1377         vrshl.s16       q12, q12, q9
1378         \add            q13, q8,  q13
1379         vrshl.s16       q13, q13, q9
1380         vqmovun.s16     d24, q12
1381         vqmovun.s16     d25, q13
1382         vst1.8          {d4- d5}, [r4,:128], r1
1383         vst1.8          {d24-d25},[r4,:128], r1
1384         bne             1b
1385         pop             {r4, pc}        @ return from the enclosing function
1386 .endm
1387
@ Uni-weighted prediction, 8-pixel rows, two rows per iteration.
@ Same register contract as weight_16 above.
1388 .macro  weight_8        add
1389         vdup.8          d0,  r12        @ d0 = |weight|
1390 1:      subs            r2,  r2,  #2    @ two rows per pass
1391         vld1.8          {d4},[r0,:64], r1
1392         vmull.u8        q1,  d0,  d4    @ widen: weight * pixel
1393         pld             [r0]
1394         vld1.8          {d6},[r0,:64], r1
1395         vmull.u8        q10, d0,  d6
1396         \add            q1,  q8,  q1    @ bias +/- product
1397         pld             [r0]
1398         vrshl.s16       q1,  q1,  q9    @ rounding shift right
1399         vqmovun.s16     d2,  q1         @ saturate to u8
1400         \add            q10, q8,  q10
1401         vrshl.s16       q10, q10, q9
1402         vqmovun.s16     d4,  q10
1403         vst1.8          {d2},[r4,:64], r1
1404         vst1.8          {d4},[r4,:64], r1
1405         bne             1b
1406         pop             {r4, pc}        @ return from the enclosing function
1407 .endm
1408
@ Uni-weighted prediction, 4-pixel rows, four rows per iteration (pairs
@ of rows packed per d register); a two-row tail is handled at 2:.
@ Same register contract as weight_16 above.
1409 .macro  weight_4        add
1410         vdup.8          d0,  r12        @ d0 = |weight|
1411         vmov            q1,  q8
1412         vmov            q10, q8
1413 1:      subs            r2,  r2,  #4    @ four rows per pass
1414         vld1.32         {d4[0]},[r0,:32], r1
1415         vld1.32         {d4[1]},[r0,:32], r1    @ two rows in d4
1416         vmull.u8        q1,  d0,  d4
1417         pld             [r0]
1418         blt             2f              @ only two rows left: short tail
1419         vld1.32         {d6[0]},[r0,:32], r1
1420         vld1.32         {d6[1]},[r0,:32], r1
1421         vmull.u8        q10, d0,  d6
1422         pld             [r0]
1423         \add            q1,  q8,  q1    @ bias +/- product
1424         vrshl.s16       q1,  q1,  q9    @ rounding shift right
1425         vqmovun.s16     d2,  q1         @ saturate to u8
1426         \add            q10, q8,  q10
1427         vrshl.s16       q10, q10, q9
1428         vqmovun.s16     d4,  q10
1429         vmov            q10, q8
1430         vst1.32         {d2[0]},[r4,:32], r1
1431         vst1.32         {d2[1]},[r4,:32], r1
1432         vmov            q1,  q8
1433         vst1.32         {d4[0]},[r4,:32], r1
1434         vst1.32         {d4[1]},[r4,:32], r1
1435         bne             1b
1436         pop             {r4, pc}
1437 2:      \add            q1,  q8,  q1    @ tail: flush the pending two rows
1438         vrshl.s16       q1,  q1,  q9
1439         vqmovun.s16     d2,  q1
1440         vst1.32         {d2[0]},[r4,:32], r1
1441         vst1.32         {d2[1]},[r4,:32], r1
1442         pop             {r4, pc}
1443 .endm
1444
@ Entry point for uni-weighted prediction of \w-wide blocks.
@ Appears to follow the weight_h264_pixels prototype: r0 = block,
@ r1 = stride, r2 = height, r3 = log2_denom, [sp,#8] = weight (r12),
@ [sp,#12] = offset (r4) — confirm against h264dsp.h.
@ Two shift strategies keep the 16-bit intermediates in range:
@  - log2_denom > 1: halving add/sub (vhadd/vhsub folds in one >>1),
@    with shift count 1 - log2_denom to compensate;
@  - log2_denom <= 1: plain add/sub with shift count -log2_denom.
@ Negative weights are negated and the subtracting variant is used.
@ Note the two "10:" labels are distinct numeric locals; each "blt 10f"
@ targets the nearest following one.  Every weight_\w expansion ends in
@ its own pop, so the paths do not fall through.
1445 .macro  weight_func     w
1446 function ff_weight_h264_pixels_\w\()_neon, export=1
1447         push            {r4, lr}
1448         ldr             r12, [sp, #8]           @ r12 = weight
1449         ldr             r4,  [sp, #12]          @ r4 = offset
1450         cmp             r3,  #1
1451         lsl             r4,  r4,  r3            @ bias = offset << log2_denom
1452         vdup.16         q8,  r4
1453         mov             r4,  r0                 @ r4 = dst write pointer
1454         ble             20f                     @ small denom: plain add path
1455         rsb             lr,  r3,  #1            @ shift = 1 - log2_denom
1456         vdup.16         q9,  lr
1457         cmp             r12, #0
1458         blt             10f                     @ negative weight
1459         weight_\w       vhadd.s16
1460 10:     rsb             r12, r12, #0            @ |weight|, subtracting variant
1461         weight_\w       vhsub.s16
1462 20:     rsb             lr,  r3,  #0            @ shift = -log2_denom
1463         vdup.16         q9,  lr
1464         cmp             r12, #0
1465         blt             10f                     @ negative weight
1466         weight_\w       vadd.s16
1467 10:     rsb             r12, r12, #0            @ |weight|, subtracting variant
1468         weight_\w       vsub.s16
1469 endfunc
1470 .endm
1471
@ Emit ff_weight_h264_pixels_{16,8,4}_neon.
1472         weight_func     16
1473         weight_func     8
1474         weight_func     4