git.sesse.net Git - ffmpeg/blob - libavcodec/arm/h264qpel_neon.S
build: Add explanatory comments to (optimization) blocks in the Makefiles
[ffmpeg] / libavcodec / arm / h264qpel_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with Libav; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22 #include "neon.S"
23
24         /* H.264 qpel MC */
25
/*
 * lowpass_const r
 *
 * Loads the two filter multipliers used by lowpass_8 / lowpass_8_1 into d6.
 * The 32-bit value built in \r is (20 << 16) | 5, so after the vmov the
 * 16-bit lanes are d6[0] = 5 and d6[1] = 20.
 *
 * Clobbers: \r (only used as a temporary) and the low 32 bits of d6.
 */
26 .macro  lowpass_const   r
27         movw            \r,  #5
28         movt            \r,  #20
29         vmov.32         d6[0], \r
30 .endm
31
/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Applies the 6-tap filter (1, -5, 20, 20, -5, 1) to two rows at once.
 * Row A is the 16 bytes in \r0:\r1, row B the 16 bytes in \r2:\r3.  For each
 * of the 8 output positions i the 16-bit sum
 *     (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4])
 * is formed, so only bytes 0..12 of each row are read.
 *
 * narrow=1: the sums are accumulated in q0 / q8, then rounded, shifted right
 *           by 5 and saturated to unsigned 8 bit into \d0 / \d1 (d registers).
 * narrow=0: \d0 / \d1 must be q registers; the unshifted 16-bit sums are
 *           left in them.
 *
 * Expects the 16-bit lanes d6[0] / d6[1] to hold the multipliers loaded by
 * lowpass_const (5 and 20).
 * Clobbers: q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrow=1.
 * The instructions for the two rows are interleaved.
 */
32 .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
33   .if \narrow
34         t0 .req q0
35         t1 .req q8
36   .else
37         t0 .req \d0
38         t1 .req \d1
39   .endif
        /* row A: q1 = x[i+2] + x[i+3], q2 = x[i+1] + x[i+4], t0 = x[i] + x[i+5] */
40         vext.8          d2,  \r0, \r1, #2
41         vext.8          d3,  \r0, \r1, #3
42         vaddl.u8        q1,  d2,  d3
43         vext.8          d4,  \r0, \r1, #1
44         vext.8          d5,  \r0, \r1, #4
45         vaddl.u8        q2,  d4,  d5
46         vext.8          d30, \r0, \r1, #5
47         vaddl.u8        t0,  \r0, d30
        /* row B (q9, q10, t1) starts here, interleaved with the row A multiplies */
48         vext.8          d18, \r2, \r3, #2
49         vmla.i16        t0,  q1,  d6[1]
50         vext.8          d19, \r2, \r3, #3
51         vaddl.u8        q9,  d18, d19
52         vext.8          d20, \r2, \r3, #1
53         vmls.i16        t0,  q2,  d6[0]
54         vext.8          d21, \r2, \r3, #4
55         vaddl.u8        q10, d20, d21
56         vext.8          d31, \r2, \r3, #5
57         vaddl.u8        t1,  \r2, d31
58         vmla.i16        t1,  q9,  d6[1]
59         vmls.i16        t1,  q10, d6[0]
60   .if \narrow
        /* (sum + 16) >> 5, saturated to 0..255 */
61         vqrshrun.s16    \d0, t0,  #5
62         vqrshrun.s16    \d1, t1,  #5
63   .endif
64         .unreq  t0
65         .unreq  t1
66 .endm
67
/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row variant of lowpass_8: filters the 16 bytes in \r0:\r1 with the
 * taps (1, -5, 20, 20, -5, 1) and produces 8 results.
 *
 * narrow=1: the result is rounded, shifted right by 5 and saturated to
 *           unsigned 8 bit in \d0 (a d register); q0 is used as accumulator.
 * narrow=0: \d0 must be a q register and receives the unshifted 16-bit sums.
 *
 * Expects d6[0] / d6[1] as loaded by lowpass_const.
 * Clobbers: q1, q2, d30, plus q0 when narrow=1.
 */
68 .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
69   .if \narrow
70         t0 .req q0
71   .else
72         t0 .req \d0
73   .endif
74         vext.8          d2,  \r0, \r1, #2
75         vext.8          d3,  \r0, \r1, #3
76         vaddl.u8        q1,  d2,  d3
77         vext.8          d4,  \r0, \r1, #1
78         vext.8          d5,  \r0, \r1, #4
79         vaddl.u8        q2,  d4,  d5
80         vext.8          d30, \r0, \r1, #5
81         vaddl.u8        t0,  \r0, d30
82         vmla.i16        t0,  q1,  d6[1]
83         vmls.i16        t0,  q2,  d6[0]
84   .if \narrow
85         vqrshrun.s16    \d0, t0,  #5
86   .endif
87         .unreq  t0
88 .endm
89
/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * Same 6 taps as lowpass_8, but on signed 16-bit input with 32-bit
 * intermediates (used as the second pass of the 2-D filter below).
 * \r0:\r1 are two q registers holding 16 consecutive 16-bit samples; the
 * call sites in this file pass \l0/\h0 and \l1/\h1 as the d halves of \r0
 * and \r1 respectively.
 * The 8 results are rounded and shifted right by 10 while narrowing to
 * 16 bit, then saturated to unsigned 8 bit in \d.
 *
 * The multiplies are done with shifts: 20*s = (s << 4) + (s << 2) and
 * 5*s = s + (s << 2).
 *
 * Clobbers: q0-q3, q8-q10, q15 and \r1 (overwritten with samples 5..12).
 * q3 contains d6, so the multipliers loaded by lowpass_const do not survive
 * this macro.
 */
90 .macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        /* q9/q1 = x[i+2] + x[i+3] (low/high four outputs), q10/q2 = x[i+1] + x[i+4] */
91         vext.16         q1,  \r0, \r1, #2
92         vext.16         q0,  \r0, \r1, #3
93         vaddl.s16       q9,  d2,  d0
94         vext.16         q2,  \r0, \r1, #1
95         vaddl.s16       q1,  d3,  d1
96         vext.16         q3,  \r0, \r1, #4
97         vaddl.s16       q10, d4,  d6
98         vext.16         \r1, \r0, \r1, #5
99         vaddl.s16       q2,  d5,  d7
        /* q8/q0 = x[i] + x[i+5]; \l1/\h1 now refer to the shifted \r1 */
100         vaddl.s16       q0,  \h0, \h1
101         vaddl.s16       q8,  \l0, \l1
102
        /* low half: q9 *= 20, q10 *= 5 */
103         vshl.i32        q3,  q9,  #4
104         vshl.i32        q9,  q9,  #2
105         vshl.i32        q15, q10, #2
106         vadd.i32        q9,  q9,  q3
107         vadd.i32        q10, q10, q15
108
        /* high half: q1 *= 20, q2 *= 5 */
109         vshl.i32        q3,  q1,  #4
110         vshl.i32        q1,  q1,  #2
111         vshl.i32        q15, q2,  #2
112         vadd.i32        q1,  q1,  q3
113         vadd.i32        q2,  q2,  q15
114
115         vadd.i32        q9,  q9,  q8
116         vsub.i32        q9,  q9,  q10
117
118         vadd.i32        q1,  q1,  q0
119         vsub.i32        q1,  q1,  q2
120
        /* (sum + 512) >> 10 into the two halves of q9, then clamp to 0..255 */
121         vrshrn.s32      d18, q9,  #10
122         vrshrn.s32      d19, q1,  #10
123
124         vqmovun.s16     \d,  q9
125 .endm
126
/*
 * put_h264_qpel16_h_lowpass_neon_packed
 *
 * Horizontal filter of a 16-byte-wide, 16-row area, done as two 8-wide column
 * passes through put_h264_qpel8_h_lowpass_neon with a destination stride
 * of 8.  r0 is not rewound between the passes, so the output of the right
 * column follows directly after the 16 rows of the left column.
 *
 * In:  r0 = destination, r1 = source, r2 = source stride,
 *      d6 = multipliers from lowpass_const
 * Clobbers: r3, r4 (holds the return address), r12, plus everything the
 * 8-wide routine clobbers.
 */
127 function put_h264_qpel16_h_lowpass_neon_packed
128         mov             r4,  lr
129         mov             r12, #16
130         mov             r3,  #8
131         bl              put_h264_qpel8_h_lowpass_neon
        /* back up 16 source rows and move 8 bytes to the right */
132         sub             r1,  r1,  r2, lsl #4
133         add             r1,  r1,  #8
134         mov             r12, #16
        /* tail call: the second pass returns straight to our caller */
135         mov             lr,  r4
136         b               put_h264_qpel8_h_lowpass_neon
137 endfunc
138
/*
 * h264_qpel_h_lowpass type        (type = put | avg)
 *
 * Emits <type>_h264_qpel16_h_lowpass_neon and <type>_h264_qpel8_h_lowpass_neon.
 *
 * Register interface of the 8-wide routine:
 *   r0  = destination (accessed with a 64-bit alignment hint), r3 = its stride
 *   r1  = source, r2 = its stride; 16 bytes are loaded per row
 *   r12 = number of rows, consumed two per iteration
 *   d6  = multipliers from lowpass_const
 * For type=avg the filtered rows are combined with the bytes already at the
 * destination using a rounding halving add before being stored.
 */
139 .macro  h264_qpel_h_lowpass type
140 function \type\()_h264_qpel16_h_lowpass_neon
141         push            {lr}
142         mov             r12, #16
        /* left 8 columns, 16 rows */
143         bl              \type\()_h264_qpel8_h_lowpass_neon
        /* rewind both pointers by 16 rows and step 8 bytes to the right */
144         sub             r0,  r0,  r3, lsl #4
145         sub             r1,  r1,  r2, lsl #4
146         add             r0,  r0,  #8
147         add             r1,  r1,  #8
148         mov             r12, #16
149         pop             {lr}
        /*
         * No return here: with lr restored, execution continues into the
         * 8-wide routine defined directly below for the right 8 columns.
         * This relies on function/endfunc (defined outside this view) not
         * emitting any code in between.
         */
150 endfunc
151
152 function \type\()_h264_qpel8_h_lowpass_neon
153 1:      vld1.8          {d0, d1},  [r1], r2
154         vld1.8          {d16,d17}, [r1], r2
155         subs            r12, r12, #2
        /* filtered rows end up in d0 and d16 */
156         lowpass_8       d0,  d1,  d16, d17, d0,  d16
157   .ifc \type,avg
158         vld1.8          {d2},     [r0,:64], r3
159         vrhadd.u8       d0,  d0,  d2
160         vld1.8          {d3},     [r0,:64]
161         vrhadd.u8       d16, d16, d3
        /* undo the one post-increment so the stores hit the same two rows */
162         sub             r0,  r0,  r3
163   .endif
164         vst1.8          {d0},     [r0,:64], r3
165         vst1.8          {d16},    [r0,:64], r3
        /* flags still come from the subs above */
166         bne             1b
167         bx              lr
168 endfunc
169 .endm
170
171         h264_qpel_h_lowpass put
172         h264_qpel_h_lowpass avg
173
/*
 * h264_qpel_h_lowpass_l2 type     (type = put | avg)
 *
 * Like h264_qpel_h_lowpass, but the filtered rows are averaged (rounding
 * halving add) with a second 8-byte-wide source before being stored.
 *
 * Register interface of the 8-wide routine:
 *   r0  = destination (64-bit alignment hint)
 *   r1  = source to filter, 16 bytes loaded per row
 *   r3  = second source, 8 bytes loaded per row
 *   r2  = stride shared by r0, r1 and r3
 *   r12 = number of rows, consumed two per iteration
 *   d6  = multipliers from lowpass_const
 * For type=avg the result is additionally averaged with the bytes already
 * at the destination.
 */
174 .macro  h264_qpel_h_lowpass_l2 type
175 function \type\()_h264_qpel16_h_lowpass_l2_neon
176         push            {lr}
177         mov             r12, #16
178         bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        /* rewind all three pointers by 16 rows and step 8 bytes to the right */
179         sub             r0,  r0,  r2, lsl #4
180         sub             r1,  r1,  r2, lsl #4
181         sub             r3,  r3,  r2, lsl #4
182         add             r0,  r0,  #8
183         add             r1,  r1,  #8
184         add             r3,  r3,  #8
185         mov             r12, #16
186         pop             {lr}
        /*
         * No return here: execution continues into the 8-wide routine defined
         * directly below for the right 8 columns (relies on function/endfunc,
         * defined outside this view, not emitting code in between).
         */
187 endfunc
188
189 function \type\()_h264_qpel8_h_lowpass_l2_neon
190 1:      vld1.8          {d0, d1},  [r1], r2
191         vld1.8          {d16,d17}, [r1], r2
        /* q14 = two rows of the second source */
192         vld1.8          {d28},     [r3], r2
193         vld1.8          {d29},     [r3], r2
194         subs            r12, r12, #2
        /* both filtered rows land in q0 (d0, d1) so one vrhadd covers them */
195         lowpass_8       d0,  d1,  d16, d17, d0,  d1
196         vrhadd.u8       q0,  q0,  q14
197   .ifc \type,avg
198         vld1.8          {d2},      [r0,:64], r2
199         vrhadd.u8       d0,  d0,  d2
200         vld1.8          {d3},      [r0,:64]
201         vrhadd.u8       d1,  d1,  d3
202         sub             r0,  r0,  r2
203   .endif
204         vst1.8          {d0},      [r0,:64], r2
205         vst1.8          {d1},      [r0,:64], r2
206         bne             1b
207         bx              lr
208 endfunc
209 .endm
210
211         h264_qpel_h_lowpass_l2 put
212         h264_qpel_h_lowpass_l2 avg
213
/*
 * put_h264_qpel16_v_lowpass_neon_packed
 *
 * Vertical filter of a 16x16 area as four 8x8 calls to
 * put_h264_qpel8_v_lowpass_neon with a destination stride of 8.  r0 is never
 * rewound, so the four 64-byte blocks are written back to back in the order
 * top-left, bottom-left, top-right, bottom-right.
 *
 * In:  r0 = destination, r1 = source, r3 = source stride,
 *      d6 = multipliers from lowpass_const
 * Clobbers: r2, r4 (holds the return address), plus everything the 8x8
 * routine clobbers.
 */
214 function put_h264_qpel16_v_lowpass_neon_packed
215         mov             r4,  lr
216         mov             r2,  #8
217         bl              put_h264_qpel8_v_lowpass_neon
        /* the callee advanced r1 by 12 rows; go back 4 for a net step of 8 */
218         sub             r1,  r1,  r3, lsl #2
219         bl              put_h264_qpel8_v_lowpass_neon
        /* r1 is now 20 rows past the start: rewind and move 8 bytes right */
220         sub             r1,  r1,  r3, lsl #4
221         sub             r1,  r1,  r3, lsl #2
222         add             r1,  r1,  #8
223         bl              put_h264_qpel8_v_lowpass_neon
224         sub             r1,  r1,  r3, lsl #2
        /* tail call for the last block */
225         mov             lr,  r4
226         b               put_h264_qpel8_v_lowpass_neon
227 endfunc
228
/*
 * h264_qpel_v_lowpass type        (type = put | avg)
 *
 * Emits <type>_h264_qpel16_v_lowpass_neon and <type>_h264_qpel8_v_lowpass_neon.
 *
 * Register interface of the 8x8 routine:
 *   r0 = destination (64-bit alignment hint), r2 = its stride
 *   r1 = source, r3 = its stride; 13 rows of 8 bytes are read
 *   d6 = multipliers from lowpass_const
 * On return r0 has advanced by 8 rows and r1 by 12 rows.
 * The 8x8 routine uses d8-d15 without saving them; the exported entry points
 * in this file vpush/vpop them around the call.
 * For type=avg the result is averaged (rounding halving add) with the bytes
 * already at the destination.
 *
 * The 16x16 wrapper keeps its return address in r4.
 */
229 .macro  h264_qpel_v_lowpass type
230 function \type\()_h264_qpel16_v_lowpass_neon
231         mov             r4,  lr
        /* top-left 8x8 */
232         bl              \type\()_h264_qpel8_v_lowpass_neon
        /* callee advanced r1 by 12 rows; back up 4 for a net step of 8 */
233         sub             r1,  r1,  r3, lsl #2
        /* bottom-left 8x8 */
234         bl              \type\()_h264_qpel8_v_lowpass_neon
        /* rewind r0 by 16 rows and r1 by 20 rows, then step 8 bytes right */
235         sub             r0,  r0,  r2, lsl #4
236         add             r0,  r0,  #8
237         sub             r1,  r1,  r3, lsl #4
238         sub             r1,  r1,  r3, lsl #2
239         add             r1,  r1,  #8
        /* top-right 8x8 */
240         bl              \type\()_h264_qpel8_v_lowpass_neon
241         sub             r1,  r1,  r3, lsl #2
242         mov             lr,  r4
        /*
         * No return here: execution continues into the 8x8 routine defined
         * directly below for the bottom-right block (relies on
         * function/endfunc, defined outside this view, not emitting code in
         * between).
         */
243 endfunc
244
245 function \type\()_h264_qpel8_v_lowpass_neon
        /*
         * Rows 0..7 go to the low halves of q4-q7, q11-q14, rows 8..12 to the
         * high halves of q4-q7, q11.  d25, d27 and d29 are not loaded.
         */
246         vld1.8          {d8},  [r1], r3
247         vld1.8          {d10}, [r1], r3
248         vld1.8          {d12}, [r1], r3
249         vld1.8          {d14}, [r1], r3
250         vld1.8          {d22}, [r1], r3
251         vld1.8          {d24}, [r1], r3
252         vld1.8          {d26}, [r1], r3
253         vld1.8          {d28}, [r1], r3
254         vld1.8          {d9},  [r1], r3
255         vld1.8          {d11}, [r1], r3
256         vld1.8          {d13}, [r1], r3
257         vld1.8          {d15}, [r1], r3
258         vld1.8          {d23}, [r1]
259
        /*
         * transpose_8x8 is defined outside this view (presumably neon.S).
         * Presumably it turns each source column into one q register so that
         * the row filter lowpass_8 runs along the vertical direction; the
         * second transpose then restores row order - TODO confirm.
         * lowpass_8 reads only bytes 0..12 of each register pair, which would
         * keep the data derived from the unloaded d25/d27/d29 out of the
         * result.
         */
260         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
261         lowpass_8       d8,  d9,  d10, d11, d8,  d10
262         lowpass_8       d12, d13, d14, d15, d12, d14
263         lowpass_8       d22, d23, d24, d25, d22, d24
264         lowpass_8       d26, d27, d28, d29, d26, d28
265         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
266
267   .ifc \type,avg
        /* average each result row with the existing destination row */
268         vld1.8          {d9},  [r0,:64], r2
269         vrhadd.u8       d8,  d8,  d9
270         vld1.8          {d11}, [r0,:64], r2
271         vrhadd.u8       d10, d10, d11
272         vld1.8          {d13}, [r0,:64], r2
273         vrhadd.u8       d12, d12, d13
274         vld1.8          {d15}, [r0,:64], r2
275         vrhadd.u8       d14, d14, d15
276         vld1.8          {d23}, [r0,:64], r2
277         vrhadd.u8       d22, d22, d23
278         vld1.8          {d25}, [r0,:64], r2
279         vrhadd.u8       d24, d24, d25
280         vld1.8          {d27}, [r0,:64], r2
281         vrhadd.u8       d26, d26, d27
282         vld1.8          {d29}, [r0,:64], r2
283         vrhadd.u8       d28, d28, d29
        /* rewind the 8 rows just read */
284         sub             r0,  r0,  r2,  lsl #3
285   .endif
286
287         vst1.8          {d8},  [r0,:64], r2
288         vst1.8          {d10}, [r0,:64], r2
289         vst1.8          {d12}, [r0,:64], r2
290         vst1.8          {d14}, [r0,:64], r2
291         vst1.8          {d22}, [r0,:64], r2
292         vst1.8          {d24}, [r0,:64], r2
293         vst1.8          {d26}, [r0,:64], r2
294         vst1.8          {d28}, [r0,:64], r2
295
296         bx              lr
297 endfunc
298 .endm
299
300         h264_qpel_v_lowpass put
301         h264_qpel_v_lowpass avg
302
/*
 * h264_qpel_v_lowpass_l2 type     (type = put | avg)
 *
 * Like h264_qpel_v_lowpass, but the filtered 8x8 block is averaged (rounding
 * halving add) with a second source before being stored.
 *
 * Register interface of the 8x8 routine:
 *   r0  = destination (64-bit alignment hint), stride r3
 *   r1  = source to filter, stride r3; 13 rows of 8 bytes are read
 *   r12 = second source, stride r2; 8 rows of 8 bytes are read
 *   d6  = multipliers from lowpass_const
 * On return r0 and r12 have advanced by 8 rows, r1 by 12 rows.
 * d8-d15 are used without being saved here.
 * For type=avg the result is additionally averaged with the bytes already
 * at the destination.
 *
 * The 16x16 wrapper keeps its return address in r4.
 */
303 .macro  h264_qpel_v_lowpass_l2 type
304 function \type\()_h264_qpel16_v_lowpass_l2_neon
305         mov             r4,  lr
        /* top-left 8x8 */
306         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        /* callee advanced r1 by 12 rows; back up 4 for a net step of 8 */
307         sub             r1,  r1,  r3, lsl #2
        /* bottom-left 8x8 */
308         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        /* rewind r0 and r12 by 16 rows, r1 by 20 rows; step 8 bytes right */
309         sub             r0,  r0,  r3, lsl #4
310         sub             r12, r12, r2, lsl #4
311         add             r0,  r0,  #8
312         add             r12, r12, #8
313         sub             r1,  r1,  r3, lsl #4
314         sub             r1,  r1,  r3, lsl #2
315         add             r1,  r1,  #8
        /* top-right 8x8 */
316         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
317         sub             r1,  r1,  r3, lsl #2
318         mov             lr,  r4
        /*
         * No return here: execution continues into the 8x8 routine defined
         * directly below for the bottom-right block (relies on
         * function/endfunc, defined outside this view, not emitting code in
         * between).
         */
319 endfunc
320
321 function \type\()_h264_qpel8_v_lowpass_l2_neon
        /* same 13-row load pattern as the plain vertical routine */
322         vld1.8          {d8},  [r1], r3
323         vld1.8          {d10}, [r1], r3
324         vld1.8          {d12}, [r1], r3
325         vld1.8          {d14}, [r1], r3
326         vld1.8          {d22}, [r1], r3
327         vld1.8          {d24}, [r1], r3
328         vld1.8          {d26}, [r1], r3
329         vld1.8          {d28}, [r1], r3
330         vld1.8          {d9},  [r1], r3
331         vld1.8          {d11}, [r1], r3
332         vld1.8          {d13}, [r1], r3
333         vld1.8          {d15}, [r1], r3
334         vld1.8          {d23}, [r1]
335
        /*
         * transpose_8x8 is defined outside this view (presumably neon.S).
         * Here the filter outputs are placed in adjacent d registers so that
         * after the second transpose the eight result rows sit in
         * q4, q6, q11, q13 and can be averaged a q register at a time.
         */
336         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
337         lowpass_8       d8,  d9,  d10, d11, d8,  d9
338         lowpass_8       d12, d13, d14, d15, d12, d13
339         lowpass_8       d22, d23, d24, d25, d22, d23
340         lowpass_8       d26, d27, d28, d29, d26, d27
341         transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
342
        /* second source rows into q0, q1, q2, q5 and average with the result */
343         vld1.8          {d0},  [r12], r2
344         vld1.8          {d1},  [r12], r2
345         vld1.8          {d2},  [r12], r2
346         vld1.8          {d3},  [r12], r2
347         vld1.8          {d4},  [r12], r2
348         vrhadd.u8       q0,  q0,  q4
349         vld1.8          {d5},  [r12], r2
350         vrhadd.u8       q1,  q1,  q6
351         vld1.8          {d10}, [r12], r2
352         vrhadd.u8       q2,  q2,  q11
353         vld1.8          {d11}, [r12], r2
354         vrhadd.u8       q5,  q5,  q13
355
356   .ifc \type,avg
357         vld1.8          {d16}, [r0,:64], r3
358         vrhadd.u8       d0,  d0,  d16
359         vld1.8          {d17}, [r0,:64], r3
360         vrhadd.u8       d1,  d1,  d17
361         vld1.8          {d16}, [r0,:64], r3
362         vrhadd.u8       d2,  d2,  d16
363         vld1.8          {d17}, [r0,:64], r3
364         vrhadd.u8       d3,  d3,  d17
365         vld1.8          {d16}, [r0,:64], r3
366         vrhadd.u8       d4,  d4,  d16
367         vld1.8          {d17}, [r0,:64], r3
368         vrhadd.u8       d5,  d5,  d17
369         vld1.8          {d16}, [r0,:64], r3
370         vrhadd.u8       d10, d10, d16
371         vld1.8          {d17}, [r0,:64], r3
372         vrhadd.u8       d11, d11, d17
        /* rewind the 8 rows just read */
373         sub             r0,  r0,  r3,  lsl #3
374   .endif
375
376         vst1.8          {d0},  [r0,:64], r3
377         vst1.8          {d1},  [r0,:64], r3
378         vst1.8          {d2},  [r0,:64], r3
379         vst1.8          {d3},  [r0,:64], r3
380         vst1.8          {d4},  [r0,:64], r3
381         vst1.8          {d5},  [r0,:64], r3
382         vst1.8          {d10}, [r0,:64], r3
383         vst1.8          {d11}, [r0,:64], r3
384
385         bx              lr
386 endfunc
387 .endm
388
389         h264_qpel_v_lowpass_l2 put
390         h264_qpel_v_lowpass_l2 avg
391
/*
 * put_h264_qpel8_hv_lowpass_neon_top
 *
 * Shared core of the 8x8 two-pass filter: lowpass_8 (unshifted, 16-bit) on
 * 13 source rows, then lowpass_8.16 across those intermediate rows.
 *
 * In:   r1 = source (13 rows of 16 bytes are read), r3 = source stride,
 *       r4 = 16-byte aligned scratch buffer; 16*12 bytes of it are used
 * Out:  eight 8-byte result rows in d12, d13, d14, d15, d8, d9, d10, d11
 *       (the callers in this file store them to consecutive rows in that
 *       order); r1 advanced by 12 rows, r4 back at its entry value,
 *       r12 = -16
 * Clobbers: q0-q15 (d8-d15 are not saved here), r12.  Loads d6 itself via
 * lowpass_const.
 *
 * swap4, transpose16_4x4 and transpose_8x8 are macros defined outside this
 * view (presumably neon.S); what they achieve is inferred below from how
 * their operands are consumed - TODO confirm against their definitions.
 */
392 function put_h264_qpel8_hv_lowpass_neon_top
        /* r12 is only a temporary for loading d6, then becomes the row counter */
393         lowpass_const   r12
394         mov             r12, #12
        /* first pass, rows 0..11: 16-bit sums spilled to the scratch buffer */
395 1:      vld1.8          {d0, d1},  [r1], r3
396         vld1.8          {d16,d17}, [r1], r3
397         subs            r12, r12, #2
398         lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
399         vst1.8          {d22-d25}, [r4,:128]!
400         bne             1b
401
        /* row 12 stays in q12 */
402         vld1.8          {d0, d1},  [r1]
403         lowpass_8_1     d0,  d1,  q12, narrow=0
404
        /*
         * Read the 12 spilled rows back, last one first, walking r4 down to
         * the start of the buffer: rows 0..10 end up in q0..q10 and row 11
         * in q15.
         */
405         mov             r12, #-16
406         add             r4,  r4,  r12
407         vld1.8          {d30,d31}, [r4,:128], r12
408         vld1.8          {d20,d21}, [r4,:128], r12
409         vld1.8          {d18,d19}, [r4,:128], r12
410         vld1.8          {d16,d17}, [r4,:128], r12
411         vld1.8          {d14,d15}, [r4,:128], r12
412         vld1.8          {d12,d13}, [r4,:128], r12
413         vld1.8          {d10,d11}, [r4,:128], r12
414         vld1.8          {d8, d9},  [r4,:128], r12
415         vld1.8          {d6, d7},  [r4,:128], r12
416         vld1.8          {d4, d5},  [r4,:128], r12
417         vld1.8          {d2, d3},  [r4,:128], r12
418         vld1.8          {d0, d1},  [r4,:128]
419
        /* presumably a 16-bit transpose of the 13 rows held in registers */
420         swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
421         transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
422
423         swap4           d17, d19, d21, d31, d24, d26, d28, d22
424         transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
425
        /*
         * Spill eight registers to the scratch buffer to free them for the
         * second pass; they are reloaded in reverse order below.
         */
426         vst1.8          {d30,d31}, [r4,:128]!
427         vst1.8          {d6, d7},  [r4,:128]!
428         vst1.8          {d20,d21}, [r4,:128]!
429         vst1.8          {d4, d5},  [r4,:128]!
430         vst1.8          {d18,d19}, [r4,:128]!
431         vst1.8          {d2, d3},  [r4,:128]!
432         vst1.8          {d16,d17}, [r4,:128]!
433         vst1.8          {d0, d1},  [r4,:128]
434
        /* second pass on the four register pairs still live -> d8..d11 */
435         lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
436         lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
437         lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
438         lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
439
        /* second pass on the four spilled pairs -> d12..d15 */
440         vld1.8          {d16,d17}, [r4,:128], r12
441         vld1.8          {d30,d31}, [r4,:128], r12
442         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
443         vld1.8          {d16,d17}, [r4,:128], r12
444         vld1.8          {d30,d31}, [r4,:128], r12
445         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
446         vld1.8          {d16,d17}, [r4,:128], r12
447         vld1.8          {d30,d31}, [r4,:128], r12
448         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
449         vld1.8          {d16,d17}, [r4,:128], r12
450         vld1.8          {d30,d31}, [r4,:128]
451         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
452
        /* presumably back from column order to row order */
453         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
454
455         bx              lr
456 endfunc
457
/*
 * h264_qpel8_hv_lowpass type      (type = put | avg)
 *
 * Emits <type>_h264_qpel8_hv_lowpass_neon: runs
 * put_h264_qpel8_hv_lowpass_neon_top and writes its eight result rows.
 *
 * In:  r0 = destination (64-bit alignment hint), r2 = destination stride,
 *      r1 / r3 / r4 as required by put_h264_qpel8_hv_lowpass_neon_top
 *      (source, source stride, scratch buffer)
 * On return r0 has advanced by 8 rows.
 * Clobbers: r10 (holds the return address) plus everything the core routine
 * clobbers.
 * For type=avg the result is averaged (rounding halving add) with the bytes
 * already at the destination.
 */
458 .macro  h264_qpel8_hv_lowpass type
459 function \type\()_h264_qpel8_hv_lowpass_neon
460         mov             r10, lr
461         bl              put_h264_qpel8_hv_lowpass_neon_top
462   .ifc \type,avg
        /* result rows are in d12-d15 followed by d8-d11 */
463         vld1.8          {d0},      [r0,:64], r2
464         vrhadd.u8       d12, d12, d0
465         vld1.8          {d1},      [r0,:64], r2
466         vrhadd.u8       d13, d13, d1
467         vld1.8          {d2},      [r0,:64], r2
468         vrhadd.u8       d14, d14, d2
469         vld1.8          {d3},      [r0,:64], r2
470         vrhadd.u8       d15, d15, d3
471         vld1.8          {d4},      [r0,:64], r2
472         vrhadd.u8       d8,  d8,  d4
473         vld1.8          {d5},      [r0,:64], r2
474         vrhadd.u8       d9,  d9,  d5
475         vld1.8          {d6},      [r0,:64], r2
476         vrhadd.u8       d10, d10, d6
477         vld1.8          {d7},      [r0,:64], r2
478         vrhadd.u8       d11, d11, d7
        /* rewind the 8 rows just read */
479         sub             r0,  r0,  r2,  lsl #3
480   .endif
481
482         vst1.8          {d12},     [r0,:64], r2
483         vst1.8          {d13},     [r0,:64], r2
484         vst1.8          {d14},     [r0,:64], r2
485         vst1.8          {d15},     [r0,:64], r2
486         vst1.8          {d8},      [r0,:64], r2
487         vst1.8          {d9},      [r0,:64], r2
488         vst1.8          {d10},     [r0,:64], r2
489         vst1.8          {d11},     [r0,:64], r2
490
491         mov             lr,  r10
492         bx              lr
493 endfunc
494 .endm
495
496         h264_qpel8_hv_lowpass put
497         h264_qpel8_hv_lowpass avg
498
/*
 * h264_qpel8_hv_lowpass_l2 type   (type = put | avg)
 *
 * Emits <type>_h264_qpel8_hv_lowpass_l2_neon: runs
 * put_h264_qpel8_hv_lowpass_neon_top and averages (rounding halving add)
 * its 8x8 result with a second, packed 8x8 block before storing.
 *
 * In:  r0 = destination (64-bit alignment hint)
 *      r1 = source, r4 = scratch buffer (see the core routine)
 *      r2 = second source: 64 contiguous bytes, 16-byte aligned
 *      r3 = stride used both for the source (by the core routine) and for
 *           the destination
 * On return r0 has advanced by 8 rows and r2 by 64 bytes.
 * Clobbers: r10 (holds the return address) plus everything the core routine
 * clobbers.
 * For type=avg the result is additionally averaged with the bytes already
 * at the destination.
 */
499 .macro  h264_qpel8_hv_lowpass_l2 type
500 function \type\()_h264_qpel8_hv_lowpass_l2_neon
501         mov             r10, lr
502         bl              put_h264_qpel8_hv_lowpass_neon_top
503
        /*
         * Core result rows are q6 (d12,d13), q7 (d14,d15), q4 (d8,d9),
         * q5 (d10,d11) in row order; pair them with the packed second source.
         */
504         vld1.8          {d0, d1},  [r2,:128]!
505         vld1.8          {d2, d3},  [r2,:128]!
506         vrhadd.u8       q0,  q0,  q6
507         vld1.8          {d4, d5},  [r2,:128]!
508         vrhadd.u8       q1,  q1,  q7
509         vld1.8          {d6, d7},  [r2,:128]!
510         vrhadd.u8       q2,  q2,  q4
511         vrhadd.u8       q3,  q3,  q5
512   .ifc \type,avg
513         vld1.8          {d16},     [r0,:64], r3
514         vrhadd.u8       d0,  d0,  d16
515         vld1.8          {d17},     [r0,:64], r3
516         vrhadd.u8       d1,  d1,  d17
517         vld1.8          {d18},     [r0,:64], r3
518         vrhadd.u8       d2,  d2,  d18
519         vld1.8          {d19},     [r0,:64], r3
520         vrhadd.u8       d3,  d3,  d19
521         vld1.8          {d20},     [r0,:64], r3
522         vrhadd.u8       d4,  d4,  d20
523         vld1.8          {d21},     [r0,:64], r3
524         vrhadd.u8       d5,  d5,  d21
525         vld1.8          {d22},     [r0,:64], r3
526         vrhadd.u8       d6,  d6,  d22
527         vld1.8          {d23},     [r0,:64], r3
528         vrhadd.u8       d7,  d7,  d23
        /* rewind the 8 rows just read */
529         sub             r0,  r0,  r3,  lsl #3
530   .endif
531         vst1.8          {d0},      [r0,:64], r3
532         vst1.8          {d1},      [r0,:64], r3
533         vst1.8          {d2},      [r0,:64], r3
534         vst1.8          {d3},      [r0,:64], r3
535         vst1.8          {d4},      [r0,:64], r3
536         vst1.8          {d5},      [r0,:64], r3
537         vst1.8          {d6},      [r0,:64], r3
538         vst1.8          {d7},      [r0,:64], r3
539
540         mov             lr,  r10
541         bx              lr
542 endfunc
543 .endm
544
545         h264_qpel8_hv_lowpass_l2 put
546         h264_qpel8_hv_lowpass_l2 avg
547
/*
 * h264_qpel16_hv type             (type = put | avg)
 *
 * Emits the 16x16 wrappers <type>_h264_qpel16_hv_lowpass_neon and
 * <type>_h264_qpel16_hv_lowpass_l2_neon.  Each runs the matching 8x8 routine
 * four times in the order top-left, bottom-left, top-right, bottom-right and
 * keeps its own return address in r9 (the 8x8 routines use r10).
 *
 * Each 8x8 call leaves r1 advanced by 12 rows and r0 by 8 rows, which is
 * what the pointer adjustments between the calls compensate for.
 */
548 .macro  h264_qpel16_hv  type
549 function \type\()_h264_qpel16_hv_lowpass_neon
        /* r0 = destination (stride r2), r1 = source (stride r3), r4 = scratch */
550         mov             r9,  lr
551         bl              \type\()_h264_qpel8_hv_lowpass_neon
        /* net step of 8 source rows */
552         sub             r1,  r1,  r3, lsl #2
553         bl              \type\()_h264_qpel8_hv_lowpass_neon
        /* rewind r1 by 20 rows and r0 by 16 rows, then step 8 bytes right */
554         sub             r1,  r1,  r3, lsl #4
555         sub             r1,  r1,  r3, lsl #2
556         add             r1,  r1,  #8
557         sub             r0,  r0,  r2, lsl #4
558         add             r0,  r0,  #8
559         bl              \type\()_h264_qpel8_hv_lowpass_neon
560         sub             r1,  r1,  r3, lsl #2
        /* tail call for the last block */
561         mov             lr,  r9
562         b               \type\()_h264_qpel8_hv_lowpass_neon
563 endfunc
564
565 function \type\()_h264_qpel16_hv_lowpass_l2_neon
        /*
         * r0 = destination and r1 = source, both with stride r3; r4 = scratch.
         * The packed second source is taken to sit in the 256 bytes directly
         * below the scratch buffer; the 8x8 routine consumes 64 bytes of it
         * per call through r2.
         */
566         mov             r9,  lr
567         sub             r2,  r4,  #256
568         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
569         sub             r1,  r1,  r3, lsl #2
570         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        /* rewind r1 by 20 rows and r0 by 16 rows, then step 8 bytes right */
571         sub             r1,  r1,  r3, lsl #4
572         sub             r1,  r1,  r3, lsl #2
573         add             r1,  r1,  #8
574         sub             r0,  r0,  r3, lsl #4
575         add             r0,  r0,  #8
576         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
577         sub             r1,  r1,  r3, lsl #2
        /* tail call for the last block */
578         mov             lr,  r9
579         b               \type\()_h264_qpel8_hv_lowpass_l2_neon
580 endfunc
581 .endm
582
583         h264_qpel16_hv put
584         h264_qpel16_hv avg
585
/*
 * h264_qpel8 type                 (type = put | avg)
 *
 * Emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
 *
 * All of them take r0 = destination, r1 = source and r2 = stride (one stride
 * is used for both), as can be seen from how the registers are forwarded to
 * the internal routines.  NOTE(review): the C prototypes are not visible
 * here - confirm against the declarations.
 *
 * Common patterns:
 *  - "sub r1, r1, #2" / "sub r1, r1, r2, lsl #1" move the source two bytes
 *    left / two rows up, because the 6-tap filters read from two samples
 *    before the output position.
 *  - Entry points that reach the vertical or 2-D routines vpush/vpop d8-d15,
 *    which those routines overwrite.
 *  - Entry points that need a temporary block save sp in r11, align sp down
 *    to 16 bytes and carve the buffer from the stack.  The lines prefixed
 *    with A and T are alternative encodings of the alignment step
 *    (presumably ARM vs. Thumb, selected by macros from asm.S - TODO confirm).
 *  - Several entry points only adjust pointers, push the same register list
 *    as a sibling and branch to the sibling's local label.
 */
586 .macro  h264_qpel8      type
        /* mc10: horizontal filter averaged with the source itself */
587 function ff_\type\()_h264_qpel8_mc10_neon, export=1
588         lowpass_const   r3
589         mov             r3,  r1
590         sub             r1,  r1,  #2
591         mov             r12, #8
592         b               \type\()_h264_qpel8_h_lowpass_l2_neon
593 endfunc
594
        /* mc20: horizontal filter only; destination stride = source stride */
595 function ff_\type\()_h264_qpel8_mc20_neon, export=1
596         lowpass_const   r3
597         sub             r1,  r1,  #2
598         mov             r3,  r2
599         mov             r12, #8
600         b               \type\()_h264_qpel8_h_lowpass_neon
601 endfunc
602
        /* mc30: horizontal filter averaged with the source one byte to the right */
603 function ff_\type\()_h264_qpel8_mc30_neon, export=1
604         lowpass_const   r3
605         add             r3,  r1,  #1
606         sub             r1,  r1,  #2
607         mov             r12, #8
608         b               \type\()_h264_qpel8_h_lowpass_l2_neon
609 endfunc
610
        /*
         * mc01: vertical filter averaged with the source itself (r12).
         * The local label is the shared tail that mc03 branches to.
         */
611 function ff_\type\()_h264_qpel8_mc01_neon, export=1
612         push            {lr}
613         mov             r12, r1
614 \type\()_h264_qpel8_mc01:
615         lowpass_const   r3
616         mov             r3,  r2
617         sub             r1,  r1,  r2, lsl #1
618         vpush           {d8-d15}
619         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
620         vpop            {d8-d15}
621         pop             {pc}
622 endfunc
623
        /*
         * mc11: horizontal filter into a packed 8x8 stack buffer, then
         * vertical filter averaged with that buffer.
         * The r0/r1 pushed on entry are reloaded for the vertical pass, which
         * lets mc31/mc13/mc33 use different source positions for the two
         * passes.
         */
624 function ff_\type\()_h264_qpel8_mc11_neon, export=1
625         push            {r0, r1, r11, lr}
626 \type\()_h264_qpel8_mc11:
627         lowpass_const   r3
628         mov             r11, sp
629 A       bic             sp,  sp,  #15
630 T       bic             r0,  r11, #15
631 T       mov             sp,  r0
        /* 64-byte temporary block, used as destination with stride 8 */
632         sub             sp,  sp,  #64
633         mov             r0,  sp
634         sub             r1,  r1,  #2
635         mov             r3,  #8
636         mov             r12, #8
637         vpush           {d8-d15}
638         bl              put_h264_qpel8_h_lowpass_neon
        /* reload the saved r0/r1 and step r11 past them */
639         ldrd            r0,  r1,  [r11], #8
640         mov             r3,  r2
        /* the vpush moved sp down by 64, so the temporary block is at sp+64 */
641         add             r12, sp,  #64
642         sub             r1,  r1,  r2, lsl #1
643         mov             r2,  #8
644         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
645         vpop            {d8-d15}
646         mov             sp,  r11
647         pop             {r11, pc}
648 endfunc
649
        /*
         * mc21: horizontal filter into a packed 8x8 stack buffer, then 2-D
         * filter averaged with that buffer.  The stack area holds the 8*8
         * block followed by the 16*12-byte scratch for the 2-D core.
         */
650 function ff_\type\()_h264_qpel8_mc21_neon, export=1
651         push            {r0, r1, r4, r10, r11, lr}
652 \type\()_h264_qpel8_mc21:
653         lowpass_const   r3
654         mov             r11, sp
655 A       bic             sp,  sp,  #15
656 T       bic             r0,  r11, #15
657 T       mov             sp,  r0
658         sub             sp,  sp,  #(8*8+16*12)
659         sub             r1,  r1,  #2
660         mov             r3,  #8
661         mov             r0,  sp
662         mov             r12, #8
663         vpush           {d8-d15}
664         bl              put_h264_qpel8_h_lowpass_neon
        /* r0 now points just past the 64-byte block: that is the scratch area */
665         mov             r4,  r0
666         ldrd            r0,  r1,  [r11], #8
667         sub             r1,  r1,  r2, lsl #1
668         sub             r1,  r1,  #2
669         mov             r3,  r2
        /* r2 = start of the 64-byte block (second source) */
670         sub             r2,  r4,  #64
671         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
672         vpop            {d8-d15}
673         mov             sp,  r11
674         pop             {r4, r10, r11, pc}
675 endfunc
676
        /* mc31: as mc11, with the vertical pass one byte to the right */
677 function ff_\type\()_h264_qpel8_mc31_neon, export=1
678         add             r1,  r1,  #1
679         push            {r0, r1, r11, lr}
680         sub             r1,  r1,  #1
681         b               \type\()_h264_qpel8_mc11
682 endfunc
683
        /* mc02: vertical filter only */
684 function ff_\type\()_h264_qpel8_mc02_neon, export=1
685         push            {lr}
686         lowpass_const   r3
687         sub             r1,  r1,  r2, lsl #1
688         mov             r3,  r2
689         vpush           {d8-d15}
690         bl              \type\()_h264_qpel8_v_lowpass_neon
691         vpop            {d8-d15}
692         pop             {pc}
693 endfunc
694
        /*
         * mc12: vertical filter into a packed 8x8 stack buffer, then 2-D
         * filter averaged with that buffer.  Same stack layout as mc21.
         */
695 function ff_\type\()_h264_qpel8_mc12_neon, export=1
696         push            {r0, r1, r4, r10, r11, lr}
697 \type\()_h264_qpel8_mc12:
698         lowpass_const   r3
699         mov             r11, sp
700 A       bic             sp,  sp,  #15
701 T       bic             r0,  r11, #15
702 T       mov             sp,  r0
703         sub             sp,  sp,  #(8*8+16*12)
704         sub             r1,  r1,  r2, lsl #1
        /* r3 keeps the caller's stride from here on; r2 = 8 is the buffer stride */
705         mov             r3,  r2
706         mov             r2,  #8
707         mov             r0,  sp
708         vpush           {d8-d15}
709         bl              put_h264_qpel8_v_lowpass_neon
        /* r0 now points just past the 64-byte block: that is the scratch area */
710         mov             r4,  r0
711         ldrd            r0,  r1,  [r11], #8
712         sub             r1,  r1,  r3, lsl #1
713         sub             r1,  r1,  #2
714         sub             r2,  r4,  #64
715         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
716         vpop            {d8-d15}
717         mov             sp,  r11
718         pop             {r4, r10, r11, pc}
719 endfunc
720
        /* mc22: 2-D filter only; 16*12 bytes of aligned stack as scratch (r4) */
721 function ff_\type\()_h264_qpel8_mc22_neon, export=1
722         push            {r4, r10, r11, lr}
723         mov             r11, sp
724 A       bic             sp,  sp,  #15
725 T       bic             r4,  r11, #15
726 T       mov             sp,  r4
727         sub             r1,  r1,  r2, lsl #1
728         sub             r1,  r1,  #2
729         mov             r3,  r2
730         sub             sp,  sp,  #(16*12)
731         mov             r4,  sp
732         vpush           {d8-d15}
733         bl              \type\()_h264_qpel8_hv_lowpass_neon
734         vpop            {d8-d15}
735         mov             sp,  r11
736         pop             {r4, r10, r11, pc}
737 endfunc
738
        /* mc32: as mc12, with the vertical pass one byte to the right */
739 function ff_\type\()_h264_qpel8_mc32_neon, export=1
740         push            {r0, r1, r4, r10, r11, lr}
741         add             r1,  r1,  #1
742         b               \type\()_h264_qpel8_mc12
743 endfunc
744
        /* mc03: as mc01, averaged with the source one row down */
745 function ff_\type\()_h264_qpel8_mc03_neon, export=1
746         push            {lr}
747         add             r12, r1,  r2
748         b               \type\()_h264_qpel8_mc01
749 endfunc
750
        /* mc13: as mc11, with the horizontal pass one row down */
751 function ff_\type\()_h264_qpel8_mc13_neon, export=1
752         push            {r0, r1, r11, lr}
753         add             r1,  r1,  r2
754         b               \type\()_h264_qpel8_mc11
755 endfunc
756
        /* mc23: as mc21, with the horizontal pass one row down */
757 function ff_\type\()_h264_qpel8_mc23_neon, export=1
758         push            {r0, r1, r4, r10, r11, lr}
759         add             r1,  r1,  r2
760         b               \type\()_h264_qpel8_mc21
761 endfunc
762
        /*
         * mc33: as mc11, with the vertical pass one byte to the right and the
         * horizontal pass one row down
         */
763 function ff_\type\()_h264_qpel8_mc33_neon, export=1
764         add             r1,  r1,  #1
765         push            {r0, r1, r11, lr}
766         add             r1,  r1,  r2
767         sub             r1,  r1,  #1
768         b               \type\()_h264_qpel8_mc11
769 endfunc
770 .endm
771
772         h264_qpel8 put
773         h264_qpel8 avg
774
775 .macro  h264_qpel16     type
        /*
         * mc10 (16x16): horizontal filter averaged with the source itself.
         * r0 = destination, r1 = source, r2 = stride.  r3 is first a
         * temporary for lowpass_const, then the second-source pointer.
         * The row count (r12) is set by the 16-wide routine.
         */
776 function ff_\type\()_h264_qpel16_mc10_neon, export=1
777         lowpass_const   r3
778         mov             r3,  r1
        /* the filter reads from two bytes left of the output position */
779         sub             r1,  r1,  #2
780         b               \type\()_h264_qpel16_h_lowpass_l2_neon
781 endfunc
782
        /*
         * mc20 (16x16): horizontal filter only.
         * r0 = destination, r1 = source, r2 = stride; the destination stride
         * r3 is set equal to r2.
         */
783 function ff_\type\()_h264_qpel16_mc20_neon, export=1
784         lowpass_const   r3
        /* the filter reads from two bytes left of the output position */
785         sub             r1,  r1,  #2
786         mov             r3,  r2
787         b               \type\()_h264_qpel16_h_lowpass_neon
788 endfunc
789
        /*
         * mc30 (16x16): horizontal filter averaged with the source one byte
         * to the right.
         * r0 = destination, r1 = source, r2 = stride.
         */
790 function ff_\type\()_h264_qpel16_mc30_neon, export=1
791         lowpass_const   r3
        /* second source = source + 1, taken before r1 is moved left */
792         add             r3,  r1,  #1
793         sub             r1,  r1,  #2
794         b               \type\()_h264_qpel16_h_lowpass_l2_neon
795 endfunc
796
        /*
         * mc01 (16x16): vertical filter averaged with the source itself.
         * r0 = destination, r1 = source, r2 = stride.
         * r4 is saved because the 16x16 vertical routine keeps its return
         * address there; d8-d15 are saved because the 8x8 vertical routine
         * overwrites them.
         * The local label marks a shared tail that expects r12 (second
         * source) to be set and {r4, lr} to be on the stack.
         */
797 function ff_\type\()_h264_qpel16_mc01_neon, export=1
798         push            {r4, lr}
799         mov             r12, r1
800 \type\()_h264_qpel16_mc01:
801         lowpass_const   r3
        /* one stride for source and destination; start two rows above */
802         mov             r3,  r2
803         sub             r1,  r1,  r2, lsl #1
804         vpush           {d8-d15}
805         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
806         vpop            {d8-d15}
807         pop             {r4, pc}
808 endfunc
809
        /*
         * mc11 (16x16): horizontal filter into a 16x16 stack buffer
         * (stride 16), then vertical filter averaged with that buffer.
         * r0 = destination, r1 = source, r2 = stride.
         * The r0/r1 pushed on entry are reloaded for the vertical pass, so
         * code that branches to the local label can use different source
         * positions for the two passes.
         * The lines prefixed with A and T are alternative encodings of the
         * stack alignment step (presumably ARM vs. Thumb, selected by macros
         * from asm.S - TODO confirm).
         */
810 function ff_\type\()_h264_qpel16_mc11_neon, export=1
811         push            {r0, r1, r4, r11, lr}
812 \type\()_h264_qpel16_mc11:
813         lowpass_const   r3
        /* r11 remembers the unaligned sp for the epilogue */
814         mov             r11, sp
815 A       bic             sp,  sp,  #15
816 T       bic             r0,  r11, #15
817 T       mov             sp,  r0
        /* 16*16-byte temporary block, used as destination with stride 16 */
818         sub             sp,  sp,  #256
819         mov             r0,  sp
820         sub             r1,  r1,  #2
821         mov             r3,  #16
822         vpush           {d8-d15}
823         bl              put_h264_qpel16_h_lowpass_neon
        /* reload the saved r0/r1 and step r11 past them */
824         ldrd            r0,  r1,  [r11], #8
825         mov             r3,  r2
        /* the vpush moved sp down by 64, so the temporary block is at sp+64 */
826         add             r12, sp,  #64
827         sub             r1,  r1,  r2, lsl #1
        /* r2 becomes the stride of the second source (the temporary block) */
828         mov             r2,  #16
829         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
830         vpop            {d8-d15}
831         mov             sp,  r11
832         pop             {r4, r11, pc}
833 endfunc
834
835 function ff_\type\()_h264_qpel16_mc21_neon, export=1
836         push            {r0, r1, r4-r5, r9-r11, lr}
837 \type\()_h264_qpel16_mc21:
838         lowpass_const   r3
839         mov             r11, sp
840 A       bic             sp,  sp,  #15
841 T       bic             r0,  r11, #15
842 T       mov             sp,  r0
843         sub             sp,  sp,  #(16*16+16*12)
844         sub             r1,  r1,  #2
845         mov             r0,  sp
846         vpush           {d8-d15}
847         bl              put_h264_qpel16_h_lowpass_neon_packed
848         mov             r4,  r0
849         ldrd            r0,  r1,  [r11], #8
850         sub             r1,  r1,  r2, lsl #1
851         sub             r1,  r1,  #2
852         mov             r3,  r2
853         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
854         vpop            {d8-d15}
855         mov             sp,  r11
856         pop             {r4-r5, r9-r11, pc}
857 endfunc
858
859 function ff_\type\()_h264_qpel16_mc31_neon, export=1
860         add             r1,  r1,  #1
861         push            {r0, r1, r4, r11, lr}
862         sub             r1,  r1,  #1
863         b               \type\()_h264_qpel16_mc11
864 endfunc
865
866 function ff_\type\()_h264_qpel16_mc02_neon, export=1
867         push            {r4, lr}
868         lowpass_const   r3
869         sub             r1,  r1,  r2, lsl #1
870         mov             r3,  r2
871         vpush           {d8-d15}
872         bl              \type\()_h264_qpel16_v_lowpass_neon
873         vpop            {d8-d15}
874         pop             {r4, pc}
875 endfunc
876
877 function ff_\type\()_h264_qpel16_mc12_neon, export=1
878         push            {r0, r1, r4-r5, r9-r11, lr}
879 \type\()_h264_qpel16_mc12:
880         lowpass_const   r3
881         mov             r11, sp
882 A       bic             sp,  sp,  #15
883 T       bic             r0,  r11, #15
884 T       mov             sp,  r0
885         sub             sp,  sp,  #(16*16+16*12)
886         sub             r1,  r1,  r2, lsl #1
887         mov             r0,  sp
888         mov             r3,  r2
889         vpush           {d8-d15}
890         bl              put_h264_qpel16_v_lowpass_neon_packed
891         mov             r4,  r0
892         ldrd            r0,  r1,  [r11], #8
893         sub             r1,  r1,  r3, lsl #1
894         sub             r1,  r1,  #2
895         mov             r2,  r3
896         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
897         vpop            {d8-d15}
898         mov             sp,  r11
899         pop             {r4-r5, r9-r11, pc}
900 endfunc
901
902 function ff_\type\()_h264_qpel16_mc22_neon, export=1
903         push            {r4, r9-r11, lr}
904         lowpass_const   r3
905         mov             r11, sp
906 A       bic             sp,  sp,  #15
907 T       bic             r4,  r11, #15
908 T       mov             sp,  r4
909         sub             r1,  r1,  r2, lsl #1
910         sub             r1,  r1,  #2
911         mov             r3,  r2
912         sub             sp,  sp,  #(16*12)
913         mov             r4,  sp
914         vpush           {d8-d15}
915         bl              \type\()_h264_qpel16_hv_lowpass_neon
916         vpop            {d8-d15}
917         mov             sp,  r11
918         pop             {r4, r9-r11, pc}
919 endfunc
920
921 function ff_\type\()_h264_qpel16_mc32_neon, export=1
922         push            {r0, r1, r4-r5, r9-r11, lr}
923         add             r1,  r1,  #1
924         b               \type\()_h264_qpel16_mc12
925 endfunc
926
927 function ff_\type\()_h264_qpel16_mc03_neon, export=1
928         push            {r4, lr}
929         add             r12, r1,  r2
930         b               \type\()_h264_qpel16_mc01
931 endfunc
932
933 function ff_\type\()_h264_qpel16_mc13_neon, export=1
934         push            {r0, r1, r4, r11, lr}
935         add             r1,  r1,  r2
936         b               \type\()_h264_qpel16_mc11
937 endfunc
938
939 function ff_\type\()_h264_qpel16_mc23_neon, export=1
940         push            {r0, r1, r4-r5, r9-r11, lr}
941         add             r1,  r1,  r2
942         b               \type\()_h264_qpel16_mc21
943 endfunc
944
945 function ff_\type\()_h264_qpel16_mc33_neon, export=1
946         add             r1,  r1,  #1
947         push            {r0, r1, r4, r11, lr}
948         add             r1,  r1,  r2
949         sub             r1,  r1,  #1
950         b               \type\()_h264_qpel16_mc11
951 endfunc
952 .endm
953
/* Expand h264_qpel16 twice, producing the ff_put_h264_qpel16_mcXY_neon and
 * ff_avg_h264_qpel16_mcXY_neon function sets. */
954         h264_qpel16 put
955         h264_qpel16 avg