/*
 * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

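@ RV40 quarter-pel interpolation.  The lowpass macros below apply the 6-tap
@ filter  src[-2] - 5*src[-1] + c1*src[0] + c2*src[1] - 5*src[2] + src[3]
@ with the centre coefficients c1/c2 taken from the vectors passed in by the
@ callers (d0/d1) and narrow the result with a rounding, saturating right
@ shift: shift 6 for the quarter-pel phases (52/20 or 20/52), shift 5 for
@ the half-pel phase (20/20).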
.macro  qpel_lowpass    r0,  r1,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d27, \r0, \r1, #2       @ src[ 0]
        vshl.s16        q12, q9,  #2
        vsub.s16        q8,  q8,  q9
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vsub.s16        q8,  q8,  q12
        vmlal.u8        q8,  d27, \rc1
        vmlal.u8        q8,  d28, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
.endm

.macro  qpel_lowpass_x2 r0,  r1,  r2,  r3,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d29, \r0, \r1, #2       @ src[ 0]
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ src[ 3]
        vaddl.u8        q13, \r1, d22
        vaddl.u8        q12, \r2, \r0
        vsub.s16        q8,  q8,  q10
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1
        vmlal.u8        q8,  d28, \rc2
        vsub.s16        q12, q12, q9
        vext.8          d26, \r2, \r3, #2       @ src[ 0]
        vext.8          d27, \r2, \r3, #3       @ src[ 1]
        vmlal.u8        q12, d26, \rc1
        vmlal.u8        q12, d27, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
        vqrshrun.s16    \r2, q12, #\shift
.endm

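@ Horizontal pass for the two-pass (diagonal) cases: filters r3 + 1 rows of
@ 8 pixels (r3 rows in the loop plus one trailing row, enough input for the
@ 6-tap vertical pass) and stores them contiguously at r12, which must point
@ to an 8-byte aligned scratch buffer.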
.macro  rv40_qpel8_h    shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        vst1.8          {d6},     [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        vld1.8          {q2},     [r1]
        qpel_lowpass    d4,  d5,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        bx              lr
endfunc
.endm

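@ Vertical pass for the two-pass cases: loads 13 packed rows produced by the
@ horizontal pass, transposes them so the same lowpass macros filter down the
@ columns, transposes back and writes an 8x8 block to r0 with stride r2,
@ averaging with the existing destination for the "avg" variant.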
.macro  rv40_qpel8_v    shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
        vld1.64         {d2},     [r1,:64]!
        vld1.64         {d3},     [r1,:64]!
        vld1.64         {d4},     [r1,:64]!
        vld1.64         {d5},     [r1,:64]!
        vld1.64         {d6},     [r1,:64]!
        vld1.64         {d7},     [r1,:64]!
        vld1.64         {d8},     [r1,:64]!
        vld1.64         {d9},     [r1,:64]!
        vld1.64         {d10},    [r1,:64]!
        vld1.64         {d11},    [r1,:64]!
        vld1.64         {d12},    [r1,:64]!
        vld1.64         {d13},    [r1,:64]!
        vld1.64         {d14},    [r1,:64]!
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
.endm

        rv40_qpel8_h    5
        rv40_qpel8_h    6

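@ Emit the put/avg motion compensation functions for the quarter-pel
@ positions (mcXY).  Purely horizontal or vertical positions run a single
@ lowpass pass over the source; mixed positions filter horizontally into an
@ aligned scratch buffer on the stack and then run the packed vertical pass
@ over it; mc33 is forwarded to the shared pixels_xy2 average.  The centre
@ coefficients loaded into d0/d1 select the phase (52/20 or 20/52 with
@ shift 6 for quarter-pel, 20/20 with shift 5 for half-pel), and the 16x16
@ functions are assembled from 8-pixel wide strips of the 8x8 helpers.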
.macro  rv40_qpel       type
function \type\()_rv40_qpel8_h_lowpass_neon
  .ifc \type,avg
        mov             r12, r0
  .endif
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  6
  .ifc \type,avg
        vld1.8          {d3},     [r12,:64], r2
        vld1.8          {d16},    [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3
        vrhadd.u8       d6,  d6,  d16
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d6},     [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},     [r1], r2
        vld1.64         {d3},     [r1], r2
        vld1.64         {d4},     [r1], r2
        vld1.64         {d5},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vld1.64         {d7},     [r1], r2
        vld1.64         {d8},     [r1], r2
        vld1.64         {d9},     [r1], r2
        vld1.64         {d10},    [r1], r2
        vld1.64         {d11},    [r1], r2
        vld1.64         {d12},    [r1], r2
        vld1.64         {d13},    [r1], r2
        vld1.64         {d14},    [r1]
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc

        rv40_qpel8_v    5, \type
        rv40_qpel8_v    6, \type

function ff_\type\()_rv40_qpel8_mc10_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc30_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc01_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc11_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc21_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc31_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc12_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc22_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc32_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc03_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc33_neon, export=1
        mov             r3,  #8
        b               X(ff_\type\()_pixels8_xy2_neon)
endfunc

function ff_\type\()_rv40_qpel8_mc13_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc23_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc10_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_h:
        push            {r1, lr}
        sub             r1,  r1,  #2
        mov             r3,  #16
        bl              \type\()_rv40_qpel8_h_lowpass_neon
        pop             {r1, lr}
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #6
        mov             r3,  #16
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel16_mc30_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_h
endfunc

function ff_\type\()_rv40_qpel16_mc01_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_v:
        sub             r1,  r1,  r2,  lsl #1
        push            {r1, lr}
        vpush           {d8-d15}
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        ldr             r1,  [sp, #64]
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc11_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc21_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d0,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc31_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc12_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d0,  #20
.L\type\()_rv40_qpel16_v_s5:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc22_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc32_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d1,  #20
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc03_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v
endfunc

function ff_\type\()_rv40_qpel16_mc13_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc23_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc33_neon, export=1
        mov             r3,  #16
        b               X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm

        rv40_qpel       put
        rv40_qpel       avg

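@ RV40 weighted prediction.  For each pixel the macro computes
@     dst = (((w2 * src1) >> 9) + ((w1 * src2) >> 9) + 0x10) >> 5
@ with w1 in d0[0] and w2 in d0[2], as set up by the two functions below.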
.macro  rv40_weight
        vmovl.u8        q8,  d2
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5
        vrshrn.i16      d3,  q3,  #5
.endm

/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                    int w1, int w2, int stride) */
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #16
1:
        vld1.8          {q1},     [r1,:128], r12
        vld1.8          {q2},     [r2,:128], r12
        rv40_weight
        vst1.8          {q1},     [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                   int w1, int w2, int stride) */
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #8
1:
        vld1.8          {d2},     [r1,:64], r12
        vld1.8          {d3},     [r1,:64], r12
        vld1.8          {d4},     [r2,:64], r12
        vld1.8          {d5},     [r2,:64], r12
        rv40_weight
        vst1.8          {d2},     [r0,:64], r12
        vst1.8          {d3},     [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc

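@ Loop-filter strength decisions.  Absolute differences between sums of the
@ 4-pixel rows (horizontal edge) or columns (vertical edge) around the edge
@ are compared against the packed beta2/beta thresholds; the per-side
@ filtering flags are written through the two int pointers passed on the
@ stack, and the return value (non-zero only when the 'edge' stack argument
@ is set and all four comparisons pass) selects the strong filter in the
@ caller.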
function ff_rv40_h_loop_filter_strength_neon, export=1
        pkhbt           r2,  r3,  r2,  lsl #18

        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1
        teq             r3,  r12
        beq             1f

        sub             r0,  r0,  r1,  lsl #1

        vld1.32         {d4[]},   [r0,:32], r1  @ -3
        vld1.32         {d0[]},   [r0,:32], r1  @ -2
        vld1.32         {d4[1]},  [r0,:32], r1  @ -1
        vld1.32         {d5[]},   [r0,:32], r1  @  0
        vld1.32         {d1[]},   [r0,:32], r1  @  1
        vld1.32         {d5[0]},  [r0,:32], r1  @  2

        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2,  1,  1,  1,  1
        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1,  2,  2,  0,  0
        vdup.32         d30, r2                 @ beta2, beta << 2
        vpadd.u16       d16, d16, d17           @ -2, -2,  1,  1
        vpadd.u16       d18, d18, d19           @ -3, -1,  2,  0
        vabd.u16        d16, d18, d16
        vclt.u16        d16, d16, d30

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15
        ldr             r0,  [sp]
        vst1.32         {d24[1]}, [r2,:32]
        vst1.32         {d25[1]}, [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d18, d16, d17
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]
        bx              lr
1:
        ldrd            r2,  r3,  [sp, #4]
        mov             r0,  #0
        str             r0,  [r2]
        str             r0,  [r3]
        bx              lr
endfunc

function ff_rv40_v_loop_filter_strength_neon, export=1
        sub             r0,  r0,  #3
        pkhbt           r2,  r3,  r2,  lsl #18

        vld1.8          {d0},     [r0], r1
        vld1.8          {d1},     [r0], r1
        vld1.8          {d2},     [r0], r1
        vld1.8          {d3},     [r0], r1

        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1            @ -3, -2, -1,  0,  1,  2
        vext.16         q1,  q0,  q0,  #1       @ -2, -1,  0,  1,  2
        vabd.u16        q0,  q1,  q0
        vclt.u16        q0,  q0,  q15

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15
        ldr             r0,  [sp]
        vst1.32         {d2[1]},  [r2,:32]
        vst1.32         {d3[1]},  [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d0,  d0,  d1
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]
        bx              lr
endfunc

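@ RV40 weak in-loop deblocking filter for a 4-pixel edge segment.  A clipped
@ delta derived from the cross-edge difference is applied to the two pixels
@ next to the edge (limited by lim_p0q0), and p1/q1 are conditionally
@ corrected within lim_p1/lim_q1 when the corresponding filter_p1/filter_q1
@ flags are set; see the inline comments for the interleaved register layout
@ expected from the two callers below.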
.macro  rv40_weak_loop_filter
        vdup.16         d30, r2                 @ filter_p1
        vdup.16         d31, r3                 @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                 @ alpha
        vdup.16         d29, r3                 @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]
        vsubl.u8        q9,  d5,  d4            @ x, t
        vabdl.u8        q8,  d5,  d4            @ x, abs(t)
        vneg.s16        q15, q15
        vceq.i16        d16, d19, #0            @ !t
        vshl.s16        d19, d19, #2            @ t << 2
        vmul.u16        d18, d17, d28           @ alpha * abs(t)
        vand            d24, d30, d31           @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4            @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5            @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7
        vadd.i16        d22, d22, d24           @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1            @ src[-2] - src[1]
        vcle.u16        d18, d18, d22
        vand            d20, d20, d24
        vneg.s16        d23, d25                @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16           @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                 @ -3,  2, -1,  0
        vrshr.s16       d19, d19, #3
        vmov            d28, d29                @ beta
        vswp            d3,  d6                 @ q1q2, p1p0
        vmin.s16        d19, d19, d25
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3            @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23           @ diff
        vabs.s16        q1,  q1                 @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16           @ diff
        vcle.u16        q1,  q1,  q14
        vneg.s16        d19, d18                @ -diff
        vdup.16         d26, r3                 @ lim_p1
        vaddw.u8        q2,  q9,  d5            @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9
        vand            q1,  q1,  q15
        vqmovun.s16     d4,  q2                 @ -1,  0
        vand            q9,  q11, q1
        vdup.16         d27, r2                 @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13
        vmin.s16        q9,  q9,  q13
        vtrn.32         d0,  d1                 @ -2,  1,  -2,  1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0
        vqmovun.s16     d5,  q3                 @ -2,  1
.endm

function ff_rv40_h_weak_loop_filter_neon, export=1
        sub             r0,  r0,  r1,  lsl #1
        sub             r0,  r0,  r1

        vld1.32         {d4[]},   [r0,:32], r1
        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d4[1]},  [r0,:32], r1
        vld1.32         {d5[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d5[0]},  [r0,:32]

        sub             r0,  r0,  r1,  lsl #2

        rv40_weak_loop_filter

        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1

        bx              lr
endfunc

function ff_rv40_v_weak_loop_filter_neon, export=1
        sub             r12, r0,  #3
        sub             r0,  r0,  #2

        vld1.8          {d4},     [r12], r1
        vld1.8          {d5},     [r12], r1
        vld1.8          {d2},     [r12], r1
        vld1.8          {d3},     [r12], r1

        vtrn.16         q2,  q1
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3

        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]
        vdup.32         d1,  d2[0]

        rv40_weak_loop_filter

        vtrn.32         q2,  q3
        vswp            d4,  d5

        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1

        bx              lr
endfunc