]> git.sesse.net Git - ffmpeg/blob - libavcodec/arm/h264pred_neon.S
avfilter/formats: Remove avfilter_make_format64_list()
[ffmpeg] / libavcodec / arm / h264pred_neon.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
/*
 * ldcol.8  rd, rs, rt, n=8, hi=0
 *
 * Gather a column of bytes into the lanes of D register \rd.  Every load
 * reads one byte from [\rs] into the next lane and then post-increments
 * \rs by register \rt.
 *
 *   n == 8          : lanes 0-7 are loaded (\rs advances by 8 * \rt).
 *   n != 8, hi == 0 : only lanes 0-3 are loaded (\rs advances by 4 * \rt).
 *   n != 8, hi == 1 : only lanes 4-7 are loaded (\rs advances by 4 * \rt).
 *
 * Lanes that are not loaded keep whatever they held before.
 */
23         .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
24 .if \n == 8 || \hi == 0                         /* low half: lanes 0-3 */
25         vld1.8          {\rd[0]}, [\rs], \rt
26         vld1.8          {\rd[1]}, [\rs], \rt
27         vld1.8          {\rd[2]}, [\rs], \rt
28         vld1.8          {\rd[3]}, [\rs], \rt
29 .endif
30 .if \n == 8 || \hi == 1                         /* high half: lanes 4-7 */
31         vld1.8          {\rd[4]}, [\rs], \rt
32         vld1.8          {\rd[5]}, [\rs], \rt
33         vld1.8          {\rd[6]}, [\rs], \rt
34         vld1.8          {\rd[7]}, [\rs], \rt
35 .endif
36         .endm
37
/*
 * add16x8  dq, dl, dh, rl, rh
 *
 * Sum the sixteen unsigned bytes held in D registers \rl and \rh.
 * \dq receives the eight widened (u16) pair sums; \dl and \dh are read
 * right afterwards as the two halves of that result, so callers pass the
 * low and high D halves of \dq.  On exit every u16 lane of \dl holds the
 * total.
 */
38         .macro add16x8  dq,  dl,  dh,  rl,  rh
39         vaddl.u8        \dq, \rl, \rh           /* 8 x u16: rl[i] + rh[i] */
40         vadd.u16        \dl, \dl, \dh           /* fold 8 sums into 4 */
41         vpadd.u16       \dl, \dl, \dl           /* 4 -> 2 */
42         vpadd.u16       \dl, \dl, \dl           /* 2 -> 1 (replicated in all lanes) */
43         .endm
44
/*
 * ff_pred16x16_128_dc_neon
 *
 * Fill a 16x16 byte block with the constant 128.
 * In: r0 = address of the first output row, r1 = byte step between rows
 *     (both consumed by the shared store loop .L_pred16x16_dc_end, which
 *     lives inside ff_pred16x16_dc_neon and performs the return).
 */
45 function ff_pred16x16_128_dc_neon, export=1
46         vmov.i8         q0,  #128               /* all 16 bytes of q0 = 128 */
47         b               .L_pred16x16_dc_end     /* store 16 rows of q0 and return */
48 endfunc
49
/*
 * ff_pred16x16_top_dc_neon
 *
 * DC prediction from the row above only: every byte of the 16x16 block
 * becomes (sum of the 16 bytes above the block + 8) >> 4.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * The row above (r0 - r1) is loaded with a :128 (16-byte) alignment hint.
 * Writes r2, q0; the shared tail additionally writes r3 and the flags.
 */
50 function ff_pred16x16_top_dc_neon, export=1
51         sub             r2,  r0,  r1            /* r2 = row directly above the block */
52         vld1.8          {q0},     [r2,:128]     /* 16 top neighbours */
53         add16x8         q0,  d0,  d1,  d0,  d1  /* d0[0] (u16) = their sum */
54         vrshrn.u16      d0,  q0,  #4            /* (sum + 8) >> 4, narrowed to a byte */
55         vdup.8          q0,  d0[0]              /* broadcast the DC byte to all 16 lanes */
56         b               .L_pred16x16_dc_end     /* store 16 rows of q0 and return */
57 endfunc
58
/*
 * ff_pred16x16_left_dc_neon
 *
 * DC prediction from the left column only: every byte of the 16x16 block
 * becomes (sum of the 16 bytes left of the block + 8) >> 4.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, q0; the shared tail additionally writes r3 and the flags.
 */
59 function ff_pred16x16_left_dc_neon, export=1
60         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
61         ldcol.8         d0,  r2,  r1            /* left neighbours of rows 0-7 */
62         ldcol.8         d1,  r2,  r1            /* left neighbours of rows 8-15 */
63         add16x8         q0,  d0,  d1,  d0,  d1  /* d0[0] (u16) = their sum */
64         vrshrn.u16      d0,  q0,  #4            /* (sum + 8) >> 4, narrowed to a byte */
65         vdup.8          q0,  d0[0]              /* broadcast the DC byte to all 16 lanes */
66         b               .L_pred16x16_dc_end     /* store 16 rows of q0 and return */
67 endfunc
68
/*
 * ff_pred16x16_dc_neon
 *
 * DC prediction from both neighbours: every byte of the 16x16 block
 * becomes (sum of the 16 bytes above + the 16 bytes to the left + 16) >> 5.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, q0, q1 and the flags.
 *
 * .L_pred16x16_dc_end is a shared tail that the other 16x16 DC variants
 * branch into.  It expects q0 = the 16 bytes to replicate, r0 = first
 * output row, r1 = row step; it stores 16 rows (each with a :128
 * alignment hint), advances r0 past the block and returns via lr.
 */
69 function ff_pred16x16_dc_neon, export=1
70         sub             r2,  r0,  r1            /* r2 = row directly above the block */
71         vld1.8          {q0},     [r2,:128]     /* 16 top neighbours */
72         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
73         ldcol.8         d2,  r2,  r1            /* left neighbours of rows 0-7 */
74         ldcol.8         d3,  r2,  r1            /* left neighbours of rows 8-15 */
75         vaddl.u8        q0,  d0,  d1            /* 8 x u16 partial sums of the top row */
76         vaddl.u8        q1,  d2,  d3            /* 8 x u16 partial sums of the left column */
77         vadd.u16        q0,  q0,  q1
78         vadd.u16        d0,  d0,  d1            /* 8 -> 4 */
79         vpadd.u16       d0,  d0,  d0            /* 4 -> 2 */
80         vpadd.u16       d0,  d0,  d0            /* 2 -> 1: d0[0] = sum of all 32 neighbours */
81         vrshrn.u16      d0,  q0,  #5            /* (sum + 16) >> 5, narrowed to a byte */
82         vdup.8          q0,  d0[0]              /* broadcast the DC byte to all 16 lanes */
83 .L_pred16x16_dc_end:
84         mov             r3,  #8                 /* 8 iterations, two rows each */
85 6:      vst1.8          {q0},     [r0,:128], r1
86         vst1.8          {q0},     [r0,:128], r1
87         subs            r3,  r3,  #1
88         bne             6b
89         bx              lr
90 endfunc
91
/*
 * ff_pred16x16_hor_neon
 *
 * Horizontal prediction for a 16x16 byte block: each of the 16 rows is
 * filled with the byte found immediately to the left of that row.
 * In:   r0 = address of the first output row (every row is stored with a
 *            :128 alignment hint), r1 = byte step between rows.
 * Uses: r2 (neighbour pointer), r3 (row counter), q0, flags.
 */
92 function ff_pred16x16_hor_neon, export=1
93         mov             r3,  #16                /* rows still to write */
94         sub             r2,  r0,  #1            /* r2 -> left neighbour of row 0 */
95 2:      vld1.8          {d0[], d1[]}, [r2], r1  /* splat the neighbour over q0, step down */
96         vst1.8          {d0-d1}, [r0,:128], r1  /* write the 16-byte row, step down */
97         subs            r3,  r3,  #1
98         bne             2b
99         bx              lr
100 endfunc
101
/*
 * ff_pred16x16_vert_neon
 *
 * Vertical prediction for a 16x16 byte block: the 16 bytes directly above
 * the block are copied into each of its 16 rows.
 * In:   r0 = address of the first output row, r1 = byte step between rows.
 *       The row above and every output row are accessed with a :128
 *       alignment hint.
 * Uses: r3 (loop counter), q0, flags; r0 ends up past the last row.
 */
102 function ff_pred16x16_vert_neon, export=1
103         mov             r3,  #8                 /* 8 passes, two rows per pass */
104         sub             r0,  r0,  r1            /* step back to the row above */
105         vld1.8          {d0-d1}, [r0,:128], r1  /* fetch it; r0 is back on row 0 */
106 2:      vst1.8          {d0-d1}, [r0,:128], r1
107         vst1.8          {d0-d1}, [r0,:128], r1
108         subs            r3,  r3,  #1
109         bne             2b
110         bx              lr
111 endfunc
112
/*
 * ff_pred16x16_plane_neon
 *
 * Plane prediction for a 16x16 byte block.  With top[] the row above the
 * block, left[] the column to its left and index -1 the shared corner:
 *
 *   H = sum_{i=0..7} (i + 1) * (top[8 + i]  - top[6 - i])
 *   V = sum_{i=0..7} (i + 1) * (left[8 + i] - left[6 - i])
 *   b = (5 * H + 32) >> 6,   c = (5 * V + 32) >> 6
 *   a = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
 *   out[y][x] = saturate_u8((a + b * x + c * y) >> 5)
 *
 * In:   r0 = address of the first output row (rows stored with a :128
 *            alignment hint), r1 = byte step between rows.
 * Uses: r2, r3, q0-q3, q8, flags.
 *
 * The p16weight table holds 16-bit entries, so it is loaded with a 16-bit
 * element size (as ff_pred8x8_plane_neon does); a byte-sized load would
 * place the two bytes of each weight in the wrong order on big-endian
 * targets.
 */
113 function ff_pred16x16_plane_neon, export=1
114         sub             r3,  r0,  r1            /* r3 = row above the block */
115         add             r2,  r3,  #8            /* r2 -> top[8] */
116         sub             r3,  r3,  #1            /* r3 -> top[-1], the corner byte */
117         vld1.8          {d0},     [r3]          /* d0 = top[-1..6] */
118         vld1.8          {d2},     [r2,:64], r1  /* d2 = top[8..15] */
119         ldcol.8         d1,  r3,  r1            /* d1 = corner, left[0..6] */
120         add             r3,  r3,  r1            /* skip left[7] */
121         ldcol.8         d3,  r3,  r1            /* d3 = left[8..15] */
122         vrev64.8        q0,  q0                 /* d0 = top[6..-1], d1 = left[6..-1] */
123         vaddl.u8        q8,  d2,  d3            /* q8[7] = top[15] + left[15] */
124         vsubl.u8        q2,  d2,  d0            /* q2[i] = top[8+i]  - top[6-i] */
125         vsubl.u8        q3,  d3,  d1            /* q3[i] = left[8+i] - left[6-i] */
126         movrel          r3,  p16weight
127         vld1.16         {q0},     [r3,:128]     /* q0 = weights 1..8 (16-bit lanes) */
128         vmul.s16        q2,  q2,  q0
129         vmul.s16        q3,  q3,  q0
130         vadd.i16        d4,  d4,  d5            /* 4 partial sums of H */
131         vadd.i16        d5,  d6,  d7            /* 4 partial sums of V */
132         vpadd.i16       d4,  d4,  d5
133         vpadd.i16       d4,  d4,  d4            /* d4 = { H, V, H, V } */
134         vshll.s16       q3,  d4,  #2
135         vaddw.s16       q2,  q3,  d4            /* q2 = 5 * { H, V, H, V } (32-bit) */
136         vrshrn.s32      d4,  q2,  #6            /* d4 = { b, c, b, c } */
137         mov             r3,  #0
138         vtrn.16         d4,  d5                 /* d4[0] = b, d5[0] = c */
139         vadd.i16        d2,  d4,  d5            /* d2[0] = b + c */
140         vshl.i16        d3,  d2,  #3
141         vrev64.16       d16, d17                /* d16[0] = top[15] + left[15] */
142         vsub.i16        d3,  d3,  d2            /* d3[0] = 7 * (b + c) */
143         vadd.i16        d16, d16, d0            /* + 1 (d0[0] is the weight 1) */
144         vshl.i16        d2,  d16, #4            /* 16 * (top[15] + left[15] + 1) */
145         vsub.i16        d2,  d2,  d3            /* d2[0] = a */
146         vshl.i16        d3,  d4,  #4            /* 16 * b */
147         vext.16         q0,  q0,  q0,  #7       /* weights rotated to 8,1,2,...,7 */
148         vsub.i16        d6,  d5,  d3            /* d6[0] = c - 16 * b */
149         vmov.16         d0[0], r3               /* q0 = 0,1,2,...,7 */
150         vmul.i16        q0,  q0,  d4[0]         /* q0 = { 0, b, 2b, ..., 7b } */
151         vdup.16         q1,  d2[0]
152         vdup.16         q2,  d4[0]
153         vdup.16         q3,  d6[0]
154         vshl.i16        q2,  q2,  #3            /* q2 = 8b: columns 0-7 -> columns 8-15 */
155         vadd.i16        q1,  q1,  q0            /* q1 = a + b * x for x = 0..7 */
156         vadd.i16        q3,  q3,  q2            /* q3 = c - 8b: columns 8-15 -> next row, columns 0-7 */
157         mov             r3,  #16                /* 16 rows */
158 1:
159         vqshrun.s16     d0,  q1,  #5            /* columns 0-7, saturated to u8 */
160         vadd.i16        q1,  q1,  q2
161         vqshrun.s16     d1,  q1,  #5            /* columns 8-15, saturated to u8 */
162         vadd.i16        q1,  q1,  q3
163         vst1.8          {q0},     [r0,:128], r1
164         subs            r3,  r3,  #1
165         bne             1b
166         bx              lr
167 endfunc
168
/*
 * p16weight: the eight 16-bit weights 1..8 used by the plane predictors.
 * ff_pred16x16_plane_neon multiplies by all eight, ff_pred8x8_plane_neon
 * by the first four; both also reuse the table to build the column index
 * vector 0..7.  Both loads carry a :128 alignment hint.
 * NOTE(review): align=4 presumably requests 2^4 = 16-byte alignment to
 * satisfy that hint - confirm against the const macro in asm.S.
 */
169 const   p16weight, align=4
170         .short          1,2,3,4,5,6,7,8
171 endconst
172
/*
 * ff_pred8x8_hor_neon
 *
 * Horizontal prediction for an 8x8 byte block: each of the 8 rows is
 * filled with the byte found immediately to the left of that row.
 * In:   r0 = address of the first output row (every row is stored with a
 *            :64 alignment hint), r1 = byte step between rows.
 * Uses: r2 (neighbour pointer), r3 (row counter), d0, flags.
 */
173 function ff_pred8x8_hor_neon, export=1
174         mov             r3,  #8                 /* rows still to write */
175         sub             r2,  r0,  #1            /* r2 -> left neighbour of row 0 */
176 2:      vld1.8          {d0[]}, [r2], r1        /* splat the neighbour over d0, step down */
177         vst1.8          {d0}, [r0,:64], r1      /* write the 8-byte row, step down */
178         subs            r3,  r3,  #1
179         bne             2b
180         bx              lr
181 endfunc
182
/*
 * ff_pred8x8_vert_neon
 *
 * Vertical prediction for an 8x8 byte block: the 8 bytes directly above
 * the block are copied into each of its 8 rows.
 * In:   r0 = address of the first output row, r1 = byte step between rows.
 *       The row above and every output row are accessed with a :64
 *       alignment hint.
 * Uses: r3 (loop counter), d0, flags; r0 ends up past the last row.
 */
183 function ff_pred8x8_vert_neon, export=1
184         mov             r3,  #4                 /* 4 passes, two rows per pass */
185         sub             r0,  r0,  r1            /* step back to the row above */
186         vld1.8          {d0}, [r0,:64], r1      /* fetch it; r0 is back on row 0 */
187 2:      vst1.8          {d0}, [r0,:64], r1
188         vst1.8          {d0}, [r0,:64], r1
189         subs            r3,  r3,  #1
190         bne             2b
191         bx              lr
192 endfunc
193
/*
 * ff_pred8x8_plane_neon
 *
 * Plane prediction for an 8x8 byte block.  With top[] the row above the
 * block, left[] the column to its left and index -1 the shared corner:
 *
 *   H = sum_{i=0..3} (i + 1) * (top[4 + i]  - top[2 - i])
 *   V = sum_{i=0..3} (i + 1) * (left[4 + i] - left[2 - i])
 *   b = (17 * H + 16) >> 5,   c = (17 * V + 16) >> 5
 *   a = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
 *   out[y][x] = saturate_u8((a + b * x + c * y) >> 5)
 *
 * In:   r0 = address of the first output row (rows stored with a :64
 *            alignment hint), r1 = byte step between rows.
 * Uses: r2, r3, q0-q3, q8, flags.
 */
194 function ff_pred8x8_plane_neon, export=1
195         sub             r3,  r0,  r1            /* r3 = row above the block */
196         add             r2,  r3,  #4            /* r2 -> top[4] */
197         sub             r3,  r3,  #1            /* r3 -> top[-1], the corner byte */
198         vld1.32         {d0[0]},  [r3]          /* d0 bytes 0-3 = top[-1..2] */
199         vld1.32         {d2[0]},  [r2,:32], r1  /* d2 bytes 0-3 = top[4..7] */
200         ldcol.8         d0,  r3,  r1,  4,  hi=1 /* d0 bytes 4-7 = corner, left[0..2] */
201         add             r3,  r3,  r1            /* skip left[3] */
202         ldcol.8         d3,  r3,  r1,  4        /* d3 bytes 0-3 = left[4..7] */
203         vaddl.u8        q8,  d2,  d3            /* q8[3] = top[7] + left[7] */
204         vrev32.8        d0,  d0                 /* d0 = top[2..-1], left[2..-1] */
205         vtrn.32         d2,  d3                 /* d2 = top[4..7], left[4..7] */
206         vsubl.u8        q2,  d2,  d0            /* d4 = top diffs, d5 = left diffs */
207         movrel          r3,  p16weight
208         vld1.16         {q0},     [r3,:128]     /* q0 = weights 1..8 (16-bit lanes) */
209         vmul.s16        d4,  d4,  d0            /* weight by 1..4 */
210         vmul.s16        d5,  d5,  d0
211         vpadd.i16       d4,  d4,  d5
212         vpaddl.s16      d4,  d4                 /* d4 = { H, V } (32-bit) */
213         vshl.i32        d5,  d4,  #4
214         vadd.s32        d4,  d4,  d5            /* d4 = 17 * { H, V } */
215         vrshrn.s32      d4,  q2,  #5            /* d4[0] = b, d4[1] = c */
216         mov             r3,  #0
217         vtrn.16         d4,  d5                 /* d4[0] = b, d5[0] = c */
218         vadd.i16        d2,  d4,  d5            /* d2[0] = b + c */
219         vshl.i16        d3,  d2,  #2
220         vrev64.16       d16, d16                /* d16[0] = top[7] + left[7] */
221         vsub.i16        d3,  d3,  d2            /* d3[0] = 3 * (b + c) */
222         vadd.i16        d16, d16, d0            /* + 1 (d0[0] is the weight 1) */
223         vshl.i16        d2,  d16, #4            /* 16 * (top[7] + left[7] + 1) */
224         vsub.i16        d2,  d2,  d3            /* d2[0] = a */
225         vshl.i16        d3,  d4,  #3            /* 8 * b */
226         vext.16         q0,  q0,  q0,  #7       /* weights rotated to 8,1,2,...,7 */
227         vsub.i16        d6,  d5,  d3            /* d6[0] = c - 8 * b */
228         vmov.16         d0[0], r3               /* q0 = 0,1,2,...,7 */
229         vmul.i16        q0,  q0,  d4[0]         /* q0 = { 0, b, 2b, ..., 7b } */
230         vdup.16         q1,  d2[0]
231         vdup.16         q2,  d4[0]
232         vdup.16         q3,  d6[0]
233         vshl.i16        q2,  q2,  #3            /* q2 = 8b */
234         vadd.i16        q1,  q1,  q0            /* q1 = a + b * x for x = 0..7 */
235         vadd.i16        q3,  q3,  q2            /* q3 = c: per-row increment */
236         mov             r3,  #8                 /* 8 rows */
237 1:
238         vqshrun.s16     d0,  q1,  #5            /* one row, saturated to u8 */
239         vadd.i16        q1,  q1,  q3
240         vst1.8          {d0},     [r0,:64], r1
241         subs            r3,  r3,  #1
242         bne             1b
243         bx              lr
244 endfunc
245
/*
 * ff_pred8x8_128_dc_neon
 *
 * Fill an 8x8 byte block with the constant 128.
 * In: r0 = address of the first output row, r1 = byte step between rows
 *     (both consumed by the shared store loop .L_pred8x8_dc_end, which
 *     lives inside ff_pred8x8_dc_neon and performs the return).
 */
246 function ff_pred8x8_128_dc_neon, export=1
247         vmov.i8         q0,  #128               /* d0 (rows 0-3) and d1 (rows 4-7) = 128 */
248         b               .L_pred8x8_dc_end       /* store 8 rows and return */
249 endfunc
250
/*
 * ff_pred8x8_top_dc_neon
 *
 * DC prediction of an 8x8 byte block from the row above only.  Columns
 * 0-3 of every row get (top[0..3] sum + 2) >> 2 and columns 4-7 get
 * (top[4..7] sum + 2) >> 2.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * The row above (r0 - r1) is loaded with a :64 alignment hint.
 * Writes r2, d0, d1; the shared tail additionally writes r2, r3, flags.
 */
251 function ff_pred8x8_top_dc_neon, export=1
252         sub             r2,  r0,  r1            /* r2 = row directly above the block */
253         vld1.8          {d0},     [r2,:64]      /* d0 = top[0..7] */
254         vpaddl.u8       d0,  d0                 /* 4 x u16 pair sums */
255         vpadd.u16       d0,  d0,  d0            /* d0[0] = sum top[0..3], d0[1] = sum top[4..7] */
256         vrshrn.u16      d0,  q0,  #2            /* bytes 0,1 = (sum + 2) >> 2; other bytes unused */
257         vdup.8          d1,  d0[1]              /* right-half DC */
258         vdup.8          d0,  d0[0]              /* left-half DC */
259         vtrn.32         d0,  d1                 /* d0 = d1 = { left DC x4, right DC x4 } */
260         b               .L_pred8x8_dc_end       /* store 8 rows and return */
261 endfunc
262
/*
 * ff_pred8x8_left_dc_neon
 *
 * DC prediction of an 8x8 byte block from the left column only.  Rows
 * 0-3 are filled with (left[0..3] sum + 2) >> 2 and rows 4-7 with
 * (left[4..7] sum + 2) >> 2.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, d0, d1; the shared tail additionally writes r2, r3, flags.
 */
263 function ff_pred8x8_left_dc_neon, export=1
264         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
265         ldcol.8         d0,  r2,  r1            /* d0 = left[0..7] */
266         vpaddl.u8       d0,  d0                 /* 4 x u16 pair sums */
267         vpadd.u16       d0,  d0,  d0            /* d0[0] = sum left[0..3], d0[1] = sum left[4..7] */
268         vrshrn.u16      d0,  q0,  #2            /* bytes 0,1 = (sum + 2) >> 2; other bytes unused */
269         vdup.8          d1,  d0[1]              /* rows 4-7 */
270         vdup.8          d0,  d0[0]              /* rows 0-3 */
271         b               .L_pred8x8_dc_end       /* store 8 rows and return */
272 endfunc
273
/*
 * ff_pred8x8_dc_neon
 *
 * DC prediction of an 8x8 byte block, computed per 4x4 quadrant.  With
 * T0/T1 = sum of top[0..3]/top[4..7] and L0/L1 = sum of
 * left[0..3]/left[4..7]:
 *
 *   top-left     = (T0 + L0 + 4) >> 3     top-right    = (T1 + 2) >> 2
 *   bottom-left  = (L1 + 2) >> 2          bottom-right = (T1 + L1 + 4) >> 3
 *
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, q0-q2, flags.
 *
 * .L_pred8x8_dc_end is a shared tail that the other 8x8 DC variants
 * branch into.  It expects d0 = the 8 bytes for rows 0-3, d1 = the 8
 * bytes for rows 4-7, r0 = first output row, r1 = row step; it stores
 * all 8 rows (each with a :64 alignment hint) and returns via lr.
 */
274 function ff_pred8x8_dc_neon, export=1
275         sub             r2,  r0,  r1            /* r2 = row directly above the block */
276         vld1.8          {d0},     [r2,:64]      /* d0 = top[0..7] */
277         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
278         ldcol.8         d1,  r2,  r1            /* d1 = left[0..7] */
279         vtrn.32         d0,  d1                 /* d0 = top[0..3],left[0..3]  d1 = top[4..7],left[4..7] */
280         vpaddl.u8       q0,  q0                 /* 8 x u16 pair sums */
281         vpadd.u16       d0,  d0,  d1            /* d0 = { T0, L0, T1, L1 } */
282         vpadd.u16       d1,  d0,  d0            /* d1 = { T0+L0, T1+L1, T0+L0, T1+L1 } */
283         vrshrn.u16      d2,  q0,  #3            /* d2 bytes = (q0 lanes + 4) >> 3 */
284         vrshrn.u16      d3,  q0,  #2            /* d3 bytes = (q0 lanes + 2) >> 2 */
285         vdup.8          d0,  d2[4]              /* top-left:     (T0 + L0 + 4) >> 3 */
286         vdup.8          d1,  d3[3]              /* bottom-left:  (L1 + 2) >> 2 */
287         vdup.8          d4,  d3[2]              /* top-right:    (T1 + 2) >> 2 */
288         vdup.8          d5,  d2[5]              /* bottom-right: (T1 + L1 + 4) >> 3 */
289         vtrn.32         q0,  q2                 /* d0 = TL x4,TR x4   d1 = BL x4,BR x4 */
290 .L_pred8x8_dc_end:
291         mov             r3,  #4                 /* 4 iterations, one upper + one lower row each */
292         add             r2,  r0,  r1,  lsl #2   /* r2 = row 4 */
293 6:      vst1.8          {d0},     [r0,:64], r1  /* rows 0-3 */
294         vst1.8          {d1},     [r2,:64], r1  /* rows 4-7 */
295         subs            r3,  r3,  #1
296         bne             6b
297         bx              lr
298 endfunc
299
/*
 * ff_pred8x8_l0t_dc_neon
 *
 * 8x8 DC prediction that reads the full row above but only the upper
 * four left neighbours (left[0..3]).  With T0/T1 = sum of
 * top[0..3]/top[4..7] and L0 = sum of left[0..3]:
 *
 *   top-left     = (T0 + L0 + 4) >> 3     top-right    = (T1 + 2) >> 2
 *   bottom-left  = (T0 + 2) >> 2          bottom-right = (T1 + 2) >> 2
 *
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, q0-q2; the shared tail additionally writes r3 and flags.
 */
300 function ff_pred8x8_l0t_dc_neon, export=1
301         sub             r2,  r0,  r1            /* r2 = row directly above the block */
302         vld1.8          {d0},     [r2,:64]      /* d0 = top[0..7] */
303         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
304         ldcol.8         d1,  r2,  r1,  4        /* d1 bytes 0-3 = left[0..3]; bytes 4-7 not loaded */
305         vtrn.32         d0,  d1                 /* d0 = top[0..3],left[0..3]  d1 = top[4..7],unloaded */
306         vpaddl.u8       q0,  q0
307         vpadd.u16       d0,  d0,  d1            /* d0 = { T0, L0, T1, unused } */
308         vpadd.u16       d1,  d0,  d0            /* d1[0] = T0 + L0 */
309         vrshrn.u16      d2,  q0,  #3            /* d2 bytes = (q0 lanes + 4) >> 3 */
310         vrshrn.u16      d3,  q0,  #2            /* d3 bytes = (q0 lanes + 2) >> 2 */
311         vdup.8          d0,  d2[4]              /* top-left:    (T0 + L0 + 4) >> 3 */
312         vdup.8          d1,  d3[0]              /* bottom-left: (T0 + 2) >> 2 */
313         vdup.8          q2,  d3[2]              /* both right quadrants: (T1 + 2) >> 2 */
314         vtrn.32         q0,  q2                 /* d0 = TL x4,TR x4   d1 = BL x4,BR x4 */
315         b               .L_pred8x8_dc_end       /* store 8 rows and return */
316 endfunc
317
/*
 * ff_pred8x8_l00_dc_neon
 *
 * 8x8 DC prediction that reads only the upper four left neighbours.
 * Rows 0-3 are filled with (sum of left[0..3] + 2) >> 2, rows 4-7 with
 * the constant 128.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, d0, d1; the shared tail additionally writes r3 and flags.
 */
318 function ff_pred8x8_l00_dc_neon, export=1
319         sub             r2,  r0,  #1            /* r2 = byte left of row 0 */
320         ldcol.8         d0,  r2,  r1,  4        /* d0 bytes 0-3 = left[0..3] */
321         vpaddl.u8       d0,  d0
322         vpadd.u16       d0,  d0,  d0            /* d0[0] = sum left[0..3] */
323         vrshrn.u16      d0,  q0,  #2            /* byte 0 = (sum + 2) >> 2; other bytes unused */
324         vmov.i8         d1,  #128               /* rows 4-7 */
325         vdup.8          d0,  d0[0]              /* rows 0-3 */
326         b               .L_pred8x8_dc_end       /* store 8 rows and return */
327 endfunc
328
/*
 * ff_pred8x8_0lt_dc_neon
 *
 * 8x8 DC prediction that reads the full row above but only the lower
 * four left neighbours (left[4..7]).  With T0/T1 = sum of
 * top[0..3]/top[4..7] and L1 = sum of left[4..7]:
 *
 *   top-left     = (T0 + 2) >> 2          top-right    = (T1 + 2) >> 2
 *   bottom-left  = (L1 + 2) >> 2          bottom-right = (T1 + L1 + 4) >> 3
 *
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, q0-q2; the shared tail additionally writes r3 and flags.
 */
329 function ff_pred8x8_0lt_dc_neon, export=1
330         sub             r2,  r0,  r1            /* r2 = row directly above the block */
331         vld1.8          {d0},     [r2,:64]      /* d0 = top[0..7] */
332         add             r2,  r0,  r1,  lsl #2   /* r2 = row 4 */
333         sub             r2,  r2,  #1            /* r2 = byte left of row 4 */
334         ldcol.8         d1,  r2,  r1,  4,  hi=1 /* d1 bytes 4-7 = left[4..7]; bytes 0-3 not loaded */
335         vtrn.32         d0,  d1                 /* d0 = top[0..3],unloaded  d1 = top[4..7],left[4..7] */
336         vpaddl.u8       q0,  q0
337         vpadd.u16       d0,  d0,  d1            /* d0 = { T0, unused, T1, L1 } */
338         vpadd.u16       d1,  d0,  d0            /* d1[1] = T1 + L1 */
339         vrshrn.u16      d3,  q0,  #2            /* d3 bytes = (q0 lanes + 2) >> 2 */
340         vrshrn.u16      d2,  q0,  #3            /* d2 bytes = (q0 lanes + 4) >> 3 */
341         vdup.8          d0,  d3[0]              /* top-left:     (T0 + 2) >> 2 */
342         vdup.8          d1,  d3[3]              /* bottom-left:  (L1 + 2) >> 2 */
343         vdup.8          d4,  d3[2]              /* top-right:    (T1 + 2) >> 2 */
344         vdup.8          d5,  d2[5]              /* bottom-right: (T1 + L1 + 4) >> 3 */
345         vtrn.32         q0,  q2                 /* d0 = TL x4,TR x4   d1 = BL x4,BR x4 */
346         b               .L_pred8x8_dc_end       /* store 8 rows and return */
347 endfunc
348
/*
 * ff_pred8x8_0l0_dc_neon
 *
 * 8x8 DC prediction that reads only the lower four left neighbours.
 * Rows 0-3 are filled with the constant 128, rows 4-7 with
 * (sum of left[4..7] + 2) >> 2.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, d0-d2; the shared tail additionally writes r3 and flags.
 */
349 function ff_pred8x8_0l0_dc_neon, export=1
350         add             r2,  r0,  r1,  lsl #2   /* r2 = row 4 */
351         sub             r2,  r2,  #1            /* r2 = byte left of row 4 */
352         ldcol.8         d1,  r2,  r1,  4        /* d1 bytes 0-3 = left[4..7] */
353         vpaddl.u8       d2,  d1
354         vpadd.u16       d2,  d2,  d2            /* d2[0] = sum left[4..7] */
355         vrshrn.u16      d1,  q1,  #2            /* byte 0 = (sum + 2) >> 2; other bytes unused */
356         vmov.i8         d0,  #128               /* rows 0-3 */
357         vdup.8          d1,  d1[0]              /* rows 4-7 */
358         b               .L_pred8x8_dc_end       /* store 8 rows and return */
359 endfunc
359 endfunc