]> git.sesse.net Git - ffmpeg/blob - libavcodec/arm/h264pred_neon.S
Merge remote-tracking branch 'qatar/master'
[ffmpeg] / libavcodec / arm / h264pred_neon.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "asm.S"
22
@ ldcol.8 rd, rs, rt, n=8, hi=0
@ Gather single bytes into lanes of the D register \rd.  Every load reads
@ one byte from [\rs] and then post-increments \rs by \rt.
@   n == 8         lanes 0-7 are loaded (eight loads)
@   n != 8, hi=0   only lanes 0-3 are loaded
@   n != 8, hi=1   only lanes 4-7 are loaded
@ Lanes that are not loaded keep whatever they held before.
        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if (\n == 8) || (\hi == 0)
  .irp lane, 0, 1, 2, 3
        vld1.8          {\rd[\lane]}, [\rs], \rt
  .endr
.endif
.if (\n == 8) || (\hi == 1)
  .irp lane, 4, 5, 6, 7
        vld1.8          {\rd[\lane]}, [\rs], \rt
  .endr
.endif
        .endm
37
@ add16x8 dq, dl, dh, rl, rh
@ Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
@   dq       Q register that receives the eight widened sums rl[i] + rh[i]
@   dl, dh   D registers folded together; the callers in this file pass
@            the low and high half of \dq
@   rl, rh   D registers holding the source bytes
@ With dl/dh being the halves of dq, every 16-bit lane of \dl ends up
@ holding the total of all sixteen bytes.
        .macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.i16        \dl, \dl, \dh
        vpadd.i16       \dl, \dl, \dl
        vpadd.i16       \dl, \dl, \dl
        .endm
44
@ ff_pred16x16_128_dc_neon
@ Fills the 16x16 block with the constant byte 128.  The fill value is put
@ in q0 and control passes to the shared store loop .L_pred16x16_dc_end,
@ which uses r0 as the address of the first row and r1 as the row step.
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #0x80
        b               .L_pred16x16_dc_end
endfunc
49
@ ff_pred16x16_top_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Fills the block with the rounded mean, (sum + 8) >> 4, of the sixteen
@ bytes of the row at r0 - r1.  r2 is used as scratch.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0, d1}, [r2,:128]
        vaddl.u8        q0,  d0,  d1            @ 16 bytes -> 8 halfword sums
        vadd.i16        d0,  d0,  d1            @ 8 -> 4
        vpadd.i16       d0,  d0,  d0            @ 4 -> 2
        vpadd.i16       d0,  d0,  d0            @ total in every lane of d0
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
58
@ ff_pred16x16_left_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Fills the block with the rounded mean, (sum + 8) >> 4, of the sixteen
@ bytes found at r0 - 1 in rows 0..15.  r2 is used as scratch.
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  n=8      @ rows 0-7
        ldcol.8         d1,  r2,  r1,  n=8      @ rows 8-15
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc
68
@ ff_pred16x16_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Fills the block with the rounded mean, (sum + 16) >> 5, of the sixteen
@ bytes of the row at r0 - r1 plus the sixteen bytes at r0 - 1 in rows
@ 0..15.  r2 and r3 are used as scratch.
@
@ .L_pred16x16_dc_end is the store loop shared with the other 16x16 DC
@ entry points: it expects the sixteen fill bytes in q0 and writes them to
@ sixteen consecutive rows starting at r0.
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0, d1}, [r2,:128]
        vaddl.u8        q0,  d0,  d1            @ top row folded to 8 halfwords
        sub             r2,  r0,  #1
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q1,  d2,  d3            @ left column folded likewise
        vadd.i16        q0,  q0,  q1
        vadd.i16        d0,  d0,  d1
        vpadd.i16       d0,  d0,  d0
        vpadd.i16       d0,  d0,  d0            @ grand total in every lane
        vrshrn.u16      d0,  q0,  #5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8                 @ 8 iterations x 2 rows
1:      vst1.8          {d0, d1}, [r0,:128], r1
        vst1.8          {d0, d1}, [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
91
@ ff_pred16x16_hor_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ For each of the 16 rows, replicates the byte just left of the row
@ (address r0 - 1 in that row) across all 16 output bytes.
@ r2 walks the source column, r3 counts rows.
function ff_pred16x16_hor_neon, export=1
        mov             r3,  #16
        sub             r2,  r0,  #1
2:      vld1.8          {d0[], d1[]}, [r2], r1
        subs            r3,  r3,  #1            @ NEON ops below leave flags alone
        vst1.8          {d0, d1}, [r0,:128], r1
        bne             2b
        bx              lr
endfunc
101
@ ff_pred16x16_vert_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Copies the sixteen bytes of the row at r0 - r1 into each of the 16
@ output rows.  r3 counts loop iterations (two rows per iteration).
function ff_pred16x16_vert_neon, export=1
        mov             r3,  #8
        sub             r0,  r0,  r1
        vld1.8          {d0, d1}, [r0,:128], r1 @ r0 is back on row 0 afterwards
2:      vst1.8          {d0, d1}, [r0,:128], r1
        vst1.8          {d0, d1}, [r0,:128], r1
        subs            r3,  r3,  #1
        bne             2b
        bx              lr
endfunc
112
@ ff_pred16x16_plane_neon
@ In:     r0 = address of the first output row (stores carry a 128-bit
@              alignment hint), r1 = byte step between rows.
@ Reads:  T[-1..6] and T[8..15], the bytes of the row at r0 - r1, and
@         L[-1..6] and L[8..15], the bytes at r0 - 1 in those rows
@         (T[-1] and L[-1] are the same byte at r0 - r1 - 1).
@ Writes: 16 rows of 16 bytes.  In 16-bit lane arithmetic,
@             H = sum_{k=1..8} k * (T[7+k] - T[7-k])
@             V = sum_{k=1..8} k * (L[7+k] - L[7-k])
@             b = (5*H + 32) >> 6,    c = (5*V + 32) >> 6
@             a = 16 * (T[15] + L[15] + 1) - 7 * (b + c)
@         and the byte at column x, row y is (a + b*x + c*y) >> 5,
@         saturated to 0..255.
@ Scratch: r2, r3, q0-q3, q8, flags.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #8
        sub             r3,  r3,  #1
        vld1.8          {d0},     [r3]                  @ T[-1..6]
        vld1.8          {d2},     [r2,:64], r1          @ T[8..15]
        ldcol.8         d1,  r3,  r1                    @ L[-1..6]
        add             r3,  r3,  r1                    @ skip row 7
        ldcol.8         d3,  r3,  r1                    @ L[8..15]
        vrev64.8        q0,  q0                         @ d0 = T[6..-1], d1 = L[6..-1]
        vaddl.u8        q8,  d2,  d3                    @ lane 7 = T[15] + L[15]
        vsubl.u8        q2,  d2,  d0                    @ T[7+k] - T[7-k], k = 1..8
        vsubl.u8        q3,  d3,  d1                    @ L[7+k] - L[7-k]
        movrel          r3,  p16weight
        @ The table is .short data, so load it as 16-bit elements; this is
        @ the same element size ff_pred8x8_plane_neon uses and keeps the
        @ lanes correct regardless of byte order.
        vld1.16         {q0},     [r3,:128]             @ q0 = 1..8
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4                    @ d4 = H, V, H, V
        vshll.s16       q3,  d4,  #2                    @ 4*H, 4*V (32 bit)
        vaddw.s16       q2,  q3,  d4                    @ 5*H, 5*V
        vrshrn.s32      d4,  q2,  #6                    @ d4 = b, c, b, c
        mov             r3,  #0
        vtrn.16         d4,  d5                         @ d4[0] = b, d5[0] = c
        vadd.i16        d2,  d4,  d5                    @ b + c
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17                        @ d16[0] = T[15] + L[15]
        vsub.i16        d3,  d3,  d2                    @ 7 * (b + c)
        vadd.i16        d16, d16, d0                    @ + 1 (first weight)
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3                    @ d2[0] = a
        vshl.i16        d3,  d4,  #4                    @ 16 * b
        vext.16         q0,  q0,  q0,  #7               @ q0 = 8,1,2,...,7
        vsub.i16        d6,  d5,  d3                    @ c - 16*b
        vmov.16         d0[0], r3                       @ q0 = 0,1,2,...,7
        vmul.i16        q0,  q0,  d4[0]                 @ q0 = b * x, x = 0..7
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3                    @ q2 = 8*b: step to columns 8..15
        vadd.i16        q1,  q1,  q0                    @ q1 = a + b*x for row 0
        vadd.i16        q3,  q3,  q2                    @ q3 = c - 8*b: step to next row
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5                    @ columns 0..7
        vadd.i16        q1,  q1,  q2
        vqshrun.s16     d1,  q1,  #5                    @ columns 8..15
        vadd.i16        q1,  q1,  q3
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
168
@ Weights 1..8 as 16-bit values.  The table is 16-byte aligned because the
@ plane predictors load it with a :128 alignment hint.
        .section        .rodata
        .p2align        4
p16weight:
        .short          1, 2, 3, 4
        .short          5, 6, 7, 8

        .text
175
@ ff_pred8x8_hor_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ For each of the 8 rows, replicates the byte just left of the row
@ (address r0 - 1 in that row) across all 8 output bytes.
@ r2 walks the source column, r3 counts rows.
function ff_pred8x8_hor_neon, export=1
        mov             r3,  #8
        sub             r2,  r0,  #1
2:      vld1.8          {d0[]},   [r2],     r1
        subs            r3,  r3,  #1            @ NEON ops below leave flags alone
        vst1.8          {d0},     [r0,:64], r1
        bne             2b
        bx              lr
endfunc
185
@ ff_pred8x8_vert_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Copies the eight bytes of the row at r0 - r1 into each of the 8 output
@ rows.  r3 counts loop iterations (two rows per iteration).
function ff_pred8x8_vert_neon, export=1
        mov             r3,  #4
        sub             r0,  r0,  r1
        vld1.8          {d0},     [r0,:64], r1  @ r0 is back on row 0 afterwards
2:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             2b
        bx              lr
endfunc
196
@ ff_pred8x8_plane_neon
@ In:     r0 = address of the first output row (stores carry a 64-bit
@              alignment hint), r1 = byte step between rows.
@ Reads:  T[-1..2] and T[4..7], the bytes of the row at r0 - r1, and
@         L[-1..2] and L[4..7], the bytes at r0 - 1 in those rows
@         (T[-1] and L[-1] are the same byte at r0 - r1 - 1).
@ Writes: 8 rows of 8 bytes.  In 16-bit lane arithmetic,
@             H = sum_{k=1..4} k * (T[3+k] - T[3-k])
@             V = sum_{k=1..4} k * (L[3+k] - L[3-k])
@             b = (17*H + 16) >> 5,    c = (17*V + 16) >> 5
@             a = 16 * (T[7] + L[7] + 1) - 3 * (b + c)
@         and the byte at column x, row y is (a + b*x + c*y) >> 5,
@         saturated to 0..255.
@ Scratch: r2, r3, q0-q3, q8, flags.
@ Lanes marked "unused" below hold unspecified data that never reaches
@ the output.
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #4
        sub             r3,  r3,  #1
        vld1.32         {d0[0]},  [r3]                  @ d0[0..3] = T[-1..2]
        vld1.32         {d2[0]},  [r2,:32], r1          @ d2[0..3] = T[4..7]
        ldcol.8         d0,  r3,  r1,  4,  hi=1         @ d0[4..7] = L[-1..2]
        add             r3,  r3,  r1                    @ skip row 3
        ldcol.8         d3,  r3,  r1,  4                @ d3[0..3] = L[4..7]
        vaddl.u8        q8,  d2,  d3                    @ d16[3] = T[7] + L[7]
        vrev32.8        d0,  d0                         @ T[2..-1], L[2..-1]
        vtrn.32         d2,  d3                         @ d2 = T[4..7], L[4..7]
        vsubl.u8        q2,  d2,  d0                    @ d4: T diffs, d5: L diffs
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]             @ q0 = 1..8
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4                         @ d4 = H, V (32 bit)
        vshl.i32        d5,  d4,  #4
        vadd.s32        d4,  d4,  d5                    @ 17*H, 17*V
        vrshrn.s32      d4,  q2,  #5                    @ d4 = b, c, unused, unused
        mov             r3,  #0
        vtrn.16         d4,  d5                         @ d4[0] = b, d5[0] = c
        vadd.i16        d2,  d4,  d5                    @ b + c
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16                        @ d16[0] = T[7] + L[7]
        vsub.i16        d3,  d3,  d2                    @ 3 * (b + c)
        vadd.i16        d16, d16, d0                    @ + 1 (first weight)
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3                    @ d2[0] = a
        vext.16         q0,  q0,  q0,  #7               @ q0 = 8,1,2,...,7
        vmov.16         d0[0], r3                       @ q0 = 0,1,2,...,7
        vmul.i16        q0,  q0,  d4[0]                 @ q0 = b * x, x = 0..7
        vdup.16         q1,  d2[0]
        @ The row step is c itself.  Forming c - 8*b and then adding 8*b
        @ back yields the same 16-bit value, so c is broadcast directly.
        vdup.16         q3,  d5[0]
        vadd.i16        q1,  q1,  q0                    @ q1 = a + b*x for row 0
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3                    @ next row: + c
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
248
@ ff_pred8x8_128_dc_neon
@ Fills the 8x8 block with the constant byte 128.  q0 (d0 for rows 0-3,
@ d1 for rows 4-7) is loaded with the fill value and control passes to
@ the shared store loop .L_pred8x8_dc_end, which uses r0 as the address
@ of the first row and r1 as the row step.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #0x80
        b               .L_pred8x8_dc_end
endfunc
253
@ ff_pred8x8_top_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ With T = the eight bytes of the row at r0 - r1: columns 0-3 of every
@ output row get (T[0]+..+T[3] + 2) >> 2 and columns 4-7 get
@ (T[4]+..+T[7] + 2) >> 2.  r2 is used as scratch.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        vpaddl.u8       d0,  d0                 @ 8 bytes -> 4 halfword pairs
        vpadd.i16       d0,  d0,  d0            @ lane 0: T[0..3], lane 1: T[4..7]
        vrshrn.u16      d0,  q0,  #2            @ only bytes 0 and 1 of d0 are used
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        vtrn.32         d0,  d1                 @ d0 = d1 = left mean x4, right mean x4
        b               .L_pred8x8_dc_end
endfunc
265
@ ff_pred8x8_left_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ With L = the bytes at r0 - 1 in rows 0..7: rows 0-3 are filled with
@ (L[0]+..+L[3] + 2) >> 2 and rows 4-7 with (L[4]+..+L[7] + 2) >> 2.
@ r2 is used as scratch.
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  n=8
        vpaddl.u8       d0,  d0
        vpadd.i16       d0,  d0,  d0            @ lane 0: L[0..3], lane 1: L[4..7]
        vrshrn.u16      d0,  q0,  #2            @ only bytes 0 and 1 of d0 are used
        vdup.8          d1,  d0[1]              @ rows 4-7 (must precede the d0 dup)
        vdup.8          d0,  d0[0]              @ rows 0-3
        b               .L_pred8x8_dc_end
endfunc
276
@ ff_pred8x8_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ With T = the eight bytes of the row at r0 - r1 and L = the bytes at
@ r0 - 1 in rows 0..7, and writing S(x) for a sum of four bytes, the four
@ 4x4 quadrants are filled with:
@     top-left      (S(T[0..3]) + S(L[0..3]) + 4) >> 3
@     top-right     (S(T[4..7]) + 2) >> 2
@     bottom-left   (S(L[4..7]) + 2) >> 2
@     bottom-right  (S(T[4..7]) + S(L[4..7]) + 4) >> 3
@
@ .L_pred8x8_dc_end is the store loop shared with the other 8x8 DC entry
@ points: it writes d0 to rows 0-3 and d1 to rows 4-7, starting at r0.
@ r2 and r3 are used as scratch.
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1                 @ d0 = T[0..3],L[0..3]  d1 = T[4..7],L[4..7]
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ S(T0-3), S(L0-3), S(T4-7), S(L4-7)
        vpadd.i16       d1,  d0,  d0            @ lane 0: T0-3 + L0-3, lane 1: T4-7 + L4-7
        vrshrn.u16      d3,  q0,  #2            @ bytes 0-3: the four single-edge means
        vrshrn.u16      d2,  q0,  #3            @ bytes 4-5: the two combined means
        vdup.8          d5,  d2[5]              @ bottom-right
        vdup.8          d4,  d3[2]              @ top-right
        vdup.8          d1,  d3[3]              @ bottom-left
        vdup.8          d0,  d2[4]              @ top-left
        vtrn.32         q0,  q2                 @ d0 = top halves, d1 = bottom halves
.L_pred8x8_dc_end:
        add             r2,  r0,  r1,  lsl #2   @ r2 = address of row 4
        mov             r3,  #4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
302
@ ff_pred8x8_l0t_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Uses T = the eight bytes of the row at r0 - r1 and L = the bytes at
@ r0 - 1 in rows 0..3 only.  Writing S(x) for a sum of four bytes, the
@ four 4x4 quadrants are filled with:
@     top-left      (S(T[0..3]) + S(L[0..3]) + 4) >> 3
@     top-right     (S(T[4..7]) + 2) >> 2
@     bottom-left   (S(T[0..3]) + 2) >> 2
@     bottom-right  (S(T[4..7]) + 2) >> 2
@ Lanes derived from the four unloaded bytes of d1 are never selected.
@ r2 is used as scratch.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4        @ lanes 0-3 only
        vtrn.32         d0,  d1                 @ d0 = T[0..3],L[0..3]  d1 = T[4..7],unused
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ S(T0-3), S(L0-3), S(T4-7), unused
        vpadd.i16       d1,  d0,  d0            @ lane 0: T0-3 + L0-3
        vrshrn.u16      d3,  q0,  #2
        vrshrn.u16      d2,  q0,  #3
        vdup.8          q2,  d3[2]              @ right-hand quadrants
        vdup.8          d1,  d3[0]              @ bottom-left
        vdup.8          d0,  d2[4]              @ top-left
        vtrn.32         q0,  q2                 @ d0 = top halves, d1 = bottom halves
        b               .L_pred8x8_dc_end
endfunc
320
@ ff_pred8x8_l00_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ With L = the bytes at r0 - 1 in rows 0..3: rows 0-3 are filled with
@ (L[0]+..+L[3] + 2) >> 2 and rows 4-7 with the constant 128.
@ Only lanes 0-3 of d0 are loaded; results derived from the other lanes
@ are never selected.  r2 is used as scratch.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.i16       d0,  d0,  d0            @ lane 0 = L[0] + .. + L[3]
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d0,  d0[0]              @ rows 0-3
        vmov.i8         d1,  #0x80              @ rows 4-7
        b               .L_pred8x8_dc_end
endfunc
331
@ ff_pred8x8_0lt_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ Uses T = the eight bytes of the row at r0 - r1 and L = the bytes at
@ r0 - 1 in rows 4..7 only.  Writing S(x) for a sum of four bytes, the
@ four 4x4 quadrants are filled with:
@     top-left      (S(T[0..3]) + 2) >> 2
@     top-right     (S(T[4..7]) + 2) >> 2
@     bottom-left   (S(L[4..7]) + 2) >> 2
@     bottom-right  (S(T[4..7]) + S(L[4..7]) + 4) >> 3
@ Lanes derived from the four unloaded bytes of d1 are never selected.
@ r2 is used as scratch.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        add             r2,  r0,  r1,  lsl #2   @ row 4
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ lanes 4-7 only
        vtrn.32         d0,  d1                 @ d0 = T[0..3],unused  d1 = T[4..7],L[4..7]
        vpaddl.u8       q0,  q0
        vpadd.i16       d0,  d0,  d1            @ S(T0-3), unused, S(T4-7), S(L4-7)
        vpadd.i16       d1,  d0,  d0            @ lane 1: T4-7 + L4-7
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d5,  d2[5]              @ bottom-right
        vdup.8          d4,  d3[2]              @ top-right
        vdup.8          d1,  d3[3]              @ bottom-left
        vdup.8          d0,  d3[0]              @ top-left
        vtrn.32         q0,  q2                 @ d0 = top halves, d1 = bottom halves
        b               .L_pred8x8_dc_end
endfunc
351
@ ff_pred8x8_0l0_dc_neon
@ In: r0 = address of the first output row, r1 = byte step between rows.
@ With L = the bytes at r0 - 1 in rows 4..7: rows 0-3 are filled with the
@ constant 128 and rows 4-7 with (L[4]+..+L[7] + 2) >> 2.
@ Only lanes 0-3 of d1 are loaded; results derived from the other lanes
@ are never selected.  r2 is used as scratch.
function ff_pred8x8_0l0_dc_neon, export=1
        vmov.i8         d0,  #0x80              @ rows 0-3
        add             r2,  r0,  r1,  lsl #2   @ row 4
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.i16       d2,  d2,  d2            @ lane 0 = L[4] + .. + L[7]
        vrshrn.u16      d1,  q1,  #2
        vdup.8          d1,  d1[0]              @ rows 4-7
        b               .L_pred8x8_dc_end
endfunc