/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
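/*
 * Bilinear chroma interpolation with the standard weights
 *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y:
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + r) >> 6
 * H.264 rounds with r = 32 (vrshrn); RV40 adds a per-position bias from the
 * rv40bias table and VC-1 a constant 28 before a truncating shift (vshrn).
 * The avg variants additionally average the result with the existing dst.
 */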
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

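/*
 * Codec-specific rounding term in q11: RV40 loads a bias from the rv40bias
 * table indexed by (y>>1, x>>1), VC-1 uses the constant 28.  H.264 needs no
 * bias since its loops use a rounding narrowing shift instead.
 */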
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

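/*
 * Compute the bilinear weights:
 *   r7  = D = x*y
 *   r6  = C = (8-x)*y
 *   r12 = B = x*(8-y)
 *   r4  = A = (8-x)*(8-y)
 * If x*y == 0, at most one direction needs filtering: branch to the
 * one-dimensional paths at 2:.
 */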
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

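/*
 * x*y == 0: only one direction (if any) needs filtering.
 * d1 = 8*x + 8*y (at most one term nonzero), d0 = 64 - d1.
 * Branch to 4: for the horizontal case (y == 0), which also covers
 * the plain-copy case x == y == 0.
 */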
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

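/* x == 0: vertical-only filter, two output rows per iteration */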
3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

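/* y == 0 (or x == y == 0): horizontal-only filter */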
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

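/*
 * Rows are only four pixels wide here, so pixels 0-3 of a row go into the
 * low half of a d register and pixels 1-4 into the high half (vext +
 * vtrn.32), with the weights interleaved to match ({A,B} and {C,D}).  One
 * vmull/vmlal pair then produces all four taps, and vadd.i16 folds the two
 * halves into a single 4-pixel output row.
 */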
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

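/*
 * x*y == 0: one-dimensional filter on 4-pixel rows; d0 = {64-w, w} so a
 * single multiply covers both taps.  Branch to 4: for the horizontal
 * case (y == 0), which also covers x == y == 0.
 */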
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]},  [r1], r2

3:      vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

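/* y == 0 (or x == y == 0): horizontal-only filter */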
4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

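/*
 * Same bilinear weights as in the mc8/mc4 macros, but interleaved with
 * vtrn.16 so that a whole 2x2 output block (two 2-pixel rows) is computed
 * from one vmull/vmlal pair per iteration.
 */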
        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
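/* x == 0 && y == 0: plain copy (put) or average with dst (avg) */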
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
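/* RV40 rounding bias, indexed by (y>>1, x>>1) of the chroma subpel position */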
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif