/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

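        @ RV40 rounds with a per-position bias instead of a constant +32:
        @ the bias is looked up in rv40bias at [y>>1][x>>1] and broadcast
        @ into q11.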
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

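        @ Bilinear weights from the fractional position (x = r4, y = r5):
        @ A = (8-x)*(8-y) -> r4, B = x*(8-y) -> r12, C = (8-x)*y -> r6,
        @ D = x*y -> r7, so each output pixel is
        @ (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6.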
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

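        @ Full bilinear path (x != 0 and y != 0): two output rows per
        @ iteration, with d4/d6 holding a source row and d5/d7 the same
        @ row shifted left by one pixel.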
1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
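        @ H.264: round with +32 and narrow; RV40: add the bias from q11,
        @ then narrow with a truncating shift.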
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

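        @ x*y == 0: at most one of x, y is non-zero, so only two weights
        @ remain; fold B and C into r12 and pick a 1-D filtering path.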
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f

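        @ Vertical-only filtering (x == 0): blend each row with the next,
        @ two output rows per iteration.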
        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

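        @ Horizontal-only filtering (y == 0): blend each pixel with its
        @ right neighbour; d1 is zero when x == y == 0, giving a plain copy.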
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

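        @ 4-pixel wide bilinear path: vtrn.32 packs a row and its
        @ 1-pixel-shifted copy into one d register, and the weights are
        @ packed the same way ({A,B} in d0, {C,D} in d2), so one vmull
        @ plus one vmlal covers all four taps; the two halves are then
        @ summed with vadd.i16.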
1:      vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]},  [r1], r2

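        @ Vertical-only filtering (x == 0) on 4-pixel rows: successive
        @ rows are packed into d4 and blended with the packed weights.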
3:      vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

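        @ Horizontal-only filtering (y == 0) on 4-pixel rows; the second
        @ packed weight is zero when x == y == 0, giving a plain copy.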
4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

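/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */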
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
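/* Rounding biases for RV40 chroma MC, indexed by [y>>1][x>>1]
   (four halfwords per row). */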
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif