/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
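/* Bilinear interpolation with weights A = (8-x)(8-y), B = x(8-y),
 * C = (8-x)y, D = xy.  H.264 rounds each output with (sum + 32) >> 6;
 * the RV40 and VC-1 variants instead add a codec-specific bias to the
 * sum before a truncating shift. */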
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
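        /* load the rounding bias rv40bias[y>>1][x>>1] into every lane of q11 */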
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

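        /* compute the bilinear weights: r7 = D = xy, r6 = C = (8-x)y,
           r12 = B = x(8-y), r4 = A = (8-x)(8-y) */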
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

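        /* both x and y non-zero: full 2-D bilinear filter, two rows per
           iteration */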
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

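        /* x*y == 0: dispatch to vertical-only (3:), horizontal-only (4:)
           or unfiltered copy (5:) */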
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

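        /* pack each 4-pixel row with its 1-pixel-shifted copy into one
           d register (low half: src[0..3], high half: src[1..4]) and
           pair the weights the same way, so a single vmull/vmlal covers
           both horizontal taps */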
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]},  [r1], r2

3:      vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
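        /* two 2-pixel output rows per iteration: d4 holds rows n and n+1,
           d5 holds rows n+1 and n+2, supplying the vertical tap pairs */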
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
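/* RV40 chroma rounding bias, indexed as rv40bias[y>>1][x>>1] */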
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif