/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
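/*
 * 8-pixel-wide bilinear chroma interpolation.  With the fractional offsets
 * x, y in [0,7], each output pixel is
 *     dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * where A..D are the four neighbouring source pixels.  The four weights are
 * held in d0-d3.  H.264 uses the fixed +32 rounding term (vrshrn #6); RV40
 * instead adds a bias from the rv40bias table (broadcast into q11) before a
 * plain vshrn #6.  The code specializes into a full 2-D loop (1:), a
 * vertical-only loop (3:) and a horizontal-only/copy loop (5:) depending on
 * which of the fractions are zero.
 */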
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

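/* Compute the bilinear weights: r7 = x*y, r12 = x*(8-y), r6 = (8-x)*y,
 * r4 = (8-x)*(8-y).  A zero product means at most one of the fractions is
 * non-zero, so the cheaper 1-D paths at 2: can be used instead. */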
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vld1.8          {d6, d7}, [r5], r4
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

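/* x*y == 0: at most one of the fractions is non-zero.  d0/d1 hold the two
 * remaining weights; if y == 0 branch to the horizontal-only (or plain
 * copy) case at 4:, otherwise fall through to the vertical loop at 3:. */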
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.8          {d6}, [r5], r4
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

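/* y == 0: horizontal-only interpolation between each pixel and its right
 * neighbour (degenerates into a straight copy when x is also zero). */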
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
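/*
 * 4-pixel-wide variant of the above.  vtrn.32 packs the four pixels of a
 * row and their right neighbours into a single d register, with the weights
 * in d0-d3 interleaved to match, so one vmull.u8/vmlal.u8 pair still makes
 * full use of the 8x8->16 multiply.  A final vadd.i16 folds the two halves
 * back together; the interpolation and rounding are otherwise as in mc8.
 */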
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.8          {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

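/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 * 2-pixel-wide blocks, H.264 only.  Two rows are produced per iteration;
 * when both fractions are zero the block is copied (put) or averaged with
 * dst (avg) directly, without any multiplies. */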
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

#if CONFIG_H264_DECODER
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
#endif

#if CONFIG_RV40_DECODER
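/* RV40 rounding bias, indexed by (y>>1, x>>1).  It replaces the fixed +32
 * rounding term used by H.264 and is added before the final shift. */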
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif