]> git.sesse.net Git - ffmpeg/blob - libavcodec/arm/dsputil_armv6.S
Drop DCTELEM typedef
[ffmpeg] / libavcodec / arm / dsputil_armv6.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with Libav; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
/*
 * call_2x_pixels type, subp
 *
 * Defines ff_<type>_pixels16<subp>_armv6 as a wrapper that runs
 * ff_<type>_pixels8<subp>_armv6 twice: first with the incoming r0-r3
 * unchanged, then, as a tail call, with r0 and r1 each advanced by 8.
 *
 * NOTE(review): five registers (20 bytes) are pushed before the bl, so sp is
 * displaced by a non-multiple of 8 while the first callee runs - confirm
 * that none of the pixels8 routines needs an 8-byte-aligned stack.
 */
23 .macro  call_2x_pixels  type, subp
24 function ff_\type\()_pixels16\subp\()_armv6, export=1
            /* keep the arguments and the return address across the first call */
25         push            {r0-r3, lr}
26         bl              ff_\type\()_pixels8\subp\()_armv6
27         pop             {r0-r3, lr}
            /* second pass: same r2/r3, both pointers moved 8 bytes on */
28         add             r0,  r0,  #8
29         add             r1,  r1,  #8
            /* tail call: the callee returns to our caller through the restored lr */
30         b               ff_\type\()_pixels8\subp\()_armv6
31 endfunc
32 .endm
33
/* Instantiate the pixels16 wrappers on top of the matching pixels8 routines. */
34 call_2x_pixels          avg
35 call_2x_pixels          put, _x2
36 call_2x_pixels          put, _y2
37 call_2x_pixels          put, _x2_no_rnd
38 call_2x_pixels          put, _y2_no_rnd
39
/*
 * ff_put_pixels16_armv6
 *
 * Copies 16 bytes per row from [r1] to [r0].  Both pointers advance by r2
 * after each row and r3 counts the rows, two per loop iteration.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.  The loop only ends when r3 reaches exactly zero, so r3 is
 * expected to be a non-zero multiple of 2.
 *
 * ldr_post and strd_post are not defined in this file (presumably they come
 * from the included asm.S); as used here they appear to access [base] and
 * then add the index register to the base - confirm against asm.S.
 */
40 function ff_put_pixels16_armv6, export=1
41         push            {r4-r11}
42 1:
            /* row A: fetch bytes 4..15 first, bytes 0..3 last so r1 can step on */
43         ldr             r5,  [r1, #4]
44         ldr             r6,  [r1, #8]
45         ldr             r7,  [r1, #12]
46         ldr_post        r4,  r1,  r2
            /* store row A while the loads of row B are already being issued */
47         strd            r6,  r7,  [r0, #8]
48         ldr             r9,  [r1, #4]
49         strd_post       r4,  r5,  r0,  r2
50         ldr             r10, [r1, #8]
51         ldr             r11, [r1, #12]
52         ldr_post        r8,  r1,  r2
53         strd            r10, r11, [r0, #8]
            /* the flags set here are consumed by the bne; the store between does not touch them */
54         subs            r3,  r3,  #2
55         strd_post       r8,  r9,  r0,  r2
56         bne             1b
57
58         pop             {r4-r11}
59         bx              lr
60 endfunc
61
/*
 * ff_put_pixels8_armv6
 *
 * Copies 8 bytes per row from [r1] to [r0].  Both pointers advance by r2
 * after each row and r3 counts the rows, two per loop iteration.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.  The loop only ends when r3 reaches exactly zero, so r3 is
 * expected to be a non-zero multiple of 2.
 *
 * ldr_post and strd_post are not defined in this file (presumably they come
 * from the included asm.S); as used here they appear to access [base] and
 * then add the index register to the base - confirm against asm.S.
 */
62 function ff_put_pixels8_armv6, export=1
63         push            {r4-r7}
64 1:
            /* row A into r4/r5: bytes 4..7 first, then bytes 0..3 with the pointer step */
65         ldr             r5,  [r1, #4]
66         ldr_post        r4,  r1,  r2
            /* row B into r6/r7, interleaved with the store of row A */
67         ldr             r7,  [r1, #4]
68         strd_post       r4,  r5,  r0,  r2
69         ldr_post        r6,  r1,  r2
70         subs            r3,  r3,  #2
71         strd_post       r6,  r7,  r0,  r2
72         bne             1b
73
74         pop             {r4-r7}
75         bx              lr
76 endfunc
77
/*
 * ff_put_pixels8_x2_armv6
 *
 * For every row, stores to [r0] the 8 byte-wise rounded averages
 * (a + b + 1) >> 1 of source byte i and source byte i + 1, i = 0..7, so
 * 9 bytes (offsets 0..8) are read from each source row at r1.
 * r0 and r1 advance by r2 per row; r3 counts rows, two per iteration, and
 * the loop ends only when it reaches exactly zero.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; (a ^ b) & 1 is 1 exactly
 * when a + b is odd, and uadd8 adds that correction back in.
 *
 * ldr_pre and strd_post are not defined in this file (presumably from the
 * included asm.S).  As used here ldr_pre appears to add r2 to r1 and then
 * load from the new r1; strd_post appears to store at [r0] and then add r2.
 * lr (r14) is used as a scratch register, hence it is saved and the function
 * returns by popping it into pc.
 */
78 function ff_put_pixels8_x2_armv6, export=1
79         push            {r4-r11, lr}
            /* r12 = 0x01010101: bit 0 of each byte lane */
80         mov             r12, #1
81         orr             r12, r12, r12, lsl #8
82         orr             r12, r12, r12, lsl #16
83 1:
            /* row A: r4 = bytes 0..3, r5 = bytes 4..7, r7 = bytes 5..8 */
84         ldr             r4,  [r1]
85         subs            r3,  r3,  #2
86         ldr             r5,  [r1, #4]
87         ldr             r7,  [r1, #5]
            /* r6 = bytes 1..4 of row A (assumes little-endian word loads) */
88         lsr             r6,  r4,  #8
            /* row B: r8 = bytes 0..3, r9 = bytes 4..7, r11 = bytes 5..8 */
89         ldr_pre         r8,  r1,  r2
90         orr             r6,  r6,  r5,  lsl #24
91         ldr             r9,  [r1, #4]
92         ldr             r11, [r1, #5]
            /* r10 = bytes 1..4 of row B; r1 then moves on to the row after B */
93         lsr             r10, r8,  #8
94         add             r1,  r1,  r2
95         orr             r10, r10, r9,  lsl #24
            /* rounded averages for row A (into r4, r5) */
96         eor             r14, r4,  r6
97         uhadd8          r4,  r4,  r6
98         eor             r6,  r5,  r7
99         uhadd8          r5,  r5,  r7
100         and             r14, r14, r12
101         and             r6,  r6,  r12
102         uadd8           r4,  r4,  r14
            /* rounded averages for row B (into r8, r9), overlapped with the tail of row A */
103         eor             r14, r8,  r10
104         uadd8           r5,  r5,  r6
105         eor             r6,  r9,  r11
106         uhadd8          r8,  r8,  r10
107         and             r14, r14, r12
108         uhadd8          r9,  r9,  r11
109         and             r6,  r6,  r12
110         uadd8           r8,  r8,  r14
111         strd_post       r4,  r5,  r0,  r2
112         uadd8           r9,  r9,  r6
113         strd_post       r8,  r9,  r0,  r2
            /* flags are still those of the subs at the top of the loop */
114         bne             1b
115
116         pop             {r4-r11, pc}
117 endfunc
118
/*
 * ff_put_pixels8_y2_armv6
 *
 * For every output row, stores to [r0] the 8 byte-wise rounded averages
 * (a + b + 1) >> 1 of a source row at r1 and the source row r2 bytes below
 * it.  r0 advances by r2 per row; r3 counts output rows, two per loop
 * iteration, and the loop ends only when it reaches exactly zero, so r3 is
 * expected to be a non-zero multiple of 2.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.
 *
 * Exactly r3 + 1 source rows are read.  The load of the row needed by the
 * next iteration is skipped on the final pass; loading it unconditionally
 * would read one row past the data this routine needs.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; (a ^ b) & 1 is 1 exactly
 * when a + b is odd, and uadd8 adds that correction back in.
 *
 * ldr_pre and strd_post are not defined in this file (presumably from the
 * included asm.S).  As used here ldr_pre appears to add r2 to r1 and then
 * load from the new r1; strd_post appears to store at [r0] and then add r2.
 */
119 function ff_put_pixels8_y2_armv6, export=1
120         push            {r4-r11}
            /* r12 = 0x01010101: bit 0 of each byte lane */
121         mov             r12, #1
122         orr             r12, r12, r12, lsl #8
123         orr             r12, r12, r12, lsl #16
            /* prime the pipeline: r4/r5 = row 0, r6/r7 = row 1 */
124         ldr             r4,  [r1]
125         ldr             r5,  [r1, #4]
126         ldr_pre         r6,  r1,  r2
127         ldr             r7,  [r1, #4]
128 1:
            /* nothing below changes the condition flags until the beq */
129         subs            r3,  r3,  #2
            /* output row n = avg(row n in r4/r5, row n+1 in r6/r7) -> r8/r9 */
130         uhadd8          r8,  r4,  r6
131         eor             r10, r4,  r6
132         uhadd8          r9,  r5,  r7
133         eor             r11, r5,  r7
134         and             r10, r10, r12
            /* r4/r5 = row n+2 */
135         ldr_pre         r4,  r1,  r2
136         uadd8           r8,  r8,  r10
137         and             r11, r11, r12
138         uadd8           r9,  r9,  r11
139         ldr             r5,  [r1, #4]
            /* output row n+1 = avg(row n+2 in r4/r5, row n+1 in r6/r7) -> r10/r11 */
140         uhadd8          r10, r4,  r6
141         eor             r6,  r4,  r6
142         uhadd8          r11, r5,  r7
143         and             r6,  r6,  r12
144         eor             r7,  r5,  r7
145         uadd8           r10, r10, r6
146         and             r7,  r7,  r12
            /* last pair of rows: finish without loading row n+3 */
            beq             2f
            /* r6/r7 = row n+3, needed only by the next iteration */
147         ldr_pre         r6,  r1,  r2
148         uadd8           r11, r11, r7
149         strd_post       r8,  r9,  r0,  r2
150         ldr             r7,  [r1, #4]
151         strd_post       r10, r11, r0,  r2
            b               1b
    2:
            /* tail: complete and store the final two output rows */
            uadd8           r11, r11, r7
            strd_post       r8,  r9,  r0,  r2
            strd_post       r10, r11, r0,  r2
153
154         pop             {r4-r11}
155         bx              lr
156 endfunc
157
/*
 * ff_put_pixels8_x2_no_rnd_armv6
 *
 * For every row, stores to [r0] the 8 byte-wise truncating averages
 * (a + b) >> 1 of source byte i and source byte i + 1, i = 0..7, so 9 bytes
 * (offsets 0..8) are read from each source row at r1.
 * r0 and r1 advance by r2 per row; r3 counts rows, two per iteration, and
 * the loop ends only when it reaches exactly zero.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.
 *
 * ldr_pre is not defined in this file (presumably from the included asm.S);
 * as used here it appears to add r2 to r1 and then load from the new r1.
 * r12 and lr (r14) are used as scratch; lr is saved and popped into pc.
 */
158 function ff_put_pixels8_x2_no_rnd_armv6, export=1
159         push            {r4-r9, lr}
160 1:
            /* the adds below are non-flag-setting, so these flags reach the bne */
161         subs            r3,  r3,  #2
            /* row A: r4 = bytes 0..3, r5 = bytes 4..7, r7 = bytes 5..8 */
162         ldr             r4,  [r1]
163         ldr             r5,  [r1, #4]
164         ldr             r7,  [r1, #5]
            /* row B: r8 = bytes 0..3, r9 = bytes 4..7, r14 = bytes 5..8 */
165         ldr_pre         r8,  r1,  r2
166         ldr             r9,  [r1, #4]
167         ldr             r14, [r1, #5]
168         add             r1,  r1,  r2
            /* r6 / r12 = bytes 1..4 of row A / B (assumes little-endian word loads) */
169         lsr             r6,  r4,  #8
170         orr             r6,  r6,  r5,  lsl #24
171         lsr             r12, r8,  #8
172         orr             r12, r12, r9,  lsl #24
            /* uhadd8: per-byte (a + b) >> 1, no rounding correction */
173         uhadd8          r4,  r4,  r6
174         uhadd8          r5,  r5,  r7
175         uhadd8          r8,  r8,  r12
176         uhadd8          r9,  r9,  r14
177         stm             r0,  {r4,r5}
178         add             r0,  r0,  r2
179         stm             r0,  {r8,r9}
180         add             r0,  r0,  r2
181         bne             1b
182
183         pop             {r4-r9, pc}
184 endfunc
185
/*
 * ff_put_pixels8_y2_no_rnd_armv6
 *
 * For every output row, stores to [r0] the 8 byte-wise truncating averages
 * (a + b) >> 1 of a source row at r1 and the source row r2 bytes below it.
 * r0 advances by r2 per row; r3 counts output rows, two per loop iteration,
 * and the loop ends only when it reaches exactly zero, so r3 is expected to
 * be a non-zero multiple of 2.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.
 *
 * Exactly r3 + 1 source rows are read.  The load of the row needed by the
 * next iteration is skipped on the final pass; loading it unconditionally
 * would read one row past the data this routine needs.
 *
 * ldr_pre is not defined in this file (presumably from the included asm.S);
 * as used here it appears to add r2 to r1 and then load from the new r1.
 * r12 and lr (r14) are used as scratch; lr is saved and popped into pc.
 */
186 function ff_put_pixels8_y2_no_rnd_armv6, export=1
187         push            {r4-r9, lr}
            /* prime the pipeline: r4/r5 = row 0, r6/r7 = row 1 */
188         ldr             r4,  [r1]
189         ldr             r5,  [r1, #4]
190         ldr_pre         r6,  r1,  r2
191         ldr             r7,  [r1, #4]
192 1:
            /* nothing below changes the condition flags until the beq */
193         subs            r3,  r3,  #2
            /* output row n = avg(row n in r4/r5, row n+1 in r6/r7) -> r8/r9 */
194         uhadd8          r8,  r4,  r6
            /* r4/r5 = row n+2 */
195         ldr_pre         r4,  r1,  r2
196         uhadd8          r9,  r5,  r7
197         ldr             r5,  [r1, #4]
            /* output row n+1 = avg(row n+2 in r4/r5, row n+1 in r6/r7) -> r12/r14 */
198         uhadd8          r12, r4,  r6
            /* last pair of rows: finish without loading row n+3 */
            beq             2f
            /* r6/r7 = row n+3, needed only by the next iteration */
199         ldr_pre         r6,  r1,  r2
200         uhadd8          r14, r5,  r7
201         ldr             r7,  [r1, #4]
202         stm             r0,  {r8,r9}
203         add             r0,  r0,  r2
204         stm             r0,  {r12,r14}
205         add             r0,  r0,  r2
            b               1b
    2:
            /* tail: complete and store the final two output rows */
            uhadd8          r14, r5,  r7
            stm             r0,  {r8,r9}
            add             r0,  r0,  r2
            stm             r0,  {r12,r14}
            add             r0,  r0,  r2
207
208         pop             {r4-r9, pc}
209 endfunc
210
/*
 * ff_avg_pixels8_armv6
 *
 * For every row, replaces the 8 bytes at [r0] with the byte-wise rounded
 * average (a + b + 1) >> 1 of those bytes and the 8 bytes at [r1].
 * r0 and r1 advance by r2 per row; r3 counts rows, two per loop iteration.
 * Presumably called as (dst, src, stride, h) - confirm against the C
 * prototype.  The exit test is beq, so r3 is expected to be a non-zero
 * multiple of 2.
 *
 * Loop shape: row A (r4/r5 from [r0], r9/r10 from [r1]) is already loaded on
 * entry to 1:.  While it is averaged and stored, row B is loaded into r6/r7
 * and r9/r10.  The beq in the middle tests the most recent subs: when the
 * count has reached zero, row B is finished in the tail at 2: without
 * loading any further rows; otherwise row B is finished while the next
 * row A is loaded.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; (a ^ b) & 1 is 1 exactly
 * when a + b is odd, and uadd8 adds that correction back in.
 *
 * ldr_post, ldrd_reg and strd_post are not defined in this file (presumably
 * from the included asm.S).  As used here ldr_post/strd_post appear to
 * access [base] and then add r2 to the base, and ldrd_reg appears to load
 * a doubleword from [r0 + r2] without changing r0 - confirm against asm.S.
 */
211 function ff_avg_pixels8_armv6, export=1
212         pld             [r1, r2]
213         push            {r4-r10, lr}
            /* lr = 0x01010101: bit 0 of each byte lane */
214         mov             lr,  #1
215         orr             lr,  lr,  lr,  lsl #8
216         orr             lr,  lr,  lr,  lsl #16
            /* first row: r4/r5 from [r0], r9/r10 from [r1] */
217         ldrd            r4,  r5,  [r0]
218         ldr             r10, [r1, #4]
219         ldr_post        r9,  r1,  r2
            /* these flags are the ones tested by the first beq below */
220         subs            r3,  r3,  #2
221 1:
222         pld             [r1, r2]
            /* row A: r8/r12 = rounding bits, r4/r5 = truncated averages */
223         eor             r8,  r4,  r9
224         uhadd8          r4,  r4,  r9
225         eor             r12, r5,  r10
            /* row B, destination side -> r6/r7 */
226         ldrd_reg        r6,  r7,  r0,  r2
227         uhadd8          r5,  r5,  r10
228         and             r8,  r8,  lr
            /* row B, source side -> r9/r10 */
229         ldr             r10, [r1, #4]
230         and             r12, r12, lr
231         uadd8           r4,  r4,  r8
232         ldr_post        r9,  r1,  r2
233         eor             r8,  r6,  r9
234         uadd8           r5,  r5,  r12
235         pld             [r1, r2,  lsl #1]
236         eor             r12, r7,  r10
237         uhadd8          r6,  r6,  r9
            /* store row A; r0 now points at row B */
238         strd_post       r4,  r5,  r0,  r2
239         uhadd8          r7,  r7,  r10
            /* count exhausted: finish row B in the tail, with no further loads */
240         beq             2f
241         and             r8,  r8,  lr
            /* next row A: destination side, then source side */
242         ldrd_reg        r4,  r5,  r0,  r2
243         uadd8           r6,  r6,  r8
244         ldr             r10, [r1, #4]
245         and             r12, r12, lr
            /* sets the flags for the beq of the next iteration */
246         subs            r3,  r3,  #2
247         uadd8           r7,  r7,  r12
248         ldr_post        r9,  r1,  r2
249         strd_post       r6,  r7,  r0,  r2
250         b               1b
251 2:
            /* tail: apply the rounding bits to row B and store it */
252         and             r8,  r8,  lr
253         and             r12, r12, lr
254         uadd8           r6,  r6,  r8
255         uadd8           r7,  r7,  r12
256         strd_post       r6,  r7,  r0,  r2
257
258         pop             {r4-r10, pc}
259 endfunc
260
/*
 * ff_add_pixels_clamped_armv6
 *
 * Processes 8 rows.  For each row, eight 16-bit values are read from [r0]
 * (r0 advances by 16 bytes), added to the eight bytes at [r1], each sum is
 * saturated to 0..255, and the eight results are written back to [r1];
 * r1 then advances by r2.
 * Presumably called as (block, pixels, line_size) - confirm against the C
 * prototype.  r3 is overwritten with the fixed row count.
 *
 * strd_post is not defined in this file (presumably from the included
 * asm.S); as used here it appears to store at [r1] and then add r2 to r1.
 * The lane comments below assume little-endian loads.
 */
261 function ff_add_pixels_clamped_armv6, export=1
262         push            {r4-r8,lr}
263         mov             r3,  #8
264 1:
            /* r4 = c0:c1, r5 = c2:c3, r12 = c4:c5, lr = c6:c7 (low:high halfword) */
265         ldm             r0!, {r4,r5,r12,lr}
            /* r6 = pixels 0..3, r7 = pixels 4..7 */
266         ldrd            r6,  r7,  [r1]
            /* regroup into even/odd lanes: r8 = c0:c2, r5 = c1:c3, r4 = c4:c6, lr = c5:c7 */
267         pkhbt           r8,  r4,  r5,  lsl #16
268         pkhtb           r5,  r5,  r4,  asr #16
269         pkhbt           r4,  r12, lr,  lsl #16
270         pkhtb           lr,  lr,  r12, asr #16
271         pld             [r1, r2]
            /* add zero-extended pixel bytes 0/2 (no rotate) and 1/3 (ror #8) to the halfwords */
272         uxtab16         r8,  r8,  r6
273         uxtab16         r5,  r5,  r6,  ror #8
274         uxtab16         r4,  r4,  r7
275         uxtab16         lr,  lr,  r7,  ror #8
            /* clamp every halfword to the unsigned 8-bit range */
276         usat16          r8,  #8,  r8
277         usat16          r5,  #8,  r5
278         usat16          r4,  #8,  r4
279         usat16          lr,  #8,  lr
            /* interleave even and odd lanes back into byte order */
280         orr             r6,  r8,  r5,  lsl #8
281         orr             r7,  r4,  lr,  lsl #8
282         subs            r3,  r3,  #1
283         strd_post       r6,  r7,  r1,  r2
284         bgt             1b
285         pop             {r4-r8,pc}
286 endfunc
287
/*
 * ff_get_pixels_armv6
 *
 * Processes 8 rows.  For each row, the 8 bytes at [r1] are zero-extended to
 * eight 16-bit values and stored consecutively at [r0] (r0 advances by
 * 16 bytes); r1 then advances by r2.
 * Presumably called as (block, pixels, stride) - confirm against the C
 * prototype.
 *
 * ldrd_post is not defined in this file (presumably from the included
 * asm.S); as used here it appears to load a doubleword from [r1] and then
 * add r2 to r1.  The lane comments below assume little-endian loads.
 */
288 function ff_get_pixels_armv6, export=1
289         pld             [r1, r2]
290         push            {r4-r8, lr}
            /* lr is the row counter */
291         mov             lr,  #8
292 1:
            /* r4 = bytes 0..3, r5 = bytes 4..7 */
293         ldrd_post       r4,  r5,  r1,  r2
            /* nothing below changes the flags before the bgt */
294         subs            lr,  lr,  #1
            /* r6 = b0:b2, r4 = b1:b3, r12 = b4:b6, r8 = b5:b7 (low:high halfword) */
295         uxtb16          r6,  r4
296         uxtb16          r4,  r4,  ror #8
297         uxtb16          r12, r5
298         uxtb16          r8,  r5,  ror #8
299         pld             [r1, r2]
            /* restore natural order: r5 = b0:b1, r6 = b2:b3, r7 = b4:b5, r12 = b6:b7 */
300         pkhbt           r5,  r6,  r4,  lsl #16
301         pkhtb           r6,  r4,  r6,  asr #16
302         pkhbt           r7,  r12, r8,  lsl #16
303         pkhtb           r12, r8,  r12, asr #16
304         stm             r0!, {r5,r6,r7,r12}
305         bgt             1b
306
307         pop             {r4-r8, pc}
308 endfunc
309
/*
 * ff_diff_pixels_armv6
 *
 * Processes 8 rows.  For each row, the 8 bytes at [r2] are subtracted
 * byte-wise from the 8 bytes at [r1] (both zero-extended first) and the
 * eight 16-bit differences are stored consecutively at [r0] (r0 advances by
 * 16 bytes); r1 and r2 then advance by r3.
 * Presumably called as (block, s1, s2, stride) - confirm against the C
 * prototype.
 *
 * ldrd_post is not defined in this file (presumably from the included
 * asm.S); as used here it appears to load a doubleword from [base] and then
 * add r3 to the base.  The lane comments below assume little-endian loads.
 */
310 function ff_diff_pixels_armv6, export=1
311         pld             [r1, r3]
312         pld             [r2, r3]
313         push            {r4-r9, lr}
            /* lr is the row counter */
314         mov             lr,  #8
315 1:
            /* r4/r5 = row of the first source, r6/r7 = row of the second source */
316         ldrd_post       r4,  r5,  r1,  r3
317         ldrd_post       r6,  r7,  r2,  r3
            /* bytes 0..3: split into even (0,2) and odd (1,3) halfword lanes and subtract */
318         uxtb16          r8,  r4
319         uxtb16          r4,  r4,  ror #8
320         uxtb16          r9,  r6
321         uxtb16          r6,  r6,  ror #8
322         pld             [r1, r3]
323         ssub16          r9,  r8,  r9
324         ssub16          r6,  r4,  r6
            /* start on bytes 4..7 while the first four results are repacked */
325         uxtb16          r8,  r5
326         uxtb16          r5,  r5,  ror #8
327         pld             [r2, r3]
            /* r4 = d0:d1, r6 = d2:d3 (low:high halfword) */
328         pkhbt           r4,  r9,  r6,  lsl #16
329         pkhtb           r6,  r6,  r9,  asr #16
330         uxtb16          r9,  r7
331         uxtb16          r7,  r7,  ror #8
332         ssub16          r9,  r8,  r9
333         ssub16          r5,  r5,  r7
334         subs            lr,  lr,  #1
            /* r8 = d4:d5, r9 = d6:d7 */
335         pkhbt           r8,  r9,  r5,  lsl #16
336         pkhtb           r9,  r5,  r9,  asr #16
337         stm             r0!, {r4,r6,r8,r9}
338         bgt             1b
339
340         pop             {r4-r9, pc}
341 endfunc
342
/*
 * ff_pix_abs16_armv6
 *
 * Returns in r0 the sum of absolute byte differences between 16-byte rows
 * at [r1] and [r2]; both pointers advance by r3 per row.
 * The row count is the word found at [sp] on entry (under the AAPCS that is
 * the first stack-passed argument, presumably h - confirm against the C
 * prototype).  It overwrites the incoming r0, which is not otherwise used.
 * The loop exits only when the count reaches exactly zero, so it must be
 * non-zero on entry.
 *
 * Two accumulators (r12 and lr) are used alternately and summed at the end.
 * NOTE(review): rows at r1 are fetched with ldm, rows at r2 with individual
 * ldr instructions - presumably r1 is guaranteed word-aligned while r2 may
 * not be; confirm with the callers.
 */
343 function ff_pix_abs16_armv6, export=1
            /* must be read before the push moves sp */
344         ldr             r0,  [sp]
345         push            {r4-r9, lr}
346         mov             r12, #0
347         mov             lr,  #0
            /* first row: r4-r7 = 16 bytes from r1, r8 = bytes 0..3 from r2 */
348         ldm             r1,  {r4-r7}
349         ldr             r8,  [r2]
350 1:
            /* r8/r9 alternate as holders of the next 4 bytes from r2 */
351         ldr             r9,  [r2, #4]
352         pld             [r1, r3]
353         usada8          r12, r4,  r8,  r12
354         ldr             r8,  [r2, #8]
355         pld             [r2, r3]
356         usada8          lr,  r5,  r9,  lr
357         ldr             r9,  [r2, #12]
358         usada8          r12, r6,  r8,  r12
359         subs            r0,  r0,  #1
360         usada8          lr,  r7,  r9,  lr
            /* last row done: skip the loads for a following row */
361         beq             2f
362         add             r1,  r1,  r3
363         ldm             r1,  {r4-r7}
364         add             r2,  r2,  r3
365         ldr             r8,  [r2]
366         b               1b
367 2:
368         add             r0,  r12, lr
369         pop             {r4-r9, pc}
370 endfunc
371
/*
 * ff_pix_abs16_x2_armv6
 *
 * Returns in r0 the sum of absolute byte differences between each 16-byte
 * row at [r1] and a horizontally interpolated row derived from [r2]:
 * interpolated byte i is the rounded average (a + b + 1) >> 1 of bytes i
 * and i + 1 at r2, so 17 bytes (offsets 0..16) are read per row at r2.
 * Both pointers advance by r3 per row.
 * The row count is the word found at [sp] on entry (under the AAPCS that is
 * the first stack-passed argument, presumably h - confirm against the C
 * prototype); the incoming r0 is not used.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; (a ^ b) & 1 is 1 exactly
 * when a + b is odd, and uadd8 adds that correction back in.
 * The "bytes i+1.." words are built with lsr #8 / lsl #24 from neighbouring
 * words, which assumes little-endian loads.
 */
372 function ff_pix_abs16_x2_armv6, export=1
            /* row count; must be read before the push moves sp */
373         ldr             r12, [sp]
374         push            {r4-r11, lr}
            /* r0 = running sum, lr = 0x01010101 (bit 0 of each byte lane) */
375         mov             r0,  #0
376         mov             lr,  #1
377         orr             lr,  lr,  lr,  lsl #8
378         orr             lr,  lr,  lr,  lsl #16
379 1:
            /* bytes 0..3: r8 = r2[0..3], r10 = r2[1..4], r7 = their average, r4 = r1[0..3] */
380         ldr             r8,  [r2]
381         ldr             r9,  [r2, #4]
382         lsr             r10, r8,  #8
383         ldr             r4,  [r1]
384         lsr             r6,  r9,  #8
385         orr             r10, r10, r9,  lsl #24
386         ldr             r5,  [r2, #8]
387         eor             r11, r8,  r10
388         uhadd8          r7,  r8,  r10
389         orr             r6,  r6,  r5,  lsl #24
390         and             r11, r11, lr
391         uadd8           r7,  r7,  r11
392         ldr             r8,  [r1, #4]
393         usada8          r0,  r4,  r7,  r0
            /* bytes 4..7: r9 = r2[4..7], r6 = r2[5..8], r4 = their average, r8 = r1[4..7] */
394         eor             r7,  r9,  r6
395         lsr             r10, r5,  #8
396         and             r7,  r7,  lr
397         uhadd8          r4,  r9,  r6
398         ldr             r6,  [r2, #12]
399         uadd8           r4,  r4,  r7
400         pld             [r1, r3]
401         orr             r10, r10, r6,  lsl #24
402         usada8          r0,  r8,  r4,  r0
            /* bytes 8..11: r5 = r2[8..11], r10 = r2[9..12], r8 = their average, r4 = r1[8..11] */
403         ldr             r4,  [r1, #8]
404         eor             r11, r5,  r10
            /* the 17th byte of the row at r2 */
405         ldrb            r7,  [r2, #16]
406         and             r11, r11, lr
407         uhadd8          r8,  r5,  r10
408         ldr             r5,  [r1, #12]
409         uadd8           r8,  r8,  r11
410         pld             [r2, r3]
411         lsr             r10, r6,  #8
412         usada8          r0,  r4,  r8,  r0
            /* bytes 12..15: r6 = r2[12..15], r10 = r2[13..16], r9 = their average, r5 = r1[12..15] */
413         orr             r10, r10, r7,  lsl #24
            /* nothing below changes the flags before the bgt */
414         subs            r12,  r12,  #1
415         eor             r11, r6,  r10
416         add             r1,  r1,  r3
417         uhadd8          r9,  r6,  r10
418         and             r11, r11, lr
419         uadd8           r9,  r9,  r11
420         add             r2,  r2,  r3
421         usada8          r0,  r5,  r9,  r0
422         bgt             1b
423
424         pop             {r4-r11, pc}
425 endfunc
426
/*
 * usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
 *
 * One row of a vertically interpolated 16-byte sum of absolute differences.
 *
 * In:   p0-p3  the previously loaded 16-byte row from r2
 *       r2     points at the next row of that buffer
 *       r1     points at the row to compare against
 *       lr     byte mask ANDed with (p ^ n); the user of this macro in this
 *              file loads it with 0x01010101
 *       r0     running sum
 * Out:  n0-n3  the 16 bytes just loaded from [r2], ready to be passed as
 *              p0-p3 of the next expansion
 *       r0     incremented by the sum of |avg(p, n) - [r1]| over 16 bytes,
 *              where avg is the rounded average (a + b + 1) >> 1
 *       r1, r2 each advanced by r3
 * p0-p3 are destroyed: they first receive the averages and are then reused
 * as scratch, as are n1-n3 before they receive their final values.
 */
427 .macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
            /* bytes 0..3: \n1 = rounding bits, \n2 = word from r1 */
428         ldr             \n0, [r2]
429         eor             \n1, \p0, \n0
430         uhadd8          \p0, \p0, \n0
431         and             \n1, \n1, lr
432         ldr             \n2, [r1]
433         uadd8           \p0, \p0, \n1
434         ldr             \n1, [r2, #4]
435         usada8          r0,  \p0, \n2, r0
436         pld             [r1,  r3]
            /* bytes 4..7: \n3 = rounding bits, \p0 = word from r1 */
437         eor             \n3, \p1, \n1
438         uhadd8          \p1, \p1, \n1
439         and             \n3, \n3, lr
440         ldr             \p0, [r1, #4]
441         uadd8           \p1, \p1, \n3
442         ldr             \n2, [r2, #8]
443         usada8          r0,  \p1, \p0, r0
444         pld             [r2,  r3]
            /* bytes 8..11: \p0 = rounding bits, \p1 = word from r1 */
445         eor             \p0, \p2, \n2
446         uhadd8          \p2, \p2, \n2
447         and             \p0, \p0, lr
448         ldr             \p1, [r1, #8]
449         uadd8           \p2, \p2, \p0
450         ldr             \n3, [r2, #12]
451         usada8          r0,  \p2, \p1, r0
            /* bytes 12..15: \p1 = rounding bits, \p0 = word from r1 */
452         eor             \p1, \p3, \n3
453         uhadd8          \p3, \p3, \n3
454         and             \p1, \p1, lr
455         ldr             \p0,  [r1, #12]
456         uadd8           \p3, \p3, \p1
457         add             r1,  r1,  r3
458         usada8          r0,  \p3, \p0,  r0
459         add             r2,  r2,  r3
460 .endm
461
/*
 * ff_pix_abs16_y2_armv6
 *
 * Returns in r0 the sum accumulated by usad_y2 over the rows: each 16-byte
 * row at r1 is compared against the byte-wise rounded average of two
 * vertically adjacent 16-byte rows at r2.  Both pointers advance by r3 per
 * row.
 * The row count is the word found at [sp] on entry (under the AAPCS that is
 * the first stack-passed argument, presumably h - confirm against the C
 * prototype); the incoming r0 is not used.  Two rows are handled per loop
 * iteration, so an odd count is effectively rounded up to the next even
 * number.
 */
462 function ff_pix_abs16_y2_armv6, export=1
463         pld             [r1]
464         pld             [r2]
            /* row count; must be read before the push moves sp */
465         ldr             r12, [sp]
466         push            {r4-r11, lr}
            /* r0 = running sum, lr = 0x01010101 as required by usad_y2 */
467         mov             r0,  #0
468         mov             lr,  #1
469         orr             lr,  lr,  lr,  lsl #8
470         orr             lr,  lr,  lr,  lsl #16
            /* r4-r7 = first row at r2; r2 then points at the second row */
471         ldr             r4,  [r2]
472         ldr             r5,  [r2, #4]
473         ldr             r6,  [r2, #8]
474         ldr             r7,  [r2, #12]
475         add             r2,  r2,  r3
476 1:
            /* the two expansions swap the roles of r4-r7 and r8-r11, so no row is copied */
477         usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
            /* usad_y2 contains no flag-setting instruction, so these flags reach the bgt */
478         subs            r12, r12, #2
479         usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
480         bgt             1b
481
482         pop             {r4-r11, pc}
483 endfunc
484
/*
 * ff_pix_abs8_armv6
 *
 * Returns in r0 the sum of absolute byte differences between 8-byte rows at
 * [r1] and [r2]; both pointers advance by r3 per row.
 * The row count is the word found at [sp] on entry (under the AAPCS that is
 * the first stack-passed argument, presumably h - confirm against the C
 * prototype); the incoming r0 is not used.  Two rows are handled per loop
 * iteration and the exit test is beq, so the count is expected to be a
 * non-zero multiple of 2.
 *
 * Two accumulators (r0 and lr) are used and summed at the end.
 * ldr_post and ldrd_post are not defined in this file (presumably from the
 * included asm.S); as used here they appear to load from [base] and then
 * add r3 to the base.
 */
485 function ff_pix_abs8_armv6, export=1
486         pld             [r2, r3]
            /* row count; must be read before the push moves sp */
487         ldr             r12, [sp]
488         push            {r4-r9, lr}
489         mov             r0,  #0
490         mov             lr,  #0
            /* r4/r5 = first row from r1 */
491         ldrd_post       r4,  r5,  r1,  r3
492 1:
            /* nothing below changes the flags before the beq */
493         subs            r12, r12, #2
            /* r6/r7 = row A from r2, r8/r9 = row B from r1 */
494         ldr             r7,  [r2, #4]
495         ldr_post        r6,  r2,  r3
496         ldrd_post       r8,  r9,  r1,  r3
            /* row A */
497         usada8          r0,  r4,  r6,  r0
498         pld             [r2, r3]
499         usada8          lr,  r5,  r7,  lr
            /* r6/r7 = row B from r2 */
500         ldr             r7,  [r2, #4]
501         ldr_post        r6,  r2,  r3
            /* last pair: finish row B in the tail without loading another row from r1 */
502         beq             2f
503         ldrd_post       r4,  r5,  r1,  r3
            /* row B */
504         usada8          r0,  r8,  r6,  r0
505         pld             [r2, r3]
506         usada8          lr,  r9,  r7,  lr
507         b               1b
508 2:
509         usada8          r0,  r8,  r6,  r0
510         usada8          lr,  r9,  r7,  lr
511         add             r0,  r0,  lr
512         pop             {r4-r9, pc}
513 endfunc
514
/*
 * ff_sse16_armv6
 *
 * Returns in r0 the sum of squared byte differences between 16-byte rows at
 * [r1] and [r2]; both pointers advance by r3 per row.
 * The row count is the word found at [sp] on entry (under the AAPCS that is
 * the first stack-passed argument, presumably h - confirm against the C
 * prototype); the incoming r0 is not used.
 *
 * Method: each 32-bit word is split with uxtb16 into its even bytes (no
 * rotate) and odd bytes (ror #8) as two zero-extended halfwords; usub16
 * forms the two 16-bit differences and "smlad r0, d, d, r0" adds the square
 * of both halfwords of d to r0.
 * NOTE(review): rows at r1 are fetched with ldrd, rows at r2 with single
 * ldr instructions - presumably r1 has the stronger alignment guarantee;
 * confirm with the callers.
 */
515 function ff_sse16_armv6, export=1
            /* row count; must be read before the push moves sp */
516         ldr             r12, [sp]
517         push            {r4-r9, lr}
518         mov             r0,  #0
519 1:
            /* bytes 0..3: r4 from r1, r8 from r2 */
520         ldrd            r4,  r5,  [r1]
521         ldr             r8,  [r2]
522         uxtb16          lr,  r4
523         uxtb16          r4,  r4,  ror #8
524         uxtb16          r9,  r8
525         uxtb16          r8,  r8,  ror #8
526         ldr             r7,  [r2, #4]
527         usub16          lr,  lr,  r9
528         usub16          r4,  r4,  r8
529         smlad           r0,  lr,  lr,  r0
            /* bytes 4..7: r5 from r1, r7 from r2 */
530         uxtb16          r6,  r5
531         uxtb16          lr,  r5,  ror #8
532         uxtb16          r8,  r7
533         uxtb16          r9,  r7,  ror #8
534         smlad           r0,  r4,  r4,  r0
            /* r4/r5 = bytes 8..15 from r1 */
535         ldrd            r4,  r5,  [r1, #8]
536         usub16          r6,  r6,  r8
537         usub16          r8,  lr,  r9
538         ldr             r7,  [r2, #8]
539         smlad           r0,  r6,  r6,  r0
            /* bytes 8..11: r4 from r1, r7 from r2 */
540         uxtb16          lr,  r4
541         uxtb16          r4,  r4,  ror #8
542         uxtb16          r9,  r7
543         uxtb16          r7,  r7, ror #8
544         smlad           r0,  r8,  r8,  r0
545         ldr             r8,  [r2, #12]
546         usub16          lr,  lr,  r9
547         usub16          r4,  r4,  r7
548         smlad           r0,  lr,  lr,  r0
            /* bytes 12..15: r5 from r1, r8 from r2 */
549         uxtb16          r6,  r5
550         uxtb16          r5,  r5,  ror #8
551         uxtb16          r9,  r8
552         uxtb16          r8,  r8,  ror #8
553         smlad           r0,  r4,  r4,  r0
554         usub16          r6,  r6,  r9
555         usub16          r5,  r5,  r8
556         smlad           r0,  r6,  r6,  r0
557         add             r1,  r1,  r3
558         add             r2,  r2,  r3
            /* the smlad between here and the bgt leaves these flags unchanged */
559         subs            r12, r12, #1
560         smlad           r0,  r5,  r5,  r0
561         bgt             1b
562
563         pop             {r4-r9, pc}
564 endfunc
565
/*
 * ff_pix_norm1_armv6
 *
 * Returns in r0 the sum of the squares of all bytes in 16 rows of 16 bytes
 * starting at [r0]; the pointer advances by r1 per row.
 * Presumably called as (pix, line_size) - confirm against the C prototype.
 *
 * Method: each word is split with uxtb16 into its even bytes (no rotate)
 * and odd bytes (ror #8) as two zero-extended halfwords, and
 * "smlad lr, x, x, lr" adds the square of both halfwords of x to lr.
 * Rows are fetched with ldm, so r0 is presumably required to be
 * word-aligned - confirm with the callers.
 */
566 function ff_pix_norm1_armv6, export=1
567         push            {r4-r6, lr}
            /* r12 = row counter, lr = running sum */
568         mov             r12, #16
569         mov             lr,  #0
570 1:
            /* r2-r5 = the 16 bytes of this row; r6 holds the even-byte halves */
571         ldm             r0,  {r2-r5}
572         uxtb16          r6,  r2
573         uxtb16          r2,  r2,  ror #8
574         smlad           lr,  r6,  r6,  lr
575         uxtb16          r6,  r3
576         smlad           lr,  r2,  r2,  lr
577         uxtb16          r3,  r3,  ror #8
578         smlad           lr,  r6,  r6,  lr
579         uxtb16          r6,  r4
580         smlad           lr,  r3,  r3,  lr
581         uxtb16          r4,  r4,  ror #8
582         smlad           lr,  r6,  r6,  lr
583         uxtb16          r6,  r5
584         smlad           lr,  r4,  r4,  lr
585         uxtb16          r5,  r5,  ror #8
586         smlad           lr,  r6,  r6,  lr
            /* the add and smlad below leave these flags unchanged for the bgt */
587         subs            r12, r12, #1
588         add             r0,  r0,  r1
589         smlad           lr,  r5,  r5,  lr
590         bgt             1b
591
592         mov             r0,  lr
593         pop             {r4-r6, pc}
594 endfunc
595
/*
 * ff_pix_sum_armv6
 *
 * Returns in r0 the sum of all bytes in 16 rows of 16 bytes starting at
 * [r0]; the pointer advances by r1 per row.
 * Presumably called as (pix, line_size) - confirm against the C prototype.
 *
 * Method: usada8 against a zero register (lr) adds the four bytes of a word
 * to an accumulator.  Two accumulators (r2 and r3) are used alternately and
 * summed at the end.
 *
 * ldr_pre is not defined in this file (presumably from the included asm.S);
 * as used here it appears to add r1 to r0 and then load from the new r0.
 */
596 function ff_pix_sum_armv6, export=1
597         push            {r4-r7, lr}
            /* r12 = row counter, r2/r3 = accumulators, lr = constant zero operand */
598         mov             r12, #16
599         mov             r2,  #0
600         mov             r3,  #0
601         mov             lr,  #0
            /* bytes 0..3 of the first row */
602         ldr             r4,  [r0]
603 1:
            /* nothing below changes the flags before the beq / bgt */
604         subs            r12, r12, #1
605         ldr             r5,  [r0, #4]
606         usada8          r2,  r4,  lr,  r2
607         ldr             r6,  [r0, #8]
608         usada8          r3,  r5,  lr,  r3
609         ldr             r7,  [r0, #12]
610         usada8          r2,  r6,  lr,  r2
            /* last row: do not step to, or load from, a following row */
611         beq             2f
            /* bytes 0..3 of the next row */
612         ldr_pre         r4,  r0,  r1
613         usada8          r3,  r7,  lr,  r3
            /* reached only with a non-zero count that started at 16, so this is always taken */
614         bgt             1b
615 2:
            /* bytes 12..15 of the last row */
616         usada8          r3,  r7,  lr,  r3
617         add             r0,  r2,  r3
618         pop             {r4-r7, pc}
619 endfunc
619 endfunc