]> git.sesse.net Git - ffmpeg/blob - libswscale/arm/yuv2rgb_neon.S
Merge commit 'fb8753ada23189076bdf903c1c001c0ca8287fae'
[ffmpeg] / libswscale / arm / yuv2rgb_neon.S
1 /*
2  * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
3  * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/arm/asm.S"
23
@ compute_premult half_u half_v
@
@ Multiplies four U and four V samples by the conversion coefficients in d1
@ and leaves the per-channel chroma terms for eight pixels in q8-q13.  Each
@ chroma sample is duplicated so that it covers two neighbouring pixels.
@
@ In:   \half_u  D register, 4 x s16 U samples
@       \half_v  D register, 4 x s16 V samples
@       d1       coefficients, lanes 0..3 = v2r, u2g, v2g, u2b
@ Out:  q8,  q9   V * v2r            (pixels 0-3, pixels 4-7)
@       q10, q11  U * u2g + V * v2g  (pixels 0-3, pixels 4-7)
@       q12, q13  U * u2b            (pixels 0-3, pixels 4-7)
@ Clobbers: q1, q2
.macro compute_premult half_u half_v
    @ Duplicate every sample: [c1 c2 c3 c4] -> [c1 c1 c2 c2] [c3 c3 c4 c4]
    vmov                d2, \half_u
    vmov                d3, \half_u
    vzip.16             d2, d3                                         @ d2 = U1U1U2U2, d3 = U3U3U4U4
    vmov                d4, \half_v
    vmov                d5, \half_v
    vzip.16             d4, d5                                         @ d4 = V1V1V2V2, d5 = V3V3V4V4

    vmull.s16           q10, d2, d1[1]                                 @ green  = U * u2g   (pixels 0-3)
    vmull.s16           q11, d3, d1[1]                                 @ green  = U * u2g   (pixels 4-7)
    vmull.s16           q8,  d4, d1[0]                                 @ red    = V * v2r   (pixels 0-3)
    vmull.s16           q9,  d5, d1[0]                                 @ red    = V * v2r   (pixels 4-7)
    vmlal.s16           q10, d4, d1[2]                                 @ green += V * v2g   (pixels 0-3)
    vmlal.s16           q11, d5, d1[2]                                 @ green += V * v2g   (pixels 4-7)
    vmull.s16           q12, d2, d1[3]                                 @ blue   = U * u2b   (pixels 0-3)
    vmull.s16           q13, d3, d1[3]                                 @ blue   = U * u2b   (pixels 4-7)
.endm
42
@ compute_color dst_comp pre1 pre2
@
@ Produces one 8-bit colour channel for eight pixels.
@
@ In:   q1, q2        scaled luma, pixels 0-3 and 4-7 (s32)
@       \pre1, \pre2  chroma term of this channel, pixels 0-3 and 4-7 (s32)
@ Out:  \dst_comp     D register receiving 8 x u8 channel values
@ Clobbers: q3, q4, q5
.macro compute_color dst_comp pre1 pre2
    vadd.s32            q3, \pre1, q1                                  @ chroma + luma, pixels 0-3
    vadd.s32            q4, \pre2, q2                                  @ chroma + luma, pixels 4-7
    vqrshrun.s32        d10, q3, #13                                   @ round, >> 13, clamp to u16
    vqrshrun.s32        d11, q4, #13                                   @ q5 = ({q3,q4} + (1<<12)) >> 13
    vqmovn.u16          \dst_comp, q5                                  @ clamp u16 -> u8
.endm
50
@ compute_rgba r g b a
@
@ Fills four D registers with the red, green, blue and alpha bytes of eight
@ pixels.  \r, \g, \b and \a name the destination D registers; the caller
@ chooses them so that they match the wanted byte order.
@
@ In:   q1, q2 = scaled luma, q8-q13 = chroma terms (see compute_premult)
@ Clobbers: q3, q4, q5 (through compute_color)
.macro compute_rgba r g b a
    vmov.u8             \a, #255                                       @ constant alpha; \a is not read by compute_color
    compute_color       \r, q8,  q9
    compute_color       \g, q10, q11
    compute_color       \b, q12, q13
.endm
57
@ compute_half_line dst half_y ofmt
@
@ Converts eight luma samples into eight 32-bit pixels and stores them.
@
@ In:   \dst     core register holding the output pointer; it is advanced past
@                the 32 bytes written and must satisfy the :128 qualifier
@       \half_y  D register, 8 x u8 luma samples
@       \ofmt    output byte order: argb, rgba, abgr or bgra (no channel data
@                is computed for any other value)
@       r9       y_offset
@       d0       y_coeff
@       q8-q13   chroma terms from compute_premult
@ Clobbers: q1-q7
.macro compute_half_line dst half_y ofmt
    vdup.16             q5, r9                                         @ q5 = y_offset in every lane
    vmovl.u8            q7, \half_y                                    @ widen the 8 luma samples to 16 bit
    vsub.s16            q7, q7, q5                                     @ q7 = srcY - y_offset
    vmull.s16           q1, d14, d0                                    @ q1 = (srcY - y_offset) * y_coeff (pixels 0-3)
    vmull.s16           q2, d15, d0                                    @ q2 = (srcY - y_offset) * y_coeff (pixels 4-7)

    @ The channel registers are assigned so that d12, d13, d14, d15 hold the
    @ bytes in the order in which they have to appear in memory.
.ifc \ofmt,bgra
    compute_rgba        d14, d13, d12, d15
.endif

.ifc \ofmt,abgr
    compute_rgba        d15, d14, d13, d12
.endif

.ifc \ofmt,rgba
    compute_rgba        d12, d13, d14, d15
.endif

.ifc \ofmt,argb
    compute_rgba        d13, d14, d15, d12
.endif

    vst4.8              {d12, d13, d14, d15}, [\dst,:128]!             @ interleave the 4 channels, store 8 pixels
.endm
83
@ declare_func ifmt ofmt
@
@ Emits the exported function ff_<ifmt>_to_<ofmt>_neon.  It converts an image
@ made of a luma plane and one interleaved chroma plane (one chroma row per
@ two luma rows) to 32-bit pixels with a constant alpha of 255.
@
@ \ifmt  nv12: chroma bytes are ordered U,V; any other value (nv21 in this
@        file): chroma bytes are ordered V,U
@ \ofmt  output byte order, forwarded to compute_half_line
@
@ Arguments of the generated function:
@   r0 = width    r1 = height    r2 = dst    r3 = linesize (of dst)
@   on the stack, in order: srcY, linesizeY, srcC, linesizeC, table,
@                           y_offset, y_coeff
@
@ Each inner iteration converts 16 pixels of two consecutive rows, and both
@ loops test their counter only after the body has run.  Callers are therefore
@ expected to pass a width that is a positive multiple of 16 and a positive
@ even height; the row paddings below are derived from width on that basis.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    push                {r4-r12, lr}                                   @ 10 core registers = 40 bytes
    vpush               {q4-q7}                                        @  4 Q registers    = 64 bytes
    @ Stack arguments now start at sp + 104 (40 + 64).
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcC
    ldr                 r7, [sp, #116]                                 @ r7  = linesizeC
    ldr                 r8, [sp, #120]                                 @ r8  = table
    ldr                 r9, [sp, #124]                                 @ r9  = y_offset
    ldr                 r10,[sp, #128]                                 @ r10 = y_coeff
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    lsl                 r8, r0, #2
    sub                 r3, r3, r8                                     @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
    sub                 r7, r7, r0                                     @ r7 = linesizeC     - width     (paddingC)
1:                                                                     @ once per pair of rows
    mov                 r8, r0                                         @ r8 = width
2:                                                                     @ once per 16 pixels
    pld                 [r6, #64*3]
    pld                 [r4, #64*3]
    pld                 [r12, #64*3]

    vld2.8              {d2, d3}, [r6]!                                @ q1: interleaved chroma line
    vmov.i8             d10, #128
.ifc \ifmt,nv12
    vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
.else
    vsubl.u8            q14, d3, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d2, d10                                   @ q15 = V - 128
.endif

    compute_premult     d28, d30                                       @ chroma terms for pixels 0-7

    @ d28 and d30 are free once the premultiplied terms exist, so they are
    @ reused to park the luma of pixels 8-15 of both rows.
    vld1.8              {q7}, [r4]!                                    @ first line of luma
    vmov                d28, d15                                       @ save right of the first line of luma for later use
    compute_half_line   r2, d14, \ofmt

    vld1.8              {q7}, [r12]!                                   @ second line of luma
    vmov                d30, d15                                       @ save right of the second line of luma for later use
    compute_half_line   r11, d14, \ofmt

    compute_premult     d29, d31                                       @ chroma terms for pixels 8-15
    compute_half_line   r2,  d28, \ofmt
    compute_half_line   r11, d30, \ofmt

    subs                r8, r8, #16                                    @ width -= 16
    bgt                 2b

    add                 r2, r2, r3                                     @ dst   += padding
    add                 r4, r4, r5                                     @ srcY  += paddingY
    add                 r11, r11, r3                                   @ dst2  += padding
    add                 r12, r12, r5                                   @ srcY2 += paddingY
    add                 r6, r6, r7                                     @ srcC  += paddingC

    subs                r1, r1, #2                                     @ height -= 2
    bgt                 1b

    vpop                {q4-q7}
    @ Pop the saved lr straight into pc: restores and returns in a single
    @ instruction, and a load into pc is an interworking branch in both ARM
    @ and Thumb state (a Thumb "mov pc, lr" is not).
    pop                 {r4-r12, pc}
endfunc
.endm
153
@ declare_rgb_funcs ifmt
@
@ Instantiates the conversion function for input format \ifmt once per
@ supported output byte order, in the order listed.
.macro declare_rgb_funcs ifmt
.irp ofmt, argb, rgba, abgr, bgra
    declare_func \ifmt, \ofmt
.endr
.endm
160
@ Emit the full set of conversion functions for both chroma byte orders.
.irp ifmt, nv12, nv21
    declare_rgb_funcs \ifmt
.endr