1 @*****************************************************************************
2 @ nv21_rgb.S : ARM NEONv1 NV21 to RGB chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @****************************************************************************/
80 .type nv21_rgb_neon, %function
82 push {r4-r8,r10-r11,lr}
86 ldmia r0, {O1, OPITCH}
87 ldmia r1, {Y1, U, V, YPITCH}
89 /* round the width up to a multiple of 16 */
91 sub WIDTH, WIDTH, OPAD
92 addne WIDTH, WIDTH, #16
94 /* init constants (coefficients are scaled by 64) */
100 adr OPAD, coefficients
101 vld1.s16 {d6[], d7[]}, [OPAD]!
102 vld1.s16 {d8[], d9[]}, [OPAD]!
103 vld1.s16 {d10[], d11[]}, [OPAD]!
108 sub OPAD, OPITCH, WIDTH, lsl #2
109 sub YPAD, YPITCH, WIDTH
115 /* exit if all rows have been processed */
117 pople {r4-r8,r10-r11,pc}
123 vld2.u8 {u,v}, [U,:128]!
125 vmull.u8 chro_r, u, coefRV
126 vmull.u8 chro_g, v, coefGU
127 vmlal.u8 chro_g, u, coefGV
128 vmull.u8 chro_b, v, coefBU
130 vadd.s16 chro_r, Rc, chro_r
131 vsub.s16 chro_g, Gc, chro_g
132 vadd.s16 chro_b, Bc, chro_b
137 vld2.u8 {y1,y2}, [Y1,:128]!
139 /* y1 : chrominance + luminance, then divide by 64 (rounded) and clamp to 8 bits */
140 vmull.u8 lumi, y1, coefY
141 vqadd.s16 red, lumi, chro_r
142 vqadd.s16 green, lumi, chro_g
143 vqadd.s16 blue, lumi, chro_b
144 vqrshrun.s16 red1, red, #6
145 vqrshrun.s16 green1, green, #6
146 vqrshrun.s16 blue1, blue, #6
148 /* y2 : chrominance + luminance, then divide by 64 (rounded) and clamp to 8 bits */
149 vmull.u8 lumi, y2, coefY
150 vqadd.s16 red, lumi, chro_r
151 vqadd.s16 green, lumi, chro_g
152 vqadd.s16 blue, lumi, chro_b
153 vqrshrun.s16 red2, red, #6
154 vqrshrun.s16 green2, green, #6
155 vqrshrun.s16 blue2, blue, #6
161 vzip.u8 green1, green2
164 vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
165 vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
168 vld2.u8 {y1,y2}, [Y2,:128]!
170 /* y1 : chrominance + luminance, then divide by 64 (rounded) and clamp to 8 bits */
171 vmull.u8 lumi, y1, coefY
172 vqadd.s16 red, lumi, chro_r
173 vqadd.s16 green, lumi, chro_g
174 vqadd.s16 blue, lumi, chro_b
175 vqrshrun.s16 red1, red, #6
176 vqrshrun.s16 green1, green, #6
177 vqrshrun.s16 blue1, blue, #6
179 /* y2 : chrominance + luminance, then divide by 64 (rounded) and clamp to 8 bits */
180 vmull.u8 lumi, y2, coefY
181 vqadd.s16 red, lumi, chro_r
182 vqadd.s16 green, lumi, chro_g
183 vqadd.s16 blue, lumi, chro_b
184 vqrshrun.s16 red2, red, #6
185 vqrshrun.s16 green2, green, #6
186 vqrshrun.s16 blue2, blue, #6
192 vzip.u8 green1, green2
195 vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
196 vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
198 /* next columns (x16) */
199 subs COUNT, COUNT, #16