1 @*****************************************************************************
2 @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify
8 @ it under the terms of the GNU General Public License as published by
9 @ the Free Software Foundation; either version 2 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU General Public License for more details.
17 @ You should have received a copy of the GNU General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @*****************************************************************************
80 .type i420_rgb_neon, %function
82 push {r4-r8,r10-r11,lr}
86 ldmia r0, {O1, OPITCH}
87 ldmia r1, {Y1, U, V, YPITCH}
89 /* round the width to be a multiple of 16 */
91 sub WIDTH, WIDTH, OPAD
92 addne WIDTH, WIDTH, #16
94 /* init constants (scale value by 64) */
100 adr OPAD, coefficients
101 vld1.s16 {d6[], d7[]}, [OPAD]!
102 vld1.s16 {d8[], d9[]}, [OPAD]!
103 vld1.s16 {d10[], d11[]}, [OPAD]!
108 sub OPAD, OPITCH, WIDTH, lsl #2
109 sub YPAD, YPITCH, WIDTH
115 /* exit if all rows have been processed */
117 pople {r4-r8,r10-r11,pc}
123 vld1.u8 {u}, [U,:64]!
124 vld1.u8 {v}, [V,:64]!
126 vmull.u8 chro_r, v, coefRV
127 vmull.u8 chro_g, u, coefGU
128 vmlal.u8 chro_g, v, coefGV
129 vmull.u8 chro_b, u, coefBU
131 vadd.s16 chro_r, Rc, chro_r
132 vsub.s16 chro_g, Gc, chro_g
133 vadd.s16 chro_b, Bc, chro_b
139 vld2.u8 {y1,y2}, [Y1,:128]!
141 /* y1 : chrominance + luminance, then clamp (divide by 64) */
142 vmull.u8 lumi, y1, coefY
143 vqadd.s16 red, lumi, chro_r
144 vqadd.s16 green, lumi, chro_g
145 vqadd.s16 blue, lumi, chro_b
146 vqrshrun.s16 red1, red, #6
147 vqrshrun.s16 green1, green, #6
148 vqrshrun.s16 blue1, blue, #6
150 /* y2 : chrominance + luminance, then clamp (divide by 64) */
151 vmull.u8 lumi, y2, coefY
152 vqadd.s16 red, lumi, chro_r
153 vqadd.s16 green, lumi, chro_g
154 vqadd.s16 blue, lumi, chro_b
155 vqrshrun.s16 red2, red, #6
156 vqrshrun.s16 green2, green, #6
157 vqrshrun.s16 blue2, blue, #6
163 vzip.u8 green1, green2
166 vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
167 vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
170 vld2.u8 {y1,y2}, [Y2,:128]!
172 /* y1 : chrominance + luminance, then clamp (divide by 64) */
173 vmull.u8 lumi, y1, coefY
174 vqadd.s16 red, lumi, chro_r
175 vqadd.s16 green, lumi, chro_g
176 vqadd.s16 blue, lumi, chro_b
177 vqrshrun.s16 red1, red, #6
178 vqrshrun.s16 green1, green, #6
179 vqrshrun.s16 blue1, blue, #6
181 /* y2 : chrominance + luminance, then clamp (divide by 64) */
182 vmull.u8 lumi, y2, coefY
183 vqadd.s16 red, lumi, chro_r
184 vqadd.s16 green, lumi, chro_g
185 vqadd.s16 blue, lumi, chro_b
186 vqrshrun.s16 red2, red, #6
187 vqrshrun.s16 green2, green, #6
188 vqrshrun.s16 blue2, blue, #6
194 vzip.u8 green1, green2
197 vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
198 vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
200 /* next columns (x16) */
201 subs COUNT, COUNT, #16
208 add U, U, YPAD, lsr #1
209 add V, V, YPAD, lsr #1