1 @*****************************************************************************
2 @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
3 @*****************************************************************************
4 @ Copyright (C) 2011 Sébastien Toque
7 @ This program is free software; you can redistribute it and/or modify
8 @ it under the terms of the GNU General Public License as published by
9 @ the Free Software Foundation; either version 2 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU General Public License for more details.
17 @ You should have received a copy of the GNU General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @****************************************************************************/
79 .type nv12_rgb_neon, %function
81 push {r4-r8,r10-r11,lr}
85 ldmia r0, {O1, OPITCH}
86 ldmia r1, {Y1, U, V, YPITCH}
88 /* round the width up to a multiple of 16 */
90 sub WIDTH, WIDTH, OPAD
91 addne WIDTH, WIDTH, #16
93 /* init constants (values are scaled by 64) */
99 adr OPAD, coefficients
100 vld1.s16 {d6[], d7[]}, [OPAD]!
101 vld1.s16 {d8[], d9[]}, [OPAD]!
102 vld1.s16 {d10[], d11[]}, [OPAD]!
107 sub OPAD, OPITCH, WIDTH, lsl #2
108 sub YPAD, YPITCH, WIDTH
114 /* exit if all rows have been processed */
116 pople {r4-r8,r10-r11,pc}
122 vld2.u8 {u,v}, [U,:128]!
124 vmull.u8 chro_r, v, coefRV
125 vmull.u8 chro_g, u, coefGU
126 vmlal.u8 chro_g, v, coefGV
127 vmull.u8 chro_b, u, coefBU
129 vadd.s16 chro_r, Rc, chro_r
130 vsub.s16 chro_g, Gc, chro_g
131 vadd.s16 chro_b, Bc, chro_b
136 vld2.u8 {y1,y2}, [Y1,:128]!
138 /* y1 : chrominance + luminance, then divide by 64 (rounding) and clamp to unsigned 8 bits */
139 vmull.u8 lumi, y1, coefY
140 vqadd.s16 red, lumi, chro_r
141 vqadd.s16 green, lumi, chro_g
142 vqadd.s16 blue, lumi, chro_b
143 vqrshrun.s16 red1, red, #6
144 vqrshrun.s16 green1, green, #6
145 vqrshrun.s16 blue1, blue, #6
147 /* y2 : chrominance + luminance, then divide by 64 (rounding) and clamp to unsigned 8 bits */
148 vmull.u8 lumi, y2, coefY
149 vqadd.s16 red, lumi, chro_r
150 vqadd.s16 green, lumi, chro_g
151 vqadd.s16 blue, lumi, chro_b
152 vqrshrun.s16 red2, red, #6
153 vqrshrun.s16 green2, green, #6
154 vqrshrun.s16 blue2, blue, #6
160 vzip.u8 green1, green2
163 vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]!
164 vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]!
167 vld2.u8 {y1,y2}, [Y2,:128]!
169 /* y1 : chrominance + luminance, then divide by 64 (rounding) and clamp to unsigned 8 bits */
170 vmull.u8 lumi, y1, coefY
171 vqadd.s16 red, lumi, chro_r
172 vqadd.s16 green, lumi, chro_g
173 vqadd.s16 blue, lumi, chro_b
174 vqrshrun.s16 red1, red, #6
175 vqrshrun.s16 green1, green, #6
176 vqrshrun.s16 blue1, blue, #6
178 /* y2 : chrominance + luminance, then divide by 64 (rounding) and clamp to unsigned 8 bits */
179 vmull.u8 lumi, y2, coefY
180 vqadd.s16 red, lumi, chro_r
181 vqadd.s16 green, lumi, chro_g
182 vqadd.s16 blue, lumi, chro_b
183 vqrshrun.s16 red2, red, #6
184 vqrshrun.s16 green2, green, #6
185 vqrshrun.s16 blue2, blue, #6
191 vzip.u8 green1, green2
194 vst4.u8 {red1,green1,blue1,alpha1}, [O2,:128]!
195 vst4.u8 {red2,green2,blue2,alpha2}, [O2,:128]!
197 /* next columns (x16) */
198 subs COUNT, COUNT, #16