 * ARM assembly optimized color format conversion functions
 * (YV12 -> YUY2, YV12 -> some custom YUV420 format used by
 * Epson graphics chip in Nokia N800)
 *
 * Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 /*******************************************************/
/*
 * yv12_to_yuy2_line_arm -- convert one line of planar YV12 (separate Y, U, V
 * planes) into packed YUY2.
 *
 * NOTE(review): this excerpt is incomplete — the register aliases
 * SRC_Y/SRC_U/SRC_V/TMP1-TMP3, the instructions that seed the low byte of
 * r4-r7 (presumably ldrb from SRC_Y), the stores of the packed words to the
 * destination, and the loop labels/branches are not visible here; confirm
 * against the full file before relying on the details below.
 */
27 .global yv12_to_yuy2_line_arm
28 .func yv12_to_yuy2_line_arm
29 yv12_to_yuy2_line_arm:
/* AAPCS prologue: preserve callee-saved r4-r8 plus lr (restored into pc on exit) */
38 stmfd sp!, {r4-r8, lr}
/*
 * Pack two pixels into r4: r4 += U<<8 | Y<<16 | V<<24.  With a Y byte in the
 * low position (loaded by an instruction not visible in this excerpt) the
 * little-endian memory layout becomes Y0 U Y1 V -- one YUY2 macropixel.
 */
50 ldrb TMP1, [SRC_U], #1
51 ldrb TMP2, [SRC_Y], #1
52 ldrb TMP3, [SRC_V], #1
53 add r4, r4, TMP1, lsl #8
54 add r4, r4, TMP2, lsl #16
55 add r4, r4, TMP3, lsl #24
/* Same packing for the next macropixel, accumulated in r5 */
58 ldrb TMP1, [SRC_U], #1
59 ldrb TMP2, [SRC_Y], #1
60 ldrb TMP3, [SRC_V], #1
61 add r5, r5, TMP1, lsl #8
62 add r5, r5, TMP2, lsl #16
63 add r5, r5, TMP3, lsl #24
/* ...and in r6 */
66 ldrb TMP1, [SRC_U], #1
67 ldrb TMP2, [SRC_Y], #1
68 ldrb TMP3, [SRC_V], #1
69 add r6, r6, TMP1, lsl #8
70 add r6, r6, TMP2, lsl #16
71 add r6, r6, TMP3, lsl #24
/* ...and in r7 -- four words (8 pixels) per unrolled group */
74 ldrb TMP1, [SRC_U], #1
75 ldrb TMP2, [SRC_Y], #1
76 ldrb TMP3, [SRC_V], #1
77 add r7, r7, TMP1, lsl #8
78 add r7, r7, TMP2, lsl #16
79 add r7, r7, TMP3, lsl #24
/* Tail path: pack one more macropixel into r4 (store not visible in excerpt) */
89 ldrb TMP1, [SRC_U], #1
90 ldrb TMP2, [SRC_Y], #1
91 ldrb TMP3, [SRC_V], #1
92 add r4, r4, TMP1, lsl #8
93 add r4, r4, TMP2, lsl #16
94 add r4, r4, TMP3, lsl #24
/* Epilogue: restore r4-r8 and return by loading saved lr into pc */
99 ldmfd sp!, {r4-r8, pc}
112 /*******************************************************/
/*
 * YUV420_function_template -- assembler-time template that expands into one
 * complete YV12 -> custom-YUV420 line-conversion function.
 *
 * Parameters:
 *   function_name -- symbol name of the generated function
 *   USE_PLD       -- nonzero: emit cache-preload (pld) hints (ARMv5TE+)
 *   USE_ARMV6     -- nonzero: use the ARMv6 variants of the helper macros
 *                    (16-bit ldrh loads + rev16 byte fixup)
 *
 * NOTE(review): this excerpt is incomplete -- the .endm terminators, the
 * .if \USE_ARMV6 / .else / .endif guards that must separate the two sets of
 * CONVERT_8_PIXELS_MACROBLOCK_* definitions below, the pld instructions, the
 * function label, and most branches are not visible here; the leaked
 * line-number gaps mark the missing lines.  Confirm against the full file.
 */
122 .macro YUV420_function_template function_name, USE_PLD, USE_ARMV6
125 .global \function_name
129 /* Read information about 4 pixels, convert them to YUV420 and store into 6 bytes using 16-bit writes */
130 .macro CONVERT_4_PIXELS_MACROBLOCK
132 ldrb TMP1, [SRC_U], #1
134 ldrb TMP2, [SRC_Y], #1
136 ldrb TMP3, [SRC_Y], #2
137 add r4, r4, TMP1, lsl #8
138 add r5, r5, TMP2, lsl #8
139 add r6, r6, TMP3, lsl #8
/*
 * ARMv6 variants of the three 8-pixel helpers: use 16-bit Y loads (ldrh) and
 * rev16 to restore byte order.  DST_REG1 receives the finished 32-bit output
 * word; DST_REG2 is pre-seeded for the next helper in the chain.  The
 * FLAG1/FLAG2/PLD_FLAG conditional uses are in lines missing from this view.
 */
147 .macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG1, DST_REG2, FLAG1, FLAG2, PLD_FLAG
149 ldrb \DST_REG1, [SRC_U], #1
150 ldrh TMP1, [SRC_Y], #2
151 ldrb TMP2, [SRC_U], #1
154 ldrh \DST_REG2, [SRC_Y], #2
159 add \DST_REG1, \DST_REG1, TMP1, lsl #8
160 add \DST_REG1, \DST_REG1, TMP2, lsl #24
162 ldrb TMP1, [SRC_U], #1
163 ldrb TMP2, [SRC_Y], #1
/* rev16: ARMv6 -- swap bytes within each halfword to fix output order */
165 rev16 \DST_REG1, \DST_REG1
168 .macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
170 ldrh \DST_REG1, [SRC_Y], #2
171 ldrb TMP1, [SRC_U], #1
172 ldrb TMP2, [SRC_Y], #1
175 ldrb \DST_REG2, [SRC_Y], #1
177 add \DST_REG1, \DST_REG1, TMP1, lsl #16
178 add \DST_REG1, \DST_REG1, TMP2, lsl #24
180 ldrb TMP1, [SRC_U], #1
181 ldrh TMP2, [SRC_Y], #2
183 rev16 \DST_REG1, \DST_REG1
186 .macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
188 ldrb \DST_REG1, [SRC_Y], #1
189 ldrb TMP1, [SRC_U], #1
190 ldrh TMP2, [SRC_Y], #2
193 ldrb \DST_REG2, [SRC_U], #1
195 add \DST_REG1, \DST_REG1, TMP1, lsl #8
196 add \DST_REG1, \DST_REG1, TMP2, lsl #16
198 ldrh TMP1, [SRC_Y], #2
199 ldrb TMP2, [SRC_U], #1
201 rev16 \DST_REG1, \DST_REG1
/*
 * Pre-ARMv6 variants: byte-wide loads only, no rev16 needed because each
 * byte is placed at its final shift position directly.
 */
206 /* Prepare the first 32-bit output value for 8 pixels macroblock */
207 .macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG, DUMMY1, DUMMY2, DUMMY3, PLD_FLAG
208 ldrb \DST_REG, [SRC_Y], #1
209 ldrb TMP1, [SRC_U], #1
210 ldrb TMP2, [SRC_U], #1
211 ldrb TMP3, [SRC_Y], #1
/* Conditionally emit a pld for SRC_Y (the pld itself is not visible here) */
212 .if \USE_PLD && (\PLD_FLAG == 1)
215 add \DST_REG, \DST_REG, TMP1, lsl #8
216 add \DST_REG, \DST_REG, TMP2, lsl #16
217 add \DST_REG, \DST_REG, TMP3, lsl #24
220 /* Prepare the second 32-bit output value for 8 pixels macroblock */
221 .macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
222 ldrb \DST_REG, [SRC_Y, #1]
223 ldrb TMP1, [SRC_Y], #2
224 ldrb TMP2, [SRC_Y], #1
225 ldrb TMP3, [SRC_U], #1
226 add \DST_REG, \DST_REG, TMP1, lsl #8
227 add \DST_REG, \DST_REG, TMP2, lsl #16
228 add \DST_REG, \DST_REG, TMP3, lsl #24
231 /* Prepare the third 32-bit output value for 8 pixels macroblock */
232 .macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
233 ldrb \DST_REG, [SRC_U], #1
234 ldrb TMP1, [SRC_Y], #1
235 ldrb TMP2, [SRC_Y, #1]
236 ldrb TMP3, [SRC_Y], #2
237 add \DST_REG, \DST_REG, TMP1, lsl #8
238 add \DST_REG, \DST_REG, TMP2, lsl #16
239 add \DST_REG, \DST_REG, TMP3, lsl #24
/* Function body: save callee-saved r4-r8, r10 and lr (AAPCS) */
247 stmfd sp!, {r4-r8, r10, lr}
249 /* Destination buffer should be at least 16-bit aligned, image width should be multiple of 4 */
253 /* Ensure 32-bit alignment of the destination buffer */
258 CONVERT_4_PIXELS_MACROBLOCK
/*
 * Main unrolled loop: 32 pixels per iteration, emitted as three 16-byte
 * stmia bursts (48 output bytes -> 12 bits/pixel, consistent with YUV420).
 * The helpers are chained so each seeds the first byte of the next word.
 */
262 2: /* Convert 32 pixels per loop iteration */
263 CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 1 /* Also do cache preload for SRC_Y */
264 CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 1, 0
265 CONVERT_8_PIXELS_MACROBLOCK_3 r7, r8, 1, 1, 0
266 CONVERT_8_PIXELS_MACROBLOCK_1 r8, r5, 1, 1, 0
267 stmia DST!, {r4, r6, r7, r8}
271 CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
272 CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 1, 0
273 CONVERT_8_PIXELS_MACROBLOCK_1 r7, r8, 1, 1, 0
274 CONVERT_8_PIXELS_MACROBLOCK_2 r8, r4, 1, 1, 0
275 stmia DST!, {r5, r6, r7, r8}
277 /* Do cache preload for SRC_U */
280 CONVERT_8_PIXELS_MACROBLOCK_3 r4, r6, 1, 1, 0
281 CONVERT_8_PIXELS_MACROBLOCK_1 r6, r7, 1, 1, 0
282 CONVERT_8_PIXELS_MACROBLOCK_2 r7, r8, 1, 1, 0
283 CONVERT_8_PIXELS_MACROBLOCK_3 r8, r4, 1, 0, 0
284 stmia DST!, {r4, r6, r7, r8}
/* WIDTH bookkeeping; the conditional branches between labels are not visible here */
288 adds WIDTH, WIDTH, #32
291 subs WIDTH, WIDTH, #8
293 4: /* Convert remaining pixels processing them 8 per iteration */
294 CONVERT_8_PIXELS_MACROBLOCK_1 r4, r5, 0, 1, 0
295 CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
296 CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 0, 0
298 subs WIDTH, WIDTH, #8
300 5: /* Convert the last 4 pixels if needed */
301 adds WIDTH, WIDTH, #8
303 CONVERT_4_PIXELS_MACROBLOCK
306 6: /* Restore all registers and return */
307 ldmfd sp!, {r4-r8, r10, pc}
/* Discard the helper macros so the template can be expanded again
   without "macro redefined" assembler errors */
309 .purgem CONVERT_4_PIXELS_MACROBLOCK
310 .purgem CONVERT_8_PIXELS_MACROBLOCK_1
311 .purgem CONVERT_8_PIXELS_MACROBLOCK_2
312 .purgem CONVERT_8_PIXELS_MACROBLOCK_3
/* Expand the template into three concrete line-conversion functions,
   one per CPU capability level (args: name, USE_PLD, USE_ARMV6). */
326 YUV420_function_template yv12_to_yuv420_line_arm, 0, 0 @ generic ARM: no pld, no ARMv6 instructions
327 YUV420_function_template yv12_to_yuv420_line_armv5, 1, 0 @ ARMv5TE: pld cache preload enabled
328 YUV420_function_template yv12_to_yuv420_line_armv6, 1, 1 @ ARMv6: pld + ldrh/rev16 helper variants