/*
 * ARM assembly optimized color format conversion functions
 * (YV12 -> YUY2, YV12 -> some custom YUV420 format used by
 * the Epson graphics chip in the Nokia N800)
 *
 * Copyright (C) 2007 Siarhei Siamashka
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 */

        .text

/*******************************************************/

        .align
        .global yv12_to_yuy2_line_arm
        .func   yv12_to_yuy2_line_arm
yv12_to_yuy2_line_arm:
#define DST   r0
#define SRC_Y r1
#define SRC_U r2
#define SRC_V r3
#define WIDTH ip
        ldr     WIDTH, [sp]             /* fifth argument is passed on the stack */
        stmfd   sp!, {r4-r8, r10, lr}
/* WIDTH lives in ip (r12), so the temporaries must use other registers */
#define TMP1 r8
#define TMP2 r10
#define TMP3 lr
        bic     WIDTH, #1
        subs    WIDTH, #8
        blt     2f
1:      /* Convert 8 pixels per loop iteration; each 32-bit word packs two
           pixels as Y0 | U << 8 | Y1 << 16 | V << 24, which is the YUY2
           byte order when stored little-endian */
        ldrb    r4, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_V], #1
        add     r4, r4, TMP1, lsl #8
        add     r4, r4, TMP2, lsl #16
        add     r4, r4, TMP3, lsl #24
        ldrb    r5, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_V], #1
        add     r5, r5, TMP1, lsl #8
        add     r5, r5, TMP2, lsl #16
        add     r5, r5, TMP3, lsl #24
        ldrb    r6, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_V], #1
        add     r6, r6, TMP1, lsl #8
        add     r6, r6, TMP2, lsl #16
        add     r6, r6, TMP3, lsl #24
        ldrb    r7, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_V], #1
        add     r7, r7, TMP1, lsl #8
        add     r7, r7, TMP2, lsl #16
        add     r7, r7, TMP3, lsl #24
        stmia   DST!, {r4-r7}
        subs    WIDTH, WIDTH, #8
        bge     1b
2:      adds    WIDTH, WIDTH, #8
        ble     4f
3:      /* Convert the remaining pixels two at a time */
        ldrb    r4, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_V], #1
        add     r4, r4, TMP1, lsl #8
        add     r4, r4, TMP2, lsl #16
        add     r4, r4, TMP3, lsl #24
        str     r4, [DST], #4
        subs    WIDTH, WIDTH, #2
        bgt     3b
4:      ldmfd   sp!, {r4-r8, r10, pc}
#undef DST
#undef SRC_Y
#undef SRC_U
#undef SRC_V
#undef WIDTH
#undef TMP1
#undef TMP2
#undef TMP3
        .endfunc

/*******************************************************/

#define DST   r0
#define SRC_Y r1
#define SRC_U r2
#define WIDTH r3
#define TMP1 r10
#define TMP2 r12
#define TMP3 lr

.macro YUV420_function_template function_name, USE_PLD, USE_ARMV6
        .align
        .global \function_name
        .func   \function_name
\function_name:

/*
 * Read four pixels, convert them to the custom YUV420 layout and
 * store them into 6 bytes using three 16-bit writes
 */
.macro CONVERT_4_PIXELS_MACROBLOCK
        ldrb    r4, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    r5, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
        ldrb    r6, [SRC_Y, #1]
        ldrb    TMP3, [SRC_Y], #2
        add     r4, r4, TMP1, lsl #8
        add     r5, r5, TMP2, lsl #8
        add     r6, r6, TMP3, lsl #8
        strh    r4, [DST], #2
        strh    r5, [DST], #2
        strh    r6, [DST], #2
.endm
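/*
 * Byte layout of the custom YUV420 output stream, as implied by the
 * load offsets in the helpers below (12 output bytes per 8 input
 * pixels, C0..C3 being chroma samples taken from SRC_U, one per two
 * pixels):
 *
 *   Y0 C0 C1 Y1 | Y3 Y2 Y4 C2 | C3 Y5 Y7 Y6
 *
 * The ARMv6 helper variants assemble the same stream with halfword
 * LDRH loads plus a final REV16, which swaps the bytes within each
 * 16-bit half of the packed word (the LDRH accesses land on even
 * offsets, so they stay aligned as long as the luma pointer is 16-bit
 * aligned). Each helper's FLAG2 section already issues the first loads
 * for the next helper in the chain, so a load result is never consumed
 * by the immediately following instruction; this hides load-use stalls
 * on ARMv6 cores such as the ARM1136 in the N800.
 */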
.if \USE_ARMV6

.macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG1, DST_REG2, FLAG1, FLAG2, PLD_FLAG
.if \FLAG1 == 0
        ldrb    \DST_REG1, [SRC_U], #1
        ldrh    TMP1, [SRC_Y], #2
        ldrb    TMP2, [SRC_U], #1
.endif
.if \FLAG2 == 1
        ldrh    \DST_REG2, [SRC_Y], #2
.endif
.if \PLD_FLAG == 1
        pld     [SRC_Y, #48]
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #8
        add     \DST_REG1, \DST_REG1, TMP2, lsl #24
.if \FLAG2 == 1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
.endif
        rev16   \DST_REG1, \DST_REG1
.endm

.macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
        ldrh    \DST_REG1, [SRC_Y], #2
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_Y], #1
.endif
.if \FLAG2 == 1
        ldrb    \DST_REG2, [SRC_Y], #1
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #16
        add     \DST_REG1, \DST_REG1, TMP2, lsl #24
.if \FLAG2 == 1
        ldrb    TMP1, [SRC_U], #1
        ldrh    TMP2, [SRC_Y], #2
.endif
        rev16   \DST_REG1, \DST_REG1
.endm

.macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
        ldrb    \DST_REG1, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrh    TMP2, [SRC_Y], #2
.endif
.if \FLAG2 == 1
        ldrb    \DST_REG2, [SRC_U], #1
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #8
        add     \DST_REG1, \DST_REG1, TMP2, lsl #16
.if \FLAG2 == 1
        ldrh    TMP1, [SRC_Y], #2
        ldrb    TMP2, [SRC_U], #1
.endif
        rev16   \DST_REG1, \DST_REG1
.endm

.else

/* Prepare the first 32-bit output value for an 8 pixels macroblock */
.macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG, DUMMY1, DUMMY2, DUMMY3, PLD_FLAG
        ldrb    \DST_REG, [SRC_Y], #1
        ldrb    TMP1, [SRC_U], #1
        ldrb    TMP2, [SRC_U], #1
        ldrb    TMP3, [SRC_Y], #1
.if \USE_PLD && (\PLD_FLAG == 1)
        pld     [SRC_Y, #48]
.endif
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

/* Prepare the second 32-bit output value for an 8 pixels macroblock */
.macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
        ldrb    \DST_REG, [SRC_Y, #1]
        ldrb    TMP1, [SRC_Y], #2
        ldrb    TMP2, [SRC_Y], #1
        ldrb    TMP3, [SRC_U], #1
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

/* Prepare the third 32-bit output value for an 8 pixels macroblock */
.macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
        ldrb    \DST_REG, [SRC_U], #1
        ldrb    TMP1, [SRC_Y], #1
        ldrb    TMP2, [SRC_Y, #1]
        ldrb    TMP3, [SRC_Y], #2
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

.endif

.if \USE_PLD
        pld     [SRC_Y]
.endif
        stmfd   sp!, {r4-r8, r10, lr}
        /*
         * The destination buffer should be at least 16-bit aligned,
         * and the image width should be a multiple of 4
         */
        bic     DST, #1
        bic     WIDTH, #3
        /* Ensure 32-bit alignment of the destination buffer */
        tst     DST, #2
        beq     1f
        subs    WIDTH, #4
        blt     6f
        CONVERT_4_PIXELS_MACROBLOCK
1:
        subs    WIDTH, #32
        blt     3f
2:      /* Convert 32 pixels per loop iteration */
        CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 1 /* also do cache preload for SRC_Y */
        CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r8, r5, 1, 1, 0
        stmia   DST!, {r4, r6, r7, r8}
        subs    WIDTH, #32
        CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r8, r4, 1, 1, 0
        stmia   DST!, {r5, r6, r7, r8}
.if \USE_PLD
        /* Do cache preload for SRC_U */
        pld     [SRC_U, #48]
.endif
        CONVERT_8_PIXELS_MACROBLOCK_3 r4, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r8, r4, 1, 0, 0
        stmia   DST!, {r4, r6, r7, r8}
        bge     2b
3:
        adds    WIDTH, WIDTH, #32
        ble     6f
        subs    WIDTH, WIDTH, #8
        blt     5f
4:      /* Convert remaining pixels processing them 8 per iteration */
        CONVERT_8_PIXELS_MACROBLOCK_1 r4, r5, 0, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 0, 0
        stmia   DST!, {r4-r6}
        subs    WIDTH, WIDTH, #8
        bge     4b
5:      /* Convert the last 4 pixels if needed */
        adds    WIDTH, WIDTH, #8
        ble     6f
        CONVERT_4_PIXELS_MACROBLOCK
        subs    WIDTH, #4
        bgt     4b
6:      /* Restore all registers and return */
        ldmfd   sp!, {r4-r8, r10, pc}

.purgem CONVERT_4_PIXELS_MACROBLOCK
.purgem CONVERT_8_PIXELS_MACROBLOCK_1
.purgem CONVERT_8_PIXELS_MACROBLOCK_2
.purgem CONVERT_8_PIXELS_MACROBLOCK_3

#undef DST
#undef SRC_Y
#undef SRC_U
#undef WIDTH
#undef TMP1
#undef TMP2
#undef TMP3
        .endfunc
.endm
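/*
 * Instantiate the three variants of the YV12 -> custom YUV420 line
 * converter. The two flags are the USE_PLD and USE_ARMV6 template
 * parameters:
 *
 *   yv12_to_yuv420_line_arm   - baseline ARM code, no cache preload
 *   yv12_to_yuv420_line_armv5 - adds PLD prefetch hints (ARMv5TE and up)
 *   yv12_to_yuv420_line_armv6 - PLD plus the LDRH/REV16 path (ARMv6 and up)
 */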
YUV420_function_template yv12_to_yuv420_line_arm,   0, 0
YUV420_function_template yv12_to_yuv420_line_armv5, 1, 0
YUV420_function_template yv12_to_yuv420_line_armv6, 1, 1
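/*
 * The C-side declarations are not part of this file; the sketch below
 * is inferred from the register usage above (arguments in r0-r3, the
 * YUY2 converter taking its fifth argument on the stack, as per the
 * AAPCS). The stride parameter names and the frame-level wrapper are
 * hypothetical.
 *
 *   #include <stdint.h>
 *
 *   void yv12_to_yuy2_line_arm(uint8_t *dst, const uint8_t *src_y,
 *                              const uint8_t *src_u, const uint8_t *src_v,
 *                              int width);
 *   void yv12_to_yuv420_line_arm(uint8_t *dst, const uint8_t *src_y,
 *                                const uint8_t *src_u, int width);
 *   // The _armv5 and _armv6 variants share the yv12_to_yuv420 signature.
 *
 *   // Hypothetical YUY2 frame loop: YV12 chroma is subsampled 2x
 *   // vertically, so each U/V line is reused for two luma lines.
 *   static void yv12_to_yuy2_frame(uint8_t *dst, int dst_stride,
 *                                  const uint8_t *y, int y_stride,
 *                                  const uint8_t *u, const uint8_t *v,
 *                                  int uv_stride, int width, int height)
 *   {
 *       for (int i = 0; i < height; i++) {
 *           yv12_to_yuy2_line_arm(dst + i * dst_stride,
 *                                 y + i * y_stride,
 *                                 u + (i / 2) * uv_stride,
 *                                 v + (i / 2) * uv_stride,
 *                                 width);
 *       }
 *   }
 */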