 * ARM assembly optimized color format conversion functions
 * (YV12 -> YUY2, YV12 -> some custom YUV420 format used by
 * Epson graphics chip in Nokia N800)
 *
 * Copyright (C) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 /*******************************************************/
/*
 * yv12_to_yuy2_line_arm -- convert one line of planar YV12 (separate Y, U, V
 * planes) into packed YUY2.
 *
 * NOTE(review): this excerpt is incomplete — the register aliases
 * SRC_Y/SRC_U/SRC_V/TMP1-TMP3, the instructions that seed the low byte of
 * r4-r7 (presumably ldrb from SRC_Y), the stores of the packed words to the
 * destination, and the loop labels/branches are not visible here; confirm
 * against the full file before relying on the details below.
 */
27 .global yv12_to_yuy2_line_arm
28 .func yv12_to_yuy2_line_arm
29 yv12_to_yuy2_line_arm:
/* AAPCS prologue: preserve callee-saved r4-r8 plus lr (restored into pc on exit) */
38 stmfd sp!, {r4-r8, lr}
/*
 * Pack two pixels into r4: r4 += U<<8 | Y<<16 | V<<24.  With a Y byte in the
 * low position (loaded by an instruction not visible in this excerpt) the
 * little-endian memory layout becomes Y0 U Y1 V -- one YUY2 macropixel.
 */
50 ldrb TMP1, [SRC_U], #1
51 ldrb TMP2, [SRC_Y], #1
52 ldrb TMP3, [SRC_V], #1
53 add r4, r4, TMP1, lsl #8
54 add r4, r4, TMP2, lsl #16
55 add r4, r4, TMP3, lsl #24
/* Same packing for the next macropixel, accumulated in r5 */
58 ldrb TMP1, [SRC_U], #1
59 ldrb TMP2, [SRC_Y], #1
60 ldrb TMP3, [SRC_V], #1
61 add r5, r5, TMP1, lsl #8
62 add r5, r5, TMP2, lsl #16
63 add r5, r5, TMP3, lsl #24
/* ...and in r6 */
66 ldrb TMP1, [SRC_U], #1
67 ldrb TMP2, [SRC_Y], #1
68 ldrb TMP3, [SRC_V], #1
69 add r6, r6, TMP1, lsl #8
70 add r6, r6, TMP2, lsl #16
71 add r6, r6, TMP3, lsl #24
/* ...and in r7 -- four words (8 pixels) per unrolled group */
74 ldrb TMP1, [SRC_U], #1
75 ldrb TMP2, [SRC_Y], #1
76 ldrb TMP3, [SRC_V], #1
77 add r7, r7, TMP1, lsl #8
78 add r7, r7, TMP2, lsl #16
79 add r7, r7, TMP3, lsl #24
/* Tail path: pack one more macropixel into r4 (store not visible in excerpt) */
89 ldrb TMP1, [SRC_U], #1
90 ldrb TMP2, [SRC_Y], #1
91 ldrb TMP3, [SRC_V], #1
92 add r4, r4, TMP1, lsl #8
93 add r4, r4, TMP2, lsl #16
94 add r4, r4, TMP3, lsl #24
/* Epilogue: restore r4-r8 and return by loading saved lr into pc */
99 ldmfd sp!, {r4-r8, pc}
112 /*******************************************************/
/*
 * YUV420_function_template -- assembler-time template that expands into one
 * complete YV12 -> custom-YUV420 line-conversion function.
 *
 * Parameters:
 *   function_name -- symbol name of the generated function
 *   USE_PLD       -- nonzero: emit cache-preload (pld) hints (ARMv5TE+)
 *   USE_ARMV6     -- nonzero: use the ARMv6 variants of the helper macros
 *                    (16-bit ldrh loads + rev16 byte fixup)
 *
 * NOTE(review): this excerpt is incomplete -- the .endm terminators, the
 * .if \USE_ARMV6 / .else / .endif guards that must separate the two sets of
 * CONVERT_8_PIXELS_MACROBLOCK_* definitions below, the pld instructions, the
 * function label, and most branches are not visible here; the leaked
 * line-number gaps mark the missing lines.  Confirm against the full file.
 */
122 .macro YUV420_function_template function_name, USE_PLD, USE_ARMV6
125 .global \function_name
129 /* Read information about 4 pixels, convert them to YUV420 and store into 6 bytes using 16-bit writes */
130 .macro CONVERT_4_PIXELS_MACROBLOCK
132 ldrb TMP1, [SRC_U], #1
134 ldrb TMP2, [SRC_Y], #1
136 ldrb TMP3, [SRC_Y], #2
137 add r4, r4, TMP1, lsl #8
138 add r5, r5, TMP2, lsl #8
139 add r6, r6, TMP3, lsl #8
/*
 * ARMv6 variants of the three 8-pixel helpers: use 16-bit Y loads (ldrh) and
 * rev16 to restore byte order.  DST_REG1 receives the finished 32-bit output
 * word; DST_REG2 is pre-seeded for the next helper in the chain.  The
 * FLAG1/FLAG2/PLD_FLAG conditional uses are in lines missing from this view.
 */
147 .macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG1, DST_REG2, FLAG1, FLAG2, PLD_FLAG
149 ldrb \DST_REG1, [SRC_U], #1
150 ldrh TMP1, [SRC_Y], #2
151 ldrb TMP2, [SRC_U], #1
154 ldrh \DST_REG2, [SRC_Y], #2
159 add \DST_REG1, \DST_REG1, TMP1, lsl #8
160 add \DST_REG1, \DST_REG1, TMP2, lsl #24
162 ldrb TMP1, [SRC_U], #1
163 ldrb TMP2, [SRC_Y], #1
/* rev16: ARMv6 -- swap bytes within each halfword to fix output order */
165 rev16 \DST_REG1, \DST_REG1
168 .macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
170 ldrh \DST_REG1, [SRC_Y], #2
171 ldrb TMP1, [SRC_U], #1
172 ldrb TMP2, [SRC_Y], #1
175 ldrb \DST_REG2, [SRC_Y], #1
177 add \DST_REG1, \DST_REG1, TMP1, lsl #16
178 add \DST_REG1, \DST_REG1, TMP2, lsl #24
180 ldrb TMP1, [SRC_U], #1
181 ldrh TMP2, [SRC_Y], #2
183 rev16 \DST_REG1, \DST_REG1
186 .macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
188 ldrb \DST_REG1, [SRC_Y], #1
189 ldrb TMP1, [SRC_U], #1
190 ldrh TMP2, [SRC_Y], #2
193 ldrb \DST_REG2, [SRC_U], #1
195 add \DST_REG1, \DST_REG1, TMP1, lsl #8
196 add \DST_REG1, \DST_REG1, TMP2, lsl #16
198 ldrh TMP1, [SRC_Y], #2
199 ldrb TMP2, [SRC_U], #1
201 rev16 \DST_REG1, \DST_REG1
/*
 * Pre-ARMv6 variants: byte-wide loads only, no rev16 needed because each
 * byte is placed at its final shift position directly.
 */
206 /* Prepare the first 32-bit output value for 8 pixels macroblock */
207 .macro CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG, DUMMY1, DUMMY2, DUMMY3, PLD_FLAG
208 ldrb \DST_REG, [SRC_Y], #1
209 ldrb TMP1, [SRC_U], #1
210 ldrb TMP2, [SRC_U], #1
211 ldrb TMP3, [SRC_Y], #1
/* Conditionally emit a pld for SRC_Y (the pld itself is not visible here) */
212 .if \USE_PLD && (\PLD_FLAG == 1)
215 add \DST_REG, \DST_REG, TMP1, lsl #8
216 add \DST_REG, \DST_REG, TMP2, lsl #16
217 add \DST_REG, \DST_REG, TMP3, lsl #24
220 /* Prepare the second 32-bit output value for 8 pixels macroblock */
221 .macro CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
222 ldrb \DST_REG, [SRC_Y, #1]
223 ldrb TMP1, [SRC_Y], #2
224 ldrb TMP2, [SRC_Y], #1
225 ldrb TMP3, [SRC_U], #1
226 add \DST_REG, \DST_REG, TMP1, lsl #8
227 add \DST_REG, \DST_REG, TMP2, lsl #16
228 add \DST_REG, \DST_REG, TMP3, lsl #24
231 /* Prepare the third 32-bit output value for 8 pixels macroblock */
232 .macro CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
233 ldrb \DST_REG, [SRC_U], #1
234 ldrb TMP1, [SRC_Y], #1
235 ldrb TMP2, [SRC_Y, #1]
236 ldrb TMP3, [SRC_Y], #2
237 add \DST_REG, \DST_REG, TMP1, lsl #8
238 add \DST_REG, \DST_REG, TMP2, lsl #16
239 add \DST_REG, \DST_REG, TMP3, lsl #24
/* Function body: save callee-saved r4-r8, r10 and lr (AAPCS) */
247 stmfd sp!, {r4-r8, r10, lr}
249 /* Destination buffer should be at least 16-bit aligned, image width should be multiple of 4 */
253 /* Ensure 32-bit alignment of the destination buffer */
258 CONVERT_4_PIXELS_MACROBLOCK
/*
 * Main unrolled loop: 32 pixels per iteration, emitted as three 16-byte
 * stmia bursts (48 output bytes -> 12 bits/pixel, consistent with YUV420).
 * The helpers are chained so each seeds the first byte of the next word.
 */
262 2: /* Convert 32 pixels per loop iteration */
263 CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 1 /* Also do cache preload for SRC_Y */
264 CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 1, 0
265 CONVERT_8_PIXELS_MACROBLOCK_3 r7, r8, 1, 1, 0
266 CONVERT_8_PIXELS_MACROBLOCK_1 r8, r5, 1, 1, 0
267 stmia DST!, {r4, r6, r7, r8}
271 CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
272 CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 1, 0
273 CONVERT_8_PIXELS_MACROBLOCK_1 r7, r8, 1, 1, 0
274 CONVERT_8_PIXELS_MACROBLOCK_2 r8, r4, 1, 1, 0
275 stmia DST!, {r5, r6, r7, r8}
277 /* Do cache preload for SRC_U */
280 CONVERT_8_PIXELS_MACROBLOCK_3 r4, r6, 1, 1, 0
281 CONVERT_8_PIXELS_MACROBLOCK_1 r6, r7, 1, 1, 0
282 CONVERT_8_PIXELS_MACROBLOCK_2 r7, r8, 1, 1, 0
283 CONVERT_8_PIXELS_MACROBLOCK_3 r8, r4, 1, 0, 0
284 stmia DST!, {r4, r6, r7, r8}
/* WIDTH bookkeeping; the conditional branches between labels are not visible here */
288 adds WIDTH, WIDTH, #32
291 subs WIDTH, WIDTH, #8
293 4: /* Convert remaining pixels processing them 8 per iteration */
294 CONVERT_8_PIXELS_MACROBLOCK_1 r4, r5, 0, 1, 0
295 CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
296 CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 0, 0
298 subs WIDTH, WIDTH, #8
300 5: /* Convert the last 4 pixels if needed */
301 adds WIDTH, WIDTH, #8
303 CONVERT_4_PIXELS_MACROBLOCK
306 6: /* Restore all registers and return */
307 ldmfd sp!, {r4-r8, r10, pc}
/* Discard the helper macros so the template can be expanded again
   without "macro redefined" assembler errors */
309 .purgem CONVERT_4_PIXELS_MACROBLOCK
310 .purgem CONVERT_8_PIXELS_MACROBLOCK_1
311 .purgem CONVERT_8_PIXELS_MACROBLOCK_2
312 .purgem CONVERT_8_PIXELS_MACROBLOCK_3
/* Expand the template into three concrete line-conversion functions,
   one per CPU capability level (args: name, USE_PLD, USE_ARMV6). */
326 YUV420_function_template yv12_to_yuv420_line_arm, 0, 0 @ generic ARM: no pld, no ARMv6 instructions
327 YUV420_function_template yv12_to_yuv420_line_armv5, 1, 0 @ ARMv5TE: pld cache preload enabled
328 YUV420_function_template yv12_to_yuv420_line_armv6, 1, 1 @ ARMv6: pld + ldrh/rev16 helper variants