git.sesse.net Git - vlc/blob - src/misc/unicode.c

   1 /*****************************************************************************
   2  * unicode.c: UTF8 <-> locale functions
   3  *****************************************************************************
   4  * Copyright (C) 2005 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Rémi Denis-Courmont <rem # videolan.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27 #include <vlc/vlc.h>
  28 #include "charset.h"
  29 #include <stdio.h>
  30
  31 /*****************************************************************************
  32  * vlc_fopen: Calls fopen() after conversion of file name to OS locale
  33  *****************************************************************************/
  34 FILE *vlc_fopen( const char *filename, const char *mode )
  35 {
  36 #if !defined WIN32 /*|| !defined UNICODE*/
  37     const char *local_name = ToLocale( filename );
  38
  39     if( local_name != NULL )
  40     {
  41         FILE *stream = fopen( local_name, mode );
  42         LocaleFree( local_name );
  43         return stream;
  44     }
  45     return NULL;
  46 #else
  47     wchar_t wpath[MAX_PATH];
  48     wchar_t wmode[4];
  49
  50    if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH - 1)
  51     || !MultiByteToWideChar( CP_ACP, 0, mode, -1, wmode, 3 ) )
  52         return NULL;
  53
  54     return _wfopen( wpath, wmode );
  55 #endif
  56 }
  57
  58 /*****************************************************************************
  59  * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
  60  *****************************************************************************
  61  * Not Todo : convert Latin1 to UTF-8 on the flu
  62  * It is not possible given UTF-8 needs more space
  63  *****************************************************************************/
  64 #define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
  65 char *EnsureUTF8( char *str )
  66 {
  67     unsigned char *ptr, c;
  68
  69     ptr = (unsigned char *)str;
  70     while( (c = *ptr) != '\0' )
  71     {
  72         /* US-ASCII, 1 byte */
  73         if( ( ( c >= 0x20 ) && ( c <= 0x7F ) )
  74          || ( c == 0x09 ) || ( c == 0x0A ) || ( c == 0x0D ) )
  75         {
  76             ptr++; /* OK */
  77         }
  78         else
  79         /* 2 bytes */
  80         if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
  81         {
  82             c = ptr[1];
  83             if( isutf8cont( c ) )
  84                 ptr += 2; /* OK */
  85             else
  86                 *ptr++ = '?'; /* invalid */
  87         }
  88         else
  89         /* 3 bytes */
  90         if( c == 0xE0 )
  91         {
  92             c = ptr[1];
  93             if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
  94             {
  95                 c = ptr[2];
  96                 if( isutf8cont( c ) )
  97                     ptr += 3; /* OK */
  98                 else
  99                     *ptr++ = '?';
 100             }
 101             else
 102                 *ptr++ = '?';
 103         }
 104         else
 105         if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
 106          || ( c == 0xEE ) || ( c == 0xEF ) )
 107         {
 108             c = ptr[1];
 109             if( isutf8cont( c ) )
 110             {
 111                 c = ptr[2];
 112                 if( isutf8cont( c ) )
 113                     ptr += 3; /* OK */
 114                 else
 115                     *ptr++ = '?';
 116             }
 117             else
 118                 *ptr++ = '?';
 119         }
 120         else
 121         if( c == 0xED )
 122         {
 123             c = ptr[1];
 124             if( ( c >= 0x80 ) && ( c <= 0x9F ) )
 125             {
 126                 c = ptr[2];
 127                 if( isutf8cont( c ) )
 128                     ptr += 3; /* OK */
 129                 else
 130                     *ptr++ = '?';
 131             }
 132             else
 133                 *ptr++ = '?';
 134         }
 135         else
 136         /* 4 bytes */
 137         if( c == 0xF0 )
 138         {
 139             c = ptr[1];
 140             if( ( c >= 0x90 ) && ( c <= 0xBF ) )
 141             {
 142                 c = ptr[2];
 143                 if( isutf8cont( c ) )
 144                 {
 145                     c = ptr[3];
 146                     if( isutf8cont( c ) )
 147                         ptr += 4; /* OK */
 148                     else
 149                         *ptr++ = '?';
 150                 }
 151                 else
 152                     *ptr++ = '?';
 153             }
 154             else
 155                 *ptr++ = '?';
 156         }
 157         else
 158         if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
 159         {
 160             c = ptr[1];
 161             if( isutf8cont( c ) )
 162             {
 163                 c = ptr[2];
 164                 if( isutf8cont( c ) )
 165                 {
 166                     c = ptr[3];
 167                     if( isutf8cont( c ) )
 168                         ptr += 4; /* OK */
 169                     else
 170                         *ptr++ = '?';
 171                 }
 172                 else
 173                     *ptr++ = '?';
 174             }
 175             else
 176                 *ptr++ = '?';
 177         }
 178         else
 179         if( c == 0xF4 )
 180         {
 181             c = ptr[1];
 182             if( ( c >= 0x80 ) && ( c <= 0x8F ) )
 183             {
 184                 c = ptr[2];
 185                 if( isutf8cont( c ) )
 186                 {
 187                     c = ptr[3];
 188                     if( isutf8cont( c ) )
 189                         ptr += 4; /* OK */
 190                     else
 191                         *ptr++ = '?';
 192                 }
 193                 else
 194                     *ptr++ = '?';
 195             }
 196             else
 197                 *ptr++ = '?';
 198         }
 199         else
 200             *ptr++ = '?';
 201     }
 202
 203     return str;
 204 }
 205
 206 /**********************************************************************
 207  * UTF32toUTF8: converts an array from UTF-32 to UTF-8
 208  *********************************************************************/
 209 char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
 210 {
 211     char *res, *out;
 212
 213     /* allocate memory */
 214     out = res = (char *)malloc( 6 * len );
 215     if( res == NULL )
 216         return NULL;
 217
 218     while( len > 0 )
 219     {
 220         uint32_t uv = *src++;
 221         len--;
 222
 223         if( uv < 0x80 )
 224         {
 225             *out++ = uv;
 226             continue;
 227         }
 228         else
 229         if( uv < 0x800 )
 230         {
 231             *out++ = (( uv >>  6)         | 0xc0);
 232             *out++ = (( uv        & 0x3f) | 0x80);
 233             continue;
 234         }
 235         else
 236         if( uv < 0x10000 )
 237         {
 238             *out++ = (( uv >> 12)         | 0xe0);
 239             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 240             *out++ = (( uv        & 0x3f) | 0x80);
 241             continue;
 242         }
 243         else
 244         {
 245             *out++ = (( uv >> 18)         | 0xf0);
 246             *out++ = (((uv >> 12) & 0x3f) | 0x80);
 247             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 248             *out++ = (( uv        & 0x3f) | 0x80);
 249             continue;
 250         }
 251     }
 252     len = out - res;
 253     res = realloc( res, len );
 254     if( newlen != NULL )
 255         *newlen = len;
 256     return res;
 257 }
 258
 259 /**********************************************************************
 260  * FromUTF32: converts an UTF-32 string to UTF-8
 261  **********************************************************************
 262  * The result must be free()'d. NULL on error.
 263  *********************************************************************/
 264 char *FromUTF32( const wchar_t *src )
 265 {
 266     size_t len;
 267     const wchar_t *in;
 268
 269     /* determine the size of the string */
 270     for( len = 1, in = src; GetWBE( in ); len++ )
 271         in++;
 272
 273     return UTF32toUTF8( src, len, NULL );
 274 }