1 /*****************************************************************************
2 * unicode.c: UTF8 <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005 the VideoLAN team
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
31 /*****************************************************************************
32 * vlc_fopen: Calls fopen() after conversion of file name to OS locale
33 *****************************************************************************/
/*
 * filename: path to open. On non-WIN32 builds it is passed through
 *           ToLocale(); on the wide-character path it is decoded as UTF-8
 *           (CP_UTF8).
 * mode:     fopen()-style mode string. It is handed unchanged to fopen(),
 *           or widened with CP_ACP before being given to _wfopen().
 *
 * The wide-character path returns the result of _wfopen(). The fopen() path
 * stores its result in `stream`.
 *
 * NOTE(review): the braces, the #else/#endif, the failure returns and the
 * declaration of wmode are not visible in this excerpt, so the error paths
 * are not described here.
 */
34 FILE *vlc_fopen( const char *filename, const char *mode )
/* Generic path: taken on every build where WIN32 is not defined. */
36 #if !defined WIN32 /*|| !defined UNICODE*/
37 const char *local_name = ToLocale( filename );
/* Only call fopen() when the conversion produced a name. */
39 if( local_name != NULL )
41 FILE *stream = fopen( local_name, mode );
/* The converted name is released as soon as fopen() has consumed it. */
42 LocaleFree( local_name );
/* Presumably the WIN32 branch starts here (the #else is not visible). */
/* Fixed-size buffer: the converted path is limited to MAX_PATH elements. */
47 wchar_t wpath[MAX_PATH];
/*
 * A source length of -1 tells MultiByteToWideChar() that the input is
 * NUL-terminated, so the terminator is converted as well. A return value
 * of 0 means the conversion failed.
 * NOTE(review): the mode buffer is limited to 3 wide characters including
 * the terminator, so a three-letter mode such as "rb+" would not fit --
 * confirm against the declaration of wmode.
 */
50 if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH - 1)
51 || !MultiByteToWideChar( CP_ACP, 0, mode, -1, wmode, 3 ) )
54 return _wfopen( wpath, wmode );
58 /*****************************************************************************
59 * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
60 *****************************************************************************
61 * Not Todo : convert Latin1 to UTF-8 on the fly
62 * It is not possible given UTF-8 needs more space
63 *****************************************************************************/
/*
 * isutf8cont: true when c lies in 0x80..0xBF, the range of a UTF-8
 * continuation byte. The argument is evaluated twice, so it must not have
 * side effects.
 */
64 #define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
/*
 * str: NUL-terminated buffer, scanned and patched in place. Offending bytes
 *      are overwritten with '?', so the string never changes length.
 *
 * NOTE(review): many lines of this function are missing from this excerpt
 * (braces, the ptr advances, several lead-byte tests and the return
 * statement). The comments below describe only the visible tests. Where a
 * range is attributed to a particular lead byte, that attribution follows
 * the UTF-8 well-formedness table of RFC 3629 and should be confirmed
 * against the complete file.
 */
65 char *EnsureUTF8( char *str )
67 unsigned char *ptr, c;
/* Work on unsigned bytes so the comparisons against 0x80 and above hold. */
69 ptr = (unsigned char *)str;
/* c always holds the byte currently under examination. */
70 while( (c = *ptr) != '\0' )
72 /* US-ASCII, 1 byte */
/* Accepts 0x20..0x7F (0x7F, DEL, included) plus TAB, LF and CR. */
73 if( ( ( c >= 0x20 ) && ( c <= 0x7F ) )
74 || ( c == 0x09 ) || ( c == 0x0A ) || ( c == 0x0D ) )
/*
 * Two-byte lead bytes. 0xC0 and 0xC1 are left out of the range; per
 * RFC 3629 they could only introduce overlong encodings.
 */
80 if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
/* Overwrite the offending byte and step past it, one byte at a time. */
86 *ptr++ = '?'; /* invalid */
/*
 * Restricted second-byte range 0xA0..0xBF. Presumably this follows lead
 * byte 0xE0, where lower values would be overlong (lead test not visible).
 */
93 if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
/*
 * Three-byte lead bytes whose continuation bytes are tested with
 * isutf8cont() below. 0xED is deliberately absent from this set.
 * NOTE(review): the ( c == 0xEC ) term is redundant, since 0xEC is already
 * covered by the 0xE1..0xEC range.
 */
105 if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
106 || ( c == 0xEE ) || ( c == 0xEF ) )
109 if( isutf8cont( c ) )
112 if( isutf8cont( c ) )
/*
 * Restricted second-byte range 0x80..0x9F. Presumably this follows lead
 * byte 0xED, keeping the UTF-16 surrogate range out (lead test not visible).
 */
124 if( ( c >= 0x80 ) && ( c <= 0x9F ) )
127 if( isutf8cont( c ) )
/*
 * Restricted second-byte range 0x90..0xBF. Presumably this follows lead
 * byte 0xF0, where lower values would be overlong (lead test not visible).
 */
140 if( ( c >= 0x90 ) && ( c <= 0xBF ) )
143 if( isutf8cont( c ) )
146 if( isutf8cont( c ) )
/* Four-byte lead bytes followed by three isutf8cont() checks. */
158 if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
161 if( isutf8cont( c ) )
164 if( isutf8cont( c ) )
167 if( isutf8cont( c ) )
/*
 * Restricted second-byte range 0x80..0x8F. Presumably this follows lead
 * byte 0xF4, capping code points at U+10FFFF (lead test not visible).
 */
182 if( ( c >= 0x80 ) && ( c <= 0x8F ) )
185 if( isutf8cont( c ) )
188 if( isutf8cont( c ) )
206 /**********************************************************************
207 * UTF32toUTF8: converts an array from UTF-32 to UTF-8
208 *********************************************************************/
/*
 * src:    input elements, read one wchar_t at a time.
 * len:    number of input elements; also used to size the output buffer.
 * newlen: FromUTF32() passes NULL here, so it is presumably optional; how
 *         it is written is not visible in this excerpt.
 *
 * Returns the buffer obtained from malloc() and then shrunk with realloc().
 *
 * NOTE(review): the loop header, the branch conditions, the one-byte case,
 * the NULL check after malloc() and the return statement are missing from
 * this excerpt.
 */
209 char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
213 /* allocate memory */
/*
 * Worst-case sizing at 6 bytes per input element. The encoders visible
 * below emit at most 4 bytes per element.
 * NOTE(review): 6 * len is not checked for overflow on any visible line.
 */
214 out = res = (char *)malloc( 6 * len );
/*
 * Each element is widened to 32 bits before encoding.
 * NOTE(review): this only carries full code points where wchar_t itself is
 * 32 bits wide -- confirm for the target platforms.
 */
220 uint32_t uv = *src++;
/* Two-byte form: 110xxxxx 10xxxxxx. */
231 *out++ = (( uv >> 6) | 0xc0);
232 *out++ = (( uv & 0x3f) | 0x80);
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
238 *out++ = (( uv >> 12) | 0xe0);
239 *out++ = (((uv >> 6) & 0x3f) | 0x80);
240 *out++ = (( uv & 0x3f) | 0x80);
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
245 *out++ = (( uv >> 18) | 0xf0);
246 *out++ = (((uv >> 12) & 0x3f) | 0x80);
247 *out++ = (((uv >> 6) & 0x3f) | 0x80);
248 *out++ = (( uv & 0x3f) | 0x80);
/*
 * Shrinks the buffer to len bytes. Presumably len was reassigned to the
 * number of bytes written on a line not visible here.
 * NOTE(review): the result overwrites res directly, so the original buffer
 * would be lost if realloc() failed.
 */
253 res = realloc( res, len );
259 /**********************************************************************
260 * FromUTF32: converts a UTF-32 string to UTF-8
261 **********************************************************************
262 * The result must be free()'d. NULL on error.
263 *********************************************************************/
/*
 * src: zero-terminated input string. Its length is measured here and the
 *      conversion itself is delegated to UTF32toUTF8().
 *
 * NOTE(review): the declarations of len and in, and the loop body that
 * advances in, are not visible in this excerpt.
 */
264 char *FromUTF32( const wchar_t *src )
269 /* determine the size of the string */
/*
 * len starts at 1, presumably so that the terminating element is counted
 * and converted too, leaving the output NUL-terminated.
 * GetWBE() is defined outside this excerpt.
 * NOTE(review): confirm that GetWBE() tests a whole wchar_t element.
 */
270 for( len = 1, in = src; GetWBE( in ); len++ )
/* No length is requested back, hence the NULL third argument. */
273 return UTF32toUTF8( src, len, NULL );