]> git.sesse.net Git - vlc/blob - src/misc/unicode.c
Support opening of non ANSI filenames on Win32
[vlc] / src / misc / unicode.c
1 /*****************************************************************************
2  * unicode.c: UTF8 <-> locale functions
3  *****************************************************************************
4  * Copyright (C) 2005 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Rémi Denis-Courmont <rem # videolan.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27 #include <vlc/vlc.h>
28 #include "charset.h"
29 #include <stdio.h>
30
/*****************************************************************************
 * vlc_fopen: Calls fopen() after conversion of file name to OS locale
 *****************************************************************************
 * filename: name of the file to open (decoded as UTF-8 on Win32)
 * mode:     regular fopen() mode string
 *
 * Returns the opened stream, or NULL if an argument is NULL, if the file
 * name (or mode) could not be converted, or if the file could not be opened.
 *****************************************************************************/
FILE *vlc_fopen( const char *filename, const char *mode )
{
    if( ( filename == NULL ) || ( mode == NULL ) )
        return NULL;

#if !defined WIN32 /*|| !defined UNICODE*/
    const char *local_name = ToLocale( filename );
    FILE *stream;

    if( local_name == NULL )
        return NULL;

    stream = fopen( local_name, mode );
    /* The converted name is only needed for the duration of the call */
    LocaleFree( local_name );
    return stream;
#else
    wchar_t wpath[MAX_PATH];
    wchar_t wmode[4];

    /* A source length of -1 makes MultiByteToWideChar() convert the
     * terminating NUL too, and that NUL is counted against the destination
     * capacity. Hence the full array sizes must be passed: reserving an
     * extra slot would reject every 3-character mode such as "rb+" and any
     * path that exactly fills the buffer. On overflow the call fails and
     * returns 0, so the buffers are always NUL-terminated on success. */
    if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath,
                              sizeof( wpath ) / sizeof( wpath[0] ) )
     || !MultiByteToWideChar( CP_ACP, 0, mode, -1, wmode,
                              sizeof( wmode ) / sizeof( wmode[0] ) ) )
        return NULL;

    return _wfopen( wpath, wmode );
#endif
}
57
/*****************************************************************************
 * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
 *****************************************************************************
 * The string is fixed in place and returned; NULL is returned for NULL.
 * Every byte that does not start a well-formed sequence is overwritten with
 * '?'. Control characters other than TAB, LF and CR are replaced as well.
 *
 * Not a to-do: converting Latin-1 to UTF-8 on the fly.
 * It cannot be done in place, since the UTF-8 form needs more space.
 *****************************************************************************/
#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))

/*
 * Returns the byte length (1 to 4) of the well-formed sequence starting at p,
 * or 0 if p[0] does not start one. Bytes are examined strictly in order and
 * the scan stops at the first mismatch, so the terminating NUL (which is
 * never a valid trailing byte) is never read past.
 */
static int utf8_valid_length( const unsigned char *p )
{
    const unsigned char lead = p[0];
    /* Allowed range of the second byte; narrowed for some lead bytes */
    unsigned char lo = 0x80, hi = 0xBF;
    int len, i;

    /* US-ASCII, 1 byte: printable range plus TAB, LF and CR */
    if( ( ( lead >= 0x20 ) && ( lead <= 0x7F ) )
     || ( lead == 0x09 ) || ( lead == 0x0A ) || ( lead == 0x0D ) )
        return 1;

    if( ( lead >= 0xC2 ) && ( lead <= 0xDF ) )
    {
        len = 2;
    }
    else if( ( lead >= 0xE0 ) && ( lead <= 0xEF ) )
    {
        len = 3;
        if( lead == 0xE0 )
            lo = 0xA0; /* anything lower would be an overlong encoding */
        else if( lead == 0xED )
            hi = 0x9F; /* anything higher would encode a surrogate */
    }
    else if( ( lead >= 0xF0 ) && ( lead <= 0xF4 ) )
    {
        len = 4;
        if( lead == 0xF0 )
            lo = 0x90; /* anything lower would be an overlong encoding */
        else if( lead == 0xF4 )
            hi = 0x8F; /* anything higher would exceed U+10FFFF */
    }
    else
        return 0; /* control char, stray trailing byte, 0xC0/0xC1, > 0xF4 */

    if( ( p[1] < lo ) || ( p[1] > hi ) )
        return 0;

    for( i = 2; i < len; i++ )
        if( !isutf8cont( p[i] ) )
            return 0;

    return len;
}

char *EnsureUTF8( char *str )
{
    unsigned char *ptr;

    if( str == NULL )
        return NULL;

    ptr = (unsigned char *)str;
    while( *ptr != '\0' )
    {
        const int len = utf8_valid_length( ptr );

        if( len > 0 )
            ptr += len; /* OK */
        else
            *ptr++ = '?'; /* invalid: replace this byte only and rescan */
    }

    return str;
}
205
/**********************************************************************
 * UTF32toUTF8: converts an array from UTF-32 to UTF-8
 **********************************************************************
 * src:    array of code points to convert
 * len:    number of elements in src (a terminator, if wanted in the
 *         output, must be included in len)
 * newlen: if not NULL, receives the number of bytes written
 *
 * Returns a malloc()'d buffer that the caller must free(), or NULL if
 * src is NULL or memory could not be allocated. The buffer is never
 * NULL on success, even when len is 0.
 *
 * NOTE: code points are not validated; surrogates and values above
 * U+10FFFF are encoded with the same bit layout as any other value.
 *********************************************************************/
char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
{
    /* Largest number of bytes emitted for one code point below */
    const size_t max_bytes = 4;
    char *res, *out, *shrunk;
    size_t used;

    if( src == NULL )
        return NULL;

    /* Refuse lengths whose worst-case output size would wrap around */
    if( len > ( (size_t)-1 ) / max_bytes )
        return NULL;

    /* Always request at least one byte: malloc(0) may legitimately return
     * NULL, which callers could not tell apart from a failure */
    out = res = malloc( ( len > 0 ) ? ( max_bytes * len ) : 1 );
    if( res == NULL )
        return NULL;

    for( ; len > 0; len-- )
    {
        const uint32_t uv = *src++;

        if( uv < 0x80 )
        {
            *out++ = (char)uv;
        }
        else if( uv < 0x800 )
        {
            *out++ = (char)(( uv >>  6)         | 0xc0);
            *out++ = (char)(( uv        & 0x3f) | 0x80);
        }
        else if( uv < 0x10000 )
        {
            *out++ = (char)(( uv >> 12)         | 0xe0);
            *out++ = (char)(((uv >>  6) & 0x3f) | 0x80);
            *out++ = (char)(( uv        & 0x3f) | 0x80);
        }
        else
        {
            *out++ = (char)(( uv >> 18)         | 0xf0);
            *out++ = (char)(((uv >> 12) & 0x3f) | 0x80);
            *out++ = (char)(((uv >>  6) & 0x3f) | 0x80);
            *out++ = (char)(( uv        & 0x3f) | 0x80);
        }
    }

    used = (size_t)( out - res );

    /* Give back the unused tail. realloc( p, 0 ) is skipped because it may
     * free the buffer; on failure the original (larger) buffer is still
     * valid, so it is kept instead of being leaked. */
    if( used > 0 )
    {
        shrunk = realloc( res, used );
        if( shrunk != NULL )
            res = shrunk;
    }

    if( newlen != NULL )
        *newlen = used;
    return res;
}
258
/**********************************************************************
 * FromUTF32: converts an UTF-32 string to UTF-8
 **********************************************************************
 * src must be terminated by a zero element.
 * The result must be free()'d. NULL on error (including a NULL src).
 *********************************************************************/
char *FromUTF32( const wchar_t *src )
{
    size_t len;

    if( src == NULL )
        return NULL;

    /* Determine the size of the string. Each element is compared as a
     * whole: inspecting only part of it could stop early on a non-zero
     * code point or run past the real terminator. The count starts at 1
     * so that the terminator is converted too and the result is
     * NUL-terminated. */
    for( len = 1; src[len - 1] != 0; len++ )
        ;

    return UTF32toUTF8( src, len, NULL );
}