1 /*****************************************************************************
2 * unicode.c: Unicode <-> locale functions
3 *****************************************************************************
4 * Copyright (C) 2005-2006 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
7 * Authors: Rémi Denis-Courmont <rem # videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
31 #include <vlc_common.h>
34 #include <vlc_charset.h>
41 #include <sys/types.h>
49 * Formats a UTF-8 string as vfprintf() does, then prints it, with
50 * appropriate conversion to the local encoding.
52 int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
55 return vfprintf (stream, fmt, ap);
58 int res = vasprintf (&str, fmt, ap);
59 if (unlikely(res == -1))
62 /* Writing to the console is a lot of fun on Microsoft Windows.
63 * If you use the standard I/O functions, you must use the OEM code page,
64 * which is different from the usual ANSI code page. Or maybe not, if the
65 * user called "chcp". Anyway, we prefer Unicode. */
66 int fd = _fileno (stream);
67 if (likely(fd != -1) && _isatty (fd))
69 wchar_t *wide = ToWide (str);
70 if (likely(wide != NULL))
72 HANDLE h = (HANDLE)((uintptr_t)_get_osfhandle (fd));
74 /* XXX: It is not clear whether WriteConsole() wants the number of
75 * Unicode characters or the size of the wchar_t array. */
76 BOOL ok = WriteConsoleW (h, wide, wcslen (wide), &out, NULL);
83 char *ansi = ToANSI (str);
98 * Formats a UTF-8 string as fprintf() does, then prints it, with
99 * appropriate conversion to the local encoding.
101 int utf8_fprintf( FILE *stream, const char *fmt, ... )
107 res = utf8_vfprintf( stream, fmt, ap );
114 * Converts the first character from a UTF-8 sequence into a code point.
116 * @param str a UTF-8 byte sequence
117 * @return 0 if str points to an empty string, i.e. the first character is NUL;
118 * number of bytes that the first character occupies (from 1 to 4) otherwise;
119 * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
121 size_t vlc_towc (const char *str, uint32_t *restrict pwc)
123 uint8_t *ptr = (uint8_t *)str, c;
126 assert (str != NULL);
129 if (unlikely(c > 0xF4))
132 int charlen = clz8 (c ^ 0xFF);
135 case 0: // 7-bit ASCII character -> short cut
139 case 1: // continuation byte -> error
143 if (unlikely(c < 0xC2)) // ASCII overlong
145 cp = (c & 0x1F) << 6;
149 cp = (c & 0x0F) << 12;
153 cp = (c & 0x07) << 16;
160 /* Unrolled continuation bytes decoding */
165 if (unlikely((c >> 6) != 2)) // not a continuation byte
167 cp |= (c & 0x3f) << 12;
169 if (unlikely(cp >= 0x110000)) // beyond Unicode range
174 if (unlikely((c >> 6) != 2)) // not a continuation byte
176 cp |= (c & 0x3f) << 6;
178 if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
180 if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
185 if (unlikely((c >> 6) != 2)) // not a continuation byte
196 * Looks for a UTF-8 string within another one in a case-insensitive fashion.
197 * Beware that this is quite slow. Contrary to strcasestr(), this function
198 * works regardless of the system character encoding, and handles multibyte
199 * code points correctly.
201 * @param haystack string to look into
202 * @param needle string to look for
203 * @return a pointer to the first occurrence of the needle within the haystack,
204 * or NULL if no occurrence was found.
206 char *vlc_strcasestr (const char *haystack, const char *needle)
212 const char *h = haystack, *n = needle;
218 s = vlc_towc (n, &cpn);
220 return (char *)haystack;
225 s = vlc_towc (h, &cph);
226 if (s <= 0 || towlower (cph) != towlower (cpn))
231 s = vlc_towc (haystack, &(uint32_t) { 0 });
240 * Replaces invalid/overlong UTF-8 sequences with question marks.
241 * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
242 * so we don't try that, even though it would be less disruptive.
244 * @return str if it was valid UTF-8, NULL if not.
246 char *EnsureUTF8( char *str )
252 while ((n = vlc_towc (str, &cp)) != 0)
253 if (likely(n != (size_t)-1))
265 * Checks whether a string is a valid UTF-8 byte sequence.
267 * @param str nul-terminated string to be checked
269 * @return str if it was valid UTF-8, NULL if not.
271 const char *IsUTF8( const char *str )
276 while ((n = vlc_towc (str, &cp)) != 0)
277 if (likely(n != (size_t)-1))
285 * Converts a string from the given character encoding to UTF-8.
287 * @return a nul-terminated UTF-8 string, or NULL in case of error.
288 * The result must be freed using free().
290 char *FromCharset(const char *charset, const void *data, size_t data_size)
292 vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
293 if (handle == (vlc_iconv_t)(-1))
297 for(unsigned mul = 4; mul < 8; mul++ )
299 size_t in_size = data_size;
300 const char *in = data;
301 size_t out_max = mul * data_size;
302 char *tmp = out = malloc (1 + out_max);
306 if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
316 vlc_iconv_close(handle);
321 * Converts a nul-terminated UTF-8 string to a given character encoding.
322 * @param charset iconv name of the character set
323 * @param in nul-terminated UTF-8 string
324 * @param outsize pointer to hold the byte size of result
326 * @return A pointer to the result, which must be released using free().
327 * The UTF-8 nul terminator is included in the conversion if the target
328 * character encoding supports it. However it is not included in the returned
330 * In case of error, NULL is returned and the byte size is undefined.
332 void *ToCharset(const char *charset, const char *in, size_t *outsize)
334 vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
335 if (hd == (vlc_iconv_t)(-1))
338 const size_t inlen = strlen (in);
341 for (unsigned mul = 4; mul < 16; mul++)
343 size_t outlen = mul * (inlen + 1);
344 res = malloc (outlen);
345 if (unlikely(res == NULL))
348 const char *inp = in;
351 size_t outb = outlen - mul;
353 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
355 *outsize = outlen - mul - outb;
357 inb = 1; /* append nul terminator if possible */
358 if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
360 if (errno == EILSEQ) /* cannot translate nul terminator!? */
366 if (errno != E2BIG) /* conversion failure */
369 vlc_iconv_close (hd);