#ifdef UNDER_CE
# include <tchar.h>
#endif
-
-#if defined (__APPLE__) || defined (HAVE_MAEMO)
-/* Define this if the OS always use UTF-8 internally */
-# define ASSUME_UTF8 1
-#endif
+#include <errno.h>
#if defined (ASSUME_UTF8)
/* Cool */
+
#elif defined (WIN32) || defined (UNDER_CE)
# define USE_MB2MB 1
-#elif defined (HAVE_ICONV)
-# define USE_ICONV 1
-#else
-# error No UTF8 charset conversion implemented on this platform!
-#endif
+# include <io.h>
-#if defined (USE_ICONV)
-# include <langinfo.h>
-static char charset[sizeof ("CSISO11SWEDISHFORNAMES")] = "";
-
-static void find_charset_once (void)
+static char *locale_dup (const char *string, bool from) /* Win32 variant: goes through wide characters; from=true converts CP_ACP to UTF-8, from=false converts UTF-8 to CP_ACP */
{
- strlcpy (charset, nl_langinfo (CODESET), sizeof (charset));
- if (!strcasecmp (charset, "ASCII")
- || !strcasecmp (charset, "ANSI_X3.4-1968"))
- strcpy (charset, "UTF-8"); /* superset... */
-}
+ char *out;
+ int len;
-static int find_charset (void)
-{
- static pthread_once_t once = PTHREAD_ONCE_INIT;
- pthread_once (&once, find_charset_once);
- return !strcasecmp (charset, "UTF-8");
+ len = 1 + MultiByteToWideChar (from ? CP_ACP : CP_UTF8,
+ 0, string, -1, NULL, 0); /* no output buffer given: this call only yields the length used for the allocation below */
+ wchar_t *wide = malloc (len * sizeof (wchar_t));
+ if (wide == NULL)
+ return NULL; /* allocation failure */
+
+ MultiByteToWideChar (from ? CP_ACP : CP_UTF8, 0, string, -1, wide, len);
+ len = 1 + WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1,
+ NULL, 0, NULL, NULL); /* same sizing pass, this time for the narrow result */
+ out = malloc (len);
+ if (out != NULL)
+ WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1, out, len,
+ NULL, NULL);
+ free (wide); /* the intermediate wide string is released on every path */
+ return out; /* malloc()ed string, or NULL if the second allocation failed */
}
-#endif
+#elif defined (HAVE_ICONV)
+# define USE_ICONV 1
-static char *locale_fast (const char *string, bool from)
+static char *locale_dup (const char *string, bool from)
{
- if( string == NULL )
- return NULL;
-
-#if defined (USE_ICONV)
- if (find_charset ())
- return (char *)string;
-
- vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : charset,
- from ? charset : "UTF-8");
+ vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : "",
+ from ? "" : "UTF-8");
if (hd == (vlc_iconv_t)(-1))
return NULL; /* Uho! */
assert (*optr == '\0');
assert (strlen (output) == (size_t)(optr - output));
return strdup (output);
-#elif defined (USE_MB2MB)
- char *out;
- int len;
-
- len = 1 + MultiByteToWideChar (from ? CP_ACP : CP_UTF8,
- 0, string, -1, NULL, 0);
- wchar_t *wide = malloc (len * sizeof (wchar_t));
- if (wide == NULL)
- return NULL;
-
- MultiByteToWideChar (from ? CP_ACP : CP_UTF8, 0, string, -1, wide, len);
- len = 1 + WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1,
- NULL, 0, NULL, NULL);
- out = malloc (len);
- if (out != NULL)
- WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1, out, len,
- NULL, NULL);
- free (wide);
- return out;
-#else
- (void)from;
- return (char *)string;
-#endif
}
-
-static inline char *locale_dup (const char *string, bool from)
-{
- assert( string );
-
-#if defined (USE_ICONV)
- if (find_charset ())
- return strdup (string);
- return locale_fast (string, from);
-#elif defined (USE_MB2MB)
- return locale_fast (string, from);
#else
- (void)from;
- return strdup (string);
+# error No UTF8 charset conversion implemented on this platform!
#endif
-}
+
/**
* Releases (if needed) a localized or uniformized string.
*/
void LocaleFree (const char *str)
{
-#if defined (USE_ICONV)
- if (!find_charset ())
- free ((char *)str);
-#elif defined (USE_MB2MB)
- free ((char *)str);
+#ifdef ASSUME_UTF8
+ (void) str; /* nothing to release: with ASSUME_UTF8, FromLocale() and ToLocale() return their argument itself */
#else
- (void)str;
+ free ((char *)str); /* without ASSUME_UTF8, FromLocale() and ToLocale() return a copy made by locale_dup() */
#endif
}
*/
char *FromLocale (const char *locale)
{
- return locale_fast (locale, true);
+#ifdef ASSUME_UTF8
+ return (char *)locale; /* no conversion: the caller gets its own pointer back */
+#else
+ return locale ? locale_dup (locale, true) : NULL; /* NULL is passed through; otherwise the result of locale_dup() with from=true */
+#endif
}
/**
*/
char *FromLocaleDup (const char *locale)
{
+#ifdef ASSUME_UTF8
+ return strdup (locale); /* plain copy, no conversion; unlike FromLocale() there is no NULL check here */
+#else
 return locale_dup (locale, true);
+#endif
}
*/
char *ToLocale (const char *utf8)
{
- return locale_fast (utf8, false);
+#ifdef ASSUME_UTF8
+ return (char *)utf8; /* no conversion: the caller gets its own pointer back */
+#else
+ return utf8 ? locale_dup (utf8, false) : NULL; /* NULL is passed through; otherwise the result of locale_dup() with from=false */
+#endif
}
*/
char *ToLocaleDup (const char *utf8)
{
+#ifdef ASSUME_UTF8
+ return strdup (utf8); /* plain copy, no conversion; unlike ToLocale() there is no NULL check here */
+#else
 return locale_dup (utf8, false);
-}
-
-/**
- * Formats an UTF-8 string as vasprintf(), then print it to stdout, with
- * appropriate conversion to local encoding.
- */
-static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
-{
- char *utf8;
- int res = vasprintf( &utf8, fmt, ap );
- if( res == -1 )
- return -1;
-
- *str = ToLocaleDup( utf8 );
- free( utf8 );
- return res;
+#endif
}
/**
*/
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
+#ifdef ASSUME_UTF8
+ return vfprintf (stream, fmt, ap); /* no conversion step in this configuration */
+#else
 char *str;
- int res = utf8_vasprintf( &str, fmt, ap );
- if( res == -1 )
+ int res;
+
+# if defined( WIN32 ) && !defined( UNDER_CE )
+ /* Writing to the console is a lot of fun on Microsoft Windows.
+ * If you use the standard I/O functions, you must use the OEM code page,
+ * which is different from the usual ANSI code page. Or maybe not, if the
+ * user called "chcp". Anyway, we prefer Unicode. */
+ int fd = _fileno (stream);
+ if (likely(fd != -1) && _isatty (fd))
+ {
+ res = vasprintf (&str, fmt, ap);
+ if (unlikely(res == -1))
+ return -1;
+
+ /* UTF-16 never needs more code units than UTF-8 needs bytes, so
+ * res + 1 wide characters (terminator included) always suffice. */
+ int wmax = res + 1;
+ wchar_t *wide = malloc (wmax * sizeof (wchar_t));
+ if (likely(wide != NULL))
+ {
+ /* The last argument is a count of wide characters, not of bytes. */
+ int wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1, wide, wmax);
+ if (wlen > 0)
+ {
+ HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
+ DWORD out;
+
+ /* wlen counts the terminating nul, which must not be written. */
+ WriteConsoleW (h, wide, wlen - 1, &out, NULL);
+ }
+ else
+ res = -1;
+ free (wide);
+ }
+ else
+ res = -1;
+ free (str);
+ return res;
+ }
+# endif
+
+ res = vasprintf (&str, fmt, ap);
+ if (unlikely(res == -1))
 return -1;
- fputs( str, stream );
- free( str );
+ char *ansi = ToLocaleDup (str);
+ free (str);
+ if (unlikely(ansi == NULL))
+ return -1; /* conversion failed: fputs() must not be handed NULL */
+
+ fputs (ansi, stream);
+ free (ansi);
 return res;
+#endif
}
/**
for (;;)
{
uint8_t c = ptr[0];
- int charlen = -1;
if (c == '\0')
break;
- for (int i = 0; i < 7; i++)
- if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1))
- {
- charlen = i;
- break;
- }
+ if (c > 0xF4)
+ goto error;
+ int charlen = clz8 (c ^ 0xFF);
switch (charlen)
{
case 0: // 7-bit ASCII character -> OK
ptr++;
continue;
- case -1: // 1111111x -> error
case 1: // continuation byte -> error
goto error;
}
- assert (charlen >= 2);
+ assert (charlen >= 2 && charlen <= 4);
uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
for (int i = 1; i < charlen; i++)
assert (cp < (1 << 26));
c = ptr[i];
- if ((c == '\0') // unexpected end of string
- || ((c >> 6) != 2)) // not a continuation byte
+ if ((c >> 6) != 2) // not a continuation byte
goto error;
cp = (cp << 6) | (ptr[i] & 0x3f);
}
- if (cp < 128) // overlong (special case for ASCII)
- goto error;
- if (cp < (1u << (5 * charlen - 3))) // overlong
- goto error;
-
+ /* Range checks on the decoded code point; each case deliberately falls
+ * through so that longer sequences also get the checks of shorter ones. */
+ switch (charlen)
+ {
+ case 4:
+ if (cp > 0x10FFFF) // beyond Unicode
+ goto error;
+ /* fall through */
+ case 3:
+ if (cp >= 0xD800 && cp < 0xE000) // UTF-16 surrogate (U+D800..U+DFFF)
+ goto error;
+ /* fall through */
+ case 2:
+ if (cp < 128) // ASCII overlong
+ goto error;
+ /* Smallest code point that needs charlen bytes: 0x800 (1 << 11)
+ * for 3 bytes, 0x10000 (1 << 16) for 4 bytes. For 2 bytes the
+ * formula gives 64, so the ASCII test above is the effective one. */
+ if (cp < (1u << (5 * charlen - 4))) // overlong
+ goto error;
+ }
ptr += charlen;
continue;
{
return CheckUTF8( (char *)str, 0 );
}
+
+/**
+ * Converts a string from the given character encoding to utf-8.
+ *
+ * @param charset name of the source encoding, handed to vlc_iconv_open()
+ * @param data input bytes in that encoding
+ * @param data_size number of input bytes
+ *
+ * @return a nul-terminated utf-8 string, or null in case of error.
+ * The result must be freed using free().
+ */
+char *FromCharset(const char *charset, const void *data, size_t data_size)
+{
+ vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
+ if (handle == (vlc_iconv_t)(-1))
+ return NULL;
+
+ char *out = NULL;
+ /* Try output buffers of 4 to 7 times the input size, moving on to the
+ * next size only when the conversion fails with E2BIG. */
+ for(unsigned mul = 4; mul < 8; mul++ )
+ {
+ size_t in_size = data_size;
+ const char *in = data;
+ size_t out_max = mul * data_size;
+ char *tmp = out = malloc (1 + out_max); /* one extra byte for the nul */
+ if (!out)
+ break;
+
+ if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
+ *tmp = '\0'; /* tmp was advanced past the converted bytes */
+ break;
+ }
+ /* Save errno first: free() is allowed to overwrite it. */
+ int err = errno;
+ free(out);
+ out = NULL;
+
+ if (err != E2BIG)
+ break;
+ }
+ vlc_iconv_close(handle);
+ return out;
+}
+