*****************************************************************************
* See also unicode.c for Unicode to locale conversion helpers.
*
- * Copyright (C) 2003-2006 the VideoLAN team
- * $Id$
+ * Copyright (C) 2003-2008 the VideoLAN team
*
- * Authors: Derk-Jan Hartman <thedj at users.sf.net>
- * Christophe Massiot
+ * Authors: Christophe Massiot
* Rémi Denis-Courmont
*
- * vlc_current_charset() an adaption of mp_locale_charset():
- *
- * Copyright (C) 2001-2003 The Mape Project
- * Written by Karel Zak <zakkr@zf.jcu.cz>.
- *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <vlc/vlc.h>
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
#if !defined WIN32
-# if HAVE_LANGINFO_CODESET
-# include <langinfo.h>
-# endif
# include <locale.h>
#else
# include <windows.h>
#endif
#ifdef __APPLE__
-# include <errno.h>
# include <string.h>
+# include <xlocale.h>
#endif
+#include "libvlc.h"
#include <vlc_charset.h>
-typedef struct VLCCharsetAlias
-{
- char *psz_alias, *psz_name;
-} VLCCharsetAlias;
-
-/*
- * The libcharset load all from external text file, but it's strange and
- * slow solution, we rather use array(s) compiled into source. In the
- * "good" libc this is not needful -- for example in linux.
- *
- * Please, put to this funtion exotic aliases only. The libc 'iconv' knows
- * a lot of basic aliases (check it first by iconv -l).
- *
- */
-#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
-static const char* vlc_encoding_from_language( const char *l )
-{
- /* check for language (and perhaps country) codes */
- if (strstr(l, "zh_TW")) return "Big5";
- if (strstr(l, "zh_HK")) return "Big5HKSCS"; /* no MIME charset */
- if (strstr(l, "zh")) return "GB2312";
- if (strstr(l, "th")) return "TIS-620";
- if (strstr(l, "ja")) return "EUC-JP";
- if (strstr(l, "ko")) return "EUC-KR";
- if (strstr(l, "ru")) return "KOI8-R";
- if (strstr(l, "uk")) return "KOI8-U";
- if (strstr(l, "pl") || strstr(l, "hr") ||
- strstr(l, "hu") || strstr(l, "cs") ||
- strstr(l, "sk") || strstr(l, "sl")) return "ISO-8859-2";
- if (strstr(l, "eo") || strstr(l, "mt")) return "ISO-8859-3";
- if (strstr(l, "lt") || strstr(l, "la")) return "ISO-8859-4";
- if (strstr(l, "bg") || strstr(l, "be") ||
- strstr(l, "mk") || strstr(l, "uk")) return "ISO-8859-5";
- if (strstr(l, "ar")) return "ISO-8859-6";
- if (strstr(l, "el")) return "ISO-8859-7";
- if (strstr(l, "he") || strstr(l, "iw")) return "ISO-8859-8";
- if (strstr(l, "tr")) return "ISO-8859-9";
- if (strstr(l, "th")) return "ISO-8859-11";
- if (strstr(l, "lv")) return "ISO-8859-13";
- if (strstr(l, "cy")) return "ISO-8859-14";
- if (strstr(l, "et")) return "ISO-8859-15"; /* all latin1 could be iso15 as well */
- if (strstr(l, "ro")) return "ISO-8859-2"; /* or ISO-8859-16 */
- if (strstr(l, "am") || strstr(l, "vi")) return "UTF-8";
- /* We don't know. This ain't working go to default. */
- return "ISO-8859-1";
-}
-#endif
-
-static const char* vlc_charset_aliases( const char *psz_name )
-{
- VLCCharsetAlias *a;
-
-#if defined WIN32
- VLCCharsetAlias aliases[] =
- {
- { "CP936", "GBK" },
- { "CP1361", "JOHAB" },
- { "CP20127", "ASCII" },
- { "CP20866", "KOI8-R" },
- { "CP21866", "KOI8-RU" },
- { "CP28591", "ISO-8859-1" },
- { "CP28592", "ISO-8859-2" },
- { "CP28593", "ISO-8859-3" },
- { "CP28594", "ISO-8859-4" },
- { "CP28595", "ISO-8859-5" },
- { "CP28596", "ISO-8859-6" },
- { "CP28597", "ISO-8859-7" },
- { "CP28598", "ISO-8859-8" },
- { "CP28599", "ISO-8859-9" },
- { "CP28605", "ISO-8859-15" },
- { NULL, NULL }
- };
-#elif defined (SYS_AIX)
- VLCCharsetAlias aliases[] =
- {
- { "IBM-850", "CP850" },
- { "IBM-856", "CP856" },
- { "IBM-921", "ISO-8859-13" },
- { "IBM-922", "CP922" },
- { "IBM-932", "CP932" },
- { "IBM-943", "CP943" },
- { "IBM-1046", "CP1046" },
- { "IBM-1124", "CP1124" },
- { "IBM-1129", "CP1129" },
- { "IBM-1252", "CP1252" },
- { "IBM-EUCCN", "GB2312" },
- { "IBM-EUCJP", "EUC-JP" },
- { "IBM-EUCKR", "EUC-KR" },
- { "IBM-EUCTW", "EUC-TW" },
- { NULL, NULL }
- };
-#elif defined (SYS_HPUX)
- VLCCharsetAlias aliases[] =
- {
- { "ROMAN8", "HP-ROMAN8" },
- { "ARABIC8", "HP-ARABIC8" },
- { "GREEK8", "HP-GREEK8" },
- { "HEBREW8", "HP-HEBREW8" },
- { "TURKISH8", "HP-TURKISH8" },
- { "KANA8", "HP-KANA8" },
- { "HP15CN", "GB2312" },
- { NULL, NULL }
- };
-#elif defined (SYS_IRIX)
- VLCCharsetAlias aliases[] =
- {
- { "EUCCN", "GB2312" },
- { NULL, NULL }
- };
-#elif defined (SYS_OSF)
- VLCCharsetAlias aliases[] =
- {
- { "KSC5601", "CP949" },
- { "SDECKANJI", "EUC-JP" },
- { "TACTIS", "TIS-620" },
- { NULL, NULL }
- };
-#elif defined (SYS_SOLARIS)
- VLCCharsetAlias aliases[] =
- {
- { "646", "ASCII" },
- { "CNS11643", "EUC-TW" },
- { "5601", "EUC-KR" },
- { "JOHAP92", "JOHAB" },
- { "PCK", "SHIFT_JIS" },
- { "2533", "TIS-620" },
- { NULL, NULL }
- };
-#elif defined (SYS_BSD)
- VLCCharsetAlias aliases[] =
- {
- { "646", " ASCII" },
- { "EUCCN", "GB2312" },
- { NULL, NULL }
- };
-#else
- VLCCharsetAlias aliases[] = {{NULL, NULL}};
-#endif
-
- for (a = aliases; a->psz_alias; a++)
- if (strcasecmp (a->psz_alias, psz_name) == 0)
- return a->psz_name;
-
- /* we return original name beacuse iconv() probably will know
- * something better about name if we don't know it :-) */
- return psz_name;
-}
-
-/* Returns charset from "language_COUNTRY.charset@modifier" string */
-#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
-static void vlc_encoding_from_locale( char *psz_locale, char *psz_charset )
-{
- char *psz_dot = strchr( psz_locale, '.' );
-
- if( psz_dot != NULL )
- {
- const char *psz_modifier;
-
- psz_dot++;
-
- /* Look for the possible @... trailer and remove it, if any. */
- psz_modifier = strchr( psz_dot, '@' );
-
- if( psz_modifier == NULL )
- {
- strcpy( psz_charset, psz_dot );
- return;
- }
- if( 0 < ( psz_modifier - psz_dot )
- && ( psz_modifier - psz_dot ) < 2 + 10 + 1 )
- {
- memcpy( psz_charset, psz_dot, psz_modifier - psz_dot );
- psz_charset[ psz_modifier - psz_dot ] = '\0';
- return;
- }
- }
- /* try language mapping */
- strcpy( psz_charset, vlc_encoding_from_language( psz_locale ) );
-}
-#endif
-
-vlc_bool_t vlc_current_charset( char **psz_charset )
-{
- const char *psz_codeset;
-
-#if !(defined WIN32 || defined OS2 || defined __APPLE__)
-
-# if HAVE_LANGINFO_CODESET
- /* Most systems support nl_langinfo( CODESET ) nowadays. */
- psz_codeset = nl_langinfo( CODESET );
- if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
- psz_codeset = "ASCII";
-# else
- /* On old systems which lack it, use setlocale or getenv. */
- const char *psz_locale = NULL;
- char buf[2 + 10 + 1];
-
- /* But most old systems don't have a complete set of locales. Some
- * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
- * use setlocale here; it would return "C" when it doesn't support the
- * locale name the user has set. Darwin's setlocale is broken. */
-# if HAVE_SETLOCALE && !__APPLE__
- psz_locale = setlocale( LC_ALL, NULL );
-# endif
- if( psz_locale == NULL || psz_locale[0] == '\0' )
- {
- psz_locale = getenv( "LC_ALL" );
- if( psz_locale == NULL || psz_locale[0] == '\0' )
- {
- psz_locale = getenv( "LC_CTYPE" );
- if( psz_locale == NULL || psz_locale[0] == '\0')
- psz_locale = getenv( "LANG" );
- }
- }
-
- /* On some old systems, one used to set locale = "iso8859_1". On others,
- * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
- vlc_encoding_from_locale( (char *)psz_locale, buf );
- psz_codeset = buf;
-# endif /* HAVE_LANGINFO_CODESET */
-
-#elif defined __APPLE__
-
- /* Darwin is always using UTF-8 internally. */
- psz_codeset = "UTF-8";
-
-#elif defined WIN32
-
- char buf[2 + 10 + 1];
-
- /* Woe32 has a function returning the locale's codepage as a number. */
- snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
- psz_codeset = buf;
-
-#elif defined OS2
-
- const char *psz_locale;
- char buf[2 + 10 + 1];
- ULONG cp[3];
- ULONG cplen;
-
- /* Allow user to override the codeset, as set in the operating system,
- * with standard language environment variables. */
- psz_locale = getenv( "LC_ALL" );
- if( psz_locale == NULL || psz_locale[0] == '\0' )
- {
- psz+locale = getenv( "LC_CTYPE" );
- if( psz_locale == NULL || locale[0] == '\0' )
- locale = getenv( "LANG" );
- }
- if( psz_locale != NULL && psz_locale[0] != '\0' )
- vlc_encoding_from_locale( psz_locale, buf );
- psz_codeset = buf;
- else
- {
- /* OS/2 has a function returning the locale's codepage as a number. */
- if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
- psz_codeset = "";
- else
- {
- snprintf( buf, sizeof( buf ), "CP%u", cp[0] );
- psz_codeset = buf;
- }
- }
-#endif
- if( psz_codeset == NULL )
- /* The canonical name cannot be determined. */
- psz_codeset = "";
- else
- psz_codeset = vlc_charset_aliases( psz_codeset );
-
- /* Don't return an empty string. GNU libc and GNU libiconv interpret
- * the empty string as denoting "the locale's character encoding",
- * thus GNU libiconv would call this function a second time. */
- if( psz_codeset[0] == '\0' )
- {
- /* Last possibility is 'CHARSET' enviroment variable */
- if( !( psz_codeset = getenv( "CHARSET" ) ) )
- psz_codeset = "ISO-8859-1";
- }
-
- if( psz_charset )
- *psz_charset = strdup(psz_codeset);
-
- if( !strcasecmp(psz_codeset, "UTF8") || !strcasecmp(psz_codeset, "UTF-8") )
- return VLC_TRUE;
-
- return VLC_FALSE;
-}
-
-
char *vlc_fix_readdir( const char *psz_string )
{
#ifdef __APPLE__
*psz_out = '\0';
return psz_utf8;
}
-#else
- return strdup( psz_string );
#endif
-}
-
-
-static inline int locale_match (const char *tab, const char *locale)
-{
- for (;*tab; tab += 2)
- if (memcmp (tab, locale, 2) == 0)
- return 0;
- return 1;
+ return strdup( psz_string );
}
/**
- * @return a fallback characters encoding to be used, given a locale.
+ * us_strtod() has the same prototype as ANSI C strtod() but it uses the
+ * POSIX/C decimal format, regardless of the current numeric locale.
*/
-static const char *FindFallbackEncoding (const char *locale)
+double us_strtod( const char *str, char **end )
{
- if ((locale == NULL) || (strlen (locale) < 2))
- return "ASCII";
-
-
- /*** The ISO-8859 series (anything but Asia) ***/
- // Latin-1 Western-European languages (ISO-8859-1)
- static const char western[] =
- "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
- "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
- "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
- "eo" "mt" "cy";
- if (!locale_match (western, locale))
- return "CP1252"; // Compatible Microsoft superset
-
- // Latin-2 Slavic languages (ISO-8859-2)
- static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
- if (!locale_match (slavic, locale))
- return "CP1250"; // CP1250 is more common, but incompatible
-
- // Latin-3 Southern European languages (ISO-8859-3)
- // "eo" and "mt" -> Latin-1 instead, I presume(?).
- // "tr" -> ISO-8859-9 instead
-
- // Latin-4 North-European languages (ISO-8859-4)
- // -> Latin-1 instead
-
- /* Cyrillic alphabet languages (ISO-8859-5) */
- static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
- if (!locale_match (cyrillic, locale))
- return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)
-
- /* Arabic (ISO-8859-6) */
- if (!locale_match ("ar", locale))
- // FIXME: someone check if we should return CP1256 or ISO-8859-6
- return "CP1256"; // CP1256 is(?) more common, but incompatible(?)
-
- /* Greek (ISO-8859-7) */
- if (!locale_match ("el", locale))
- // FIXME: someone check if we should return CP1253 or ISO-8859-7
- return "CP1253"; // CP1253 is(?) more common and less incompatible
-
- /* Hebrew (ISO-8859-8) */
- if (!locale_match ("he" "iw" "yi", locale))
- return "CP1255"; // Compatible Microsoft superset
+ locale_t loc = newlocale (LC_NUMERIC_MASK, "C", NULL);
+ locale_t oldloc = uselocale (loc);
+ double res = strtod (str, end);
- /* Latin-5 Turkish (ISO-8859-9) */
- if (!locale_match ("tr" "ku", locale))
- return "CP1254"; // Compatible Microsoft superset
-
- /* Latin-6 “North-European” languages (ISO-8859-10) */
- /* It is so much north European that glibc only uses that for Luganda
- * which is spoken in Uganda... unless someone complains, I'm not
- * using this one; let's fallback to CP1252 here. */
-
- // ISO-8859-11 does arguably not exist. Thai is handled below.
-
- // ISO-8859-12 really doesn't exist.
-
- // Latin-7 Baltic languages (ISO-8859-13)
- if (!locale_match ("lt" "lv" "mi", locale))
- // FIXME: mi = New Zealand, doesn't sound baltic!
- return "CP1257"; // Compatible Microsoft superset
-
- // Latin-8 Celtic languages (ISO-8859-14)
- // "cy" -> use Latin-1 instead (most likely English or French)
-
- // Latin-9 (ISO-8859-15) -> see Latin-1
-
- // Latin-10 (ISO-8859-16) does not seem to be used
-
- /*** KOI series ***/
- // For Russian, we use CP1251
- if (!locale_match ("uk", locale))
- return "KOI8-U";
-
- if (!locale_match ("tg", locale))
- return "KOI8-T";
-
- /*** Asia ***/
- // Japanese
- if (!locale_match ("jp", locale))
- return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP
-
- // Korean
- if (!locale_match ("ko", locale))
- return "EUC-KR";
-
- // Thai
- if (!locale_match ("th", locale))
- return "TIS-620";
-
- // Vietnamese (FIXME: more infos needed)
- if (!locale_match ("vt", locale))
- /* VISCII is probably a bad idea as it is not extended ASCII */
- /* glibc has TCVN5712-1 */
- return "CP1258";
-
- /* Kazakh (FIXME: more infos needed) */
- if (!locale_match ("kk", locale))
- return "PT154";
-
- // Chinese. The politically incompatible character sets.
- if (!locale_match ("zh", locale))
+ if (loc != (locale_t)0)
{
- if ((strlen (locale) >= 5) && (locale[2] != '_'))
- locale += 3;
-
- // Hong Kong
- if (!locale_match ("HK", locale))
- return "BIG5-HKSCS"; /* FIXME: use something else? */
-
- // Taiwan island
- if (!locale_match ("TW", locale))
- return "BIG5";
-
- // People's Republic of China and Singapore
- /*
- * GB18030 can represent any Unicode code point
- * (like UTF-8), while remaining compatible with GBK
- * FIXME: is it compatible with GB2312? if not, should we
- * use GB2312 instead?
- */
- return "GB18030";
+ uselocale (oldloc);
+ freelocale (loc);
}
-
- return "ASCII";
+ return res;
}
-/**
- * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
- * text files accord to the system's local settings. It is only a best
- * guess.
- */
-const char *GetFallbackEncoding( void )
-{
-#ifndef WIN32
- const char *psz_lang = NULL;
-
- /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale.
- * Therefore we don't use setlocale here; it would return "C". */
-# if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
- psz_lang = setlocale( LC_ALL, NULL );
-# endif
- if( psz_lang == NULL || psz_lang[0] == '\0' )
- {
- psz_lang = getenv( "LC_ALL" );
- if( psz_lang == NULL || psz_lang == '\0' )
- {
- psz_lang = getenv( "LC_CTYPE" );
- if( psz_lang == NULL || psz_lang[0] == '\0')
- psz_lang = getenv( "LANG" );
- }
- }
-
- return FindFallbackEncoding( psz_lang );
-#else
- /*
- * This should be thread-safe given GetACP() should always return
- * the same result.
- */
- static char buf[2 + 10 + 1] = "";
-
- if( buf[0] == 0 )
- snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
- return buf;
-#endif
-}
/**
- * There are two decimal separators in the computer world-wide locales:
- * dot (which is the american default), and comma (which is used in France,
- * the country with the most VLC developers, among others).
- *
- * i18n_strtod() has the same prototype as ANSI C strtod() but it accepts
- * either decimal separator when deserializing the string to a float number,
- * independant of the local computer setting.
+ * us_strtof() has the same prototype as ANSI C strtof() but it uses the
+ * POSIX/C decimal format, regardless of the current numeric locale.
*/
-double i18n_strtod( const char *str, char **end )
+float us_strtof( const char *str, char **end )
{
- char *end_buf, e;
- double d;
-
- if( end == NULL )
- end = &end_buf;
- d = strtod( str, end );
+ locale_t loc = newlocale (LC_NUMERIC_MASK, "C", NULL);
+ locale_t oldloc = uselocale (loc);
+ float res = strtof (str, end);
- e = **end;
- if(( e == ',' ) || ( e == '.' ))
+ if (loc != (locale_t)0)
{
- char dup[strlen( str ) + 1];
- strcpy( dup, str );
-
- if( dup == NULL )
- return d;
-
- dup[*end - str] = ( e == ',' ) ? '.' : ',';
- d = strtod( dup, end );
+ uselocale (oldloc);
+ freelocale (loc);
}
- return d;
+ return res;
}
+
/**
- * i18n_atof() has the same prototype as ANSI C atof() but it accepts
- * either decimal separator when deserializing the string to a float number,
- * independant of the local computer setting.
+ * us_atof() has the same prototype as ANSI C atof() but it expects a dot
+ * as decimal separator, regardless of the system locale.
*/
-double i18n_atof( const char *str )
+double us_atof( const char *str )
{
- return i18n_strtod( str, NULL );
+ return us_strtod( str, NULL );
}
/**
- * us_strtod() has the same prototype as ANSI C strtod() but it expects
- * a dot as decimal separator regardless of the system locale.
+ * us_asprintf() has the same prototype as asprintf(), but it formats
+ * numbers with the POSIX/C numeric locale, regardless of the current one.
*/
-double us_strtod( const char *str, char **end )
+int us_asprintf( char **ret, const char *format, ... )
{
- char dup[strlen( str ) + 1], *ptr;
- double d;
- strcpy( dup, str );
-
- ptr = strchr( dup, ',' );
- if( ptr != NULL )
- *ptr = '\0';
+ va_list ap;
+ locale_t loc = newlocale( LC_NUMERIC_MASK, "C", NULL );
+ locale_t oldloc = uselocale( loc );
+ int i_rc;
- d = strtod( dup, &ptr );
- if( end != NULL )
- *end = (char *)&str[ptr - dup];
+ va_start( ap, format );
+ i_rc = vasprintf( ret, format, ap );
+ va_end( ap );
- return d;
-}
+ if ( loc != (locale_t)0 )
+ {
+ uselocale( oldloc );
+ freelocale( loc );
+ }
-/**
- * us_atof() has the same prototype as ANSI C atof() but it expects a dot
- * as decimal separator, regardless of the system locale.
- */
-double us_atof( const char *str )
-{
- return us_strtod( str, NULL );
+ return i_rc;
}
-