1 /*****************************************************************************
2 * charset.c: Locale's character encoding stuff.
3 *****************************************************************************
4 * See also unicode.c for Unicode to locale conversion helpers.
6 * Copyright (C) 2003-2006 the VideoLAN team
9 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
13 * vlc_current_charset() an adaption of mp_locale_charset():
15 * Copyright (C) 2001-2003 The Mape Project
16 * Written by Karel Zak <zakkr@zf.jcu.cz>.
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
28 * You should have received a copy of the GNU General Public License
29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
31 *****************************************************************************/
38 # if HAVE_LANGINFO_CODESET
39 # include <langinfo.h>
/* One entry of a charset alias table: psz_alias is the platform-specific
 * spelling, psz_name the name it is replaced with. Both members point to
 * string literals in the tables of this file, hence the const qualifier. */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;
    const char *psz_name;
} VLCCharsetAlias;
59 * libcharset loads all aliases from an external text file, which is a
60 * strange and slow solution; we use array(s) compiled into the source
61 * instead. With a "good" libc this is not needed -- for example on Linux.
63 * Please put only exotic aliases in this function. The libc 'iconv' already
64 * knows a lot of basic aliases (check first with iconv -l).
67 #if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Guesses a character encoding from a locale name.
 *
 * The locale string is searched (with strstr(), i.e. anywhere in the string)
 * for known language or language_COUNTRY codes. The codes are tried in the
 * order of the table below and the first one found decides the result, so
 * more specific codes ("zh_TW") must stay ahead of generic ones ("zh").
 *
 * @param l locale name, e.g. "zh_TW"; may be NULL
 * @return a constant charset name; "ISO-8859-1" when l is NULL or when no
 *         known code is found
 */
static const char* vlc_encoding_from_language( const char *l )
{
    static const struct
    {
        const char *psz_code;
        const char *psz_charset;
    } map[] =
    {
        /* check for language (and perhaps country) codes */
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" }, /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" },
    };
    /* The former second tests for "th" (ISO-8859-11) and "uk" (ISO-8859-5)
     * could never be reached behind the earlier "th" and "uk" entries and
     * have been dropped; the result is unchanged. */

    if( l != NULL )
    {
        for( size_t i = 0; i < sizeof( map ) / sizeof( map[0] ); i++ )
            if( strstr( l, map[i].psz_code ) != NULL )
                return map[i].psz_charset;
    }

    /* We don't know. Fall back to the default. */
    return "ISO-8859-1";
}
/*
 * Looks psz_name up in a per-platform table of charset aliases.
 *
 * Each table entry is an { alias, name } pair (see VLCCharsetAlias). The
 * table is selected at compile time by the preprocessor branches below and
 * is scanned in order, comparing the alias with psz_name case-insensitively.
 *
 * NOTE(review): the statements executed on a match and after the loop are
 * not visible in this listing; per the comment at the end, the original
 * name is returned when no alias matches, and presumably the psz_name of
 * the matching entry otherwise -- confirm against the full file.
 */
101 static const char* vlc_charset_aliases( const char *psz_name )
/* NOTE(review): the preprocessor condition selecting this first table is
 * not visible here; the "CP<number>" aliases have the same shape as the
 * "CP%u" names built in vlc_current_charset() -- confirm which platform
 * it applies to. */
106 VLCCharsetAlias aliases[] =
109 { "CP1361", "JOHAB" },
110 { "CP20127", "ASCII" },
111 { "CP20866", "KOI8-R" },
112 { "CP21866", "KOI8-RU" },
113 { "CP28591", "ISO-8859-1" },
114 { "CP28592", "ISO-8859-2" },
115 { "CP28593", "ISO-8859-3" },
116 { "CP28594", "ISO-8859-4" },
117 { "CP28595", "ISO-8859-5" },
118 { "CP28596", "ISO-8859-6" },
119 { "CP28597", "ISO-8859-7" },
120 { "CP28598", "ISO-8859-8" },
121 { "CP28599", "ISO-8859-9" },
122 { "CP28605", "ISO-8859-15" },
125 #elif defined (SYS_AIX)
126 VLCCharsetAlias aliases[] =
128 { "IBM-850", "CP850" },
129 { "IBM-856", "CP856" },
130 { "IBM-921", "ISO-8859-13" },
131 { "IBM-922", "CP922" },
132 { "IBM-932", "CP932" },
133 { "IBM-943", "CP943" },
134 { "IBM-1046", "CP1046" },
135 { "IBM-1124", "CP1124" },
136 { "IBM-1129", "CP1129" },
137 { "IBM-1252", "CP1252" },
138 { "IBM-EUCCN", "GB2312" },
139 { "IBM-EUCJP", "EUC-JP" },
140 { "IBM-EUCKR", "EUC-KR" },
141 { "IBM-EUCTW", "EUC-TW" },
144 #elif defined (SYS_HPUX)
145 VLCCharsetAlias aliases[] =
147 { "ROMAN8", "HP-ROMAN8" },
148 { "ARABIC8", "HP-ARABIC8" },
149 { "GREEK8", "HP-GREEK8" },
150 { "HEBREW8", "HP-HEBREW8" },
151 { "TURKISH8", "HP-TURKISH8" },
152 { "KANA8", "HP-KANA8" },
153 { "HP15CN", "GB2312" },
156 #elif defined (SYS_IRIX)
157 VLCCharsetAlias aliases[] =
159 { "EUCCN", "GB2312" },
162 #elif defined (SYS_OSF)
163 VLCCharsetAlias aliases[] =
165 { "KSC5601", "CP949" },
166 { "SDECKANJI", "EUC-JP" },
167 { "TACTIS", "TIS-620" },
170 #elif defined (SYS_SOLARIS)
171 VLCCharsetAlias aliases[] =
174 { "CNS11643", "EUC-TW" },
175 { "5601", "EUC-KR" },
176 { "JOHAP92", "JOHAB" },
177 { "PCK", "SHIFT_JIS" },
178 { "2533", "TIS-620" },
181 #elif defined (SYS_BSD)
182 VLCCharsetAlias aliases[] =
185 { "EUCCN", "GB2312" },
/* Table holding nothing but the { NULL, NULL } end marker. */
189 VLCCharsetAlias aliases[] = {{NULL, NULL}};
/* Scan until the entry whose psz_alias is NULL; strcasecmp() makes the
 * comparison case-insensitive. */
192 for (a = aliases; a->psz_alias; a++)
193 if (strcasecmp (a->psz_alias, psz_name) == 0)
196 /* we return the original name because iconv() will probably know
197 * something better about the name if we don't know it :-) */
201 /* Returns charset from "language_COUNTRY.charset@modifier" string */
202 #if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Extracts the charset from a "language_COUNTRY.charset@modifier" locale
 * string.
 *
 * If the locale contains a '.', the text after it (up to an optional
 * '@modifier' trailer) is taken as the charset, provided it fits into the
 * output buffer. Otherwise the charset is guessed from the language with
 * vlc_encoding_from_language().
 *
 * @param psz_locale  locale name; if NULL, an empty string is stored
 * @param psz_charset output buffer of at least 2 + 10 + 1 bytes (the size
 *                    used by every caller in this file); always
 *                    NUL-terminated on return
 */
static void vlc_encoding_from_locale( char *psz_locale, char *psz_charset )
{
    /* Capacity assumed for psz_charset; the '@' branch has always been
     * limited to this size, the plain branch used an unbounded strcpy(). */
    enum { CHARSET_BUF_SIZE = 2 + 10 + 1 };
    const char *psz_dot;

    if( psz_locale == NULL )
    {
        psz_charset[0] = '\0';
        return;
    }

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        const char *psz_modifier;

        psz_dot++; /* the charset starts right after the dot */

        /* Look for the possible @... trailer and remove it, if any. */
        psz_modifier = strchr( psz_dot, '@' );

        if( psz_modifier == NULL )
        {
            const size_t i_len = strlen( psz_dot );

            /* Only copy what fits; an over-long charset falls through to
             * the language mapping instead of overflowing the buffer. */
            if( i_len < CHARSET_BUF_SIZE )
            {
                memcpy( psz_charset, psz_dot, i_len + 1 );
                return;
            }
        }
        else
        {
            const size_t i_len = (size_t)( psz_modifier - psz_dot );

            if( 0 < i_len && i_len < CHARSET_BUF_SIZE )
            {
                memcpy( psz_charset, psz_dot, i_len );
                psz_charset[i_len] = '\0';
                return;
            }
        }
    }

    /* try language mapping */
    snprintf( psz_charset, CHARSET_BUF_SIZE, "%s",
              vlc_encoding_from_language( psz_locale ) );
}
234 vlc_bool_t vlc_current_charset( char **psz_charset )
236 const char *psz_codeset;
238 #if !(defined WIN32 || defined OS2 || defined __APPLE__)
240 # if HAVE_LANGINFO_CODESET
241 /* Most systems support nl_langinfo( CODESET ) nowadays. */
242 psz_codeset = nl_langinfo( CODESET );
243 if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
244 psz_codeset = "ASCII";
246 /* On old systems which lack it, use setlocale or getenv. */
247 const char *psz_locale = NULL;
248 char buf[2 + 10 + 1];
250 /* But most old systems don't have a complete set of locales. Some
251 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
252 * use setlocale here; it would return "C" when it doesn't support the
253 * locale name the user has set. Darwin's setlocale is broken. */
254 # if HAVE_SETLOCALE && !__APPLE__
255 psz_locale = setlocale( LC_ALL, NULL );
257 if( psz_locale == NULL || psz_locale[0] == '\0' )
259 psz_locale = getenv( "LC_ALL" );
260 if( psz_locale == NULL || psz_locale[0] == '\0' )
262 psz_locale = getenv( "LC_CTYPE" );
263 if( psz_locale == NULL || psz_locale[0] == '\0')
264 psz_locale = getenv( "LANG" );
268 /* On some old systems, one used to set locale = "iso8859_1". On others,
269 * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
270 vlc_encoding_from_locale( (char *)psz_locale, buf );
272 # endif /* HAVE_LANGINFO_CODESET */
274 #elif defined __APPLE__
276 /* Darwin is always using UTF-8 internally. */
277 psz_codeset = "UTF-8";
281 char buf[2 + 10 + 1];
283 /* Woe32 has a function returning the locale's codepage as a number. */
284 snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
289 const char *psz_locale;
290 char buf[2 + 10 + 1];
294 /* Allow user to override the codeset, as set in the operating system,
295 * with standard language environment variables. */
296 psz_locale = getenv( "LC_ALL" );
297 if( psz_locale == NULL || psz_locale[0] == '\0' )
299 psz+locale = getenv( "LC_CTYPE" );
300 if( psz_locale == NULL || locale[0] == '\0' )
301 locale = getenv( "LANG" );
303 if( psz_locale != NULL && psz_locale[0] != '\0' )
304 vlc_encoding_from_locale( psz_locale, buf );
308 /* OS/2 has a function returning the locale's codepage as a number. */
309 if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
313 snprintf( buf, sizeof( buf ), "CP%u", cp[0] );
318 if( psz_codeset == NULL )
319 /* The canonical name cannot be determined. */
322 psz_codeset = vlc_charset_aliases( psz_codeset );
324 /* Don't return an empty string. GNU libc and GNU libiconv interpret
325 * the empty string as denoting "the locale's character encoding",
326 * thus GNU libiconv would call this function a second time. */
327 if( psz_codeset[0] == '\0' )
329 /* Last possibility is 'CHARSET' enviroment variable */
330 if( !( psz_codeset = getenv( "CHARSET" ) ) )
331 psz_codeset = "ISO-8859-1";
335 *psz_charset = strdup(psz_codeset);
337 if( !strcasecmp(psz_codeset, "UTF8") || !strcasecmp(psz_codeset, "UTF-8") )
/**
 * Returns a heap-allocated copy of a file name as read from a directory.
 *
 * On Mac OS X the name is converted from "UTF-8-MAC" to "UTF-8" with
 * vlc_iconv(); if the converter cannot be opened or the conversion fails,
 * and on all other platforms, a plain strdup() of the input is returned.
 *
 * @param psz_string NUL-terminated file name
 * @return a newly allocated string to be released with free(), or NULL if
 *         memory could not be allocated
 */
char *vlc_fix_readdir( const char *psz_string )
{
#ifdef __APPLE__
    vlc_iconv_t hd = vlc_iconv_open( "UTF-8", "UTF-8-MAC" );

    if (hd != (vlc_iconv_t)(-1))
    {
        const char *psz_in = psz_string;
        size_t i_in = strlen(psz_in);
        size_t i_out = i_in * 2;
        char *psz_utf8 = malloc(i_out + 1);
        char *psz_out = psz_utf8;
        size_t i_ret;

        if (psz_utf8 == NULL)
        {
            /* Do not hand a NULL output buffer to the converter */
            vlc_iconv_close (hd);
            return NULL;
        }

        i_ret = vlc_iconv (hd, &psz_in, &i_in, &psz_out, &i_out);
        vlc_iconv_close (hd);
        if( i_ret == (size_t)(-1) || i_in )
        {
            /* Conversion failed or left input behind: keep the raw name */
            free( psz_utf8 );
            return strdup( psz_string );
        }

        *psz_out = '\0';
        return psz_utf8;
    }
#endif
    return strdup( psz_string );
}
/**
 * Tests whether a locale starts with one of the two-character codes packed
 * back to back in a table string.
 *
 * @param tab    concatenated two-character codes, e.g. "he" "iw" "yi"
 * @param locale locale name; only its first two bytes are compared
 * @return 0 if one of the codes matches, non-zero otherwise
 */
static inline int locale_match (const char *tab, const char *locale)
{
    const char *entry = tab;

    while (*entry != '\0')
    {
        if (!memcmp (entry, locale, 2))
            return 0;
        entry += 2; /* next two-character code */
    }
    return 1;
}
384 * @return a fallback character encoding to be used, given a locale.
/*
 * Chooses a fallback character encoding from the two-letter language code
 * at the start of a locale name.
 *
 * Languages are grouped by script; each group is a string of concatenated
 * two-character codes handed to locale_match(), and a zero result from
 * locale_match() selects the encoding returned right below the test.
 * A NULL locale or one shorter than two characters is handled by the first
 * test (its result is not visible in this listing).
 */
386 static const char *FindFallbackEncoding (const char *locale)
388 if ((locale == NULL) || (strlen (locale) < 2))
392 /*** The ISO-8859 series (anything but Asia) ***/
393 // Latin-1 Western-European languages (ISO-8859-1)
394 static const char western[] =
395 "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
396 "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
397 "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
399 if (!locale_match (western, locale))
400 return "CP1252"; // Compatible Microsoft superset
402 // Latin-2 Slavic languages (ISO-8859-2)
403 static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
404 if (!locale_match (slavic, locale))
405 return "CP1250"; // CP1250 is more common, but incompatible
407 // Latin-3 Southern European languages (ISO-8859-3)
408 // "eo" and "mt" -> Latin-1 instead, I presume(?).
409 // "tr" -> ISO-8859-9 instead
411 // Latin-4 North-European languages (ISO-8859-4)
412 // -> Latin-1 instead
414 /* Cyrillic alphabet languages (ISO-8859-5) */
415 static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
416 if (!locale_match (cyrillic, locale))
417 return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)
419 /* Arabic (ISO-8859-6) */
420 if (!locale_match ("ar", locale))
421 // FIXME: someone check if we should return CP1256 or ISO-8859-6
422 return "CP1256"; // CP1256 is(?) more common, but incompatible(?)
424 /* Greek (ISO-8859-7) */
425 if (!locale_match ("el", locale))
426 // FIXME: someone check if we should return CP1253 or ISO-8859-7
427 return "CP1253"; // CP1253 is(?) more common and less incompatible
429 /* Hebrew (ISO-8859-8) */
/* Adjacent string literals are concatenated into one table of
 * two-character codes, here "heiwyi". */
430 if (!locale_match ("he" "iw" "yi", locale))
431 return "CP1255"; // Compatible Microsoft superset
433 /* Latin-5 Turkish (ISO-8859-9) */
434 if (!locale_match ("tr" "ku", locale))
435 return "CP1254"; // Compatible Microsoft superset
437 /* Latin-6 “North-European” languages (ISO-8859-10) */
438 /* It is so much north European that glibc only uses that for Luganda
439 * which is spoken in Uganda... unless someone complains, I'm not
440 * using this one; let's fallback to CP1252 here. */
442 // ISO-8859-11 does arguably not exist. Thai is handled below.
444 // ISO-8859-12 really doesn't exist.
446 // Latin-7 Baltic languages (ISO-8859-13)
447 if (!locale_match ("lt" "lv" "mi", locale))
448 // FIXME: mi = New Zealand, doesn't sound baltic!
449 return "CP1257"; // Compatible Microsoft superset
451 // Latin-8 Celtic languages (ISO-8859-14)
452 // "cy" -> use Latin-1 instead (most likely English or French)
454 // Latin-9 (ISO-8859-15) -> see Latin-1
456 // Latin-10 (ISO-8859-16) does not seem to be used
459 // For Russian, we use CP1251
460 if (!locale_match ("uk", locale))
463 if (!locale_match ("tg", locale))
/* NOTE(review): vlc_encoding_from_language() in this file uses "ja" for
 * the EUC-JP (Japanese) mapping; "jp" looks like a country code used by
 * mistake and would then never match a "ja_JP" locale -- confirm. */
468 if (!locale_match ("jp", locale))
469 return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP
472 if (!locale_match ("ko", locale))
476 if (!locale_match ("th", locale))
479 // Vietnamese (FIXME: more infos needed)
/* NOTE(review): vlc_encoding_from_language() in this file tests "vi";
 * "vt" is probably a typo for the Vietnamese language code -- confirm. */
480 if (!locale_match ("vt", locale))
481 /* VISCII is probably a bad idea as it is not extended ASCII */
482 /* glibc has TCVN5712-1 */
485 /* Kazakh (FIXME: more infos needed) */
486 if (!locale_match ("kk", locale))
489 // Chinese. The politically incompatible character sets.
490 if (!locale_match ("zh", locale))
/* NOTE(review): the "HK"/"TW" tests below compare against the first two
 * bytes of locale, so locale presumably has to be advanced past "zh_" by
 * the (not visible) statement guarded here. For a name such as "zh_TW"
 * locale[2] IS '_', so the != comparison looks inverted -- confirm. */
492 if ((strlen (locale) >= 5) && (locale[2] != '_'))
496 if (!locale_match ("HK", locale))
497 return "BIG5-HKSCS"; /* FIXME: use something else? */
500 if (!locale_match ("TW", locale))
503 // People's Republic of China and Singapore
505 * GB18030 can represent any Unicode code point
506 * (like UTF-8), while remaining compatible with GBK
507 * FIXME: is it compatible with GB2312? if not, should we
508 * use GB2312 instead?
517 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
518 * text files according to the system's locale settings. It is only a best guess.
/**
 * Suggests an encoding for text files that are not UTF-8.
 *
 * On Windows the result is "CP<n>" where <n> is the value of GetACP().
 * Elsewhere the locale name is taken from setlocale() (when available) or
 * from the LC_ALL, LC_CTYPE and LANG environment variables, in that order,
 * and mapped with FindFallbackEncoding().
 *
 * @return a charset name that must not be freed by the caller
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang = NULL;

    /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale.
     * Therefore we don't use setlocale here; it would return "C". */
#  if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
    psz_lang = setlocale( LC_ALL, NULL );
#  endif
    if( psz_lang == NULL || psz_lang[0] == '\0' )
    {
        psz_lang = getenv( "LC_ALL" );
        /* Test the first character: comparing the pointer itself with
         * '\0' only repeated the NULL check, so an empty LC_ALL used to
         * hide LC_CTYPE and LANG. */
        if( psz_lang == NULL || psz_lang[0] == '\0' )
        {
            psz_lang = getenv( "LC_CTYPE" );
            if( psz_lang == NULL || psz_lang[0] == '\0' )
                psz_lang = getenv( "LANG" );
        }
    }

    return FindFallbackEncoding( psz_lang );
#else
    /*
     * This should be thread-safe given GetACP() should always return
     * the same result.
     */
    static char buf[2 + 10 + 1] = "";

    if( buf[0] == '\0' )
        snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    return buf;
#endif
}
557 * There are two decimal separators in the computer world-wide locales:
558 * dot (which is the american default), and comma (which is used in France,
559 * the country with the most VLC developers, among others).
561 * i18n_strtod() has the same prototype as ANSI C strtod() but it accepts
562 * either decimal separator when deserializing the string to a float number,
563 * independent of the local computer setting.
/**
 * strtod() variant that accepts both '.' and ',' as decimal separator.
 *
 * The string is first parsed with strtod(). If parsing stopped on a '.' or
 * a ',', that character is replaced by the other separator in a private
 * copy of the string and the copy is parsed again.
 *
 * @param str NUL-terminated string to parse
 * @param end if not NULL, receives a pointer INTO str to the first
 *            character that was not consumed (never into the temporary
 *            copy, which is gone when the function returns)
 * @return the parsed value; if the temporary copy cannot be allocated, the
 *         result of the first strtod() pass
 */
double i18n_strtod( const char *str, char **end )
{
    char *stop;
    double d = strtod( str, &stop );

    if( ( *stop == ',' ) || ( *stop == '.' ) )
    {
        const size_t i_len = strlen( str );
        /* Heap copy rather than a variable-length array: the input length
         * is not bounded. */
        char *dup = malloc( i_len + 1 );

        if( dup != NULL )
        {
            char *dup_stop;

            memcpy( dup, str, i_len + 1 );
            /* Retry with the other separator at the stop position */
            dup[stop - str] = ( *stop == ',' ) ? '.' : ',';
            d = strtod( dup, &dup_stop );

            /* Translate the stop position back into the caller's string */
            stop = (char *)str + ( dup_stop - dup );
            free( dup );
        }
    }

    if( end != NULL )
        *end = stop;
    return d;
}
590 * i18n_atof() has the same prototype as ANSI C atof() but it accepts
591 * either decimal separator when deserializing the string to a float number,
592 * independent of the local computer setting.
594 double i18n_atof( const char *str )
596 return i18n_strtod( str, NULL );
601 * us_strtod() has the same prototype as ANSI C strtod() but it expects
602 * a dot as decimal separator regardless of the system locale.
/**
 * strtod() variant that never treats a comma as part of the number.
 *
 * A private copy of the string is cut at its first comma and parsed with
 * strtod(); the end position is then mapped back onto the original string.
 *
 * NOTE(review): the copy is still parsed by the locale-dependent strtod(),
 * so whether '.' is accepted as the separator depends on the current
 * LC_NUMERIC setting -- confirm this is acceptable for callers.
 *
 * @param str NUL-terminated string to parse
 * @param end if not NULL, receives a pointer into str to the first
 *            character that was not consumed
 * @return the parsed value
 */
double us_strtod( const char *str, char **end )
{
    const size_t i_size = strlen( str ) + 1;
    char copy[i_size];
    char *psz_comma;
    char *psz_stop;
    double d_value;

    memcpy( copy, str, i_size );

    /* Hide everything from the first comma on */
    psz_comma = strchr( copy, ',' );
    if( psz_comma != NULL )
        *psz_comma = '\0';

    d_value = strtod( copy, &psz_stop );
    if( end != NULL )
        *end = (char *)( str + ( psz_stop - copy ) );

    return d_value;
}
622 * us_atof() has the same prototype as ANSI C atof() but it expects a dot
623 * as decimal separator, regardless of the system locale.
625 double us_atof( const char *str )
627 return us_strtod( str, NULL );