1 /*****************************************************************************
2 * charset.c: Locale's character encoding stuff.
3 *****************************************************************************
4 * See also unicode.c for Unicode to locale conversion helpers.
6 * Copyright (C) 2003-2006 the VideoLAN team
9 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
13 * vlc_current_charset() an adaption of mp_locale_charset():
15 * Copyright (C) 2001-2003 The Mape Project
16 * Written by Karel Zak <zakkr@zf.jcu.cz>.
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
28 * You should have received a copy of the GNU General Public License
29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
31 *****************************************************************************/
38 # if HAVE_LANGINFO_CODESET
39 # include <langinfo.h>
51 #include <vlc_charset.h>
/**
 * One entry of a charset alias table.
 *
 * psz_alias is the platform-specific spelling of a charset and psz_name the
 * name it is translated to. Both members point to string literals, hence
 * the const qualification. Alias tables are terminated by an entry whose
 * psz_alias is NULL.
 */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;
    const char *psz_name;
} VLCCharsetAlias;
59 * libcharset loads all of its aliases from an external text file, which is
60 * a strange and slow solution; we rather use array(s) compiled into the
61 * source. With a "good" libc this is not needed -- for example on Linux.
63 * Please put only exotic aliases into this function. The libc 'iconv' knows
64 * a lot of basic aliases (check first with iconv -l).
#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Guesses a legacy character encoding from a locale name.
 *
 * Only the beginning of the locale name is examined (the language code and,
 * for Chinese, the country code). Matching anywhere in the string would let
 * a modifier be mistaken for a language: "de_DE@euro" contains "ro" and
 * "sr_RS@latin" contains "la".
 *
 * @param l locale name such as "ja_JP" or "zh_TW@modifier"; may be NULL
 * @return a statically allocated charset name; "ISO-8859-1" when the
 *         language is unknown or l is NULL
 */
static const char* vlc_encoding_from_language( const char *l )
{
    /* Country-specific entries must come before the bare language code. */
    static const struct
    {
        const char *psz_prefix;
        const char *psz_charset;
    } map[] =
    {
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" }, /* no MIME charset */
        { "zh", "GB2312" },
        { "th", "TIS-620" },
        { "ja", "EUC-JP" },
        { "ko", "EUC-KR" },
        { "ru", "KOI8-R" },
        { "uk", "KOI8-U" },
        { "pl", "ISO-8859-2" },
        { "hr", "ISO-8859-2" },
        { "hu", "ISO-8859-2" },
        { "cs", "ISO-8859-2" },
        { "sk", "ISO-8859-2" },
        { "sl", "ISO-8859-2" },
        { "eo", "ISO-8859-3" },
        { "mt", "ISO-8859-3" },
        { "lt", "ISO-8859-4" },
        { "la", "ISO-8859-4" },
        { "bg", "ISO-8859-5" },
        { "be", "ISO-8859-5" },
        { "mk", "ISO-8859-5" },
        { "ar", "ISO-8859-6" },
        { "el", "ISO-8859-7" },
        { "he", "ISO-8859-8" },
        { "iw", "ISO-8859-8" },
        { "tr", "ISO-8859-9" },
        { "lv", "ISO-8859-13" },
        { "cy", "ISO-8859-14" },
        { "et", "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro", "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am", "UTF-8" },
        { "vi", "UTF-8" },
    };

    if( l != NULL )
    {
        for( size_t i = 0; i < sizeof( map ) / sizeof( map[0] ); i++ )
        {
            const char *psz_prefix = map[i].psz_prefix;

            if( strncmp( l, psz_prefix, strlen( psz_prefix ) ) == 0 )
                return map[i].psz_charset;
        }
    }

    /* We don't know. Fall back to the default. */
    return "ISO-8859-1";
}
#endif
101 static const char* vlc_charset_aliases( const char *psz_name )
106 VLCCharsetAlias aliases[] =
109 { "CP1361", "JOHAB" },
110 { "CP20127", "ASCII" },
111 { "CP20866", "KOI8-R" },
112 { "CP21866", "KOI8-RU" },
113 { "CP28591", "ISO-8859-1" },
114 { "CP28592", "ISO-8859-2" },
115 { "CP28593", "ISO-8859-3" },
116 { "CP28594", "ISO-8859-4" },
117 { "CP28595", "ISO-8859-5" },
118 { "CP28596", "ISO-8859-6" },
119 { "CP28597", "ISO-8859-7" },
120 { "CP28598", "ISO-8859-8" },
121 { "CP28599", "ISO-8859-9" },
122 { "CP28605", "ISO-8859-15" },
125 #elif defined (SYS_AIX)
126 VLCCharsetAlias aliases[] =
128 { "IBM-850", "CP850" },
129 { "IBM-856", "CP856" },
130 { "IBM-921", "ISO-8859-13" },
131 { "IBM-922", "CP922" },
132 { "IBM-932", "CP932" },
133 { "IBM-943", "CP943" },
134 { "IBM-1046", "CP1046" },
135 { "IBM-1124", "CP1124" },
136 { "IBM-1129", "CP1129" },
137 { "IBM-1252", "CP1252" },
138 { "IBM-EUCCN", "GB2312" },
139 { "IBM-EUCJP", "EUC-JP" },
140 { "IBM-EUCKR", "EUC-KR" },
141 { "IBM-EUCTW", "EUC-TW" },
144 #elif defined (SYS_HPUX)
145 VLCCharsetAlias aliases[] =
147 { "ROMAN8", "HP-ROMAN8" },
148 { "ARABIC8", "HP-ARABIC8" },
149 { "GREEK8", "HP-GREEK8" },
150 { "HEBREW8", "HP-HEBREW8" },
151 { "TURKISH8", "HP-TURKISH8" },
152 { "KANA8", "HP-KANA8" },
153 { "HP15CN", "GB2312" },
156 #elif defined (SYS_IRIX)
157 VLCCharsetAlias aliases[] =
159 { "EUCCN", "GB2312" },
162 #elif defined (SYS_OSF)
163 VLCCharsetAlias aliases[] =
165 { "KSC5601", "CP949" },
166 { "SDECKANJI", "EUC-JP" },
167 { "TACTIS", "TIS-620" },
170 #elif defined (SYS_SOLARIS)
171 VLCCharsetAlias aliases[] =
174 { "CNS11643", "EUC-TW" },
175 { "5601", "EUC-KR" },
176 { "JOHAP92", "JOHAB" },
177 { "PCK", "SHIFT_JIS" },
178 { "2533", "TIS-620" },
181 #elif defined (SYS_BSD)
182 VLCCharsetAlias aliases[] =
185 { "EUCCN", "GB2312" },
189 VLCCharsetAlias aliases[] = {{NULL, NULL}};
192 for (a = aliases; a->psz_alias; a++)
193 if (strcasecmp (a->psz_alias, psz_name) == 0)
196 /* we return original name beacuse iconv() probably will know
197 * something better about name if we don't know it :-) */
201 /* Returns charset from "language_COUNTRY.charset@modifier" string */
#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Extracts the charset from a "language_COUNTRY.charset@modifier" string.
 *
 * When the locale carries no usable charset part, the charset is guessed
 * from the language with vlc_encoding_from_language().
 *
 * @param psz_locale  locale name; if NULL, an empty string is stored
 * @param psz_charset output buffer of at least 2 + 10 + 1 bytes (the size
 *                    used by every caller in this file); always
 *                    NUL-terminated on return
 */
static void vlc_encoding_from_locale( char *psz_locale, char *psz_charset )
{
    /* Callers provide "char buf[2 + 10 + 1]"; never write more than that. */
    enum { CHARSET_BUF_SIZE = 2 + 10 + 1 };
    const char *psz_dot;

    if( psz_locale == NULL )
    {
        psz_charset[0] = '\0';
        return;
    }

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        size_t i_len;
        int b_modifier;

        psz_dot++; /* skip the dot itself */

        /* Look for the possible @... trailer and leave it out, if any. */
        i_len = strcspn( psz_dot, "@" );
        b_modifier = ( psz_dot[i_len] == '@' );

        /* An empty charset is only accepted when there is no modifier (as
         * before); an oversized one is never copied. */
        if( i_len < CHARSET_BUF_SIZE && ( i_len > 0 || !b_modifier ) )
        {
            memcpy( psz_charset, psz_dot, i_len );
            psz_charset[i_len] = '\0';
            return;
        }
    }

    /* try language mapping */
    snprintf( psz_charset, CHARSET_BUF_SIZE, "%s",
              vlc_encoding_from_language( psz_locale ) );
}
#endif
234 vlc_bool_t vlc_current_charset( char **psz_charset )
236 const char *psz_codeset;
238 #if !(defined WIN32 || defined OS2 || defined __APPLE__)
240 # if HAVE_LANGINFO_CODESET
241 /* Most systems support nl_langinfo( CODESET ) nowadays. */
242 psz_codeset = nl_langinfo( CODESET );
243 if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
244 psz_codeset = "ASCII";
246 /* On old systems which lack it, use setlocale or getenv. */
247 const char *psz_locale = NULL;
248 char buf[2 + 10 + 1];
250 /* But most old systems don't have a complete set of locales. Some
251 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
252 * use setlocale here; it would return "C" when it doesn't support the
253 * locale name the user has set. Darwin's setlocale is broken. */
254 # if HAVE_SETLOCALE && !__APPLE__
255 psz_locale = setlocale( LC_ALL, NULL );
257 if( psz_locale == NULL || psz_locale[0] == '\0' )
259 psz_locale = getenv( "LC_ALL" );
260 if( psz_locale == NULL || psz_locale[0] == '\0' )
262 psz_locale = getenv( "LC_CTYPE" );
263 if( psz_locale == NULL || psz_locale[0] == '\0')
264 psz_locale = getenv( "LANG" );
268 /* On some old systems, one used to set locale = "iso8859_1". On others,
269 * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
270 vlc_encoding_from_locale( (char *)psz_locale, buf );
272 # endif /* HAVE_LANGINFO_CODESET */
274 #elif defined __APPLE__
276 /* Darwin is always using UTF-8 internally. */
277 psz_codeset = "UTF-8";
281 char buf[2 + 10 + 1];
283 /* Woe32 has a function returning the locale's codepage as a number. */
284 snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
289 const char *psz_locale;
290 char buf[2 + 10 + 1];
294 /* Allow user to override the codeset, as set in the operating system,
295 * with standard language environment variables. */
296 psz_locale = getenv( "LC_ALL" );
297 if( psz_locale == NULL || psz_locale[0] == '\0' )
299 psz+locale = getenv( "LC_CTYPE" );
300 if( psz_locale == NULL || locale[0] == '\0' )
301 locale = getenv( "LANG" );
303 if( psz_locale != NULL && psz_locale[0] != '\0' )
304 vlc_encoding_from_locale( psz_locale, buf );
308 /* OS/2 has a function returning the locale's codepage as a number. */
309 if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
313 snprintf( buf, sizeof( buf ), "CP%u", cp[0] );
318 if( psz_codeset == NULL )
319 /* The canonical name cannot be determined. */
322 psz_codeset = vlc_charset_aliases( psz_codeset );
324 /* Don't return an empty string. GNU libc and GNU libiconv interpret
325 * the empty string as denoting "the locale's character encoding",
326 * thus GNU libiconv would call this function a second time. */
327 if( psz_codeset[0] == '\0' )
329 /* Last possibility is 'CHARSET' enviroment variable */
330 if( !( psz_codeset = getenv( "CHARSET" ) ) )
331 psz_codeset = "ISO-8859-1";
335 *psz_charset = strdup(psz_codeset);
337 if( !strcasecmp(psz_codeset, "UTF8") || !strcasecmp(psz_codeset, "UTF-8") )
/**
 * Returns a newly allocated copy of a file name as read from a directory.
 *
 * Where the "UTF-8-MAC" iconv encoding is used, the name is converted from
 * "UTF-8-MAC" to "UTF-8"; if the converter cannot be opened or the
 * conversion fails, a plain copy is returned instead.
 *
 * @param psz_string NUL-terminated file name (not NULL)
 * @return a heap-allocated string to be released with free(), or NULL if
 *         memory could not be allocated
 */
char *vlc_fix_readdir( const char *psz_string )
{
    /* NOTE(review): this guard is restored on the assumption that only
     * Darwin's iconv provides "UTF-8-MAC" -- confirm. */
#ifdef __APPLE__
    vlc_iconv_t hd = vlc_iconv_open( "UTF-8", "UTF-8-MAC" );

    if( hd != (vlc_iconv_t)(-1) )
    {
        const char *psz_in = psz_string;
        size_t i_in = strlen( psz_in );
        size_t i_out = i_in * 2;
        char *psz_utf8 = malloc( i_out + 1 );
        char *psz_out = psz_utf8;
        size_t i_ret;

        /* Never hand a NULL output buffer to iconv. */
        if( psz_utf8 == NULL )
        {
            vlc_iconv_close( hd );
            return NULL;
        }

        i_ret = vlc_iconv( hd, &psz_in, &i_in, &psz_out, &i_out );
        vlc_iconv_close( hd );
        if( i_ret == (size_t)(-1) || i_in )
        {
            /* Conversion failed or was incomplete: keep the raw name. */
            free( psz_utf8 );
            return strdup( psz_string );
        }

        *psz_out = '\0';
        return psz_utf8;
    }
#endif
    return strdup( psz_string );
}
/**
 * Looks a two-letter code up in a table of concatenated two-letter codes.
 *
 * Bytes are compared one at a time, so a locale shorter than two characters
 * is never read past its terminator.
 *
 * @param tab    concatenation of two-letter codes, e.g. "he" "iw" "yi"
 * @param locale string whose first two characters are looked up
 * @return 0 if the code is in the table, 1 otherwise (strcmp()-like)
 */
static inline int locale_match (const char *tab, const char *locale)
{
    for (; *tab; tab += 2)
        if ((tab[0] == locale[0]) && (tab[1] == locale[1]))
            return 0;
    return 1;
}

/**
 * @param locale locale name such as "fr_FR" or "zh_TW.UTF-8"; may be NULL
 * @return a fallback characters encoding to be used, given a locale.
 */
static const char *FindFallbackEncoding (const char *locale)
{
    if ((locale == NULL) || (strlen (locale) < 2))
        return "ASCII";

    /*** The ISO-8859 series (anything but Asia) ***/
    // Latin-1 Western-European languages (ISO-8859-1)
    static const char western[] =
        "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
        "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
        "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
        "eo" "mt" "cy";
    if (!locale_match (western, locale))
        return "CP1252"; // Compatible Microsoft superset

    // Latin-2 Slavic languages (ISO-8859-2)
    static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
    if (!locale_match (slavic, locale))
        return "CP1250"; // CP1250 is more common, but incompatible

    // Latin-3 Southern European languages (ISO-8859-3)
    // "eo" and "mt" -> Latin-1 instead, I presume(?).
    // "tr" -> ISO-8859-9 instead

    // Latin-4 North-European languages (ISO-8859-4)
    // -> Latin-1 instead

    /* Cyrillic alphabet languages (ISO-8859-5) */
    static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
    if (!locale_match (cyrillic, locale))
        return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)

    /* Arabic (ISO-8859-6) */
    if (!locale_match ("ar", locale))
        // FIXME: someone check if we should return CP1256 or ISO-8859-6
        return "CP1256"; // CP1256 is(?) more common, but incompatible(?)

    /* Greek (ISO-8859-7) */
    if (!locale_match ("el", locale))
        // FIXME: someone check if we should return CP1253 or ISO-8859-7
        return "CP1253"; // CP1253 is(?) more common and less incompatible

    /* Hebrew (ISO-8859-8) */
    if (!locale_match ("he" "iw" "yi", locale))
        return "CP1255"; // Compatible Microsoft superset

    /* Latin-5 Turkish (ISO-8859-9) */
    if (!locale_match ("tr" "ku", locale))
        return "CP1254"; // Compatible Microsoft superset

    /* Latin-6 "North-European" languages (ISO-8859-10) */
    /* It is so much north European that glibc only uses that for Luganda
     * which is spoken in Uganda... unless someone complains, I'm not
     * using this one; let's fallback to CP1252 here. */

    // ISO-8859-11 does arguably not exist. Thai is handled below.

    // ISO-8859-12 really doesn't exist.

    // Latin-7 Baltic languages (ISO-8859-13)
    if (!locale_match ("lt" "lv" "mi", locale))
        // FIXME: mi = New Zealand, doesn't sound baltic!
        return "CP1257"; // Compatible Microsoft superset

    // Latin-8 Celtic languages (ISO-8859-14)
    // "cy" -> use Latin-1 instead (most likely English or French)

    // Latin-9 (ISO-8859-15) -> see Latin-1

    // Latin-10 (ISO-8859-16) does not seem to be used

    /*** KOI series ***/
    // For Russian, we use CP1251
    if (!locale_match ("uk", locale))
        return "KOI8-U";

    if (!locale_match ("tg", locale))
        return "KOI8-T";

    /*** Asia ***/
    // Japanese: the language code is "ja"; the historical "jp" spelling is
    // still accepted for backward compatibility.
    if (!locale_match ("ja" "jp", locale))
        return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP

    // Korean
    if (!locale_match ("ko", locale))
        return "EUC-KR";

    // Thai
    if (!locale_match ("th", locale))
        return "TIS-620";

    // Vietnamese (FIXME: more infos needed): the language code is "vi";
    // the historical "vt" spelling is still accepted.
    if (!locale_match ("vi" "vt", locale))
        /* VISCII is probably a bad idea as it is not extended ASCII */
        /* glibc has TCVN5712-1 */
        return "CP1258";

    /* Kazakh (FIXME: more infos needed) */
    if (!locale_match ("kk", locale))
        return "PT154";

    // Chinese. The politically incompatible character sets.
    if (!locale_match ("zh", locale))
    {
        // Step over "zh" and its separator to reach the country code. This
        // used to be skipped precisely when the separator was '_', so that
        // "zh_TW" and "zh_HK" were never recognized.
        if (strlen (locale) >= 5)
            locale += 3;

        // Hong Kong
        if (!locale_match ("HK", locale))
            return "BIG5-HKSCS"; /* FIXME: use something else? */

        // Taiwan island
        if (!locale_match ("TW", locale))
            return "BIG5";

        // People's Republic of China and Singapore
        /*
         * GB18030 can represent any Unicode code point
         * (like UTF-8), while remaining compatible with GBK
         * FIXME: is it compatible with GB2312? if not, should we
         * use GB2312 instead?
         */
        return "GB18030";
    }

    return "ASCII";
}
/**
 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 * text files according to the system's locale settings. It is only a best
 * guess.
 *
 * @return a statically allocated charset name (never to be freed)
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang = NULL;

    /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale.
     * Therefore we don't use setlocale here; it would return "C". */
#  if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
    psz_lang = setlocale( LC_ALL, NULL );
#  endif

    if( psz_lang == NULL || psz_lang[0] == '\0' )
    {
        psz_lang = getenv( "LC_ALL" );
        /* Test the first character, not the pointer: an empty LC_ALL must
         * fall through to LC_CTYPE and LANG. */
        if( psz_lang == NULL || psz_lang[0] == '\0' )
        {
            psz_lang = getenv( "LC_CTYPE" );
            if( psz_lang == NULL || psz_lang[0] == '\0' )
                psz_lang = getenv( "LANG" );
        }
    }

    return FindFallbackEncoding( psz_lang );
#else
    /*
     * This should be thread-safe given GetACP() should always return
     * the same result.
     */
    static char buf[2 + 10 + 1] = "";

    if( buf[0] == 0 )
        snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    return buf;
#endif
}
/**
 * There are two decimal separators in the computer world-wide locales:
 * dot (which is the american default), and comma (which is used in France,
 * the country with the most VLC developers, among others).
 *
 * i18n_strtod() has the same prototype as ANSI C strtod() but it accepts
 * either decimal separator when deserializing the string to a float number,
 * independent of the local computer setting.
 *
 * @param str string to parse (not NULL)
 * @param end if not NULL, receives a pointer into str to the first
 *            character that was not consumed
 * @return the parsed value
 */
double i18n_strtod( const char *str, char **end )
{
    char *end_buf;
    char *dup;
    char *dup_end;
    char e;
    double d;

    if( end == NULL )
        end = &end_buf;
    d = strtod( str, end );

    /* Parsing stopped on something other than a decimal separator: done. */
    e = **end;
    if(( e != ',' ) && ( e != '.' ))
        return d;

    /* Parsing stopped on the separator the locale does not use: retry on a
     * copy with the other one. The copy lives on the heap because the input
     * length is not bounded. */
    dup = malloc( strlen( str ) + 1 );
    if( dup == NULL )
        return d; /* keep the result of the first pass */
    strcpy( dup, str );

    dup[*end - str] = ( e == ',' ) ? '.' : ',';
    d = strtod( dup, &dup_end );

    /* Report the end position within the caller's string, not within the
     * temporary copy that is about to be released. */
    *end = (char *)&str[dup_end - dup];
    free( dup );
    return d;
}
/**
 * i18n_atof() has the same prototype as ANSI C atof() but it accepts
 * either decimal separator when deserializing the string to a float number,
 * independent of the local computer setting.
 *
 * @param str string to parse (not NULL)
 * @return the value computed by i18n_strtod( str, NULL )
 */
double i18n_atof( const char *str )
{
    return i18n_strtod( str, NULL );
}
/**
 * us_strtod() has the same prototype as ANSI C strtod() but it expects
 * a dot as decimal separator regardless of the system locale.
 *
 * The input is cut at its first comma before being handed to strtod(), so
 * that a comma is never taken for a decimal separator.
 * NOTE(review): strtod() itself still follows LC_NUMERIC, so the dot is
 * only understood when the active locale uses it -- confirm callers run
 * with a "C" numeric locale.
 *
 * @param str string to parse (not NULL)
 * @param end if not NULL, receives a pointer into str to the first
 *            character that was not consumed
 * @return the parsed value, or 0 if memory could not be allocated
 */
double us_strtod( const char *str, char **end )
{
    /* Only the part before the first comma can belong to the number. The
     * copy lives on the heap because the input length is not bounded. */
    const size_t i_len = strcspn( str, "," );
    char *dup = malloc( i_len + 1 );
    char *ptr;
    double d;

    if( dup == NULL )
    {
        if( end != NULL )
            *end = (char *)str;
        return 0.;
    }
    memcpy( dup, str, i_len );
    dup[i_len] = '\0';

    d = strtod( dup, &ptr );
    /* Map the end position back into the caller's string. */
    if( end != NULL )
        *end = (char *)&str[ptr - dup];

    free( dup );
    return d;
}
/**
 * us_atof() has the same prototype as ANSI C atof() but it expects a dot
 * as decimal separator, regardless of the system locale.
 *
 * @param str string to parse (not NULL)
 * @return the value computed by us_strtod( str, NULL )
 */
double us_atof( const char *str )
{
    return us_strtod( str, NULL );
}