1 /*****************************************************************************
2 * charset.c: Locale's character encoding stuff.
3 *****************************************************************************
4 * See also unicode.c for Unicode to locale conversion helpers.
6 * Copyright (C) 2003-2006 the VideoLAN team
9 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
13 * vlc_current_charset() an adaption of mp_locale_charset():
15 * Copyright (C) 2001-2003 The Mape Project
16 * Written by Karel Zak <zakkr@zf.jcu.cz>.
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
28 * You should have received a copy of the GNU General Public License
29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
31 *****************************************************************************/
38 # if HAVE_LANGINFO_CODESET
39 # include <langinfo.h>
/**
 * One entry of a charset alias table.
 *
 * Both members point to string literals in the tables of
 * vlc_charset_aliases(), hence they are const-qualified so that nobody can
 * write through them by accident.
 */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;  /**< platform-specific name of the charset */
    const char *psz_name;   /**< equivalent name to hand over to iconv */
} VLCCharsetAlias;
61 * libcharset loads all of its aliases from an external text file, which is
62 * a strange and slow solution; we would rather use array(s) compiled into
63 * the source. With a "good" libc (for example on Linux) this is not needed.
65 * Please put only exotic aliases into this function. The libc 'iconv' knows
66 * a lot of basic aliases (check first with iconv -l).
#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Guesses a legacy character encoding from a locale name that carries no
 * explicit charset (e.g. "ja_JP" or "pl").
 *
 * The language code is only looked for at the very beginning of the name.
 * Searching the whole string would let territories and modifiers trigger
 * false positives (the "ro" in "fr_FR@euro", the "cy" in "sr_YU@cyrillic").
 *
 * @param l locale name, may be NULL
 * @return a constant charset name, "ISO-8859-1" when nothing matches
 */
static const char* vlc_encoding_from_language( const char *l )
{
    static const struct
    {
        char        psz_lang[3];
        const char *psz_charset;
    } p_map[] =
    {
        { "zh", "GB2312" },
        { "th", "TIS-620" },
        { "ja", "EUC-JP" },
        { "ko", "EUC-KR" },
        { "ru", "KOI8-R" },
        { "uk", "KOI8-U" },
        { "pl", "ISO-8859-2" },
        { "hr", "ISO-8859-2" },
        { "hu", "ISO-8859-2" },
        { "cs", "ISO-8859-2" },
        { "sk", "ISO-8859-2" },
        { "sl", "ISO-8859-2" },
        { "eo", "ISO-8859-3" },
        { "mt", "ISO-8859-3" },
        { "lt", "ISO-8859-4" },
        { "la", "ISO-8859-4" },
        { "bg", "ISO-8859-5" },
        { "be", "ISO-8859-5" },
        { "mk", "ISO-8859-5" },
        { "ar", "ISO-8859-6" },
        { "el", "ISO-8859-7" },
        { "he", "ISO-8859-8" },
        { "iw", "ISO-8859-8" },
        { "tr", "ISO-8859-9" },
        { "lv", "ISO-8859-13" },
        { "cy", "ISO-8859-14" },
        { "et", "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro", "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am", "UTF-8" },
        { "vi", "UTF-8" },
    };
    size_t i;

    if( l == NULL )
        return "ISO-8859-1";

    /* Chinese is the only language whose charset depends on the country */
    if( !strncmp( l, "zh_TW", 5 ) )
        return "Big5";
    if( !strncmp( l, "zh_HK", 5 ) )
        return "Big5HKSCS"; /* no MIME charset */

    for( i = 0; i < sizeof( p_map ) / sizeof( p_map[0] ); i++ )
        if( !strncmp( l, p_map[i].psz_lang, 2 ) )
            return p_map[i].psz_charset;

    /* We don't know: fall back to the default. */
    return "ISO-8859-1";
}
#endif
/*
 * vlc_charset_aliases(): looks psz_name up in a table of charset aliases.
 *
 * The table is walked until an entry whose psz_alias is NULL is reached and
 * names are compared with strcasecmp(), i.e. case-insensitively. According
 * to the trailing comment, the original name is handed back when no alias
 * matches, so that iconv() can still try to make sense of it.
 *
 * NOTE(review): several alternative definitions of `aliases` follow; each is
 * presumably selected by a platform-specific preprocessor conditional that
 * is not visible in this excerpt -- confirm against the full file.
 */
103 static const char* vlc_charset_aliases( const char *psz_name )
/* Numeric "CPnnnnn" code page names */
108 VLCCharsetAlias aliases[] =
111 { "CP1361", "JOHAB" },
112 { "CP20127", "ASCII" },
113 { "CP20866", "KOI8-R" },
114 { "CP21866", "KOI8-RU" },
115 { "CP28591", "ISO-8859-1" },
116 { "CP28592", "ISO-8859-2" },
117 { "CP28593", "ISO-8859-3" },
118 { "CP28594", "ISO-8859-4" },
119 { "CP28595", "ISO-8859-5" },
120 { "CP28596", "ISO-8859-6" },
121 { "CP28597", "ISO-8859-7" },
122 { "CP28598", "ISO-8859-8" },
123 { "CP28599", "ISO-8859-9" },
124 { "CP28605", "ISO-8859-15" },
/* "IBM-*" names */
128 VLCCharsetAlias aliases[] =
130 { "IBM-850", "CP850" },
131 { "IBM-856", "CP856" },
132 { "IBM-921", "ISO-8859-13" },
133 { "IBM-922", "CP922" },
134 { "IBM-932", "CP932" },
135 { "IBM-943", "CP943" },
136 { "IBM-1046", "CP1046" },
137 { "IBM-1124", "CP1124" },
138 { "IBM-1129", "CP1129" },
139 { "IBM-1252", "CP1252" },
140 { "IBM-EUCCN", "GB2312" },
141 { "IBM-EUCJP", "EUC-JP" },
142 { "IBM-EUCKR", "EUC-KR" },
143 { "IBM-EUCTW", "EUC-TW" },
/* Names that map onto the "HP-*" charsets */
147 VLCCharsetAlias aliases[] =
149 { "ROMAN8", "HP-ROMAN8" },
150 { "ARABIC8", "HP-ARABIC8" },
151 { "GREEK8", "HP-GREEK8" },
152 { "HEBREW8", "HP-HEBREW8" },
153 { "TURKISH8", "HP-TURKISH8" },
154 { "KANA8", "HP-KANA8" },
155 { "HP15CN", "GB2312" },
159 VLCCharsetAlias aliases[] =
161 { "EUCCN", "GB2312" },
165 VLCCharsetAlias aliases[] =
167 { "KSC5601", "CP949" },
168 { "SDECKANJI", "EUC-JP" },
169 { "TACTIS", "TIS-620" },
173 VLCCharsetAlias aliases[] =
176 { "CNS11643", "EUC-TW" },
177 { "5601", "EUC-KR" },
178 { "JOHAP92", "JOHAB" },
179 { "PCK", "SHIFT_JIS" },
180 { "2533", "TIS-620" },
184 VLCCharsetAlias aliases[] =
187 { "EUCCN", "GB2312" },
/* Empty table: only the terminating entry, so the lookup never matches */
191 VLCCharsetAlias aliases[] = {{NULL, NULL}};
/* Linear, case-insensitive search up to the NULL terminator */
194 for (a = aliases; a->psz_alias; a++)
195 if (strcasecmp (a->psz_alias, psz_name) == 0)
198 /* we return the original name because iconv() will probably know
199 * something better about the name if we don't know it :-) */
#if (defined OS2 || !HAVE_LANGINFO_CODESET) && !defined WIN32
/**
 * Extracts the charset from a "language_COUNTRY.charset@modifier" string.
 *
 * When the locale carries no usable charset (none given, empty before a
 * modifier, or too long for the output buffer), the charset is guessed from
 * the language with vlc_encoding_from_language().
 *
 * @param psz_locale  locale name; NULL is treated like an empty name
 * @param psz_charset output buffer of at least 2 + 10 + 1 bytes (the size
 *                    used by every caller in this file); it always receives
 *                    a nul-terminated string that fits into that size
 */
static void vlc_encoding_from_locale( char *psz_locale, char *psz_charset )
{
    enum { CHARSET_BUF_SIZE = 2 + 10 + 1 };

    const char *psz_dot;

    if( psz_locale == NULL )
        psz_locale = "";

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        const char *psz_modifier;
        size_t i_len;

        psz_dot++;

        /* Look for the possible @... trailer and ignore it, if any. */
        psz_modifier = strchr( psz_dot, '@' );
        i_len = ( psz_modifier != NULL ) ? (size_t)( psz_modifier - psz_dot )
                                         : strlen( psz_dot );

        /* An empty charset is only passed through when there is no modifier
         * (the caller deals with it); anything that would overflow the
         * caller's buffer is rejected instead of being copied blindly. */
        if( i_len < CHARSET_BUF_SIZE
         && ( i_len > 0 || psz_modifier == NULL ) )
        {
            memcpy( psz_charset, psz_dot, i_len );
            psz_charset[i_len] = '\0';
            return;
        }
    }

    /* try language mapping */
    snprintf( psz_charset, CHARSET_BUF_SIZE, "%s",
              vlc_encoding_from_language( psz_locale ) );
}
#endif
236 vlc_bool_t vlc_current_charset( char **psz_charset )
238 const char *psz_codeset;
240 #if !(defined WIN32 || defined OS2 || defined __APPLE__)
242 # if HAVE_LANGINFO_CODESET
243 /* Most systems support nl_langinfo( CODESET ) nowadays. */
244 psz_codeset = nl_langinfo( CODESET );
245 if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
246 psz_codeset = "ASCII";
248 /* On old systems which lack it, use setlocale or getenv. */
249 const char *psz_locale = NULL;
250 char buf[2 + 10 + 1];
252 /* But most old systems don't have a complete set of locales. Some
253 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
254 * use setlocale here; it would return "C" when it doesn't support the
255 * locale name the user has set. Darwin's setlocale is broken. */
256 # if HAVE_SETLOCALE && !__APPLE__
257 psz_locale = setlocale( LC_ALL, NULL );
259 if( psz_locale == NULL || psz_locale[0] == '\0' )
261 psz_locale = getenv( "LC_ALL" );
262 if( psz_locale == NULL || psz_locale[0] == '\0' )
264 psz_locale = getenv( "LC_CTYPE" );
265 if( psz_locale == NULL || psz_locale[0] == '\0')
266 psz_locale = getenv( "LANG" );
270 /* On some old systems, one used to set locale = "iso8859_1". On others,
271 * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
272 vlc_encoding_from_locale( (char *)psz_locale, buf );
274 # endif /* HAVE_LANGINFO_CODESET */
276 #elif defined __APPLE__
278 /* Darwin is always using UTF-8 internally. */
279 psz_codeset = "UTF-8";
283 char buf[2 + 10 + 1];
285 /* Woe32 has a function returning the locale's codepage as a number. */
286 snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
291 const char *psz_locale;
292 char buf[2 + 10 + 1];
296 /* Allow user to override the codeset, as set in the operating system,
297 * with standard language environment variables. */
298 psz_locale = getenv( "LC_ALL" );
299 if( psz_locale == NULL || psz_locale[0] == '\0' )
301 psz+locale = getenv( "LC_CTYPE" );
302 if( psz_locale == NULL || locale[0] == '\0' )
303 locale = getenv( "LANG" );
305 if( psz_locale != NULL && psz_locale[0] != '\0' )
306 vlc_encoding_from_locale( psz_locale, buf );
310 /* OS/2 has a function returning the locale's codepage as a number. */
311 if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
315 snprintf( buf, sizeof( buf ), "CP%u", cp[0] );
320 if( psz_codeset == NULL )
321 /* The canonical name cannot be determined. */
324 psz_codeset = vlc_charset_aliases( psz_codeset );
326 /* Don't return an empty string. GNU libc and GNU libiconv interpret
327 * the empty string as denoting "the locale's character encoding",
328 * thus GNU libiconv would call this function a second time. */
329 if( psz_codeset[0] == '\0' )
331 /* Last possibility is 'CHARSET' enviroment variable */
332 if( !( psz_codeset = getenv( "CHARSET" ) ) )
333 psz_codeset = "ISO-8859-1";
337 *psz_charset = strdup(psz_codeset);
339 if( !strcasecmp(psz_codeset, "UTF8") || !strcasecmp(psz_codeset, "UTF-8") )
345 char *__vlc_fix_readdir_charset( vlc_object_t *p_this, const char *psz_string )
348 if ( p_this->p_libvlc_global->iconv_macosx != (vlc_iconv_t)-1 )
350 const char *psz_in = psz_string;
351 size_t i_in = strlen(psz_in);
352 size_t i_out = i_in * 2;
353 char *psz_utf8 = malloc(i_out + 1);
354 char *psz_out = psz_utf8;
356 vlc_mutex_lock( &p_this->p_libvlc_global->iconv_lock );
357 size_t i_ret = vlc_iconv( p_this->p_libvlc_global->iconv_macosx,
358 &psz_in, &i_in, &psz_out, &i_out );
359 vlc_mutex_unlock( &p_this->p_libvlc_global->iconv_lock );
360 if( i_ret == (size_t)-1 || i_in )
363 "failed to convert \"%s\" from HFS+ charset (%s)",
364 psz_string, strerror(errno) );
366 return strdup( psz_string );
375 return strdup( psz_string );
379 * @return a fallback character encoding to be used, given a locale.
/*
 * FindFallbackEncoding(): maps the language code at the start of `locale`
 * to the name of a legacy (non-Unicode) encoding.
 *
 * The first two bytes of the locale are read with U16_AT() and dispatched on
 * with multi-character constants. For Chinese the two bytes found at
 * locale + 3 (the country) are examined as well. A NULL locale, or one
 * shorter than two characters, is handled before the switch.
 *
 * NOTE(review): the value of a multi-character constant such as 'zh' is
 * implementation-defined; this only works where it agrees with what U16_AT()
 * yields -- confirm for every supported compiler.
 */
381 const char *FindFallbackEncoding( const char *locale )
383 if( ( locale == NULL ) || ( strlen( locale ) < 2 ) )
386 switch( U16_AT( locale ) )
388 /*** The ISO-8859 series (anything but Asia) ***/
389 /* Latin-1 Western-European languages (ISO-8859-1) */
431 /* Compatible Microsoft superset */
434 /* Latin-2 Slavic languages (ISO-8859-2) */
443 /* CP1250 is more common, but incompatible */
446 /* Latin-3 Southern European languages (ISO-8859-3) */
449 /*case 'tr': Turkish uses ISO-8859-9 instead */
452 /* Latin-4 North-European languages (ISO-8859-4) */
453 /* All use Latin-1 or Latin-6 instead */
455 /* Cyrillic alphabet languages (ISO-8859-5) */
461 /* KOI8, ISO-8859-5 and CP1251 are supposedly incompatible */
464 /* Arabic (ISO-8859-6) */
466 /* FIXME: someone check if we should return CP1256
468 /* CP1256 is(?) more common, but incompatible(?) */
471 /* Greek (ISO-8859-7) */
473 /* FIXME: someone check if we should return CP1253
475 /* CP1253 is(?) more common and partially compatible */
478 /* Hebrew (ISO-8859-8) */
482 /* Compatible Microsoft superset */
485 /* Latin-5 Turkish (ISO-8859-9) */
488 /* Compatible Microsoft superset */
491 /* Latin-6 “North-European” languages (ISO-8859-10) */
492 /* It is so much north European that glibc only uses that for Luganda
493 * which is spoken in Uganda... unless someone complains, I'm not
494 * using this one; let's fallback to CP1252 here. */
495 /* ISO-8859-11 does arguably not exist. Thai is handled below. */
496 /* ISO-8859-12 really doesn't exist. */
498 /* Latin-7 Baltic languages (ISO-8859-13) */
/* NOTE(review): 'mi' sits in the Baltic group although its own FIXME doubts
 * that it belongs here -- confirm. */
501 case 'mi': /* FIXME: ??? that's in New Zealand, doesn't sound baltic */
502 /* Compatible Microsoft superset */
505 /* Latin-8 Celtic languages (ISO-8859-14) */
507 return "ISO-8859-14";
509 /* Latin-9 (ISO-8859-15) -> see Latin-1 */
510 /* Latin-10 (ISO-8859-16) does not seem to be used */
513 /* For Russian, we use CP1251 */
/* NOTE(review): vlc_encoding_from_language() in this file matches Japanese
 * as "ja"; 'jp' looks like the country code rather than the language code,
 * so a "ja_JP" locale would not reach this case -- confirm. */
520 case 'jp': /* Japanese */
521 /* Shift-JIS is way more common than EUC-JP */
523 case 'ko': /* Korean */
525 case 'th': /* Thai */
/* NOTE(review): vlc_encoding_from_language() in this file matches Vietnamese
 * as "vi"; 'vt' is probably a typo -- confirm. */
527 case 'vt': /* Vietnamese FIXME: information needed */
528 /* VISCII is probably a bad idea as it is not extended ASCII */
529 /* glibc has TCVN5712-1, but I could find no information on this one */
532 case 'kk': /* Kazakh FIXME: information needed */
535 case 'zh': /* Chinese, charset is country dependent */
/* NOTE(review): the country is read from locale + 3, which only makes sense
 * when locale[2] is the '_' separator; the `!=` below looks inverted, so
 * "zh_TW"/"zh_HK" would skip the country switch -- confirm. */
536 if( ( strlen( locale ) >= 5 ) && ( locale[2] != '_' ) )
537 switch( U16_AT( locale + 3 ) )
539 case 'HK': /* Hong Kong */
540 /* FIXME: use something else? */
543 case 'TW': /* Taiwan */
546 /* People's Republic of China */
549 * GB18030 can represent any Unicode code point
550 * (like UTF-8), while remaining compatible with GBK
551 * FIXME: is it compatible with GB2312? if not, should we
552 * use GB2312 instead?
/**
 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 * text files according to the system's locale settings. It is only a best
 * guess.
 *
 * @return a constant (or statically allocated) charset name; the caller
 *         must not free it
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang = NULL;

    /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale,
     * for which setlocale would just return "C": the environment is used
     * whenever setlocale() is unavailable or has nothing to say. */
#  if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
    psz_lang = setlocale( LC_ALL, NULL );
#  endif
    if( psz_lang == NULL || psz_lang[0] == '\0' )
    {
        psz_lang = getenv( "LC_ALL" );
        /* An LC_ALL that is set but empty must not hide LC_CTYPE/LANG */
        if( psz_lang == NULL || psz_lang[0] == '\0' )
        {
            psz_lang = getenv( "LC_CTYPE" );
            if( psz_lang == NULL || psz_lang[0] == '\0')
                psz_lang = getenv( "LANG" );
        }
    }

    return FindFallbackEncoding( psz_lang );
#else
    /*
     * This should be thread-safe given GetACP() should always return
     * the same result.
     */
    static char buf[2 + 10 + 1] = "";

    if( buf[0] == 0 )
        snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    return buf;
#endif
}
/**
 * There are two decimal separators in the computer world-wide locales:
 * dot (which is the American default), and comma (which is used in France,
 * the country with the most VLC developers, among others).
 *
 * i18n_strtod() has the same prototype as ANSI C strtod() but it accepts
 * either decimal separator when deserializing the string to a float number,
 * independent of the local computer setting.
 *
 * @param str string to parse
 * @param end if not NULL, receives a pointer into str, just past the last
 *            character that was consumed
 * @return the parsed value
 */
double i18n_strtod( const char *str, char **end )
{
    char *end_buf, e;
    double d;

    if( end == NULL )
        end = &end_buf;
    d = strtod( str, end );

    e = **end;
    if(( e == ',' ) || ( e == '.' ))
    {
        /* The parser stopped on the "other" separator: swap it for the one
         * of the current locale in a private copy and parse again. */
        size_t len = strlen( str );
        char *dup = malloc( len + 1 );
        char *dup_end;

        if( dup == NULL )
            return d; /* keep the result of the first pass */
        memcpy( dup, str, len + 1 );

        dup[*end - str] = ( e == ',' ) ? '.' : ',';
        d = strtod( dup, &dup_end );

        /* Report the end position within the caller's string, never within
         * the temporary copy that is about to be released. */
        *end = (char *)str + ( dup_end - dup );
        free( dup );
    }
    return d;
}
/**
 * i18n_atof() has the same prototype as ANSI C atof() but it accepts
 * either decimal separator when deserializing the string to a float number,
 * independent of the local computer setting.
 *
 * @param str string to parse
 * @return the value computed by i18n_strtod() for that string
 */
double i18n_atof( const char *str )
{
    /* Like atof(), the caller is not told where parsing stopped */
    const double d_value = i18n_strtod( str, NULL );

    return d_value;
}
/**
 * us_strtod() has the same prototype as ANSI C strtod() but it expects
 * a dot as decimal separator regardless of the system locale.
 *
 * @param str string to parse; a comma always ends the number
 * @param end if not NULL, receives a pointer into str, just past the last
 *            character that was consumed
 * @return the parsed value (0 if no temporary memory could be allocated)
 */
double us_strtod( const char *str, char **end )
{
    size_t len = strlen( str );
    char *dup = malloc( len + 1 );
    char *ptr;
    double d;

    if( dup == NULL )
    {
        /* Nothing could be parsed */
        if( end != NULL )
            *end = (char *)str;
        return 0.;
    }
    memcpy( dup, str, len + 1 );

    /* A comma is never part of a US-formatted number: cut the copy there so
     * that a comma-based locale cannot mistake it for the separator. */
    ptr = strchr( dup, ',' );
    if( ptr != NULL )
        *ptr = '\0';

    d = strtod( dup, &ptr );
    if( *ptr == '.' )
    {
        /* Parsing stopped on the dot, hence the current locale does not use
         * it as its separator: retry with a comma in its place. Offsets in
         * the copy remain valid for str as only one byte was swapped. */
        *ptr = ',';
        d = strtod( dup, &ptr );
    }

    if( end != NULL )
        *end = (char *)&str[ptr - dup];

    free( dup );
    return d;
}
/**
 * us_atof() has the same prototype as ANSI C atof() but it expects a dot
 * as decimal separator, regardless of the system locale.
 *
 * @param str string to parse
 * @return the value computed by us_strtod() for that string
 */
double us_atof( const char *str )
{
    /* Like atof(), the caller is not told where parsing stopped */
    const double d_value = us_strtod( str, NULL );

    return d_value;
}