1 /*****************************************************************************
2 * charset.c: Locale's character encoding stuff.
3 *****************************************************************************
4 * See also unicode.c for Unicode to locale conversion helpers.
6 * Copyright (C) 2003-2006 the VideoLAN team
9 * Authors: Derk-Jan Hartman <thedj at users.sf.net>
11 * Rémi Denis-Courmont
13 * vlc_current_charset() an adaption of mp_locale_charset():
15 * Copyright (C) 2001-2003 The Mape Project
16 * Written by Karel Zak <zakkr@zf.jcu.cz>.
18 * which itself is an adaptation of locale_charset():
20 * Copyright (C) 2000-2002 Free Software Foundation, Inc.
21 * Written by Bruno Haible <bruno@clisp.org>.
23 * This program is free software; you can redistribute it and/or modify
24 * it under the terms of the GNU General Public License as published by
25 * the Free Software Foundation; either version 2 of the License, or
26 * (at your option) any later version.
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
33 * You should have received a copy of the GNU General Public License
34 * along with this program; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
36 *****************************************************************************/
42 #include <vlc_common.h>
45 # ifdef HAVE_LANGINFO_CODESET
46 # include <langinfo.h>
60 #include <vlc_charset.h>
/**
 * One entry of a charset alias table.
 *
 * psz_alias is the charset name to look for and psz_name is the name that
 * replaces it. A table ends with an entry whose psz_alias is NULL.
 */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;
    const char *psz_name;
} VLCCharsetAlias;
/*
 * libcharset loads all of its aliases from an external text file, which is
 * a strange and slow solution; we rather use array(s) compiled into the
 * source. With a "good" libc (Linux, for example) this is not needed.
 *
 * Please add only exotic aliases to this function. The libc 'iconv' already
 * knows a lot of basic aliases (check first with `iconv -l`).
 */
#if (defined OS2 || !defined(HAVE_LANGINFO_CODESET)) && !defined WIN32
/**
 * Guesses a character encoding from a locale name by looking for known
 * language (and sometimes country) codes in it.
 *
 * Codes are searched with strstr(), i.e. anywhere in the string, and the
 * table is scanned from top to bottom; the first code found wins. More
 * specific codes ("zh_TW") must therefore stay above more general ones
 * ("zh").
 *
 * @param l locale name such as "ru_RU"; may be NULL
 * @return a statically allocated encoding name, or NULL if no known code
 *         was found (or l is NULL)
 */
static const char* vlc_encoding_from_language( const char *l )
{
    static const struct
    {
        const char *psz_code;
        const char *psz_encoding;
    } map[] =
    {
        /* check for language (and perhaps country) codes */
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" }, /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" },
    };

    if( l == NULL )
        return NULL;

    for( size_t i = 0; i < sizeof( map ) / sizeof( map[0] ); i++ )
        if( strstr( l, map[i].psz_code ) != NULL )
            return map[i].psz_encoding;

    /* We don't know. Let the caller fall back to its default. */
    return NULL;
}
#endif
110 static const char* vlc_charset_aliases( const char *psz_name )
115 VLCCharsetAlias aliases[] =
118 { "CP1361", "JOHAB" },
119 { "CP20127", "ASCII" },
120 { "CP20866", "KOI8-R" },
121 { "CP21866", "KOI8-RU" },
122 { "CP28591", "ISO-8859-1" },
123 { "CP28592", "ISO-8859-2" },
124 { "CP28593", "ISO-8859-3" },
125 { "CP28594", "ISO-8859-4" },
126 { "CP28595", "ISO-8859-5" },
127 { "CP28596", "ISO-8859-6" },
128 { "CP28597", "ISO-8859-7" },
129 { "CP28598", "ISO-8859-8" },
130 { "CP28599", "ISO-8859-9" },
131 { "CP28605", "ISO-8859-15" },
134 #elif defined (SYS_AIX)
135 VLCCharsetAlias aliases[] =
137 { "IBM-850", "CP850" },
138 { "IBM-856", "CP856" },
139 { "IBM-921", "ISO-8859-13" },
140 { "IBM-922", "CP922" },
141 { "IBM-932", "CP932" },
142 { "IBM-943", "CP943" },
143 { "IBM-1046", "CP1046" },
144 { "IBM-1124", "CP1124" },
145 { "IBM-1129", "CP1129" },
146 { "IBM-1252", "CP1252" },
147 { "IBM-EUCCN", "GB2312" },
148 { "IBM-EUCJP", "EUC-JP" },
149 { "IBM-EUCKR", "EUC-KR" },
150 { "IBM-EUCTW", "EUC-TW" },
153 #elif defined (SYS_HPUX)
154 VLCCharsetAlias aliases[] =
156 { "ROMAN8", "HP-ROMAN8" },
157 { "ARABIC8", "HP-ARABIC8" },
158 { "GREEK8", "HP-GREEK8" },
159 { "HEBREW8", "HP-HEBREW8" },
160 { "TURKISH8", "HP-TURKISH8" },
161 { "KANA8", "HP-KANA8" },
162 { "HP15CN", "GB2312" },
165 #elif defined (SYS_IRIX)
166 VLCCharsetAlias aliases[] =
168 { "EUCCN", "GB2312" },
171 #elif defined (SYS_OSF)
172 VLCCharsetAlias aliases[] =
174 { "KSC5601", "CP949" },
175 { "SDECKANJI", "EUC-JP" },
176 { "TACTIS", "TIS-620" },
179 #elif defined (SYS_SOLARIS)
180 VLCCharsetAlias aliases[] =
183 { "CNS11643", "EUC-TW" },
184 { "5601", "EUC-KR" },
185 { "JOHAP92", "JOHAB" },
186 { "PCK", "SHIFT_JIS" },
187 { "2533", "TIS-620" },
190 #elif defined (SYS_BSD)
191 VLCCharsetAlias aliases[] =
194 { "EUCCN", "GB2312" },
198 VLCCharsetAlias aliases[] = {{NULL, NULL}};
201 for (a = aliases; a->psz_alias; a++)
202 if (strcasecmp (a->psz_alias, psz_name) == 0)
205 /* we return original name beacuse iconv() probably will know
206 * something better about name if we don't know it :-) */
#if (defined OS2 || !defined(HAVE_LANGINFO_CODESET)) && !defined WIN32
/**
 * Extracts the charset from a "language_COUNTRY.charset@modifier" string.
 *
 * When the locale carries no usable ".charset" part (none at all, empty, or
 * too long for the output buffer), the charset is guessed from the language
 * with vlc_encoding_from_language() instead.
 *
 * @param psz_locale  locale name; may be NULL
 * @param psz_charset output buffer of at least 2 + 10 + 1 bytes, the size
 *                    every caller in this file passes. It is always
 *                    NUL-terminated on return and is left empty when no
 *                    charset could be determined.
 */
static void vlc_encoding_from_locale( char *psz_locale, char *psz_charset )
{
    /* The function receives no buffer length, so every copy below is
     * bounded by the size the callers use. */
    enum { CHARSET_BUF_SIZE = 2 + 10 + 1 };

    psz_charset[0] = '\0';
    if( psz_locale == NULL )
        return;

    const char *psz_dot = strchr( psz_locale, '.' );

    if( psz_dot != NULL )
    {
        /* The charset starts right after the dot... */
        psz_dot++;

        /* ...and stops at the possible @... trailer, if any. */
        size_t i_len = strcspn( psz_dot, "@" );

        if( i_len > 0 && i_len < CHARSET_BUF_SIZE )
        {
            memcpy( psz_charset, psz_dot, i_len );
            psz_charset[i_len] = '\0';
            return;
        }
    }

    /* try language mapping; it yields NULL for unknown languages, in which
     * case the output stays empty */
    const char *psz_guess = vlc_encoding_from_language( psz_locale );

    if( psz_guess != NULL && strlen( psz_guess ) < CHARSET_BUF_SIZE )
        strcpy( psz_charset, psz_guess );
}
#endif
/**
 * Determines the character encoding of the current locale.
 *
 * The way the encoding is found depends on the platform (see the branches
 * below). The result then goes through vlc_charset_aliases(). If it is
 * still empty, the CHARSET environment variable and finally "ISO-8859-1"
 * are used.
 *
 * @param psz_charset if not NULL, receives a strdup() copy of the encoding
 *                    name, which the caller releases with free(); the copy
 *                    is NULL if the allocation failed
 * @return true if the encoding is "UTF8" or "UTF-8" (case-insensitive),
 *         false otherwise
 */
bool vlc_current_charset( char **psz_charset )
{
    const char *psz_codeset;

#if !(defined WIN32 || defined OS2 || defined __APPLE__)

# ifdef HAVE_LANGINFO_CODESET
    /* Most systems support nl_langinfo( CODESET ) nowadays. */
    psz_codeset = nl_langinfo( CODESET );
    if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
        psz_codeset = "ASCII";
# else
    /* On old systems which lack it, use setlocale or getenv. */
    const char *psz_locale = NULL;
    char buf[2 + 10 + 1];

    /* But most old systems don't have a complete set of locales. Some
     * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
     * use setlocale here; it would return "C" when it doesn't support the
     * locale name the user has set. Darwin's setlocale is broken.
     * NOTE(review): setlocale() is nevertheless called below whenever
     * HAVE_SETLOCALE is defined, which contradicts this comment. */
#  if defined (HAVE_SETLOCALE) && !defined (__APPLE__)
    psz_locale = setlocale( LC_ALL, NULL );
#  endif
    if( psz_locale == NULL || psz_locale[0] == '\0' )
    {
        psz_locale = getenv( "LC_ALL" );
        if( psz_locale == NULL || psz_locale[0] == '\0' )
        {
            psz_locale = getenv( "LC_CTYPE" );
            if( psz_locale == NULL || psz_locale[0] == '\0')
                psz_locale = getenv( "LANG" );
        }
    }

    /* On some old systems, one used to set locale = "iso8859_1". On others,
     * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
    if( psz_locale != NULL && psz_locale[0] != '\0' )
    {
        vlc_encoding_from_locale( (char *)psz_locale, buf );
        psz_codeset = buf;
    }
    else
        /* No locale name at all: fall through to the defaults below rather
         * than hand a NULL pointer to the parser. */
        psz_codeset = "";
# endif /* HAVE_LANGINFO_CODESET */

#elif defined __APPLE__

    /* Darwin is always using UTF-8 internally. */
    psz_codeset = "UTF-8";

#elif defined WIN32

    char buf[2 + 10 + 1];

    /* Woe32 has a function returning the locale's codepage as a number. */
    snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    psz_codeset = buf;

#elif defined OS2

    const char *psz_locale;
    char buf[2 + 10 + 1];
    /* NOTE(review): the types and sizes of cp/cplen are assumed from the
     * DosQueryCp() call below -- confirm against the OS/2 headers. */
    ULONG cp[3];
    ULONG cplen;

    /* Allow user to override the codeset, as set in the operating system,
     * with standard language environment variables. */
    psz_locale = getenv( "LC_ALL" );
    if( psz_locale == NULL || psz_locale[0] == '\0' )
    {
        psz_locale = getenv( "LC_CTYPE" );
        if( psz_locale == NULL || psz_locale[0] == '\0' )
            psz_locale = getenv( "LANG" );
    }

    if( psz_locale != NULL && psz_locale[0] != '\0' )
    {
        vlc_encoding_from_locale( (char *)psz_locale, buf );
        psz_codeset = buf;
    }
    /* OS/2 has a function returning the locale's codepage as a number. */
    else if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
        /* NOTE(review): a non-zero return is taken to mean failure. */
        psz_codeset = "";
    else
    {
        snprintf( buf, sizeof( buf ), "CP%lu", (unsigned long)cp[0] );
        psz_codeset = buf;
    }

#endif

    if( psz_codeset == NULL )
        /* The canonical name cannot be determined. */
        psz_codeset = "";
    else
        psz_codeset = vlc_charset_aliases( psz_codeset );

    /* Don't return an empty string. GNU libc and GNU libiconv interpret
     * the empty string as denoting "the locale's character encoding",
     * thus GNU libiconv would call this function a second time. */
    if( psz_codeset[0] == '\0' )
    {
        /* Last possibility is the 'CHARSET' environment variable; an empty
         * value is treated like an unset one. */
        psz_codeset = getenv( "CHARSET" );
        if( psz_codeset == NULL || psz_codeset[0] == '\0' )
            psz_codeset = "ISO-8859-1";
    }

    if( psz_charset != NULL )
        *psz_charset = strdup( psz_codeset );

    return !strcasecmp( psz_codeset, "UTF8" )
        || !strcasecmp( psz_codeset, "UTF-8" );
}
353 char *vlc_fix_readdir( const char *psz_string )
356 vlc_iconv_t hd = vlc_iconv_open( "UTF-8", "UTF-8-MAC" );
358 if (hd != (vlc_iconv_t)(-1))
360 const char *psz_in = psz_string;
361 size_t i_in = strlen(psz_in);
362 size_t i_out = i_in * 2;
363 char *psz_utf8 = malloc(i_out + 1);
364 char *psz_out = psz_utf8;
366 size_t i_ret = vlc_iconv (hd, &psz_in, &i_in, &psz_out, &i_out);
367 vlc_iconv_close (hd);
368 if( i_ret == (size_t)(-1) || i_in )
371 return strdup( psz_string );
378 return strdup( psz_string );
/**
 * us_strtod() has the same prototype as ANSI C strtod() but it uses the
 * POSIX/C decimal format, regardless of the current numeric locale.
 *
 * @param str string to parse
 * @param end if not NULL, receives the address of the first character that
 *            was not parsed, exactly as with strtod()
 * @return the parsed value
 */
double us_strtod( const char *str, char **end )
{
    /* Switch only the calling thread to a "C" numeric locale. If
     * newlocale() fails, loc is 0, uselocale() merely reports the current
     * locale, and the string is parsed with whatever locale is in effect. */
    locale_t loc = newlocale (LC_NUMERIC_MASK, "C", (locale_t)0);
    locale_t oldloc = uselocale (loc);
    double res = strtod (str, end);

    if (loc != (locale_t)0)
    {
        /* Restore the previous locale before releasing the temporary one. */
        uselocale (oldloc);
        freelocale (loc);
    }
    return res;
}
/**
 * us_atof() has the same prototype as ANSI C atof() but it expects a dot
 * as decimal separator, regardless of the system locale.
 *
 * @param str string to parse
 * @return the value parsed by us_strtod(); no end pointer is requested
 */
double us_atof( const char *str )
{
    return us_strtod( str, NULL );
}