git.sesse.net Git - vlc/blob - src/text/wincp.c

   1 /*****************************************************************************
   2  * wincp.c: Guessing "local" ANSI code page on Microsoft Windows®
   3  *****************************************************************************
   4  *
   5  * Copyright © 2006-2007 Rémi Denis-Courmont
   6  * $Id$
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  21  *****************************************************************************/
  22
  23 /*** We need your help to complete this file!! Look for FIXME ***/
  24
  25 #ifdef HAVE_CONFIG_H
  26 # include "config.h"
  27 #endif
  28
  29 #include <vlc_common.h>
  30
  31 #ifndef WIN32
  32 # include <locale.h>
  33 #else
  34 # include <windows.h>
  35 #endif
  36
  37 #ifdef __APPLE__
  38 #   include <errno.h>
  39 #   include <string.h>
  40 #endif
  41
  42 #include <vlc_charset.h>
  43
  44
  45 #ifndef WIN32 /* should work on Win32, but useless */
  46 static inline int locale_match (const char *tab, const char *locale)
  47 {
  48     for (;*tab; tab += 2)
  49         if (memcmp (tab, locale, 2) == 0)
  50             return 0;
  51     return 1;
  52 }
  53
  54
  55 /**
  56  * @return a fallback characters encoding to be used, given a locale.
  57  */
  58 static const char *FindFallbackEncoding (const char *locale)
  59 {
  60     if ((locale == NULL) || (strlen (locale) < 2)
  61      || !strcasecmp (locale, "POSIX"))
  62         return "CP1252"; /* Yeah, this is totally western-biased */
  63
  64
  65     /*** The ISO-8859 series (anything but Asia) ***/
  66     // Latin-1 Western-European languages (ISO-8859-1)
  67     static const char western[] =
  68         "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
  69         "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
  70         "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
  71         "eo" "mt" "cy";
  72     if (!locale_match (western, locale))
  73         return "CP1252"; // Compatible Microsoft superset
  74
  75     // Latin-2 Slavic languages (ISO-8859-2)
  76     static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
  77     if (!locale_match (slavic, locale))
  78         return "CP1250"; // CP1250 is more common, but incompatible
  79
  80     // Latin-3 Southern European languages (ISO-8859-3)
  81     // "eo" and "mt" -> Latin-1 instead, I presume(?).
  82     // "tr" -> ISO-8859-9 instead
  83
  84     // Latin-4 North-European languages (ISO-8859-4)
  85     // -> Latin-1 instead
  86
  87     /* Cyrillic alphabet languages (ISO-8859-5) */
  88     static const char cyrillic[] = "be" "bg" "mk" "ru" "sr" "mn";
  89     // FIXME: cyrillic only true for mn in Mongolia
  90     if (!locale_match (cyrillic, locale))
  91         return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)
  92
  93     /* Arabic (ISO-8859-6) */
  94     static const char arabic[] = "ar" "ckb" "fa";
  95     if (!locale_match (arabic, locale))
  96         // FIXME: someone check if we should return CP1256 or ISO-8859-6
  97         return "CP1256"; // CP1256 is(?) more common, but incompatible(?)
  98
  99     /* Greek (ISO-8859-7) */
 100     if (!locale_match ("el", locale))
 101         // FIXME: someone check if we should return CP1253 or ISO-8859-7
 102         return "CP1253"; // CP1253 is(?) more common and less incompatible
 103
 104     /* Hebrew (ISO-8859-8) */
 105     if (!locale_match ("he" "iw" "yi", locale))
 106         return "ISO-8859-8"; // CP1255 is reportedly screwed up
 107
 108     /* Latin-5 Turkish (ISO-8859-9) */
 109     if (!locale_match ("tr" "ku", locale))
 110         return "CP1254"; // Compatible Microsoft superset
 111
 112     /* Latin-6 “North-European” languages (ISO-8859-10) */
 113     /* It is so much north European that glibc only uses that for Luganda
 114      * which is spoken in Uganda... unless someone complains, I'm not
 115      * using this one; let's fallback to CP1252 here. */
 116
 117     // ISO-8859-11 does arguably not exist. Thai is handled below.
 118
 119     // ISO-8859-12 really doesn't exist.
 120
 121     // Latin-7 Baltic languages (ISO-8859-13)
 122     if (!locale_match ("lt" "lv" "mi", locale))
 123         // FIXME: mi = New Zealand, doesn't sound baltic!
 124         return "CP1257"; // Compatible Microsoft superset
 125
 126     // Latin-8 Celtic languages (ISO-8859-14)
 127     // "cy" -> use Latin-1 instead (most likely English or French)
 128
 129     // Latin-9 (ISO-8859-15) -> see Latin-1
 130
 131     // Latin-10 (ISO-8859-16) does not seem to be used
 132
 133     /*** KOI series ***/
 134     // For Russian, we use CP1251
 135     if (!locale_match ("uk", locale))
 136         return "KOI8-U";
 137
 138     if (!locale_match ("tg", locale))
 139         return "KOI8-T";
 140
 141     /*** Asia ***/
 142     // Japanese
 143     if (!locale_match ("jp", locale))
 144         return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP
 145
 146     // Korean
 147     if (!locale_match ("ko", locale))
 148         return "CP949"; // Microsoft non-standard superset of EUC-KR
 149
 150     // Thai
 151     static const char thai[] = "th" "km" "lo";
 152     //FIXME: afaik, khmer and lao are/were not in windows and are close to tahi
 153     if (!locale_match (thai, locale))
 154         return "TIS-620";
 155
 156     // Vietnamese (FIXME: more infos needed)
 157     if (!locale_match ("vt", locale))
 158         /* VISCII is probably a bad idea as it is not extended ASCII */
 159         /* glibc has TCVN5712-1 */
 160         return "CP1258";
 161
 162     /* Kazakh (FIXME: more infos needed) */
 163     if (!locale_match ("kk", locale))
 164         return "PT154";
 165
 166     // Chinese. The politically incompatible character sets.
 167     if (!locale_match ("zh", locale))
 168     {
 169         if ((strlen (locale) >= 5) && (locale[2] != '_'))
 170             locale += 3;
 171
 172         // Hong Kong
 173         if (!locale_match ("HK", locale))
 174             return "BIG5-HKSCS"; /* FIXME: use something else? */
 175
 176         // Taiwan island
 177         if (!locale_match ("TW", locale))
 178             return "BIG5";
 179
 180         // People's Republic of China and Singapore
 181         /*
 182          * GB18030 can represent any Unicode code point
 183          * (like UTF-8), while remaining compatible with GBK
 184          * FIXME: is it compatible with GB2312? if not, should we
 185          * use GB2312 instead?
 186          */
 187         return "GB18030";
 188     }
 189
 190     return "ASCII";
 191 }
 192 #endif
 193
 194 /**
 195  * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 196  * text files accord to the system's local settings. It is only a best
 197  * guess.
 198  */
 199 const char *GetFallbackEncoding( void )
 200 {
 201 #ifndef WIN32
 202     const char *psz_lang;
 203
 204     psz_lang = getenv ("LC_ALL");
 205     if ((psz_lang == NULL) || !*psz_lang)
 206     {
 207         psz_lang = getenv ("LC_CTYPE");
 208         if ((psz_lang == NULL) || !*psz_lang)
 209             psz_lang = getenv ("LANG");
 210     }
 211
 212     return FindFallbackEncoding (psz_lang);
 213 #else
 214     static char buf[16] = "";
 215
 216     if (buf[0] == 0)
 217     {
 218         int cp = GetACP ();
 219
 220         switch (cp)
 221         {
 222             case 1255: // Hebrew, CP1255 screws up somewhat
 223                 strcpy (buf, "ISO-8859-8");
 224                 break;
 225             default:
 226                 snprintf (buf, sizeof (buf), "CP%u", cp);
 227         }
 228     }
 229     return buf;
 230 #endif
 231 }