git.sesse.net Git - vlc/blob - src/text/unicode.c

   1 /*****************************************************************************
   2  * unicode.c: Unicode <-> locale functions
   3  *****************************************************************************
   4  * Copyright (C) 2005-2006 the VideoLAN team
   5  * Copyright © 2005-2010 Rémi Denis-Courmont
   6  *
   7  * Authors: Rémi Denis-Courmont <rem # videolan.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27 #ifdef HAVE_CONFIG_H
  28 # include "config.h"
  29 #endif
  30
  31 #include <vlc_common.h>
  32
  33 #include "libvlc.h"
  34 #include <vlc_charset.h>
  35
  36 #include <assert.h>
  37
  38 #include <stdio.h>
  39 #include <stdarg.h>
  40 #include <stdlib.h>
  41 #include <sys/types.h>
  42 #ifdef UNDER_CE
  43 #  include <tchar.h>
  44 #elif defined(WIN32)
  45 #  include <io.h>
  46 #endif
  47 #include <errno.h>
  48 #include <wctype.h>
  49
  50 /**
  51  * Releases (if needed) a localized or uniformized string.
  52  * @param str non-NULL return value from FromLocale() or ToLocale().
  53  */
  54 void LocaleFree (const char *str)
  55 {
  56 #ifdef ASSUME_UTF8
  57     (void) str;
  58 #else
  59     free ((char *)str);
  60 #endif
  61 }
  62
  63
  64 /**
  65  * Converts a string from the system locale character encoding to UTF-8.
  66  *
  67  * @param locale nul-terminated string to convert
  68  *
  69  * @return a nul-terminated UTF-8 string, or NULL in case of error.
  70  * To avoid memory leak, you have to pass the result to LocaleFree()
  71  * when it is no longer needed.
  72  */
  73 char *FromLocale (const char *locale)
  74 {
  75 #ifdef ASSUME_UTF8
  76     return (char *)locale;
  77 #else
  78     return locale ? FromCharset ("", locale, strlen(locale)) : NULL;
  79 #endif
  80 }
  81
  82 /**
  83  * converts a string from the system locale character encoding to utf-8,
  84  * the result is always allocated on the heap.
  85  *
  86  * @param locale nul-terminated string to convert
  87  *
  88  * @return a nul-terminated utf-8 string, or null in case of error.
  89  * The result must be freed using free() - as with the strdup() function.
  90  */
  91 char *FromLocaleDup (const char *locale)
  92 {
  93 #ifdef ASSUME_UTF8
  94     return strdup (locale);
  95 #else
  96     return FromCharset ("", locale, strlen(locale));
  97 #endif
  98 }
  99
 100
 101 /**
 102  * ToLocale: converts an UTF-8 string to local system encoding.
 103  *
 104  * @param utf8 nul-terminated string to be converted
 105  *
 106  * @return a nul-terminated string, or NULL in case of error.
 107  * To avoid memory leak, you have to pass the result to LocaleFree()
 108  * when it is no longer needed.
 109  */
 110 char *ToLocale (const char *utf8)
 111 {
 112 #ifdef ASSUME_UTF8
 113     return (char *)utf8;
 114 #else
 115     size_t outsize;
 116     return utf8 ? ToCharset ("", utf8, &outsize) : NULL;
 117 #endif
 118 }
 119
 120
 121 /**
 122  * converts a string from UTF-8 to the system locale character encoding,
 123  * the result is always allocated on the heap.
 124  *
 125  * @param utf8 nul-terminated string to convert
 126  *
 127  * @return a nul-terminated string, or null in case of error.
 128  * The result must be freed using free() - as with the strdup() function.
 129  */
 130 char *ToLocaleDup (const char *utf8)
 131 {
 132 #ifdef ASSUME_UTF8
 133     return strdup (utf8);
 134 #else
 135     size_t outsize;
 136     return ToCharset ("", utf8, &outsize);
 137 #endif
 138 }
 139
 140 /**
 141  * Formats an UTF-8 string as vfprintf(), then print it, with
 142  * appropriate conversion to local encoding.
 143  */
 144 int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
 145 {
 146 #ifdef ASSUME_UTF8
 147     return vfprintf (stream, fmt, ap);
 148 #else
 149     char *str;
 150     int res;
 151
 152 # if defined( WIN32 ) && !defined( UNDER_CE )
 153     /* Writing to the console is a lot of fun on Microsoft Windows.
 154      * If you use the standard I/O functions, you must use the OEM code page,
 155      * which is different from the usual ANSI code page. Or maybe not, if the
 156      * user called "chcp". Anyway, we prefer Unicode. */
 157     int fd = _fileno (stream);
 158     if (likely(fd != -1) && _isatty (fd))
 159     {
 160         res = vasprintf (&str, fmt, ap);
 161         if (unlikely(res == -1))
 162             return -1;
 163
 164         size_t wlen = 2 * (res + 1);
 165         wchar_t *wide = malloc (wlen);
 166         if (likely(wide != NULL))
 167         {
 168             wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1, wide, wlen);
 169             if (wlen > 0)
 170             {
 171                 HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
 172                 DWORD out;
 173
 174                 WriteConsoleW (h, wide, wlen - 1, &out, NULL);
 175             }
 176             else
 177                 res = -1;
 178             free (wide);
 179         }
 180         else
 181             res = -1;
 182         free (str);
 183         return res;
 184     }
 185 # endif
 186
 187     res = vasprintf (&str, fmt, ap);
 188     if (unlikely(res == -1))
 189         return -1;
 190
 191     char *ansi = ToLocaleDup (str);
 192     free (str);
 193
 194     if (ansi == NULL)
 195         return -1;
 196     fputs (ansi, stream);
 197     free (ansi);
 198     return res;
 199 #endif
 200 }
 201
 202 /**
 203  * Formats an UTF-8 string as fprintf(), then print it, with
 204  * appropriate conversion to local encoding.
 205  */
 206 int utf8_fprintf( FILE *stream, const char *fmt, ... )
 207 {
 208     va_list ap;
 209     int res;
 210
 211     va_start( ap, fmt );
 212     res = utf8_vfprintf( stream, fmt, ap );
 213     va_end( ap );
 214     return res;
 215 }
 216
 217
 218 /**
 219  * Converts the first character from a UTF-8 sequence into a code point.
 220  *
 221  * @param str an UTF-8 bytes sequence
 222  * @return 0 if str points to an empty string, i.e. the first character is NUL;
 223  * number of bytes that the first character occupies (from 1 to 4) otherwise;
 224  * -1 if the byte sequence was not a valid UTF-8 sequence.
 225  */
 226 size_t vlc_towc (const char *str, uint32_t *restrict pwc)
 227 {
 228     uint8_t *ptr = (uint8_t *)str;
 229     assert (str != NULL);
 230
 231     uint8_t c = ptr[0];
 232
 233     if (unlikely(c == '\0'))
 234     {
 235         *pwc = 0;
 236         return 0;
 237     }
 238
 239     if (unlikely(c > 0xF4))
 240         return -1;
 241
 242     int charlen = clz8 (c ^ 0xFF);
 243     switch (charlen)
 244     {
 245         case 0: // 7-bit ASCII character -> OK
 246             *pwc = c;
 247             return 1;
 248
 249         case 1: // continuation byte -> error
 250             return -1;
 251     }
 252
 253     assert (charlen >= 2 && charlen <= 4);
 254
 255     uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
 256     for (int i = 1; i < charlen; i++)
 257     {
 258         assert (cp < (1 << 26));
 259         c = ptr[i];
 260
 261         if (unlikely((c >> 6) != 2)) // not a continuation byte
 262             return -1;
 263
 264         cp = (cp << 6) | (ptr[i] & 0x3f);
 265     }
 266
 267     switch (charlen)
 268     {
 269         case 4:
 270             if (unlikely(cp > 0x10FFFF)) // beyond Unicode
 271                 return -1;
 272         case 3:
 273             if (unlikely(cp >= 0xD800 && cp < 0xC000)) // UTF-16 surrogate
 274                 return -1;
 275         case 2:
 276             if (unlikely(cp < 128)) // ASCII overlong
 277                 return -1;
 278             if (unlikely(cp < (1u << (5 * charlen - 3)))) // overlong
 279                 return -1;
 280     }
 281     *pwc = cp;
 282     return charlen;
 283 }
 284
 285 /**
 286  * Look for an UTF-8 string within another one in a case-insensitive fashion.
 287  * Beware that this is quite slow. Contrary to strcasestr(), this function
 288  * works regardless of the system character encoding, and handles multibyte
 289  * code points correctly.
 290
 291  * @param haystack string to look into
 292  * @param needle string to look for
 293  * @return a pointer to the first occurence of the needle within the haystack,
 294  * or NULL if no occurence were found.
 295  */
 296 char *vlc_strcasestr (const char *haystack, const char *needle)
 297 {
 298     ssize_t s;
 299
 300     do
 301     {
 302         const char *h = haystack, *n = needle;
 303
 304         for (;;)
 305         {
 306             uint32_t cph, cpn;
 307
 308             s = vlc_towc (n, &cpn);
 309             if (s == 0)
 310                 return (char *)haystack;
 311             if (unlikely(s < 0))
 312                 return NULL;
 313             n += s;
 314
 315             s = vlc_towc (h, &cph);
 316             if (s <= 0 || towlower (cph) != towlower (cpn))
 317                 break;
 318             h += s;
 319         }
 320
 321         s = vlc_towc (haystack, &(uint32_t) { 0 });
 322         haystack += s;
 323     }
 324     while (s != 0);
 325
 326     return NULL;
 327 }
 328
 329 /**
 330  * Replaces invalid/overlong UTF-8 sequences with question marks.
 331  * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
 332  * so we don't try that, even though it would be less disruptive.
 333  *
 334  * @return str if it was valid UTF-8, NULL if not.
 335  */
 336 char *EnsureUTF8( char *str )
 337 {
 338     char *ret = str;
 339     size_t n;
 340     uint32_t cp;
 341
 342     while ((n = vlc_towc (str, &cp)) != 0)
 343         if (likely(n != (size_t)-1))
 344             str += n;
 345         else
 346         {
 347             *str++ = '?';
 348             ret = NULL;
 349         }
 350     return ret;
 351 }
 352
 353
 354 /**
 355  * Checks whether a string is a valid UTF-8 byte sequence.
 356  *
 357  * @param str nul-terminated string to be checked
 358  *
 359  * @return str if it was valid UTF-8, NULL if not.
 360  */
 361 const char *IsUTF8( const char *str )
 362 {
 363     size_t n;
 364     uint32_t cp;
 365
 366     while ((n = vlc_towc (str, &cp)) != 0)
 367         if (likely(n != (size_t)-1))
 368             str += n;
 369         else
 370             return NULL;
 371     return str;
 372 }
 373
 374 /**
 375  * Converts a string from the given character encoding to utf-8.
 376  *
 377  * @return a nul-terminated utf-8 string, or null in case of error.
 378  * The result must be freed using free().
 379  */
 380 char *FromCharset(const char *charset, const void *data, size_t data_size)
 381 {
 382     vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
 383     if (handle == (vlc_iconv_t)(-1))
 384         return NULL;
 385
 386     char *out = NULL;
 387     for(unsigned mul = 4; mul < 8; mul++ )
 388     {
 389         size_t in_size = data_size;
 390         const char *in = data;
 391         size_t out_max = mul * data_size;
 392         char *tmp = out = malloc (1 + out_max);
 393         if (!out)
 394             break;
 395
 396         if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
 397             *tmp = '\0';
 398             break;
 399         }
 400         free(out);
 401         out = NULL;
 402
 403         if (errno != E2BIG)
 404             break;
 405     }
 406     vlc_iconv_close(handle);
 407     return out;
 408 }
 409
 410 /**
 411  * Converts a nul-terminated UTF-8 string to a given character encoding.
 412  * @param charset iconv name of the character set
 413  * @param in nul-terminated UTF-8 string
 414  * @param outsize pointer to hold the byte size of result
 415  *
 416  * @return A pointer to the result, which must be released using free().
 417  * The UTF-8 nul terminator is included in the conversion if the target
 418  * character encoding supports it. However it is not included in the returned
 419  * byte size.
 420  * In case of error, NULL is returned and the byte size is undefined.
 421  */
 422 void *ToCharset(const char *charset, const char *in, size_t *outsize)
 423 {
 424     vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
 425     if (hd == (vlc_iconv_t)(-1))
 426         return NULL;
 427
 428     const size_t inlen = strlen (in);
 429     void *res;
 430
 431     for (unsigned mul = 4; mul < 16; mul++)
 432     {
 433         size_t outlen = mul * (inlen + 1);
 434         res = malloc (outlen);
 435         if (unlikely(res == NULL))
 436             break;
 437
 438         const char *inp = in;
 439         char *outp = res;
 440         size_t inb = inlen;
 441         size_t outb = outlen - mul;
 442
 443         if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
 444         {
 445             *outsize = outlen - mul - outb;
 446             outb += mul;
 447             inb = 1; /* append nul terminator if possible */
 448             if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
 449                 break;
 450             if (errno == EILSEQ) /* cannot translate nul terminator!? */
 451                 break;
 452         }
 453
 454         free (res);
 455         res = NULL;
 456         if (errno != E2BIG) /* conversion failure */
 457             break;
 458     }
 459     vlc_iconv_close (hd);
 460     return res;
 461 }
 462