git.sesse.net Git - vlc/blob - src/text/unicode.c

   1 /*****************************************************************************
   2  * unicode.c: Unicode <-> locale functions
   3  *****************************************************************************
   4  * Copyright (C) 2005-2006 VLC authors and VideoLAN
   5  * Copyright © 2005-2010 Rémi Denis-Courmont
   6  *
   7  * Authors: Rémi Denis-Courmont <rem # videolan.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify it
  10  * under the terms of the GNU Lesser General Public License as published by
  11  * the Free Software Foundation; either version 2.1 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public License
  20  * along with this program; if not, write to the Free Software Foundation,
  21  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 /*****************************************************************************
  25  * Preamble
  26  *****************************************************************************/
  27 #ifdef HAVE_CONFIG_H
  28 # include "config.h"
  29 #endif
  30
  31 #include <vlc_common.h>
  32
  33 #include "libvlc.h"
  34 #include <vlc_charset.h>
  35
  36 #include <assert.h>
  37
  38 #include <stdio.h>
  39 #include <stdarg.h>
  40 #include <stdlib.h>
  41 #include <sys/types.h>
  42 #ifdef UNDER_CE
  43 #  include <tchar.h>
  44 #elif defined(WIN32)
  45 #  include <io.h>
  46 #endif
  47 #include <errno.h>
  48 #include <wctype.h>
  49
  50 /**
  51  * Formats an UTF-8 string as vfprintf(), then print it, with
  52  * appropriate conversion to local encoding.
  53  */
  54 int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
  55 {
  56 #ifndef WIN32
  57     return vfprintf (stream, fmt, ap);
  58 #else
  59     char *str;
  60     int res = vasprintf (&str, fmt, ap);
  61     if (unlikely(res == -1))
  62         return -1;
  63
  64 # ifndef UNDER_CE
  65     /* Writing to the console is a lot of fun on Microsoft Windows.
  66      * If you use the standard I/O functions, you must use the OEM code page,
  67      * which is different from the usual ANSI code page. Or maybe not, if the
  68      * user called "chcp". Anyway, we prefer Unicode. */
  69     int fd = _fileno (stream);
  70     if (likely(fd != -1) && _isatty (fd))
  71     {
  72         wchar_t *wide = ToWide (str);
  73         if (likely(wide != NULL))
  74         {
  75             HANDLE h = (HANDLE)((uintptr_t)_get_osfhandle (fd));
  76             DWORD out;
  77             /* XXX: It is not clear whether WriteConsole() wants the number of
  78              * Unicode characters or the size of the wchar_t array. */
  79             BOOL ok = WriteConsoleW (h, wide, wcslen (wide), &out, NULL);
  80             free (wide);
  81             if (ok)
  82                 goto out;
  83         }
  84     }
  85 # endif
  86     char *ansi = ToANSI (str);
  87     if (ansi != NULL)
  88     {
  89         fputs (ansi, stream);
  90         free (ansi);
  91     }
  92     else
  93         res = -1;
  94 out:
  95     free (str);
  96     return res;
  97 #endif
  98 }
  99
 100 /**
 101  * Formats an UTF-8 string as fprintf(), then print it, with
 102  * appropriate conversion to local encoding.
 103  */
 104 int utf8_fprintf( FILE *stream, const char *fmt, ... )
 105 {
 106     va_list ap;
 107     int res;
 108
 109     va_start( ap, fmt );
 110     res = utf8_vfprintf( stream, fmt, ap );
 111     va_end( ap );
 112     return res;
 113 }
 114
 115
 116 /**
 117  * Converts the first character from a UTF-8 sequence into a code point.
 118  *
 119  * @param str an UTF-8 bytes sequence
 120  * @return 0 if str points to an empty string, i.e. the first character is NUL;
 121  * number of bytes that the first character occupies (from 1 to 4) otherwise;
 122  * -1 if the byte sequence was not a valid UTF-8 sequence.
 123  */
 124 size_t vlc_towc (const char *str, uint32_t *restrict pwc)
 125 {
 126     uint8_t *ptr = (uint8_t *)str, c;
 127     uint32_t cp;
 128
 129     assert (str != NULL);
 130
 131     c = *ptr;
 132     if (unlikely(c > 0xF4))
 133         return -1;
 134
 135     int charlen = clz8 (c ^ 0xFF);
 136     switch (charlen)
 137     {
 138         case 0: // 7-bit ASCII character -> short cut
 139             *pwc = c;
 140             return c != '\0';
 141
 142         case 1: // continuation byte -> error
 143             return -1;
 144
 145         case 2:
 146             if (unlikely(c < 0xC2)) // ASCII overlong
 147                 return -1;
 148             cp = (c & 0x1F) << 6;
 149             break;
 150
 151         case 3:
 152             cp = (c & 0x0F) << 12;
 153             break;
 154
 155         case 4:
 156             cp = (c & 0x07) << 16;
 157             break;
 158
 159         default:
 160             assert (0);
 161     }
 162
 163     /* Unrolled continuation bytes decoding */
 164     switch (charlen)
 165     {
 166         case 4:
 167             c = *++ptr;
 168             if (unlikely((c >> 6) != 2)) // not a continuation byte
 169                 return -1;
 170             cp |= (c & 0x3f) << 12;
 171
 172             if (unlikely(cp >= 0x110000)) // beyond Unicode range
 173                 return -1;
 174             /* fall through */
 175         case 3:
 176             c = *++ptr;
 177             if (unlikely((c >> 6) != 2)) // not a continuation byte
 178                 return -1;
 179             cp |= (c & 0x3f) << 6;
 180
 181             if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
 182                 return -1;
 183             if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
 184                 return -1;
 185             /* fall through */
 186         case 2:
 187             c = *++ptr;
 188             if (unlikely((c >> 6) != 2)) // not a continuation byte
 189                 return -1;
 190             cp |= (c & 0x3f);
 191             break;
 192     }
 193
 194     *pwc = cp;
 195     return charlen;
 196 }
 197
 198 /**
 199  * Look for an UTF-8 string within another one in a case-insensitive fashion.
 200  * Beware that this is quite slow. Contrary to strcasestr(), this function
 201  * works regardless of the system character encoding, and handles multibyte
 202  * code points correctly.
 203
 204  * @param haystack string to look into
 205  * @param needle string to look for
 206  * @return a pointer to the first occurence of the needle within the haystack,
 207  * or NULL if no occurence were found.
 208  */
 209 char *vlc_strcasestr (const char *haystack, const char *needle)
 210 {
 211     ssize_t s;
 212
 213     do
 214     {
 215         const char *h = haystack, *n = needle;
 216
 217         for (;;)
 218         {
 219             uint32_t cph, cpn;
 220
 221             s = vlc_towc (n, &cpn);
 222             if (s == 0)
 223                 return (char *)haystack;
 224             if (unlikely(s < 0))
 225                 return NULL;
 226             n += s;
 227
 228             s = vlc_towc (h, &cph);
 229             if (s <= 0 || towlower (cph) != towlower (cpn))
 230                 break;
 231             h += s;
 232         }
 233
 234         s = vlc_towc (haystack, &(uint32_t) { 0 });
 235         haystack += s;
 236     }
 237     while (s > 0);
 238
 239     return NULL;
 240 }
 241
 242 /**
 243  * Replaces invalid/overlong UTF-8 sequences with question marks.
 244  * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
 245  * so we don't try that, even though it would be less disruptive.
 246  *
 247  * @return str if it was valid UTF-8, NULL if not.
 248  */
 249 char *EnsureUTF8( char *str )
 250 {
 251     char *ret = str;
 252     size_t n;
 253     uint32_t cp;
 254
 255     while ((n = vlc_towc (str, &cp)) != 0)
 256         if (likely(n != (size_t)-1))
 257             str += n;
 258         else
 259         {
 260             *str++ = '?';
 261             ret = NULL;
 262         }
 263     return ret;
 264 }
 265
 266
 267 /**
 268  * Checks whether a string is a valid UTF-8 byte sequence.
 269  *
 270  * @param str nul-terminated string to be checked
 271  *
 272  * @return str if it was valid UTF-8, NULL if not.
 273  */
 274 const char *IsUTF8( const char *str )
 275 {
 276     size_t n;
 277     uint32_t cp;
 278
 279     while ((n = vlc_towc (str, &cp)) != 0)
 280         if (likely(n != (size_t)-1))
 281             str += n;
 282         else
 283             return NULL;
 284     return str;
 285 }
 286
 287 /**
 288  * Converts a string from the given character encoding to utf-8.
 289  *
 290  * @return a nul-terminated utf-8 string, or null in case of error.
 291  * The result must be freed using free().
 292  */
 293 char *FromCharset(const char *charset, const void *data, size_t data_size)
 294 {
 295     vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
 296     if (handle == (vlc_iconv_t)(-1))
 297         return NULL;
 298
 299     char *out = NULL;
 300     for(unsigned mul = 4; mul < 8; mul++ )
 301     {
 302         size_t in_size = data_size;
 303         const char *in = data;
 304         size_t out_max = mul * data_size;
 305         char *tmp = out = malloc (1 + out_max);
 306         if (!out)
 307             break;
 308
 309         if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
 310             *tmp = '\0';
 311             break;
 312         }
 313         free(out);
 314         out = NULL;
 315
 316         if (errno != E2BIG)
 317             break;
 318     }
 319     vlc_iconv_close(handle);
 320     return out;
 321 }
 322
 323 /**
 324  * Converts a nul-terminated UTF-8 string to a given character encoding.
 325  * @param charset iconv name of the character set
 326  * @param in nul-terminated UTF-8 string
 327  * @param outsize pointer to hold the byte size of result
 328  *
 329  * @return A pointer to the result, which must be released using free().
 330  * The UTF-8 nul terminator is included in the conversion if the target
 331  * character encoding supports it. However it is not included in the returned
 332  * byte size.
 333  * In case of error, NULL is returned and the byte size is undefined.
 334  */
 335 void *ToCharset(const char *charset, const char *in, size_t *outsize)
 336 {
 337     vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
 338     if (hd == (vlc_iconv_t)(-1))
 339         return NULL;
 340
 341     const size_t inlen = strlen (in);
 342     void *res;
 343
 344     for (unsigned mul = 4; mul < 16; mul++)
 345     {
 346         size_t outlen = mul * (inlen + 1);
 347         res = malloc (outlen);
 348         if (unlikely(res == NULL))
 349             break;
 350
 351         const char *inp = in;
 352         char *outp = res;
 353         size_t inb = inlen;
 354         size_t outb = outlen - mul;
 355
 356         if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
 357         {
 358             *outsize = outlen - mul - outb;
 359             outb += mul;
 360             inb = 1; /* append nul terminator if possible */
 361             if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
 362                 break;
 363             if (errno == EILSEQ) /* cannot translate nul terminator!? */
 364                 break;
 365         }
 366
 367         free (res);
 368         res = NULL;
 369         if (errno != E2BIG) /* conversion failure */
 370             break;
 371     }
 372     vlc_iconv_close (hd);
 373     return res;
 374 }
 375