git.sesse.net Git - vlc/blob - src/misc/unicode.c

   1 /*****************************************************************************
   2  * unicode.c: Unicode <-> locale functions
   3  *****************************************************************************
   4  * Copyright (C) 2005-2006 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Rémi Denis-Courmont <rem # videolan.org>
   8  *
   9  * UTF16toUTF8() adapted from Perl 5 (also GPL'd)
  10  * Copyright (C) 1998-2002, Larry Wall
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * This program is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License
  23  * along with this program; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  25  *****************************************************************************/
  26
  27 /*****************************************************************************
  28  * Preamble
  29  *****************************************************************************/
  30 #include <vlc/vlc.h>
  31 #include "charset.h"
  32
  33 #include <assert.h>
  34
  35 #include <stdio.h>
  36 #include <stdarg.h>
  37 #include <errno.h>
  38 #include <sys/types.h>
  39 #ifdef HAVE_DIRENT_H
  40 #  include <dirent.h>
  41 #endif
  42 #ifdef UNDER_CE
  43 #  include <tchar.h>
  44 #endif
  45 #ifdef HAVE_SYS_STAT_H
  46 # include <sys/stat.h>
  47 #endif
  48 #ifndef HAVE_LSTAT
  49 # define lstat( a, b ) stat(a, b)
  50 #endif
  51
  52 #ifdef __APPLE__
/* Define this if the OS always uses UTF-8 internally */
  54 # define ASSUME_UTF8 1
  55 #endif
  56
  57 #ifndef ASSUME_UTF8
  58 # if defined (HAVE_ICONV)
  59 /* libiconv is more powerful than Win32 API (it has translit) */
  60 #  define USE_ICONV 1
  61 # elif defined (WIN32) || defined (UNDER_CE)
  62 #  define USE_MB2MB 1
  63 # else
  64 #  error No UTF8 charset conversion implemented on this platform!
  65 # endif
  66 #endif
  67
  68 #ifdef USE_ICONV
  69 static struct {
  70     vlc_iconv_t hd;
  71     vlc_mutex_t lock;
  72 } from_locale, to_locale;
  73 #endif
  74
  75 void LocaleInit( vlc_object_t *p_this )
  76 {
  77 #ifdef USE_ICONV
  78     char *psz_charset;
  79
  80     if( vlc_current_charset( &psz_charset ) )
  81         /* UTF-8 */
  82         from_locale.hd = to_locale.hd = (vlc_iconv_t)(-1);
  83     else
  84     {
  85         /* not UTF-8 */
  86         char psz_buf[strlen( psz_charset ) + sizeof( "//translit" )];
  87         const char *psz_conv;
  88
  89         /*
  90          * Still allow non-ASCII characters when the locale is not set.
  91          * Western Europeans are being favored for historical reasons.
  92          */
  93         if( strcmp( psz_charset, "ASCII" ) )
  94         {
  95             sprintf( psz_buf, "%s//translit", psz_charset );
  96             psz_conv = psz_buf;
  97         }
  98         else
  99             psz_conv = "ISO-8859-1//translit";
 100
 101         vlc_mutex_init( p_this, &from_locale.lock );
 102         vlc_mutex_init( p_this, &to_locale.lock );
 103         from_locale.hd = vlc_iconv_open( "UTF-8", psz_conv );
 104         to_locale.hd = vlc_iconv_open( psz_conv, "UTF-8" );
 105     }
 106
 107     free( psz_charset );
 108
 109     assert( (from_locale.hd == (vlc_iconv_t)(-1))
 110             == (to_locale.hd == (vlc_iconv_t)(-1)) );
 111 #else
 112     (void)p_this;
 113 #endif
 114 }
 115
 116 void LocaleDeinit( void )
 117 {
 118 #ifdef USE_ICONV
 119     if( to_locale.hd != (vlc_iconv_t)(-1) )
 120     {
 121         vlc_iconv_close( to_locale.hd );
 122         vlc_mutex_destroy( &to_locale.lock );
 123     }
 124
 125     if( from_locale.hd != (vlc_iconv_t)(-1) )
 126     {
 127         vlc_iconv_close( from_locale.hd );
 128         vlc_mutex_destroy( &from_locale.lock );
 129     }
 130 #endif
 131 }
 132
 133 #ifdef USE_MB2MB
 134 static char *MB2MB( const char *string, UINT fromCP, UINT toCP )
 135 {
 136     char *out;
 137     wchar_t *wide;
 138     int len;
 139
 140     len = MultiByteToWideChar( fromCP, 0, string, -1, NULL, 0 );
 141     assert( len > 0 );
 142     wide = (wchar_t *)malloc (len * sizeof (wchar_t));
 143     if( wide == NULL )
 144         return NULL;
 145
 146     MultiByteToWideChar( fromCP, 0, string, -1, wide, len );
 147     len = WideCharToMultiByte( toCP, 0, wide, -1, NULL, 0, NULL, NULL );
 148     assert( len > 0 );
 149     out = malloc( len );
 150
 151     WideCharToMultiByte( toCP, 0, wide, -1, out, len, NULL, NULL );
 152     free( wide );
 153     return out;
 154 }
 155 #endif
 156
 157 /**
 158  * FromLocale: converts a locale string to UTF-8
 159  *
 160  * @param locale nul-terminated string to be converted
 161  *
 162  * @return a nul-terminated UTF-8 string, or NULL in case of error.
 163  * To avoid memory leak, you have to pass the result to LocaleFree()
 164  * when it is no longer needed.
 165  */
 166 char *FromLocale( const char *locale )
 167 {
 168     if( locale == NULL )
 169         return NULL;
 170
 171 #ifndef USE_MB2MB
 172 # ifdef USE_ICONV
 173     if( from_locale.hd != (vlc_iconv_t)(-1) )
 174     {
 175         const char *iptr = locale;
 176         size_t inb = strlen( locale );
 177         size_t outb = inb * 6 + 1;
 178         char output[outb], *optr = output;
 179
 180         vlc_mutex_lock( &from_locale.lock );
 181         vlc_iconv( from_locale.hd, NULL, NULL, NULL, NULL );
 182
 183         while( vlc_iconv( from_locale.hd, &iptr, &inb, &optr, &outb )
 184                == (size_t)-1 )
 185         {
 186             *optr++ = '?';
 187             outb--;
 188             iptr++;
 189             inb--;
 190             vlc_iconv( from_locale.hd, NULL, NULL, NULL, NULL );
 191         }
 192         vlc_mutex_unlock( &from_locale.lock );
 193         *optr = '\0';
 194
 195         assert (inb == 0);
 196         assert (*iptr == '\0');
 197         assert (*optr == '\0');
 198         assert (strlen( output ) == (size_t)(optr - output));
 199         return strdup( output );
 200     }
 201 # endif /* USE_ICONV */
 202     return (char *)locale;
 203 #else /* MB2MB */
 204     return MB2MB( locale, CP_ACP, CP_UTF8 );
 205 #endif
 206 }
 207
 208 char *FromLocaleDup( const char *locale )
 209 {
 210 #if defined (ASSUME_UTF8)
 211     return strdup( locale );
 212 #else
 213 # ifdef USE_ICONV
 214     if (from_locale.hd == (vlc_iconv_t)(-1))
 215         return strdup( locale );
 216 # endif
 217     return FromLocale( locale );
 218 #endif
 219 }
 220
 221
 222 /**
 223  * ToLocale: converts a UTF-8 string to local system encoding.
 224  *
 225  * @param utf8 nul-terminated string to be converted
 226  *
 227  * @return a nul-terminated string, or NULL in case of error.
 228  * To avoid memory leak, you have to pass the result to LocaleFree()
 229  * when it is no longer needed.
 230  */
 231 char *ToLocale( const char *utf8 )
 232 {
 233     if( utf8 == NULL )
 234         return NULL;
 235
 236 #ifndef USE_MB2MB
 237 # ifdef USE_ICONV
 238     if( to_locale.hd != (vlc_iconv_t)(-1) )
 239     {
 240         const char *iptr = utf8;
 241         size_t inb = strlen( utf8 );
 242         /* FIXME: I'm not sure about the value for the multiplication
 243         * (for western people, multiplication is not needed) */
 244         size_t outb = inb * 2 + 1;
 245
 246         char output[outb], *optr = output;
 247
 248         vlc_mutex_lock( &to_locale.lock );
 249         vlc_iconv( to_locale.hd, NULL, NULL, NULL, NULL );
 250
 251         while( vlc_iconv( to_locale.hd, &iptr, &inb, &optr, &outb )
 252                == (size_t)-1 )
 253         {
 254             *optr++ = '?'; /* should not happen, and yes, it sucks */
 255             outb--;
 256             iptr++;
 257             inb--;
 258             vlc_iconv( to_locale.hd, NULL, NULL, NULL, NULL );
 259         }
 260         vlc_mutex_unlock( &to_locale.lock );
 261         *optr = '\0';
 262
 263         assert (inb == 0);
 264         assert (*iptr == '\0');
 265         assert (*optr == '\0');
 266         assert (strlen( output ) == (size_t)(optr - output));
 267         return strdup( output );
 268     }
 269 # endif /* USE_ICONV */
 270     return (char *)utf8;
 271 #else /* MB2MB */
 272     return MB2MB( utf8, CP_UTF8, CP_ACP );
 273 #endif
 274 }
 275
 276 char *ToLocaleDup( const char *utf8 )
 277 {
 278 #if defined (ASSUME_UTF8)
 279     return strdup( utf8 );
 280 #else
 281 # ifdef USE_ICONV
 282     if (to_locale.hd == (vlc_iconv_t)(-1))
 283         return strdup( utf8 );
 284 # endif
 285     return ToLocale( utf8 );
 286 #endif
 287 }
 288
 289 void LocaleFree( const char *str )
 290 {
 291 #ifdef USE_ICONV
 292     if( to_locale.hd == (vlc_iconv_t)(-1) )
 293         return;
 294 #endif
 295
 296 #ifndef ASSUME_UTF8
 297     if( str != NULL )
 298         free( (char *)str );
 299 #endif
 300 }
 301
 302 /*****************************************************************************
 303  * utf8_fopen: Calls fopen() after conversion of file name to OS locale
 304  *****************************************************************************/
 305 FILE *utf8_fopen( const char *filename, const char *mode )
 306 {
 307 #if !(defined (WIN32) || defined (UNDER_CE))
 308     const char *local_name = ToLocale( filename );
 309
 310     if( local_name != NULL )
 311     {
 312         FILE *stream = fopen( local_name, mode );
 313         LocaleFree( local_name );
 314         return stream;
 315     }
 316     else
 317         errno = ENOENT;
 318     return NULL;
 319 #else
 320     wchar_t wpath[MAX_PATH + 1];
 321     size_t len = strlen( mode ) + 1;
 322     wchar_t wmode[len];
 323
 324     if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH )
 325      || !MultiByteToWideChar( CP_ACP, 0, mode, len, wmode, len ) )
 326     {
 327         errno = ENOENT;
 328         return NULL;
 329     }
 330     wpath[MAX_PATH] = L'\0';
 331
 332     /*
 333      * fopen() cannot open files with non-“ANSI” characters on Windows.
 334      * We use _wfopen() instead. Same thing for mkdir() and stat().
 335      */
 336     return _wfopen( wpath, wmode );
 337 #endif
 338 }
 339
 340 /*****************************************************************************
 341  * utf8_mkdir: Calls mkdir() after conversion of file name to OS locale
 342  *****************************************************************************/
 343 int utf8_mkdir( const char *dirname )
 344 {
 345 #if defined (UNDER_CE) || defined (WIN32)
 346     wchar_t wname[MAX_PATH + 1];
 347     char mod[MAX_PATH + 1];
 348     int i;
 349
 350     /* Convert '/' into '\' */
 351     for( i = 0; *dirname; i++ )
 352     {
 353         if( i == MAX_PATH )
 354             return -1; /* overflow */
 355
 356         if( *dirname == '/' )
 357             mod[i] = '\\';
 358         else
 359             mod[i] = *dirname;
 360         dirname++;
 361
 362     }
 363     mod[i] = 0;
 364
 365     if( MultiByteToWideChar( CP_UTF8, 0, mod, -1, wname, MAX_PATH ) == 0 )
 366     {
 367         errno = ENOENT;
 368         return -1;
 369     }
 370     wname[MAX_PATH] = L'\0';
 371
 372     if( CreateDirectoryW( wname, NULL ) == 0 )
 373     {
 374         if( GetLastError( ) == ERROR_ALREADY_EXISTS )
 375             errno = EEXIST;
 376         errno = ENOENT;
 377         return -1;
 378     }
 379     return 0;
 380 #else
 381     char *locname = ToLocale( dirname );
 382     int res;
 383
 384     if( locname == NULL )
 385     {
 386         errno = ENOENT;
 387         return -1;
 388     }
 389     res = mkdir( locname, 0755 );
 390
 391     LocaleFree( locname );
 392     return res;
 393 #endif
 394 }
 395
 396
 397 void *utf8_opendir( const char *dirname )
 398 {
 399     const char *local_name = ToLocale( dirname );
 400
 401     if( local_name != NULL )
 402     {
 403         DIR *dir = opendir( local_name );
 404         LocaleFree( local_name );
 405         return dir;
 406     }
 407     else
 408         errno = ENOENT;
 409     return NULL;
 410 }
 411
 412 const char *utf8_readdir( void *dir )
 413 {
 414     struct dirent *ent;
 415
 416     ent = readdir( (DIR *)dir );
 417     if( ent == NULL )
 418         return NULL;
 419
 420     return FromLocale( ent->d_name );
 421 }
 422
 423
 424 static int utf8_statEx( const char *filename, void *buf,
 425                         vlc_bool_t deref )
 426 {
 427 #if !(defined (WIN32) || defined (UNDER_CE))
 428 # ifdef HAVE_SYS_STAT_H
 429     const char *local_name = ToLocale( filename );
 430
 431     if( local_name != NULL )
 432     {
 433         int res = deref ? stat( local_name, (struct stat *)buf )
 434                        : lstat( local_name, (struct stat *)buf );
 435         LocaleFree( local_name );
 436         return res;
 437     }
 438     errno = ENOENT;
 439 # endif
 440     return -1;
 441 #else
 442     wchar_t wpath[MAX_PATH + 1];
 443
 444     if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH ) )
 445     {
 446         errno = ENOENT;
 447         return -1;
 448     }
 449     wpath[MAX_PATH] = L'\0';
 450
 451     /* struct _stat is just a silly Microsoft alias for struct stat */
 452     return _wstat( wpath, (struct _stat *)buf );
 453 #endif
 454 }
 455
 456
 457 int utf8_stat( const char *filename, void *buf)
 458 {
 459     return utf8_statEx( filename, buf, VLC_TRUE );
 460 }
 461
 462 int utf8_lstat( const char *filename, void *buf)
 463 {
 464     return utf8_statEx( filename, buf, VLC_FALSE );
 465 }
 466
 467 /*****************************************************************************
 468  * utf8_*printf: *printf with conversion from UTF-8 to local encoding
 469  *****************************************************************************/
 470 static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
 471 {
 472     char *utf8;
 473     int res = vasprintf( &utf8, fmt, ap );
 474     if( res == -1 )
 475         return -1;
 476
 477     *str = ToLocaleDup( utf8 );
 478     free( utf8 );
 479     return res;
 480 }
 481
 482 static int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
 483 {
 484     char *str;
 485     int res = utf8_vasprintf( &str, fmt, ap );
 486     if( res == -1 )
 487         return -1;
 488
 489     fputs( str, stream );
 490     free( str );
 491     return res;
 492 }
 493
 494 int utf8_fprintf( FILE *stream, const char *fmt, ... )
 495 {
 496     va_list ap;
 497     int res;
 498
 499     va_start( ap, fmt );
 500     res = utf8_vfprintf( stream, fmt, ap );
 501     va_end( ap );
 502     return res;
 503 }
 504
 505 /*****************************************************************************
 506  * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
 507  *****************************************************************************
 508  * Not Todo : convert Latin1 to UTF-8 on the fly
 509  * It is not possible given UTF-8 needs more space
 510  * Returns str if it was valid UTF-8, NULL if not.
 511  *****************************************************************************/
 512 #define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
 513 char *EnsureUTF8( char *str )
 514 {
 515     unsigned char *ptr, c;
 516
 517     ptr = (unsigned char *)str;
 518     while( (c = *ptr) != '\0' )
 519     {
 520         /* US-ASCII, 1 byte */
 521         if( c <= 0x7F )
 522             ptr++; /* OK */
 523         else
 524         /* 2 bytes */
 525         if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
 526         {
 527             c = ptr[1];
 528             if( isutf8cont( c ) )
 529                 ptr += 2; /* OK */
 530             else
 531                 goto error;
 532         }
 533         else
 534         /* 3 bytes */
 535         if( c == 0xE0 )
 536         {
 537             c = ptr[1];
 538             if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
 539             {
 540                 c = ptr[2];
 541                 if( isutf8cont( c ) )
 542                     ptr += 3; /* OK */
 543                 else
 544                     goto error;
 545             }
 546             else
 547                 goto error;
 548         }
 549         else
 550         if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
 551          || ( c == 0xEE ) || ( c == 0xEF ) )
 552         {
 553             c = ptr[1];
 554             if( isutf8cont( c ) )
 555             {
 556                 c = ptr[2];
 557                 if( isutf8cont( c ) )
 558                     ptr += 3; /* OK */
 559                 else
 560                     goto error;
 561             }
 562             else
 563                 goto error;
 564         }
 565         else
 566         if( c == 0xED )
 567         {
 568             c = ptr[1];
 569             if( ( c >= 0x80 ) && ( c <= 0x9F ) )
 570             {
 571                 c = ptr[2];
 572                 if( isutf8cont( c ) )
 573                     ptr += 3; /* OK */
 574                 else
 575                     goto error;
 576             }
 577             else
 578                 goto error;
 579         }
 580         else
 581         /* 4 bytes */
 582         if( c == 0xF0 )
 583         {
 584             c = ptr[1];
 585             if( ( c >= 0x90 ) && ( c <= 0xBF ) )
 586             {
 587                 c = ptr[2];
 588                 if( isutf8cont( c ) )
 589                 {
 590                     c = ptr[3];
 591                     if( isutf8cont( c ) )
 592                         ptr += 4; /* OK */
 593                     else
 594                         goto error;
 595                 }
 596                 else
 597                     goto error;
 598             }
 599             else
 600                 goto error;
 601         }
 602         else
 603         if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
 604         {
 605             c = ptr[1];
 606             if( isutf8cont( c ) )
 607             {
 608                 c = ptr[2];
 609                 if( isutf8cont( c ) )
 610                 {
 611                     c = ptr[3];
 612                     if( isutf8cont( c ) )
 613                         ptr += 4; /* OK */
 614                     goto error;
 615                 }
 616                 else
 617                     goto error;
 618             }
 619             else
 620                 goto error;
 621         }
 622         else
 623         if( c == 0xF4 )
 624         {
 625             c = ptr[1];
 626             if( ( c >= 0x80 ) && ( c <= 0x8F ) )
 627             {
 628                 c = ptr[2];
 629                 if( isutf8cont( c ) )
 630                 {
 631                     c = ptr[3];
 632                     if( isutf8cont( c ) )
 633                         ptr += 4; /* OK */
 634                     else
 635                         goto error;
 636                 }
 637                 else
 638                     goto error;
 639             }
 640             else
 641                 goto error;
 642         }
 643         else
 644             goto error;
 645
 646         continue;
 647
 648 error:
 649         *ptr++ = '?';
 650         str = NULL;
 651     }
 652
 653     return str;
 654 }
 655
 656 /**
 657  * UTF32toUTF8(): converts an array from UTF-32 (host byte order)
 658  * to UTF-8.
 659  *
 660  * @param src the UTF-32 table to be converted
 661  * @param len the number of code points to be converted from src
 662  * (ie. the number of uint32_t in the table pointed to by src)
 663  * @param newlen an optional pointer. If not NULL, *newlen will
 664  * contain the total number of bytes written.
 665  *
 666  * @return the result of the conversion (must be free'd())
 667  * or NULL on error (in that case, *newlen is undefined).
 668  */
 669 static char *
 670 UTF32toUTF8( const uint32_t *src, size_t len, size_t *newlen )
 671 {
 672     char *res, *out;
 673
 674     /* allocate memory */
 675     out = res = (char *)malloc( 6 * len );
 676     if( res == NULL )
 677         return NULL;
 678
 679     while( len > 0 )
 680     {
 681         uint32_t uv = *src++;
 682         len--;
 683
 684         if( uv < 0x80 )
 685         {
 686             *out++ = uv;
 687             continue;
 688         }
 689         else
 690         if( uv < 0x800 )
 691         {
 692             *out++ = (( uv >>  6)         | 0xc0);
 693             *out++ = (( uv        & 0x3f) | 0x80);
 694             continue;
 695         }
 696         else
 697         if( uv < 0x10000 )
 698         {
 699             *out++ = (( uv >> 12)         | 0xe0);
 700             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 701             *out++ = (( uv        & 0x3f) | 0x80);
 702             continue;
 703         }
 704         else
 705         if( uv < 0x110000 )
 706         {
 707             *out++ = (( uv >> 18)         | 0xf0);
 708             *out++ = (((uv >> 12) & 0x3f) | 0x80);
 709             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 710             *out++ = (( uv        & 0x3f) | 0x80);
 711             continue;
 712         }
 713         else
 714         {
 715             free( res );
 716             return NULL;
 717         }
 718     }
 719     len = out - res;
 720     res = realloc( res, len );
 721     if( newlen != NULL )
 722         *newlen = len;
 723     return res;
 724 }
 725
 726 /**
 727  * FromUTF32(): converts an UTF-32 string to UTF-8.
 728  *
 729  * @param src UTF-32 bytes sequence, aligned on a 32-bits boundary.
 730  *
 731  * @return the result of the conversion (must be free()'d),
 732  * or NULL in case of error.
 733  */
 734 char *FromUTF32( const uint32_t *src )
 735 {
 736     const uint32_t *in;
 737     size_t len;
 738
 739     /* determine the size of the string */
 740     for( len = 1, in = src; *in; len++ )
 741         in++;
 742
 743     return UTF32toUTF8( src, len, NULL );
 744 }
 745
 746 /**
 747  * UTF16toUTF8: converts UTF-16 (host byte order) to UTF-8
 748  *
 749  * @param src UTF-16 bytes sequence, aligned on a 16-bits boundary
 750  * @param len number of uint16_t to convert
 751  */
 752 static char *
 753 UTF16toUTF8( const uint16_t *in, size_t len, size_t *newlen )
 754 {
 755     char *res, *out;
 756
 757     /* allocate memory */
 758     out = res = (char *)malloc( 3 * len );
 759     if( res == NULL )
 760         return NULL;
 761
 762     while( len > 0 )
 763     {
 764         uint32_t uv = *in;
 765
 766         in++;
 767         len--;
 768
 769         if( uv < 0x80 )
 770         {
 771             *out++ = uv;
 772             continue;
 773         }
 774         if( uv < 0x800 )
 775         {
 776             *out++ = (( uv >>  6)         | 0xc0);
 777             *out++ = (( uv        & 0x3f) | 0x80);
 778             continue;
 779         }
 780         if( (uv >= 0xd800) && (uv < 0xdbff) )
 781         {   /* surrogates */
 782             uint16_t low = GetWBE( in );
 783             in++;
 784             len--;
 785
 786             if( (low < 0xdc00) || (low >= 0xdfff) )
 787             {
 788                 *out++ = '?'; /* Malformed surrogate */
 789                 continue;
 790             }
 791             else
 792                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 793         }
 794         if( uv < 0x10000 )
 795         {
 796             *out++ = (( uv >> 12)         | 0xe0);
 797             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 798             *out++ = (( uv        & 0x3f) | 0x80);
 799             continue;
 800         }
 801         else
 802         {
 803             *out++ = (( uv >> 18)         | 0xf0);
 804             *out++ = (((uv >> 12) & 0x3f) | 0x80);
 805             *out++ = (((uv >>  6) & 0x3f) | 0x80);
 806             *out++ = (( uv        & 0x3f) | 0x80);
 807             continue;
 808         }
 809     }
 810     len = out - res;
 811     res = realloc( res, len );
 812     if( newlen != NULL )
 813         *newlen = len;
 814     return res;
 815 }
 816
 817
 818 /**
 819  * FromUTF16(): converts an UTF-16 string to UTF-8.
 820  *
 821  * @param src UTF-16 bytes sequence, aligned on a 16-bits boundary.
 822  *
 823  * @return the result of the conversion (must be free()'d),
 824  * or NULL in case of error.
 825  */
 826 char *FromUTF16( const uint16_t *src )
 827 {
 828     const uint16_t *in;
 829     size_t len;
 830
 831     /* determine the size of the string */
 832     for( len = 1, in = src; *in; len++ )
 833         in += 2;
 834
 835     return UTF16toUTF8( src, len, NULL );
 836 }