X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Ftext%2Funicode.c;h=036027cfdbf2dd5d02261d4c06b543eb50d9359a;hb=be378fbc80c384e2541517d6853b59411b7e67de;hp=10a103bc3c9a582d78a5abf0b22cdcbd20627d8b;hpb=f556b69004ce0978944d0f1ce024f04b9130b53a;p=vlc diff --git a/src/text/unicode.c b/src/text/unicode.c index 10a103bc3c..036027cfdb 100644 --- a/src/text/unicode.c +++ b/src/text/unicode.c @@ -2,8 +2,7 @@ * unicode.c: Unicode <-> locale functions ***************************************************************************** * Copyright (C) 2005-2006 the VideoLAN team - * Copyright © 2005-2006 Rémi Denis-Courmont - * $Id$ + * Copyright © 2005-2008 Rémi Denis-Courmont * * Authors: Rémi Denis-Courmont * @@ -25,8 +24,13 @@ /***************************************************************************** * Preamble *****************************************************************************/ -#include +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include #include +#include "libvlc.h" /* utf8_mkdir */ #include @@ -62,100 +66,47 @@ # define ASSUME_UTF8 1 #endif -#ifndef ASSUME_UTF8 -# if defined (HAVE_ICONV) -/* libiconv is more powerful than Win32 API (it has translit) */ -# define USE_ICONV 1 -# elif defined (WIN32) || defined (UNDER_CE) -# define USE_MB2MB 1 -# else -# error No UTF8 charset conversion implemented on this platform! -# endif +#if defined (ASSUME_UTF8) +/* Cool */ +#elif defined (WIN32) || defined (UNDER_CE) +# define USE_MB2MB 1 +#elif defined (HAVE_ICONV) +# define USE_ICONV 1 +#else +# error No UTF8 charset conversion implemented on this platform! 
#endif -typedef struct locale_data_t -{ #if defined (USE_ICONV) - vlc_iconv_t hd; - vlc_mutex_t lock; -#elif defined (USE_MB2MB) - UINT fromCP; - UINT toCP; -#endif -} locale_data_t; - -static locale_data_t from_locale, to_locale; - +# include +static char charset[sizeof ("CSISO11SWEDISHFORNAMES")] = ""; -void LocaleInit( vlc_object_t *p_this ) +static void find_charset_once (void) { -#if defined USE_ICONV - char *psz_charset; - - if( vlc_current_charset( &psz_charset ) ) - /* UTF-8 */ - from_locale.hd = to_locale.hd = (vlc_iconv_t)(-1); - else - { - /* not UTF-8 */ - char psz_buf[strlen( psz_charset ) + sizeof( "//translit" )]; - const char *psz_conv; - - /* - * Still allow non-ASCII characters when the locale is not set. - * Western Europeans are being favored for historical reasons. - */ - if( strcmp( psz_charset, "ASCII" ) ) - { - sprintf( psz_buf, "%s//translit", psz_charset ); - psz_conv = psz_buf; - } - else - psz_conv = "ISO-8859-1//translit"; - - vlc_mutex_init( p_this, &from_locale.lock ); - vlc_mutex_init( p_this, &to_locale.lock ); - from_locale.hd = vlc_iconv_open( "UTF-8", psz_conv ); - to_locale.hd = vlc_iconv_open( psz_conv, "UTF-8" ); - } - - free( psz_charset ); - - assert( (from_locale.hd == (vlc_iconv_t)(-1)) - == (to_locale.hd == (vlc_iconv_t)(-1)) ); - -#elif defined (USE_MB2MB) - to_locale.toCP = from_locale.fromCP = CP_ACP; - from_locale.toCP = to_locale.fromCP = CP_UTF8; -#else - (void)p_this; -#endif + strlcpy (charset, nl_langinfo (CODESET), sizeof (charset)); + if (!strcasecmp (charset, "ASCII") + || !strcasecmp (charset, "ANSI_X3.4-1968")) + strcpy (charset, "UTF-8"); /* superset... 
*/ } -void LocaleDeinit( void ) +static int find_charset (void) { -#ifdef USE_ICONV - if( to_locale.hd != (vlc_iconv_t)(-1) ) - { - vlc_iconv_close( to_locale.hd ); - vlc_mutex_destroy( &to_locale.lock ); - } - - if( from_locale.hd != (vlc_iconv_t)(-1) ) - { - vlc_iconv_close( from_locale.hd ); - vlc_mutex_destroy( &from_locale.lock ); - } -#endif + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once (&once, find_charset_once); + return !strcasecmp (charset, "UTF-8"); } +#endif + -static char *locale_fast (const char *string, locale_data_t *p) +static char *locale_fast (const char *string, bool from) { #if defined (USE_ICONV) - vlc_iconv_t hd = p->hd; + if (find_charset ()) + return (char *)string; + vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : charset, + from ? charset : "UTF-8"); if (hd == (vlc_iconv_t)(-1)) - return (char *)string; + return NULL; /* Uho! */ const char *iptr = string; size_t inb = strlen (string); @@ -165,19 +116,16 @@ static char *locale_fast (const char *string, locale_data_t *p) if (string == NULL) return NULL; - vlc_mutex_lock (&p->lock); - vlc_iconv (hd, NULL, NULL, NULL, NULL); - while (vlc_iconv (hd, &iptr, &inb, &optr, &outb) == (size_t)(-1)) { *optr++ = '?'; outb--; iptr++; inb--; - vlc_iconv (hd, NULL, NULL, NULL, NULL); + vlc_iconv (hd, NULL, NULL, NULL, NULL); /* reset */ } - vlc_mutex_unlock (&p->lock); *optr = '\0'; + vlc_iconv_close (hd); assert (inb == 0); assert (*iptr == '\0'); @@ -186,64 +134,69 @@ static char *locale_fast (const char *string, locale_data_t *p) return strdup (output); #elif defined (USE_MB2MB) char *out; - wchar_t *wide; int len; if (string == NULL) return NULL; - len = MultiByteToWideChar (p->fromCP, 0, string, -1, NULL, 0); - if (len == 0) - return NULL; - + len = 1 + MultiByteToWideChar (from ? 
CP_ACP : CP_UTF8, + 0, string, -1, NULL, 0); wchar_t wide[len]; - MultiByteToWideChar (p->fromCP, 0, string, -1, wide, len); - len = WideCharToMultiByte (p->toCP, 0, wide, -1, NULL, 0, NULL, NULL); - if (len == 0) - return NULL; + MultiByteToWideChar (from ? CP_ACP : CP_UTF8, 0, string, -1, wide, len); + len = 1 + WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1, + NULL, 0, NULL, NULL); out = malloc (len); + if (out == NULL) + return NULL; - WideCharToMultiByte (p->toCP, 0, wide, -1, out, len, NULL, NULL); + WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1, out, len, + NULL, NULL); return out; #else + (void)from; return (char *)string; #endif } -static inline char *locale_dup (const char *string, locale_data_t *p) +static inline char *locale_dup (const char *string, bool from) { + assert( string ); + #if defined (USE_ICONV) - return (p->hd == (vlc_iconv_t)(-1)) - ? strdup (string) - : locale_fast (string, p); + if (find_charset ()) + return strdup (string); + return locale_fast (string, from); #elif defined (USE_MB2MB) - return locale_fast (string, p); + return locale_fast (string, from); #else + (void)from; return strdup (string); #endif } - +/** + * Releases (if needed) a localized or uniformized string. + * @param str non-NULL return value from FromLocale() or ToLocale(). + */ void LocaleFree (const char *str) { #if defined (USE_ICONV) - assert ((to_locale.hd == (vlc_iconv_t)(-1)) - == (from_locale.hd == (vlc_iconv_t)(-1))); - - if( to_locale.hd != (vlc_iconv_t)(-1) ) + if (!find_charset ()) free ((char *)str); #elif defined (USE_MB2MB) free ((char *)str); +#else + (void)str; #endif } /** - * FromLocale: converts a locale string to UTF-8 + * Converts a string from the system locale character encoding to UTF-8. * - * @param locale nul-terminated string to be converted + * @param locale nul-terminated string to convert * * @return a nul-terminated UTF-8 string, or NULL in case of error. 
* To avoid memory leak, you have to pass the result to LocaleFree() @@ -251,17 +204,26 @@ void LocaleFree (const char *str) */ char *FromLocale (const char *locale) { - return locale_fast (locale, &from_locale); + return locale_fast (locale, true); } +/** + * converts a string from the system locale character encoding to utf-8, + * the result is always allocated on the heap. + * + * @param locale nul-terminated string to convert + * + * @return a nul-terminated utf-8 string, or null in case of error. + * The result must be freed using free() - as with the strdup() function. + */ char *FromLocaleDup (const char *locale) { - return locale_dup (locale, &from_locale); + return locale_dup (locale, true); } /** - * ToLocale: converts a UTF-8 string to local system encoding. + * ToLocale: converts an UTF-8 string to local system encoding. * * @param utf8 nul-terminated string to be converted * @@ -271,18 +233,32 @@ char *FromLocaleDup (const char *locale) */ char *ToLocale (const char *utf8) { - return locale_fast (utf8, &to_locale); + return locale_fast (utf8, false); } -static char *ToLocaleDup (const char *utf8) +/** + * converts a string from UTF-8 to the system locale character encoding, + * the result is always allocated on the heap. + * + * @param utf8 nul-terminated string to convert + * + * @return a nul-terminated string, or null in case of error. + * The result must be freed using free() - as with the strdup() function. + */ +char *ToLocaleDup (const char *utf8) { - return locale_dup (utf8, &to_locale); + return locale_dup (utf8, false); } /** - * utf8_open: open() wrapper for UTF-8 filenames + * Opens a system file handle using UTF-8 paths. + * + * @param filename file path to open (with UTF-8 encoding) + * @param flags open() flags, see the C library open() documentation + * @param mode file permissions if creating a new file + * @return a file handle on success, -1 on error (see errno). 
*/ int utf8_open (const char *filename, int flags, mode_t mode) { @@ -320,12 +296,15 @@ int utf8_open (const char *filename, int flags, mode_t mode) } /** - * utf8_fopen: fopen() wrapper for UTF-8 filenames + * Opens a FILE pointer using UTF-8 filenames. + * @param filename file path, using UTF-8 encoding + * @param mode fopen file open mode + * @return NULL on error, an open FILE pointer on success. */ FILE *utf8_fopen (const char *filename, const char *mode) { int rwflags = 0, oflags = 0; - vlc_bool_t append = VLC_FALSE; + bool append = false; for (const char *ptr = mode; *ptr; ptr++) { @@ -338,7 +317,7 @@ FILE *utf8_fopen (const char *filename, const char *mode) case 'a': rwflags = O_WRONLY; oflags |= O_CREAT; - append = VLC_TRUE; + append = true; break; case 'w': @@ -349,6 +328,12 @@ FILE *utf8_fopen (const char *filename, const char *mode) case '+': rwflags = O_RDWR; break; + +#ifdef O_TEXT + case 't': + oflags |= O_TEXT; + break; +#endif } } @@ -370,16 +355,18 @@ FILE *utf8_fopen (const char *filename, const char *mode) } /** - * utf8_mkdir: Calls mkdir() after conversion of file name to OS locale + * Creates a directory using UTF-8 paths. * * @param dirname a UTF-8 string with the name of the directory that you * want to create. - * @return A 0 return value indicates success. A -1 return value indicates an - * error, and an error code is stored in errno + * @param mode directory permissions + * @return 0 on success, -1 on error (see errno). 
*/ -int utf8_mkdir( const char *dirname ) +int utf8_mkdir( const char *dirname, mode_t mode ) { #if defined (UNDER_CE) || defined (WIN32) + VLC_UNUSED( mode ); + wchar_t wname[MAX_PATH + 1]; char mod[MAX_PATH + 1]; int i; @@ -424,7 +411,7 @@ int utf8_mkdir( const char *dirname ) errno = ENOENT; return -1; } - res = mkdir( locname, 0755 ); + res = mkdir( locname, mode ); LocaleFree( locname ); return res; @@ -432,11 +419,11 @@ int utf8_mkdir( const char *dirname ) } /** - * utf8_opendir: wrapper that converts dirname to the locale in use by the OS + * Opens a DIR pointer using UTF-8 paths * * @param dirname UTF-8 representation of the directory name - * - * @return a pointer to the DIR struct. Release with closedir(). + * @return a pointer to the DIR struct, or NULL in case of error. + * Release with standard closedir(). */ DIR *utf8_opendir( const char *dirname ) { @@ -464,12 +451,12 @@ DIR *utf8_opendir( const char *dirname ) } /** - * utf8_readdir: a readdir wrapper that returns the name of the next entry - * in the directory as a UTF-8 string. + * Reads the next file name from an open directory. * * @param dir The directory that is being read * - * @return a UTF-8 string of the directory entry. Use LocaleFree() to free this memory + * @return a UTF-8 string of the directory entry. + * Use free() to free this memory. */ char *utf8_readdir( DIR *dir ) { @@ -496,6 +483,10 @@ static int dummy_select( const char *str ) return 1; } +/** + * Does the same as utf8_scandir(), but takes an open directory pointer + * instead of a directory path. + */ int utf8_loaddir( DIR *dir, char ***namelist, int (*select)( const char * ), int (*compar)( const char **, const char ** ) ) @@ -552,6 +543,18 @@ int utf8_loaddir( DIR *dir, char ***namelist, return -1; } +/** + * Selects file entries from a directory, as GNU C scandir(), yet using + * UTF-8 file names. 
+ * + * @param dirname UTF-8 directory path + * @param namelist [OUT] pointer set, on successful completion, to the address + * of a table of UTF-8 filenames. All filenames must be freed with free(). + * The table itself must be freed with free() as well. + * + * @return How many file names were selected (possibly 0), + * or -1 in case of error. + */ int utf8_scandir( const char *dirname, char ***namelist, int (*select)( const char * ), int (*compar)( const char **, const char ** ) ) @@ -568,7 +571,7 @@ int utf8_scandir( const char *dirname, char ***namelist, } static int utf8_statEx( const char *filename, struct stat *buf, - vlc_bool_t deref ) + bool deref ) { #if defined (WIN32) || defined (UNDER_CE) /* retrieve Windows OS version */ @@ -602,19 +605,75 @@ static int utf8_statEx( const char *filename, struct stat *buf, return -1; } - +/** + * Finds file/inode information, as stat(). + * Consider using fstat() instead, if possible. + * + * @param filename UTF-8 file path + */ int utf8_stat( const char *filename, struct stat *buf) { - return utf8_statEx( filename, buf, VLC_TRUE ); + return utf8_statEx( filename, buf, true ); } +/** + * Finds file/inode information, as lstat(). + * Consider using fstat() instead, if possible. + * + * @param filename UTF-8 file path + */ int utf8_lstat( const char *filename, struct stat *buf) { - return utf8_statEx( filename, buf, VLC_FALSE ); + return utf8_statEx( filename, buf, false ); } /** - * utf8_*printf: *printf with conversion from UTF-8 to local encoding + * Removes a file. + * + * @param filename a UTF-8 string with the name of the file you want to delete. + * @return A 0 return value indicates success. 
A -1 return value indicates an + * error, and an error code is stored in errno + */ +int utf8_unlink( const char *filename ) +{ +#if defined (WIN32) || defined (UNDER_CE) + if( GetVersion() < 0x80000000 ) + { + /* for Windows NT and above */ + wchar_t wpath[MAX_PATH + 1]; + + if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH ) ) + { + errno = ENOENT; + return -1; + } + wpath[MAX_PATH] = L'\0'; + + /* + * unlink() cannot open files with non-“ANSI” characters on Windows. + * We use _wunlink() instead. + */ + return _wunlink( wpath ); + } +#endif + const char *local_name = ToLocale( filename ); + + if( local_name == NULL ) + { + errno = ENOENT; + return -1; + } + + int ret = unlink( local_name ); + LocaleFree( local_name ); + return ret; +} + + + +/** + * Formats an UTF-8 string as vasprintf(), then print it to stdout, with + * appropriate conversion to local encoding. */ static int utf8_vasprintf( char **str, const char *fmt, va_list ap ) { @@ -628,6 +687,10 @@ static int utf8_vasprintf( char **str, const char *fmt, va_list ap ) return res; } +/** + * Formats an UTF-8 string as vfprintf(), then print it, with + * appropriate conversion to local encoding. + */ int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap ) { char *str; @@ -640,6 +703,10 @@ int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap ) return res; } +/** + * Formats an UTF-8 string as fprintf(), then print it, with + * appropriate conversion to local encoding. + */ int utf8_fprintf( FILE *stream, const char *fmt, ... ) { va_list ap; @@ -717,7 +784,7 @@ static char *CheckUTF8( char *str, char rep ) } /** - * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks + * Replaces invalid/overlong UTF-8 sequences with question marks. * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly, * so we don't try that, even though it would be less disruptive. 
* @@ -730,7 +797,7 @@ char *EnsureUTF8( char *str ) /** - * IsUTF8: checks whether a string is a valid UTF-8 byte sequence. + * Checks whether a string is a valid UTF-8 byte sequence. * * @param str nul-terminated string to be checked *