From 168549fa946743ac0e645ebf6bf9b014d5d38177 Mon Sep 17 00:00:00 2001 From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?= Date: Mon, 27 Mar 2006 13:28:24 +0000 Subject: [PATCH] Support URL-escaping non-ASCII characters properly as far as possible (i.e. all characters from the Basic Multilingual Plane of Unicode) --- include/vlc_url.h | 57 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/include/vlc_url.h b/include/vlc_url.h index 60cade1bc4..62646af7dc 100644 --- a/include/vlc_url.h +++ b/include/vlc_url.h @@ -1,7 +1,7 @@ /***************************************************************************** * vlc_url.h: URL related macros ***************************************************************************** - * Copyright (C) 2002-2005 the VideoLAN team + * Copyright (C) 2002-2006 the VideoLAN team * $Id$ * * Authors: Christophe Massiot @@ -174,6 +174,7 @@ static inline int isurlsafe( int c ) return ( (unsigned char)( c - 'a' ) < 26 ) || ( (unsigned char)( c - 'A' ) < 26 ) || ( (unsigned char)( c - '0' ) < 10 ) + || ( (unsigned char)( c ) > 127 ) /* Hmm, we should not encode character that are allowed in URLs * (even if they are not URL-safe), nor URL-safe characters. * We still encode some of them because of Microsoft's crap browser. @@ -181,6 +182,11 @@ static inline int isurlsafe( int c ) || ( strchr( "-_.", c ) != NULL ); } +static inline char url_hexchar( int c ) +{ + return ( c < 10 ) ? 
c + '0' : c + 'A' - 10; +} + /***************************************************************************** * vlc_UrlEncode: ***************************************************************************** @@ -189,14 +195,9 @@ static inline int isurlsafe( int c ) *****************************************************************************/ static inline char *vlc_UrlEncode( const char *psz_url ) { - char *psz_enc, *out; + char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc; const unsigned char *in; - psz_enc = (char *)malloc( 3 * strlen( psz_url ) + 1 ); - if( psz_enc == NULL ) - return NULL; - - out = psz_enc; for( in = (const unsigned char *)psz_url; *in; in++ ) { unsigned char c = *in; @@ -205,16 +206,48 @@ static inline char *vlc_UrlEncode( const char *psz_url ) *out++ = (char)c; else { + uint16_t cp; + *out++ = '%'; - *out++ = ( ( c >> 4 ) >= 0xA ) ? 'A' + ( c >> 4 ) - 0xA - : '0' + ( c >> 4 ); - *out++ = ( ( c & 0xf ) >= 0xA ) ? 'A' + ( c & 0xf ) - 0xA - : '0' + ( c & 0xf ); + /* UTF-8 to UCS-2 conversion */ + if( ( c & 0x7f ) == 0 ) + cp = c; + else + if( ( c & 0xe0 ) == 0xc0 ) + { + cp = ((c & 0x1f) << 6) | (in[1] & 0x3f); + in++; + } + else + if( ( c & 0xf0 ) == 0xe0 ) + { + cp = ((c & 0xf) << 12) | ((in[1] & 0x3f) << 6) | (in[2] & 0x3f); + in += 2; + } + else + /* cannot URL-encode code points outside the BMP */ + return NULL; + + if( cp < 0xff ) + { + /* Encode ISO-8859-1 characters */ + *out++ = url_hexchar( cp >> 4 ); + *out++ = url_hexchar( cp & 0xf ); + } + else + { + /* Encode non-Latin-1 characters */ + *out++ = 'u'; + *out++ = url_hexchar( cp >> 12 ); + *out++ = url_hexchar((cp >> 8) & 0xf ); + *out++ = url_hexchar((cp >> 4) & 0xf ); + *out++ = url_hexchar( cp & 0xf ); + } } } *out++ = '\0'; - return (char *)realloc( psz_enc, out - psz_enc ); + return strdup( psz_enc ); } /***************************************************************************** -- 2.39.2