From 73508ddd4ce715e88b8fc26563cb879c32c01403 Mon Sep 17 00:00:00 2001
From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?=
Date: Thu, 6 Apr 2006 20:38:43 +0000
Subject: [PATCH] - Reimplement URL component decoding separately from URL unescaping

- Replace crap URL escaping with URL encoding, which is what we really need
---
 include/vlc_symbols.h |    9 ++++
 include/vlc_url.h     |   92 ++++-------------------------------
 src/misc/strings.c    |  133 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 151 insertions(+), 83 deletions(-)

diff --git a/include/vlc_symbols.h b/include/vlc_symbols.h
index de064afc2b..c9dac0d99f 100644
--- a/include/vlc_symbols.h
+++ b/include/vlc_symbols.h
@@ -488,6 +488,9 @@ struct module_symbols_t
     const char * (*IsUTF8_inner) (const char *);
     const char * (*GetFallbackEncoding_inner) (void);
     int (*utf8_scandir_inner) (const char *dirname, char ***namelist, int (*select)( const char * ), int (*compar)( const char **, const char ** ));
+    char * (*decode_URI_duplicate_inner) (const char *psz);
+    void (*decode_URI_inner) (char *psz);
+    char * (*encode_URI_inner) (const char *psz);
 };
 # if defined (__PLUGIN__)
 # define aout_FiltersCreatePipeline (p_symbols)->aout_FiltersCreatePipeline_inner
@@ -956,6 +959,9 @@ struct module_symbols_t
 # define IsUTF8 (p_symbols)->IsUTF8_inner
 # define GetFallbackEncoding (p_symbols)->GetFallbackEncoding_inner
 # define utf8_scandir (p_symbols)->utf8_scandir_inner
+# define decode_URI_duplicate (p_symbols)->decode_URI_duplicate_inner
+# define decode_URI (p_symbols)->decode_URI_inner
+# define encode_URI (p_symbols)->encode_URI_inner
 # elif defined (HAVE_DYNAMIC_PLUGINS) && !defined (__BUILTIN__)
 /******************************************************************
  * STORE_SYMBOLS: store VLC APIs into p_symbols for plugin access.
@@ -1427,6 +1433,9 @@ struct module_symbols_t
     ((p_symbols)->IsUTF8_inner) = IsUTF8; \
     ((p_symbols)->GetFallbackEncoding_inner) = GetFallbackEncoding; \
     ((p_symbols)->utf8_scandir_inner) = utf8_scandir; \
+    ((p_symbols)->decode_URI_duplicate_inner) = decode_URI_duplicate; \
+    ((p_symbols)->decode_URI_inner) = decode_URI; \
+    ((p_symbols)->encode_URI_inner) = encode_URI; \
     (p_symbols)->net_ConvertIPv4_deprecated = NULL; \
     (p_symbols)->__stats_CounterGet_deprecated = NULL; \
     (p_symbols)->__stats_TimerDumpAll_deprecated = NULL; \
diff --git a/include/vlc_url.h b/include/vlc_url.h
index 32d92eb755..e91c697f80 100644
--- a/include/vlc_url.h
+++ b/include/vlc_url.h
@@ -171,90 +171,14 @@ static inline void vlc_UrlClean( vlc_url_t *url )
 
 VLC_EXPORT( char *, unescape_URI_duplicate, ( const char *psz ) );
 VLC_EXPORT( void, unescape_URI, ( char *psz ) );
+VLC_EXPORT( char *, decode_URI_duplicate, ( const char *psz ) );
+VLC_EXPORT( void, decode_URI, ( char *psz ) );
+VLC_EXPORT( char *, encode_URI, ( const char *psz ) );
 
-static inline int isurlsafe( int c )
-{
-    return ( (unsigned char)( c - 'a' ) < 26 )
-        || ( (unsigned char)( c - 'A' ) < 26 )
-        || ( (unsigned char)( c - '0' ) < 10 )
-        /* Hmm, we should not encode character that are allowed in URLs
-         * (even if they are not URL-safe), nor URL-safe characters.
-         * We still encode some of them because of Microsoft's crap browser.
-         */
-        || ( strchr( "-_.", c ) != NULL );
-}
-
-static inline char url_hexchar( int c )
-{
-    return ( c < 10 ) ? c + '0' : c + 'A' - 10;
-}
-
-/*****************************************************************************
- * vlc_UrlEncode:
- *****************************************************************************
- * perform URL encoding
- * (you do NOT want to do URL decoding - it is not reversible - do NOT do it)
- *****************************************************************************/
 static inline char *vlc_UrlEncode( const char *psz_url )
 {
-    char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc;
-    const uint8_t *in;
-
-    for( in = (const uint8_t *)psz_url; *in; in++ )
-    {
-        uint8_t c = *in;
-
-        if( isurlsafe( c ) )
-        {
-            *out++ = (char)c;
-        }
-        else
-        {
-            uint16_t cp;
-
-            *out++ = '%';
-            /* UTF-8 to UCS-2 conversion */
-            if( ( c & 0x80 ) == 0 )
-            {
-                cp = c;
-            }
-            else if( ( c & 0xe0 ) == 0xc0 )
-            {
-                cp = (((uint16_t)c & 0x1f) << 6) | (in[1] & 0x3f);
-                in++;
-            }
-            else if( ( c & 0xf0 ) == 0xe0 )
-            {
-                cp = (((uint16_t)c & 0xf) << 12) | (((uint16_t)(in[1]) & 0x3f) << 6) | (in[2] & 0x3f);
-                in += 2;
-            }
-            else
-            {
-                /* cannot URL-encode code points outside the BMP */
-                /* better a wrong conversion than a crash */
-                cp = '?';
-            }
-
-            if( cp < 0xff )
-            {
-                /* Encode ISO-8859-1 characters */
-                *out++ = url_hexchar( cp >> 4 );
-                *out++ = url_hexchar( cp & 0xf );
-            }
-            else
-            {
-                /* Encode non-Latin-1 characters */
-                *out++ = 'u';
-                *out++ = url_hexchar( cp >> 12 );
-                *out++ = url_hexchar((cp >> 8) & 0xf );
-                *out++ = url_hexchar((cp >> 4) & 0xf );
-                *out++ = url_hexchar( cp & 0xf );
-            }
-        }
-    }
-    *out++ = '\0';
-
-    return strdup( psz_enc );
+    /* FIXME: do not encode / : ? and & _when_ not needed */
+    return encode_URI( psz_url );
 }
 
 /*****************************************************************************
@@ -279,7 +203,11 @@ static inline int vlc_UrlIsNotEncoded( const char *psz_url )
             ptr += 2;
         }
         else
-        if( !isurlsafe( c ) )
+        /* Anything but a letter, a digit, '-', '_' or '.' must be encoded */
+        if( !( ( (unsigned char)( c - 'a' ) < 26 )
+            || ( (unsigned char)( c - 'A' ) < 26 )
+            || ( (unsigned char)( c - '0' ) < 10 )
+            || ( strchr( "-_.", c ) != NULL ) ) )
             return 1;
     }
     return 0; /* looks fine - but maybe it is not encoded */
diff --git a/src/misc/strings.c b/src/misc/strings.c
index 1b6fe8d02e..3e86471606 100644
--- a/src/misc/strings.c
+++ b/src/misc/strings.c
@@ -33,7 +33,8 @@
 #include
 
 #include "vlc_strings.h"
-#include "vlc_url.h"
+#include "vlc_url.h"
+#include "charset.h"
 
 /**
  * Unescape URI encoded string
@@ -116,6 +117,136 @@ void unescape_URI( char *psz )
     *out = '\0';
 }
 
+/**
+ * Decodes a URI component into a newly allocated string.
+ * The input string is left untouched; decoding is done by decode_URI().
+ *
+ * \param psz nul-terminated encoded string
+ * \return decoded duplicated string (must be free()'d),
+ * or NULL if the duplicate could not be allocated
+ */
+char *decode_URI_duplicate( const char *psz )
+{
+    char *psz_dup = strdup( psz );
+
+    if( psz_dup != NULL )
+        decode_URI( psz_dup );
+    return psz_dup;
+}
+
+/**
+ * Decodes a URI component in place.
+ * Each "%XX" sequence is replaced by the byte it encodes, '+' by a space,
+ * and unencoded bytes outside of the 32..127 range by '?'. Decoding stops
+ * at a truncated "%" sequence. The result is then handed to EnsureUTF8().
+ *
+ * \param psz nul-terminated string, overwritten with its decoded form
+ * \return nothing
+ */
+void decode_URI( char *psz )
+{
+    unsigned char *in = (unsigned char *)psz, *out = in, c;
+
+    while( ( c = *in++ ) != '\0' )
+    {
+        switch( c )
+        {
+            case '%':
+            {
+                /* Two hexadecimal digits and the nul terminator */
+                char hex[3];
+
+                /* Truncated sequence: keep what was decoded so far */
+                if( ( ( hex[0] = *in++ ) == 0 )
+                 || ( ( hex[1] = *in++ ) == 0 ) )
+                    goto done;
+
+                hex[2] = '\0';
+                *out++ = (unsigned char)strtoul( hex, NULL, 0x10 );
+                break;
+            }
+
+            case '+':
+                *out++ = ' ';
+                break;
+
+            default:
+                /* Inserting non-ASCII or non-printable characters is unsafe,
+                 * and no sane browser will send these unencoded */
+                if( ( c < 32 ) || ( c > 127 ) )
+                    *out++ = '?';
+                else
+                    *out++ = c;
+        }
+    }
+
+done:
+    *out = '\0';
+    EnsureUTF8( psz );
+}
+
+/* Tells whether a byte can be copied unencoded into a URI component */
+static inline int isurlsafe( int c )
+{
+    return ( (unsigned char)( c - 'a' ) < 26 )
+        || ( (unsigned char)( c - 'A' ) < 26 )
+        || ( (unsigned char)( c - '0' ) < 10 )
+        /* Hmm, we should not encode character that are allowed in URLs
+         * (even if they are not URL-safe), nor URL-safe characters.
+         * We still encode some of them because of Microsoft's crap browser.
+         */
+        || ( strchr( "-_.", c ) != NULL );
+}
+
+/* Maps a value in the 0..15 range to its uppercase hexadecimal digit */
+static inline char url_hexchar( int c )
+{
+    return ( c < 10 ) ? c + '0' : c + 'A' - 10;
+}
+
+/**
+ * encode_URI
+ * Encodes an URI component: letters, digits, '-', '_' and '.' are copied
+ * as is, a space becomes '+', and any other byte becomes '%' followed by
+ * two uppercase hexadecimal digits.
+ *
+ * @param psz_url nul-terminated UTF-8 representation of the component.
+ * Obviously, you can't pass an URI containing a nul character, but you don't
+ * want to do that, do you?
+ *
+ * @return encoded string (must be free()'d), or NULL if out of memory
+ */
+char *encode_URI( const char *psz_url )
+{
+    /* Worst case: every byte expands to a 3-byte "%XX" sequence. The buffer
+     * is heap-allocated so that a long input cannot overflow the stack. */
+    char *psz_enc = malloc( 3 * strlen( psz_url ) + 1 ), *out = psz_enc;
+    const uint8_t *in;
+
+    if( psz_enc == NULL )
+        return NULL;
+
+    for( in = (const uint8_t *)psz_url; *in; in++ )
+    {
+        uint8_t c = *in;
+
+        if( isurlsafe( c ) )
+            *out++ = (char)c;
+        else
+        if ( c == ' ')
+            *out++ = '+';
+        else
+        {
+            *out++ = '%';
+            *out++ = url_hexchar( c >> 4 );
+            *out++ = url_hexchar( c & 0xf );
+        }
+    }
+    *out = '\0';
+
+    return psz_enc;
+}
+
 /**
  * Converts "<", ">" and "&" to "<", ">" and "&"
  * \param string to convert
-- 
2.39.2