]> git.sesse.net Git - vlc/commitdiff
Support URL-escape non-ASCII characters properly as far as possible
authorRémi Denis-Courmont <rem@videolan.org>
Mon, 27 Mar 2006 13:28:24 +0000 (13:28 +0000)
committerRémi Denis-Courmont <rem@videolan.org>
Mon, 27 Mar 2006 13:28:24 +0000 (13:28 +0000)
(ie. all characters from the Basic Multilingual Plane from Unicode)

include/vlc_url.h

index 60cade1bc41ab0a33c2dfabbb88da6ccee585541..62646af7dcc8c92547a1f1888f57f04376cbc105 100644 (file)
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * vlc_url.h: URL related macros
  *****************************************************************************
- * Copyright (C) 2002-2005 the VideoLAN team
+ * Copyright (C) 2002-2006 the VideoLAN team
  * $Id$
  *
  * Authors: Christophe Massiot <massiot@via.ecp.fr>
@@ -174,6 +174,7 @@ static inline int isurlsafe( int c )
     return ( (unsigned char)( c - 'a' ) < 26 )
         || ( (unsigned char)( c - 'A' ) < 26 )
         || ( (unsigned char)( c - '0' ) < 10 )
+        || ( (unsigned char)( c ) > 127 )
         /* Hmm, we should not encode character that are allowed in URLs
          * (even if they are not URL-safe), nor URL-safe characters.
          * We still encode some of them because of Microsoft's crap browser.
@@ -181,6 +182,11 @@ static inline int isurlsafe( int c )
         || ( strchr( "-_.", c ) != NULL );
 }
 
+static inline char url_hexchar( int c )
+{
+    return ( c < 10 ) ? c + '0' : c + 'A' - 10;
+}
+
 /*****************************************************************************
  * vlc_UrlEncode:
  *****************************************************************************
@@ -189,14 +195,9 @@ static inline int isurlsafe( int c )
  *****************************************************************************/
 static inline char *vlc_UrlEncode( const char *psz_url )
 {
-    char *psz_enc, *out;
+    char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc;
     const unsigned char *in;
 
-    psz_enc = (char *)malloc( 3 * strlen( psz_url ) + 1 );
-    if( psz_enc == NULL )
-        return NULL;
-
-    out = psz_enc;
     for( in = (const unsigned char *)psz_url; *in; in++ )
     {
         unsigned char c = *in;
@@ -205,16 +206,48 @@ static inline char *vlc_UrlEncode( const char *psz_url )
             *out++ = (char)c;
         else
         {
+            uint16_t cp;
+
             *out++ = '%';
-            *out++ = ( ( c >> 4 ) >= 0xA ) ? 'A' + ( c >> 4 ) - 0xA
-                                           : '0' + ( c >> 4 );
-            *out++ = ( ( c & 0xf ) >= 0xA ) ? 'A' + ( c & 0xf ) - 0xA
-                                           : '0' + ( c & 0xf );
+            /* UTF-8 to UCS-2 conversion */
+            if( ( c & 0x7f ) == 0 )
+                cp = c;
+            else
+            if( ( c & 0xe0 ) == 0xc0 )
+            {
+                cp = ((c & 0x1f) << 6) | (in[1] & 0x3f);
+                in++;
+            }
+            else
+            if( ( c & 0xf0 ) == 0xe0 )
+            {
+                cp = ((c & 0xf) << 12) | ((in[1] & 0x3f) << 6) | (in[2] & 0x3f);
+                in += 2;
+            }
+            else
+                /* cannot URL-encode code points outside the BMP */
+                return NULL;
+
+            if( cp < 0xff )
+            {
+                /* Encode ISO-8859-1 characters */
+                *out++ = url_hexchar( cp >> 4 );
+                *out++ = url_hexchar( cp & 0xf );
+            }
+            else
+            {
+                /* Encode non-Latin-1 characters */
+                *out++ = 'u';
+                *out++ = url_hexchar( cp >> 12       );
+                *out++ = url_hexchar((cp >>  8) & 0xf );
+                *out++ = url_hexchar((cp >>  4) & 0xf );
+                *out++ = url_hexchar( cp        & 0xf );
+            }
         }
     }
     *out++ = '\0';
 
-    return (char *)realloc( psz_enc, out - psz_enc );
+    return strdup( psz_enc );
 }
 
 /*****************************************************************************