git.sesse.net Git - vlc/blob - src/misc/strings.c

   1 /*****************************************************************************
   2  * strings.c: String related functions
   3  *****************************************************************************
   4  * Copyright (C) 2006 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Antoine Cellerier <dionoea at videolan dot org>
   8  *          Daniel Stranger <vlc at schmaller dot de>
   9  *          Rémi Denis-Courmont <rem # videolan org>
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  24  *****************************************************************************/
  25
  26 /*****************************************************************************
  27  * Preamble
  28  *****************************************************************************/
  29 #include <vlc/vlc.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #include <stdlib.h>
  33 #include <assert.h>
  34
  35 #include "vlc_strings.h"
  36 #include "vlc_url.h"
  37 #include "charset.h"
  38
  39 /**
  40  * Unescape URI encoded string
  41  * \return decoded duplicated string
  42  */
  43 char *unescape_URI_duplicate( const char *psz )
  44 {
  45     char *psz_dup = strdup( psz );
  46     unescape_URI( psz_dup );
  47     return psz_dup;
  48 }
  49
  50 /**
  51  * Unescape URI encoded string in place
  52  * \return nothing
  53  */
  54 void unescape_URI( char *psz )
  55 {
  56     unsigned char *in = (unsigned char *)psz, *out = in, c;
  57
  58     while( ( c = *in++ ) != '\0' )
  59     {
  60         switch( c )
  61         {
  62             case '%':
  63             {
  64                 char val[5], *pval = val;
  65                 unsigned long cp;
  66
  67                 switch( c = *in++ )
  68                 {
  69                     case '\0':
  70                         return;
  71
  72                     case 'u':
  73                     case 'U':
  74                         if( ( *pval++ = *in++ ) == '\0' )
  75                             return;
  76                         if( ( *pval++ = *in++ ) == '\0' )
  77                             return;
  78                         c = *in++;
  79
  80                     default:
  81                         *pval++ = c;
  82                         if( ( *pval++ = *in++ ) == '\0' )
  83                             return;
  84                         *pval = '\0';
  85                 }
  86
  87                 cp = strtoul( val, NULL, 0x10 );
  88                 if( cp < 0x80 )
  89                     *out++ = cp;
  90                 else
  91                 if( cp < 0x800 )
  92                 {
  93                     *out++ = (( cp >>  6)         | 0xc0);
  94                     *out++ = (( cp        & 0x3f) | 0x80);
  95                 }
  96                 else
  97                 {
  98                     assert( cp < 0x10000 );
  99                     *out++ = (( cp >> 12)         | 0xe0);
 100                     *out++ = (((cp >>  6) & 0x3f) | 0x80);
 101                     *out++ = (( cp        & 0x3f) | 0x80);
 102                 }
 103                 break;
 104             }
 105
 106             /* + is not a special case - it means plus, not space. */
 107
 108             default:
 109                 /* Inserting non-ASCII or non-printable characters is unsafe,
 110                  * and no sane browser will send these unencoded */
 111                 if( ( c < 32 ) || ( c > 127 ) )
 112                     *out++ = '?';
 113                 else
 114                     *out++ = c;
 115         }
 116     }
 117     *out = '\0';
 118 }
 119
 120 /**
 121  * Decode encoded URI string
 122  * \return decoded duplicated string
 123  */
 124 char *decode_URI_duplicate( const char *psz )
 125 {
 126     char *psz_dup = strdup( psz );
 127     unescape_URI( psz_dup );
 128     return psz_dup;
 129 }
 130
 131 /**
 132  * Decode encoded URI string in place
 133  * \return nothing
 134  */
 135 void decode_URI( char *psz )
 136 {
 137     unsigned char *in = (unsigned char *)psz, *out = in, c;
 138
 139     while( ( c = *in++ ) != '\0' )
 140     {
 141         switch( c )
 142         {
 143             case '%':
 144             {
 145                 char hex[2];
 146
 147                 if( ( ( hex[0] = *in++ ) == 0 )
 148                  || ( ( hex[1] = *in++ ) == 0 ) )
 149                     return;
 150
 151                 hex[2] = '\0';
 152                 *out++ = (unsigned char)strtoul( hex, NULL, 0x10 );
 153                 break;
 154             }
 155
 156             case '+':
 157                 *out++ = ' ';
 158
 159             default:
 160                 /* Inserting non-ASCII or non-printable characters is unsafe,
 161                  * and no sane browser will send these unencoded */
 162                 if( ( c < 32 ) || ( c > 127 ) )
 163                     *out++ = '?';
 164                 else
 165                     *out++ = c;
 166         }
 167     }
 168     *out = '\0';
 169     EnsureUTF8( psz );
 170 }
 171
 172 static inline int isurlsafe( int c )
 173 {
 174     return ( (unsigned char)( c - 'a' ) < 26 )
 175             || ( (unsigned char)( c - 'A' ) < 26 )
 176             || ( (unsigned char)( c - '0' ) < 10 )
 177         /* Hmm, we should not encode character that are allowed in URLs
 178          * (even if they are not URL-safe), nor URL-safe characters.
 179          * We still encode some of them because of Microsoft's crap browser.
 180          */
 181             || ( strchr( "-_.", c ) != NULL );
 182 }
 183
 184 static inline char url_hexchar( int c )
 185 {
 186     return ( c < 10 ) ? c + '0' : c + 'A' - 10;
 187 }
 188
 189 /**
 190  * encode_URI_component
 191  * Encodes an URI component.
 192  *
 193  * @param psz_url nul-terminated UTF-8 representation of the component.
 194  * Obviously, you can't pass an URI containing a nul character, but you don't
 195  * want to do that, do you?
 196  *
 197  * @return encoded string (must be free()'d)
 198  */
 199 char *encode_URI_component( const char *psz_url )
 200 {
 201     char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc;
 202     const uint8_t *in;
 203
 204     for( in = (const uint8_t *)psz_url; *in; in++ )
 205     {
 206         uint8_t c = *in;
 207
 208         if( isurlsafe( c ) )
 209             *out++ = (char)c;
 210         else
 211         if ( c == ' ')
 212             *out++ = '+';
 213         else
 214         {
 215             *out++ = '%';
 216             *out++ = url_hexchar( c >> 4 );
 217             *out++ = url_hexchar( c & 0xf );
 218         }
 219     }
 220     *out++ = '\0';
 221
 222     return strdup( psz_enc );
 223 }
 224
 225 /**
 226  * Converts "&lt;", "&gt;" and "&amp;" to "<", ">" and "&"
 227  * \param string to convert
 228  */
 229 void resolve_xml_special_chars( char *psz_value )
 230 {
 231     char *p_pos = psz_value;
 232
 233     while ( *psz_value )
 234     {
 235         if( !strncmp( psz_value, "&lt;", 4 ) )
 236         {
 237             *p_pos = '<';
 238             psz_value += 4;
 239         }
 240         else if( !strncmp( psz_value, "&gt;", 4 ) )
 241         {
 242             *p_pos = '>';
 243             psz_value += 4;
 244         }
 245         else if( !strncmp( psz_value, "&amp;", 5 ) )
 246         {
 247             *p_pos = '&';
 248             psz_value += 5;
 249         }
 250         else if( !strncmp( psz_value, "&quot;", 6 ) )
 251         {
 252             *p_pos = '\"';
 253             psz_value += 6;
 254         }
 255         else if( !strncmp( psz_value, "&#039;", 6 ) )
 256         {
 257             *p_pos = '\'';
 258             psz_value += 6;
 259         }
 260         else
 261         {
 262             *p_pos = *psz_value;
 263             psz_value++;
 264         }
 265
 266         p_pos++;
 267     }
 268
 269     *p_pos = '\0';
 270 }
 271
 272 /**
 273  * Converts '<', '>', '\"', '\'' and '&' to their html entities
 274  * \param psz_content simple element content that is to be converted
 275  */
 276 char *convert_xml_special_chars( const char *psz_content )
 277 {
 278     char *psz_temp = malloc( 6 * strlen( psz_content ) + 1 );
 279     const char *p_from = psz_content;
 280     char *p_to   = psz_temp;
 281
 282     while ( *p_from )
 283     {
 284         if ( *p_from == '<' )
 285         {
 286             strcpy( p_to, "&lt;" );
 287             p_to += 4;
 288         }
 289         else if ( *p_from == '>' )
 290         {
 291             strcpy( p_to, "&gt;" );
 292             p_to += 4;
 293         }
 294         else if ( *p_from == '&' )
 295         {
 296             strcpy( p_to, "&amp;" );
 297             p_to += 5;
 298         }
 299         else if( *p_from == '\"' )
 300         {
 301             strcpy( p_to, "&quot;" );
 302             p_to += 6;
 303         }
 304         else if( *p_from == '\'' )
 305         {
 306             strcpy( p_to, "&#039;" );
 307             p_to += 6;
 308         }
 309         else
 310         {
 311             *p_to = *p_from;
 312             p_to++;
 313         }
 314         p_from++;
 315     }
 316     *p_to = '\0';
 317
 318     return psz_temp;
 319 }