git.sesse.net Git - vlc/blob - src/misc/strings.c

   1 /*****************************************************************************
   2  * strings.c: String related functions
   3  *****************************************************************************
   4  * Copyright (C) 2006 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Antoine Cellerier <dionoea at videolan dot org>
   8  *          Daniel Stranger <vlc at schmaller dot de>
   9  *          Rémi Denis-Courmont <rem # videolan org>
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  24  *****************************************************************************/
  25
  26 /*****************************************************************************
  27  * Preamble
  28  *****************************************************************************/
  29 #include <vlc/vlc.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #include <stdlib.h>
  33 #include <assert.h>
  34
  35 #include "vlc_strings.h"
  36 #include "vlc_url.h"
  37 #include "charset.h"
  38
  39 /**
  40  * Unescape URI encoded string
  41  * \return decoded duplicated string
  42  */
  43 char *unescape_URI_duplicate( const char *psz )
  44 {
  45     char *psz_dup = strdup( psz );
  46     unescape_URI( psz_dup );
  47     return psz_dup;
  48 }
  49
  50 /**
  51  * Unescape URI encoded string in place
  52  * \return nothing
  53  */
  54 void unescape_URI( char *psz )
  55 {
  56     unsigned char *in = (unsigned char *)psz, *out = in, c;
  57
  58     while( ( c = *in++ ) != '\0' )
  59     {
  60         switch( c )
  61         {
  62             case '%':
  63             {
  64                 char val[5], *pval = val;
  65                 unsigned long cp;
  66
  67                 switch( c = *in++ )
  68                 {
  69                     case '\0':
  70                         return;
  71
  72                     case 'u':
  73                     case 'U':
  74                         if( ( *pval++ = *in++ ) == '\0' )
  75                             return;
  76                         if( ( *pval++ = *in++ ) == '\0' )
  77                             return;
  78                         c = *in++;
  79
  80                     default:
  81                         *pval++ = c;
  82                         if( ( *pval++ = *in++ ) == '\0' )
  83                             return;
  84                         *pval = '\0';
  85                 }
  86
  87                 cp = strtoul( val, NULL, 0x10 );
  88                 if( cp < 0x80 )
  89                     *out++ = cp;
  90                 else
  91                 if( cp < 0x800 )
  92                 {
  93                     *out++ = (( cp >>  6)         | 0xc0);
  94                     *out++ = (( cp        & 0x3f) | 0x80);
  95                 }
  96                 else
  97                 {
  98                     assert( cp < 0x10000 );
  99                     *out++ = (( cp >> 12)         | 0xe0);
 100                     *out++ = (((cp >>  6) & 0x3f) | 0x80);
 101                     *out++ = (( cp        & 0x3f) | 0x80);
 102                 }
 103                 break;
 104             }
 105
 106             /* + is not a special case - it means plus, not space. */
 107
 108             default:
 109                 /* Inserting non-ASCII or non-printable characters is unsafe,
 110                  * and no sane browser will send these unencoded */
 111                 if( ( c < 32 ) || ( c > 127 ) )
 112                     *out++ = '?';
 113                 else
 114                     *out++ = c;
 115         }
 116     }
 117     *out = '\0';
 118 }
 119
 120 /**
 121  * Decode encoded URI string
 122  * \return decoded duplicated string
 123  */
 124 char *decode_URI_duplicate( const char *psz )
 125 {
 126     char *psz_dup = strdup( psz );
 127     decode_URI( psz_dup );
 128     return psz_dup;
 129 }
 130
 131 /**
 132  * Decode encoded URI string in place
 133  * \return nothing
 134  */
 135 void decode_URI( char *psz )
 136 {
 137     unsigned char *in = (unsigned char *)psz, *out = in, c;
 138
 139     while( ( c = *in++ ) != '\0' )
 140     {
 141         switch( c )
 142         {
 143             case '%':
 144             {
 145                 char hex[3];
 146
 147                 if( ( ( hex[0] = *in++ ) == 0 )
 148                  || ( ( hex[1] = *in++ ) == 0 ) )
 149                     return;
 150
 151                 hex[2] = '\0';
 152                 *out++ = (unsigned char)strtoul( hex, NULL, 0x10 );
 153                 break;
 154             }
 155
 156             case '+':
 157                 *out++ = ' ';
 158                 break;
 159
 160             default:
 161                 /* Inserting non-ASCII or non-printable characters is unsafe,
 162                  * and no sane browser will send these unencoded */
 163                 if( ( c < 32 ) || ( c > 127 ) )
 164                     *out++ = '?';
 165                 else
 166                     *out++ = c;
 167         }
 168     }
 169     *out = '\0';
 170     EnsureUTF8( psz );
 171 }
 172
 173 static inline int isurlsafe( int c )
 174 {
 175     return ( (unsigned char)( c - 'a' ) < 26 )
 176             || ( (unsigned char)( c - 'A' ) < 26 )
 177             || ( (unsigned char)( c - '0' ) < 10 )
 178         /* Hmm, we should not encode character that are allowed in URLs
 179          * (even if they are not URL-safe), nor URL-safe characters.
 180          * We still encode some of them because of Microsoft's crap browser.
 181          */
 182             || ( strchr( "-_.", c ) != NULL );
 183 }
 184
 185 static inline char url_hexchar( int c )
 186 {
 187     return ( c < 10 ) ? c + '0' : c + 'A' - 10;
 188 }
 189
 190 /**
 191  * encode_URI_component
 192  * Encodes an URI component.
 193  *
 194  * @param psz_url nul-terminated UTF-8 representation of the component.
 195  * Obviously, you can't pass an URI containing a nul character, but you don't
 196  * want to do that, do you?
 197  *
 198  * @return encoded string (must be free()'d)
 199  */
 200 char *encode_URI_component( const char *psz_url )
 201 {
 202     char psz_enc[3 * strlen( psz_url ) + 1], *out = psz_enc;
 203     const uint8_t *in;
 204
 205     for( in = (const uint8_t *)psz_url; *in; in++ )
 206     {
 207         uint8_t c = *in;
 208
 209         if( isurlsafe( c ) )
 210             *out++ = (char)c;
 211         else
 212         if ( c == ' ')
 213             *out++ = '+';
 214         else
 215         {
 216             *out++ = '%';
 217             *out++ = url_hexchar( c >> 4 );
 218             *out++ = url_hexchar( c & 0xf );
 219         }
 220     }
 221     *out++ = '\0';
 222
 223     return strdup( psz_enc );
 224 }
 225
 226 /**
 227  * Converts "&lt;", "&gt;" and "&amp;" to "<", ">" and "&"
 228  * \param string to convert
 229  */
 230 void resolve_xml_special_chars( char *psz_value )
 231 {
 232     char *p_pos = psz_value;
 233
 234     while ( *psz_value )
 235     {
 236         if( !strncmp( psz_value, "&lt;", 4 ) )
 237         {
 238             *p_pos = '<';
 239             psz_value += 4;
 240         }
 241         else if( !strncmp( psz_value, "&gt;", 4 ) )
 242         {
 243             *p_pos = '>';
 244             psz_value += 4;
 245         }
 246         else if( !strncmp( psz_value, "&amp;", 5 ) )
 247         {
 248             *p_pos = '&';
 249             psz_value += 5;
 250         }
 251         else if( !strncmp( psz_value, "&quot;", 6 ) )
 252         {
 253             *p_pos = '\"';
 254             psz_value += 6;
 255         }
 256         else if( !strncmp( psz_value, "&#039;", 6 ) )
 257         {
 258             *p_pos = '\'';
 259             psz_value += 6;
 260         }
 261         else
 262         {
 263             *p_pos = *psz_value;
 264             psz_value++;
 265         }
 266
 267         p_pos++;
 268     }
 269
 270     *p_pos = '\0';
 271 }
 272
 273 /**
 274  * Converts '<', '>', '\"', '\'' and '&' to their html entities
 275  * \param psz_content simple element content that is to be converted
 276  */
 277 char *convert_xml_special_chars( const char *psz_content )
 278 {
 279     char *psz_temp = malloc( 6 * strlen( psz_content ) + 1 );
 280     const char *p_from = psz_content;
 281     char *p_to   = psz_temp;
 282
 283     while ( *p_from )
 284     {
 285         if ( *p_from == '<' )
 286         {
 287             strcpy( p_to, "&lt;" );
 288             p_to += 4;
 289         }
 290         else if ( *p_from == '>' )
 291         {
 292             strcpy( p_to, "&gt;" );
 293             p_to += 4;
 294         }
 295         else if ( *p_from == '&' )
 296         {
 297             strcpy( p_to, "&amp;" );
 298             p_to += 5;
 299         }
 300         else if( *p_from == '\"' )
 301         {
 302             strcpy( p_to, "&quot;" );
 303             p_to += 6;
 304         }
 305         else if( *p_from == '\'' )
 306         {
 307             strcpy( p_to, "&#039;" );
 308             p_to += 6;
 309         }
 310         else
 311         {
 312             *p_to = *p_from;
 313             p_to++;
 314         }
 315         p_from++;
 316     }
 317     *p_to = '\0';
 318
 319     return psz_temp;
 320 }