From 79de221529a46aa6f247d71fbbbcc20105802a91 Mon Sep 17 00:00:00 2001 From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?= Date: Sun, 5 Apr 2009 13:21:31 +0300 Subject: [PATCH] resolve_xml_special_chars handles non-ASCII Unicode code points --- src/text/strings.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/text/strings.c b/src/text/strings.c index 77f175abfd..71caaaa8e2 100644 --- a/src/text/strings.c +++ b/src/text/strings.c @@ -384,23 +384,42 @@ void resolve_xml_special_chars( char *psz_value ) { if( *psz_value == '&' ) { - const char *psz_value1 = psz_value + 1; - if( *psz_value1 == '#' ) - { + if( psz_value[1] == '#' ) + { /* &#xxx; Unicode code point */ char *psz_end; - int i = strtol( psz_value+2, &psz_end, 10 ); + unsigned long cp = strtoul( psz_value+2, &psz_end, 10 ); if( *psz_end == ';' ) { - if( i >= 32 && i <= 126 ) + psz_value = psz_end + 1; + if( cp == 0 ) + (void)0; /* skip nuls */ + else + if( cp <= 0x7F ) + { + *p_pos = cp; + } + else + /* Unicode code point outside ASCII. + * &#xxx; representation is longer than UTF-8 :) */ + if( cp <= 0x7FF ) { - *p_pos = (char)i; - psz_value = psz_end+1; + *p_pos++ = 0xC0 | (cp >> 6); + *p_pos = 0x80 | (cp & 0x3F); } else + if( cp <= 0xFFFF ) { - /* Unhandled code, FIXME */ - *p_pos = *psz_value; - psz_value++; + *p_pos++ = 0xE0 | (cp >> 12); + *p_pos++ = 0x80 | ((cp >> 6) & 0x3F); + *p_pos = 0x80 | (cp & 0x3F); + } + else + if( cp <= 0x1FFFFF ) /* Outside the BMP */ + { /* Unicode stops at 10FFFF, but who cares? */ + *p_pos++ = 0xF0 | (cp >> 18); + *p_pos++ = 0x80 | ((cp >> 12) & 0x3F); + *p_pos++ = 0x80 | ((cp >> 6) & 0x3F); + *p_pos = 0x80 | (cp & 0x3F); } } else @@ -411,10 +430,10 @@ void resolve_xml_special_chars( char *psz_value ) } } else - { + { /* Well-known XML entity */ const struct xml_entity_s *ent; - ent = bsearch (psz_value1, xml_entities, + ent = bsearch (psz_value + 1, xml_entities, sizeof (xml_entities) / sizeof (*ent), sizeof (*ent), cmp_entity); if (ent != NULL) -- 2.39.5