From 9a55e8384d5115113f5623b98d66660ae2a89972 Mon Sep 17 00:00:00 2001 From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?= Date: Fri, 15 Dec 2006 14:16:08 +0000 Subject: [PATCH] Simpler UTF-8 check functions + rudimentary unit test --- src/Makefile.am | 8 ++- src/test/utf8.c | 85 +++++++++++++++++++++++ src/text/unicode.c | 164 +++++++++++---------------------------------- 3 files changed, 131 insertions(+), 126 deletions(-) create mode 100644 src/test/utf8.c diff --git a/src/Makefile.am b/src/Makefile.am index f4247e06e6..5470367d6e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -363,8 +363,9 @@ endif # Unit/regression test ############################################################################### if USE_LIBTOOL -check_PROGRAMS = test_i18n_atof test_url +check_PROGRAMS = test_i18n_atof test_url test_utf8 TESTS = $(check_PROGRAMS) +endif CFLAGS_tests = `$(VLC_CONFIG) --cflags libvlc` @@ -375,6 +376,9 @@ test_i18n_atof_CFLAGS = $(CFLAGS_tests) test_url_SOURCES = test/url.c test_url_LDADD = libvlc.la test_url_CFLAGS = $(CFLAGS_tests) -endif + +test_utf8_SOURCES = test/utf8.c +test_utf8_LDADD = libvlc.la +test_utf8_CFLAGS = $(CFLAGS_tests) FORCE: diff --git a/src/test/utf8.c b/src/test/utf8.c new file mode 100644 index 0000000000..d06509afc1 --- /dev/null +++ b/src/test/utf8.c @@ -0,0 +1,85 @@ +/***************************************************************************** + * utf8.c: Test for UTF-8 encoding/decoding stuff + ***************************************************************************** + * Copyright (C) 2006 Rémi Denis-Courmont + * $Id$ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. + *****************************************************************************/ + +#include <vlc/vlc.h> +#include "vlc_charset.h" + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> + +static void test (const char *in, const char *out) +{ + bool isutf8 = !strcmp (in, out); + char *str = strdup (in); + if (str == NULL) + abort (); + + if (isutf8) + printf ("\"%s\" should be accepted...\n", in); + else + printf ("\"%s\" should be rewritten as \"%s\"...\n", in, out); + + if ((IsUTF8 (in) != NULL) != isutf8) + { + printf (" ERROR: IsUTF8 (%s) failed\n", in); + exit (1); + } + + if ((EnsureUTF8 (str) != NULL) != isutf8) + { + printf (" ERROR: EnsureUTF8 (%s) failed\n", in); + exit (2); + } + + if (strcmp (str, out)) + { + printf (" ERROR: got \"%s\"\n", str); + exit (3); + } + + if ((EnsureUTF8 (str) == NULL) || IsUTF8 (str) == NULL) + { + printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in); + exit (4); + } + free (str); +} + +int main (void) +{ + (void)setvbuf (stdout, NULL, _IONBF, 0); + test ("", ""); + + test ("this_should_not_be_modified_1234", + "this_should_not_be_modified_1234"); + + test ("\xFF", "?"); // invalid byte + test ("\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello"); // BOM + test ("\x00\xE9", ""); // no conversion past end of string + + test ("T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC", "Télévision €"); + test ("T\xE9l\xE9vision", "T?l?vision"); + test ("\xC1\x94\xC3\xa9l\xC3\xA9vision", "??élévision"); /* overlong */ + + test ("Hel\xF0\x83\x85\x87lo", "Hel????lo"); /* more overlong */ + return 0; +} diff --git 
a/src/text/unicode.c b/src/text/unicode.c index 5b4adf4a66..7979a44e3b 100644 --- a/src/text/unicode.c +++ b/src/text/unicode.c @@ -639,147 +639,63 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... ) static char *CheckUTF8( char *str, char rep ) -#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF)) { - unsigned char *ptr, c; - + uint8_t *ptr = (uint8_t *)str; assert (str != NULL); - ptr = (unsigned char *)str; - while( (c = *ptr) != '\0' ) + for (;;) { - /* US-ASCII, 1 byte */ - if( c <= 0x7F ) - ptr++; /* OK */ - else - /* 2 bytes */ - if( ( c >= 0xC2 ) && ( c <= 0xDF ) ) - { - c = ptr[1]; - if( isutf8cont( c ) ) - ptr += 2; /* OK */ - else - goto error; - } - else - /* 3 bytes */ - if( c == 0xE0 ) - { - c = ptr[1]; - if( ( c >= 0xA0 ) && ( c <= 0xBF ) ) - { - c = ptr[2]; - if( isutf8cont( c ) ) - ptr += 3; /* OK */ - else - goto error; - } - else - goto error; - } - else - if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC ) - || ( c == 0xEE ) || ( c == 0xEF ) ) - { - c = ptr[1]; - if( isutf8cont( c ) ) - { - c = ptr[2]; - if( isutf8cont( c ) ) - ptr += 3; /* OK */ - else - goto error; - } - else - goto error; - } - else - if( c == 0xED ) - { - c = ptr[1]; - if( ( c >= 0x80 ) && ( c <= 0x9F ) ) - { - c = ptr[2]; - if( isutf8cont( c ) ) - ptr += 3; /* OK */ - else - goto error; - } - else - goto error; - } - else - /* 4 bytes */ - if( c == 0xF0 ) - { - c = ptr[1]; - if( ( c >= 0x90 ) && ( c <= 0xBF ) ) + uint8_t c = ptr[0]; + int charlen = -1; + + if (c == '\0') + break; + + for (int i = 0; i < 7; i++) + if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1)) { - c = ptr[2]; - if( isutf8cont( c ) ) - { - c = ptr[3]; - if( isutf8cont( c ) ) - ptr += 4; /* OK */ - else - goto error; - } - else - goto error; + charlen = i; + break; } - else - goto error; - } - else - if( ( c >= 0xF1 ) && ( c <= 0xF3 ) ) + + switch (charlen) { - c = ptr[1]; - if( isutf8cont( c ) ) - { - c = ptr[2]; - if( isutf8cont( c ) ) - { - c = ptr[3]; - if( isutf8cont( c ) ) - ptr += 4; /* OK 
*/ - goto error; - } - else - goto error; - } - else + case 0: // 7-bit ASCII character -> OK + ptr++; + continue; + + case -1: // 1111111x -> error + case 1: // continuation byte -> error goto error; } - else - if( c == 0xF4 ) + + assert (charlen >= 2); + + uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen)); + for (int i = 1; i < charlen; i++) { - c = ptr[1]; - if( ( c >= 0x80 ) && ( c <= 0x8F ) ) - { - c = ptr[2]; - if( isutf8cont( c ) ) - { - c = ptr[3]; - if( isutf8cont( c ) ) - ptr += 4; /* OK */ - else - goto error; - } - else - goto error; - } - else + assert (cp < (1 << 26)); + c = ptr[i]; + + if ((c == '\0') // unexpected end of string + || ((c >> 6) != 2)) // not a continuation byte goto error; + + cp = (cp << 6) | (ptr[i] & 0x3f); } - else + + if (cp < 128) // overlong (special case for ASCII) + goto error; + if (cp < (1u << (5 * charlen - 3))) // overlong goto error; + ptr += charlen; continue; -error: - if( rep == 0 ) + error: + if (rep == 0) return NULL; - *ptr++ = '?'; + *ptr++ = rep; str = NULL; } -- 2.39.2