# Unit/regression test
###############################################################################
if USE_LIBTOOL
-check_PROGRAMS = test_i18n_atof test_url
+check_PROGRAMS = test_i18n_atof test_url test_utf8
TESTS = $(check_PROGRAMS)
+endif
CFLAGS_tests = `$(VLC_CONFIG) --cflags libvlc`
test_url_SOURCES = test/url.c
test_url_LDADD = libvlc.la
test_url_CFLAGS = $(CFLAGS_tests)
-endif
+# test_utf8: built from test/utf8.c with the same flags and libvlc.la linkage as test_url
+test_utf8_SOURCES = test/utf8.c
+test_utf8_LDADD = libvlc.la
+test_utf8_CFLAGS = $(CFLAGS_tests)
FORCE:
--- /dev/null
+/*****************************************************************************
+ * utf8.c: Test for UTF-8 encoding/decoding stuff
+ *****************************************************************************
+ * Copyright (C) 2006 Rémi Denis-Courmont
+ * $Id$
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#include <vlc/vlc.h>
+#include "vlc_charset.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+/**
+ * Runs IsUTF8() and EnsureUTF8() on one input and checks the outcome.
+ *
+ * If in and out compare equal, the input is expected to be accepted as-is:
+ * both functions must return non-NULL and the buffer must stay equal to out.
+ * Otherwise both functions must return NULL, and the buffer handed to
+ * EnsureUTF8() must afterwards compare equal to out.
+ *
+ * On the first failed check an error line is printed and the process exits
+ * with a status naming the check: 1 (IsUTF8 verdict), 2 (EnsureUTF8 verdict),
+ * 3 (buffer differs from out), 4 (the buffer is rejected by a second
+ * EnsureUTF8() call or by IsUTF8()). abort() is called if strdup() fails.
+ *
+ * @param in NUL-terminated byte string to check
+ * @param out expected buffer contents after EnsureUTF8()
+ */
+static void test (const char *in, const char *out)
+{
+ bool isutf8 = !strcmp (in, out); /* identical strings mean "must be accepted" */
+ char *str = strdup (in); /* mutable copy handed to EnsureUTF8() */
+ if (str == NULL)
+ abort ();
+ /* Announce what is expected of this input. */
+ if (isutf8)
+ printf ("\"%s\" should be accepted...\n", in);
+ else
+ printf ("\"%s\" should be rewritten as \"%s\"...\n", in, out);
+ /* Check 1: IsUTF8() returns non-NULL exactly when acceptance is expected. */
+ if ((IsUTF8 (in) != NULL) != isutf8)
+ {
+ printf (" ERROR: IsUTF8 (%s) failed\n", in);
+ exit (1);
+ }
+ /* Check 2: EnsureUTF8() must reach the same verdict on the copy. */
+ if ((EnsureUTF8 (str) != NULL) != isutf8)
+ {
+ printf (" ERROR: EnsureUTF8 (%s) failed\n", in);
+ exit (2);
+ }
+ /* Check 3: after EnsureUTF8() the copy must match the expected bytes. */
+ if (strcmp (str, out))
+ {
+ printf (" ERROR: got \"%s\"\n", str);
+ exit (3);
+ }
+ /* Check 4: the resulting buffer must now be accepted by both functions. */
+ if ((EnsureUTF8 (str) == NULL) || IsUTF8 (str) == NULL)
+ {
+ printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in);
+ exit (4);
+ }
+ free (str);
+}
+
+/**
+ * Runs every UTF-8 test vector through test().
+ *
+ * Expected strings are written with hexadecimal escapes, exactly like the
+ * inputs, so the vectors do not depend on the encoding this source file is
+ * stored or compiled in.
+ *
+ * @return 0 if every vector passes; test() exits the process with a non-zero
+ * status on the first failing vector.
+ */
+int main (void)
+{
+ /* Unbuffered stdout: each progress line is emitted as soon as it is printed. */
+ (void)setvbuf (stdout, NULL, _IONBF, 0);
+ test ("", "");
+
+ test ("this_should_not_be_modified_1234",
+ "this_should_not_be_modified_1234");
+
+ test ("\xFF", "?"); // invalid byte
+ test ("\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello"); // BOM
+ test ("\x00\xE9", ""); // no conversion past end of string
+
+ /* Valid 2-byte (U+00E9) and 3-byte (U+20AC) sequences must be kept as-is. */
+ test ("T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC",
+ "T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC");
+ /* Lone 0xE9 bytes are not valid UTF-8: one '?' per byte. */
+ test ("T\xE9l\xE9vision", "T?l?vision");
+ /* Overlong lead 0xC1 and its continuation byte are each replaced. */
+ test ("\xC1\x94\xC3\xa9l\xC3\xA9vision",
+ "??\xC3\xA9l\xC3\xA9vision"); /* overlong */
+
+ test ("Hel\xF0\x83\x85\x87lo", "Hel????lo"); /* more overlong */
+ return 0;
+}
static char *CheckUTF8( char *str, char rep )
-#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
{
- unsigned char *ptr, c;
-
+ uint8_t *ptr = (uint8_t *)str;
assert (str != NULL);
- ptr = (unsigned char *)str;
- while( (c = *ptr) != '\0' )
+ for (;;)
{
- /* US-ASCII, 1 byte */
- if( c <= 0x7F )
- ptr++; /* OK */
- else
- /* 2 bytes */
- if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
- {
- c = ptr[1];
- if( isutf8cont( c ) )
- ptr += 2; /* OK */
- else
- goto error;
- }
- else
- /* 3 bytes */
- if( c == 0xE0 )
- {
- c = ptr[1];
- if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
- {
- c = ptr[2];
- if( isutf8cont( c ) )
- ptr += 3; /* OK */
- else
- goto error;
- }
- else
- goto error;
- }
- else
- if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
- || ( c == 0xEE ) || ( c == 0xEF ) )
- {
- c = ptr[1];
- if( isutf8cont( c ) )
- {
- c = ptr[2];
- if( isutf8cont( c ) )
- ptr += 3; /* OK */
- else
- goto error;
- }
- else
- goto error;
- }
- else
- if( c == 0xED )
- {
- c = ptr[1];
- if( ( c >= 0x80 ) && ( c <= 0x9F ) )
- {
- c = ptr[2];
- if( isutf8cont( c ) )
- ptr += 3; /* OK */
- else
- goto error;
- }
- else
- goto error;
- }
- else
- /* 4 bytes */
- if( c == 0xF0 )
- {
- c = ptr[1];
- if( ( c >= 0x90 ) && ( c <= 0xBF ) )
+ uint8_t c = ptr[0];
+ int charlen = -1;
+
+ if (c == '\0')
+ break;
+
+ for (int i = 0; i < 7; i++)
+ if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1))
{
- c = ptr[2];
- if( isutf8cont( c ) )
- {
- c = ptr[3];
- if( isutf8cont( c ) )
- ptr += 4; /* OK */
- else
- goto error;
- }
- else
- goto error;
+ charlen = i;
+ break;
}
- else
- goto error;
- }
- else
- if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
+
+ switch (charlen)
{
- c = ptr[1];
- if( isutf8cont( c ) )
- {
- c = ptr[2];
- if( isutf8cont( c ) )
- {
- c = ptr[3];
- if( isutf8cont( c ) )
- ptr += 4; /* OK */
- goto error;
- }
- else
- goto error;
- }
- else
+ case 0: // 7-bit ASCII character -> OK
+ ptr++;
+ continue;
+
+ case -1: // 1111111x -> error
+ case 1: // continuation byte -> error
goto error;
}
- else
- if( c == 0xF4 )
+
+ assert (charlen >= 2);
+
+ uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
+ for (int i = 1; i < charlen; i++)
{
- c = ptr[1];
- if( ( c >= 0x80 ) && ( c <= 0x8F ) )
- {
- c = ptr[2];
- if( isutf8cont( c ) )
- {
- c = ptr[3];
- if( isutf8cont( c ) )
- ptr += 4; /* OK */
- else
- goto error;
- }
- else
- goto error;
- }
- else
+ assert (cp < (1 << 26));
+ c = ptr[i];
+
+ if ((c == '\0') // unexpected end of string
+ || ((c >> 6) != 2)) // not a continuation byte
goto error;
+
+ cp = (cp << 6) | (ptr[i] & 0x3f);
}
- else
+
+ if (cp < 128) // overlong (special case for ASCII)
+ goto error;
+ if (cp < (1u << (5 * charlen - 3))) // overlong
goto error;
+ ptr += charlen;
continue;
-error:
- if( rep == 0 )
+ error:
+ if (rep == 0)
return NULL;
- *ptr++ = '?';
+ *ptr++ = rep;
str = NULL;
}