From 9a55e8384d5115113f5623b98d66660ae2a89972 Mon Sep 17 00:00:00 2001
From: =?utf8?q?R=C3=A9mi=20Denis-Courmont?= <rem@videolan.org>
Date: Fri, 15 Dec 2006 14:16:08 +0000
Subject: [PATCH] Simpler UTF-8 check functions + rudimentary unit test

---
 src/Makefile.am    |   8 ++-
 src/test/utf8.c    |  85 +++++++++++++++++++++++
 src/text/unicode.c | 164 +++++++++++----------------------------------
 3 files changed, 131 insertions(+), 126 deletions(-)
 create mode 100644 src/test/utf8.c

diff --git a/src/Makefile.am b/src/Makefile.am
index f4247e06e6..5470367d6e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -363,8 +363,9 @@ endif
 # Unit/regression test
 ###############################################################################
 if USE_LIBTOOL
-check_PROGRAMS = test_i18n_atof test_url
+check_PROGRAMS = test_i18n_atof test_url test_utf8
 TESTS = $(check_PROGRAMS)
+endif
 
 CFLAGS_tests = `$(VLC_CONFIG) --cflags libvlc`
 
@@ -375,6 +376,9 @@ test_i18n_atof_CFLAGS = $(CFLAGS_tests)
 test_url_SOURCES = test/url.c
 test_url_LDADD = libvlc.la
 test_url_CFLAGS = $(CFLAGS_tests)
-endif
+
+test_utf8_SOURCES = test/utf8.c
+test_utf8_LDADD = libvlc.la
+test_utf8_CFLAGS = $(CFLAGS_tests)
 
 FORCE:
diff --git a/src/test/utf8.c b/src/test/utf8.c
new file mode 100644
index 0000000000..d06509afc1
--- /dev/null
+++ b/src/test/utf8.c
@@ -0,0 +1,85 @@
+/*****************************************************************************
+ * utf8.c: Test for UTF-8 encoding/decoding stuff
+ *****************************************************************************
+ * Copyright (C) 2006 Rémi Denis-Courmont
+ * $Id$
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#include <vlc/vlc.h>
+#include "vlc_charset.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+/* Feeds IN to IsUTF8() and EnsureUTF8(); OUT is the expected sanitized text. */
+static void test (const char *in, const char *out)
+{
+    /* IN counts as valid exactly when no rewriting is expected. */
+    const bool expect_valid = (strcmp (in, out) == 0);
+    char *copy = strdup (in); /* EnsureUTF8() gets a writable copy */
+    if (copy == NULL)
+        abort ();
+
+    if (!expect_valid)
+        printf ("\"%s\" should be rewritten as \"%s\"...\n", in, out);
+    else
+        printf ("\"%s\" should be accepted...\n", in);
+
+    if (expect_valid != (IsUTF8 (in) != NULL))
+    {
+        printf (" ERROR: IsUTF8 (%s) failed\n", in);
+        exit (1);
+    }
+    if (expect_valid != (EnsureUTF8 (copy) != NULL))
+    {
+        printf (" ERROR: EnsureUTF8 (%s) failed\n", in);
+        exit (2);
+    }
+    if (strcmp (copy, out) != 0)
+    {
+        printf (" ERROR: got \"%s\"\n", copy);
+        exit (3);
+    }
+    /* A second pass over the sanitized copy must find nothing left to fix. */
+    if (EnsureUTF8 (copy) == NULL || IsUTF8 (copy) == NULL)
+    {
+        printf (" ERROR: EnsureUTF8 (%s) is not UTF-8\n", in);
+        exit (4);
+    }
+    free (copy);
+}
+
+/* Runs every case; test() terminates the process with a non-zero status on failure. */
+int main (void)
+{
+    (void)setvbuf (stdout, NULL, _IONBF, 0); /* unbuffered stdout */
+    test ("", "");
+    test ("this_should_not_be_modified_1234",
+          "this_should_not_be_modified_1234");
+    test ("\xFF", "?"); // invalid byte
+    test ("\xEF\xBB\xBFHello", "\xEF\xBB\xBFHello"); // BOM
+    test ("\x00\xE9", ""); // no conversion past end of string
+
+    /* Expected text uses byte escapes so the file's encoding cannot alter it. */
+    test ("T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC",
+          "T\xC3\xA9l\xC3\xA9vision \xE2\x82\xAC"); /* valid: kept as-is */
+    test ("T\xE9l\xE9vision", "T?l?vision");
+    test ("\xC1\x94\xC3\xa9l\xC3\xA9vision", "??\xC3\xA9l\xC3\xA9vision"); /* overlong */
+    test ("Hel\xF0\x83\x85\x87lo", "Hel????lo"); /* more overlong */
+    return 0;
+}
diff --git a/src/text/unicode.c b/src/text/unicode.c
index 5b4adf4a66..7979a44e3b 100644
--- a/src/text/unicode.c
+++ b/src/text/unicode.c
@@ -639,147 +639,63 @@ int utf8_fprintf( FILE *stream, const char *fmt, ... )
 
 
 static char *CheckUTF8( char *str, char rep )
-#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF)) 
 {
-    unsigned char *ptr, c;
-
+    uint8_t *ptr = (uint8_t *)str;
     assert (str != NULL);
 
-    ptr = (unsigned char *)str;
-    while( (c = *ptr) != '\0' )
+    for (;;)
     {
-        /* US-ASCII, 1 byte */
-        if( c <= 0x7F )
-            ptr++; /* OK */
-        else
-        /* 2 bytes */
-        if( ( c >= 0xC2 ) && ( c <= 0xDF ) )
-        {
-            c = ptr[1];
-            if( isutf8cont( c ) )
-                ptr += 2; /* OK */
-            else
-                goto error;
-        }
-        else
-        /* 3 bytes */
-        if( c == 0xE0 )
-        {
-            c = ptr[1];
-            if( ( c >= 0xA0 ) && ( c <= 0xBF ) )
-            {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                    ptr += 3; /* OK */
-                else
-                    goto error;
-            }
-            else
-                goto error;
-        }
-        else
-        if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC )
-         || ( c == 0xEE ) || ( c == 0xEF ) )
-        {
-            c = ptr[1];
-            if( isutf8cont( c ) )
-            {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                    ptr += 3; /* OK */
-                else
-                    goto error;
-            }
-            else
-                goto error;
-        }
-        else
-        if( c == 0xED )
-        {
-            c = ptr[1];
-            if( ( c >= 0x80 ) && ( c <= 0x9F ) )
-            {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                    ptr += 3; /* OK */
-                else
-                    goto error;
-            }
-            else
-                goto error;
-        }
-        else
-        /* 4 bytes */
-        if( c == 0xF0 )
-        {
-            c = ptr[1];
-            if( ( c >= 0x90 ) && ( c <= 0xBF ) )
+        uint8_t c = ptr[0];
+        int charlen = -1;
+
+        if (c == '\0')
+            break;
+
+        for (int i = 0; i < 7; i++)
+            if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1))
             {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                {
-                    c = ptr[3];
-                    if( isutf8cont( c ) )
-                        ptr += 4; /* OK */
-                    else
-                        goto error;
-                }
-                else
-                    goto error;
+                charlen = i;
+                break;
             }
-            else
-                goto error;
-        }
-        else
-        if( ( c >= 0xF1 ) && ( c <= 0xF3 ) )
+
+        switch (charlen)
         {
-            c = ptr[1];
-            if( isutf8cont( c ) )
-            {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                {
-                    c = ptr[3];
-                    if( isutf8cont( c ) )
-                        ptr += 4; /* OK */
-                    goto error;
-                }
-                else
-                    goto error;
-            }
-            else
+            case 0: // 7-bit ASCII character -> OK
+                ptr++;
+                continue;
+
+            case -1: // 1111111x -> error
+            case 1: // continuation byte -> error
                 goto error;
         }
-        else
-        if( c == 0xF4 )
+
+        assert (charlen >= 2);
+
+        uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
+        for (int i = 1; i < charlen; i++)
         {
-            c = ptr[1];
-            if( ( c >= 0x80 ) && ( c <= 0x8F ) )
-            {
-                c = ptr[2];
-                if( isutf8cont( c ) )
-                {
-                    c = ptr[3];
-                    if( isutf8cont( c ) )
-                        ptr += 4; /* OK */
-                    else
-                        goto error;
-                }
-                else
-                    goto error;
-            }
-            else
+            assert (cp < (1 << 26));
+            c = ptr[i];
+
+            if ((c == '\0') // unexpected end of string
+             || ((c >> 6) != 2)) // not a continuation byte
                 goto error;
+
+            cp = (cp << 6) | (ptr[i] & 0x3f);
         }
-        else
+
+        if (cp < 128 || cp < (1u << (5 * charlen - 4))) // overlong: N bytes (N >= 3) start at 2^(5N-4)
+            goto error;
+        if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) // beyond Unicode, or UTF-16 surrogate
+            goto error;
 
+        ptr += charlen;
         continue;
 
-error:
-        if( rep == 0 )
+    error:
+        if (rep == 0)
             return NULL;
-        *ptr++ = '?';
+        *ptr++ = rep;
         str = NULL;
     }
 
-- 
2.39.2