]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/htmlsubtitles.c
Merge commit '01f1f017d831cf14617aaaeafcec3ae3a81efce7'
[ffmpeg] / libavcodec / htmlsubtitles.c
index 8b57febd26384c1c77585cbbec666ee35e764088..fb9f90042228d881d18a9ecf62fd726f98e09b55 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2010  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (c) 2017  Clément Bœsch <u@pkh.me>
  *
  * This file is part of FFmpeg.
  *
@@ -18,6 +19,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/parseutils.h"
 static int html_color_parse(void *log_ctx, const char *str)
 {
     uint8_t rgba[4];
+    int nb_sharps = 0;
+    while (str[nb_sharps] == '#')
+        nb_sharps++;
+    str += FFMAX(0, nb_sharps - 1);
     if (av_parse_color(rgba, str, strcspn(str, "\" >"), log_ctx) < 0)
         return -1;
     return rgba[0] | rgba[1] << 8 | rgba[2] << 16;
 }
 
-enum {
-    PARAM_UNKNOWN = -1,
-    PARAM_SIZE,
-    PARAM_COLOR,
-    PARAM_FACE,
-    PARAM_NUMBER
-};
-
-typedef struct SrtStack {
-    char tag[128];
-    char param[PARAM_NUMBER][128];
-} SrtStack;
-
 static void rstrip_spaces_buf(AVBPrint *buf)
 {
-    while (buf->len > 0 && buf->str[buf->len - 1] == ' ')
-        buf->str[--buf->len] = 0;
+    if (av_bprint_is_complete(buf))
+        while (buf->len > 0 && buf->str[buf->len - 1] == ' ')
+            buf->str[--buf->len] = 0;
+}
+
+/* Skip all {\xxx} substrings (except for {\an%d}),
+   as well as all MicroDVD-like styles such as {Y:xxx}. */
+static void handle_open_brace(AVBPrint *dst, const char **inp, int *an, int *closing_brace_missing)
+{
+    int len = 0;
+    const char *in = *inp;
+
+    *an += sscanf(in, "{\\an%*1u}%n", &len) >= 0 && len > 0;
+
+    if (!*closing_brace_missing) {
+        if (   (*an != 1 && in[1] == '\\')
+            || (in[1] && strchr("CcFfoPSsYy", in[1]) && in[2] == ':')) {
+            char *bracep = strchr(in+2, '}');
+            if (bracep) {
+                *inp = bracep;
+                return;
+            } else
+                *closing_brace_missing = 1;
+        }
+    }
+
+    av_bprint_chars(dst, *in, 1);
 }
 
-void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
+struct font_tag {
+    char face[128];
+    int size;
+    uint32_t color;
+};
+
+/*
+ * The general policy of the converter is to mask unsupported tags or formatting
+ * errors (while still alerting the user/subtitle writer with an error/warning)
+ * without dropping any actual text content for the end user.
+ */
+int ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
 {
-    char *param, buffer[128], tmp[128];
-    int len, tag_close, sptr = 1, line_start = 1, an = 0, end = 0;
-    SrtStack stack[16];
+    char *param, buffer[128];
+    int len, tag_close, sptr = 0, line_start = 1, an = 0, end = 0;
+    int closing_brace_missing = 0;
+    int i, likely_a_tag;
+
+    /*
+     * state stack is only present for fonts since they are the only tags where
+     * the state is not binary. Here is a typical use case:
+     *
+     *   <font color="red" size=10>
+     *     red 10
+     *     <font size=50> RED AND BIG </font>
+     *     red 10 again
+     *   </font>
+     *
+     * On the other hand, using the state system for all the tags should be
+     * avoided because it breaks wrongly nested tags such as:
+     *
+     *   <b> foo <i> bar </b> bla </i>
+     *
+     * We don't want to break here; instead, we will treat all these tags as
+     * binary state markers. Basically, "<b>" will activate bold, and "</b>"
+     * will deactivate it, whatever the current state.
+     *
+     * This also prevents cases where a stray closing tag is left over
+     * after its opening tag was dropped. Yes, this happens, and we
+     * still don't want to print a "</b>" at the end of the dialog event.
+     */
+    struct font_tag stack[16];
 
-    stack[0].tag[0] = 0;
-    strcpy(stack[0].param[PARAM_SIZE],  "{\\fs}");
-    strcpy(stack[0].param[PARAM_COLOR], "{\\c}");
-    strcpy(stack[0].param[PARAM_FACE],  "{\\fn}");
+    memset(&stack[0], 0, sizeof(stack[0]));
 
     for (; !end && *in; in++) {
         switch (*in) {
@@ -78,91 +130,128 @@ void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
             if (!line_start)
                 av_bprint_chars(dst, *in, 1);
             break;
-        case '{':    /* skip all {\xxx} substrings except for {\an%d}
-                        and all microdvd like styles such as {Y:xxx} */
-            len = 0;
-            an += sscanf(in, "{\\an%*1u}%n", &len) >= 0 && len > 0;
-            if ((an != 1 && (len = 0, sscanf(in, "{\\%*[^}]}%n", &len) >= 0 && len > 0)) ||
-                (len = 0, sscanf(in, "{%*1[CcFfoPSsYy]:%*[^}]}%n", &len) >= 0 && len > 0)) {
-                in += len - 1;
-            } else
-                av_bprint_chars(dst, *in, 1);
+        case '{':
+            handle_open_brace(dst, &in, &an, &closing_brace_missing);
             break;
         case '<':
+            /*
+             * "<<" are likely latin guillemets in ASCII or some kind of random
+             * style effect; see sub/badsyntax.srt in the FATE samples
+             * directory for real test cases.
+             */
+
+            likely_a_tag = 1;
+            for (i = 0; in[1] == '<'; i++) {
+                av_bprint_chars(dst, '<', 1);
+                likely_a_tag = 0;
+                in++;
+            }
+
             tag_close = in[1] == '/';
+            if (tag_close)
+                likely_a_tag = 1;
+
+            av_assert0(in[0] == '<');
+
             len = 0;
-            if (sscanf(in+tag_close+1, "%127[^>]>%n", buffer, &len) >= 1 && len > 0) {
+
+            if (sscanf(in+tag_close+1, "%127[^<>]>%n", buffer, &len) >= 1 && len > 0) {
+                const int skip = len + tag_close;
                 const char *tagname = buffer;
-                while (*tagname == ' ')
+                while (*tagname == ' ') {
+                    likely_a_tag = 0;
                     tagname++;
+                }
                 if ((param = strchr(tagname, ' ')))
                     *param++ = 0;
-                if ((!tag_close && sptr < FF_ARRAY_ELEMS(stack)) ||
-                    ( tag_close && sptr > 0 && !strcmp(stack[sptr-1].tag, tagname))) {
-                    int i, j, unknown = 0;
-                    in += len + tag_close;
-                    if (!tag_close)
-                        memset(stack+sptr, 0, sizeof(*stack));
-                    if (!strcmp(tagname, "font")) {
-                        if (tag_close) {
-                            for (i=PARAM_NUMBER-1; i>=0; i--)
-                                if (stack[sptr-1].param[i][0])
-                                    for (j=sptr-2; j>=0; j--)
-                                        if (stack[j].param[i][0]) {
-                                            av_bprintf(dst, "%s", stack[j].param[i]);
-                                            break;
-                                        }
-                        } else {
-                            while (param) {
-                                if (!strncmp(param, "size=", 5)) {
-                                    unsigned font_size;
-                                    param += 5 + (param[5] == '"');
-                                    if (sscanf(param, "%u", &font_size) == 1) {
-                                        snprintf(stack[sptr].param[PARAM_SIZE],
-                                             sizeof(stack[0].param[PARAM_SIZE]),
-                                             "{\\fs%u}", font_size);
-                                    }
-                                } else if (!strncmp(param, "color=", 6)) {
-                                    param += 6 + (param[6] == '"');
-                                    snprintf(stack[sptr].param[PARAM_COLOR],
-                                         sizeof(stack[0].param[PARAM_COLOR]),
-                                         "{\\c&H%X&}",
-                                         html_color_parse(log_ctx, param));
-                                } else if (!strncmp(param, "face=", 5)) {
-                                    param += 5 + (param[5] == '"');
-                                    len = strcspn(param,
-                                                  param[-1] == '"' ? "\"" :" ");
-                                    av_strlcpy(tmp, param,
-                                               FFMIN(sizeof(tmp), len+1));
-                                    param += len;
-                                    snprintf(stack[sptr].param[PARAM_FACE],
-                                             sizeof(stack[0].param[PARAM_FACE]),
-                                             "{\\fn%s}", tmp);
+
+                /* Check if this is likely a tag */
+#define LIKELY_A_TAG_CHAR(x) (((x) >= '0' && (x) <= '9') || \
+                              ((x) >= 'a' && (x) <= 'z') || \
+                              ((x) >= 'A' && (x) <= 'Z') || \
+                               (x) == '_' || (x) == '/')
+                for (i = 0; tagname[i]; i++) {
+                    if (!LIKELY_A_TAG_CHAR(tagname[i])) {
+                        likely_a_tag = 0;
+                        break;
+                    }
+                }
+
+                if (!av_strcasecmp(tagname, "font")) {
+                    if (tag_close && sptr > 0) {
+                        struct font_tag *cur_tag  = &stack[sptr--];
+                        struct font_tag *last_tag = &stack[sptr];
+
+                        if (cur_tag->size) {
+                            if (!last_tag->size)
+                                av_bprintf(dst, "{\\fs}");
+                            else if (last_tag->size != cur_tag->size)
+                                av_bprintf(dst, "{\\fs%d}", last_tag->size);
+                        }
+
+                        if (cur_tag->color & 0xff000000) {
+                            if (!(last_tag->color & 0xff000000))
+                                av_bprintf(dst, "{\\c}");
+                            else if (last_tag->color != cur_tag->color)
+                                av_bprintf(dst, "{\\c&H%"PRIX32"&}", last_tag->color & 0xffffff);
+                        }
+
+                        if (cur_tag->face[0]) {
+                            if (!last_tag->face[0])
+                                av_bprintf(dst, "{\\fn}");
+                            else if (strcmp(last_tag->face, cur_tag->face))
+                                av_bprintf(dst, "{\\fn%s}", last_tag->face);
+                        }
+                    } else if (!tag_close && sptr < FF_ARRAY_ELEMS(stack) - 1) {
+                        struct font_tag *new_tag = &stack[sptr + 1];
+
+                        *new_tag = stack[sptr++];
+
+                        while (param) {
+                            if (!av_strncasecmp(param, "size=", 5)) {
+                                param += 5 + (param[5] == '"');
+                                if (sscanf(param, "%u", &new_tag->size) == 1)
+                                    av_bprintf(dst, "{\\fs%u}", new_tag->size);
+                            } else if (!av_strncasecmp(param, "color=", 6)) {
+                                int color;
+                                param += 6 + (param[6] == '"');
+                                color = html_color_parse(log_ctx, param);
+                                if (color >= 0) {
+                                    new_tag->color = 0xff000000 | color;
+                                    av_bprintf(dst, "{\\c&H%"PRIX32"&}", new_tag->color & 0xffffff);
                                 }
-                                if ((param = strchr(param, ' ')))
-                                    param++;
+                            } else if (!av_strncasecmp(param, "face=", 5)) {
+                                param += 5 + (param[5] == '"');
+                                len = strcspn(param,
+                                              param[-1] == '"' ? "\"" :" ");
+                                av_strlcpy(new_tag->face, param,
+                                           FFMIN(sizeof(new_tag->face), len+1));
+                                param += len;
+                                av_bprintf(dst, "{\\fn%s}", new_tag->face);
                             }
-                            for (i=0; i<PARAM_NUMBER; i++)
-                                if (stack[sptr].param[i][0])
-                                    av_bprintf(dst, "%s", stack[sptr].param[i]);
+                            if ((param = strchr(param, ' ')))
+                                param++;
                         }
-                    } else if (tagname[0] && !tagname[1] && strspn(tagname, "bisu") == 1) {
-                        av_bprintf(dst, "{\\%c%d}", tagname[0], !tag_close);
-                    } else {
-                        unknown = 1;
-                        snprintf(tmp, sizeof(tmp), "</%s>", tagname);
                     }
-                    if (tag_close) {
-                        sptr--;
-                    } else if (unknown && !strstr(in, tmp)) {
-                        in -= len + tag_close;
-                        av_bprint_chars(dst, *in, 1);
-                    } else
-                        av_strlcpy(stack[sptr++].tag, tagname,
-                                   sizeof(stack[0].tag));
-                    break;
+                    in += skip;
+                } else if (tagname[0] && !tagname[1] && strchr("bisu", av_tolower(tagname[0]))) {
+                    av_bprintf(dst, "{\\%c%d}", (char)av_tolower(tagname[0]), !tag_close);
+                    in += skip;
+                } else if (!av_strncasecmp(tagname, "br", 2) &&
+                           (!tagname[2] || (tagname[2] == '/' && !tagname[3]))) {
+                    av_bprintf(dst, "\\N");
+                    in += skip;
+                } else if (likely_a_tag) {
+                    if (!tag_close) // warn only once
+                        av_log(log_ctx, AV_LOG_WARNING, "Unrecognized tag %s\n", tagname);
+                    in += skip;
+                } else {
+                    av_bprint_chars(dst, '<', 1);
                 }
+            } else {
+                av_bprint_chars(dst, *in, 1);
             }
+            break;
         default:
             av_bprint_chars(dst, *in, 1);
             break;
@@ -171,8 +260,13 @@ void ff_htmlmarkup_to_ass(void *log_ctx, AVBPrint *dst, const char *in)
             line_start = 0;
     }
 
+    if (!av_bprint_is_complete(dst))
+        return AVERROR(ENOMEM);
+
     while (dst->len >= 2 && !strncmp(&dst->str[dst->len - 2], "\\N", 2))
         dst->len -= 2;
     dst->str[dst->len] = 0;
     rstrip_spaces_buf(dst);
+
+    return 0;
 }