git.sesse.net Git - ffmpeg/blob - libavcodec/movtextdec.c

   1 /*
   2  * 3GPP TS 26.245 Timed Text decoder
   3  * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "avcodec.h"
  23 #include "ass.h"
  24 #include "libavutil/opt.h"
  25 #include "libavutil/avstring.h"
  26 #include "libavutil/common.h"
  27 #include "libavutil/bprint.h"
  28 #include "libavutil/intreadwrite.h"
  29 #include "libavutil/mem.h"
  30 #include "bytestream.h"
  31
  32 #define STYLE_FLAG_BOLD         (1<<0)
  33 #define STYLE_FLAG_ITALIC       (1<<1)
  34 #define STYLE_FLAG_UNDERLINE    (1<<2)
  35
  36 #define BOX_SIZE_INITIAL    40
  37
  38 #define STYL_BOX   (1<<0)
  39 #define HLIT_BOX   (1<<1)
  40 #define HCLR_BOX   (1<<2)
  41 #define TWRP_BOX   (1<<3)
  42
  43 #define BOTTOM_LEFT     1
  44 #define BOTTOM_CENTER   2
  45 #define BOTTOM_RIGHT    3
  46 #define MIDDLE_LEFT     4
  47 #define MIDDLE_CENTER   5
  48 #define MIDDLE_RIGHT    6
  49 #define TOP_LEFT        7
  50 #define TOP_CENTER      8
  51 #define TOP_RIGHT       9
  52
  53 #define RGB_TO_BGR(c) (((c) & 0xff) << 16 | ((c) & 0xff00) | (((c) >> 16) & 0xff))
  54
  55 typedef struct {
  56     uint16_t fontID;
  57     const char *font;
  58     uint8_t fontsize;
  59     int color;
  60     uint8_t alpha;
  61     int back_color;
  62     uint8_t back_alpha;
  63     uint8_t bold;
  64     uint8_t italic;
  65     uint8_t underline;
  66     int alignment;
  67 } MovTextDefault;
  68
  69 typedef struct {
  70     uint16_t fontID;
  71     char *font;
  72 } FontRecord;
  73
  74 typedef struct {
  75     uint16_t style_start;
  76     uint16_t style_end;
  77     uint8_t style_flag;
  78     uint8_t bold;
  79     uint8_t italic;
  80     uint8_t underline;
  81     int color;
  82     uint8_t alpha;
  83     uint8_t fontsize;
  84     uint16_t style_fontID;
  85 } StyleBox;
  86
  87 typedef struct {
  88     uint16_t hlit_start;
  89     uint16_t hlit_end;
  90 } HighlightBox;
  91
  92 typedef struct {
  93    uint8_t hlit_color[4];
  94 } HilightcolorBox;
  95
  96 typedef struct {
  97     uint8_t wrap_flag;
  98 } TextWrapBox;
  99
 100 typedef struct {
 101     AVClass *class;
 102     StyleBox *s;
 103     HighlightBox h;
 104     HilightcolorBox c;
 105     FontRecord *ftab;
 106     TextWrapBox w;
 107     MovTextDefault d;
 108     uint8_t box_flags;
 109     uint16_t style_entries, ftab_entries;
 110     uint64_t tracksize;
 111     int size_var;
 112     int readorder;
 113     int frame_width;
 114     int frame_height;
 115 } MovTextContext;
 116
 117 typedef struct {
 118     uint32_t type;
 119     size_t base_size;
 120     int (*decode)(const uint8_t *tsmb, MovTextContext *m, const AVPacket *avpkt);
 121 } Box;
 122
 123 static void mov_text_cleanup(MovTextContext *m)
 124 {
 125     if (m->box_flags & STYL_BOX) {
 126         av_freep(&m->s);
 127         m->style_entries = 0;
 128     }
 129 }
 130
 131 static void mov_text_cleanup_ftab(MovTextContext *m)
 132 {
 133     for (unsigned i = 0; i < m->ftab_entries; i++)
 134         av_freep(&m->ftab[i].font);
 135     av_freep(&m->ftab);
 136     m->ftab_entries = 0;
 137 }
 138
 139 static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
 140 {
 141     const uint8_t *tx3g_ptr = avctx->extradata;
 142     int i, j = -1, font_length, remaining = avctx->extradata_size - BOX_SIZE_INITIAL;
 143     int8_t v_align, h_align;
 144     unsigned ftab_entries;
 145     StyleBox s_default;
 146
 147     m->ftab_entries = 0;
 148     if (remaining < 0)
 149         return -1;
 150
 151     // Display Flags
 152     tx3g_ptr += 4;
 153     // Alignment
 154     h_align = bytestream_get_byte(&tx3g_ptr);
 155     v_align = bytestream_get_byte(&tx3g_ptr);
 156     if (h_align == 0) {
 157         if (v_align == 0)
 158             m->d.alignment = TOP_LEFT;
 159         if (v_align == 1)
 160             m->d.alignment = MIDDLE_LEFT;
 161         if (v_align == -1)
 162             m->d.alignment = BOTTOM_LEFT;
 163     }
 164     if (h_align == 1) {
 165         if (v_align == 0)
 166             m->d.alignment = TOP_CENTER;
 167         if (v_align == 1)
 168             m->d.alignment = MIDDLE_CENTER;
 169         if (v_align == -1)
 170             m->d.alignment = BOTTOM_CENTER;
 171     }
 172     if (h_align == -1) {
 173         if (v_align == 0)
 174             m->d.alignment = TOP_RIGHT;
 175         if (v_align == 1)
 176             m->d.alignment = MIDDLE_RIGHT;
 177         if (v_align == -1)
 178             m->d.alignment = BOTTOM_RIGHT;
 179     }
 180     // Background Color
 181     m->d.back_color = bytestream_get_be24(&tx3g_ptr);
 182     m->d.back_alpha = bytestream_get_byte(&tx3g_ptr);
 183     // BoxRecord
 184     tx3g_ptr += 8;
 185     // StyleRecord
 186     tx3g_ptr += 4;
 187     // fontID
 188     m->d.fontID = bytestream_get_be16(&tx3g_ptr);
 189     // face-style-flags
 190     s_default.style_flag = bytestream_get_byte(&tx3g_ptr);
 191     m->d.bold = !!(s_default.style_flag & STYLE_FLAG_BOLD);
 192     m->d.italic = !!(s_default.style_flag & STYLE_FLAG_ITALIC);
 193     m->d.underline = !!(s_default.style_flag & STYLE_FLAG_UNDERLINE);
 194     // fontsize
 195     m->d.fontsize = bytestream_get_byte(&tx3g_ptr);
 196     // Primary color
 197     m->d.color = bytestream_get_be24(&tx3g_ptr);
 198     m->d.alpha = bytestream_get_byte(&tx3g_ptr);
 199     // FontRecord
 200     // FontRecord Size
 201     tx3g_ptr += 4;
 202     // ftab
 203     tx3g_ptr += 4;
 204
 205     // In case of broken header, init default font
 206     m->d.font = ASS_DEFAULT_FONT;
 207
 208     ftab_entries = bytestream_get_be16(&tx3g_ptr);
 209     if (!ftab_entries)
 210         return 0;
 211     remaining   -= 3 * ftab_entries;
 212     if (remaining < 0)
 213         return AVERROR_INVALIDDATA;
 214     m->ftab = av_calloc(ftab_entries, sizeof(*m->ftab));
 215     if (!m->ftab)
 216         return AVERROR(ENOMEM);
 217     m->ftab_entries = ftab_entries;
 218
 219     for (i = 0; i < m->ftab_entries; i++) {
 220         m->ftab[i].fontID = bytestream_get_be16(&tx3g_ptr);
 221         if (m->ftab[i].fontID == m->d.fontID)
 222             j = i;
 223         font_length = bytestream_get_byte(&tx3g_ptr);
 224
 225         remaining  -= font_length;
 226         if (remaining < 0) {
 227             mov_text_cleanup_ftab(m);
 228             return -1;
 229         }
 230         m->ftab[i].font = av_malloc(font_length + 1);
 231         if (!m->ftab[i].font) {
 232             mov_text_cleanup_ftab(m);
 233             return AVERROR(ENOMEM);
 234         }
 235         bytestream_get_buffer(&tx3g_ptr, m->ftab[i].font, font_length);
 236         m->ftab[i].font[font_length] = '\0';
 237     }
 238     if (j >= 0)
 239         m->d.font = m->ftab[j].font;
 240     return 0;
 241 }
 242
 243 static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, const AVPacket *avpkt)
 244 {
 245     m->box_flags |= TWRP_BOX;
 246     m->w.wrap_flag = bytestream_get_byte(&tsmb);
 247     return 0;
 248 }
 249
 250 static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, const AVPacket *avpkt)
 251 {
 252     m->box_flags |= HLIT_BOX;
 253     m->h.hlit_start = bytestream_get_be16(&tsmb);
 254     m->h.hlit_end   = bytestream_get_be16(&tsmb);
 255     return 0;
 256 }
 257
 258 static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, const AVPacket *avpkt)
 259 {
 260     m->box_flags |= HCLR_BOX;
 261     bytestream_get_buffer(&tsmb, m->c.hlit_color, 4);
 262     return 0;
 263 }
 264
 265 static int decode_styl(const uint8_t *tsmb, MovTextContext *m, const AVPacket *avpkt)
 266 {
 267     int i;
 268     int style_entries = bytestream_get_be16(&tsmb);
 269     StyleBox *tmp;
 270
 271     // A single style record is of length 12 bytes.
 272     if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
 273         return -1;
 274
 275     tmp = av_realloc_array(m->s, style_entries, sizeof(*m->s));
 276     if (!tmp)
 277         return AVERROR(ENOMEM);
 278     m->s             = tmp;
 279     m->style_entries = style_entries;
 280
 281     m->box_flags |= STYL_BOX;
 282     for(i = 0; i < m->style_entries; i++) {
 283         StyleBox *style = &m->s[i];
 284
 285         style->style_start = bytestream_get_be16(&tsmb);
 286         style->style_end   = bytestream_get_be16(&tsmb);
 287         if (   style->style_end < style->style_start
 288             || (i && style->style_start < m->s[i - 1].style_end)) {
 289             mov_text_cleanup(m);
 290             return AVERROR(ENOMEM);
 291         }
 292         if (style->style_start == style->style_end) {
 293             /* Skip this style as it applies to no character */
 294             tsmb += 8;
 295             m->style_entries--;
 296             i--;
 297             continue;
 298         }
 299
 300         style->style_fontID = bytestream_get_be16(&tsmb);
 301         style->style_flag   = bytestream_get_byte(&tsmb);
 302         style->bold      = !!(style->style_flag & STYLE_FLAG_BOLD);
 303         style->italic    = !!(style->style_flag & STYLE_FLAG_ITALIC);
 304         style->underline = !!(style->style_flag & STYLE_FLAG_UNDERLINE);
 305         style->fontsize  = bytestream_get_byte(&tsmb);
 306         style->color     = bytestream_get_be24(&tsmb);
 307         style->alpha     = bytestream_get_byte(&tsmb);
 308     }
 309     return 0;
 310 }
 311
 312 static const Box box_types[] = {
 313     { MKBETAG('s','t','y','l'), 2, decode_styl },
 314     { MKBETAG('h','l','i','t'), 4, decode_hlit },
 315     { MKBETAG('h','c','l','r'), 4, decode_hclr },
 316     { MKBETAG('t','w','r','p'), 1, decode_twrp }
 317 };
 318
 319 const static size_t box_count = FF_ARRAY_ELEMS(box_types);
 320
 321 // Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
 322 static int get_utf8_length_at(const char *text, const char *text_end)
 323 {
 324     const char *start = text;
 325     int err = 0;
 326     uint32_t c;
 327     GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
 328     if (err)
 329         goto error;
 330     return text - start;
 331 error:
 332     return 0;
 333 }
 334
 335 static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
 336                        AVCodecContext *avctx)
 337 {
 338     MovTextContext *m = avctx->priv_data;
 339     int i = 0;
 340     int text_pos = 0;
 341     int entry = 0;
 342     int color = m->d.color;
 343
 344     if (text < text_end && m->box_flags & TWRP_BOX) {
 345         if (m->w.wrap_flag == 1) {
 346             av_bprintf(buf, "{\\q1}"); /* End of line wrap */
 347         } else {
 348             av_bprintf(buf, "{\\q2}"); /* No wrap */
 349         }
 350     }
 351
 352     while (text < text_end) {
 353         int len;
 354
 355         if ((m->box_flags & STYL_BOX) && entry < m->style_entries) {
 356             const StyleBox *style = &m->s[entry];
 357             if (text_pos == style->style_end) {
 358                 av_bprintf(buf, "{\\r}");
 359                 color = m->d.color;
 360                 entry++;
 361                 style++;
 362             }
 363             if (entry < m->style_entries && text_pos == style->style_start) {
 364                 if (style->bold ^ m->d.bold)
 365                     av_bprintf(buf, "{\\b%d}", style->bold);
 366                 if (style->italic ^ m->d.italic)
 367                     av_bprintf(buf, "{\\i%d}", style->italic);
 368                 if (style->underline ^ m->d.underline)
 369                     av_bprintf(buf, "{\\u%d}", style->underline);
 370                 if (style->fontsize != m->d.fontsize)
 371                     av_bprintf(buf, "{\\fs%d}", style->fontsize);
 372                 if (style->style_fontID != m->d.fontID)
 373                     for (i = 0; i < m->ftab_entries; i++) {
 374                         if (style->style_fontID == m->ftab[i].fontID)
 375                             av_bprintf(buf, "{\\fn%s}", m->ftab[i].font);
 376                     }
 377                 if (m->d.color != style->color) {
 378                     color = style->color;
 379                     av_bprintf(buf, "{\\1c&H%X&}", RGB_TO_BGR(color));
 380                 }
 381                 if (m->d.alpha != style->alpha)
 382                     av_bprintf(buf, "{\\1a&H%02X&}", 255 - style->alpha);
 383             }
 384         }
 385         if (m->box_flags & HLIT_BOX) {
 386             if (text_pos == m->h.hlit_start) {
 387                 /* If hclr box is present, set the secondary color to the color
 388                  * specified. Otherwise, set primary color to white and secondary
 389                  * color to black. These colors will come from TextSampleModifier
 390                  * boxes in future and inverse video technique for highlight will
 391                  * be implemented.
 392                  */
 393                 if (m->box_flags & HCLR_BOX) {
 394                     av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
 395                                 m->c.hlit_color[1], m->c.hlit_color[0]);
 396                 } else {
 397                     av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
 398                 }
 399             }
 400             if (text_pos == m->h.hlit_end) {
 401                 if (m->box_flags & HCLR_BOX) {
 402                     av_bprintf(buf, "{\\2c&H%X&}", RGB_TO_BGR(m->d.color));
 403                 } else {
 404                     av_bprintf(buf, "{\\1c&H%X&}{\\2c&H%X&}",
 405                                RGB_TO_BGR(color), RGB_TO_BGR(m->d.color));
 406                 }
 407             }
 408         }
 409
 410         len = get_utf8_length_at(text, text_end);
 411         if (len < 1) {
 412             av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
 413             len = 1;
 414         }
 415         switch (*text) {
 416         case '\r':
 417             break;
 418         case '\n':
 419             av_bprintf(buf, "\\N");
 420             break;
 421         default:
 422             av_bprint_append_data(buf, text, len);
 423             break;
 424         }
 425         text += len;
 426         text_pos++;
 427     }
 428
 429     return 0;
 430 }
 431
 432 static int mov_text_init(AVCodecContext *avctx) {
 433     /*
 434      * TODO: Handle the default text style.
 435      * NB: Most players ignore styles completely, with the result that
 436      * it's very common to find files where the default style is broken
 437      * and respecting it results in a worse experience than ignoring it.
 438      */
 439     int ret;
 440     MovTextContext *m = avctx->priv_data;
 441     ret = mov_text_tx3g(avctx, m);
 442     if (ret == 0) {
 443         if (!m->frame_width || !m->frame_height) {
 444             m->frame_width = ASS_DEFAULT_PLAYRESX;
 445             m->frame_height = ASS_DEFAULT_PLAYRESY;
 446         }
 447         return ff_ass_subtitle_header_full(avctx,
 448                     m->frame_width, m->frame_height,
 449                     m->d.font, m->d.fontsize,
 450                     (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
 451                     (255U - m->d.alpha) << 24 | RGB_TO_BGR(m->d.color),
 452                     (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
 453                     (255U - m->d.back_alpha) << 24 | RGB_TO_BGR(m->d.back_color),
 454                     m->d.bold, m->d.italic, m->d.underline,
 455                     ASS_DEFAULT_BORDERSTYLE, m->d.alignment);
 456     } else
 457         return ff_ass_subtitle_header_default(avctx);
 458 }
 459
 460 static int mov_text_decode_frame(AVCodecContext *avctx,
 461                             void *data, int *got_sub_ptr, AVPacket *avpkt)
 462 {
 463     AVSubtitle *sub = data;
 464     MovTextContext *m = avctx->priv_data;
 465     int ret;
 466     AVBPrint buf;
 467     char *ptr = avpkt->data;
 468     char *end;
 469     int text_length, tsmb_type, ret_tsmb;
 470     uint64_t tsmb_size;
 471     const uint8_t *tsmb;
 472     size_t i;
 473
 474     if (!ptr || avpkt->size < 2)
 475         return AVERROR_INVALIDDATA;
 476
 477     /*
 478      * A packet of size two with value zero is an empty subtitle
 479      * used to mark the end of the previous non-empty subtitle.
 480      * We can just drop them here as we have duration information
 481      * already. If the value is non-zero, then it's technically a
 482      * bad packet.
 483      */
 484     if (avpkt->size == 2)
 485         return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
 486
 487     /*
 488      * The first two bytes of the packet are the length of the text string
 489      * In complex cases, there are style descriptors appended to the string
 490      * so we can't just assume the packet size is the string size.
 491      */
 492     text_length = AV_RB16(ptr);
 493     end = ptr + FFMIN(2 + text_length, avpkt->size);
 494     ptr += 2;
 495
 496     mov_text_cleanup(m);
 497
 498     tsmb_size = 0;
 499     m->tracksize = 2 + text_length;
 500     m->style_entries = 0;
 501     m->box_flags = 0;
 502     // Note that the spec recommends lines be no longer than 2048 characters.
 503     av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
 504     if (text_length + 2 != avpkt->size) {
 505         while (m->tracksize + 8 <= avpkt->size) {
 506             // A box is a minimum of 8 bytes.
 507             tsmb = ptr + m->tracksize - 2;
 508             tsmb_size = AV_RB32(tsmb);
 509             tsmb += 4;
 510             tsmb_type = AV_RB32(tsmb);
 511             tsmb += 4;
 512
 513             if (tsmb_size == 1) {
 514                 if (m->tracksize + 16 > avpkt->size)
 515                     break;
 516                 tsmb_size = AV_RB64(tsmb);
 517                 tsmb += 8;
 518                 m->size_var = 16;
 519             } else
 520                 m->size_var = 8;
 521             //size_var is equal to 8 or 16 depending on the size of box
 522
 523             if (tsmb_size == 0) {
 524                 av_log(avctx, AV_LOG_ERROR, "tsmb_size is 0\n");
 525                 return AVERROR_INVALIDDATA;
 526             }
 527
 528             if (tsmb_size > avpkt->size - m->tracksize)
 529                 break;
 530
 531             for (i = 0; i < box_count; i++) {
 532                 if (tsmb_type == box_types[i].type) {
 533                     if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
 534                         break;
 535                     ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
 536                     if (ret_tsmb == -1)
 537                         break;
 538                 }
 539             }
 540             m->tracksize = m->tracksize + tsmb_size;
 541         }
 542         text_to_ass(&buf, ptr, end, avctx);
 543         mov_text_cleanup(m);
 544     } else
 545         text_to_ass(&buf, ptr, end, avctx);
 546
 547     ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
 548     av_bprint_finalize(&buf, NULL);
 549     if (ret < 0)
 550         return ret;
 551     *got_sub_ptr = sub->num_rects > 0;
 552     return avpkt->size;
 553 }
 554
 555 static int mov_text_decode_close(AVCodecContext *avctx)
 556 {
 557     MovTextContext *m = avctx->priv_data;
 558     mov_text_cleanup_ftab(m);
 559     mov_text_cleanup(m);
 560     return 0;
 561 }
 562
 563 static void mov_text_flush(AVCodecContext *avctx)
 564 {
 565     MovTextContext *m = avctx->priv_data;
 566     if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
 567         m->readorder = 0;
 568 }
 569
 570 #define OFFSET(x) offsetof(MovTextContext, x)
 571 #define FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM
 572 static const AVOption options[] = {
 573     { "width", "Frame width, usually video width", OFFSET(frame_width), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
 574     { "height", "Frame height, usually video height", OFFSET(frame_height), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS },
 575     { NULL },
 576 };
 577
 578 static const AVClass mov_text_decoder_class = {
 579     .class_name = "MOV text decoder",
 580     .item_name  = av_default_item_name,
 581     .option     = options,
 582     .version    = LIBAVUTIL_VERSION_INT,
 583 };
 584
 585 AVCodec ff_movtext_decoder = {
 586     .name         = "mov_text",
 587     .long_name    = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
 588     .type         = AVMEDIA_TYPE_SUBTITLE,
 589     .id           = AV_CODEC_ID_MOV_TEXT,
 590     .priv_data_size = sizeof(MovTextContext),
 591     .priv_class   = &mov_text_decoder_class,
 592     .init         = mov_text_init,
 593     .decode       = mov_text_decode_frame,
 594     .close        = mov_text_decode_close,
 595     .flush        = mov_text_flush,
 596 };