git.sesse.net Git - ffmpeg/blob - libavcodec/movtextdec.c

   1 /*
   2  * 3GPP TS 26.245 Timed Text decoder
   3  * Copyright (c) 2012  Philip Langdale <philipl@overt.org>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "avcodec.h"
  23 #include "ass.h"
  24 #include "libavutil/avstring.h"
  25 #include "libavutil/common.h"
  26 #include "libavutil/bprint.h"
  27 #include "libavutil/intreadwrite.h"
  28 #include "libavutil/mem.h"
  29
  /* Bits of the face-style-flags byte found in the tx3g default style
   * and in every 'styl' style record. */
  #define STYLE_FLAG_BOLD         0x01
  #define STYLE_FLAG_ITALIC       0x02
  #define STYLE_FLAG_UNDERLINE    0x04
  33
  /* Number of tx3g extradata bytes up to and including the 16-bit font
   * table entry count; mov_text_tx3g() refuses shorter extradata. */
  #define BOX_SIZE_INITIAL    40

  /* Bits stored in MovTextContext.box_flags, one per modifier box type
   * that was seen while parsing the current packet. */
  #define STYL_BOX   0x01
  #define HLIT_BOX   0x02
  #define HCLR_BOX   0x04
  #define TWRP_BOX   0x08
  40
  /*
   * Alignment values stored in MovTextDefault.alignment and handed to
   * ff_ass_subtitle_header(). Laid out like a numeric keypad:
   *
   *      7 8 9   (top)
   *      4 5 6   (middle)
   *      1 2 3   (bottom)
   */
  #define TOP_LEFT        7
  #define TOP_CENTER      8
  #define TOP_RIGHT       9

  #define MIDDLE_LEFT     4
  #define MIDDLE_CENTER   5
  #define MIDDLE_RIGHT    6

  #define BOTTOM_LEFT     1
  #define BOTTOM_CENTER   2
  #define BOTTOM_RIGHT    3
  50
  /* Swap the first and third byte of a 24-bit colour (0xRRGGBB <-> 0xBBGGRR);
   * bits above the low 24 are discarded. The argument is evaluated three times. */
  #define RGB_TO_BGR(c)                 \
      ((((c) & 0x0000ff) << 16) |       \
        ((c) & 0x00ff00)        |       \
       (((c) >> 16) & 0x0000ff))
  52
  /* Default text style taken from the tx3g sample description (extradata). */
  typedef struct MovTextDefault {
      uint16_t fontID;     /* font ID of the default style record */
      char *font;          /* name of the font table entry whose ID equals fontID;
                            * aliases storage owned by the font table */
      uint8_t fontsize;    /* default font size */
      int color;           /* primary colour, read as a 24-bit big-endian value */
      int back_color;      /* background colour, read as a 24-bit big-endian value */
      uint8_t bold;        /* 0 or 1, derived from STYLE_FLAG_BOLD */
      uint8_t italic;      /* 0 or 1, derived from STYLE_FLAG_ITALIC */
      uint8_t underline;   /* 0 or 1, derived from STYLE_FLAG_UNDERLINE */
      int alignment;       /* one of the TOP_ / MIDDLE_ / BOTTOM_ alignment constants */
  } MovTextDefault;
  64
  /* One entry of the tx3g font table. */
  typedef struct FontRecord {
      uint16_t fontID;   /* identifier that style records use to refer to this font */
      char *font;        /* NUL-terminated font name, heap allocated */
  } FontRecord;
  69
  /* One style record parsed from a 'styl' box. */
  typedef struct StyleBox {
      uint16_t style_start;    /* character position at which the style begins */
      uint16_t style_end;      /* character position at which the style is reset */
      uint8_t style_flag;      /* raw face-style-flags byte (STYLE_FLAG_* bits) */
      uint8_t bold;            /* 0 or 1, derived from style_flag */
      uint8_t italic;          /* 0 or 1, derived from style_flag */
      uint8_t underline;       /* 0 or 1, derived from style_flag */
      uint8_t fontsize;        /* font size for the styled range */
      uint16_t style_fontID;   /* font ID, looked up in the font table */
  } StyleBox;
  80
  /* Payload of an 'hlit' box: the highlighted character range. */
  typedef struct HighlightBox {
      uint16_t hlit_start;   /* character position where highlighting starts */
      uint16_t hlit_end;     /* character position where highlighting ends */
  } HighlightBox;
  85
  /* Payload of an 'hclr' box. */
  typedef struct HilightcolorBox {
      uint8_t hlit_color[4];   /* the four payload bytes as stored in the box;
                                * text_to_ass() emits bytes 2, 1, 0 in that order */
  } HilightcolorBox;
  89
  /* Payload of a 'twrp' box. */
  typedef struct TextWrapBox {
      uint8_t wrap_flag;   /* 1 selects end-of-line wrapping ("\q1");
                            * any other value is emitted as no wrapping ("\q2") */
  } TextWrapBox;
  93
  94 typedef struct {
  95     StyleBox **s;
  96     StyleBox *s_temp;
  97     HighlightBox h;
  98     HilightcolorBox c;
  99     FontRecord **ftab;
 100     FontRecord *ftab_temp;
 101     TextWrapBox w;
 102     MovTextDefault d;
 103     uint8_t box_flags;
 104     uint16_t style_entries, ftab_entries;
 105     uint64_t tracksize;
 106     int size_var;
 107     int count_s, count_f;
 108     int readorder;
 109 } MovTextContext;
 110
 111 typedef struct {
 112     uint32_t type;
 113     size_t base_size;
 114     int (*decode)(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt);
 115 } Box;
 116
 117 static void mov_text_cleanup(MovTextContext *m)
 118 {
 119     int i;
 120     if (m->box_flags & STYL_BOX) {
 121         for(i = 0; i < m->count_s; i++) {
 122             av_freep(&m->s[i]);
 123         }
 124         av_freep(&m->s);
 125         m->count_s = 0;
 126         m->style_entries = 0;
 127     }
 128 }
 129
 130 static void mov_text_cleanup_ftab(MovTextContext *m)
 131 {
 132     int i;
 133     if (m->ftab_temp)
 134         av_freep(&m->ftab_temp->font);
 135     av_freep(&m->ftab_temp);
 136     if (m->ftab) {
 137         for(i = 0; i < m->count_f; i++) {
 138             av_freep(&m->ftab[i]->font);
 139             av_freep(&m->ftab[i]);
 140         }
 141     }
 142     av_freep(&m->ftab);
 143 }
 144
 145 static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m)
 146 {
 147     uint8_t *tx3g_ptr = avctx->extradata;
 148     int i, box_size, font_length;
 149     int8_t v_align, h_align;
 150     StyleBox s_default;
 151
 152     m->count_f = 0;
 153     m->ftab_entries = 0;
 154     box_size = BOX_SIZE_INITIAL; /* Size till ftab_entries */
 155     if (avctx->extradata_size < box_size)
 156         return -1;
 157
 158     // Display Flags
 159     tx3g_ptr += 4;
 160     // Alignment
 161     h_align = *tx3g_ptr++;
 162     v_align = *tx3g_ptr++;
 163     if (h_align == 0) {
 164         if (v_align == 0)
 165             m->d.alignment = TOP_LEFT;
 166         if (v_align == 1)
 167             m->d.alignment = MIDDLE_LEFT;
 168         if (v_align == -1)
 169             m->d.alignment = BOTTOM_LEFT;
 170     }
 171     if (h_align == 1) {
 172         if (v_align == 0)
 173             m->d.alignment = TOP_CENTER;
 174         if (v_align == 1)
 175             m->d.alignment = MIDDLE_CENTER;
 176         if (v_align == -1)
 177             m->d.alignment = BOTTOM_CENTER;
 178     }
 179     if (h_align == -1) {
 180         if (v_align == 0)
 181             m->d.alignment = TOP_RIGHT;
 182         if (v_align == 1)
 183             m->d.alignment = MIDDLE_RIGHT;
 184         if (v_align == -1)
 185             m->d.alignment = BOTTOM_RIGHT;
 186     }
 187     // Background Color
 188     m->d.back_color = AV_RB24(tx3g_ptr);
 189     tx3g_ptr += 4;
 190     // BoxRecord
 191     tx3g_ptr += 8;
 192     // StyleRecord
 193     tx3g_ptr += 4;
 194     // fontID
 195     m->d.fontID = AV_RB16(tx3g_ptr);
 196     tx3g_ptr += 2;
 197     // face-style-flags
 198     s_default.style_flag = *tx3g_ptr++;
 199     m->d.bold = !!(s_default.style_flag & STYLE_FLAG_BOLD);
 200     m->d.italic = !!(s_default.style_flag & STYLE_FLAG_ITALIC);
 201     m->d.underline = !!(s_default.style_flag & STYLE_FLAG_UNDERLINE);
 202     // fontsize
 203     m->d.fontsize = *tx3g_ptr++;
 204     // Primary color
 205     m->d.color = AV_RB24(tx3g_ptr);
 206     tx3g_ptr += 4;
 207     // FontRecord
 208     // FontRecord Size
 209     tx3g_ptr += 4;
 210     // ftab
 211     tx3g_ptr += 4;
 212
 213     m->ftab_entries = AV_RB16(tx3g_ptr);
 214     tx3g_ptr += 2;
 215
 216     for (i = 0; i < m->ftab_entries; i++) {
 217
 218         box_size += 3;
 219         if (avctx->extradata_size < box_size) {
 220             mov_text_cleanup_ftab(m);
 221             m->ftab_entries = 0;
 222             return -1;
 223         }
 224         m->ftab_temp = av_mallocz(sizeof(*m->ftab_temp));
 225         if (!m->ftab_temp) {
 226             mov_text_cleanup_ftab(m);
 227             return AVERROR(ENOMEM);
 228         }
 229         m->ftab_temp->fontID = AV_RB16(tx3g_ptr);
 230         tx3g_ptr += 2;
 231         font_length = *tx3g_ptr++;
 232
 233         box_size = box_size + font_length;
 234         if (avctx->extradata_size < box_size) {
 235             mov_text_cleanup_ftab(m);
 236             m->ftab_entries = 0;
 237             return -1;
 238         }
 239         m->ftab_temp->font = av_malloc(font_length + 1);
 240         if (!m->ftab_temp->font) {
 241             mov_text_cleanup_ftab(m);
 242             return AVERROR(ENOMEM);
 243         }
 244         memcpy(m->ftab_temp->font, tx3g_ptr, font_length);
 245         m->ftab_temp->font[font_length] = '\0';
 246         av_dynarray_add(&m->ftab, &m->count_f, m->ftab_temp);
 247         if (!m->ftab) {
 248             mov_text_cleanup_ftab(m);
 249             return AVERROR(ENOMEM);
 250         }
 251         m->ftab_temp = NULL;
 252         tx3g_ptr = tx3g_ptr + font_length;
 253     }
 254     for (i = 0; i < m->ftab_entries; i++) {
 255         if (m->d.fontID == m->ftab[i]->fontID)
 256             m->d.font = m->ftab[i]->font;
 257     }
 258     return 0;
 259 }
 260
 261 static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
 262 {
 263     m->box_flags |= TWRP_BOX;
 264     m->w.wrap_flag = *tsmb++;
 265     return 0;
 266 }
 267
 268 static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
 269 {
 270     m->box_flags |= HLIT_BOX;
 271     m->h.hlit_start = AV_RB16(tsmb);
 272     tsmb += 2;
 273     m->h.hlit_end = AV_RB16(tsmb);
 274     tsmb += 2;
 275     return 0;
 276 }
 277
 278 static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
 279 {
 280     m->box_flags |= HCLR_BOX;
 281     memcpy(m->c.hlit_color, tsmb, 4);
 282     tsmb += 4;
 283     return 0;
 284 }
 285
 286 static int decode_styl(const uint8_t *tsmb, MovTextContext *m, AVPacket *avpkt)
 287 {
 288     int i;
 289     int style_entries = AV_RB16(tsmb);
 290     tsmb += 2;
 291     // A single style record is of length 12 bytes.
 292     if (m->tracksize + m->size_var + 2 + style_entries * 12 > avpkt->size)
 293         return -1;
 294
 295     m->style_entries = style_entries;
 296
 297     m->box_flags |= STYL_BOX;
 298     for(i = 0; i < m->style_entries; i++) {
 299         m->s_temp = av_malloc(sizeof(*m->s_temp));
 300         if (!m->s_temp) {
 301             mov_text_cleanup(m);
 302             return AVERROR(ENOMEM);
 303         }
 304         m->s_temp->style_start = AV_RB16(tsmb);
 305         tsmb += 2;
 306         m->s_temp->style_end = AV_RB16(tsmb);
 307
 308         if (   m->s_temp->style_end < m->s_temp->style_start
 309             || (m->count_s && m->s_temp->style_start < m->s[m->count_s - 1]->style_end)) {
 310             av_freep(&m->s_temp);
 311             mov_text_cleanup(m);
 312             return AVERROR(ENOMEM);
 313         }
 314
 315         tsmb += 2;
 316         m->s_temp->style_fontID = AV_RB16(tsmb);
 317         tsmb += 2;
 318         m->s_temp->style_flag = AV_RB8(tsmb);
 319         m->s_temp->bold = !!(m->s_temp->style_flag & STYLE_FLAG_BOLD);
 320         m->s_temp->italic = !!(m->s_temp->style_flag & STYLE_FLAG_ITALIC);
 321         m->s_temp->underline = !!(m->s_temp->style_flag & STYLE_FLAG_UNDERLINE);
 322         tsmb++;
 323         m->s_temp->fontsize = AV_RB8(tsmb);
 324         av_dynarray_add(&m->s, &m->count_s, m->s_temp);
 325         if(!m->s) {
 326             mov_text_cleanup(m);
 327             return AVERROR(ENOMEM);
 328         }
 329         tsmb++;
 330         // text-color-rgba
 331         tsmb += 4;
 332     }
 333     return 0;
 334 }
 335
 336 static const Box box_types[] = {
 337     { MKBETAG('s','t','y','l'), 2, decode_styl },
 338     { MKBETAG('h','l','i','t'), 4, decode_hlit },
 339     { MKBETAG('h','c','l','r'), 4, decode_hclr },
 340     { MKBETAG('t','w','r','p'), 1, decode_twrp }
 341 };
 342
 343 const static size_t box_count = FF_ARRAY_ELEMS(box_types);
 344
 345 // Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
 346 static int get_utf8_length_at(const char *text, const char *text_end)
 347 {
 348     const char *start = text;
 349     int err = 0;
 350     uint32_t c;
 351     GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
 352     if (err)
 353         goto error;
 354     return text - start;
 355 error:
 356     return 0;
 357 }
 358
 359 static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
 360                        AVCodecContext *avctx)
 361 {
 362     MovTextContext *m = avctx->priv_data;
 363     int i = 0;
 364     int text_pos = 0;
 365     int style_active = 0;
 366     int entry = 0;
 367
 368     if (text < text_end && m->box_flags & TWRP_BOX) {
 369         if (m->w.wrap_flag == 1) {
 370             av_bprintf(buf, "{\\q1}"); /* End of line wrap */
 371         } else {
 372             av_bprintf(buf, "{\\q2}"); /* No wrap */
 373         }
 374     }
 375
 376     while (text < text_end) {
 377         int len;
 378
 379         if ((m->box_flags & STYL_BOX) && entry < m->style_entries) {
 380             if (text_pos == m->s[entry]->style_start) {
 381                 style_active = 1;
 382                 if (m->s[entry]->bold ^ m->d.bold)
 383                     av_bprintf(buf, "{\\b%d}", m->s[entry]->bold);
 384                 if (m->s[entry]->italic ^ m->d.italic)
 385                     av_bprintf(buf, "{\\i%d}", m->s[entry]->italic);
 386                 if (m->s[entry]->underline ^ m->d.underline)
 387                     av_bprintf(buf, "{\\u%d}", m->s[entry]->underline);
 388                 if (m->s[entry]->fontsize != m->d.fontsize)
 389                     av_bprintf(buf, "{\\fs%d}", m->s[entry]->fontsize);
 390                 if (m->s[entry]->style_fontID != m->d.fontID)
 391                     for (i = 0; i < m->ftab_entries; i++) {
 392                         if (m->s[entry]->style_fontID == m->ftab[i]->fontID)
 393                             av_bprintf(buf, "{\\fn%s}", m->ftab[i]->font);
 394                     }
 395             }
 396             if (text_pos == m->s[entry]->style_end) {
 397                 if (style_active) {
 398                     av_bprintf(buf, "{\\r}");
 399                     style_active = 0;
 400                 }
 401                 entry++;
 402             }
 403         }
 404         if (m->box_flags & HLIT_BOX) {
 405             if (text_pos == m->h.hlit_start) {
 406                 /* If hclr box is present, set the secondary color to the color
 407                  * specified. Otherwise, set primary color to white and secondary
 408                  * color to black. These colors will come from TextSampleModifier
 409                  * boxes in future and inverse video technique for highlight will
 410                  * be implemented.
 411                  */
 412                 if (m->box_flags & HCLR_BOX) {
 413                     av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2],
 414                                 m->c.hlit_color[1], m->c.hlit_color[0]);
 415                 } else {
 416                     av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}");
 417                 }
 418             }
 419             if (text_pos == m->h.hlit_end) {
 420                 if (m->box_flags & HCLR_BOX) {
 421                     av_bprintf(buf, "{\\2c&H000000&}");
 422                 } else {
 423                     av_bprintf(buf, "{\\1c&HFFFFFF&}{\\2c&H000000&}");
 424                 }
 425             }
 426         }
 427
 428         len = get_utf8_length_at(text, text_end);
 429         if (len < 1) {
 430             av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
 431             len = 1;
 432         }
 433         for (i = 0; i < len; i++) {
 434             switch (*text) {
 435             case '\r':
 436                 break;
 437             case '\n':
 438                 av_bprintf(buf, "\\N");
 439                 break;
 440             default:
 441                 av_bprint_chars(buf, *text, 1);
 442                 break;
 443             }
 444             text++;
 445         }
 446         text_pos++;
 447     }
 448
 449     return 0;
 450 }
 451
 452 static int mov_text_init(AVCodecContext *avctx) {
 453     /*
 454      * TODO: Handle the default text style.
 455      * NB: Most players ignore styles completely, with the result that
 456      * it's very common to find files where the default style is broken
 457      * and respecting it results in a worse experience than ignoring it.
 458      */
 459     int ret;
 460     MovTextContext *m = avctx->priv_data;
 461     ret = mov_text_tx3g(avctx, m);
 462     if (ret == 0) {
 463         return ff_ass_subtitle_header(avctx, m->d.font, m->d.fontsize,
 464                     RGB_TO_BGR(m->d.color),
 465                     RGB_TO_BGR(m->d.back_color),
 466                     m->d.bold, m->d.italic, m->d.underline,
 467                     ASS_DEFAULT_BORDERSTYLE, m->d.alignment);
 468     } else
 469         return ff_ass_subtitle_header_default(avctx);
 470 }
 471
 472 static int mov_text_decode_frame(AVCodecContext *avctx,
 473                             void *data, int *got_sub_ptr, AVPacket *avpkt)
 474 {
 475     AVSubtitle *sub = data;
 476     MovTextContext *m = avctx->priv_data;
 477     int ret;
 478     AVBPrint buf;
 479     char *ptr = avpkt->data;
 480     char *end;
 481     int text_length, tsmb_type, ret_tsmb;
 482     uint64_t tsmb_size;
 483     const uint8_t *tsmb;
 484     size_t i;
 485
 486     if (!ptr || avpkt->size < 2)
 487         return AVERROR_INVALIDDATA;
 488
 489     /*
 490      * A packet of size two with value zero is an empty subtitle
 491      * used to mark the end of the previous non-empty subtitle.
 492      * We can just drop them here as we have duration information
 493      * already. If the value is non-zero, then it's technically a
 494      * bad packet.
 495      */
 496     if (avpkt->size == 2)
 497         return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA;
 498
 499     /*
 500      * The first two bytes of the packet are the length of the text string
 501      * In complex cases, there are style descriptors appended to the string
 502      * so we can't just assume the packet size is the string size.
 503      */
 504     text_length = AV_RB16(ptr);
 505     end = ptr + FFMIN(2 + text_length, avpkt->size);
 506     ptr += 2;
 507
 508     mov_text_cleanup(m);
 509
 510     tsmb_size = 0;
 511     m->tracksize = 2 + text_length;
 512     m->style_entries = 0;
 513     m->box_flags = 0;
 514     m->count_s = 0;
 515     // Note that the spec recommends lines be no longer than 2048 characters.
 516     av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
 517     if (text_length + 2 != avpkt->size) {
 518         while (m->tracksize + 8 <= avpkt->size) {
 519             // A box is a minimum of 8 bytes.
 520             tsmb = ptr + m->tracksize - 2;
 521             tsmb_size = AV_RB32(tsmb);
 522             tsmb += 4;
 523             tsmb_type = AV_RB32(tsmb);
 524             tsmb += 4;
 525
 526             if (tsmb_size == 1) {
 527                 if (m->tracksize + 16 > avpkt->size)
 528                     break;
 529                 tsmb_size = AV_RB64(tsmb);
 530                 tsmb += 8;
 531                 m->size_var = 16;
 532             } else
 533                 m->size_var = 8;
 534             //size_var is equal to 8 or 16 depending on the size of box
 535
 536             if (tsmb_size == 0) {
 537                 av_log(avctx, AV_LOG_ERROR, "tsmb_size is 0\n");
 538                 return AVERROR_INVALIDDATA;
 539             }
 540
 541             if (tsmb_size > avpkt->size - m->tracksize)
 542                 break;
 543
 544             for (i = 0; i < box_count; i++) {
 545                 if (tsmb_type == box_types[i].type) {
 546                     if (m->tracksize + m->size_var + box_types[i].base_size > avpkt->size)
 547                         break;
 548                     ret_tsmb = box_types[i].decode(tsmb, m, avpkt);
 549                     if (ret_tsmb == -1)
 550                         break;
 551                 }
 552             }
 553             m->tracksize = m->tracksize + tsmb_size;
 554         }
 555         text_to_ass(&buf, ptr, end, avctx);
 556         mov_text_cleanup(m);
 557     } else
 558         text_to_ass(&buf, ptr, end, avctx);
 559
 560     ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
 561     av_bprint_finalize(&buf, NULL);
 562     if (ret < 0)
 563         return ret;
 564     *got_sub_ptr = sub->num_rects > 0;
 565     return avpkt->size;
 566 }
 567
 568 static int mov_text_decode_close(AVCodecContext *avctx)
 569 {
 570     MovTextContext *m = avctx->priv_data;
 571     mov_text_cleanup_ftab(m);
 572     mov_text_cleanup(m);
 573     return 0;
 574 }
 575
 576 static void mov_text_flush(AVCodecContext *avctx)
 577 {
 578     MovTextContext *m = avctx->priv_data;
 579     if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP))
 580         m->readorder = 0;
 581 }
 582
 583 AVCodec ff_movtext_decoder = {
 584     .name         = "mov_text",
 585     .long_name    = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"),
 586     .type         = AVMEDIA_TYPE_SUBTITLE,
 587     .id           = AV_CODEC_ID_MOV_TEXT,
 588     .priv_data_size = sizeof(MovTextContext),
 589     .init         = mov_text_init,
 590     .decode       = mov_text_decode_frame,
 591     .close        = mov_text_decode_close,
 592     .flush        = mov_text_flush,
 593 };