]> git.sesse.net Git - vlc/blob - modules/codec/subtitles/subsdec.c
Comments
[vlc] / modules / codec / subtitles / subsdec.c
1 /*****************************************************************************
2  * subsdec.c : text subtitles decoder
3  *****************************************************************************
4  * Copyright (C) 2000-2006 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Gildas Bazin <gbazin@videolan.org>
8  *          Samuel Hocevar <sam@zoy.org>
9  *          Derk-Jan Hartman <hartman at videolan dot org>
10  *          Bernie Purcell <bitmap@videolan.org>
11  *
12  * This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 2 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25  *****************************************************************************/
26
27 /*****************************************************************************
28  * Preamble
29  *****************************************************************************/
30
31 #include "subsdec.h"
32
33 /*****************************************************************************
34  * Local prototypes
35  *****************************************************************************/
36 static int  OpenDecoder   ( vlc_object_t * );
37 static void CloseDecoder  ( vlc_object_t * );
38
39 static subpicture_t   *DecodeBlock   ( decoder_t *, block_t ** );
40 static subpicture_t   *ParseText     ( decoder_t *, block_t * );
41 static char           *StripTags      ( char * );
42 static char           *CreateHtmlSubtitle ( char * );
43
44
45 /*****************************************************************************
46  * Module descriptor.
47  *****************************************************************************/
48 static const char *ppsz_encodings[] = { DEFAULT_NAME, "ASCII", "UTF-8", "",
49     "ISO-8859-1", "CP1252", "MacRoman", "MacIceland","ISO-8859-15", "",
50     "ISO-8859-2", "CP1250", "MacCentralEurope", "MacCroatian", "MacRomania", "",
51     "ISO-8859-5", "CP1251", "MacCyrillic", "MacUkraine", "KOI8-R", "KOI8-U", "KOI8-RU", "",
52     "ISO-8859-6", "CP1256", "MacArabic", "",
53     "ISO-8859-7", "CP1253", "MacGreek", "",
54     "ISO-8859-8", "CP1255", "MacHebrew", "",
55     "ISO-8859-9", "CP1254", "MacTurkish", "",
56     "ISO-8859-13", "CP1257", "",
57     "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS", "",
58     "ISO-2022-CN", "ISO-2022-CN-EXT", "EUC-CN", "EUC-TW", "BIG5", "BIG5-HKSCS", "",
59     "ISO-2022-KR", "EUC-KR", "",
60     "MacThai", "KOI8-T", "",
61     "ISO-8859-3", "ISO-8859-4", "ISO-8859-10", "ISO-8859-14", "ISO-8859-16", "",
62     "CP850", "CP862", "CP866", "CP874", "CP932", "CP949", "CP950", "CP1133", "CP1258", "",
63     "Macintosh", "",
64     "UTF-7", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
65     "C99", "JAVA", "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "",
66     "HZ", "GBK", "GB18030", "JOHAB", "ARMSCII-8",
67     "Georgian-Academy", "Georgian-PS", "TIS-620", "MuleLao-1", "VISCII", "TCVN",
68     "HPROMAN8", "NEXTSTEP" };
69 /*
70 SSA supports charset selection.
71 The following known charsets are used:
72
73 0 = Ansi - Western European
74 1 = default
75 2 = symbol
76 3 = invalid
77 77 = Mac
78 128 = Japanese (Shift JIS)
79 129 = Hangul
80 130 = Johab
81 134 = GB2312 Simplified Chinese
82 136 = Big5 Traditional Chinese
83 161 = Greek
84 162 = Turkish
85 163 = Vietnamese
86 177 = Hebrew
87 178 = Arabic
88 186 = Baltic
89 204 = Russian (Cyrillic)
90 222 = Thai
91 238 = Eastern European
92 254 = PC 437
93 */
94
95 static int  pi_justification[] = { 0, 1, 2 };
96 static const char *ppsz_justification_text[] = {N_("Center"),N_("Left"),N_("Right")};
97
/* Short (*_TEXT) and long (*_LONGTEXT) descriptions of the options declared
 * in the module descriptor below.  Each string is wrapped in N_(), which is
 * presumably the project's translation-marking macro -- TODO confirm. */

/* "subsdec-encoding" */
#define ENCODING_TEXT N_("Subtitles text encoding")
#define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
/* "subsdec-align" */
#define ALIGN_TEXT N_("Subtitles justification")
#define ALIGN_LONGTEXT N_("Set the justification of subtitles")
/* "subsdec-autodetect-utf8" */
#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitles autodetection")
#define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
            "UTF-8 encoding within subtitles files.")
/* "subsdec-formatted" */
#define FORMAT_TEXT N_("Formatted Subtitles")
#define FORMAT_LONGTEXT N_("Some subtitle formats allow for text formatting. " \
 "VLC partly implements this, but you can choose to disable all formatting.")
108
109
/* Module descriptor: registers this file as a "decoder" module and declares
 * the four options that OpenDecoder() and ParseText() read back. */
vlc_module_begin();
    set_shortname( _("Subtitles"));
    set_description( _("Text subtitles decoder") );
    /* NOTE(review): 50 is presumably the module's score/priority among
     * decoders -- confirm against the module loader. */
    set_capability( "decoder", 50 );
    set_callbacks( OpenDecoder, CloseDecoder );
    set_category( CAT_INPUT );
    set_subcategory( SUBCAT_INPUT_SCODEC );

    /* Justification; stored into p_sys->i_align by OpenDecoder() */
    add_integer( "subsdec-align", 0, NULL, ALIGN_TEXT, ALIGN_LONGTEXT,
                 VLC_FALSE );
        change_integer_list( pi_justification, ppsz_justification_text, 0 );
    /* Source character set; DEFAULT_NAME makes OpenDecoder() fall through to
     * GetFallbackEncoding() */
    add_string( "subsdec-encoding", DEFAULT_NAME, NULL,
                ENCODING_TEXT, ENCODING_LONGTEXT, VLC_FALSE );
        change_string_list( ppsz_encodings, 0, 0 );
    /* Only consulted by OpenDecoder() when the fallback encoding is used */
    add_bool( "subsdec-autodetect-utf8", VLC_TRUE, NULL,
              AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, VLC_FALSE );
    /* Enables SSA header parsing and generation of the HTML variant of the
     * subtitle text */
    add_bool( "subsdec-formatted", VLC_TRUE, NULL, FORMAT_TEXT, FORMAT_LONGTEXT,
                 VLC_FALSE );
vlc_module_end();
129
130 /*****************************************************************************
131  * OpenDecoder: probe the decoder and return score
132  *****************************************************************************
133  * Tries to launch a decoder and return score so that the interface is able
134  * to chose.
135  *****************************************************************************/
136 static int OpenDecoder( vlc_object_t *p_this )
137 {
138     decoder_t     *p_dec = (decoder_t*)p_this;
139     decoder_sys_t *p_sys;
140     vlc_value_t    val;
141
142     switch( p_dec->fmt_in.i_codec )
143     {
144         case VLC_FOURCC('s','u','b','t'):
145         case VLC_FOURCC('s','s','a',' '):
146         case VLC_FOURCC('t','1','4','0'):
147             break;
148         default:
149             return VLC_EGENERIC;
150     }
151
152     p_dec->pf_decode_sub = DecodeBlock;
153
154     /* Allocate the memory needed to store the decoder's structure */
155     p_dec->p_sys = p_sys = malloc( sizeof( *p_sys ) );
156     if( p_sys == NULL )
157     {
158         msg_Err( p_dec, "out of memory" );
159         return VLC_ENOMEM;
160     }
161
162     /* init of p_sys */
163     memset( p_sys, 0, sizeof( *p_sys ) );
164     p_sys->i_align = 0;
165     p_sys->iconv_handle = (vlc_iconv_t)-1;
166     p_sys->b_autodetect_utf8 = VLC_FALSE;
167     p_sys->b_ass = VLC_FALSE;
168     p_sys->i_original_height = -1;
169     p_sys->i_original_width = -1;
170     TAB_INIT( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
171     TAB_INIT( p_sys->i_images, p_sys->pp_images );
172
173     char *psz_charset = NULL;
174
175     /* First try demux-specified encoding */
176     if( p_dec->fmt_in.i_codec == VLC_FOURCC('t','1','4','0') )
177         psz_charset = strdup( "UTF-8" ); /* IUT T.140 is always using UTF-8 */
178     else
179     if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
180     {
181         psz_charset = strdup (p_dec->fmt_in.subs.psz_encoding);
182         msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
183                  p_dec->fmt_in.subs.psz_encoding ?: "not specified");
184     }
185
186     /* Second, try configured encoding */
187     if (psz_charset == NULL)
188     {
189         psz_charset = var_CreateGetNonEmptyString (p_dec, "subsdec-encoding");
190         if ((psz_charset != NULL) && !strcasecmp (psz_charset, DEFAULT_NAME))
191         {
192             free (psz_charset);
193             psz_charset = NULL;
194         }
195
196         msg_Dbg (p_dec, "trying configured character encoding: %s",
197                  psz_charset ?: "not specified");
198     }
199
200     /* Third, try "local" encoding with optional UTF-8 autodetection */
201     if (psz_charset == NULL)
202     {
203         psz_charset = strdup (GetFallbackEncoding ());
204         msg_Dbg (p_dec, "trying default character encoding: %s",
205                  psz_charset ?: "not specified");
206
207         if (var_CreateGetBool (p_dec, "subsdec-autodetect-utf8"))
208         {
209             msg_Dbg (p_dec, "using automatic UTF-8 detection");
210             p_sys->b_autodetect_utf8 = VLC_TRUE;
211         }
212     }
213
214     /* Forth, don't do character decoding, i.e. assume UTF-8 */
215     if (psz_charset == NULL)
216     {
217         psz_charset = strdup ("UTF-8");
218         msg_Dbg (p_dec, "trying hard-coded character encoding: %s",
219                  psz_charset ? psz_charset : "error");
220     }
221
222     /* Fifth, fail */
223     if (psz_charset == NULL)
224     {
225         free (p_sys);
226         return VLC_ENOMEM;
227     }
228
229     if (strcasecmp (psz_charset, "UTF-8") && strcasecmp (psz_charset, "utf8"))
230     {
231         p_sys->iconv_handle = vlc_iconv_open ("UTF-8", psz_charset);
232         if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
233             msg_Err (p_dec, "cannot convert from %s: %m", psz_charset);
234     }
235     free (psz_charset);
236
237     var_Create( p_dec, "subsdec-align", VLC_VAR_INTEGER | VLC_VAR_DOINHERIT );
238     var_Get( p_dec, "subsdec-align", &val );
239     p_sys->i_align = val.i_int;
240
241     if( p_dec->fmt_in.i_codec == VLC_FOURCC('s','s','a',' ')
242      && var_CreateGetBool( p_dec, "subsdec-formatted" ) )
243     {
244         if( p_dec->fmt_in.i_extra > 0 )
245             ParseSSAHeader( p_dec );
246     }
247
248     return VLC_SUCCESS;
249 }
250
251 /****************************************************************************
252  * DecodeBlock: the whole thing
253  ****************************************************************************
254  * This function must be fed with complete subtitles units.
255  ****************************************************************************/
256 static subpicture_t *DecodeBlock( decoder_t *p_dec, block_t **pp_block )
257 {
258     subpicture_t *p_spu = NULL;
259
260     if( !pp_block || *pp_block == NULL ) return NULL;
261
262     p_spu = ParseText( p_dec, *pp_block );
263
264     block_Release( *pp_block );
265     *pp_block = NULL;
266
267     return p_spu;
268 }
269
270 /*****************************************************************************
271  * CloseDecoder: clean up the decoder
272  *****************************************************************************/
273 static void CloseDecoder( vlc_object_t *p_this )
274 {
275     decoder_t *p_dec = (decoder_t *)p_this;
276     decoder_sys_t *p_sys = p_dec->p_sys;
277
278     if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
279         vlc_iconv_close( p_sys->iconv_handle );
280
281     if( p_sys->pp_ssa_styles )
282     {
283         int i;
284         for( i = 0; i < p_sys->i_ssa_styles; i++ )
285         {
286             if( !p_sys->pp_ssa_styles[i] )
287                 continue;
288
289             if( p_sys->pp_ssa_styles[i]->psz_stylename )
290                 free( p_sys->pp_ssa_styles[i]->psz_stylename );
291             if( p_sys->pp_ssa_styles[i]->font_style.psz_fontname )
292                 free( p_sys->pp_ssa_styles[i]->font_style.psz_fontname );
293             if( p_sys->pp_ssa_styles[i] )
294                 free( p_sys->pp_ssa_styles[i] );
295         }
296         TAB_CLEAN( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
297     }
298     if( p_sys->pp_images )
299     {
300         int i;
301         for( i = 0; i < p_sys->i_images; i++ )
302         {
303             if( !p_sys->pp_images[i] )
304                 continue;
305
306             if( p_sys->pp_images[i]->p_pic )
307                 p_sys->pp_images[i]->p_pic->pf_release( p_sys->pp_images[i]->p_pic );
308             if( p_sys->pp_images[i]->psz_filename )
309                 free( p_sys->pp_images[i]->psz_filename );
310
311             free( p_sys->pp_images[i] );
312         }
313         TAB_CLEAN( p_sys->i_images, p_sys->pp_images );
314     }
315
316     free( p_sys );
317 }
318
319 /*****************************************************************************
320  * ParseText: parse an text subtitle packet and send it to the video output
321  *****************************************************************************/
322 static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
323 {
324     decoder_sys_t *p_sys = p_dec->p_sys;
325     subpicture_t *p_spu = NULL;
326     char *psz_subtitle = NULL;
327     video_format_t fmt;
328
329     /* We cannot display a subpicture with no date */
330     if( p_block->i_pts == 0 )
331     {
332         msg_Warn( p_dec, "subtitle without a date" );
333         return NULL;
334     }
335
336     /* Check validity of packet data */
337     /* An "empty" line containing only \0 can be used to force
338        and ephemer picture from the screen */
339     if( p_block->i_buffer < 1 )
340     {
341         msg_Warn( p_dec, "no subtitle data" );
342         return NULL;
343     }
344
345     /* Should be resiliant against bad subtitles */
346     psz_subtitle = strndup( (const char *)p_block->p_buffer,
347                             p_block->i_buffer );
348     if( psz_subtitle == NULL )
349         return NULL;
350
351     if( p_sys->iconv_handle == (vlc_iconv_t)-1 )
352     {
353         if (EnsureUTF8( psz_subtitle ) == NULL)
354         {
355             msg_Err( p_dec, _("failed to convert subtitle encoding.\n"
356                      "Try manually setting a character-encoding "
357                      "before you open the file.") );
358         }
359     }
360     else
361     {
362
363         if( p_sys->b_autodetect_utf8 )
364         {
365             if( IsUTF8( psz_subtitle ) == NULL )
366             {
367                 msg_Dbg( p_dec, "invalid UTF-8 sequence: "
368                          "disabling UTF-8 subtitles autodetection" );
369                 p_sys->b_autodetect_utf8 = VLC_FALSE;
370             }
371         }
372
373         if( !p_sys->b_autodetect_utf8 )
374         {
375             size_t inbytes_left = strlen( psz_subtitle );
376             size_t outbytes_left = 6 * inbytes_left;
377             char *psz_new_subtitle = malloc( outbytes_left + 1 );
378             char *psz_convert_buffer_out = psz_new_subtitle;
379             const char *psz_convert_buffer_in = psz_subtitle;
380
381             size_t ret = vlc_iconv( p_sys->iconv_handle,
382                                     &psz_convert_buffer_in, &inbytes_left,
383                                     &psz_convert_buffer_out, &outbytes_left );
384
385             *psz_convert_buffer_out++ = '\0';
386             free( psz_subtitle );
387
388             if( ( ret == (size_t)(-1) ) || inbytes_left )
389             {
390                 free( psz_new_subtitle );
391                 msg_Err( p_dec, _("failed to convert subtitle encoding.\n"
392                         "Try manually setting a character-encoding "
393                                 "before you open the file.") );
394                 return NULL;
395             }
396
397             psz_subtitle = realloc( psz_new_subtitle,
398                                     psz_convert_buffer_out - psz_new_subtitle );
399         }
400     }
401
402     /* Create the subpicture unit */
403     p_spu = p_dec->pf_spu_buffer_new( p_dec );
404     if( !p_spu )
405     {
406         msg_Warn( p_dec, "can't get spu buffer" );
407         if( psz_subtitle ) free( psz_subtitle );
408         return NULL;
409     }
410
411     p_spu->b_pausable = VLC_TRUE;
412
413     /* Create a new subpicture region */
414     memset( &fmt, 0, sizeof(video_format_t) );
415     fmt.i_chroma = VLC_FOURCC('T','E','X','T');
416     fmt.i_aspect = 0;
417     fmt.i_width = fmt.i_height = 0;
418     fmt.i_x_offset = fmt.i_y_offset = 0;
419     p_spu->p_region = p_spu->pf_create_region( VLC_OBJECT(p_dec), &fmt );
420     if( !p_spu->p_region )
421     {
422         msg_Err( p_dec, "cannot allocate SPU region" );
423         if( psz_subtitle ) free( psz_subtitle );
424         p_dec->pf_spu_buffer_del( p_dec, p_spu );
425         return NULL;
426     }
427
428     /* Decode and format the subpicture unit */
429     if( p_dec->fmt_in.i_codec != VLC_FOURCC('s','s','a',' ') )
430     {
431         /* Normal text subs, easy markup */
432         p_spu->p_region->i_align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
433         p_spu->i_x = p_sys->i_align ? 20 : 0;
434         p_spu->i_y = 10;
435
436         /* Remove formatting from string */
437
438         p_spu->p_region->psz_text = StripTags( psz_subtitle );
439         if( var_CreateGetBool( p_dec, "subsdec-formatted" ) )
440         {
441             p_spu->p_region->psz_html = CreateHtmlSubtitle( psz_subtitle );
442         }
443
444         p_spu->i_start = p_block->i_pts;
445         p_spu->i_stop = p_block->i_pts + p_block->i_length;
446         p_spu->b_ephemer = (p_block->i_length == 0);
447         p_spu->b_absolute = VLC_FALSE;
448     }
449     else
450     {
451         /* Decode SSA/USF strings */
452         if( p_dec->fmt_in.i_codec == VLC_FOURCC('s','s','a',' ') )
453             ParseSSAString( p_dec, psz_subtitle, p_spu );
454
455         p_spu->i_start = p_block->i_pts;
456         p_spu->i_stop = p_block->i_pts + p_block->i_length;
457         p_spu->b_ephemer = (p_block->i_length == 0);
458         p_spu->b_absolute = VLC_FALSE;
459         p_spu->i_original_picture_width = p_sys->i_original_width;
460         p_spu->i_original_picture_height = p_sys->i_original_height;
461     }
462     if( psz_subtitle ) free( psz_subtitle );
463
464     return p_spu;
465 }
466
/* Return a pointer to the first character that follows the next run of line
 * terminators ('\n' and/or '\r') in psz_text.  When the text holds no line
 * terminator, the result points at its terminating NUL. */
char* GotoNextLine( char *psz_text )
{
    static const char psz_eol[] = "\r\n";

    /* Skip the rest of the current line ... */
    psz_text += strcspn( psz_text, psz_eol );
    /* ... then the whole run of terminators that ends it */
    psz_text += strspn( psz_text, psz_eol );

    return psz_text;
}
484
/* Build a plain-text copy of psz_subtitle with the markup removed:
 *  - everything from '<' up to and including the next '>' is dropped, except
 *    that "<br/>" becomes a newline;
 *  - the entities "&lt;", "&gt;", "&amp;" and "&quot;" (matched without
 *    regard to case) are replaced by the character they stand for;
 *  - any other '&' is kept as an ordinary ampersand.
 * A tag that is never closed swallows the rest of the string.
 *
 * The input is not modified, so that the original text can be reused.
 * Returns a newly allocated string owned by the caller, or NULL if the
 * allocation failed.
 */
static char *StripTags( char *psz_subtitle )
{
    static const struct
    {
        const char *psz_name;
        char        c_value;
    } p_entities[] = {
        { "&lt;",   '<'  },
        { "&gt;",   '>'  },
        { "&amp;",  '&'  },
        { "&quot;", '\"' },
    };
    const size_t i_entities = sizeof( p_entities ) / sizeof( p_entities[0] );

    char *psz_text_start;
    char *psz_text;
    char *psz_shrunk;

    /* The output is never longer than the input */
    psz_text = psz_text_start = malloc( strlen( psz_subtitle ) + 1 );
    if( !psz_text_start )
        return NULL;

    while( *psz_subtitle != '\0' )
    {
        if( *psz_subtitle == '<' )
        {
            if( strncasecmp( psz_subtitle, "<br/>", 5 ) == 0 )
                *psz_text++ = '\n';

            psz_subtitle += strcspn( psz_subtitle, ">" );

            /* Unterminated tag: we are on the terminating NUL.  Stop here,
             * stepping over it would read past the end of the input. */
            if( *psz_subtitle == '\0' )
                break;

            psz_subtitle++;
        }
        else if( *psz_subtitle == '&' )
        {
            size_t i;

            for( i = 0; i < i_entities; i++ )
            {
                if( !strncasecmp( psz_subtitle, p_entities[i].psz_name,
                                  strlen( p_entities[i].psz_name ) ) )
                    break;
            }

            if( i < i_entities )
            {
                *psz_text++ = p_entities[i].c_value;
                psz_subtitle += strlen( p_entities[i].psz_name );
            }
            else
            {
                /* Assume it is just a normal ampersand */
                *psz_text++ = '&';
                psz_subtitle++;
            }
        }
        else
        {
            *psz_text++ = *psz_subtitle++;
        }
    }
    *psz_text = '\0';

    /* Give back the unused tail; if that fails the full-size buffer is still
     * valid and holds the result */
    psz_shrunk = realloc( psz_text_start, strlen( psz_text_start ) + 1 );
    if( psz_shrunk != NULL )
        psz_text_start = psz_shrunk;

    return psz_text_start;
}
547
548 /* Try to respect any style tags present in the subtitle string. The main
549  * problem here is a lack of adequate specs for the subtitle formats.
550  * SSA/ASS and USF are both detail spec'ed -- but they are handled elsewhere.
551  * SAMI has a detailed spec, but extensive rework is needed in the demux
552  * code to prevent all this style information being excised, as it presently
553  * does.
554  * That leaves the others - none of which were (I guess) originally intended
555  * to be carrying style information. Over time people have used them that way.
556  * In the absence of specifications from which to work, the tags supported
557  * have been restricted to the simple set permitted by the USF DTD, ie. :
558  *  Basic: <br>, <i>, <b>, <u>
559  *  Extended: <font>
560  *    Attributes: face
561  *                family
562  *                size
563  *                color
564  *                outline-color
565  *                shadow-color
566  *                outline-level
567  *                shadow-level
568  *                back-color
569  *                alpha
570  * There is also the further restriction that the subtitle be well-formed
571  * as an XML entity, ie. the HTML sentence:
572  *        <b><i>Bold and Italics</b></i>
573  * doesn't qualify because the tags aren't nested one inside the other.
574  * <text> tags are automatically added to the output to ensure
575  * well-formedness.
576  * If the text doesn't qualify for any reason, a NULL string is
577  * returned, and the rendering engine will fall back to the
578  * plain text version of the subtitle.
579  */
580 static char *CreateHtmlSubtitle( char *psz_subtitle )
581 {
582     char    psz_tagStack[ 100 ];
583     size_t  i_buf_size     = strlen( psz_subtitle ) + 100;
584     char   *psz_html_start = malloc( i_buf_size );
585
586     psz_tagStack[ 0 ] = '\0';
587
588     if( psz_html_start != NULL )
589     {
590         char *psz_html = psz_html_start;
591
592         strcpy( psz_html, "<text>" );
593         psz_html += 6;
594
595         while( *psz_subtitle )
596         {
597             if( *psz_subtitle == '\n' )
598             {
599                 strcpy( psz_html, "<br/>" );
600                 psz_html += 5;
601                 psz_subtitle++;
602             }
603             else if( *psz_subtitle == '<' )
604             {
605                 if( !strncasecmp( psz_subtitle, "<br/>", 5 ))
606                 {
607                     strcpy( psz_html, "<br/>" );
608                     psz_html += 5;
609                     psz_subtitle += 5;
610                 }
611                 else if( !strncasecmp( psz_subtitle, "<b>", 3 ) )
612                 {
613                     strcpy( psz_html, "<b>" );
614                     strcat( psz_tagStack, "b" );
615                     psz_html += 3;
616                     psz_subtitle += 3;
617                 }
618                 else if( !strncasecmp( psz_subtitle, "<i>", 3 ) )
619                 {
620                     strcpy( psz_html, "<i>" );
621                     strcat( psz_tagStack, "i" );
622                     psz_html += 3;
623                     psz_subtitle += 3;
624                 }
625                 else if( !strncasecmp( psz_subtitle, "<u>", 3 ) )
626                 {
627                     strcpy( psz_html, "<u>" );
628                     strcat( psz_tagStack, "u" );
629                     psz_html += 3;
630                     psz_subtitle += 3;
631                 }
632                 else if( !strncasecmp( psz_subtitle, "<font ", 6 ))
633                 {
634                     const char *psz_attribs[] = { "face=\"", "family=\"", "size=\"",
635                             "color=\"", "outline-color=\"", "shadow-color=\"",
636                             "outline-level=\"", "shadow-level=\"", "back-color=\"",
637                             "alpha=\"", NULL };
638
639                     strcpy( psz_html, "<font " );
640                     strcat( psz_tagStack, "f" );
641                     psz_html += 6;
642                     psz_subtitle += 6;
643
644                     while( *psz_subtitle != '>' )
645                     {
646                         int  k;
647
648                         for( k=0; psz_attribs[ k ]; k++ )
649                         {
650                             int i_len = strlen( psz_attribs[ k ] );
651
652                             if( !strncasecmp( psz_subtitle, psz_attribs[ k ], i_len ))
653                             {
654                                 i_len += strcspn( psz_subtitle + i_len, "\"" ) + 1;
655
656                                 strncpy( psz_html, psz_subtitle, i_len );
657                                 psz_html += i_len;
658                                 psz_subtitle += i_len;
659                                 break;
660                             }
661                         }
662                         if( psz_attribs[ k ] == NULL )
663                         {
664                             /* Jump over unrecognised tag */
665                             int i_len = strcspn( psz_subtitle, "\"" ) + 1;
666
667                             i_len += strcspn( psz_subtitle + i_len, "\"" ) + 1;
668                             psz_subtitle += i_len;
669                         }
670                         while (*psz_subtitle == ' ')
671                             *psz_html++ = *psz_subtitle++;
672                     }
673                     *psz_html++ = *psz_subtitle++;
674                 }
675                 else if( !strncmp( psz_subtitle, "</", 2 ))
676                 {
677                     vlc_bool_t  b_match     = VLC_FALSE;
678                     int         i_len       = strlen( psz_tagStack ) - 1;
679                     char       *psz_lastTag = NULL;
680
681                     if( i_len >= 0 )
682                     {
683                         psz_lastTag = psz_tagStack + i_len;
684                         i_len = 0;
685
686                         switch( *psz_lastTag )
687                         {
688                             case 'b':
689                                 b_match = !strncasecmp( psz_subtitle, "</b>", 4 );
690                                 i_len   = 4;
691                                 break;
692                             case 'i':
693                                 b_match = !strncasecmp( psz_subtitle, "</i>", 4 );
694                                 i_len   = 4;
695                                 break;
696                             case 'u':
697                                 b_match = !strncasecmp( psz_subtitle, "</u>", 4 );
698                                 i_len   = 4;
699                                 break;
700                             case 'f':
701                                 b_match = !strncasecmp( psz_subtitle, "</font>", 7 );
702                                 i_len   = 7;
703                                 break;
704                         }
705                     }
706                     if( ! b_match )
707                     {
708                         /* Not well formed -- kill everything */
709                         free( psz_html_start );
710                         psz_html_start = NULL;
711                         break;
712                     }
713                     *psz_lastTag = '\0';
714                     strncpy( psz_html, psz_subtitle, i_len );
715                     psz_html += i_len;
716                     psz_subtitle += i_len;
717                 }
718                 else
719                 {
720                     psz_subtitle += strcspn( psz_subtitle, ">" );
721                 }
722             }
723             else if( *psz_subtitle == '&' )
724             {
725                 if( !strncasecmp( psz_subtitle, "&lt;", 4 ))
726                 {
727                     strcpy( psz_html, "&lt;" );
728                     psz_html += 4;
729                     psz_subtitle += 4;
730                 }
731                 else if( !strncasecmp( psz_subtitle, "&gt;", 4 ))
732                 {
733                     strcpy( psz_html, "&gt;" );
734                     psz_html += 4;
735                     psz_subtitle += 4;
736                 }
737                 else if( !strncasecmp( psz_subtitle, "&amp;", 5 ))
738                 {
739                     strcpy( psz_html, "&amp;" );
740                     psz_html += 5;
741                     psz_subtitle += 5;
742                 }
743                 else
744                 {
745                     strcpy( psz_html, "&amp;" );
746                     psz_html += 5;
747                     psz_subtitle++;
748                 }
749             }
750             else
751             {
752                 *psz_html = *psz_subtitle;
753                 if( psz_html > psz_html_start )
754                 {
755                     /* Check for double whitespace */
756                     if((( *psz_html == ' ' ) ||
757                         ( *psz_html == '\t' )) &&
758                        (( *(psz_html-1) == ' ' ) ||
759                         ( *(psz_html-1) == '\t' )))
760                     {
761                         strcpy( psz_html, NO_BREAKING_SPACE );
762                         psz_html += strlen( NO_BREAKING_SPACE ) - 1;
763                     }
764                 }
765                 psz_html++;
766                 psz_subtitle++;
767             }
768
769             if( ( size_t )( psz_html - psz_html_start ) > i_buf_size - 10 )
770             {
771                 int i_len = psz_html - psz_html_start;
772
773                 i_buf_size += 100;
774                 psz_html_start = realloc( psz_html_start, i_buf_size );
775                 psz_html = psz_html_start + i_len;
776                 *psz_html = '\0';
777             }
778         }
779         strcpy( psz_html, "</text>" );
780         psz_html += 7;
781
782         if( psz_tagStack[ 0 ] != '\0' )
783         {
784             /* Not well formed -- kill everything */
785             free( psz_html_start );
786             psz_html_start = NULL;
787         }
788         else if( psz_html_start )
789         {
790             /* Shrink the memory requirements */
791             psz_html_start = realloc( psz_html_start,  psz_html - psz_html_start + 1 );
792         }
793     }
794     return psz_html_start;
795 }