]> git.sesse.net Git - vlc/blob - modules/codec/subtitles/subsdec.c
Cosmetic
[vlc] / modules / codec / subtitles / subsdec.c
1 /*****************************************************************************
2  * subsdec.c : text subtitles decoder
3  *****************************************************************************
4  * Copyright (C) 2000-2006 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Gildas Bazin <gbazin@videolan.org>
8  *          Samuel Hocevar <sam@zoy.org>
9  *          Derk-Jan Hartman <hartman at videolan dot org>
10  *          Bernie Purcell <bitmap@videolan.org>
11  *
12  * This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 2 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25  *****************************************************************************/
26
27 /*****************************************************************************
28  * Preamble
29  *****************************************************************************/
30
31 #include "subsdec.h"
32
33 /*****************************************************************************
34  * Local prototypes
35  *****************************************************************************/
/* Module entry points; both are handed to set_callbacks() in the module
 * descriptor below. */
static int  OpenDecoder   ( vlc_object_t * );
static void CloseDecoder  ( vlc_object_t * );

/* DecodeBlock is installed as the decoder's pf_decode_sub callback and
 * delegates to ParseText; StripTags and CreateHtmlSubtitle build the plain
 * text and the marked-up versions of one subtitle string. */
static subpicture_t   *DecodeBlock   ( decoder_t *, block_t ** );
static subpicture_t   *ParseText     ( decoder_t *, block_t * );
static char           *StripTags      ( char * );
static char           *CreateHtmlSubtitle ( char * );

43
44
45 /*****************************************************************************
46  * Module descriptor.
47  *****************************************************************************/
/* Character-set names offered as the choice list of the "subsdec-encoding"
 * option (see change_string_list() in the module descriptor). The first
 * entry, DEFAULT_NAME, is treated by OpenDecoder as "no encoding configured".
 * NOTE(review): the empty strings presumably act as group separators in the
 * preferences list -- confirm against the configuration UI code. */
static const char *ppsz_encodings[] = { DEFAULT_NAME, "ASCII", "UTF-8", "",
    "ISO-8859-1", "CP1252", "MacRoman", "MacIceland","ISO-8859-15", "",
    "ISO-8859-2", "CP1250", "MacCentralEurope", "MacCroatian", "MacRomania", "",
    "ISO-8859-5", "CP1251", "MacCyrillic", "MacUkraine", "KOI8-R", "KOI8-U", "KOI8-RU", "",
    "ISO-8859-6", "CP1256", "MacArabic", "",
    "ISO-8859-7", "CP1253", "MacGreek", "",
    "ISO-8859-8", "CP1255", "MacHebrew", "",
    "ISO-8859-9", "CP1254", "MacTurkish", "",
    "ISO-8859-13", "CP1257", "",
    "ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS", "",
    "ISO-2022-CN", "ISO-2022-CN-EXT", "EUC-CN", "EUC-TW", "BIG5", "BIG5-HKSCS", "",
    "ISO-2022-KR", "EUC-KR", "",
    "MacThai", "KOI8-T", "",
    "ISO-8859-3", "ISO-8859-4", "ISO-8859-10", "ISO-8859-14", "ISO-8859-16", "",
    "CP850", "CP862", "CP866", "CP874", "CP932", "CP949", "CP950", "CP1133", "CP1258", "",
    "Macintosh", "",
    "UTF-7", "UTF-16", "UTF-16BE", "UTF-16LE", "UTF-32", "UTF-32BE", "UTF-32LE",
    "C99", "JAVA", "UCS-2", "UCS-2BE", "UCS-2LE", "UCS-4", "UCS-4BE", "UCS-4LE", "",
    "HZ", "GBK", "GB18030", "JOHAB", "ARMSCII-8",
    "Georgian-Academy", "Georgian-PS", "TIS-620", "MuleLao-1", "VISCII", "TCVN",
    "HPROMAN8", "NEXTSTEP" };
69 /*
70 SSA supports charset selection.
71 The following known charsets are used:
72
73 0 = Ansi - Western European
74 1 = default
75 2 = symbol
76 3 = invalid
77 77 = Mac
78 128 = Japanese (Shift JIS)
79 129 = Hangul
80 130 = Johab
81 134 = GB2312 Simplified Chinese
82 136 = Big5 Traditional Chinese
83 161 = Greek
84 162 = Turkish
85 163 = Vietnamese
86 177 = Hebrew
87 178 = Arabic
88 186 = Baltic
89 204 = Russian (Cyrillic)
90 222 = Thai
91 238 = Eastern European
92 254 = PC 437
93 */
94
/* Allowed values of the "subsdec-align" option and, index for index, the
 * labels shown for them: 0 = Center, 1 = Left, 2 = Right. */
static int  pi_justification[] = { 0, 1, 2 };
static const char *ppsz_justification_text[] = {N_("Center"),N_("Left"),N_("Right")};

/* Short and long help texts for the options declared in the module
 * descriptor below. */
#define ENCODING_TEXT N_("Subtitles text encoding")
#define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
#define ALIGN_TEXT N_("Subtitles justification")
#define ALIGN_LONGTEXT N_("Set the justification of subtitles")
#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitles autodetection")
#define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
            "UTF-8 encoding within subtitles files.")
#define FORMAT_TEXT N_("Formatted Subtitles")
#define FORMAT_LONGTEXT N_("Some subtitle formats allow for text formatting. " \
 "VLC partly implements this, but you can choose to disable all formatting.")
108
109
/* Module descriptor: registers the text subtitles decoder and its options. */
vlc_module_begin();
    set_shortname( _("Subtitles"));
    set_description( _("Text subtitles decoder") );
    /* NOTE(review): 50 is presumably the module score used when several
     * decoders compete -- confirm against the module API. */
    set_capability( "decoder", 50 );
    set_callbacks( OpenDecoder, CloseDecoder );
    set_category( CAT_INPUT );
    set_subcategory( SUBCAT_INPUT_SCODEC );

    /* Justification; read into p_sys->i_align by OpenDecoder */
    add_integer( "subsdec-align", 0, NULL, ALIGN_TEXT, ALIGN_LONGTEXT,
                 VLC_FALSE );
        change_integer_list( pi_justification, ppsz_justification_text, 0 );
    /* Source character set; DEFAULT_NAME means "not configured" */
    add_string( "subsdec-encoding", DEFAULT_NAME, NULL,
                ENCODING_TEXT, ENCODING_LONGTEXT, VLC_FALSE );
        change_string_list( ppsz_encodings, 0, 0 );
    /* Only consulted by OpenDecoder when it falls back to the default
     * encoding */
    add_bool( "subsdec-autodetect-utf8", VLC_TRUE, NULL,
              AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, VLC_FALSE );
    /* Gates SSA header parsing in OpenDecoder and the marked-up subtitle
     * version in ParseText */
    add_bool( "subsdec-formatted", VLC_TRUE, NULL, FORMAT_TEXT, FORMAT_LONGTEXT,
                 VLC_FALSE );
vlc_module_end();
129
130 /*****************************************************************************
131  * OpenDecoder: probe the decoder and return score
132  *****************************************************************************
133  * Tries to launch a decoder and return score so that the interface is able
134  * to chose.
135  *****************************************************************************/
136 static int OpenDecoder( vlc_object_t *p_this )
137 {
138     decoder_t     *p_dec = (decoder_t*)p_this;
139     decoder_sys_t *p_sys;
140     vlc_value_t    val;
141
142     if( p_dec->fmt_in.i_codec != VLC_FOURCC('s','u','b','t') &&
143         p_dec->fmt_in.i_codec != VLC_FOURCC('s','s','a',' ') )
144     {
145         return VLC_EGENERIC;
146     }
147
148     p_dec->pf_decode_sub = DecodeBlock;
149
150     /* Allocate the memory needed to store the decoder's structure */
151     if( ( p_dec->p_sys = p_sys =
152           (decoder_sys_t *)calloc(1, sizeof(decoder_sys_t)) ) == NULL )
153     {
154         msg_Err( p_dec, "out of memory" );
155         return VLC_ENOMEM;
156     }
157
158     /* init of p_sys */
159     p_sys->i_align = 0;
160     p_sys->iconv_handle = (vlc_iconv_t)-1;
161     p_sys->b_autodetect_utf8 = VLC_FALSE;
162     p_sys->b_ass = VLC_FALSE;
163     p_sys->i_original_height = -1;
164     p_sys->i_original_width = -1;
165     TAB_INIT( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
166     TAB_INIT( p_sys->i_images, p_sys->pp_images );
167
168     char *psz_charset = NULL;
169     /* First try demux-specified encoding */
170     if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
171     {
172         psz_charset = strdup (p_dec->fmt_in.subs.psz_encoding);
173         msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
174                  p_dec->fmt_in.subs.psz_encoding ?: "not specified");
175     }
176
177     /* Second, try configured encoding */
178     if (psz_charset == NULL)
179     {
180         psz_charset = var_CreateGetNonEmptyString (p_dec, "subsdec-encoding");
181         if ((psz_charset != NULL) && !strcasecmp (psz_charset, DEFAULT_NAME))
182         {
183             free (psz_charset);
184             psz_charset = NULL;
185         }
186
187         msg_Dbg (p_dec, "trying configured character encoding: %s",
188                  psz_charset ?: "not specified");
189     }
190
191     /* Third, try "local" encoding with optional UTF-8 autodetection */
192     if (psz_charset == NULL)
193     {
194         psz_charset = strdup (GetFallbackEncoding ());
195         msg_Dbg (p_dec, "trying default character encoding: %s",
196                  psz_charset ?: "not specified");
197
198         if (var_CreateGetBool (p_dec, "subsdec-autodetect-utf8"))
199         {
200             msg_Dbg (p_dec, "using automatic UTF-8 detection");
201             p_sys->b_autodetect_utf8 = VLC_TRUE;
202         }
203     }
204
205     if (psz_charset == NULL)
206     {
207         psz_charset = strdup ("UTF-8");
208         msg_Dbg (p_dec, "trying hard-coded character encoding: %s",
209                  psz_charset ?: "error");
210     }
211
212     if (psz_charset == NULL)
213     {
214         free (p_sys);
215         return VLC_ENOMEM;
216     }
217
218     if (strcasecmp (psz_charset, "UTF-8") && strcasecmp (psz_charset, "utf8"))
219     {
220         p_sys->iconv_handle = vlc_iconv_open ("UTF-8", psz_charset);
221         if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
222             msg_Err (p_dec, "cannot convert from %s: %s", psz_charset,
223                      strerror (errno));
224     }
225     free (psz_charset);
226
227     var_Create( p_dec, "subsdec-align", VLC_VAR_INTEGER | VLC_VAR_DOINHERIT );
228     var_Get( p_dec, "subsdec-align", &val );
229     p_sys->i_align = val.i_int;
230
231     if( p_dec->fmt_in.i_codec == VLC_FOURCC('s','s','a',' ')
232      && var_CreateGetBool( p_dec, "subsdec-formatted" ) )
233     {
234         if( p_dec->fmt_in.i_extra > 0 )
235             ParseSSAHeader( p_dec );
236     }
237
238     return VLC_SUCCESS;
239 }
240
241 /****************************************************************************
242  * DecodeBlock: the whole thing
243  ****************************************************************************
244  * This function must be fed with complete subtitles units.
245  ****************************************************************************/
246 static subpicture_t *DecodeBlock( decoder_t *p_dec, block_t **pp_block )
247 {
248     subpicture_t *p_spu = NULL;
249
250     if( !pp_block || *pp_block == NULL ) return NULL;
251
252     p_spu = ParseText( p_dec, *pp_block );
253
254     block_Release( *pp_block );
255     *pp_block = NULL;
256
257     return p_spu;
258 }
259
260 /*****************************************************************************
261  * CloseDecoder: clean up the decoder
262  *****************************************************************************/
263 static void CloseDecoder( vlc_object_t *p_this )
264 {
265     decoder_t *p_dec = (decoder_t *)p_this;
266     decoder_sys_t *p_sys = p_dec->p_sys;
267
268     if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
269         vlc_iconv_close( p_sys->iconv_handle );
270
271     if( p_sys->pp_ssa_styles )
272     {
273         int i;
274         for( i = 0; i < p_sys->i_ssa_styles; i++ )
275         {
276             if( !p_sys->pp_ssa_styles[i] )
277                 continue;
278
279             if( p_sys->pp_ssa_styles[i]->psz_stylename )
280                 free( p_sys->pp_ssa_styles[i]->psz_stylename );
281             if( p_sys->pp_ssa_styles[i]->font_style.psz_fontname )
282                 free( p_sys->pp_ssa_styles[i]->font_style.psz_fontname );
283             if( p_sys->pp_ssa_styles[i] )
284                 free( p_sys->pp_ssa_styles[i] );
285         }
286         TAB_CLEAN( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
287     }
288     if( p_sys->pp_images )
289     {
290         int i;
291         for( i = 0; i < p_sys->i_images; i++ )
292         {
293             if( !p_sys->pp_images[i] )
294                 continue;
295
296             if( p_sys->pp_images[i]->p_pic )
297                 p_sys->pp_images[i]->p_pic->pf_release( p_sys->pp_images[i]->p_pic );
298             if( p_sys->pp_images[i]->psz_filename )
299                 free( p_sys->pp_images[i]->psz_filename );
300
301             free( p_sys->pp_images[i] );
302         }
303         TAB_CLEAN( p_sys->i_images, p_sys->pp_images );
304     }
305
306     free( p_sys );
307 }
308
309 /*****************************************************************************
310  * ParseText: parse an text subtitle packet and send it to the video output
311  *****************************************************************************/
312 static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
313 {
314     decoder_sys_t *p_sys = p_dec->p_sys;
315     subpicture_t *p_spu = NULL;
316     char *psz_subtitle = NULL;
317     video_format_t fmt;
318
319     /* We cannot display a subpicture with no date */
320     if( p_block->i_pts == 0 )
321     {
322         msg_Warn( p_dec, "subtitle without a date" );
323         return NULL;
324     }
325
326     /* Check validity of packet data */
327     /* An "empty" line containing only \0 can be used to force
328        and ephemer picture from the screen */
329     if( p_block->i_buffer < 1 )
330     {
331         msg_Warn( p_dec, "no subtitle data" );
332         return NULL;
333     }
334
335     /* Should be resiliant against bad subtitles */
336     psz_subtitle = strndup( (const char *)p_block->p_buffer,
337                             p_block->i_buffer );
338     if( psz_subtitle == NULL )
339         return NULL;
340
341     if( p_sys->iconv_handle == (vlc_iconv_t)-1 )
342     {
343         if (EnsureUTF8( psz_subtitle ) == NULL)
344         {
345             msg_Err( p_dec, _("failed to convert subtitle encoding.\n"
346                      "Try manually setting a character-encoding "
347                      "before you open the file.") );
348         }
349     }
350     else
351     {
352
353         if( p_sys->b_autodetect_utf8 )
354         {
355             if( IsUTF8( psz_subtitle ) == NULL )
356             {
357                 msg_Dbg( p_dec, "invalid UTF-8 sequence: "
358                          "disabling UTF-8 subtitles autodetection" );
359                 p_sys->b_autodetect_utf8 = VLC_FALSE;
360             }
361         }
362
363         if( !p_sys->b_autodetect_utf8 )
364         {
365             size_t inbytes_left = strlen( psz_subtitle );
366             size_t outbytes_left = 6 * inbytes_left;
367             char *psz_new_subtitle = malloc( outbytes_left + 1 );
368             char *psz_convert_buffer_out = psz_new_subtitle;
369             const char *psz_convert_buffer_in = psz_subtitle;
370
371             size_t ret = vlc_iconv( p_sys->iconv_handle,
372                                     &psz_convert_buffer_in, &inbytes_left,
373                                     &psz_convert_buffer_out, &outbytes_left );
374
375             *psz_convert_buffer_out++ = '\0';
376             free( psz_subtitle );
377
378             if( ( ret == (size_t)(-1) ) || inbytes_left )
379             {
380                 free( psz_new_subtitle );
381                 msg_Err( p_dec, _("failed to convert subtitle encoding.\n"
382                         "Try manually setting a character-encoding "
383                                 "before you open the file.") );
384                 return NULL;
385             }
386
387             psz_subtitle = realloc( psz_new_subtitle,
388                                     psz_convert_buffer_out - psz_new_subtitle );
389         }
390     }
391
392     /* Create the subpicture unit */
393     p_spu = p_dec->pf_spu_buffer_new( p_dec );
394     if( !p_spu )
395     {
396         msg_Warn( p_dec, "can't get spu buffer" );
397         if( psz_subtitle ) free( psz_subtitle );
398         return NULL;
399     }
400
401     p_spu->b_pausable = VLC_TRUE;
402
403     /* Create a new subpicture region */
404     memset( &fmt, 0, sizeof(video_format_t) );
405     fmt.i_chroma = VLC_FOURCC('T','E','X','T');
406     fmt.i_aspect = 0;
407     fmt.i_width = fmt.i_height = 0;
408     fmt.i_x_offset = fmt.i_y_offset = 0;
409     p_spu->p_region = p_spu->pf_create_region( VLC_OBJECT(p_dec), &fmt );
410     if( !p_spu->p_region )
411     {
412         msg_Err( p_dec, "cannot allocate SPU region" );
413         if( psz_subtitle ) free( psz_subtitle );
414         p_dec->pf_spu_buffer_del( p_dec, p_spu );
415         return NULL;
416     }
417
418     /* Decode and format the subpicture unit */
419     if( p_dec->fmt_in.i_codec != VLC_FOURCC('s','s','a',' ') )
420     {
421         /* Normal text subs, easy markup */
422         p_spu->p_region->i_align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
423         p_spu->i_x = p_sys->i_align ? 20 : 0;
424         p_spu->i_y = 10;
425
426         /* Remove formatting from string */
427
428         p_spu->p_region->psz_text = StripTags( psz_subtitle );
429         if( var_CreateGetBool( p_dec, "subsdec-formatted" ) )
430         {
431             p_spu->p_region->psz_html = CreateHtmlSubtitle( psz_subtitle );
432         }
433
434         p_spu->i_start = p_block->i_pts;
435         p_spu->i_stop = p_block->i_pts + p_block->i_length;
436         p_spu->b_ephemer = (p_block->i_length == 0);
437         p_spu->b_absolute = VLC_FALSE;
438     }
439     else
440     {
441         /* Decode SSA/USF strings */
442         if( p_dec->fmt_in.i_codec == VLC_FOURCC('s','s','a',' ') )
443             ParseSSAString( p_dec, psz_subtitle, p_spu );
444
445         p_spu->i_start = p_block->i_pts;
446         p_spu->i_stop = p_block->i_pts + p_block->i_length;
447         p_spu->b_ephemer = (p_block->i_length == 0);
448         p_spu->b_absolute = VLC_FALSE;
449         p_spu->i_original_picture_width = p_sys->i_original_width;
450         p_spu->i_original_picture_height = p_sys->i_original_height;
451     }
452     if( psz_subtitle ) free( psz_subtitle );
453
454     return p_spu;
455 }
456
/* Return a pointer to the character following the first line break found in
 * psz_text. A run of consecutive '\n' and '\r' characters counts as a single
 * break. When the text holds no line break, the returned pointer designates
 * the terminating NUL.
 */
char* GotoNextLine( char *psz_text )
{
    /* Skip the remainder of the current line ... */
    char *psz_cursor = psz_text + strcspn( psz_text, "\r\n" );

    /* ... then every line-break character that follows it */
    psz_cursor += strspn( psz_cursor, "\r\n" );

    return psz_cursor;
}
474
/* Return a newly allocated copy of psz_subtitle with its markup removed.
 * Everything from a '<' up to the matching '>' is dropped ("<br/>" becomes a
 * newline), the entities &lt; &gt; &amp; and &quot; are decoded, and any
 * other '&' is kept as is. The input string is not modified, so the original
 * text can be reused. A tag that is never closed swallows the rest of the
 * string. The caller owns the result; NULL is returned only when the
 * allocation fails.
 */
static char *StripTags( char *psz_subtitle )
{
    static const struct
    {
        const char *psz_name;
        size_t      i_len;
        char        c_value;
    } p_entities[] =
    {
        { "&lt;",   4, '<'  },
        { "&gt;",   4, '>'  },
        { "&amp;",  5, '&'  },
        { "&quot;", 6, '\"' },
    };
    const size_t i_entities = sizeof( p_entities ) / sizeof( p_entities[0] );

    /* The output can never be longer than the input */
    char *psz_text_start = malloc( strlen( psz_subtitle ) + 1 );
    char *psz_text = psz_text_start;
    char *psz_shrunk;

    if( psz_text_start == NULL )
        return NULL;

    while( *psz_subtitle )
    {
        if( *psz_subtitle == '<' )
        {
            if( strncasecmp( psz_subtitle, "<br/>", 5 ) == 0 )
                *psz_text++ = '\n';

            psz_subtitle += strcspn( psz_subtitle, ">" );
            /* Unterminated tag: stop here instead of stepping over the
             * terminating NUL below */
            if( *psz_subtitle == '\0' )
                break;
        }
        else if( *psz_subtitle == '&' )
        {
            size_t k;

            for( k = 0; k < i_entities; k++ )
            {
                if( !strncasecmp( psz_subtitle, p_entities[k].psz_name,
                                  p_entities[k].i_len ) )
                    break;
            }

            if( k < i_entities )
            {
                *psz_text++ = p_entities[k].c_value;
                /* Land on the ';', which the increment below consumes */
                psz_subtitle += p_entities[k].i_len - 1;
            }
            else
            {
                /* Assume it is just a normal ampersand */
                *psz_text++ = '&';
            }
        }
        else
        {
            *psz_text++ = *psz_subtitle;
        }

        psz_subtitle++;
    }
    *psz_text = '\0';

    /* Trim the allocation; the larger buffer stays valid if that fails */
    psz_shrunk = realloc( psz_text_start,
                          (size_t)( psz_text - psz_text_start ) + 1 );

    return psz_shrunk ? psz_shrunk : psz_text_start;
}
537
538 /* Try to respect any style tags present in the subtitle string. The main
539  * problem here is a lack of adequate specs for the subtitle formats.
540  * SSA/ASS and USF are both detail spec'ed -- but they are handled elsewhere.
541  * SAMI has a detailed spec, but extensive rework is needed in the demux
542  * code to prevent all this style information being excised, as it presently
543  * does.
544  * That leaves the others - none of which were (I guess) originally intended
545  * to be carrying style information. Over time people have used them that way.
546  * In the absence of specifications from which to work, the tags supported
547  * have been restricted to the simple set permitted by the USF DTD, ie. :
548  *  Basic: <br>, <i>, <b>, <u>
549  *  Extended: <font>
550  *    Attributes: face
551  *                family
552  *                size
553  *                color
554  *                outline-color
555  *                shadow-color
556  *                outline-level
557  *                shadow-level
558  *                back-color
559  *                alpha
560  * There is also the further restriction that the subtitle be well-formed
561  * as an XML entity, ie. the HTML sentence:
562  *        <b><i>Bold and Italics</b></i>
563  * doesn't qualify because the tags aren't nested one inside the other.
564  * <text> tags are automatically added to the output to ensure
565  * well-formedness.
566  * If the text doesn't qualify for any reason, a NULL string is
567  * returned, and the rendering engine will fall back to the
568  * plain text version of the subtitle.
569  */
570 static char *CreateHtmlSubtitle( char *psz_subtitle )
571 {
572     char    psz_tagStack[ 100 ];
573     size_t  i_buf_size     = strlen( psz_subtitle ) + 100;
574     char   *psz_html_start = malloc( i_buf_size );
575
576     psz_tagStack[ 0 ] = '\0';
577
578     if( psz_html_start != NULL )
579     {
580         char *psz_html = psz_html_start;
581
582         strcpy( psz_html, "<text>" );
583         psz_html += 6;
584
585         while( *psz_subtitle )
586         {
587             if( *psz_subtitle == '\n' )
588             {
589                 strcpy( psz_html, "<br/>" );
590                 psz_html += 5;
591                 psz_subtitle++;
592             }
593             else if( *psz_subtitle == '<' )
594             {
595                 if( !strncasecmp( psz_subtitle, "<br/>", 5 ))
596                 {
597                     strcpy( psz_html, "<br/>" );
598                     psz_html += 5;
599                     psz_subtitle += 5;
600                 }
601                 else if( !strncasecmp( psz_subtitle, "<b>", 3 ) )
602                 {
603                     strcpy( psz_html, "<b>" );
604                     strcat( psz_tagStack, "b" );
605                     psz_html += 3;
606                     psz_subtitle += 3;
607                 }
608                 else if( !strncasecmp( psz_subtitle, "<i>", 3 ) )
609                 {
610                     strcpy( psz_html, "<i>" );
611                     strcat( psz_tagStack, "i" );
612                     psz_html += 3;
613                     psz_subtitle += 3;
614                 }
615                 else if( !strncasecmp( psz_subtitle, "<u>", 3 ) )
616                 {
617                     strcpy( psz_html, "<u>" );
618                     strcat( psz_tagStack, "u" );
619                     psz_html += 3;
620                     psz_subtitle += 3;
621                 }
622                 else if( !strncasecmp( psz_subtitle, "<font ", 6 ))
623                 {
624                     const char *psz_attribs[] = { "face=\"", "family=\"", "size=\"",
625                             "color=\"", "outline-color=\"", "shadow-color=\"",
626                             "outline-level=\"", "shadow-level=\"", "back-color=\"",
627                             "alpha=\"", NULL };
628
629                     strcpy( psz_html, "<font " );
630                     strcat( psz_tagStack, "f" );
631                     psz_html += 6;
632                     psz_subtitle += 6;
633
634                     while( *psz_subtitle != '>' )
635                     {
636                         int  k;
637
638                         for( k=0; psz_attribs[ k ]; k++ )
639                         {
640                             int i_len = strlen( psz_attribs[ k ] );
641
642                             if( !strncasecmp( psz_subtitle, psz_attribs[ k ], i_len ))
643                             {
644                                 i_len += strcspn( psz_subtitle + i_len, "\"" ) + 1;
645
646                                 strncpy( psz_html, psz_subtitle, i_len );
647                                 psz_html += i_len;
648                                 psz_subtitle += i_len;
649                                 break;
650                             }
651                         }
652                         if( psz_attribs[ k ] == NULL )
653                         {
654                             /* Jump over unrecognised tag */
655                             int i_len = strcspn( psz_subtitle, "\"" ) + 1;
656
657                             i_len += strcspn( psz_subtitle + i_len, "\"" ) + 1;
658                             psz_subtitle += i_len;
659                         }
660                         while (*psz_subtitle == ' ')
661                             *psz_html++ = *psz_subtitle++;
662                     }
663                     *psz_html++ = *psz_subtitle++;
664                 }
665                 else if( !strncmp( psz_subtitle, "</", 2 ))
666                 {
667                     vlc_bool_t  b_match     = VLC_FALSE;
668                     int         i_len       = strlen( psz_tagStack ) - 1;
669                     char       *psz_lastTag = NULL;
670
671                     if( i_len >= 0 )
672                     {
673                         psz_lastTag = psz_tagStack + i_len;
674                         i_len = 0;
675
676                         switch( *psz_lastTag )
677                         {
678                             case 'b':
679                                 b_match = !strncasecmp( psz_subtitle, "</b>", 4 );
680                                 i_len   = 4;
681                                 break;
682                             case 'i':
683                                 b_match = !strncasecmp( psz_subtitle, "</i>", 4 );
684                                 i_len   = 4;
685                                 break;
686                             case 'u':
687                                 b_match = !strncasecmp( psz_subtitle, "</u>", 4 );
688                                 i_len   = 4;
689                                 break;
690                             case 'f':
691                                 b_match = !strncasecmp( psz_subtitle, "</font>", 7 );
692                                 i_len   = 7;
693                                 break;
694                         }
695                     }
696                     if( ! b_match )
697                     {
698                         /* Not well formed -- kill everything */
699                         free( psz_html_start );
700                         psz_html_start = NULL;
701                         break;
702                     }
703                     *psz_lastTag = '\0';
704                     strncpy( psz_html, psz_subtitle, i_len );
705                     psz_html += i_len;
706                     psz_subtitle += i_len;
707                 }
708                 else
709                 {
710                     psz_subtitle += strcspn( psz_subtitle, ">" );
711                 }
712             }
713             else if( *psz_subtitle == '&' )
714             {
715                 if( !strncasecmp( psz_subtitle, "&lt;", 4 ))
716                 {
717                     strcpy( psz_html, "&lt;" );
718                     psz_html += 4;
719                     psz_subtitle += 4;
720                 }
721                 else if( !strncasecmp( psz_subtitle, "&gt;", 4 ))
722                 {
723                     strcpy( psz_html, "&gt;" );
724                     psz_html += 4;
725                     psz_subtitle += 4;
726                 }
727                 else if( !strncasecmp( psz_subtitle, "&amp;", 5 ))
728                 {
729                     strcpy( psz_html, "&amp;" );
730                     psz_html += 5;
731                     psz_subtitle += 5;
732                 }
733                 else
734                 {
735                     strcpy( psz_html, "&amp;" );
736                     psz_html += 5;
737                     psz_subtitle++;
738                 }
739             }
740             else
741             {
742                 *psz_html = *psz_subtitle;
743                 if( psz_html > psz_html_start )
744                 {
745                     /* Check for double whitespace */
746                     if((( *psz_html == ' ' ) ||
747                         ( *psz_html == '\t' )) &&
748                        (( *(psz_html-1) == ' ' ) ||
749                         ( *(psz_html-1) == '\t' )))
750                     {
751                         strcpy( psz_html, NO_BREAKING_SPACE );
752                         psz_html += strlen( NO_BREAKING_SPACE ) - 1;
753                     }
754                 }
755                 psz_html++;
756                 psz_subtitle++;
757             }
758
759             if( ( size_t )( psz_html - psz_html_start ) > i_buf_size - 10 )
760             {
761                 int i_len = psz_html - psz_html_start;
762
763                 i_buf_size += 100;
764                 psz_html_start = realloc( psz_html_start, i_buf_size );
765                 psz_html = psz_html_start + i_len;
766                 *psz_html = '\0';
767             }
768         }
769         strcpy( psz_html, "</text>" );
770         psz_html += 7;
771
772         if( psz_tagStack[ 0 ] != '\0' )
773         {
774             /* Not well formed -- kill everything */
775             free( psz_html_start );
776             psz_html_start = NULL;
777         }
778         else if( psz_html_start )
779         {
780             /* Shrink the memory requirements */
781             psz_html_start = realloc( psz_html_start,  psz_html - psz_html_start + 1 );
782         }
783     }
784     return psz_html_start;
785 }