]> git.sesse.net Git - vlc/blob - modules/codec/subsdec.c
Avcodec: fix a sign warning
[vlc] / modules / codec / subsdec.c
1 /*****************************************************************************
2  * subsdec.c : text subtitles decoder
3  *****************************************************************************
4  * Copyright (C) 2000-2006 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Gildas Bazin <gbazin@videolan.org>
8  *          Samuel Hocevar <sam@zoy.org>
9  *          Derk-Jan Hartman <hartman at videolan dot org>
10  *          Bernie Purcell <bitmap@videolan.org>
11  *
12  * This program is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU General Public License as published by
14  * the Free Software Foundation; either version 2 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  * GNU General Public License for more details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with this program; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25  *****************************************************************************/
26
27 /*****************************************************************************
28  * Preamble
29  *****************************************************************************/
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33
34 #include <vlc_common.h>
35 #include <vlc_plugin.h>
36 #include <vlc_codec.h>
37 #include <vlc_charset.h>
38
39 #include "substext.h"
40
41 /*****************************************************************************
42  * Module descriptor.
43  *****************************************************************************/
/* Values selectable for the "subsdec-encoding" option.
 * The module descriptor pairs this table index-by-index with
 * ppsz_encoding_names, so both tables must keep the same length and order. */
static const char *const ppsz_encodings[] = {
    /* Default, system codeset and universal encodings */
    "", "system",
    "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "GB18030",

    /* ISO 8859 families and their Windows/IBM counterparts */
    "ISO-8859-15", "Windows-1252", "IBM850",
    "ISO-8859-2", "Windows-1250",
    "ISO-8859-3",
    "ISO-8859-10",
    "Windows-1251", "KOI8-R", "KOI8-U",
    "ISO-8859-6", "Windows-1256",
    "ISO-8859-7", "Windows-1253",
    "ISO-8859-8", "Windows-1255",
    "ISO-8859-9", "Windows-1254",
    "ISO-8859-11", "Windows-874",
    "ISO-8859-13", "Windows-1257",
    "ISO-8859-14",
    "ISO-8859-16",

    /* CJK families */
    "ISO-2022-CN-EXT", "EUC-CN",
    "ISO-2022-JP-2", "EUC-JP", "Shift_JIS",
    "CP949", "ISO-2022-KR",
    "Big5", "ISO-2022-TW", "Big5-HKSCS",

    /* Other */
    "VISCII", "Windows-1258",
};
89
/* Translatable labels for the entries of ppsz_encodings. The module
 * descriptor passes both tables together to change_string_list(), so entry i
 * here labels entry i of ppsz_encodings: keep both tables the same length and
 * in the same order. */
static const char *const ppsz_encoding_names[] = {
    /* xgettext:
      The character encoding name in parenthesis corresponds to that used for
      the GetACP translation. "Windows-1252" applies to Western European
      languages using the Latin alphabet. */
    N_("Default (Windows-1252)"),
    N_("System codeset"),
    N_("Universal (UTF-8)"),
    N_("Universal (UTF-16)"),
    N_("Universal (big endian UTF-16)"),
    N_("Universal (little endian UTF-16)"),
    N_("Universal, Chinese (GB18030)"),

  /* ISO 8859 and the likes */
    /* 1 */
    N_("Western European (Latin-9)"), /* mostly superset of Latin-1 */
    N_("Western European (Windows-1252)"),
    N_("Western European (IBM 00850)"),
    /* 2 */
    N_("Eastern European (Latin-2)"),
    N_("Eastern European (Windows-1250)"),
    /* 3 */
    N_("Esperanto (Latin-3)"),
    /* 4 */
    N_("Nordic (Latin-6)"), /* Latin 6 supersedes Latin 4 */
    /* 5 */
    N_("Cyrillic (Windows-1251)"), /* ISO 8859-5 is not practically used */
    N_("Russian (KOI8-R)"),
    N_("Ukrainian (KOI8-U)"),
    /* 6 */
    N_("Arabic (ISO 8859-6)"),
    N_("Arabic (Windows-1256)"),
    /* 7 */
    N_("Greek (ISO 8859-7)"),
    N_("Greek (Windows-1253)"),
    /* 8 */
    N_("Hebrew (ISO 8859-8)"),
    N_("Hebrew (Windows-1255)"),
    /* 9 */
    N_("Turkish (ISO 8859-9)"),
    N_("Turkish (Windows-1254)"),
    /* 10 -> 4 */
    /* 11 */
    N_("Thai (TIS 620-2533/ISO 8859-11)"),
    N_("Thai (Windows-874)"),
    /* 13 */
    N_("Baltic (Latin-7)"),
    N_("Baltic (Windows-1257)"),
    /* 12 -> /dev/null */
    /* 14 */
    N_("Celtic (Latin-8)"),
    /* 15 -> 1 */
    /* 16 */
    N_("South-Eastern European (Latin-10)"),
  /* CJK families */
    N_("Simplified Chinese (ISO-2022-CN-EXT)"),
    N_("Simplified Chinese Unix (EUC-CN)"),
    N_("Japanese (7-bits JIS/ISO-2022-JP-2)"),
    N_("Japanese Unix (EUC-JP)"),
    N_("Japanese (Shift JIS)"),
    N_("Korean (EUC-KR/CP949)"),
    N_("Korean (ISO-2022-KR)"),
    N_("Traditional Chinese (Big5)"),
    /* NOTE(review): this label says EUC-TW while the matching ppsz_encodings
     * entry is "ISO-2022-TW" -- confirm which one is intended. */
    N_("Traditional Chinese Unix (EUC-TW)"),
    N_("Hong-Kong Supplementary (HKSCS)"),
  /* Other */
    N_("Vietnamese (VISCII)"),
    N_("Vietnamese (Windows-1258)"),
};
159
/* Values and translatable labels offered for the "subsdec-align" option;
 * the module descriptor pairs the two tables index-by-index
 * (0 = Center, 1 = Left, 2 = Right). */
static const int  pi_justification[] = { 0, 1, 2 };
static const char *const ppsz_justification_text[] = {
    N_("Center"),N_("Left"),N_("Right")};
163
/* Translatable short labels (*_TEXT) and longer help strings (*_LONGTEXT)
 * used by the module descriptor for each configuration option. */
#define ENCODING_TEXT N_("Subtitles text encoding")
#define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
#define ALIGN_TEXT N_("Subtitles justification")
#define ALIGN_LONGTEXT N_("Set the justification of subtitles")
#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitles autodetection")
#define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
            "UTF-8 encoding within subtitles files.")
#define FORMAT_TEXT N_("Formatted Subtitles")
#define FORMAT_LONGTEXT N_("Some subtitle formats allow for text formatting. " \
 "VLC partly implements this, but you can choose to disable all formatting.")
174
/* Module entry points, defined later in this file and registered below */
static int  OpenDecoder   ( vlc_object_t * );
static void CloseDecoder  ( vlc_object_t * );

/* Module descriptor: declares a "decoder" capability (value 50) whose
 * open/close callbacks are OpenDecoder/CloseDecoder, and the four
 * configuration options read by this file:
 *   subsdec-align            integer, default 0, choices from pi_justification
 *   subsdec-encoding         string, default "", choices from ppsz_encodings
 *   subsdec-autodetect-utf8  boolean, default true
 *   subsdec-formatted        boolean, default true */
vlc_module_begin ()
    set_shortname( N_("Subtitles"))
    set_description( N_("Text subtitles decoder") )
    set_capability( "decoder", 50 )
    set_callbacks( OpenDecoder, CloseDecoder )
    set_category( CAT_INPUT )
    set_subcategory( SUBCAT_INPUT_SCODEC )

    add_integer( "subsdec-align", 0, ALIGN_TEXT, ALIGN_LONGTEXT,
                 false )
        change_integer_list( pi_justification, ppsz_justification_text )
    add_string( "subsdec-encoding", "",
                ENCODING_TEXT, ENCODING_LONGTEXT, false )
        change_string_list( ppsz_encodings, ppsz_encoding_names, 0 )
    add_bool( "subsdec-autodetect-utf8", true,
              AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, false )
    add_bool( "subsdec-formatted", true, FORMAT_TEXT, FORMAT_LONGTEXT,
                 false )
vlc_module_end ()
197
198 /*****************************************************************************
199  * Local prototypes
200  *****************************************************************************/
/* Numeric character reference for code point 160 (no-break space) */
#define NO_BREAKING_SPACE  "&#160;"

/* Per-decoder private state, allocated in OpenDecoder and released in
 * CloseDecoder. */
struct decoder_sys_t
{
    int                 i_align;          /* Subtitles alignment on the vout */

    /* Converter from the source charset to UTF-8; (vlc_iconv_t)-1 when no
     * conversion handle is open */
    vlc_iconv_t         iconv_handle;            /* handle to iconv instance */
    /* While true, ParseText checks each packet with IsUTF8() and skips the
     * iconv conversion; cleared on the first packet that fails the check */
    bool                b_autodetect_utf8;
};
210
211
/* Decoder callback: consumes one block and returns the resulting subpicture */
static subpicture_t   *DecodeBlock   ( decoder_t *, block_t ** );
/* Converts one packet to UTF-8 and builds the text subpicture from it */
static subpicture_t   *ParseText     ( decoder_t *, block_t * );
/* Returns a newly allocated copy of the text with markup tags removed */
static char           *StripTags      ( char * );
/* Returns a newly allocated styled version of the text, or NULL */
static char           *CreateHtmlSubtitle( int *pi_align, char * );
216
/*****************************************************************************
 * OpenDecoder: probe the decoder and return score
 *****************************************************************************
 * Tries to launch a decoder and return score so that the interface is able
 * to choose.
 *****************************************************************************/
223 static int OpenDecoder( vlc_object_t *p_this )
224 {
225     decoder_t     *p_dec = (decoder_t*)p_this;
226     decoder_sys_t *p_sys;
227
228     switch( p_dec->fmt_in.i_codec )
229     {
230         case VLC_CODEC_SUBT:
231         case VLC_CODEC_ITU_T140:
232             break;
233         default:
234             return VLC_EGENERIC;
235     }
236
237     p_dec->pf_decode_sub = DecodeBlock;
238     p_dec->fmt_out.i_cat = SPU_ES;
239     p_dec->fmt_out.i_codec = 0;
240
241     /* Allocate the memory needed to store the decoder's structure */
242     p_dec->p_sys = p_sys = calloc( 1, sizeof( *p_sys ) );
243     if( p_sys == NULL )
244         return VLC_ENOMEM;
245
246     /* init of p_sys */
247     p_sys->i_align = 0;
248     p_sys->iconv_handle = (vlc_iconv_t)-1;
249     p_sys->b_autodetect_utf8 = false;
250
251     const char *encoding;
252     char *var = NULL;
253
254     /* First try demux-specified encoding */
255     if( p_dec->fmt_in.i_codec == VLC_CODEC_ITU_T140 )
256         encoding = "UTF-8"; /* IUT T.140 is always using UTF-8 */
257     else
258     if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
259     {
260         encoding = p_dec->fmt_in.subs.psz_encoding;
261         msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
262                  encoding);
263     }
264     else
265     /* Second, try configured encoding */
266     if ((var = var_InheritString (p_dec, "subsdec-encoding")) != NULL)
267     {
268         msg_Dbg (p_dec, "trying configured character encoding: %s", var);
269         if (!strcmp (var, "system"))
270         {
271             free (var);
272             var = NULL;
273             encoding = "";
274             /* ^ iconv() treats "" as nl_langinfo(CODESET) */
275         }
276         else
277             encoding = var;
278     }
279     else
280     /* Third, try "local" encoding with optional UTF-8 autodetection */
281     {
282         /* xgettext:
283            The Windows ANSI code page most commonly used for this language.
284            VLC uses this as a guess of the subtitle files character set
285            (if UTF-8 and UTF-16 autodetection fails).
286            Western European languages normally use "CP1252", which is a
287            Microsoft-variant of ISO 8859-1. That suits the Latin alphabet.
288            Other scripts use other code pages.
289
290            This MUST be a valid iconv character set. If unsure, please refer
291            the VideoLAN translators mailing list. */
292         encoding = vlc_pgettext("GetACP", "CP1252");
293         msg_Dbg (p_dec, "trying default character encoding: %s", encoding);
294         if (var_InheritBool (p_dec, "subsdec-autodetect-utf8"))
295         {
296             msg_Dbg (p_dec, "using automatic UTF-8 detection");
297             p_sys->b_autodetect_utf8 = true;
298         }
299     }
300
301     if (strcasecmp (encoding, "UTF-8") && strcasecmp (encoding, "utf8"))
302     {
303         p_sys->iconv_handle = vlc_iconv_open ("UTF-8", encoding);
304         if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
305             msg_Err (p_dec, "cannot convert from %s: %m", encoding);
306     }
307     free (var);
308
309     p_sys->i_align = var_InheritInteger( p_dec, "subsdec-align" );
310
311     return VLC_SUCCESS;
312 }
313
314 /****************************************************************************
315  * DecodeBlock: the whole thing
316  ****************************************************************************
317  * This function must be fed with complete subtitles units.
318  ****************************************************************************/
319 static subpicture_t *DecodeBlock( decoder_t *p_dec, block_t **pp_block )
320 {
321     subpicture_t *p_spu;
322     block_t *p_block;
323
324     if( !pp_block || *pp_block == NULL )
325         return NULL;
326
327     p_block = *pp_block;
328     if( p_block->i_flags & (BLOCK_FLAG_DISCONTINUITY|BLOCK_FLAG_CORRUPTED) )
329     {
330         block_Release( p_block );
331         return NULL;
332     }
333
334     p_spu = ParseText( p_dec, p_block );
335
336     block_Release( p_block );
337     *pp_block = NULL;
338
339     return p_spu;
340 }
341
342 /*****************************************************************************
343  * CloseDecoder: clean up the decoder
344  *****************************************************************************/
345 static void CloseDecoder( vlc_object_t *p_this )
346 {
347     decoder_t *p_dec = (decoder_t *)p_this;
348     decoder_sys_t *p_sys = p_dec->p_sys;
349
350     if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
351         vlc_iconv_close( p_sys->iconv_handle );
352
353     free( p_sys );
354 }
355
/*****************************************************************************
 * ParseText: parse a text subtitle packet and send it to the video output
 *****************************************************************************/
359 static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
360 {
361     decoder_sys_t *p_sys = p_dec->p_sys;
362     subpicture_t *p_spu = NULL;
363     char *psz_subtitle = NULL;
364
365     /* We cannot display a subpicture with no date */
366     if( p_block->i_pts <= VLC_TS_INVALID )
367     {
368         msg_Warn( p_dec, "subtitle without a date" );
369         return NULL;
370     }
371
372     /* Check validity of packet data */
373     /* An "empty" line containing only \0 can be used to force
374        and ephemer picture from the screen */
375     if( p_block->i_buffer < 1 )
376     {
377         msg_Warn( p_dec, "no subtitle data" );
378         return NULL;
379     }
380
381     /* Should be resiliant against bad subtitles */
382     psz_subtitle = malloc( p_block->i_buffer + 1 );
383     if( psz_subtitle == NULL )
384         return NULL;
385     memcpy( psz_subtitle, p_block->p_buffer, p_block->i_buffer );
386     psz_subtitle[p_block->i_buffer] = '\0';
387
388     if( p_sys->iconv_handle == (vlc_iconv_t)-1 )
389     {
390         if (EnsureUTF8( psz_subtitle ) == NULL)
391         {
392             msg_Err( p_dec, "failed to convert subtitle encoding.\n"
393                      "Try manually setting a character-encoding "
394                      "before you open the file." );
395         }
396     }
397     else
398     {
399
400         if( p_sys->b_autodetect_utf8 )
401         {
402             if( IsUTF8( psz_subtitle ) == NULL )
403             {
404                 msg_Dbg( p_dec, "invalid UTF-8 sequence: "
405                          "disabling UTF-8 subtitles autodetection" );
406                 p_sys->b_autodetect_utf8 = false;
407             }
408         }
409
410         if( !p_sys->b_autodetect_utf8 )
411         {
412             size_t inbytes_left = strlen( psz_subtitle );
413             size_t outbytes_left = 6 * inbytes_left;
414             char *psz_new_subtitle = xmalloc( outbytes_left + 1 );
415             char *psz_convert_buffer_out = psz_new_subtitle;
416             const char *psz_convert_buffer_in = psz_subtitle;
417
418             size_t ret = vlc_iconv( p_sys->iconv_handle,
419                                     &psz_convert_buffer_in, &inbytes_left,
420                                     &psz_convert_buffer_out, &outbytes_left );
421
422             *psz_convert_buffer_out++ = '\0';
423             free( psz_subtitle );
424
425             if( ( ret == (size_t)(-1) ) || inbytes_left )
426             {
427                 free( psz_new_subtitle );
428                 msg_Err( p_dec, "failed to convert subtitle encoding.\n"
429                         "Try manually setting a character-encoding "
430                                 "before you open the file." );
431                 return NULL;
432             }
433
434             psz_subtitle = realloc( psz_new_subtitle,
435                                     psz_convert_buffer_out - psz_new_subtitle );
436             if( !psz_subtitle )
437                 psz_subtitle = psz_new_subtitle;
438         }
439     }
440
441     /* Create the subpicture unit */
442     p_spu = decoder_NewSubpictureText( p_dec );
443     if( !p_spu )
444     {
445         free( psz_subtitle );
446         return NULL;
447     }
448     p_spu->i_start    = p_block->i_pts;
449     p_spu->i_stop     = p_block->i_pts + p_block->i_length;
450     p_spu->b_ephemer  = (p_block->i_length == 0);
451     p_spu->b_absolute = false;
452
453     subpicture_updater_sys_t *p_spu_sys = p_spu->updater.p_sys;
454
455     p_spu_sys->align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
456     p_spu_sys->text  = StripTags( psz_subtitle );
457     if( var_InheritBool( p_dec, "subsdec-formatted" ) )
458         p_spu_sys->html = CreateHtmlSubtitle( &p_spu_sys->align, psz_subtitle );
459
460     free( psz_subtitle );
461
462     return p_spu;
463 }
464
/* Function now handles tags with attribute values, and tries
 * to deal with '&' character entities (such as &lt;) too. It no longer
 * modifies the string in place, so that the original text can be reused
 */
/* Returns a newly allocated plain-text copy of psz_subtitle:
 *  - everything from '<' up to the matching '>' is dropped, except that
 *    "<br/>" (any case) becomes a newline;
 *  - "&lt;", "&gt;", "&amp;" and "&quot;" (any case) are decoded;
 *  - any other '&' is copied unchanged.
 * A tag that is never closed swallows the rest of the string.
 * The input is not modified. Returns NULL if the allocation fails; the
 * caller owns the result. */
static char *StripTags( char *psz_subtitle )
{
    static const struct
    {
        const char *psz_name;
        size_t      i_len;
        char        c_plain;
    } p_entities[] = {
        { "&lt;",   4, '<'  },
        { "&gt;",   4, '>'  },
        { "&amp;",  5, '&'  },
        { "&quot;", 6, '\"' },
    };
    const size_t i_entities = sizeof( p_entities ) / sizeof( p_entities[0] );

    /* The output is never longer than the input */
    char *psz_out_start = malloc( strlen( psz_subtitle ) + 1 );
    if( psz_out_start == NULL )
        return NULL;

    char *psz_out = psz_out_start;

    for( const char *p_in = psz_subtitle; *p_in != '\0'; p_in++ )
    {
        switch( *p_in )
        {
        case '<':
            if( !strncasecmp( p_in, "<br/>", 5 ) )
                *psz_out++ = '\n';

            /* Land on the closing '>' (or on the end of the string) */
            p_in += strcspn( p_in, ">" );
            break;

        case '&':
        {
            size_t i = 0;
            while( i < i_entities &&
                   strncasecmp( p_in, p_entities[i].psz_name,
                                p_entities[i].i_len ) != 0 )
                i++;

            if( i < i_entities )
            {
                *psz_out++ = p_entities[i].c_plain;
                /* Land on the ';' that ends the entity */
                p_in += p_entities[i].i_len - 1;
            }
            else
            {
                /* Assume it is just a normal ampersand */
                *psz_out++ = '&';
            }
            break;
        }

        default:
            *psz_out++ = *p_in;
            break;
        }

        /* An unterminated tag leaves us on the final NUL: stop before the
         * loop increment steps past it */
        if( *p_in == '\0' )
            break;
    }
    *psz_out = '\0';

    /* Give back the unused tail; keep the original buffer if that fails */
    char *psz_shrunk = realloc( psz_out_start,
                                (size_t)( psz_out - psz_out_start ) + 1 );
    if( psz_shrunk != NULL )
        psz_out_start = psz_shrunk;

    return psz_out_start;
}
531
532 /* Try to respect any style tags present in the subtitle string. The main
533  * problem here is a lack of adequate specs for the subtitle formats.
 * SSA/ASS and USF are both specified in detail -- but they are handled elsewhere.
535  * SAMI has a detailed spec, but extensive rework is needed in the demux
536  * code to prevent all this style information being excised, as it presently
537  * does.
538  * That leaves the others - none of which were (I guess) originally intended
539  * to be carrying style information. Over time people have used them that way.
540  * In the absence of specifications from which to work, the tags supported
541  * have been restricted to the simple set permitted by the USF DTD, ie. :
542  *  Basic: <br>, <i>, <b>, <u>, <s>
543  *  Extended: <font>
544  *    Attributes: face
545  *                family
546  *                size
547  *                color
548  *                outline-color
549  *                shadow-color
550  *                outline-level
551  *                shadow-level
552  *                back-color
553  *                alpha
554  * There is also the further restriction that the subtitle be well-formed
555  * as an XML entity, ie. the HTML sentence:
556  *        <b><i>Bold and Italics</b></i>
557  * doesn't qualify because the tags aren't nested one inside the other.
558  * <text> tags are automatically added to the output to ensure
559  * well-formedness.
560  * If the text doesn't qualify for any reason, a NULL string is
561  * returned, and the rendering engine will fall back to the
562  * plain text version of the subtitle.
563  */
/* Appends at most i_max characters of psz_text at *ppsz_html and advances
 * *ppsz_html past the characters written.
 *
 * Unlike the previous strncpy()-based version, the output is always
 * NUL-terminated (the terminator is not counted in the advance, exactly as
 * in HtmlPut), nothing is zero-padded beyond it, and a zero or negative
 * i_max copies nothing instead of being turned into a huge unsigned count.
 *
 * ppsz_html : in/out write cursor; the destination must have room for the
 *             copied characters plus one terminating NUL
 * psz_text  : source text; read up to its NUL or i_max bytes, whichever
 *             comes first
 * i_max     : maximum number of characters to copy */
static void HtmlNPut( char **ppsz_html, const char *psz_text, int i_max )
{
    size_t i_copy = 0;

    if( i_max > 0 )
    {
        /* Bounded length: never look past the first i_max bytes */
        const char *p_end = memchr( psz_text, '\0', (size_t)i_max );
        i_copy = ( p_end != NULL ) ? (size_t)( p_end - psz_text )
                                   : (size_t)i_max;
    }

    memcpy( *ppsz_html, psz_text, i_copy );
    (*ppsz_html)[i_copy] = '\0';
    *ppsz_html += i_copy;
}
571
/* Appends the whole of psz_text, terminating NUL included, at *ppsz_html and
 * advances *ppsz_html to that NUL so the next write continues from there.
 * The destination must have room for strlen(psz_text) + 1 bytes. */
static void HtmlPut( char **ppsz_html, const char *psz_text )
{
    const size_t i_len = strlen( psz_text );

    memcpy( *ppsz_html, psz_text, i_len + 1 );
    *ppsz_html += i_len;
}
/* Writes psz_text to the output with HtmlPut and skips the same number of
 * characters in the input: used when the input is known to start with a
 * token equivalent to psz_text. */
static void HtmlCopy( char **ppsz_html, char **ppsz_subtitle, const char *psz_text )
{
    const size_t i_consumed = strlen( psz_text );

    HtmlPut( ppsz_html, psz_text );
    *ppsz_subtitle += i_consumed;
}
582
583 static char *CreateHtmlSubtitle( int *pi_align, char *psz_subtitle )
584 {
585     /* */
586     char *psz_tag = malloc( ( strlen( psz_subtitle ) / 3 ) + 1 );
587     if( !psz_tag )
588         return NULL;
589     psz_tag[ 0 ] = '\0';
590
591     /* */
592     //Oo + 100 ???
593     size_t i_buf_size = strlen( psz_subtitle ) + 100;
594     char   *psz_html_start = malloc( i_buf_size );
595     char   *psz_html = psz_html_start;
596     if( psz_html_start == NULL )
597     {
598         free( psz_tag );
599         return NULL;
600     }
601     psz_html[0] = '\0';
602
603     bool b_has_align = false;
604
605     HtmlPut( &psz_html, "<text>" );
606
607     /* */
608     while( *psz_subtitle )
609     {
610         if( *psz_subtitle == '\n' )
611         {
612             HtmlPut( &psz_html, "<br/>" );
613             psz_subtitle++;
614         }
615         else if( *psz_subtitle == '<' )
616         {
617             if( !strncasecmp( psz_subtitle, "<br/>", 5 ))
618             {
619                 HtmlCopy( &psz_html, &psz_subtitle, "<br/>" );
620             }
621             else if( !strncasecmp( psz_subtitle, "<b>", 3 ) )
622             {
623                 HtmlCopy( &psz_html, &psz_subtitle, "<b>" );
624                 strcat( psz_tag, "b" );
625             }
626             else if( !strncasecmp( psz_subtitle, "<i>", 3 ) )
627             {
628                 HtmlCopy( &psz_html, &psz_subtitle, "<i>" );
629                 strcat( psz_tag, "i" );
630             }
631             else if( !strncasecmp( psz_subtitle, "<u>", 3 ) )
632             {
633                 HtmlCopy( &psz_html, &psz_subtitle, "<u>" );
634                 strcat( psz_tag, "u" );
635             }
636             else if( !strncasecmp( psz_subtitle, "<s>", 3 ) )
637             {
638                 HtmlCopy( &psz_html, &psz_subtitle, "<s>" );
639                 strcat( psz_tag, "s" );
640             }
641             else if( !strncasecmp( psz_subtitle, "<font ", 6 ))
642             {
643                 const char *psz_attribs[] = { "face=", "family=", "size=",
644                         "color=", "outline-color=", "shadow-color=",
645                         "outline-level=", "shadow-level=", "back-color=",
646                         "alpha=", NULL };
647
648                 HtmlCopy( &psz_html, &psz_subtitle, "<font " );
649                 strcat( psz_tag, "f" );
650
651                 while( *psz_subtitle != '>' )
652                 {
653                     int  k;
654
655                     for( k=0; psz_attribs[ k ]; k++ )
656                     {
657                         int i_len = strlen( psz_attribs[ k ] );
658
659                         if( !strncasecmp( psz_subtitle, psz_attribs[k], i_len ) )
660                         {
661                             /* */
662                             HtmlPut( &psz_html, psz_attribs[k] );
663                             psz_subtitle += i_len;
664
665                             /* */
666                             if( *psz_subtitle == '"' )
667                             {
668                                 psz_subtitle++;
669                                 i_len = strcspn( psz_subtitle, "\"" );
670                             }
671                             else if( *psz_subtitle == '\'' )
672                             {
673                                 psz_subtitle++;
674                                 i_len = strcspn( psz_subtitle, "'" );
675                             }
676                             else
677                             {
678                                 i_len = strcspn( psz_subtitle, " \t>" );
679                             }
680                             HtmlPut( &psz_html, "\"" );
681                             HtmlNPut( &psz_html, psz_subtitle, i_len );
682                             HtmlPut( &psz_html, "\"" );
683
684                             psz_subtitle += i_len;
685                             if( *psz_subtitle == '\"' || *psz_subtitle == '\'' )
686                                 psz_subtitle++;
687                             break;
688                         }
689                     }
690                     if( psz_attribs[ k ] == NULL )
691                     {
692                         /* Jump over unrecognised tag */
693                         int i_len = strcspn( psz_subtitle, "\"" );
694                         if( psz_subtitle[i_len] == '\"' )
695                         {
696                             i_len += 1 + strcspn( &psz_subtitle[i_len + 1], "\"" );
697                             if( psz_subtitle[i_len] == '\"' )
698                                 i_len++;
699                         }
700                         /* Not a tag, something else we do not understand */
701                         if( i_len == 0 )
702                             *psz_subtitle++;
703
704                         psz_subtitle += i_len;
705                     }
706                     while (*psz_subtitle == ' ')
707                         *psz_html++ = *psz_subtitle++;
708                 }
709                 *psz_html++ = '>';
710                 *psz_subtitle++;
711             }
712             else if( !strncmp( psz_subtitle, "</", 2 ))
713             {
714                 bool   b_match     = false;
715                 bool   b_ignore    = false;
716                 int    i_len       = strlen( psz_tag ) - 1;
717                 char  *psz_lastTag = NULL;
718
719                 if( i_len >= 0 )
720                 {
721                     psz_lastTag = psz_tag + i_len;
722                     i_len = 0;
723
724                     switch( *psz_lastTag )
725                     {
726                     case 'b':
727                         b_match = !strncasecmp( psz_subtitle, "</b>", 4 );
728                         i_len   = 4;
729                         break;
730                     case 'i':
731                         b_match = !strncasecmp( psz_subtitle, "</i>", 4 );
732                         i_len   = 4;
733                         break;
734                     case 'u':
735                         b_match = !strncasecmp( psz_subtitle, "</u>", 4 );
736                         i_len   = 4;
737                         break;
738                     case 's':
739                         b_match = !strncasecmp( psz_subtitle, "</s>", 4 );
740                         i_len   = 4;
741                         break;
742                     case 'f':
743                         b_match = !strncasecmp( psz_subtitle, "</font>", 7 );
744                         i_len   = 7;
745                         break;
746                     case 'I':
747                         i_len = strcspn( psz_subtitle, ">" );
748                         b_match = psz_subtitle[i_len] == '>';
749                         b_ignore = true;
750                         if( b_match )
751                             i_len++;
752                         break;
753                     }
754                 }
755                 if( !b_match )
756                 {
757                     /* Not well formed -- kill everything */
758                     free( psz_html_start );
759                     psz_html_start = NULL;
760                     break;
761                 }
762                 *psz_lastTag = '\0';
763                 if( !b_ignore )
764                     HtmlNPut( &psz_html, psz_subtitle, i_len );
765
766                 psz_subtitle += i_len;
767             }
768             else if( ( psz_subtitle[1] < 'a' || psz_subtitle[1] > 'z' ) &&
769                      ( psz_subtitle[1] < 'A' || psz_subtitle[1] > 'Z' ) )
770             {
771                 /* We have a single < */
772                 HtmlPut( &psz_html, "&lt;" );
773                 psz_subtitle++;
774             }
775             else
776             {
777                 /* We have an unknown tag or a single < */
778
779                 /* Search for the next tag or end of tag or end of string */
780                 char *psz_stop = psz_subtitle + 1 + strcspn( &psz_subtitle[1], "<>" );
781                 char *psz_closing = strstr( psz_subtitle, "/>" );
782
783                 if( psz_closing && psz_closing < psz_stop )
784                 {
785                     /* We have a self closed tag, remove it */
786                     psz_subtitle = &psz_closing[2];
787                 }
788                 else if( *psz_stop == '>' )
789                 {
790                     char psz_match[256];
791
792                     snprintf( psz_match, sizeof(psz_match), "</%s", &psz_subtitle[1] );
793                     psz_match[strcspn( psz_match, " \t>" )] = '\0';
794
795                     if( strstr( psz_subtitle, psz_match ) )
796                     {
797                         /* We have the closing tag, ignore it TODO */
798                         psz_subtitle = &psz_stop[1];
799                         strcat( psz_tag, "I" );
800                     }
801                     else
802                     {
803                         int i_len = psz_stop + 1 - psz_subtitle;
804
805                         /* Copy the whole data */
806                         for( ; i_len > 0; i_len--, psz_subtitle++ )
807                         {
808                             if( *psz_subtitle == '<' )
809                                 HtmlPut( &psz_html, "&lt;" );
810                             else if( *psz_subtitle == '>' )
811                                 HtmlPut( &psz_html, "&gt;" );
812                             else
813                                 *psz_html++ = *psz_subtitle;
814                         }
815                     }
816                 }
817                 else
818                 {
819                     /* We have a single < */
820                     HtmlPut( &psz_html, "&lt;" );
821                     psz_subtitle++;
822                 }
823             }
824         }
825         else if( *psz_subtitle == '&' )
826         {
827             if( !strncasecmp( psz_subtitle, "&lt;", 4 ))
828             {
829                 HtmlCopy( &psz_html, &psz_subtitle, "&lt;" );
830             }
831             else if( !strncasecmp( psz_subtitle, "&gt;", 4 ))
832             {
833                 HtmlCopy( &psz_html, &psz_subtitle, "&gt;" );
834             }
835             else if( !strncasecmp( psz_subtitle, "&amp;", 5 ))
836             {
837                 HtmlCopy( &psz_html, &psz_subtitle, "&amp;" );
838             }
839             else
840             {
841                 HtmlPut( &psz_html, "&amp;" );
842                 psz_subtitle++;
843             }
844         }
845         else if( *psz_subtitle == '>' )
846         {
847             HtmlPut( &psz_html, "&gt;" );
848             psz_subtitle++;
849         }
850         else if( psz_subtitle[0] == '{' && psz_subtitle[1] == '\\' &&
851                  strchr( psz_subtitle, '}' ) )
852         {
853             /* Check for forced alignment */
854             if( !b_has_align &&
855                 !strncmp( psz_subtitle, "{\\an", 4 ) && psz_subtitle[4] >= '1' && psz_subtitle[4] <= '9' && psz_subtitle[5] == '}' )
856             {
857                 static const int pi_vertical[3] = { SUBPICTURE_ALIGN_BOTTOM, 0, SUBPICTURE_ALIGN_TOP };
858                 static const int pi_horizontal[3] = { SUBPICTURE_ALIGN_LEFT, 0, SUBPICTURE_ALIGN_RIGHT };
859                 const int i_id = psz_subtitle[4] - '1';
860
861                 b_has_align = true;
862                 *pi_align = pi_vertical[i_id/3] | pi_horizontal[i_id%3];
863             }
864             /* TODO fr -> rotation */
865
866             /* Hide {\stupidity} */
867             psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
868         }
869         else if( psz_subtitle[0] == '{' &&
870                 ( psz_subtitle[1] == 'Y' || psz_subtitle[1] == 'y' )
871                 && psz_subtitle[2] == ':' && strchr( psz_subtitle, '}' ) )
872         {
873             // FIXME: We don't distinguish between Y and y, and we should.
874             if( psz_subtitle[3] == 'i' )
875             {
876                 HtmlPut( &psz_html, "<i>" );
877                 strcat( psz_tag, "i" );
878             }
879             if( psz_subtitle[3] == 'b' )
880             {
881                 HtmlPut( &psz_html, "<b>" );
882                 strcat( psz_tag, "b" );
883             }
884             if( psz_subtitle[3] == 'u' )
885             {
886                 HtmlPut( &psz_html, "<u>" );
887                 strcat( psz_tag, "u" );
888             }
889             psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
890         }
891         else if( psz_subtitle[0] == '{' &&  psz_subtitle[2] == ':' && strchr( psz_subtitle, '}' ) )
892         {
893             // Hide other {x:y} atrocities, like {c:$bbggrr} or {P:x}
894             psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
895         }
896         else if( psz_subtitle[0] == '\\' && psz_subtitle[1] )
897         {
898             if( psz_subtitle[1] == 'N' || psz_subtitle[1] == 'n' )
899             {
900                 HtmlPut( &psz_html, "<br/>" );
901                 psz_subtitle += 2;
902             }
903             else if( psz_subtitle[1] == 'h' )
904             {
905                 /* Non breakable space */
906                 HtmlPut( &psz_html, NO_BREAKING_SPACE );
907                 psz_subtitle += 2;
908             }
909             else
910             {
911                 HtmlPut( &psz_html, "\\" );
912                 psz_subtitle++;
913             }
914         }
915         else
916         {
917             *psz_html = *psz_subtitle;
918             if( psz_html > psz_html_start )
919             {
920                 /* Check for double whitespace */
921                 if( ( *psz_html == ' '  || *psz_html == '\t' ) &&
922                     ( *(psz_html-1) == ' ' || *(psz_html-1) == '\t' ) )
923                 {
924                     HtmlPut( &psz_html, NO_BREAKING_SPACE );
925                     psz_html--;
926                 }
927             }
928             psz_html++;
929             psz_subtitle++;
930         }
931
932         if( ( size_t )( psz_html - psz_html_start ) > i_buf_size - 50 )
933         {
934             const int i_len = psz_html - psz_html_start;
935
936             i_buf_size += 200;
937             char *psz_new = realloc( psz_html_start, i_buf_size );
938             if( !psz_new )
939                 break;
940             psz_html_start = psz_new;
941             psz_html = &psz_new[i_len];
942         }
943     }
944     if( psz_html_start )
945     {
946         static const char *psz_text_close = "</text>";
947         static const char *psz_tag_long = "/font>";
948
949         /* Realloc for closing tags and shrink memory */
950         const size_t i_length = (size_t)( psz_html - psz_html_start );
951
952         const size_t i_size = i_length + strlen(psz_tag_long) * strlen(psz_tag) + strlen(psz_text_close) + 1;
953         char *psz_new = realloc( psz_html_start, i_size );
954         if( psz_new )
955         {
956             psz_html_start = psz_new;
957             psz_html = &psz_new[i_length];
958
959             /* Close any tags left open by a subtitle that is not well formed */
960             while( *psz_tag )
961             {
962                 /* */
963                 char *psz_last = &psz_tag[strlen(psz_tag)-1];
964                 switch( *psz_last )
965                 {
966                 case 'b':
967                     HtmlPut( &psz_html, "</b>" );
968                     break;
969                 case 'i':
970                     HtmlPut( &psz_html, "</i>" );
971                     break;
972                 case 'u':
973                     HtmlPut( &psz_html, "</u>" );
974                     break;
975                 case 's':
976                     HtmlPut( &psz_html, "</s>" );
977                     break;
978                 case 'f':
979                     HtmlPut( &psz_html, "/font>" );
980                     break;
981                 case 'I':
982                     break;
983                 }
984
985                 *psz_last = '\0';
986             }
987             HtmlPut( &psz_html, psz_text_close );
988         }
989     }
990     free( psz_tag );
991
992     return psz_html_start;
993 }
994