/*****************************************************************************
- * subsdec.c : text subtitles decoder
+ * subsdec.c : text subtitle decoder
*****************************************************************************
- * Copyright (C) 2000-2006 the VideoLAN team
+ * Copyright (C) 2000-2006 VLC authors and VideoLAN
* $Id$
*
* Authors: Gildas Bazin <gbazin@videolan.org>
* Derk-Jan Hartman <hartman at videolan dot org>
* Bernie Purcell <bitmap@videolan.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/*****************************************************************************
# include "config.h"
#endif
-#include "subsdec.h"
-#include <vlc_plugin.h>
-
-/*****************************************************************************
- * Local prototypes
- *****************************************************************************/
-static int OpenDecoder ( vlc_object_t * );
-static void CloseDecoder ( vlc_object_t * );
+#include <limits.h>
-static subpicture_t *DecodeBlock ( decoder_t *, block_t ** );
-static subpicture_t *ParseText ( decoder_t *, block_t * );
-static char *StripTags ( char * );
-static char *CreateHtmlSubtitle( int *pi_align, char * );
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_codec.h>
+#include <vlc_charset.h>
+#include "substext.h"
/*****************************************************************************
* Module descriptor.
"GB18030",
"ISO-8859-15",
"Windows-1252",
+ "IBM850",
"ISO-8859-2",
"Windows-1250",
"ISO-8859-3",
/* 1 */
N_("Western European (Latin-9)"), /* mostly superset of Latin-1 */
N_("Western European (Windows-1252)"),
+ N_("Western European (IBM 00850)"),
/* 2 */
N_("Eastern European (Latin-2)"),
N_("Eastern European (Windows-1250)"),
static const char *const ppsz_justification_text[] = {
N_("Center"),N_("Left"),N_("Right")};
-#define ENCODING_TEXT N_("Subtitles text encoding")
+#define ENCODING_TEXT N_("Subtitle text encoding")
#define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
-#define ALIGN_TEXT N_("Subtitles justification")
+#define ALIGN_TEXT N_("Subtitle justification")
#define ALIGN_LONGTEXT N_("Set the justification of subtitles")
-#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitles autodetection")
+#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitle autodetection")
#define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
- "UTF-8 encoding within subtitles files.")
+ "UTF-8 encoding within subtitle files.")
#define FORMAT_TEXT N_("Formatted Subtitles")
#define FORMAT_LONGTEXT N_("Some subtitle formats allow for text formatting. " \
"VLC partly implements this, but you can choose to disable all formatting.")
+static int OpenDecoder ( vlc_object_t * );
+static void CloseDecoder ( vlc_object_t * );
vlc_module_begin ()
set_shortname( N_("Subtitles"))
- set_description( N_("Text subtitles decoder") )
+ set_description( N_("Text subtitle decoder") )
set_capability( "decoder", 50 )
set_callbacks( OpenDecoder, CloseDecoder )
set_category( CAT_INPUT )
change_integer_list( pi_justification, ppsz_justification_text )
add_string( "subsdec-encoding", "",
ENCODING_TEXT, ENCODING_LONGTEXT, false )
- change_string_list( ppsz_encodings, ppsz_encoding_names, 0 )
+ change_string_list( ppsz_encodings, ppsz_encoding_names )
add_bool( "subsdec-autodetect-utf8", true,
AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, false )
add_bool( "subsdec-formatted", true, FORMAT_TEXT, FORMAT_LONGTEXT,
false )
vlc_module_end ()
+/*****************************************************************************
+ * Local prototypes
+ *****************************************************************************/
+#define NO_BREAKING_SPACE " "
+
+struct decoder_sys_t /* decoder private state; the code reaches it through p_dec->p_sys */
+{
+ int i_align; /* Subtitle alignment on the vout, loaded from the "subsdec-align" option */
+
+ vlc_iconv_t iconv_handle; /* iconv converter towards UTF-8; (vlc_iconv_t)-1 when none is open */
+ bool b_autodetect_utf8; /* starts out false; NOTE(review): presumably driven by "subsdec-autodetect-utf8",
+ * but the assignment is not visible in this excerpt - confirm */
+};
+
+
+static subpicture_t *DecodeBlock ( decoder_t *, block_t ** );
+static subpicture_t *ParseText ( decoder_t *, block_t * );
+static char *StripTags ( char * );
+static char *CreateHtmlSubtitle( int *pi_align, char * );
+
/*****************************************************************************
* OpenDecoder: probe the decoder and return score
*****************************************************************************
p_sys->i_align = 0;
p_sys->iconv_handle = (vlc_iconv_t)-1;
p_sys->b_autodetect_utf8 = false;
- p_sys->i_original_height = -1;
- p_sys->i_original_width = -1;
- TAB_INIT( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
- TAB_INIT( p_sys->i_images, p_sys->pp_images );
- char *psz_charset = NULL;
+ const char *encoding;
+ char *var = NULL;
/* First try demux-specified encoding */
if( p_dec->fmt_in.i_codec == VLC_CODEC_ITU_T140 )
- psz_charset = strdup( "UTF-8" ); /* IUT T.140 is always using UTF-8 */
+ encoding = "UTF-8"; /* ITU T.140 is always using UTF-8 */
else
if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
{
- psz_charset = strdup (p_dec->fmt_in.subs.psz_encoding);
+ encoding = p_dec->fmt_in.subs.psz_encoding;
msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
- p_dec->fmt_in.subs.psz_encoding ?
- p_dec->fmt_in.subs.psz_encoding : "not specified");
+ encoding);
}
-
- /* Second, try configured encoding */
- if (psz_charset == NULL)
+ else
{
- psz_charset = var_InheritString (p_dec, "subsdec-encoding");
- msg_Dbg (p_dec, "trying configured character encoding: %s",
- psz_charset ? psz_charset : "not specified");
- if (psz_charset != NULL && !strcmp (psz_charset, "system"))
+ /* Second, try configured encoding */
+ if ((var = var_InheritString (p_dec, "subsdec-encoding")) != NULL)
{
- free (psz_charset);
- psz_charset = strdup ("");
- /* ^ iconv() treats "" as nl_langinfo(CODESET) */
+ msg_Dbg (p_dec, "trying configured character encoding: %s", var);
+ if (!strcmp (var, "system"))
+ {
+ free (var);
+ var = NULL;
+ encoding = "";
+ /* ^ iconv() treats "" as nl_langinfo(CODESET) */
+ }
+ else
+ encoding = var;
}
- }
-
- /* Third, try "local" encoding with optional UTF-8 autodetection */
- if (psz_charset == NULL)
- {
+ else
+ /* Third, try "local" encoding */
+ {
/* xgettext:
The Windows ANSI code page most commonly used for this language.
VLC uses this as a guess of the subtitle files character set
This MUST be a valid iconv character set. If unsure, please refer
the VideoLAN translators mailing list. */
- const char *acp = vlc_pgettext("GetACP", "CP1252");
-
- psz_charset = strdup (acp);
- msg_Dbg (p_dec, "trying default character encoding: %s",
- psz_charset ? psz_charset : "not specified");
+ encoding = vlc_pgettext("GetACP", "CP1252");
+ msg_Dbg (p_dec, "trying default character encoding: %s", encoding);
+ }
+ /* Check UTF-8 autodetection */
if (var_InheritBool (p_dec, "subsdec-autodetect-utf8"))
{
msg_Dbg (p_dec, "using automatic UTF-8 detection");
}
}
- /* Forth, don't do character decoding, i.e. assume UTF-8 */
- if (psz_charset == NULL)
+ if (strcasecmp (encoding, "UTF-8") && strcasecmp (encoding, "utf8"))
{
- psz_charset = strdup ("UTF-8");
- msg_Dbg (p_dec, "using UTF-8 character encoding" );
- }
-
- if ((psz_charset != NULL)
- && strcasecmp (psz_charset, "UTF-8")
- && strcasecmp (psz_charset, "utf8"))
- {
- p_sys->iconv_handle = vlc_iconv_open ("UTF-8", psz_charset);
+ p_sys->iconv_handle = vlc_iconv_open ("UTF-8", encoding);
if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
- msg_Err (p_dec, "cannot convert from %s: %m", psz_charset);
+ msg_Err (p_dec, "cannot convert from %s: %m", encoding);
}
- free (psz_charset);
+ free (var);
p_sys->i_align = var_InheritInteger( p_dec, "subsdec-align" );
if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
vlc_iconv_close( p_sys->iconv_handle );
- if( p_sys->pp_ssa_styles )
- {
- int i;
- for( i = 0; i < p_sys->i_ssa_styles; i++ )
- {
- if( !p_sys->pp_ssa_styles[i] )
- continue;
-
- free( p_sys->pp_ssa_styles[i]->psz_stylename );
- free( p_sys->pp_ssa_styles[i]->font_style.psz_fontname );
- free( p_sys->pp_ssa_styles[i] );
- }
- TAB_CLEAN( p_sys->i_ssa_styles, p_sys->pp_ssa_styles );
- }
- if( p_sys->pp_images )
- {
- int i;
- for( i = 0; i < p_sys->i_images; i++ )
- {
- if( !p_sys->pp_images[i] )
- continue;
-
- if( p_sys->pp_images[i]->p_pic )
- picture_Release( p_sys->pp_images[i]->p_pic );
- free( p_sys->pp_images[i]->psz_filename );
-
- free( p_sys->pp_images[i] );
- }
- TAB_CLEAN( p_sys->i_images, p_sys->pp_images );
- }
-
free( p_sys );
}
decoder_sys_t *p_sys = p_dec->p_sys;
subpicture_t *p_spu = NULL;
char *psz_subtitle = NULL;
- video_format_t fmt;
/* We cannot display a subpicture with no date */
if( p_block->i_pts <= VLC_TS_INVALID )
}
/* Create the subpicture unit */
- p_spu = decoder_NewSubpicture( p_dec, NULL );
+ p_spu = decoder_NewSubpictureText( p_dec );
if( !p_spu )
{
- msg_Warn( p_dec, "can't get spu buffer" );
- free( psz_subtitle );
- return NULL;
- }
-
- /* Create a new subpicture region */
- memset( &fmt, 0, sizeof(video_format_t) );
- fmt.i_chroma = VLC_CODEC_TEXT;
- fmt.i_width = fmt.i_height = 0;
- fmt.i_x_offset = fmt.i_y_offset = 0;
- p_spu->p_region = subpicture_region_New( &fmt );
- if( !p_spu->p_region )
- {
- msg_Err( p_dec, "cannot allocate SPU region" );
free( psz_subtitle );
- decoder_DeleteSubpicture( p_dec, p_spu );
return NULL;
}
+ p_spu->i_start = p_block->i_pts;
+ p_spu->i_stop = p_block->i_pts + p_block->i_length;
+ p_spu->b_ephemer = (p_block->i_length == 0);
+ p_spu->b_absolute = false;
- /* Normal text subs, easy markup */
- p_spu->p_region->i_align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
- p_spu->p_region->i_x = p_sys->i_align ? 20 : 0;
- p_spu->p_region->i_y = 10;
-
- /* Remove formatting from string */
+ subpicture_updater_sys_t *p_spu_sys = p_spu->updater.p_sys;
- p_spu->p_region->psz_text = StripTags( psz_subtitle );
+ p_spu_sys->align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
+ p_spu_sys->text = StripTags( psz_subtitle );
if( var_InheritBool( p_dec, "subsdec-formatted" ) )
- {
- p_spu->p_region->psz_html = CreateHtmlSubtitle( &p_spu->p_region->i_align, psz_subtitle );
- }
+ p_spu_sys->html = CreateHtmlSubtitle( &p_spu_sys->align, psz_subtitle );
- p_spu->i_start = p_block->i_pts;
- p_spu->i_stop = p_block->i_pts + p_block->i_length;
- p_spu->b_ephemer = (p_block->i_length == 0);
- p_spu->b_absolute = false;
free( psz_subtitle );
return p_spu;
}
-char* GotoNextLine( char *psz_text )
-{
- char *p_newline = psz_text;
-
- while( p_newline[0] != '\0' )
- {
- if( p_newline[0] == '\n' || p_newline[0] == '\r' )
- {
- p_newline++;
- while( p_newline[0] == '\n' || p_newline[0] == '\r' )
- p_newline++;
- break;
- }
- else p_newline++;
- }
- return p_newline;
-}
-
/* Function now handles tags with attribute values, and tries
* to deal with &' commands too. It no longer modifies the string
* in place, so that the original text can be reused
* returned, and the rendering engine will fall back to the
* plain text version of the subtitle.
*/
+/* Append at most i_max bytes of psz_text to the heap string *ppsz_html.
+ *
+ * The buffer is grown with realloc() and is NUL-terminated again after the
+ * copy. If *ppsz_html is already NULL the call does nothing. If realloc()
+ * fails, the old buffer is freed and *ppsz_html becomes NULL, so every later
+ * append on the same string turns into a no-op.
+ *
+ * TODO: highly suboptimal, offset should be cached */
static void HtmlNPut( char **ppsz_html, const char *psz_text, int i_max )
{
- const int i_len = strlen(psz_text);
+ char *psz_html = *ppsz_html;
+ if( psz_html == NULL ) /* NULL is what a failed append leaves behind: nothing to extend */
+ return;
+
+ const size_t i_offset = strlen(psz_html); /* current length, rescanned on every call (hence the TODO) */
+ const size_t i_len = strnlen(psz_text, i_max); /* bytes to copy: stops at the NUL or at i_max */
- strncpy( *ppsz_html, psz_text, i_max );
- *ppsz_html += __MIN(i_max,i_len);
+ psz_html = realloc( psz_html, i_offset + i_len + 1 ); /* old text + new bytes + terminating NUL */
+ if( psz_html != NULL )
+ {
+ memcpy( psz_html + i_offset, psz_text, i_len );
+ psz_html[i_offset + i_len] = '\0';
+ }
+ else
+ free( *ppsz_html ); /* realloc() failed, so the old buffer is still allocated: release it */
+ *ppsz_html = psz_html; /* hand back the (possibly moved) buffer, or NULL on failure */
}
static void HtmlPut( char **ppsz_html, const char *psz_text )
{
- strcpy( *ppsz_html, psz_text );
- *ppsz_html += strlen(psz_text);
+ HtmlNPut( ppsz_html, psz_text, INT_MAX ); /* INT_MAX as the limit: in practice the whole of psz_text is appended */
}
+
static void HtmlCopy( char **ppsz_html, char **ppsz_subtitle, const char *psz_text )
{
HtmlPut( ppsz_html, psz_text );
static char *CreateHtmlSubtitle( int *pi_align, char *psz_subtitle )
{
- /* */
- char *psz_tag = malloc( ( strlen( psz_subtitle ) / 3 ) + 1 );
- if( !psz_tag )
+ char *psz_tag = malloc( 1 );
+ if( psz_tag == NULL )
return NULL;
- psz_tag[ 0 ] = '\0';
- /* */
- //Oo + 100 ???
- size_t i_buf_size = strlen( psz_subtitle ) + 100;
- char *psz_html_start = malloc( i_buf_size );
- char *psz_html = psz_html_start;
- if( psz_html_start == NULL )
+ char *psz_html = malloc( 1 );
+ if( psz_html == NULL )
{
free( psz_tag );
return NULL;
}
+ psz_tag[0] = '\0';
psz_html[0] = '\0';
bool b_has_align = false;
else if( !strncasecmp( psz_subtitle, "<b>", 3 ) )
{
HtmlCopy( &psz_html, &psz_subtitle, "<b>" );
- strcat( psz_tag, "b" );
+ HtmlPut( &psz_tag, "b" );
}
else if( !strncasecmp( psz_subtitle, "<i>", 3 ) )
{
HtmlCopy( &psz_html, &psz_subtitle, "<i>" );
- strcat( psz_tag, "i" );
+ HtmlPut( &psz_tag, "i" );
}
else if( !strncasecmp( psz_subtitle, "<u>", 3 ) )
{
HtmlCopy( &psz_html, &psz_subtitle, "<u>" );
- strcat( psz_tag, "u" );
+ HtmlPut( &psz_tag, "u" );
}
else if( !strncasecmp( psz_subtitle, "<s>", 3 ) )
{
HtmlCopy( &psz_html, &psz_subtitle, "<s>" );
- strcat( psz_tag, "s" );
+ HtmlPut( &psz_tag, "s" );
}
else if( !strncasecmp( psz_subtitle, "<font ", 6 ))
{
"alpha=", NULL };
HtmlCopy( &psz_html, &psz_subtitle, "<font " );
- strcat( psz_tag, "f" );
+ HtmlPut( &psz_tag, "f" );
while( *psz_subtitle != '>' )
{
int k;
+ /* Skip any run of spaces before the attribute name, as in "<font    color=" */
+ while (*psz_subtitle == ' ')
+ psz_subtitle++;
+
for( k=0; psz_attribs[ k ]; k++ )
{
int i_len = strlen( psz_attribs[ k ] );
HtmlPut( &psz_html, psz_attribs[k] );
psz_subtitle += i_len;
+ /* Skip any spaces between '=' and the attribute value, as in "color=   red" */
+ while (*psz_subtitle == ' ')
+ psz_subtitle++;
+
/* */
if( *psz_subtitle == '"' )
{
psz_subtitle++;
i_len = strcspn( psz_subtitle, "\"" );
}
+ else if( *psz_subtitle == '\'' )
+ {
+ psz_subtitle++;
+ i_len = strcspn( psz_subtitle, "'" );
+ }
else
{
i_len = strcspn( psz_subtitle, " \t>" );
}
HtmlPut( &psz_html, "\"" );
- if( !strcmp( psz_attribs[ k ], "color=" ) && *psz_subtitle >= '0' && *psz_subtitle <= '9' )
- HtmlPut( &psz_html, "#" );
HtmlNPut( &psz_html, psz_subtitle, i_len );
HtmlPut( &psz_html, "\"" );
psz_subtitle += i_len;
- if( *psz_subtitle == '\"' )
+ if( *psz_subtitle == '\"' || *psz_subtitle == '\'' )
psz_subtitle++;
break;
}
if( psz_subtitle[i_len] == '\"' )
i_len++;
}
+ /* Not a tag, something else we do not understand */
+ if( i_len == 0 )
+ psz_subtitle++;
+
psz_subtitle += i_len;
}
- while (*psz_subtitle == ' ')
- *psz_html++ = *psz_subtitle++;
+
+ HtmlNPut( &psz_html, psz_subtitle, strspn(psz_subtitle, " ") );
}
- *psz_html++ = *psz_subtitle++;
+ HtmlPut( &psz_html, ">" );
+ psz_subtitle++;
}
else if( !strncmp( psz_subtitle, "</", 2 ))
{
bool b_match = false;
bool b_ignore = false;
- int i_len = strlen( psz_tag ) - 1;
+ int i_len = (psz_tag ? strlen(psz_tag) : 0) - 1;
char *psz_lastTag = NULL;
if( i_len >= 0 )
if( !b_match )
{
/* Not well formed -- kill everything */
- free( psz_html_start );
- psz_html_start = NULL;
+ free( psz_html );
+ psz_html = NULL;
break;
}
*psz_lastTag = '\0';
{
/* We have the closing tag, ignore it TODO */
psz_subtitle = &psz_stop[1];
- strcat( psz_tag, "I" );
+ HtmlPut( &psz_tag, "I" );
}
else
{
else if( *psz_subtitle == '>' )
HtmlPut( &psz_html, ">" );
else
- *psz_html++ = *psz_subtitle;
+ HtmlNPut( &psz_html, psz_subtitle, 1 );
}
}
}
if( psz_subtitle[3] == 'i' )
{
HtmlPut( &psz_html, "<i>" );
- strcat( psz_tag, "i" );
+ HtmlPut( &psz_tag, "i" );
}
if( psz_subtitle[3] == 'b' )
{
HtmlPut( &psz_html, "<b>" );
- strcat( psz_tag, "b" );
+ HtmlPut( &psz_tag, "b" );
}
if( psz_subtitle[3] == 'u' )
{
HtmlPut( &psz_html, "<u>" );
- strcat( psz_tag, "u" );
+ HtmlPut( &psz_tag, "u" );
}
psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
}
}
else
{
- *psz_html = *psz_subtitle;
- if( psz_html > psz_html_start )
+ HtmlNPut( &psz_html, psz_subtitle, 1 );
+#if 0
+ if( *psz_html )
{
/* Check for double whitespace */
+# error This test does not make sense.
if( ( *psz_html == ' ' || *psz_html == '\t' ) &&
( *(psz_html-1) == ' ' || *(psz_html-1) == '\t' ) )
{
psz_html--;
}
}
- psz_html++;
+#endif
psz_subtitle++;
}
+ }
- if( ( size_t )( psz_html - psz_html_start ) > i_buf_size - 50 )
+ while( psz_tag && *psz_tag )
+ {
+ /* */
+ char *psz_last = &psz_tag[strlen(psz_tag)-1];
+ switch( *psz_last )
{
- const int i_len = psz_html - psz_html_start;
-
- i_buf_size += 200;
- char *psz_new = realloc( psz_html_start, i_buf_size );
- if( !psz_new )
+ case 'b':
+ HtmlPut( &psz_html, "</b>" );
+ break;
+ case 'i':
+ HtmlPut( &psz_html, "</i>" );
+ break;
+ case 'u':
+ HtmlPut( &psz_html, "</u>" );
+ break;
+ case 's':
+ HtmlPut( &psz_html, "</s>" );
+ break;
+ case 'f':
+ HtmlPut( &psz_html, "</font>" );
+ break;
+ case 'I':
break;
- psz_html_start = psz_new;
- psz_html = &psz_new[i_len];
}
+ *psz_last = '\0';
}
- if( psz_html_start )
- {
- static const char *psz_text_close = "</text>";
- static const char *psz_tag_long = "/font>";
-
- /* Realloc for closing tags and shrink memory */
- const size_t i_length = (size_t)( psz_html - psz_html_start );
-
- const size_t i_size = i_length + strlen(psz_tag_long) * strlen(psz_tag) + strlen(psz_text_close) + 1;
- char *psz_new = realloc( psz_html_start, i_size );
- if( psz_new )
- {
- psz_html_start = psz_new;
- psz_html = &psz_new[i_length];
+ /* Close not well formed subtitle */
+ HtmlPut( &psz_html, "</text>" );
- /* Close not well formed subtitle */
- while( *psz_tag )
- {
- /* */
- char *psz_last = &psz_tag[strlen(psz_tag)-1];
- switch( *psz_last )
- {
- case 'b':
- HtmlPut( &psz_html, "</b>" );
- break;
- case 'i':
- HtmlPut( &psz_html, "</i>" );
- break;
- case 'u':
- HtmlPut( &psz_html, "</u>" );
- break;
- case 's':
- HtmlPut( &psz_html, "</s>" );
- break;
- case 'f':
- HtmlPut( &psz_html, "/font>" );
- break;
- case 'I':
- break;
- }
-
- *psz_last = '\0';
- }
- HtmlPut( &psz_html, psz_text_close );
- }
- }
free( psz_tag );
- return psz_html_start;
+ return psz_html;
}
-