1 /*****************************************************************************
2 * subsdec.c : text subtitle decoder
3 *****************************************************************************
4 * Copyright (C) 2000-2006 VLC authors and VideoLAN
5 * $Id: a440140d1ad65ede614e1489c8a1ba3e640af667 $
6 *
7 * Authors: Gildas Bazin <gbazin@videolan.org>
8 * Samuel Hocevar <sam@zoy.org>
9 * Derk-Jan Hartman <hartman at videolan dot org>
10 * Bernie Purcell <bitmap@videolan.org>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU Lesser General Public License as published by
14 * the Free Software Foundation; either version 2.1 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public License
23 * along with this program; if not, write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25 *****************************************************************************/
26
27 /*****************************************************************************
28 * Preamble
29 *****************************************************************************/
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33
34 #include <limits.h>
35 #include <errno.h>
36 #include <ctype.h>
37
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_codec.h>
41 #include <vlc_charset.h>
42 #include <vlc_xml.h>
43
44 #include "substext.h"
45
46 /*****************************************************************************
47 * Module descriptor.
48 *****************************************************************************/
/* Character set identifiers selectable through "subsdec-encoding".
 * This table is handed to change_string_list() together with
 * ppsz_encoding_names, so entries of both tables go in pairs. */
static const char *const ppsz_encodings[] = {
    /* Default and system code set */
    "", "system",
    /* Universal */
    "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE", "GB18030",
    /* Western European */
    "ISO-8859-15", "Windows-1252", "IBM850",
    /* Eastern European */
    "ISO-8859-2", "Windows-1250",
    /* Esperanto, Nordic */
    "ISO-8859-3", "ISO-8859-10",
    /* Cyrillic */
    "Windows-1251", "KOI8-R", "KOI8-U",
    /* Arabic */
    "ISO-8859-6", "Windows-1256",
    /* Greek */
    "ISO-8859-7", "Windows-1253",
    /* Hebrew */
    "ISO-8859-8", "Windows-1255",
    /* Turkish */
    "ISO-8859-9", "Windows-1254",
    /* Thai */
    "ISO-8859-11", "Windows-874",
    /* Baltic */
    "ISO-8859-13", "Windows-1257",
    /* Celtic, South-Eastern European */
    "ISO-8859-14", "ISO-8859-16",
    /* CJK families */
    "ISO-2022-CN-EXT", "EUC-CN",
    "ISO-2022-JP-2", "EUC-JP", "Shift_JIS",
    "CP949", "ISO-2022-KR",
    "Big5", "ISO-2022-TW", "Big5-HKSCS",
    /* Vietnamese */
    "VISCII", "Windows-1258",
};
94
95 static const char *const ppsz_encoding_names[] = {
96 /* xgettext:
97 The character encoding name in parenthesis corresponds to that used for
98 the GetACP translation. "Windows-1252" applies to Western European
99 languages using the Latin alphabet. */
100 N_("Default (Windows-1252)"),
101 N_("System codeset"),
102 N_("Universal (UTF-8)"),
103 N_("Universal (UTF-16)"),
104 N_("Universal (big endian UTF-16)"),
105 N_("Universal (little endian UTF-16)"),
106 N_("Universal, Chinese (GB18030)"),
107
108 /* ISO 8859 and the likes */
109 /* 1 */
110 N_("Western European (Latin-9)"), /* mostly superset of Latin-1 */
111 N_("Western European (Windows-1252)"),
112 N_("Western European (IBM 00850)"),
113 /* 2 */
114 N_("Eastern European (Latin-2)"),
115 N_("Eastern European (Windows-1250)"),
116 /* 3 */
117 N_("Esperanto (Latin-3)"),
118 /* 4 */
119 N_("Nordic (Latin-6)"), /* Latin 6 supersedes Latin 4 */
120 /* 5 */
121 N_("Cyrillic (Windows-1251)"), /* ISO 8859-5 is not practically used */
122 N_("Russian (KOI8-R)"),
123 N_("Ukrainian (KOI8-U)"),
124 /* 6 */
125 N_("Arabic (ISO 8859-6)"),
126 N_("Arabic (Windows-1256)"),
127 /* 7 */
128 N_("Greek (ISO 8859-7)"),
129 N_("Greek (Windows-1253)"),
130 /* 8 */
131 N_("Hebrew (ISO 8859-8)"),
132 N_("Hebrew (Windows-1255)"),
133 /* 9 */
134 N_("Turkish (ISO 8859-9)"),
135 N_("Turkish (Windows-1254)"),
136 /* 10 -> 4 */
137 /* 11 */
138 N_("Thai (TIS 620-2533/ISO 8859-11)"),
139 N_("Thai (Windows-874)"),
140 /* 13 */
141 N_("Baltic (Latin-7)"),
142 N_("Baltic (Windows-1257)"),
143 /* 12 -> /dev/null */
144 /* 14 */
145 N_("Celtic (Latin-8)"),
146 /* 15 -> 1 */
147 /* 16 */
148 N_("South-Eastern European (Latin-10)"),
149 /* CJK families */
150 N_("Simplified Chinese (ISO-2022-CN-EXT)"),
151 N_("Simplified Chinese Unix (EUC-CN)"),
152 N_("Japanese (7-bits JIS/ISO-2022-JP-2)"),
153 N_("Japanese Unix (EUC-JP)"),
154 N_("Japanese (Shift JIS)"),
155 N_("Korean (EUC-KR/CP949)"),
156 N_("Korean (ISO-2022-KR)"),
157 N_("Traditional Chinese (Big5)"),
158 N_("Traditional Chinese Unix (EUC-TW)"),
159 N_("Hong-Kong Supplementary (HKSCS)"),
160 /* Other */
161 N_("Vietnamese (VISCII)"),
162 N_("Vietnamese (Windows-1258)"),
163 };
164
165 static const int pi_justification[] = { -1, 0, 1, 2 };
166 static const char *const ppsz_justification_text[] = {
167 N_("Auto"),N_("Center"),N_("Left"),N_("Right")
168 };
169
170 #define ENCODING_TEXT N_("Subtitle text encoding")
171 #define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
172 #define ALIGN_TEXT N_("Subtitle justification")
173 #define ALIGN_LONGTEXT N_("Set the justification of subtitles")
174 #define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitle autodetection")
175 #define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
176 "UTF-8 encoding within subtitle files.")
177
178 static int OpenDecoder ( vlc_object_t * );
179 static void CloseDecoder ( vlc_object_t * );
180
181 vlc_module_begin ()
182 set_shortname( N_("Subtitles"))
183 set_description( N_("Text subtitle decoder") )
184 set_capability( "spu decoder", 50 )
185 set_callbacks( OpenDecoder, CloseDecoder )
186 set_category( CAT_INPUT )
187 set_subcategory( SUBCAT_INPUT_SCODEC )
188
189 add_integer( "subsdec-align", -1, ALIGN_TEXT, ALIGN_LONGTEXT,
190 false )
191 change_integer_list( pi_justification, ppsz_justification_text )
192 add_string( "subsdec-encoding", "",
193 ENCODING_TEXT, ENCODING_LONGTEXT, false )
194 change_string_list( ppsz_encodings, ppsz_encoding_names )
195 add_bool( "subsdec-autodetect-utf8", true,
196 AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, false )
197 vlc_module_end ()
198
199 /*****************************************************************************
200 * Local prototypes
201 *****************************************************************************/
202 #define NO_BREAKING_SPACE " "
203
204 struct decoder_sys_t
205 {
206 int i_align; /* Subtitles alignment on the vout */
207
208 vlc_iconv_t iconv_handle; /* handle to iconv instance */
209 bool b_autodetect_utf8;
210 };
211
212
213 static int DecodeBlock ( decoder_t *, block_t * );
214 static subpicture_t *ParseText ( decoder_t *, block_t * );
215 static text_segment_t *ParseSubtitles(int *pi_align, const char * );
216
217 /*****************************************************************************
218 * OpenDecoder: probe the decoder and return score
219 *****************************************************************************
220 * Tries to launch a decoder and return score so that the interface is able
221 * to chose.
222 *****************************************************************************/
OpenDecoder(vlc_object_t * p_this)223 static int OpenDecoder( vlc_object_t *p_this )
224 {
225 decoder_t *p_dec = (decoder_t*)p_this;
226 decoder_sys_t *p_sys;
227
228 switch( p_dec->fmt_in.i_codec )
229 {
230 case VLC_CODEC_SUBT:
231 case VLC_CODEC_ITU_T140:
232 break;
233 default:
234 return VLC_EGENERIC;
235 }
236
237 /* Allocate the memory needed to store the decoder's structure */
238 p_dec->p_sys = p_sys = calloc( 1, sizeof( *p_sys ) );
239 if( p_sys == NULL )
240 return VLC_ENOMEM;
241
242 p_dec->pf_decode = DecodeBlock;
243 p_dec->fmt_out.i_codec = 0;
244
245 /* init of p_sys */
246 p_sys->i_align = -1;
247 p_sys->iconv_handle = (vlc_iconv_t)-1;
248 p_sys->b_autodetect_utf8 = false;
249
250 const char *encoding;
251 char *var = NULL;
252
253 /* First try demux-specified encoding */
254 if( p_dec->fmt_in.i_codec == VLC_CODEC_ITU_T140 )
255 encoding = "UTF-8"; /* IUT T.140 is always using UTF-8 */
256 else
257 if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
258 {
259 encoding = p_dec->fmt_in.subs.psz_encoding;
260 msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
261 encoding);
262 }
263 else
264 {
265 /* Second, try configured encoding */
266 if ((var = var_InheritString (p_dec, "subsdec-encoding")) != NULL)
267 {
268 msg_Dbg (p_dec, "trying configured character encoding: %s", var);
269 if (!strcmp (var, "system"))
270 {
271 free (var);
272 var = NULL;
273 encoding = "";
274 /* ^ iconv() treats "" as nl_langinfo(CODESET) */
275 }
276 else
277 encoding = var;
278 }
279 else
280 /* Third, try "local" encoding */
281 {
282 /* xgettext:
283 The Windows ANSI code page most commonly used for this language.
284 VLC uses this as a guess of the subtitle files character set
285 (if UTF-8 and UTF-16 autodetection fails).
286 Western European languages normally use "CP1252", which is a
287 Microsoft-variant of ISO 8859-1. That suits the Latin alphabet.
288 Other scripts use other code pages.
289
290 This MUST be a valid iconv character set. If unsure, please refer
291 the VideoLAN translators mailing list. */
292 encoding = vlc_pgettext("GetACP", "CP1252");
293 msg_Dbg (p_dec, "trying default character encoding: %s", encoding);
294 }
295
296 /* Check UTF-8 autodetection */
297 if (var_InheritBool (p_dec, "subsdec-autodetect-utf8"))
298 {
299 msg_Dbg (p_dec, "using automatic UTF-8 detection");
300 p_sys->b_autodetect_utf8 = true;
301 }
302 }
303
304 if (strcasecmp (encoding, "UTF-8") && strcasecmp (encoding, "utf8"))
305 {
306 p_sys->iconv_handle = vlc_iconv_open ("UTF-8", encoding);
307 if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
308 msg_Err (p_dec, "cannot convert from %s: %s", encoding,
309 vlc_strerror_c(errno));
310 }
311 free (var);
312
313 p_sys->i_align = var_InheritInteger( p_dec, "subsdec-align" );
314
315 return VLC_SUCCESS;
316 }
317
318 /****************************************************************************
319 * DecodeBlock: the whole thing
320 ****************************************************************************
321 * This function must be fed with complete subtitles units.
322 ****************************************************************************/
DecodeBlock(decoder_t * p_dec,block_t * p_block)323 static int DecodeBlock( decoder_t *p_dec, block_t *p_block )
324 {
325 subpicture_t *p_spu;
326
327 if( p_block == NULL ) /* No Drain */
328 return VLCDEC_SUCCESS;
329
330 if( p_block->i_flags & BLOCK_FLAG_CORRUPTED )
331 {
332 block_Release( p_block );
333 return VLCDEC_SUCCESS;
334 }
335
336 p_spu = ParseText( p_dec, p_block );
337
338 block_Release( p_block );
339 if( p_spu != NULL )
340 decoder_QueueSub( p_dec, p_spu );
341 return VLCDEC_SUCCESS;
342 }
343
344 /*****************************************************************************
345 * CloseDecoder: clean up the decoder
346 *****************************************************************************/
CloseDecoder(vlc_object_t * p_this)347 static void CloseDecoder( vlc_object_t *p_this )
348 {
349 decoder_t *p_dec = (decoder_t *)p_this;
350 decoder_sys_t *p_sys = p_dec->p_sys;
351
352 if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
353 vlc_iconv_close( p_sys->iconv_handle );
354
355 free( p_sys );
356 }
357
358 /*****************************************************************************
359 * ParseText: parse an text subtitle packet and send it to the video output
360 *****************************************************************************/
ParseText(decoder_t * p_dec,block_t * p_block)361 static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
362 {
363 decoder_sys_t *p_sys = p_dec->p_sys;
364 subpicture_t *p_spu = NULL;
365
366 if( p_block->i_flags & BLOCK_FLAG_CORRUPTED )
367 return NULL;
368
369 /* We cannot display a subpicture with no date */
370 if( p_block->i_pts <= VLC_TS_INVALID )
371 {
372 msg_Warn( p_dec, "subtitle without a date" );
373 return NULL;
374 }
375
376 /* Check validity of packet data */
377 /* An "empty" line containing only \0 can be used to force
378 and ephemer picture from the screen */
379 if( p_block->i_buffer < 1 )
380 {
381 msg_Warn( p_dec, "no subtitle data" );
382 return NULL;
383 }
384
385 char *psz_subtitle = NULL;
386
387 /* Should be resiliant against bad subtitles */
388 if( p_sys->iconv_handle == (vlc_iconv_t)-1 ||
389 p_sys->b_autodetect_utf8 )
390 {
391 psz_subtitle = malloc( p_block->i_buffer + 1 );
392 if( psz_subtitle == NULL )
393 return NULL;
394 memcpy( psz_subtitle, p_block->p_buffer, p_block->i_buffer );
395 psz_subtitle[p_block->i_buffer] = '\0';
396 }
397
398 if( p_sys->iconv_handle == (vlc_iconv_t)-1 )
399 {
400 if (EnsureUTF8( psz_subtitle ) == NULL)
401 {
402 msg_Err( p_dec, "failed to convert subtitle encoding.\n"
403 "Try manually setting a character-encoding "
404 "before you open the file." );
405 }
406 }
407 else
408 {
409 if( p_sys->b_autodetect_utf8 )
410 {
411 if( IsUTF8( psz_subtitle ) == NULL )
412 {
413 msg_Dbg( p_dec, "invalid UTF-8 sequence: "
414 "disabling UTF-8 subtitles autodetection" );
415 p_sys->b_autodetect_utf8 = false;
416 }
417 }
418
419 if( !p_sys->b_autodetect_utf8 )
420 {
421 size_t inbytes_left = p_block->i_buffer;
422 size_t outbytes_left = 6 * inbytes_left;
423 char *psz_new_subtitle = xmalloc( outbytes_left + 1 );
424 char *psz_convert_buffer_out = psz_new_subtitle;
425 const char *psz_convert_buffer_in =
426 psz_subtitle ? psz_subtitle : (char *)p_block->p_buffer;
427
428 size_t ret = vlc_iconv( p_sys->iconv_handle,
429 &psz_convert_buffer_in, &inbytes_left,
430 &psz_convert_buffer_out, &outbytes_left );
431
432 *psz_convert_buffer_out++ = '\0';
433 free( psz_subtitle );
434
435 if( ( ret == (size_t)(-1) ) || inbytes_left )
436 {
437 free( psz_new_subtitle );
438 msg_Err( p_dec, "failed to convert subtitle encoding.\n"
439 "Try manually setting a character-encoding "
440 "before you open the file." );
441 return NULL;
442 }
443
444 psz_subtitle = realloc( psz_new_subtitle,
445 psz_convert_buffer_out - psz_new_subtitle );
446 if( !psz_subtitle )
447 psz_subtitle = psz_new_subtitle;
448 }
449 }
450
451 /* Create the subpicture unit */
452 p_spu = decoder_NewSubpictureText( p_dec );
453 if( !p_spu )
454 {
455 free( psz_subtitle );
456 return NULL;
457 }
458 p_spu->i_start = p_block->i_pts;
459 p_spu->i_stop = p_block->i_pts + p_block->i_length;
460 p_spu->b_ephemer = (p_block->i_length == 0);
461 p_spu->b_absolute = false;
462
463 subpicture_updater_sys_t *p_spu_sys = p_spu->updater.p_sys;
464
465 int i_inline_align = -1;
466 p_spu_sys->region.p_segments = ParseSubtitles( &i_inline_align, psz_subtitle );
467 free( psz_subtitle );
468 if( p_sys->i_align >= 0 ) /* bottom ; left, right or centered */
469 {
470 p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
471 p_spu_sys->region.inner_align = p_sys->i_align;
472 }
473 else if( i_inline_align >= 0 )
474 {
475 p_spu_sys->region.align = i_inline_align;
476 p_spu_sys->region.inner_align = i_inline_align;
477 }
478 else /* default, bottom ; centered */
479 {
480 p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM;
481 p_spu_sys->region.inner_align = 0;
482 }
483
484 return p_spu;
485 }
486
/* Appends one character to the text of p_segment (a NULL text counts as
 * empty). Returns false and leaves the segment untouched when the new string
 * cannot be built. */
static bool AppendCharacter( text_segment_t* p_segment, char c )
{
    const char* psz_current = p_segment->psz_text;
    char* psz_joined;

    if ( psz_current == NULL )
        psz_current = "";
    if ( asprintf( &psz_joined, "%s%c", psz_current, c ) < 0 )
        return false;

    /* The freshly built string replaces the previous one */
    free( p_segment->psz_text );
    p_segment->psz_text = psz_joined;
    return true;
}
496
/* Appends psz_str to the text of p_segment (a NULL text counts as empty).
 * Returns false and leaves the segment untouched when the new string cannot
 * be built. */
static bool AppendString( text_segment_t* p_segment, const char* psz_str )
{
    const char* psz_current = p_segment->psz_text;
    char* psz_joined;

    if ( psz_current == NULL )
        psz_current = "";
    if ( asprintf( &psz_joined, "%s%s", psz_current, psz_str ) < 0 )
        return false;

    /* The freshly built string replaces the previous one */
    free( p_segment->psz_text );
    p_segment->psz_text = psz_joined;
    return true;
}
506
/**
 * Parses the next `name=value` attribute of an HTML-like tag.
 *
 * An attribute name starts with a letter and continues with letters, or with
 * '-' directly followed by a letter (e.g. "outline-color"). The value may be
 * wrapped in single or double quotes; an unquoted value ends at the first
 * whitespace or '>'.
 *
 * \param ppsz_subtitle in: where to start parsing; out: first character not
 *        consumed. It is left untouched when NULL is returned.
 * \param ppsz_attribute_value out: heap-allocated value, or NULL when the
 *        attribute has no '=' before the end of the tag or an empty value.
 * \return the heap-allocated attribute name, or NULL when no attribute name
 *         starts here, the input ends right after the name, or an allocation
 *         fails. Both returned strings must be freed by the caller.
 */
static char* ConsumeAttribute( const char** ppsz_subtitle, char** ppsz_attribute_value )
{
    const char* psz_subtitle = *ppsz_subtitle;
    *ppsz_attribute_value = NULL;

    while ( *psz_subtitle == ' ' )
        psz_subtitle++;

    /* <ctype.h> functions need an unsigned char value: subtitle text is
     * UTF-8 and plain char may be negative for bytes >= 0x80. */
    size_t attr_len = 0;
    while ( isalpha( (unsigned char)*psz_subtitle ) ||
            ( attr_len > 0 && *psz_subtitle == '-' &&
              isalpha( (unsigned char)psz_subtitle[1] ) ) )
    {
        psz_subtitle++;
        attr_len++;
    }
    if ( !*psz_subtitle || attr_len == 0 )
        return NULL;

    char* psz_attribute_name = malloc( attr_len + 1 );
    if ( !psz_attribute_name )
        return NULL;
    memcpy( psz_attribute_name, psz_subtitle - attr_len, attr_len );
    psz_attribute_name[attr_len] = '\0';

    // Skip over to the attribute value, without ever leaving the current tag
    while ( *psz_subtitle && *psz_subtitle != '=' && *psz_subtitle != '>' )
        psz_subtitle++;
    if ( *psz_subtitle != '=' )
    {
        // Valueless attribute: stop at the '>' (or at the end of the string)
        *ppsz_subtitle = psz_subtitle;
        return psz_attribute_name;
    }
    // Skip the '=' sign
    psz_subtitle++;

    // Acknowledge the delimiter if any
    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    char delimiter = 0;
    if ( *psz_subtitle == '\'' || *psz_subtitle == '"' )
    {
        // Save the delimiter and skip it
        delimiter = *psz_subtitle;
        psz_subtitle++;
    }

    // Skip spaces, just in case
    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    attr_len = 0;
    while ( *psz_subtitle &&
            ( delimiter != 0
                ? *psz_subtitle != delimiter
                : ( !isspace( (unsigned char)*psz_subtitle ) && *psz_subtitle != '>' ) ) )
    {
        psz_subtitle++;
        attr_len++;
    }
    if ( attr_len == 0 )
    {
        *ppsz_subtitle = psz_subtitle;
        return psz_attribute_name;
    }

    *ppsz_attribute_value = malloc( attr_len + 1 );
    if ( !*ppsz_attribute_value )
    {
        free( psz_attribute_name );
        return NULL;
    }
    memcpy( *ppsz_attribute_value, psz_subtitle - attr_len, attr_len );
    (*ppsz_attribute_value)[attr_len] = '\0';

    // Finally, skip over the final delimiter
    if ( delimiter != 0 && *psz_subtitle )
        psz_subtitle++;
    *ppsz_subtitle = psz_subtitle;
    return psz_attribute_name;
}
585
// Returns the name of the tag starting at *ppsz_subtitle and consumes the
// string up to just after the tag name, or returns NULL and doesn't advance
// if the angle bracket was not a tag opening (or if the allocation failed).
// When b_closing is true, a '/' right after the '<' is skipped as well.
// For instance, if *ppsz_subtitle == "<some_tag attribute=value>"
// GetTag will return "some_tag", and will advance to the space that follows
// the tag name.
// A tag name starts with a letter and continues with letters, digits or '_'.
// The returned value must be freed.
static char* GetTag( const char** ppsz_subtitle, bool b_closing )
{
    const char* psz_subtitle = *ppsz_subtitle;
    if ( *psz_subtitle != '<' )
        return NULL;
    // Skip the '<'
    psz_subtitle++;
    if ( b_closing && *psz_subtitle == '/' )
        psz_subtitle++;
    // Skip potential spaces
    while ( *psz_subtitle == ' ' )
        psz_subtitle++;
    // Now we need to verify if what comes next is a valid tag.
    // <ctype.h> functions need an unsigned char value: subtitle text is UTF-8
    // and plain char may be negative for bytes >= 0x80.
    if ( !isalpha( (unsigned char)*psz_subtitle ) )
        return NULL;
    size_t tag_size = 1;
    while ( isalnum( (unsigned char)psz_subtitle[tag_size] ) ||
            psz_subtitle[tag_size] == '_' )
        tag_size++;
    char* psz_tagname = malloc( tag_size + 1 );
    if ( !psz_tagname )
        return NULL;
    memcpy( psz_tagname, psz_subtitle, tag_size );
    psz_tagname[tag_size] = '\0';
    *ppsz_subtitle = psz_subtitle + tag_size;
    return psz_tagname;
}
618
/**
 * Tells whether psz_subtitle contains a closing tag for psz_tagname.
 *
 * A closing tag is "</", optional spaces, the tag name (compared without
 * regard to case), optional spaces, then '>'. Every "</" of the string is
 * examined, so text that merely contains the tag name before the real
 * closing tag does not hide it, and nothing is ever read before the start of
 * psz_subtitle.
 *
 * \return true if such a closing tag exists, false otherwise (including for
 *         an empty tag name)
 */
static bool IsClosed( const char* psz_subtitle, const char* psz_tagname )
{
    const size_t i_taglen = strlen( psz_tagname );
    if ( i_taglen == 0 )
        return false;

    for ( const char* psz_close = strstr( psz_subtitle, "</" );
          psz_close != NULL;
          psz_close = strstr( psz_close + 2, "</" ) )
    {
        const char* psz_cursor = psz_close + 2;
        while ( *psz_cursor == ' ' )
            psz_cursor++;
        if ( strncasecmp( psz_cursor, psz_tagname, i_taglen ) != 0 )
            continue;
        // The name must be followed by '>' (minding the potential spaces),
        // otherwise this is a longer tag name sharing the same prefix
        psz_cursor += i_taglen;
        while ( *psz_cursor == ' ' )
            psz_cursor++;
        if ( *psz_cursor == '>' )
            return true;
    }
    return false;
}
640
/*
 * Singly linked stack of tag names; AppendTag pushes on it and HasTag
 * removes entries from it.
 */
typedef struct tag_stack tag_stack_t;
struct tag_stack
{
    char        *psz_tagname; /* heap string, freed along with the entry */
    tag_stack_t *p_next;      /* entry below this one, NULL at the bottom */
};
647
/* Pushes psz_tagname on top of the tag stack.
 * The stack takes ownership of psz_tagname in every case: the caller stops
 * tracking the pointer right after this call, so the string is released here
 * when the stack entry cannot be allocated instead of being leaked. */
static void AppendTag( tag_stack_t **pp_stack, char* psz_tagname )
{
    tag_stack_t* p_elem = malloc( sizeof( *p_elem ) );
    if ( unlikely( !p_elem ) )
    {
        free( psz_tagname );
        return;
    }
    p_elem->p_next = *pp_stack;
    p_elem->psz_tagname = psz_tagname;
    *pp_stack = p_elem;
}
657
/* Looks for psz_tagname (compared without regard to case) in the tag stack,
 * starting from the top. The first matching entry is unlinked and freed
 * together with its name, and true is returned; the stack is left untouched
 * and false is returned when there is no match. */
static bool HasTag( tag_stack_t **pp_stack, const char* psz_tagname )
{
    /* pp_link always designates the pointer that leads to the entry being
     * examined, so unlinking is the same for the top and for inner entries */
    tag_stack_t **pp_link = pp_stack;

    while ( *pp_link != NULL )
    {
        tag_stack_t *p_entry = *pp_link;

        if ( strcasecmp( psz_tagname, p_entry->psz_tagname ) == 0 )
        {
            *pp_link = p_entry->p_next;
            free( p_entry->psz_tagname );
            free( p_entry );
            return true;
        }
        pp_link = &p_entry->p_next;
    }
    return false;
}
681
682 /*
683 * mini style stack implementation
684 */
685 typedef struct style_stack style_stack_t;
686 struct style_stack
687 {
688 text_style_t* p_style;
689 style_stack_t* p_next;
690 };
691
DuplicateAndPushStyle(style_stack_t ** pp_stack)692 static text_style_t* DuplicateAndPushStyle(style_stack_t** pp_stack)
693 {
694 text_style_t* p_dup = ( *pp_stack ) ? text_style_Duplicate( (*pp_stack)->p_style ) : text_style_Create( STYLE_NO_DEFAULTS );
695 if ( unlikely( !p_dup ) )
696 return NULL;
697 style_stack_t* p_entry = malloc( sizeof( *p_entry ) );
698 if ( unlikely( !p_entry ) )
699 {
700 text_style_Delete( p_dup );
701 return NULL;
702 }
703 // Give the style ownership to the segment.
704 p_entry->p_style = p_dup;
705 p_entry->p_next = *pp_stack;
706 *pp_stack = p_entry;
707 return p_dup;
708 }
709
/* Removes the top entry of the style stack; does nothing on an empty stack.
 * Only the entry is freed: its style is owned by a text_segment_t. */
static void PopStyle(style_stack_t** pp_stack)
{
    style_stack_t* p_top = *pp_stack;

    if ( p_top != NULL )
    {
        *pp_stack = p_top->p_next;
        free( p_top );
    }
}
719
/* Creates a text segment following p_segment and gives it a freshly pushed
 * style (see DuplicateAndPushStyle).
 * Returns the new segment, whose style is never NULL, or NULL when the
 * segment or its style cannot be allocated; in that case p_segment and the
 * style stack are left unchanged. Callers write to the style of the returned
 * segment right away, so a segment without a style must never be returned. */
static text_segment_t* NewTextSegmentPushStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t* p_new = text_segment_New( NULL );
    if ( unlikely( p_new == NULL ) )
        return NULL;
    text_style_t* p_style = DuplicateAndPushStyle( pp_stack );
    if ( unlikely( p_style == NULL ) )
    {
        /* p_new is not linked anywhere yet: release it */
        text_segment_ChainDelete( p_new );
        return NULL;
    }
    p_new->style = p_style;
    p_segment->p_next = p_new;
    return p_new;
}
730
/* Pops the current style and creates a text segment following p_segment that
 * carries a duplicate of the style now on top of the stack, or a style
 * created with STYLE_NO_DEFAULTS when the stack is empty.
 * Returns the new segment, or NULL (nothing popped, nothing linked) when the
 * segment cannot be allocated. */
static text_segment_t* NewTextSegmentPopStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t* p_next = text_segment_New( NULL );
    if ( unlikely( !p_next ) )
        return NULL;

    // The stack should not be empty since this happens when closing a tag,
    // but a broken subtitle file may close more tags than it opened;
    // PopStyle tolerates that.
    PopStyle( pp_stack );

    if ( *pp_stack != NULL )
        p_next->style = text_style_Duplicate( (*pp_stack)->p_style );
    else
        p_next->style = text_style_Create( STYLE_NO_DEFAULTS );

    p_segment->p_next = p_next;
    return p_next;
}
744
ParseSubtitles(int * pi_align,const char * psz_subtitle)745 static text_segment_t* ParseSubtitles( int *pi_align, const char *psz_subtitle )
746 {
747 text_segment_t* p_segment;
748 text_segment_t* p_first_segment;
749 style_stack_t* p_stack = NULL;
750 tag_stack_t* p_tag_stack = NULL;
751
752 //FIXME: Remove initial allocation? Might make the below code more complicated
753 p_first_segment = p_segment = text_segment_New( "" );
754
755 *pi_align = -1;
756
757 /* */
758 while( *psz_subtitle )
759 {
760 /* HTML extensions */
761 if( *psz_subtitle == '<' )
762 {
763 char *psz_tagname = GetTag( &psz_subtitle, false );
764 if ( psz_tagname != NULL )
765 {
766 if( !strcasecmp( psz_tagname, "br" ) )
767 {
768 if ( !AppendCharacter( p_segment, '\n' ) )
769 {
770 free( psz_tagname );
771 goto fail;
772 }
773 }
774 else if( !strcasecmp( psz_tagname, "b" ) )
775 {
776 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
777 p_segment->style->i_style_flags |= STYLE_BOLD;
778 p_segment->style->i_features |= STYLE_HAS_FLAGS;
779 }
780 else if( !strcasecmp( psz_tagname, "i" ) )
781 {
782 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
783 p_segment->style->i_style_flags |= STYLE_ITALIC;
784 p_segment->style->i_features |= STYLE_HAS_FLAGS;
785 }
786 else if( !strcasecmp( psz_tagname, "u" ) )
787 {
788 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
789 p_segment->style->i_style_flags |= STYLE_UNDERLINE;
790 p_segment->style->i_features |= STYLE_HAS_FLAGS;
791 }
792 else if( !strcasecmp( psz_tagname, "s" ) )
793 {
794 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
795 p_segment->style->i_style_flags |= STYLE_STRIKEOUT;
796 p_segment->style->i_features |= STYLE_HAS_FLAGS;
797 }
798 else if( !strcasecmp( psz_tagname, "font" ) )
799 {
800 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
801
802 char* psz_attribute_name;
803 char* psz_attribute_value;
804
805 while( ( psz_attribute_name = ConsumeAttribute( &psz_subtitle, &psz_attribute_value ) ) )
806 {
807 if ( !psz_attribute_value )
808 {
809 free( psz_attribute_name );
810 continue;
811 }
812 if ( !strcasecmp( psz_attribute_name, "face" ) )
813 {
814 free(p_segment->style->psz_fontname);
815 p_segment->style->psz_fontname = psz_attribute_value;
816 // We don't want to free the attribute value since it has become our fontname
817 psz_attribute_value = NULL;
818 }
819 else if ( !strcasecmp( psz_attribute_name, "family" ) )
820 {
821 free(p_segment->style->psz_monofontname);
822 p_segment->style->psz_monofontname = psz_attribute_value;
823 psz_attribute_value = NULL;
824 }
825 else if ( !strcasecmp( psz_attribute_name, "size" ) )
826 {
827 int size = atoi( psz_attribute_value );
828 if( size )
829 {
830 p_segment->style->i_font_size = size;
831 p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
832 STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
833 }
834 }
835 else if ( !strcasecmp( psz_attribute_name, "color" ) )
836 {
837 p_segment->style->i_font_color = vlc_html_color( psz_attribute_value, NULL );
838 p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
839 }
840 else if ( !strcasecmp( psz_attribute_name, "outline-color" ) )
841 {
842 p_segment->style->i_outline_color = vlc_html_color( psz_attribute_value, NULL );
843 p_segment->style->i_features |= STYLE_HAS_OUTLINE_COLOR;
844 }
845 else if ( !strcasecmp( psz_attribute_name, "shadow-color" ) )
846 {
847 p_segment->style->i_shadow_color = vlc_html_color( psz_attribute_value, NULL );
848 p_segment->style->i_features |= STYLE_HAS_SHADOW_COLOR;
849 }
850 else if ( !strcasecmp( psz_attribute_name, "outline-level" ) )
851 {
852 p_segment->style->i_outline_width = atoi( psz_attribute_value );
853 }
854 else if ( !strcasecmp( psz_attribute_name, "shadow-level" ) )
855 {
856 p_segment->style->i_shadow_width = atoi( psz_attribute_value );
857 }
858 else if ( !strcasecmp( psz_attribute_name, "back-color" ) )
859 {
860 p_segment->style->i_background_color = vlc_html_color( psz_attribute_value, NULL );
861 p_segment->style->i_features |= STYLE_HAS_BACKGROUND_COLOR;
862 }
863 else if ( !strcasecmp( psz_attribute_name, "alpha" ) )
864 {
865 p_segment->style->i_font_alpha = atoi( psz_attribute_value );
866 p_segment->style->i_features |= STYLE_HAS_FONT_ALPHA;
867 }
868
869 free( psz_attribute_name );
870 free( psz_attribute_value );
871 }
872 }
873 else
874 {
875 // This is an unknown tag. We need to hide it if it's properly closed, and display it otherwise
876 if ( !IsClosed( psz_subtitle, psz_tagname ) )
877 {
878 AppendCharacter( p_segment, '<' );
879 AppendString( p_segment, psz_tagname );
880 AppendCharacter( p_segment, '>' );
881 }
882 else
883 {
884 AppendTag( &p_tag_stack, psz_tagname );
885 // We don't want to free the tagname now, it will be freed when the tag
886 // gets poped from the stack.
887 psz_tagname = NULL;
888 }
889 // In any case, fall through and skip to the closing tag.
890 }
891 // Skip potential spaces & end tag
892 while ( *psz_subtitle && *psz_subtitle != '>' )
893 psz_subtitle++;
894 if ( *psz_subtitle == '>' )
895 psz_subtitle++;
896
897 free( psz_tagname );
898 }
899 else if( !strncmp( psz_subtitle, "</", 2 ))
900 {
901 char* psz_tagname = GetTag( &psz_subtitle, true );
902 if ( psz_tagname != NULL )
903 {
904 if ( !strcasecmp( psz_tagname, "b" ) ||
905 !strcasecmp( psz_tagname, "i" ) ||
906 !strcasecmp( psz_tagname, "u" ) ||
907 !strcasecmp( psz_tagname, "s" ) ||
908 !strcasecmp( psz_tagname, "font" ) )
909 {
910 // A closing tag for one of the tags we handle, meaning
911 // we pushed a style onto the stack earlier
912 p_segment = NewTextSegmentPopStyle( p_segment, &p_stack );
913 }
914 else
915 {
916 // Unknown closing tag. If it is closing an unknown tag, ignore it. Otherwise, display it
917 if ( !HasTag( &p_tag_stack, psz_tagname ) )
918 {
919 AppendString( p_segment, "</" );
920 AppendString( p_segment, psz_tagname );
921 AppendCharacter( p_segment, '>' );
922 }
923 }
924 while ( *psz_subtitle == ' ' )
925 psz_subtitle++;
926 if ( *psz_subtitle == '>' )
927 psz_subtitle++;
928 free( psz_tagname );
929 }
930 else
931 {
932 /**
933 * This doesn't appear to be a valid tag closing syntax.
934 * Simply append the text
935 */
936 AppendString( p_segment, "</" );
937 psz_subtitle += 2;
938 }
939 }
940 else
941 {
942 /* We have an unknown tag, just append it, and move on.
943 * The rest of the string won't be recognized as a tag, and
944 * we will ignore unknown closing tag
945 */
946 AppendCharacter( p_segment, '<' );
947 psz_subtitle++;
948 }
949 }
950 /* SSA extensions */
951 else if( psz_subtitle[0] == '{' && psz_subtitle[1] == '\\' &&
952 strchr( psz_subtitle, '}' ) )
953 {
954 /* Check for forced alignment */
955 if( *pi_align < 0 &&
956 !strncmp( psz_subtitle, "{\\an", 4 ) && psz_subtitle[4] >= '1' && psz_subtitle[4] <= '9' && psz_subtitle[5] == '}' )
957 {
958 static const int pi_vertical[3] = { SUBPICTURE_ALIGN_BOTTOM, 0, SUBPICTURE_ALIGN_TOP };
959 static const int pi_horizontal[3] = { SUBPICTURE_ALIGN_LEFT, 0, SUBPICTURE_ALIGN_RIGHT };
960 const int i_id = psz_subtitle[4] - '1';
961
962 *pi_align = pi_vertical[i_id/3] | pi_horizontal[i_id%3];
963 }
964 /* TODO fr -> rotation */
965
966 /* Hide {\stupidity} */
967 psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
968 }
969 /* MicroDVD extensions */
970 /* FIXME:
971 * - Currently, we don't do difference between X and x, and we should:
972 * Capital Letters applies to the whole text and not one line
973 * - We don't support Position and Coordinates
974 * - We don't support the DEFAULT flag (HEADER)
975 */
976
977 else if( psz_subtitle[0] == '{' && psz_subtitle[1] != 0 &&
978 psz_subtitle[2] == ':' && strchr( &psz_subtitle[2], '}' ) )
979 {
980 const char *psz_tag_end = strchr( &psz_subtitle[2], '}' );
981 size_t i_len = psz_tag_end - &psz_subtitle[3];
982
983 if( psz_subtitle[1] == 'Y' || psz_subtitle[1] == 'y' )
984 {
985 if( psz_subtitle[3] == 'i' )
986 {
987 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
988 p_segment->style->i_style_flags |= STYLE_ITALIC;
989 p_segment->style->i_features |= STYLE_HAS_FLAGS;
990 psz_subtitle++;
991 }
992 if( psz_subtitle[3] == 'b' )
993 {
994 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
995 p_segment->style->i_style_flags |= STYLE_BOLD;
996 p_segment->style->i_features |= STYLE_HAS_FLAGS;
997 psz_subtitle++;
998 }
999 if( psz_subtitle[3] == 'u' )
1000 {
1001 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1002 p_segment->style->i_style_flags |= STYLE_UNDERLINE;
1003 p_segment->style->i_features |= STYLE_HAS_FLAGS;
1004 psz_subtitle++;
1005 }
1006 }
1007 else if( (psz_subtitle[1] == 'C' || psz_subtitle[1] == 'c' )
1008 && psz_subtitle[3] == '$' && i_len >= 7 )
1009 {
1010 /* Yes, they use BBGGRR, instead of RRGGBB */
1011 char psz_color[7];
1012 psz_color[0] = psz_subtitle[8]; psz_color[1] = psz_subtitle[9];
1013 psz_color[2] = psz_subtitle[6]; psz_color[3] = psz_subtitle[7];
1014 psz_color[4] = psz_subtitle[4]; psz_color[5] = psz_subtitle[5];
1015 psz_color[6] = '\0';
1016 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1017 p_segment->style->i_font_color = vlc_html_color( psz_color, NULL );
1018 p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
1019 }
1020 else if( psz_subtitle[1] == 'F' || psz_subtitle[1] == 'f' )
1021 {
1022 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1023 free(p_segment->style->psz_fontname);
1024 p_segment->style->psz_fontname = strndup( &psz_subtitle[3], i_len );
1025 }
1026 else if( psz_subtitle[1] == 'S' || psz_subtitle[1] == 's' )
1027 {
1028 int size = atoi( &psz_subtitle[3] );
1029 if( size )
1030 {
1031 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1032 p_segment->style->i_font_size = size;
1033 p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
1034 STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
1035
1036 }
1037 }
1038 /* Currently unsupported since we don't have access to the i_align flag here
1039 else if( psz_subtitle[1] == 'P' )
1040 {
1041 if( psz_subtitle[3] == "1" )
1042 i_align = SUBPICTURE_ALIGN_TOP;
1043 else if( psz_subtitle[3] == "0" )
1044 i_align = SUBPICTURE_ALIGN_BOTTOM;
1045 } */
1046 // Hide other {x:y} atrocities, notably {o:x}
1047 psz_subtitle = psz_tag_end + 1;
1048 }
1049 else
1050 {
1051 if( *psz_subtitle == '\n' || !strncasecmp( psz_subtitle, "\\n", 2 ) )
1052 {
1053 if ( !AppendCharacter( p_segment, '\n' ) )
1054 goto fail;
1055 if ( *psz_subtitle == '\n' )
1056 psz_subtitle++;
1057 else
1058 psz_subtitle += 2;
1059 }
1060 else if( !strncasecmp( psz_subtitle, "\\h", 2 ) )
1061 {
1062 if ( !AppendString( p_segment, "\xC2\xA0" ) )
1063 goto fail;
1064 psz_subtitle += 2;
1065 }
1066 else
1067 {
1068 //FIXME: Highly inneficient
1069 AppendCharacter( p_segment, *psz_subtitle );
1070 psz_subtitle++;
1071 }
1072 }
1073 }
1074 while ( p_stack )
1075 PopStyle( &p_stack );
1076 while ( p_tag_stack )
1077 {
1078 tag_stack_t *p_tag = p_tag_stack;
1079 p_tag_stack = p_tag_stack->p_next;
1080 free( p_tag->psz_tagname );
1081 free( p_tag );
1082 }
1083
1084 return p_first_segment;
1085
1086 fail:
1087 text_segment_ChainDelete( p_first_segment );
1088 return NULL;
1089 }
1090