1 /*****************************************************************************
2  * subsdec.c : text subtitle decoder
3  *****************************************************************************
4  * Copyright (C) 2000-2006 VLC authors and VideoLAN
5  * $Id: a440140d1ad65ede614e1489c8a1ba3e640af667 $
6  *
7  * Authors: Gildas Bazin <gbazin@videolan.org>
8  *          Samuel Hocevar <sam@zoy.org>
9  *          Derk-Jan Hartman <hartman at videolan dot org>
10  *          Bernie Purcell <bitmap@videolan.org>
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU Lesser General Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * (at your option) any later version.
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public License
23  * along with this program; if not, write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
25  *****************************************************************************/
26 
27 /*****************************************************************************
28  * Preamble
29  *****************************************************************************/
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <limits.h>
35 #include <errno.h>
36 #include <ctype.h>
37 
38 #include <vlc_common.h>
39 #include <vlc_plugin.h>
40 #include <vlc_codec.h>
41 #include <vlc_charset.h>
42 #include <vlc_xml.h>
43 
44 #include "substext.h"
45 
46 /*****************************************************************************
47  * Module descriptor.
48  *****************************************************************************/
/* Character-set identifiers selectable through the "subsdec-encoding" option.
 * The entries are index-aligned with ppsz_encoding_names below: both arrays
 * are handed together to change_string_list() in the module descriptor.
 * "system" is special-cased by OpenDecoder(); other values are passed to
 * vlc_iconv_open() as the source character set. */
static const char *const ppsz_encodings[] = {
    "",
    "system",
    "UTF-8",
    "UTF-16",
    "UTF-16BE",
    "UTF-16LE",
    "GB18030",
    "ISO-8859-15",
    "Windows-1252",
    "IBM850",
    "ISO-8859-2",
    "Windows-1250",
    "ISO-8859-3",
    "ISO-8859-10",
    "Windows-1251",
    "KOI8-R",
    "KOI8-U",
    "ISO-8859-6",
    "Windows-1256",
    "ISO-8859-7",
    "Windows-1253",
    "ISO-8859-8",
    "Windows-1255",
    "ISO-8859-9",
    "Windows-1254",
    "ISO-8859-11",
    "Windows-874",
    "ISO-8859-13",
    "Windows-1257",
    "ISO-8859-14",
    "ISO-8859-16",
    "ISO-2022-CN-EXT",
    "EUC-CN",
    "ISO-2022-JP-2",
    "EUC-JP",
    "Shift_JIS",
    "CP949",
    "ISO-2022-KR",
    "Big5",
    "ISO-2022-TW",
    "Big5-HKSCS",
    "VISCII",
    "Windows-1258",
};
94 
/* Human-readable (translatable) labels for the entries of ppsz_encodings.
 * The two arrays must keep the same length and ordering, since label i
 * describes character set i.
 * NOTE(review): the "Traditional Chinese Unix (EUC-TW)" label lines up with
 * the "ISO-2022-TW" identifier -- confirm which of the two is intended. */
static const char *const ppsz_encoding_names[] = {
    /* xgettext:
      The character encoding name in parenthesis corresponds to that used for
      the GetACP translation. "Windows-1252" applies to Western European
      languages using the Latin alphabet. */
    N_("Default (Windows-1252)"),
    N_("System codeset"),
    N_("Universal (UTF-8)"),
    N_("Universal (UTF-16)"),
    N_("Universal (big endian UTF-16)"),
    N_("Universal (little endian UTF-16)"),
    N_("Universal, Chinese (GB18030)"),

  /* ISO 8859 and the likes */
    /* 1 */
    N_("Western European (Latin-9)"), /* mostly superset of Latin-1 */
    N_("Western European (Windows-1252)"),
    N_("Western European (IBM 00850)"),
    /* 2 */
    N_("Eastern European (Latin-2)"),
    N_("Eastern European (Windows-1250)"),
    /* 3 */
    N_("Esperanto (Latin-3)"),
    /* 4 */
    N_("Nordic (Latin-6)"), /* Latin 6 supersedes Latin 4 */
    /* 5 */
    N_("Cyrillic (Windows-1251)"), /* ISO 8859-5 is not practically used */
    N_("Russian (KOI8-R)"),
    N_("Ukrainian (KOI8-U)"),
    /* 6 */
    N_("Arabic (ISO 8859-6)"),
    N_("Arabic (Windows-1256)"),
    /* 7 */
    N_("Greek (ISO 8859-7)"),
    N_("Greek (Windows-1253)"),
    /* 8 */
    N_("Hebrew (ISO 8859-8)"),
    N_("Hebrew (Windows-1255)"),
    /* 9 */
    N_("Turkish (ISO 8859-9)"),
    N_("Turkish (Windows-1254)"),
    /* 10 -> 4 */
    /* 11 */
    N_("Thai (TIS 620-2533/ISO 8859-11)"),
    N_("Thai (Windows-874)"),
    /* 13 */
    N_("Baltic (Latin-7)"),
    N_("Baltic (Windows-1257)"),
    /* 12 -> /dev/null */
    /* 14 */
    N_("Celtic (Latin-8)"),
    /* 15 -> 1 */
    /* 16 */
    N_("South-Eastern European (Latin-10)"),
  /* CJK families */
    N_("Simplified Chinese (ISO-2022-CN-EXT)"),
    N_("Simplified Chinese Unix (EUC-CN)"),
    N_("Japanese (7-bits JIS/ISO-2022-JP-2)"),
    N_("Japanese Unix (EUC-JP)"),
    N_("Japanese (Shift JIS)"),
    N_("Korean (EUC-KR/CP949)"),
    N_("Korean (ISO-2022-KR)"),
    N_("Traditional Chinese (Big5)"),
    N_("Traditional Chinese Unix (EUC-TW)"),
    N_("Hong-Kong Supplementary (HKSCS)"),
  /* Other */
    N_("Vietnamese (VISCII)"),
    N_("Vietnamese (Windows-1258)"),
};
164 
/* Values and matching labels for the "subsdec-align" option (index-aligned).
 * -1 ("Auto") leaves the choice to ParseText(), which then uses the inline
 * alignment found in the subtitle or falls back to bottom/centered. */
static const int  pi_justification[] = { -1, 0, 1, 2 };
static const char *const ppsz_justification_text[] = {
    N_("Auto"),N_("Center"),N_("Left"),N_("Right")
};
169 
/* Short labels and long help texts of the three module options */
#define ENCODING_TEXT N_("Subtitle text encoding")
#define ENCODING_LONGTEXT N_("Set the encoding used in text subtitles")
#define ALIGN_TEXT N_("Subtitle justification")
#define ALIGN_LONGTEXT N_("Set the justification of subtitles")
#define AUTODETECT_UTF8_TEXT N_("UTF-8 subtitle autodetection")
#define AUTODETECT_UTF8_LONGTEXT N_("This enables automatic detection of " \
            "UTF-8 encoding within subtitle files.")

/* Module entry points, referenced by set_callbacks() below */
static int  OpenDecoder   ( vlc_object_t * );
static void CloseDecoder  ( vlc_object_t * );

/* Module descriptor: declares an "spu decoder" capability (score 50) and
 * the options read back in OpenDecoder():
 *   subsdec-align           integer, default -1, one of pi_justification
 *   subsdec-encoding        string, default "", one of ppsz_encodings
 *   subsdec-autodetect-utf8 boolean, default true */
vlc_module_begin ()
    set_shortname( N_("Subtitles"))
    set_description( N_("Text subtitle decoder") )
    set_capability( "spu decoder", 50 )
    set_callbacks( OpenDecoder, CloseDecoder )
    set_category( CAT_INPUT )
    set_subcategory( SUBCAT_INPUT_SCODEC )

    add_integer( "subsdec-align", -1, ALIGN_TEXT, ALIGN_LONGTEXT,
                 false )
        change_integer_list( pi_justification, ppsz_justification_text )
    add_string( "subsdec-encoding", "",
                ENCODING_TEXT, ENCODING_LONGTEXT, false )
        change_string_list( ppsz_encodings, ppsz_encoding_names )
    add_bool( "subsdec-autodetect-utf8", true,
              AUTODETECT_UTF8_TEXT, AUTODETECT_UTF8_LONGTEXT, false )
vlc_module_end ()
198 
199 /*****************************************************************************
200  * Local prototypes
201  *****************************************************************************/
/* Numeric character reference for U+00A0 (no-break space) */
#define NO_BREAKING_SPACE  "&#160;"

/* Private state of one decoder instance, allocated in OpenDecoder() and
 * released in CloseDecoder(). */
struct decoder_sys_t
{
    int                 i_align;          /* Subtitles alignment on the vout */

    /* (vlc_iconv_t)-1 when no conversion is set up (UTF-8 input, or
     * vlc_iconv_open() failed) */
    vlc_iconv_t         iconv_handle;            /* handle to iconv instance */
    /* While true, ParseText() keeps blocks that validate as UTF-8 unconverted;
     * it is cleared for good on the first block that is not valid UTF-8 */
    bool                b_autodetect_utf8;
};


/* Decoder callback and its parsing helpers, defined further down */
static int             DecodeBlock   ( decoder_t *, block_t * );
static subpicture_t   *ParseText     ( decoder_t *, block_t * );
static text_segment_t *ParseSubtitles(int *pi_align, const char * );
216 
217 /*****************************************************************************
218  * OpenDecoder: probe the decoder and return score
219  *****************************************************************************
220  * Tries to launch a decoder and return score so that the interface is able
221  * to chose.
222  *****************************************************************************/
OpenDecoder(vlc_object_t * p_this)223 static int OpenDecoder( vlc_object_t *p_this )
224 {
225     decoder_t     *p_dec = (decoder_t*)p_this;
226     decoder_sys_t *p_sys;
227 
228     switch( p_dec->fmt_in.i_codec )
229     {
230         case VLC_CODEC_SUBT:
231         case VLC_CODEC_ITU_T140:
232             break;
233         default:
234             return VLC_EGENERIC;
235     }
236 
237     /* Allocate the memory needed to store the decoder's structure */
238     p_dec->p_sys = p_sys = calloc( 1, sizeof( *p_sys ) );
239     if( p_sys == NULL )
240         return VLC_ENOMEM;
241 
242     p_dec->pf_decode = DecodeBlock;
243     p_dec->fmt_out.i_codec = 0;
244 
245     /* init of p_sys */
246     p_sys->i_align = -1;
247     p_sys->iconv_handle = (vlc_iconv_t)-1;
248     p_sys->b_autodetect_utf8 = false;
249 
250     const char *encoding;
251     char *var = NULL;
252 
253     /* First try demux-specified encoding */
254     if( p_dec->fmt_in.i_codec == VLC_CODEC_ITU_T140 )
255         encoding = "UTF-8"; /* IUT T.140 is always using UTF-8 */
256     else
257     if( p_dec->fmt_in.subs.psz_encoding && *p_dec->fmt_in.subs.psz_encoding )
258     {
259         encoding = p_dec->fmt_in.subs.psz_encoding;
260         msg_Dbg (p_dec, "trying demuxer-specified character encoding: %s",
261                  encoding);
262     }
263     else
264     {
265         /* Second, try configured encoding */
266         if ((var = var_InheritString (p_dec, "subsdec-encoding")) != NULL)
267         {
268             msg_Dbg (p_dec, "trying configured character encoding: %s", var);
269             if (!strcmp (var, "system"))
270             {
271                 free (var);
272                 var = NULL;
273                 encoding = "";
274                 /* ^ iconv() treats "" as nl_langinfo(CODESET) */
275             }
276             else
277                 encoding = var;
278         }
279         else
280         /* Third, try "local" encoding */
281         {
282         /* xgettext:
283            The Windows ANSI code page most commonly used for this language.
284            VLC uses this as a guess of the subtitle files character set
285            (if UTF-8 and UTF-16 autodetection fails).
286            Western European languages normally use "CP1252", which is a
287            Microsoft-variant of ISO 8859-1. That suits the Latin alphabet.
288            Other scripts use other code pages.
289 
290            This MUST be a valid iconv character set. If unsure, please refer
291            the VideoLAN translators mailing list. */
292             encoding = vlc_pgettext("GetACP", "CP1252");
293             msg_Dbg (p_dec, "trying default character encoding: %s", encoding);
294         }
295 
296         /* Check UTF-8 autodetection */
297         if (var_InheritBool (p_dec, "subsdec-autodetect-utf8"))
298         {
299             msg_Dbg (p_dec, "using automatic UTF-8 detection");
300             p_sys->b_autodetect_utf8 = true;
301         }
302     }
303 
304     if (strcasecmp (encoding, "UTF-8") && strcasecmp (encoding, "utf8"))
305     {
306         p_sys->iconv_handle = vlc_iconv_open ("UTF-8", encoding);
307         if (p_sys->iconv_handle == (vlc_iconv_t)(-1))
308             msg_Err (p_dec, "cannot convert from %s: %s", encoding,
309                      vlc_strerror_c(errno));
310     }
311     free (var);
312 
313     p_sys->i_align = var_InheritInteger( p_dec, "subsdec-align" );
314 
315     return VLC_SUCCESS;
316 }
317 
318 /****************************************************************************
319  * DecodeBlock: the whole thing
320  ****************************************************************************
321  * This function must be fed with complete subtitles units.
322  ****************************************************************************/
DecodeBlock(decoder_t * p_dec,block_t * p_block)323 static int DecodeBlock( decoder_t *p_dec, block_t *p_block )
324 {
325     subpicture_t *p_spu;
326 
327     if( p_block == NULL ) /* No Drain */
328         return VLCDEC_SUCCESS;
329 
330     if( p_block->i_flags & BLOCK_FLAG_CORRUPTED )
331     {
332         block_Release( p_block );
333         return VLCDEC_SUCCESS;
334     }
335 
336     p_spu = ParseText( p_dec, p_block );
337 
338     block_Release( p_block );
339     if( p_spu != NULL )
340         decoder_QueueSub( p_dec, p_spu );
341     return VLCDEC_SUCCESS;
342 }
343 
344 /*****************************************************************************
345  * CloseDecoder: clean up the decoder
346  *****************************************************************************/
CloseDecoder(vlc_object_t * p_this)347 static void CloseDecoder( vlc_object_t *p_this )
348 {
349     decoder_t *p_dec = (decoder_t *)p_this;
350     decoder_sys_t *p_sys = p_dec->p_sys;
351 
352     if( p_sys->iconv_handle != (vlc_iconv_t)-1 )
353         vlc_iconv_close( p_sys->iconv_handle );
354 
355     free( p_sys );
356 }
357 
358 /*****************************************************************************
359  * ParseText: parse an text subtitle packet and send it to the video output
360  *****************************************************************************/
ParseText(decoder_t * p_dec,block_t * p_block)361 static subpicture_t *ParseText( decoder_t *p_dec, block_t *p_block )
362 {
363     decoder_sys_t *p_sys = p_dec->p_sys;
364     subpicture_t *p_spu = NULL;
365 
366     if( p_block->i_flags & BLOCK_FLAG_CORRUPTED )
367         return NULL;
368 
369     /* We cannot display a subpicture with no date */
370     if( p_block->i_pts <= VLC_TS_INVALID )
371     {
372         msg_Warn( p_dec, "subtitle without a date" );
373         return NULL;
374     }
375 
376     /* Check validity of packet data */
377     /* An "empty" line containing only \0 can be used to force
378        and ephemer picture from the screen */
379     if( p_block->i_buffer < 1 )
380     {
381         msg_Warn( p_dec, "no subtitle data" );
382         return NULL;
383     }
384 
385     char *psz_subtitle = NULL;
386 
387     /* Should be resiliant against bad subtitles */
388     if( p_sys->iconv_handle == (vlc_iconv_t)-1 ||
389         p_sys->b_autodetect_utf8 )
390     {
391         psz_subtitle = malloc( p_block->i_buffer + 1 );
392         if( psz_subtitle == NULL )
393             return NULL;
394         memcpy( psz_subtitle, p_block->p_buffer, p_block->i_buffer );
395         psz_subtitle[p_block->i_buffer] = '\0';
396     }
397 
398     if( p_sys->iconv_handle == (vlc_iconv_t)-1 )
399     {
400         if (EnsureUTF8( psz_subtitle ) == NULL)
401         {
402             msg_Err( p_dec, "failed to convert subtitle encoding.\n"
403                      "Try manually setting a character-encoding "
404                      "before you open the file." );
405         }
406     }
407     else
408     {
409         if( p_sys->b_autodetect_utf8 )
410         {
411             if( IsUTF8( psz_subtitle ) == NULL )
412             {
413                 msg_Dbg( p_dec, "invalid UTF-8 sequence: "
414                          "disabling UTF-8 subtitles autodetection" );
415                 p_sys->b_autodetect_utf8 = false;
416             }
417         }
418 
419         if( !p_sys->b_autodetect_utf8 )
420         {
421             size_t inbytes_left = p_block->i_buffer;
422             size_t outbytes_left = 6 * inbytes_left;
423             char *psz_new_subtitle = xmalloc( outbytes_left + 1 );
424             char *psz_convert_buffer_out = psz_new_subtitle;
425             const char *psz_convert_buffer_in =
426                     psz_subtitle ? psz_subtitle : (char *)p_block->p_buffer;
427 
428             size_t ret = vlc_iconv( p_sys->iconv_handle,
429                                     &psz_convert_buffer_in, &inbytes_left,
430                                     &psz_convert_buffer_out, &outbytes_left );
431 
432             *psz_convert_buffer_out++ = '\0';
433             free( psz_subtitle );
434 
435             if( ( ret == (size_t)(-1) ) || inbytes_left )
436             {
437                 free( psz_new_subtitle );
438                 msg_Err( p_dec, "failed to convert subtitle encoding.\n"
439                         "Try manually setting a character-encoding "
440                                 "before you open the file." );
441                 return NULL;
442             }
443 
444             psz_subtitle = realloc( psz_new_subtitle,
445                                     psz_convert_buffer_out - psz_new_subtitle );
446             if( !psz_subtitle )
447                 psz_subtitle = psz_new_subtitle;
448         }
449     }
450 
451     /* Create the subpicture unit */
452     p_spu = decoder_NewSubpictureText( p_dec );
453     if( !p_spu )
454     {
455         free( psz_subtitle );
456         return NULL;
457     }
458     p_spu->i_start    = p_block->i_pts;
459     p_spu->i_stop     = p_block->i_pts + p_block->i_length;
460     p_spu->b_ephemer  = (p_block->i_length == 0);
461     p_spu->b_absolute = false;
462 
463     subpicture_updater_sys_t *p_spu_sys = p_spu->updater.p_sys;
464 
465     int i_inline_align = -1;
466     p_spu_sys->region.p_segments = ParseSubtitles( &i_inline_align, psz_subtitle );
467     free( psz_subtitle );
468     if( p_sys->i_align >= 0 ) /* bottom ; left, right or centered */
469     {
470         p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM | p_sys->i_align;
471         p_spu_sys->region.inner_align = p_sys->i_align;
472     }
473     else if( i_inline_align >= 0 )
474     {
475         p_spu_sys->region.align = i_inline_align;
476         p_spu_sys->region.inner_align = i_inline_align;
477     }
478     else /* default, bottom ; centered */
479     {
480         p_spu_sys->region.align = SUBPICTURE_ALIGN_BOTTOM;
481         p_spu_sys->region.inner_align = 0;
482     }
483 
484     return p_spu;
485 }
486 
// Appends the character c to the text of p_segment (a segment without text
// is treated as holding an empty string).
// Returns false, leaving the segment untouched, if the new string could not
// be allocated.
static bool AppendCharacter( text_segment_t* p_segment, char c )
{
    const char *psz_current = p_segment->psz_text;
    if ( psz_current == NULL )
        psz_current = "";

    char *psz_joined;
    const bool b_ok = asprintf( &psz_joined, "%s%c", psz_current, c ) >= 0;
    if ( b_ok )
    {
        free( p_segment->psz_text );
        p_segment->psz_text = psz_joined;
    }
    return b_ok;
}
496 
// Appends psz_str to the text of p_segment (a segment without text is
// treated as holding an empty string).
// Returns false, leaving the segment untouched, if the new string could not
// be allocated.
static bool AppendString( text_segment_t* p_segment, const char* psz_str )
{
    const char *psz_current = p_segment->psz_text;
    if ( psz_current == NULL )
        psz_current = "";

    char *psz_joined;
    const bool b_ok = asprintf( &psz_joined, "%s%s", psz_current, psz_str ) >= 0;
    if ( b_ok )
    {
        free( p_segment->psz_text );
        p_segment->psz_text = psz_joined;
    }
    return b_ok;
}
506 
// Parses one attribute of the form  name[ = value]  inside a tag.
//
// *ppsz_subtitle is the read cursor. On success it is moved past what was
// consumed; when NULL is returned it is left where it was.
//
// The name is a letter followed by letters or '-', so that hyphenated names
// such as "outline-color" are returned whole. The value may be unquoted
// (it then ends at whitespace or '>') or enclosed in single/double quotes.
//
// Returns the attribute name (to be freed by the caller), or NULL if there
// is no attribute at the cursor, if the name runs up to the end of the
// string, or if an allocation failed.
// *ppsz_attribute_value receives the value (to be freed by the caller), or
// NULL when the attribute has no value or an empty one.
static char* ConsumeAttribute( const char** ppsz_subtitle, char** ppsz_attribute_value )
{
    const char *psz_subtitle = *ppsz_subtitle;
    *ppsz_attribute_value = NULL;

    while ( *psz_subtitle == ' ' )
        psz_subtitle++;

    // <ctype.h> functions need an unsigned char value: subtitle bytes above
    // 0x7F would otherwise be passed as negative ints, which is undefined.
    size_t attr_len = 0;
    while ( isalpha( (unsigned char)*psz_subtitle ) ||
            ( attr_len > 0 && *psz_subtitle == '-' ) )
    {
        psz_subtitle++;
        attr_len++;
    }
    if ( *psz_subtitle == '\0' || attr_len == 0 )
        return NULL;

    char *psz_attribute_name = malloc( attr_len + 1 );
    if ( psz_attribute_name == NULL )
        return NULL;
    memcpy( psz_attribute_name, psz_subtitle - attr_len, attr_len );
    psz_attribute_name[attr_len] = '\0';

    // Look for the '=' sign, but never beyond the end of the tag: a
    // value-less attribute must not swallow the text that follows '>'.
    while ( *psz_subtitle && *psz_subtitle != '=' && *psz_subtitle != '>' )
        psz_subtitle++;
    if ( *psz_subtitle != '=' )
    {
        *ppsz_subtitle = psz_subtitle;
        return psz_attribute_name;
    }
    // Skip the '=' sign
    psz_subtitle++;

    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    // Acknowledge the delimiter, if any, and step over it
    char delimiter = 0;
    if ( *psz_subtitle == '\'' || *psz_subtitle == '"' )
        delimiter = *psz_subtitle++;

    // Skip spaces, just in case
    while ( isspace( (unsigned char)*psz_subtitle ) )
        psz_subtitle++;

    attr_len = 0;
    while ( *psz_subtitle != '\0' )
    {
        if ( delimiter != 0 )
        {
            if ( *psz_subtitle == delimiter )
                break;
        }
        else if ( isspace( (unsigned char)*psz_subtitle ) || *psz_subtitle == '>' )
            break;
        psz_subtitle++;
        attr_len++;
    }

    if ( attr_len > 0 )
    {
        char *psz_value = malloc( attr_len + 1 );
        if ( psz_value == NULL )
        {
            free( psz_attribute_name );
            return NULL;
        }
        memcpy( psz_value, psz_subtitle - attr_len, attr_len );
        psz_value[attr_len] = '\0';
        *ppsz_attribute_value = psz_value;
    }

    // Step over the closing delimiter, also after an empty value, so that
    // the next call starts on the following attribute.
    if ( delimiter != 0 && *psz_subtitle == delimiter )
        psz_subtitle++;
    *ppsz_subtitle = psz_subtitle;
    return psz_attribute_name;
}
585 
// Returns the name of the tag starting at *ppsz_subtitle and moves the cursor
// to just after that name, or returns NULL and doesn't advance if the angle
// bracket was not a tag opening (or if the allocation failed).
// A tag name is a letter followed by letters, digits or '_'. Spaces between
// '<' and the name are skipped; a leading '/' is accepted only if b_closing.
// For instance, if psz_subtitle == "<some_tag attribute=value>"
// GetTag will return "some_tag", and will advance up to the space that
// precedes "attribute".
// The returned value must be freed.
static char* GetTag( const char** ppsz_subtitle, bool b_closing )
{
    const char *psz_subtitle = *ppsz_subtitle;
    if ( *psz_subtitle != '<' )
        return NULL;
    // Skip the '<'
    psz_subtitle++;
    if ( b_closing && *psz_subtitle == '/' )
        psz_subtitle++;
    // Skip potential spaces
    while ( *psz_subtitle == ' ' )
        psz_subtitle++;

    // Now we need to verify if what comes next is a valid tag.
    // <ctype.h> functions need an unsigned char value: subtitle bytes above
    // 0x7F would otherwise be passed as negative ints, which is undefined.
    if ( !isalpha( (unsigned char)*psz_subtitle ) )
        return NULL;
    size_t tag_size = 1;
    while ( isalnum( (unsigned char)psz_subtitle[tag_size] ) ||
            psz_subtitle[tag_size] == '_' )
        tag_size++;

    char *psz_tagname = malloc( tag_size + 1 );
    if ( psz_tagname == NULL )
        return NULL;
    memcpy( psz_tagname, psz_subtitle, tag_size );
    psz_tagname[tag_size] = '\0';

    *ppsz_subtitle = psz_subtitle + tag_size;
    return psz_tagname;
}
618 
// Tells whether psz_subtitle contains a closing tag for psz_tagname, that is
// "</" + optional spaces + the tag name (compared case-insensitively) +
// optional spaces + ">".
// Every "</" in the string is examined, so an earlier occurrence of the tag
// name inside regular text or inside another tag does not hide a later
// closing tag. The scan only moves forward and never reads before the start
// of psz_subtitle.
// An empty tag name is never considered closed.
static bool IsClosed( const char* psz_subtitle, const char* psz_tagname )
{
    const size_t i_taglen = strlen( psz_tagname );
    if ( i_taglen == 0 )
        return false;

    for ( const char *psz_open = strstr( psz_subtitle, "</" );
          psz_open != NULL;
          psz_open = strstr( psz_open + 2, "</" ) )
    {
        const char *psz_cur = psz_open + 2;
        while ( *psz_cur == ' ' )
            psz_cur++;
        // strncasecmp() stops at the terminating NUL, so a short tail is safe
        if ( strncasecmp( psz_cur, psz_tagname, i_taglen ) != 0 )
            continue;
        psz_cur += i_taglen;
        while ( *psz_cur == ' ' )
            psz_cur++;
        if ( *psz_cur == '>' )
            return true;
    }
    return false;
}
640 
/* One entry of the singly linked stack of tag names that AppendTag() fills
 * and HasTag() searches. The entry owns psz_tagname: HasTag() frees it
 * together with the entry. */
typedef struct tag_stack
{
    char             *psz_tagname;  /* heap-allocated tag name */
    struct tag_stack *p_next;       /* next entry, NULL at the bottom */
} tag_stack_t;
647 
// Pushes psz_tagname on top of *pp_stack. The string itself is stored, not a
// copy, and is freed by HasTag() when the entry is removed.
// If the entry cannot be allocated, the stack is left unchanged and
// psz_tagname is not freed here.
// NOTE(review): confirm that the caller does not leak the name in that case.
static void AppendTag( tag_stack_t **pp_stack, char* psz_tagname )
{
    tag_stack_t *p_node = malloc( sizeof( *p_node ) );
    if ( unlikely( p_node == NULL ) )
        return;

    p_node->psz_tagname = psz_tagname;
    p_node->p_next = *pp_stack;
    *pp_stack = p_node;
}
657 
// Looks for psz_tagname (case-insensitively) in *pp_stack, starting from the
// top. The first matching entry is unlinked and freed together with its
// name, and true is returned. Returns false, leaving the stack untouched,
// when no entry matches.
static bool HasTag( tag_stack_t **pp_stack, const char* psz_tagname )
{
    // pp_link always designates the pointer that leads to the entry under
    // test, so unlinking is the same operation at the top and further down.
    for ( tag_stack_t **pp_link = pp_stack; *pp_link != NULL;
          pp_link = &(*pp_link)->p_next )
    {
        tag_stack_t *p_entry = *pp_link;
        if ( strcasecmp( psz_tagname, p_entry->psz_tagname ) != 0 )
            continue;

        *pp_link = p_entry->p_next;
        free( p_entry->psz_tagname );
        free( p_entry );
        return true;
    }
    return false;
}
681 
682 /*
683  * mini style stack implementation
684  */
685 typedef struct style_stack style_stack_t;
686 struct  style_stack
687 {
688     text_style_t* p_style;
689     style_stack_t* p_next;
690 };
691 
DuplicateAndPushStyle(style_stack_t ** pp_stack)692 static text_style_t* DuplicateAndPushStyle(style_stack_t** pp_stack)
693 {
694     text_style_t* p_dup = ( *pp_stack ) ? text_style_Duplicate( (*pp_stack)->p_style ) : text_style_Create( STYLE_NO_DEFAULTS );
695     if ( unlikely( !p_dup ) )
696         return NULL;
697     style_stack_t* p_entry = malloc( sizeof( *p_entry ) );
698     if ( unlikely( !p_entry ) )
699     {
700         text_style_Delete( p_dup );
701         return NULL;
702     }
703     // Give the style ownership to the segment.
704     p_entry->p_style = p_dup;
705     p_entry->p_next = *pp_stack;
706     *pp_stack = p_entry;
707     return p_dup;
708 }
709 
// Removes the top entry of *pp_stack, if there is one.
// Only the entry is freed: the style it references is owned by a
// text_segment_t by now.
static void PopStyle(style_stack_t** pp_stack)
{
    style_stack_t *p_top = *pp_stack;
    if ( p_top != NULL )
    {
        *pp_stack = p_top->p_next;
        free( p_top );
    }
}
719 
// Opens a new nesting level: creates an empty text segment, gives it the
// style returned by DuplicateAndPushStyle() and chains it after p_segment.
// Returns the new segment, or NULL if it could not be allocated (p_segment
// and the stack are then untouched).
// The segment's style is NULL if DuplicateAndPushStyle() failed.
static text_segment_t* NewTextSegmentPushStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t *p_created = text_segment_New( NULL );
    if ( unlikely( !p_created ) )
        return NULL;

    p_created->style = DuplicateAndPushStyle( pp_stack );
    p_segment->p_next = p_created;
    return p_created;
}
730 
// Closes the current nesting level: creates an empty text segment chained
// after p_segment, pops the top of the style stack and gives the segment a
// copy of the style that is now on top (or a fresh STYLE_NO_DEFAULTS style
// if the stack became, or already was, empty).
// Returns the new segment, or NULL if it could not be allocated (p_segment
// and the stack are then untouched).
static text_segment_t* NewTextSegmentPopStyle( text_segment_t* p_segment, style_stack_t** pp_stack )
{
    text_segment_t *p_created = text_segment_New( NULL );
    if ( unlikely( !p_created ) )
        return NULL;

    // A closing tag normally finds a non-empty stack, but a broken subtitle
    // file may close more tags than it opened; PopStyle() copes with that.
    PopStyle( pp_stack );

    style_stack_t *p_top = *pp_stack;
    if ( p_top != NULL )
        p_created->style = text_style_Duplicate( p_top->p_style );
    else
        p_created->style = text_style_Create( STYLE_NO_DEFAULTS );

    p_segment->p_next = p_created;
    return p_created;
}
744 
ParseSubtitles(int * pi_align,const char * psz_subtitle)745 static text_segment_t* ParseSubtitles( int *pi_align, const char *psz_subtitle )
746 {
747     text_segment_t* p_segment;
748     text_segment_t* p_first_segment;
749     style_stack_t* p_stack = NULL;
750     tag_stack_t* p_tag_stack = NULL;
751 
752     //FIXME: Remove initial allocation? Might make the below code more complicated
753     p_first_segment = p_segment = text_segment_New( "" );
754 
755     *pi_align = -1;
756 
757     /* */
758     while( *psz_subtitle )
759     {
760         /* HTML extensions */
761         if( *psz_subtitle == '<' )
762         {
763             char *psz_tagname = GetTag( &psz_subtitle, false );
764             if ( psz_tagname != NULL )
765             {
766                 if( !strcasecmp( psz_tagname, "br" ) )
767                 {
768                     if ( !AppendCharacter( p_segment, '\n' ) )
769                     {
770                         free( psz_tagname );
771                         goto fail;
772                     }
773                 }
774                 else if( !strcasecmp( psz_tagname, "b" ) )
775                 {
776                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
777                     p_segment->style->i_style_flags |= STYLE_BOLD;
778                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
779                 }
780                 else if( !strcasecmp( psz_tagname, "i" ) )
781                 {
782                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
783                     p_segment->style->i_style_flags |= STYLE_ITALIC;
784                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
785                 }
786                 else if( !strcasecmp( psz_tagname, "u" ) )
787                 {
788                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
789                     p_segment->style->i_style_flags |= STYLE_UNDERLINE;
790                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
791                 }
792                 else if( !strcasecmp( psz_tagname, "s" ) )
793                 {
794                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
795                     p_segment->style->i_style_flags |= STYLE_STRIKEOUT;
796                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
797                 }
798                 else if( !strcasecmp( psz_tagname, "font" ) )
799                 {
800                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
801 
802                     char* psz_attribute_name;
803                     char* psz_attribute_value;
804 
805                     while( ( psz_attribute_name = ConsumeAttribute( &psz_subtitle, &psz_attribute_value ) ) )
806                     {
807                         if ( !psz_attribute_value )
808                         {
809                             free( psz_attribute_name );
810                             continue;
811                         }
812                         if ( !strcasecmp( psz_attribute_name, "face" ) )
813                         {
814                             free(p_segment->style->psz_fontname);
815                             p_segment->style->psz_fontname = psz_attribute_value;
816                             // We don't want to free the attribute value since it has become our fontname
817                             psz_attribute_value = NULL;
818                         }
819                         else if ( !strcasecmp( psz_attribute_name, "family" ) )
820                         {
821                             free(p_segment->style->psz_monofontname);
822                             p_segment->style->psz_monofontname = psz_attribute_value;
823                             psz_attribute_value = NULL;
824                         }
825                         else if ( !strcasecmp( psz_attribute_name, "size" ) )
826                         {
827                             int size = atoi( psz_attribute_value );
828                             if( size )
829                             {
830                                 p_segment->style->i_font_size = size;
831                                 p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
832                                         STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
833                             }
834                         }
835                         else if ( !strcasecmp( psz_attribute_name, "color" ) )
836                         {
837                             p_segment->style->i_font_color = vlc_html_color( psz_attribute_value, NULL );
838                             p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
839                         }
840                         else if ( !strcasecmp( psz_attribute_name, "outline-color" ) )
841                         {
842                             p_segment->style->i_outline_color = vlc_html_color( psz_attribute_value, NULL );
843                             p_segment->style->i_features |= STYLE_HAS_OUTLINE_COLOR;
844                         }
845                         else if ( !strcasecmp( psz_attribute_name, "shadow-color" ) )
846                         {
847                             p_segment->style->i_shadow_color = vlc_html_color( psz_attribute_value, NULL );
848                             p_segment->style->i_features |= STYLE_HAS_SHADOW_COLOR;
849                         }
850                         else if ( !strcasecmp( psz_attribute_name, "outline-level" ) )
851                         {
852                             p_segment->style->i_outline_width = atoi( psz_attribute_value );
853                         }
854                         else if ( !strcasecmp( psz_attribute_name, "shadow-level" ) )
855                         {
856                             p_segment->style->i_shadow_width = atoi( psz_attribute_value );
857                         }
858                         else if ( !strcasecmp( psz_attribute_name, "back-color" ) )
859                         {
860                             p_segment->style->i_background_color = vlc_html_color( psz_attribute_value, NULL );
861                             p_segment->style->i_features |= STYLE_HAS_BACKGROUND_COLOR;
862                         }
863                         else if ( !strcasecmp( psz_attribute_name, "alpha" ) )
864                         {
865                             p_segment->style->i_font_alpha = atoi( psz_attribute_value );
866                             p_segment->style->i_features |= STYLE_HAS_FONT_ALPHA;
867                         }
868 
869                         free( psz_attribute_name );
870                         free( psz_attribute_value );
871                     }
872                 }
873                 else
874                 {
875                     // This is an unknown tag. We need to hide it if it's properly closed, and display it otherwise
876                     if ( !IsClosed( psz_subtitle, psz_tagname ) )
877                     {
878                         AppendCharacter( p_segment, '<' );
879                         AppendString( p_segment, psz_tagname );
880                         AppendCharacter( p_segment, '>' );
881                     }
882                     else
883                     {
884                         AppendTag( &p_tag_stack, psz_tagname );
885                         // We don't want to free the tagname now, it will be freed when the tag
                        // gets popped from the stack.
887                         psz_tagname = NULL;
888                     }
889                     // In any case, fall through and skip to the closing tag.
890                 }
891                 // Skip potential spaces & end tag
892                 while ( *psz_subtitle && *psz_subtitle != '>' )
893                     psz_subtitle++;
894                 if ( *psz_subtitle == '>' )
895                     psz_subtitle++;
896 
897                 free( psz_tagname );
898             }
899             else if( !strncmp( psz_subtitle, "</", 2 ))
900             {
901                 char* psz_tagname = GetTag( &psz_subtitle, true );
902                 if ( psz_tagname != NULL )
903                 {
904                     if ( !strcasecmp( psz_tagname, "b" ) ||
905                          !strcasecmp( psz_tagname, "i" ) ||
906                          !strcasecmp( psz_tagname, "u" ) ||
907                          !strcasecmp( psz_tagname, "s" ) ||
908                          !strcasecmp( psz_tagname, "font" ) )
909                     {
910                         // A closing tag for one of the tags we handle, meaning
911                         // we pushed a style onto the stack earlier
912                         p_segment = NewTextSegmentPopStyle( p_segment, &p_stack );
913                     }
914                     else
915                     {
916                         // Unknown closing tag. If it is closing an unknown tag, ignore it. Otherwise, display it
917                         if ( !HasTag( &p_tag_stack, psz_tagname ) )
918                         {
919                             AppendString( p_segment, "</" );
920                             AppendString( p_segment, psz_tagname );
921                             AppendCharacter( p_segment, '>' );
922                         }
923                     }
924                     while ( *psz_subtitle == ' ' )
925                         psz_subtitle++;
926                     if ( *psz_subtitle == '>' )
927                         psz_subtitle++;
928                     free( psz_tagname );
929                 }
930                 else
931                 {
932                     /**
933                       * This doesn't appear to be a valid tag closing syntax.
934                       * Simply append the text
935                       */
936                     AppendString( p_segment, "</" );
937                     psz_subtitle += 2;
938                 }
939             }
940             else
941             {
942                 /* We have an unknown tag, just append it, and move on.
943                  * The rest of the string won't be recognized as a tag, and
944                  * we will ignore unknown closing tag
945                  */
946                 AppendCharacter( p_segment, '<' );
947                 psz_subtitle++;
948             }
949         }
950         /* SSA extensions */
951         else if( psz_subtitle[0] == '{' && psz_subtitle[1] == '\\' &&
952                  strchr( psz_subtitle, '}' ) )
953         {
954             /* Check for forced alignment */
955             if( *pi_align < 0 &&
956                 !strncmp( psz_subtitle, "{\\an", 4 ) && psz_subtitle[4] >= '1' && psz_subtitle[4] <= '9' && psz_subtitle[5] == '}' )
957             {
958                 static const int pi_vertical[3] = { SUBPICTURE_ALIGN_BOTTOM, 0, SUBPICTURE_ALIGN_TOP };
959                 static const int pi_horizontal[3] = { SUBPICTURE_ALIGN_LEFT, 0, SUBPICTURE_ALIGN_RIGHT };
960                 const int i_id = psz_subtitle[4] - '1';
961 
962                 *pi_align = pi_vertical[i_id/3] | pi_horizontal[i_id%3];
963             }
964             /* TODO fr -> rotation */
965 
966             /* Hide {\stupidity} */
967             psz_subtitle = strchr( psz_subtitle, '}' ) + 1;
968         }
969         /* MicroDVD extensions */
970         /* FIXME:
         *  - Currently, we don't distinguish between X and x, and we should:
         *    a capital letter applies to the whole text and not just one line
973          *  - We don't support Position and Coordinates
974          *  - We don't support the DEFAULT flag (HEADER)
975          */
976 
977         else if( psz_subtitle[0] == '{' && psz_subtitle[1] != 0 &&
978                  psz_subtitle[2] == ':' && strchr( &psz_subtitle[2], '}' ) )
979         {
980             const char *psz_tag_end = strchr( &psz_subtitle[2], '}' );
981             size_t i_len = psz_tag_end - &psz_subtitle[3];
982 
983             if( psz_subtitle[1] == 'Y' || psz_subtitle[1] == 'y' )
984             {
985                 if( psz_subtitle[3] == 'i' )
986                 {
987                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
988                     p_segment->style->i_style_flags |= STYLE_ITALIC;
989                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
990                     psz_subtitle++;
991                 }
992                 if( psz_subtitle[3] == 'b' )
993                 {
994                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
995                     p_segment->style->i_style_flags |= STYLE_BOLD;
996                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
997                     psz_subtitle++;
998                 }
999                 if( psz_subtitle[3] == 'u' )
1000                 {
1001                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1002                     p_segment->style->i_style_flags |= STYLE_UNDERLINE;
1003                     p_segment->style->i_features |= STYLE_HAS_FLAGS;
1004                     psz_subtitle++;
1005                 }
1006             }
1007             else if( (psz_subtitle[1] == 'C' || psz_subtitle[1] == 'c' )
1008                     && psz_subtitle[3] == '$' && i_len >= 7 )
1009             {
1010                 /* Yes, they use BBGGRR, instead of RRGGBB */
1011                 char psz_color[7];
1012                 psz_color[0] = psz_subtitle[8]; psz_color[1] = psz_subtitle[9];
1013                 psz_color[2] = psz_subtitle[6]; psz_color[3] = psz_subtitle[7];
1014                 psz_color[4] = psz_subtitle[4]; psz_color[5] = psz_subtitle[5];
1015                 psz_color[6] = '\0';
1016                 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1017                 p_segment->style->i_font_color = vlc_html_color( psz_color, NULL );
1018                 p_segment->style->i_features |= STYLE_HAS_FONT_COLOR;
1019             }
1020             else if( psz_subtitle[1] == 'F' || psz_subtitle[1] == 'f' )
1021             {
1022                 p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1023                 free(p_segment->style->psz_fontname);
1024                 p_segment->style->psz_fontname = strndup( &psz_subtitle[3], i_len );
1025             }
1026             else if( psz_subtitle[1] == 'S' || psz_subtitle[1] == 's' )
1027             {
1028                 int size = atoi( &psz_subtitle[3] );
1029                 if( size )
1030                 {
1031                     p_segment = NewTextSegmentPushStyle( p_segment, &p_stack );
1032                     p_segment->style->i_font_size = size;
1033                     p_segment->style->f_font_relsize = STYLE_DEFAULT_REL_FONT_SIZE *
1034                                 STYLE_DEFAULT_FONT_SIZE / p_segment->style->i_font_size;
1035 
1036                 }
1037             }
1038             /* Currently unsupported since we don't have access to the i_align flag here
1039             else if( psz_subtitle[1] == 'P' )
1040             {
1041                 if( psz_subtitle[3] == "1" )
1042                     i_align = SUBPICTURE_ALIGN_TOP;
1043                 else if( psz_subtitle[3] == "0" )
1044                     i_align = SUBPICTURE_ALIGN_BOTTOM;
1045             } */
1046             // Hide other {x:y} atrocities, notably {o:x}
1047             psz_subtitle = psz_tag_end + 1;
1048         }
1049         else
1050         {
1051             if( *psz_subtitle == '\n' || !strncasecmp( psz_subtitle, "\\n", 2 ) )
1052             {
1053                 if ( !AppendCharacter( p_segment, '\n' ) )
1054                     goto fail;
1055                 if ( *psz_subtitle == '\n' )
1056                     psz_subtitle++;
1057                 else
1058                     psz_subtitle += 2;
1059             }
1060             else if( !strncasecmp( psz_subtitle, "\\h", 2 ) )
1061             {
1062                 if ( !AppendString( p_segment, "\xC2\xA0" ) )
1063                     goto fail;
1064                 psz_subtitle += 2;
1065             }
1066             else
1067             {
                //FIXME: Highly inefficient
1069                 AppendCharacter( p_segment, *psz_subtitle );
1070                 psz_subtitle++;
1071             }
1072         }
1073     }
1074     while ( p_stack )
1075         PopStyle( &p_stack );
1076     while ( p_tag_stack )
1077     {
1078         tag_stack_t *p_tag = p_tag_stack;
1079         p_tag_stack = p_tag_stack->p_next;
1080         free( p_tag->psz_tagname );
1081         free( p_tag );
1082     }
1083 
1084     return p_first_segment;
1085 
1086 fail:
1087     text_segment_ChainDelete( p_first_segment );
1088     return NULL;
1089 }
1090