1 /*
2    mkvmerge -- utility for splicing together matroska files
3    from component media subtypes
4 
5    Distributed under the GPL v2
6    see the file COPYING for details
7    or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8 
9    subtitle helper
10 
11    Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13 
14 #include "common/common_pch.h"
15 
16 #include <QRegularExpression>
17 
18 #include "common/endian.h"
19 #include "common/mime.h"
20 #include "common/mm_proxy_io.h"
21 #include "common/mm_text_io.h"
22 #include "common/qt.h"
23 #include "common/strings/formatting.h"
24 #include "common/strings/parsing.h"
25 #include "common/strings/utf8.h"
26 #include "input/subtitles.h"
27 #include "merge/file_status.h"
28 #include "merge/input_x.h"
29 #include "merge/generic_reader.h"
30 #include "merge/packet_extensions.h"
31 
32 // ------------------------------------------------------------
33 
subtitles_c(std::string const & file_name,int64_t track_id)34 subtitles_c::subtitles_c(std::string const &file_name,
35                          int64_t track_id)
36   : current{entries.end()}
37   , m_cc_utf8{charset_converter_c::init("UTF-8")}
38   , m_invalid_utf8_warned{g_identifying}
39   , m_file_name{file_name}
40   , m_track_id{track_id}
41 {
42 }
43 
44 void
set_charset_converter(charset_converter_cptr const & cc_utf8)45 subtitles_c::set_charset_converter(charset_converter_cptr const &cc_utf8) {
46   if (cc_utf8)
47     m_cc_utf8 = cc_utf8;
48 
49   else {
50     m_cc_utf8  = g_cc_local_utf8;
51     m_try_utf8 = true;
52   }
53 }
54 
55 std::string
recode(std::string const & s,uint32_t replacement_marker)56 subtitles_c::recode(std::string const &s,
57                      uint32_t replacement_marker) {
58   if (m_try_utf8 && !mtx::utf8::is_valid(s))
59     m_try_utf8 = false;
60 
61   auto recoded = m_try_utf8 ? s : m_cc_utf8->utf8(s);
62 
63   if (mtx::utf8::is_valid(recoded))
64     return recoded;
65 
66   if (!m_invalid_utf8_warned) {
67     m_invalid_utf8_warned = true;
68     mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("This text subtitle track contains invalid 8-bit characters outside valid multi-byte UTF-8 sequences. Please specify the correct encoding for this track.\n")));
69   }
70 
71   return mtx::utf8::fix_invalid(recoded, replacement_marker);
72 }
73 
74 void
process(generic_packetizer_c * p)75 subtitles_c::process(generic_packetizer_c *p) {
76   if (empty() || (entries.end() == current))
77     return;
78 
79   packet_cptr packet(new packet_t(memory_c::borrow(current->subs), current->start, current->end - current->start));
80   packet->extensions.push_back(packet_extension_cptr(new subtitle_number_packet_extension_c(current->number)));
81   p->process(packet);
82   ++current;
83 }
84 
85 // ------------------------------------------------------------
86 
// Regular expression fragments for SRT timestamp lines. They are macros (not
// constants) so that adjacent string literals are joined into one pattern at
// compile time.

// One numeric field, optionally preceded by a minus sign.
// Capture groups: 1 = sign, 2 = digits.
#define SRT_RE_VALUE          "\\s*(-?)\\s*(\\d+)"

// hours:minutes:seconds with an optional fraction introduced by ',', '.' or ':'.
// Four SRT_RE_VALUE fields, i.e. eight capture groups (sign/digits pairs).
#define SRT_RE_TIMESTAMP                           \
  SRT_RE_VALUE ":" SRT_RE_VALUE ":" SRT_RE_VALUE   \
  "(?:[,\\.:]" SRT_RE_VALUE ")?"

// A full "start --> end" line: groups 1-8 belong to the start timestamp,
// groups 9-16 to the end timestamp.
#define SRT_RE_TIMESTAMP_LINE                      \
  "^" SRT_RE_TIMESTAMP                             \
  "\\s*[\\-\\s]+>\\s*"                             \
  SRT_RE_TIMESTAMP "\\s*"

// Four X/Y coordinate pairs at the end of a line.
#define SRT_RE_COORDINATES    "([XY]\\d+:\\d+\\s*){4}\\s*$"
91 
92 bool
probe(mm_text_io_c & io)93 srt_parser_c::probe(mm_text_io_c &io) {
94   try {
95     io.setFilePointer(0);
96     std::string s;
97     do {
98       s = io.getline(10);
99       mtx::string::strip(s);
100     } while (s.empty());
101 
102     int64_t dummy;
103     if (!mtx::string::parse_number(s, dummy))
104       return false;
105 
106     s = io.getline(100);
107     QRegularExpression timestamp_re{SRT_RE_TIMESTAMP_LINE};
108     if (!Q(s).contains(timestamp_re))
109       return false;
110 
111     s = io.getline();
112     io.setFilePointer(0);
113 
114   } catch (...) {
115     return false;
116   }
117 
118   return true;
119 }
120 
srt_parser_c(mm_text_io_cptr const & io,const std::string & file_name,int64_t track_id)121 srt_parser_c::srt_parser_c(mm_text_io_cptr const &io,
122                            const std::string &file_name,
123                            int64_t track_id)
124   : subtitles_c{file_name, track_id}
125   , m_io(io)
126   , m_coordinates_warning_shown(false)
127 {
128 }
129 
130 void
parse()131 srt_parser_c::parse() {
132   QRegularExpression timestamp_re{SRT_RE_TIMESTAMP_LINE};
133   QRegularExpression number_re{"^\\d+$"};
134   QRegularExpression coordinates_re{SRT_RE_COORDINATES};
135 
136   int64_t start                  = 0;
137   int64_t end                    = 0;
138   int64_t previous_start         = 0;
139   bool timestamp_warning_printed = false;
140   parser_state_e state           = STATE_INITIAL;
141   int line_number                = 0;
142   unsigned int subtitle_number   = 0;
143   unsigned int timestamp_number  = 0;
144   std::string subtitles;
145 
146   m_io->setFilePointer(0);
147 
148   while (1) {
149     std::string s;
150     if (!m_io->getline2(s))
151       break;
152 
153     s = recode(s);
154 
155     line_number++;
156     mtx::string::strip_back(s);
157 
158     mxdebug_if(m_debug, fmt::format("line {0} state {1} content »{2}«\n", line_number, state == STATE_INITIAL ? "initial" : state == STATE_TIME ? "time" : state == STATE_SUBS ? "subs" : "subs-or-number", s));
159 
160     if (s.empty()) {
161       if ((STATE_INITIAL == state) || (STATE_TIME == state))
162         continue;
163 
164       state = STATE_SUBS_OR_NUMBER;
165 
166       if (!subtitles.empty())
167         subtitles += "\n";
168 
169       continue;
170     }
171 
172     if (STATE_INITIAL == state) {
173       if (!Q(s).contains(number_re)) {
174         mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Error in line {0}: expected subtitle number and found some text.\n"), line_number));
175         break;
176       }
177       state = STATE_TIME;
178       mtx::string::parse_number(s, subtitle_number);
179 
180     } else if (STATE_TIME == state) {
181       auto matches = timestamp_re.match(Q(s));
182       if (!matches.hasMatch()) {
183         mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Error in line {0}: expected a SRT timestamp line but found something else. Aborting this file.\n"), line_number));
184         break;
185       }
186 
187       int64_t s_h = 0, s_min = 0, s_sec = 0, s_ns = 0, e_h = 0, e_min = 0, e_sec = 0, e_ns = 0;
188 
189       //      1       2         3    4          5   6                  7   8
190       // "\\s*(-?)\\s*(\\d+):\\s(-?)*(\\d+):\\s*(-?)(\\d+)(?:[,\\.]\\s*(-?)(\\d+))?"
191 
192       mtx::string::parse_number(to_utf8(matches.captured( 2)), s_h);
193       mtx::string::parse_number(to_utf8(matches.captured( 4)), s_min);
194       mtx::string::parse_number(to_utf8(matches.captured( 6)), s_sec);
195       mtx::string::parse_number(to_utf8(matches.captured(10)), e_h);
196       mtx::string::parse_number(to_utf8(matches.captured(12)), e_min);
197       mtx::string::parse_number(to_utf8(matches.captured(14)), e_sec);
198 
199       std::string s_rest = to_utf8(matches.captured( 8));
200       std::string e_rest = to_utf8(matches.captured(16));
201 
202       auto neg_calculator = [&matches](auto start_idx) -> auto {
203         int64_t neg = 1;
204         for (auto idx = 0; idx < 4; ++idx)
205           if (matches.captured(start_idx + (idx * 2)) == Q("-"))
206             neg *= -1;
207         return neg;
208       };
209 
210       int64_t s_neg = neg_calculator(1);
211       int64_t e_neg = neg_calculator(9);
212 
213       if (Q(s).contains(coordinates_re) && !m_coordinates_warning_shown) {
214         mxwarn_tid(m_file_name, m_track_id,
215                    Y("This file contains coordinates in the timestamp lines. "
216                      "Such coordinates are not supported by the Matroska SRT subtitle format. "
217                      "The coordinates will be removed automatically.\n"));
218         m_coordinates_warning_shown = true;
219       }
220 
221       // The previous entry is done now. Append it to the list of subtitles.
222       if (!subtitles.empty()) {
223         mtx::string::strip_back(subtitles, true);
224         add(start, end, timestamp_number, subtitles.c_str());
225       }
226 
227       while (s_rest.length() < 9)
228         s_rest += "0";
229       if (s_rest.length() > 9)
230         s_rest.erase(9);
231 
232       while (e_rest.length() < 9)
233         e_rest += "0";
234       if (e_rest.length() > 9)
235         e_rest.erase(9);
236 
237       mtx::string::parse_number(s_rest, s_ns);
238       mtx::string::parse_number(e_rest, e_ns);
239 
240       // Calculate the start and end time in ns precision for the following entry.
241       start  = ((s_h * 60 * 60 + s_min * 60 + s_sec) * 1'000'000'000ll + s_ns) * s_neg;
242       end    = ((e_h * 60 * 60 + e_min * 60 + e_sec) * 1'000'000'000ll + e_ns) * e_neg;
243 
244       if (0 > start) {
245         mxwarn_tid(m_file_name, m_track_id,
246                    fmt::format(Y("Line {0}: Negative timestamp encountered. The entry will be adjusted to start from 00:00:00.000.\n"), line_number));
247         end   -= start;
248         start  = 0;
249         if (0 > end)
250           end *= -1;
251       }
252 
253       // There are files for which start timestamps overlap. Matroska requires
254       // blocks to be sorted by their timestamp. mkvmerge does this at the end
255       // of this function, but warn the user that the original order is being
256       // changed.
257       if (!timestamp_warning_printed && (start < previous_start)) {
258         mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Warning in line {0}: The start timestamp is smaller than that of the previous entry. "
259                                                      "All entries from this file will be sorted by their start time.\n"), line_number));
260         timestamp_warning_printed = true;
261       }
262 
263       previous_start   = start;
264       subtitles        = "";
265       state            = STATE_SUBS;
266       timestamp_number = subtitle_number;
267 
268     } else if (STATE_SUBS == state) {
269       if (!subtitles.empty())
270         subtitles += "\n";
271       subtitles += s;
272 
273     } else if (Q(s).contains(number_re)) {
274       state = STATE_TIME;
275       mtx::string::parse_number(s, subtitle_number);
276 
277     } else {
278       if (!subtitles.empty())
279         subtitles += "\n";
280       subtitles += s;
281     }
282   }
283 
284   if (!subtitles.empty()) {
285     mtx::string::strip_back(subtitles, true);
286     add(start, end, timestamp_number, subtitles.c_str());
287   }
288 
289   sort();
290 }
291 
292 // ------------------------------------------------------------
293 
294 bool
probe(mm_text_io_c & io)295 ssa_parser_c::probe(mm_text_io_c &io) {
296   QRegularExpression script_info_re{"^\\s*\\[script\\s+info\\]",   QRegularExpression::CaseInsensitiveOption};
297   QRegularExpression styles_re{     "^\\s*\\[V4\\+?\\s+Styles\\]", QRegularExpression::CaseInsensitiveOption};
298   QRegularExpression comment_re{    "^\\s*$|^\\s*[!;]",            QRegularExpression::CaseInsensitiveOption};
299 
300   try {
301     int line_number = 0;
302     io.setFilePointer(0);
303 
304     std::string line;
305     while (io.getline2(line, 1000)) {
306       ++line_number;
307 
308       // Read at most 100 lines.
309       if (100 < line_number)
310         return false;
311 
312       auto qline = Q(line);
313       // Skip comments and empty lines.
314       if (qline.contains(comment_re))
315         continue;
316 
317       // This is the line mkvmerge is looking for: positive match.
318       if (qline.contains(script_info_re) || qline.contains(styles_re))
319         return true;
320 
321       // Neither a wanted line nor an empty one/a comment: negative result.
322       return false;
323     }
324   } catch (...) {
325   }
326 
327   return false;
328 }
329 
ssa_parser_c(generic_reader_c & reader,mm_text_io_cptr const & io,const std::string & file_name,int64_t track_id)330 ssa_parser_c::ssa_parser_c(generic_reader_c &reader,
331                            mm_text_io_cptr const &io,
332                            const std::string &file_name,
333                            int64_t track_id)
334   : subtitles_c{file_name, track_id}
335   , m_reader(reader)
336   , m_io(io)
337   , m_is_ass(false)
338   , m_attachment_id(0)
339 {
340 }
341 
342 void
parse()343 ssa_parser_c::parse() {
344   QRegularExpression sec_styles_ass_re{"^\\s*\\[V4\\+\\s+Styles\\]", QRegularExpression::CaseInsensitiveOption};
345   QRegularExpression sec_styles_re{    "^\\s*\\[V4\\s+Styles\\]",    QRegularExpression::CaseInsensitiveOption};
346   QRegularExpression sec_info_re{      "^\\s*\\[Script\\s+Info\\]",  QRegularExpression::CaseInsensitiveOption};
347   QRegularExpression sec_events_re{    "^\\s*\\[Events\\]",          QRegularExpression::CaseInsensitiveOption};
348   QRegularExpression sec_graphics_re{  "^\\s*\\[Graphics\\]",        QRegularExpression::CaseInsensitiveOption};
349   QRegularExpression sec_fonts_re{     "^\\s*\\[Fonts\\]",           QRegularExpression::CaseInsensitiveOption};
350 
351   int num                        = 0;
352   ssa_section_e section          = SSA_SECTION_NONE;
353   ssa_section_e previous_section = SSA_SECTION_NONE;
354   std::string name_field         = "Name";
355 
356   std::string attachment_name, attachment_data_uu;
357 
358   m_io->setFilePointer(0);
359 
360   while (!m_io->eof()) {
361     std::string line;
362     if (!m_io->getline2(line))
363       break;
364 
365     line               = recode(line);
366     auto qline         = Q(line);
367     bool add_to_global = true;
368 
369     // A normal line. Let's see if this file is ASS and not SSA.
370     if (!strcasecmp(line.c_str(), "ScriptType: v4.00+"))
371       m_is_ass = true;
372 
373     else if (qline.contains(sec_styles_ass_re)) {
374       m_is_ass = true;
375       section  = SSA_SECTION_V4STYLES;
376 
377     } else if (qline.contains(sec_styles_re))
378       section = SSA_SECTION_V4STYLES;
379 
380     else if (qline.contains(sec_info_re))
381       section = SSA_SECTION_INFO;
382 
383     else if (qline.contains(sec_events_re))
384       section = SSA_SECTION_EVENTS;
385 
386     else if (qline.contains(sec_graphics_re)) {
387       section       = SSA_SECTION_GRAPHICS;
388       add_to_global = false;
389 
390     } else if (qline.contains(sec_fonts_re)) {
391       section       = SSA_SECTION_FONTS;
392       add_to_global = false;
393 
394     } else if (SSA_SECTION_EVENTS == section) {
395       if (balg::istarts_with(line, "Format: ")) {
396         // Analyze the format string.
397         m_format = mtx::string::split(&line.c_str()[strlen("Format: ")]);
398         mtx::string::strip(m_format);
399 
400         // Let's see if "Actor" is used in the format instead of "Name".
401         size_t i;
402         for (i = 0; m_format.size() > i; ++i)
403           if (balg::iequals(m_format[i], "actor")) {
404             name_field = "Actor";
405             break;
406           }
407 
408       } else if (balg::istarts_with(line, "Dialogue: ")) {
409         if (m_format.empty())
410           throw mtx::input::extended_x(Y("ssa_reader: Invalid format. Could not find the \"Format\" line in the \"[Events]\" section."));
411 
412         std::string orig_line = line;
413 
414         line.erase(0, strlen("Dialogue: ")); // Trim the start.
415 
416         // Split the line into fields.
417         std::vector<std::string> fields = mtx::string::split(line.c_str(), ",", m_format.size());
418         while (fields.size() < m_format.size())
419           fields.push_back(""s);
420 
421         // Parse the start time.
422         auto stime = get_element("Start", fields);
423         auto start = parse_time(stime);
424         stime      = get_element("End", fields);
425         auto end   = parse_time(stime);
426 
427         if (   (0     > start)
428             || (0     > end)
429             || (start > end)) {
430           mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("SSA/ASS: The following line will be skipped as one of the timestamps is less than 0, or the end timestamp is less than the start timestamp: {0}\n"), orig_line));
431           continue;
432         }
433 
434         // Specs say that the following fields are to put into the block:
435         // ReadOrder, Layer, Style, Name, MarginL, MarginR, MarginV, Effect,
436         //   Text
437 
438         std::string comma = ",";
439         line
440           = fmt::to_string(num)                     + comma
441           + get_element("Layer", fields)            + comma
442           + get_element("Style", fields)            + comma
443           + get_element(name_field.c_str(), fields) + comma
444           + get_element("MarginL", fields)          + comma
445           + get_element("MarginR", fields)          + comma
446           + get_element("MarginV", fields)          + comma
447           + get_element("Effect", fields)           + comma
448           + get_element("Text", fields);
449 
450         add(start, end, num, line);
451         num++;
452 
453         add_to_global = false;
454       }
455 
456     } else if ((SSA_SECTION_FONTS == section) || (SSA_SECTION_GRAPHICS == section)) {
457       if (balg::istarts_with(line, "fontname:")) {
458         add_attachment_maybe(attachment_name, attachment_data_uu, section);
459 
460         line.erase(0, strlen("fontname:"));
461         mtx::string::strip(line, true);
462         attachment_name = line;
463 
464       } else {
465         mtx::string::strip(line, true);
466         attachment_data_uu += line;
467       }
468 
469       add_to_global = false;
470     }
471 
472     if (add_to_global) {
473       m_global += line;
474       m_global += "\r\n";
475     }
476 
477     if (previous_section != section)
478       add_attachment_maybe(attachment_name, attachment_data_uu, previous_section);
479 
480     previous_section = section;
481   }
482 
483   sort();
484 }
485 
486 std::string
get_element(const char * index,std::vector<std::string> & fields)487 ssa_parser_c::get_element(const char *index,
488                           std::vector<std::string> &fields) {
489   size_t i;
490 
491   for (i = 0; i < m_format.size(); i++)
492     if (m_format[i] == index)
493       return fields[i];
494 
495   return ""s;
496 }
497 
498 int64_t
parse_time(std::string & stime)499 ssa_parser_c::parse_time(std::string &stime) {
500   int64_t th, tm, ts, tds;
501 
502   int pos = stime.find(':');
503   if (0 > pos)
504     return -1;
505 
506   std::string s = stime.substr(0, pos);
507   if (!mtx::string::parse_number(s, th))
508     return -1;
509   stime.erase(0, pos + 1);
510 
511   pos = stime.find(':');
512   if (0 > pos)
513     return -1;
514 
515   s = stime.substr(0, pos);
516   if (!mtx::string::parse_number(s, tm))
517     return -1;
518   stime.erase(0, pos + 1);
519 
520   pos = stime.find('.');
521   if (0 > pos)
522     return -1;
523 
524   s = stime.substr(0, pos);
525   if (!mtx::string::parse_number(s, ts))
526     return -1;
527   stime.erase(0, pos + 1);
528 
529   if (!mtx::string::parse_number(stime, tds))
530     return -1;
531 
532   return (tds * 10 + ts * 1000 + tm * 60 * 1000 + th * 60 * 60 * 1000) * 1000000;
533 }
534 
535 void
add_attachment_maybe(std::string & name,std::string & data_uu,ssa_section_e section)536 ssa_parser_c::add_attachment_maybe(std::string &name,
537                                    std::string &data_uu,
538                                    ssa_section_e section) {
539   if (name.empty() || data_uu.empty() || ((SSA_SECTION_FONTS != section) && (SSA_SECTION_GRAPHICS != section))) {
540     name    = "";
541     data_uu = "";
542     return;
543   }
544 
545   ++m_attachment_id;
546 
547   if (!m_reader.attachment_requested(m_attachment_id)) {
548     name    = "";
549     data_uu = "";
550     return;
551   }
552 
553   auto attachment_p = std::make_shared<attachment_t>();
554   auto &attachment  = *attachment_p;
555 
556   std::string short_name = m_file_name;
557   size_t pos             = short_name.rfind('/');
558 
559   if (std::string::npos != pos)
560     short_name.erase(0, pos + 1);
561   pos = short_name.rfind('\\');
562   if (std::string::npos != pos)
563     short_name.erase(0, pos + 1);
564 
565   attachment.ui_id        = m_attachment_id;
566   attachment.name         = name;
567   attachment.description  = fmt::format(SSA_SECTION_FONTS == section ? Y("Imported font from {0}") : Y("Imported picture from {0}"), short_name);
568   attachment.to_all_files = true;
569   attachment.source_file  = m_file_name;
570 
571   size_t data_size        = data_uu.length() % 4;
572   data_size               = 3 == data_size ? 2 : 2 == data_size ? 1 : 0;
573   data_size              += data_uu.length() / 4 * 3;
574   attachment.data         = memory_c::alloc(data_size);
575   auto out                = attachment.data->get_buffer();
576   auto in                 = reinterpret_cast<unsigned char const *>(data_uu.c_str());
577 
578   for (auto end = in + (data_uu.length() / 4) * 4; in < end; in += 4, out += 3)
579     decode_chars(in, out, 4);
580 
581   decode_chars(in, out, data_uu.length() % 4);
582 
583   attachment.mime_type = mtx::mime::guess_type_for_data(*attachment.data);
584 
585   add_attachment(attachment_p);
586 
587   name    = "";
588   data_uu = "";
589 }
590 
591 void
decode_chars(unsigned char const * in,unsigned char * out,size_t bytes_in)592 ssa_parser_c::decode_chars(unsigned char const *in,
593                            unsigned char *out,
594                            size_t bytes_in) {
595   if (!bytes_in)
596     return;
597 
598   size_t bytes_out = 4 == bytes_in ? 3 : 3 == bytes_in ? 2 : 1;
599   uint32_t value   = 0;
600 
601   for (int idx = 0; idx < static_cast<int>(bytes_in); ++idx)
602     value |= (static_cast<uint32_t>(in[idx]) - 33) << (6 * (3 - idx));
603 
604   for (int idx = 0; idx < static_cast<int>(bytes_out); ++idx)
605     out[idx] = (value >> ((2 - idx) * 8)) & 0xff;
606 }
607