1 /*
2 mkvmerge -- utility for splicing together matroska files
3 from component media subtypes
4
5 Distributed under the GPL v2
6 see the file COPYING for details
7 or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8
9 subtitle helper
10
11 Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13
14 #include "common/common_pch.h"
15
16 #include <QRegularExpression>
17
18 #include "common/endian.h"
19 #include "common/mime.h"
20 #include "common/mm_proxy_io.h"
21 #include "common/mm_text_io.h"
22 #include "common/qt.h"
23 #include "common/strings/formatting.h"
24 #include "common/strings/parsing.h"
25 #include "common/strings/utf8.h"
26 #include "input/subtitles.h"
27 #include "merge/file_status.h"
28 #include "merge/input_x.h"
29 #include "merge/generic_reader.h"
30 #include "merge/packet_extensions.h"
31
32 // ------------------------------------------------------------
33
subtitles_c(std::string const & file_name,int64_t track_id)34 subtitles_c::subtitles_c(std::string const &file_name,
35 int64_t track_id)
36 : current{entries.end()}
37 , m_cc_utf8{charset_converter_c::init("UTF-8")}
38 , m_invalid_utf8_warned{g_identifying}
39 , m_file_name{file_name}
40 , m_track_id{track_id}
41 {
42 }
43
44 void
set_charset_converter(charset_converter_cptr const & cc_utf8)45 subtitles_c::set_charset_converter(charset_converter_cptr const &cc_utf8) {
46 if (cc_utf8)
47 m_cc_utf8 = cc_utf8;
48
49 else {
50 m_cc_utf8 = g_cc_local_utf8;
51 m_try_utf8 = true;
52 }
53 }
54
55 std::string
recode(std::string const & s,uint32_t replacement_marker)56 subtitles_c::recode(std::string const &s,
57 uint32_t replacement_marker) {
58 if (m_try_utf8 && !mtx::utf8::is_valid(s))
59 m_try_utf8 = false;
60
61 auto recoded = m_try_utf8 ? s : m_cc_utf8->utf8(s);
62
63 if (mtx::utf8::is_valid(recoded))
64 return recoded;
65
66 if (!m_invalid_utf8_warned) {
67 m_invalid_utf8_warned = true;
68 mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("This text subtitle track contains invalid 8-bit characters outside valid multi-byte UTF-8 sequences. Please specify the correct encoding for this track.\n")));
69 }
70
71 return mtx::utf8::fix_invalid(recoded, replacement_marker);
72 }
73
74 void
process(generic_packetizer_c * p)75 subtitles_c::process(generic_packetizer_c *p) {
76 if (empty() || (entries.end() == current))
77 return;
78
79 packet_cptr packet(new packet_t(memory_c::borrow(current->subs), current->start, current->end - current->start));
80 packet->extensions.push_back(packet_extension_cptr(new subtitle_number_packet_extension_c(current->number)));
81 p->process(packet);
82 ++current;
83 }
84
85 // ------------------------------------------------------------
86
// Building blocks of the regular expressions used for SRT files. They are
// string literals so that they can be concatenated at compile time.

// One numeric component: an optional minus sign (capture group) followed
// by digits (capture group); both may be preceded by white space.
#define SRT_RE_VALUE          "\\s*(-?)\\s*(\\d+)"

// Three components separated by colons plus an optional fourth one
// introduced by ',', '.' or ':'. Contributes eight capture groups.
#define SRT_RE_TIMESTAMP         \
  SRT_RE_VALUE ":"               \
  SRT_RE_VALUE ":"               \
  SRT_RE_VALUE                   \
  "(?:[,\\.:]" SRT_RE_VALUE ")?"

// Two timestamps at the start of the line separated by an arrow made of
// dashes/white space followed by '>'. Contributes sixteen capture groups.
#define SRT_RE_TIMESTAMP_LINE    \
  "^" SRT_RE_TIMESTAMP           \
  "\\s*[\\-\\s]+>\\s*"           \
  SRT_RE_TIMESTAMP "\\s*"

// Four "X<digits>:<digits>" / "Y<digits>:<digits>" items at the end of a line.
#define SRT_RE_COORDINATES    "([XY]\\d+:\\d+\\s*){4}\\s*$"
91
92 bool
probe(mm_text_io_c & io)93 srt_parser_c::probe(mm_text_io_c &io) {
94 try {
95 io.setFilePointer(0);
96 std::string s;
97 do {
98 s = io.getline(10);
99 mtx::string::strip(s);
100 } while (s.empty());
101
102 int64_t dummy;
103 if (!mtx::string::parse_number(s, dummy))
104 return false;
105
106 s = io.getline(100);
107 QRegularExpression timestamp_re{SRT_RE_TIMESTAMP_LINE};
108 if (!Q(s).contains(timestamp_re))
109 return false;
110
111 s = io.getline();
112 io.setFilePointer(0);
113
114 } catch (...) {
115 return false;
116 }
117
118 return true;
119 }
120
srt_parser_c(mm_text_io_cptr const & io,const std::string & file_name,int64_t track_id)121 srt_parser_c::srt_parser_c(mm_text_io_cptr const &io,
122 const std::string &file_name,
123 int64_t track_id)
124 : subtitles_c{file_name, track_id}
125 , m_io(io)
126 , m_coordinates_warning_shown(false)
127 {
128 }
129
130 void
parse()131 srt_parser_c::parse() {
132 QRegularExpression timestamp_re{SRT_RE_TIMESTAMP_LINE};
133 QRegularExpression number_re{"^\\d+$"};
134 QRegularExpression coordinates_re{SRT_RE_COORDINATES};
135
136 int64_t start = 0;
137 int64_t end = 0;
138 int64_t previous_start = 0;
139 bool timestamp_warning_printed = false;
140 parser_state_e state = STATE_INITIAL;
141 int line_number = 0;
142 unsigned int subtitle_number = 0;
143 unsigned int timestamp_number = 0;
144 std::string subtitles;
145
146 m_io->setFilePointer(0);
147
148 while (1) {
149 std::string s;
150 if (!m_io->getline2(s))
151 break;
152
153 s = recode(s);
154
155 line_number++;
156 mtx::string::strip_back(s);
157
158 mxdebug_if(m_debug, fmt::format("line {0} state {1} content »{2}«\n", line_number, state == STATE_INITIAL ? "initial" : state == STATE_TIME ? "time" : state == STATE_SUBS ? "subs" : "subs-or-number", s));
159
160 if (s.empty()) {
161 if ((STATE_INITIAL == state) || (STATE_TIME == state))
162 continue;
163
164 state = STATE_SUBS_OR_NUMBER;
165
166 if (!subtitles.empty())
167 subtitles += "\n";
168
169 continue;
170 }
171
172 if (STATE_INITIAL == state) {
173 if (!Q(s).contains(number_re)) {
174 mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Error in line {0}: expected subtitle number and found some text.\n"), line_number));
175 break;
176 }
177 state = STATE_TIME;
178 mtx::string::parse_number(s, subtitle_number);
179
180 } else if (STATE_TIME == state) {
181 auto matches = timestamp_re.match(Q(s));
182 if (!matches.hasMatch()) {
183 mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Error in line {0}: expected a SRT timestamp line but found something else. Aborting this file.\n"), line_number));
184 break;
185 }
186
187 int64_t s_h = 0, s_min = 0, s_sec = 0, s_ns = 0, e_h = 0, e_min = 0, e_sec = 0, e_ns = 0;
188
189 // 1 2 3 4 5 6 7 8
190 // "\\s*(-?)\\s*(\\d+):\\s(-?)*(\\d+):\\s*(-?)(\\d+)(?:[,\\.]\\s*(-?)(\\d+))?"
191
192 mtx::string::parse_number(to_utf8(matches.captured( 2)), s_h);
193 mtx::string::parse_number(to_utf8(matches.captured( 4)), s_min);
194 mtx::string::parse_number(to_utf8(matches.captured( 6)), s_sec);
195 mtx::string::parse_number(to_utf8(matches.captured(10)), e_h);
196 mtx::string::parse_number(to_utf8(matches.captured(12)), e_min);
197 mtx::string::parse_number(to_utf8(matches.captured(14)), e_sec);
198
199 std::string s_rest = to_utf8(matches.captured( 8));
200 std::string e_rest = to_utf8(matches.captured(16));
201
202 auto neg_calculator = [&matches](auto start_idx) -> auto {
203 int64_t neg = 1;
204 for (auto idx = 0; idx < 4; ++idx)
205 if (matches.captured(start_idx + (idx * 2)) == Q("-"))
206 neg *= -1;
207 return neg;
208 };
209
210 int64_t s_neg = neg_calculator(1);
211 int64_t e_neg = neg_calculator(9);
212
213 if (Q(s).contains(coordinates_re) && !m_coordinates_warning_shown) {
214 mxwarn_tid(m_file_name, m_track_id,
215 Y("This file contains coordinates in the timestamp lines. "
216 "Such coordinates are not supported by the Matroska SRT subtitle format. "
217 "The coordinates will be removed automatically.\n"));
218 m_coordinates_warning_shown = true;
219 }
220
221 // The previous entry is done now. Append it to the list of subtitles.
222 if (!subtitles.empty()) {
223 mtx::string::strip_back(subtitles, true);
224 add(start, end, timestamp_number, subtitles.c_str());
225 }
226
227 while (s_rest.length() < 9)
228 s_rest += "0";
229 if (s_rest.length() > 9)
230 s_rest.erase(9);
231
232 while (e_rest.length() < 9)
233 e_rest += "0";
234 if (e_rest.length() > 9)
235 e_rest.erase(9);
236
237 mtx::string::parse_number(s_rest, s_ns);
238 mtx::string::parse_number(e_rest, e_ns);
239
240 // Calculate the start and end time in ns precision for the following entry.
241 start = ((s_h * 60 * 60 + s_min * 60 + s_sec) * 1'000'000'000ll + s_ns) * s_neg;
242 end = ((e_h * 60 * 60 + e_min * 60 + e_sec) * 1'000'000'000ll + e_ns) * e_neg;
243
244 if (0 > start) {
245 mxwarn_tid(m_file_name, m_track_id,
246 fmt::format(Y("Line {0}: Negative timestamp encountered. The entry will be adjusted to start from 00:00:00.000.\n"), line_number));
247 end -= start;
248 start = 0;
249 if (0 > end)
250 end *= -1;
251 }
252
253 // There are files for which start timestamps overlap. Matroska requires
254 // blocks to be sorted by their timestamp. mkvmerge does this at the end
255 // of this function, but warn the user that the original order is being
256 // changed.
257 if (!timestamp_warning_printed && (start < previous_start)) {
258 mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("Warning in line {0}: The start timestamp is smaller than that of the previous entry. "
259 "All entries from this file will be sorted by their start time.\n"), line_number));
260 timestamp_warning_printed = true;
261 }
262
263 previous_start = start;
264 subtitles = "";
265 state = STATE_SUBS;
266 timestamp_number = subtitle_number;
267
268 } else if (STATE_SUBS == state) {
269 if (!subtitles.empty())
270 subtitles += "\n";
271 subtitles += s;
272
273 } else if (Q(s).contains(number_re)) {
274 state = STATE_TIME;
275 mtx::string::parse_number(s, subtitle_number);
276
277 } else {
278 if (!subtitles.empty())
279 subtitles += "\n";
280 subtitles += s;
281 }
282 }
283
284 if (!subtitles.empty()) {
285 mtx::string::strip_back(subtitles, true);
286 add(start, end, timestamp_number, subtitles.c_str());
287 }
288
289 sort();
290 }
291
292 // ------------------------------------------------------------
293
294 bool
probe(mm_text_io_c & io)295 ssa_parser_c::probe(mm_text_io_c &io) {
296 QRegularExpression script_info_re{"^\\s*\\[script\\s+info\\]", QRegularExpression::CaseInsensitiveOption};
297 QRegularExpression styles_re{ "^\\s*\\[V4\\+?\\s+Styles\\]", QRegularExpression::CaseInsensitiveOption};
298 QRegularExpression comment_re{ "^\\s*$|^\\s*[!;]", QRegularExpression::CaseInsensitiveOption};
299
300 try {
301 int line_number = 0;
302 io.setFilePointer(0);
303
304 std::string line;
305 while (io.getline2(line, 1000)) {
306 ++line_number;
307
308 // Read at most 100 lines.
309 if (100 < line_number)
310 return false;
311
312 auto qline = Q(line);
313 // Skip comments and empty lines.
314 if (qline.contains(comment_re))
315 continue;
316
317 // This is the line mkvmerge is looking for: positive match.
318 if (qline.contains(script_info_re) || qline.contains(styles_re))
319 return true;
320
321 // Neither a wanted line nor an empty one/a comment: negative result.
322 return false;
323 }
324 } catch (...) {
325 }
326
327 return false;
328 }
329
ssa_parser_c(generic_reader_c & reader,mm_text_io_cptr const & io,const std::string & file_name,int64_t track_id)330 ssa_parser_c::ssa_parser_c(generic_reader_c &reader,
331 mm_text_io_cptr const &io,
332 const std::string &file_name,
333 int64_t track_id)
334 : subtitles_c{file_name, track_id}
335 , m_reader(reader)
336 , m_io(io)
337 , m_is_ass(false)
338 , m_attachment_id(0)
339 {
340 }
341
342 void
parse()343 ssa_parser_c::parse() {
344 QRegularExpression sec_styles_ass_re{"^\\s*\\[V4\\+\\s+Styles\\]", QRegularExpression::CaseInsensitiveOption};
345 QRegularExpression sec_styles_re{ "^\\s*\\[V4\\s+Styles\\]", QRegularExpression::CaseInsensitiveOption};
346 QRegularExpression sec_info_re{ "^\\s*\\[Script\\s+Info\\]", QRegularExpression::CaseInsensitiveOption};
347 QRegularExpression sec_events_re{ "^\\s*\\[Events\\]", QRegularExpression::CaseInsensitiveOption};
348 QRegularExpression sec_graphics_re{ "^\\s*\\[Graphics\\]", QRegularExpression::CaseInsensitiveOption};
349 QRegularExpression sec_fonts_re{ "^\\s*\\[Fonts\\]", QRegularExpression::CaseInsensitiveOption};
350
351 int num = 0;
352 ssa_section_e section = SSA_SECTION_NONE;
353 ssa_section_e previous_section = SSA_SECTION_NONE;
354 std::string name_field = "Name";
355
356 std::string attachment_name, attachment_data_uu;
357
358 m_io->setFilePointer(0);
359
360 while (!m_io->eof()) {
361 std::string line;
362 if (!m_io->getline2(line))
363 break;
364
365 line = recode(line);
366 auto qline = Q(line);
367 bool add_to_global = true;
368
369 // A normal line. Let's see if this file is ASS and not SSA.
370 if (!strcasecmp(line.c_str(), "ScriptType: v4.00+"))
371 m_is_ass = true;
372
373 else if (qline.contains(sec_styles_ass_re)) {
374 m_is_ass = true;
375 section = SSA_SECTION_V4STYLES;
376
377 } else if (qline.contains(sec_styles_re))
378 section = SSA_SECTION_V4STYLES;
379
380 else if (qline.contains(sec_info_re))
381 section = SSA_SECTION_INFO;
382
383 else if (qline.contains(sec_events_re))
384 section = SSA_SECTION_EVENTS;
385
386 else if (qline.contains(sec_graphics_re)) {
387 section = SSA_SECTION_GRAPHICS;
388 add_to_global = false;
389
390 } else if (qline.contains(sec_fonts_re)) {
391 section = SSA_SECTION_FONTS;
392 add_to_global = false;
393
394 } else if (SSA_SECTION_EVENTS == section) {
395 if (balg::istarts_with(line, "Format: ")) {
396 // Analyze the format string.
397 m_format = mtx::string::split(&line.c_str()[strlen("Format: ")]);
398 mtx::string::strip(m_format);
399
400 // Let's see if "Actor" is used in the format instead of "Name".
401 size_t i;
402 for (i = 0; m_format.size() > i; ++i)
403 if (balg::iequals(m_format[i], "actor")) {
404 name_field = "Actor";
405 break;
406 }
407
408 } else if (balg::istarts_with(line, "Dialogue: ")) {
409 if (m_format.empty())
410 throw mtx::input::extended_x(Y("ssa_reader: Invalid format. Could not find the \"Format\" line in the \"[Events]\" section."));
411
412 std::string orig_line = line;
413
414 line.erase(0, strlen("Dialogue: ")); // Trim the start.
415
416 // Split the line into fields.
417 std::vector<std::string> fields = mtx::string::split(line.c_str(), ",", m_format.size());
418 while (fields.size() < m_format.size())
419 fields.push_back(""s);
420
421 // Parse the start time.
422 auto stime = get_element("Start", fields);
423 auto start = parse_time(stime);
424 stime = get_element("End", fields);
425 auto end = parse_time(stime);
426
427 if ( (0 > start)
428 || (0 > end)
429 || (start > end)) {
430 mxwarn_tid(m_file_name, m_track_id, fmt::format(Y("SSA/ASS: The following line will be skipped as one of the timestamps is less than 0, or the end timestamp is less than the start timestamp: {0}\n"), orig_line));
431 continue;
432 }
433
434 // Specs say that the following fields are to put into the block:
435 // ReadOrder, Layer, Style, Name, MarginL, MarginR, MarginV, Effect,
436 // Text
437
438 std::string comma = ",";
439 line
440 = fmt::to_string(num) + comma
441 + get_element("Layer", fields) + comma
442 + get_element("Style", fields) + comma
443 + get_element(name_field.c_str(), fields) + comma
444 + get_element("MarginL", fields) + comma
445 + get_element("MarginR", fields) + comma
446 + get_element("MarginV", fields) + comma
447 + get_element("Effect", fields) + comma
448 + get_element("Text", fields);
449
450 add(start, end, num, line);
451 num++;
452
453 add_to_global = false;
454 }
455
456 } else if ((SSA_SECTION_FONTS == section) || (SSA_SECTION_GRAPHICS == section)) {
457 if (balg::istarts_with(line, "fontname:")) {
458 add_attachment_maybe(attachment_name, attachment_data_uu, section);
459
460 line.erase(0, strlen("fontname:"));
461 mtx::string::strip(line, true);
462 attachment_name = line;
463
464 } else {
465 mtx::string::strip(line, true);
466 attachment_data_uu += line;
467 }
468
469 add_to_global = false;
470 }
471
472 if (add_to_global) {
473 m_global += line;
474 m_global += "\r\n";
475 }
476
477 if (previous_section != section)
478 add_attachment_maybe(attachment_name, attachment_data_uu, previous_section);
479
480 previous_section = section;
481 }
482
483 sort();
484 }
485
486 std::string
get_element(const char * index,std::vector<std::string> & fields)487 ssa_parser_c::get_element(const char *index,
488 std::vector<std::string> &fields) {
489 size_t i;
490
491 for (i = 0; i < m_format.size(); i++)
492 if (m_format[i] == index)
493 return fields[i];
494
495 return ""s;
496 }
497
498 int64_t
parse_time(std::string & stime)499 ssa_parser_c::parse_time(std::string &stime) {
500 int64_t th, tm, ts, tds;
501
502 int pos = stime.find(':');
503 if (0 > pos)
504 return -1;
505
506 std::string s = stime.substr(0, pos);
507 if (!mtx::string::parse_number(s, th))
508 return -1;
509 stime.erase(0, pos + 1);
510
511 pos = stime.find(':');
512 if (0 > pos)
513 return -1;
514
515 s = stime.substr(0, pos);
516 if (!mtx::string::parse_number(s, tm))
517 return -1;
518 stime.erase(0, pos + 1);
519
520 pos = stime.find('.');
521 if (0 > pos)
522 return -1;
523
524 s = stime.substr(0, pos);
525 if (!mtx::string::parse_number(s, ts))
526 return -1;
527 stime.erase(0, pos + 1);
528
529 if (!mtx::string::parse_number(stime, tds))
530 return -1;
531
532 return (tds * 10 + ts * 1000 + tm * 60 * 1000 + th * 60 * 60 * 1000) * 1000000;
533 }
534
535 void
add_attachment_maybe(std::string & name,std::string & data_uu,ssa_section_e section)536 ssa_parser_c::add_attachment_maybe(std::string &name,
537 std::string &data_uu,
538 ssa_section_e section) {
539 if (name.empty() || data_uu.empty() || ((SSA_SECTION_FONTS != section) && (SSA_SECTION_GRAPHICS != section))) {
540 name = "";
541 data_uu = "";
542 return;
543 }
544
545 ++m_attachment_id;
546
547 if (!m_reader.attachment_requested(m_attachment_id)) {
548 name = "";
549 data_uu = "";
550 return;
551 }
552
553 auto attachment_p = std::make_shared<attachment_t>();
554 auto &attachment = *attachment_p;
555
556 std::string short_name = m_file_name;
557 size_t pos = short_name.rfind('/');
558
559 if (std::string::npos != pos)
560 short_name.erase(0, pos + 1);
561 pos = short_name.rfind('\\');
562 if (std::string::npos != pos)
563 short_name.erase(0, pos + 1);
564
565 attachment.ui_id = m_attachment_id;
566 attachment.name = name;
567 attachment.description = fmt::format(SSA_SECTION_FONTS == section ? Y("Imported font from {0}") : Y("Imported picture from {0}"), short_name);
568 attachment.to_all_files = true;
569 attachment.source_file = m_file_name;
570
571 size_t data_size = data_uu.length() % 4;
572 data_size = 3 == data_size ? 2 : 2 == data_size ? 1 : 0;
573 data_size += data_uu.length() / 4 * 3;
574 attachment.data = memory_c::alloc(data_size);
575 auto out = attachment.data->get_buffer();
576 auto in = reinterpret_cast<unsigned char const *>(data_uu.c_str());
577
578 for (auto end = in + (data_uu.length() / 4) * 4; in < end; in += 4, out += 3)
579 decode_chars(in, out, 4);
580
581 decode_chars(in, out, data_uu.length() % 4);
582
583 attachment.mime_type = mtx::mime::guess_type_for_data(*attachment.data);
584
585 add_attachment(attachment_p);
586
587 name = "";
588 data_uu = "";
589 }
590
591 void
decode_chars(unsigned char const * in,unsigned char * out,size_t bytes_in)592 ssa_parser_c::decode_chars(unsigned char const *in,
593 unsigned char *out,
594 size_t bytes_in) {
595 if (!bytes_in)
596 return;
597
598 size_t bytes_out = 4 == bytes_in ? 3 : 3 == bytes_in ? 2 : 1;
599 uint32_t value = 0;
600
601 for (int idx = 0; idx < static_cast<int>(bytes_in); ++idx)
602 value |= (static_cast<uint32_t>(in[idx]) - 33) << (6 * (3 - idx));
603
604 for (int idx = 0; idx < static_cast<int>(bytes_out); ++idx)
605 out[idx] = (value >> ((2 - idx) * 8)) & 0xff;
606 }
607