1 /*
2    mkvmerge -- utility for splicing together matroska files
3    from component media subtypes
4 
5    Distributed under the GPL v2
6    see the file COPYING for details
7    or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8 
9    helper function for WebVTT data
10 
11    Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13 
14 #include "common/common_pch.h"
15 
16 #include "common/qt.h"
17 #include "common/strings/editing.h"
18 #include "common/strings/formatting.h"
19 #include "common/strings/parsing.h"
20 #include "common/webvtt.h"
21 
22 namespace mtx::webvtt {
23 
// Regular expression fragment with a single capture group matching a timestamp
// of the shape "[H…:]MM:SS.mmm": an optional hours field of one or more digits,
// two-digit minutes and seconds and exactly three fractional digits. Only the
// shape is checked, not the value ranges of the individual fields.
constexpr char const *RE_TIMESTAMP = "((?:\\d+:)?\\d{2}:\\d{2}\\.\\d{3})";
25 
26 struct parser_c::impl_t {
27 public:
28   std::vector<std::string> current_block, global_blocks, local_blocks;
29   bool parsing_global_data{true};
30   std::deque<parser_c::cue_cptr> cues;
31   unsigned int current_cue_number{}, total_number_of_cues{}, total_number_of_bytes{};
32   debugging_option_c debug{"parser"};
33 
34   QRegularExpression timestamp_line_re{Q(fmt::format("^[ \\t]*{0}[ \\t]+-->[ \\t]+{0}(?:[ \\t]+([^\\n]+))?$", RE_TIMESTAMP))};
35 };
36 
parser_c()37 parser_c::parser_c()
38   : m{new impl_t()}
39 {
40 }
41 
~parser_c()42 parser_c::~parser_c() { // NOLINT(modernize-use-equals-default) due to pimpl idiom requiring explicit dtor declaration somewhere
43 }
44 
45 void
add_line(std::string const & line)46 parser_c::add_line(std::string const &line) {
47   auto tmp = mtx::string::chomp(line);
48 
49   if (tmp.empty())
50     add_block();
51 
52   else
53     m->current_block.emplace_back(std::move(tmp));
54 }
55 
56 void
add_joined_lines(std::string const & joined_lines)57 parser_c::add_joined_lines(std::string const &joined_lines) {
58   auto lines = mtx::string::split(mtx::string::chomp(mtx::string::normalize_line_endings(joined_lines)), "\n");
59 
60   for (auto const &line : lines)
61     add_line(line);
62 }
63 
64 void
add_joined_lines(memory_c const & mem)65 parser_c::add_joined_lines(memory_c const &mem) {
66   if (!mem.get_size())
67     return;
68 
69   add_joined_lines(mem.to_string());
70 }
71 
72 void
flush()73 parser_c::flush() {
74   add_block();
75   m->parsing_global_data = false;
76 }
77 
78 void
add_block()79 parser_c::add_block() {
80   if (m->current_block.empty())
81     return;
82 
83   std::string label, additional;
84   auto timestamp_line = -1;
85   auto is_other       = false;
86   QRegularExpressionMatch matches;
87 
88   if (matches = m->timestamp_line_re.match(Q(m->current_block[0])); matches.hasMatch())
89     timestamp_line = 0;
90 
91   else if (m->current_block.size() <= 1)
92     is_other = true;
93 
94   else if (matches = m->timestamp_line_re.match(Q(m->current_block[1])); matches.hasMatch()) {
95     timestamp_line = 1;
96     label          = std::move(m->current_block[0]);
97 
98   } else
99     is_other = true;
100 
101   if (is_other) {
102     auto content = mtx::string::join(m->current_block, "\n");
103     (m->parsing_global_data ? m->global_blocks : m->local_blocks).emplace_back(std::move(content));
104 
105     m->current_block.clear();
106 
107     return;
108   }
109 
110   m->parsing_global_data = false;
111 
112   timestamp_c start, end;
113   mtx::string::parse_timestamp(to_utf8(matches.captured(1)), start);
114   mtx::string::parse_timestamp(to_utf8(matches.captured(2)), end);
115 
116   auto content       = mtx::string::join(m->current_block.begin() + timestamp_line + 1, m->current_block.end(), "\n");
117   content            = adjust_embedded_timestamps(content, start.negate());
118   auto cue           = std::make_shared<cue_t>();
119   cue->m_start       = start;
120   cue->m_duration    = end - start;
121   cue->m_content     = memory_c::clone(content);
122   auto settings_list = to_utf8(matches.captured(3));
123 
124   if (! (label.empty() && settings_list.empty() && m->local_blocks.empty())) {
125     additional = settings_list + "\n" + label + "\n" + mtx::string::join(m->local_blocks, "\n");
126     cue->m_addition = memory_c::clone(additional);
127   }
128 
129   mxdebug_if(m->debug,
130              fmt::format("label «{0}» start «{1}» end «{2}» settings list «{3}» additional «{4}» content «{5}»\n",
131                          label, to_utf8(matches.captured(1)), to_utf8(matches.captured(2)), to_utf8(matches.captured(3)),
132                          to_utf8(Q(additional).replace(QRegularExpression{"\n+"}, "–")),
133                          to_utf8(Q(content)   .replace(QRegularExpression{"\n+"}, "–"))));
134 
135   m->local_blocks.clear();
136   m->current_block.clear();
137 
138   m->total_number_of_bytes += cue->m_content->get_size() + (cue->m_addition ? cue->m_addition->get_size() : 0);
139 
140   m->cues.emplace_back(cue);
141 }
142 
143 bool
codec_private_available() const144 parser_c::codec_private_available()
145   const {
146   return !m->parsing_global_data;
147 }
148 
149 memory_cptr
get_codec_private() const150 parser_c::get_codec_private()
151   const {
152   return memory_c::clone(mtx::string::join(m->global_blocks, "\n\n"));
153 }
154 
155 bool
cue_available() const156 parser_c::cue_available()
157   const {
158   return !m->cues.empty();
159 }
160 
161 parser_c::cue_cptr
get_cue()162 parser_c::get_cue() {
163   auto cue = m->cues.front();
164   m->cues.pop_front();
165   return cue;
166 }
167 
168 unsigned int
get_current_cue_number() const169 parser_c::get_current_cue_number()
170   const {
171   return m->current_cue_number;
172 }
173 
174 unsigned int
get_total_number_of_cues() const175 parser_c::get_total_number_of_cues()
176   const {
177   return m->total_number_of_cues;
178 }
179 
180 unsigned int
get_total_number_of_bytes() const181 parser_c::get_total_number_of_bytes()
182   const {
183   return m->total_number_of_bytes;
184 }
185 
186 std::string
adjust_embedded_timestamps(std::string const & text,timestamp_c const & offset)187 parser_c::adjust_embedded_timestamps(std::string const &text,
188                                             timestamp_c const &offset) {
189   static std::optional<QRegularExpression> s_embedded_timestamp_re;
190 
191   if (!s_embedded_timestamp_re)
192     s_embedded_timestamp_re = QRegularExpression{Q(fmt::format("<{0}>", RE_TIMESTAMP))};
193 
194   return mtx::string::replace(text, *s_embedded_timestamp_re, [&offset](auto const &match) {
195     timestamp_c timestamp;
196     mtx::string::parse_timestamp(to_utf8(match.captured(1)), timestamp);
197     return Q(fmt::format("<{0}>", mtx::string::format_timestamp(timestamp + offset, 3)));
198   });
199 }
200 
201 } // namespace mtx::webvtt
202