1 /*
2    mkvmerge -- utility for splicing together matroska files
3    from component media subtypes
4 
5    Distributed under the GPL v2
6    see the file COPYING for details
7    or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8 
9    Subripper subtitle reader
10 
11    Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13 
14 #include "common/common_pch.h"
15 
16 #include <matroska/KaxTracks.h>
17 
18 #include "common/codec.h"
19 #include "common/debugging.h"
20 #include "common/strings/editing.h"
21 #include "common/strings/parsing.h"
22 #include "common/strings/utf8.h"
23 #include "merge/connection_checks.h"
24 #include "merge/output_control.h"
25 #include "merge/packet_extensions.h"
26 #include "output/p_textsubs.h"
27 
28 using namespace libmatroska;
29 
textsubs_packetizer_c(generic_reader_c * p_reader,track_info_c & p_ti,const char * codec_id,bool recode)30 textsubs_packetizer_c::textsubs_packetizer_c(generic_reader_c *p_reader,
31                                              track_info_c &p_ti,
32                                              const char *codec_id,
33                                              bool recode)
34   : generic_packetizer_c(p_reader, p_ti)
35   , m_codec_id{codec_id}
36 {
37   if (recode) {
38     m_cc_utf8           = charset_converter_c::init(m_ti.m_sub_charset);
39     m_converter_is_utf8 = charset_converter_c::is_utf8_charset_name(m_cc_utf8->get_charset());
40     m_try_utf8          = m_ti.m_sub_charset.empty();
41   }
42 
43   set_track_type(track_subtitle);
44   if (m_codec_id == MKV_S_TEXTUSF)
45     set_default_compression_method(COMPRESSION_ZLIB);
46 
47   auto arg = std::string{};
48   if (!debugging_c::requested("textsubs_force_rerender", &arg))
49     return;
50 
51   auto tid_and_packetno = mtx::string::split(arg, ":");
52   auto tid              = int64_t{};
53   if (!mtx::string::parse_number(tid_and_packetno[0], tid) || (tid != m_ti.m_id))
54     return;
55 
56   unsigned int packetno{};
57   mtx::string::parse_number(tid_and_packetno[1], packetno);
58   m_force_rerender_track_headers_on_packetno = packetno;
59 
60   mxdebug(fmt::format("textsubs_packetizer_c: track {0}: forcing rerendering of track headers after packet {1}\n", tid, packetno));
61 }
62 
~textsubs_packetizer_c()63 textsubs_packetizer_c::~textsubs_packetizer_c() {
64 }
65 
66 void
set_headers()67 textsubs_packetizer_c::set_headers() {
68   set_codec_id(m_codec_id);
69   set_codec_private(m_ti.m_private_data);
70 
71   generic_packetizer_c::set_headers();
72 
73   m_track_entry->EnableLacing(false);
74 }
75 
76 void
set_line_ending_style(mtx::string::line_ending_style_e line_ending_style)77 textsubs_packetizer_c::set_line_ending_style(mtx::string::line_ending_style_e line_ending_style) {
78   m_line_ending_style = line_ending_style;
79 }
80 
81 void
process_impl(packet_cptr const & packet)82 textsubs_packetizer_c::process_impl(packet_cptr const &packet) {
83   if (m_buffered_packet) {
84     m_buffered_packet->duration = packet->timestamp - m_buffered_packet->timestamp;
85     process_one_packet(m_buffered_packet);
86     m_buffered_packet.reset();
87   }
88 
89   auto subs = recode(packet->data->to_string());
90   subs      = mtx::string::normalize_line_endings(subs, m_line_ending_style);
91 
92   mtx::string::strip_back(subs);
93 
94   if (subs.empty())
95     return;
96 
97   packet->data = memory_c::clone(subs);
98 
99   packet->force_key_frame();
100 
101   if (0 <= packet->duration)
102     process_one_packet(packet);
103 
104   else {
105     m_buffered_packet = packet;
106     m_buffered_packet->data->take_ownership();
107   }
108 }
109 
110 std::string
recode(std::string subs)111 textsubs_packetizer_c::recode(std::string subs) {
112   if (m_try_utf8 && !mtx::utf8::is_valid(subs))
113     m_try_utf8 = false;
114 
115   auto emit_invalid_utf8_warning = false;
116 
117   if (!m_try_utf8 && m_cc_utf8) {
118     if (!m_invalid_utf8_warned && m_converter_is_utf8 && !mtx::utf8::is_valid(subs))
119       emit_invalid_utf8_warning = true;
120 
121     subs = m_cc_utf8->utf8(subs);
122 
123   } else if (!m_invalid_utf8_warned && !mtx::utf8::is_valid(subs))
124     emit_invalid_utf8_warning = true;
125 
126   if (emit_invalid_utf8_warning) {
127     m_invalid_utf8_warned = true;
128     mxwarn_tid(m_ti.m_fname, m_ti.m_id, fmt::format(Y("This text subtitle track contains invalid 8-bit characters outside valid multi-byte UTF-8 sequences. Please specify the correct encoding for this track.\n")));
129   }
130 
131   return subs;
132 }
133 
134 void
process_one_packet(packet_cptr const & packet)135 textsubs_packetizer_c::process_one_packet(packet_cptr const &packet) {
136   ++m_packetno;
137 
138   if (0 > packet->duration) {
139     subtitle_number_packet_extension_c *extension = dynamic_cast<subtitle_number_packet_extension_c *>(packet->find_extension(packet_extension_c::SUBTITLE_NUMBER));
140     mxwarn_tid(m_ti.m_fname, m_ti.m_id, fmt::format(Y("Ignoring an entry which starts after it ends ({0}).\n"), extension ? extension->get_number() : static_cast<unsigned int>(m_packetno)));
141     return;
142   }
143 
144   packet->duration_mandatory = true;
145 
146   add_packet(packet);
147 
148   if (m_force_rerender_track_headers_on_packetno && (*m_force_rerender_track_headers_on_packetno == m_packetno)) {
149     auto codec_private = memory_c::alloc(20000);
150     std::memset(codec_private->get_buffer(), 0, codec_private->get_size());
151     set_codec_private(codec_private);
152     rerender_track_headers();
153   }
154 }
155 
156 connection_result_e
can_connect_to(generic_packetizer_c * src,std::string & error_message)157 textsubs_packetizer_c::can_connect_to(generic_packetizer_c *src,
158                                       std::string &error_message) {
159   textsubs_packetizer_c *psrc = dynamic_cast<textsubs_packetizer_c *>(src);
160   if (!psrc)
161     return CAN_CONNECT_NO_FORMAT;
162 
163   connect_check_codec_private(src);
164 
165   return CAN_CONNECT_YES;
166 }
167