1 /*
2    mkvmerge -- utility for splicing together matroska files
3    from component media subtypes
4 
5    Distributed under the GPL v2
6    see the file COPYING for details
7    or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8 
9    locale handling functions
10 
11    Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13 
14 #include "common/common_pch.h"
15 
16 #include <cerrno>
17 #include <clocale>
18 #if HAVE_NL_LANGINFO
19 # include <langinfo.h>
20 #elif HAVE_LOCALE_CHARSET
21 # include <libcharset.h>
22 #endif
23 #if defined(SYS_WINDOWS)
24 # include <windows.h>
25 #endif
26 
27 #include <QRegularExpression>
28 
29 #include "common/memory.h"
30 #include "common/mm_io.h"
31 #include "common/mm_mem_io.h"
32 #include "common/mm_proxy_io.h"
33 #include "common/mm_text_io.h"
34 #include "common/qt.h"
35 #include "common/strings/parsing.h"
36 #ifdef SYS_WINDOWS
37 # include "common/fs_sys_helpers.h"
38 # include "common/strings/formatting.h"
39 #endif
40 
41 charset_converter_cptr g_cc_local_utf8;
42 
43 std::map<std::string, charset_converter_cptr> charset_converter_c::s_converters;
44 
charset_converter_c(std::string charset)45 charset_converter_c::charset_converter_c(std::string charset)
46   : m_charset{std::move(charset)}
47 {
48 }
49 
50 std::string
utf8(const std::string & source)51 charset_converter_c::utf8(const std::string &source) {
52   return source;
53 }
54 
55 std::string
native(const std::string & source)56 charset_converter_c::native(const std::string &source) {
57   return source;
58 }
59 
60 std::string const &
get_charset() const61 charset_converter_c::get_charset()
62   const {
63   return m_charset;
64 }
65 
66 charset_converter_cptr
init(const std::string & charset,bool ignore_errors)67 charset_converter_c::init(const std::string &charset,
68                           bool ignore_errors) {
69   std::string actual_charset = charset.empty() ? get_local_charset() : charset;
70 
71   auto converter = s_converters.find(actual_charset);
72   if (converter != s_converters.end())
73     return (*converter).second;
74 
75 #if defined(SYS_WINDOWS)
76   if (windows_charset_converter_c::is_available(actual_charset))
77     return charset_converter_cptr(new windows_charset_converter_c(actual_charset));
78 #endif
79 
80   if (ignore_errors && !iconv_charset_converter_c::is_available(actual_charset))
81     return {};
82 
83   return charset_converter_cptr(new iconv_charset_converter_c(actual_charset));
84 }
85 
86 bool
is_utf8_charset_name(const std::string & charset)87 charset_converter_c::is_utf8_charset_name(const std::string &charset) {
88   return Q(charset).contains(QRegularExpression{"^utf-?8$", QRegularExpression::CaseInsensitiveOption});
89 }
90 
91 void
enable_byte_order_marker_detection(bool enable)92 charset_converter_c::enable_byte_order_marker_detection(bool enable) {
93   m_detect_byte_order_marker = enable;
94 }
95 
96 bool
handle_string_with_bom(const std::string & source,std::string & recoded)97 charset_converter_c::handle_string_with_bom(const std::string &source,
98                                             std::string &recoded) {
99   if (!m_detect_byte_order_marker)
100     return false;
101 
102   if (!mm_text_io_c::has_byte_order_marker(source))
103     return false;
104 
105   recoded.clear();
106   mm_text_io_c io(std::make_shared<mm_mem_io_c>(reinterpret_cast<const unsigned char *>(source.c_str()), source.length()));
107   std::string line;
108   while (io.getline2(line))
109     recoded += line;
110 
111   return true;
112 }
113 
114 // ------------------------------------------------------------
115 static iconv_t const s_iconv_t_error_value = reinterpret_cast<iconv_t>(-1);
116 
iconv_charset_converter_c(const std::string & charset)117 iconv_charset_converter_c::iconv_charset_converter_c(const std::string &charset)
118   : charset_converter_c(charset)
119   , m_is_utf8(false)
120   , m_to_utf8_handle(s_iconv_t_error_value)
121   , m_from_utf8_handle(s_iconv_t_error_value)
122 {
123   if (is_utf8_charset_name(charset)) {
124     m_is_utf8 = true;
125     return;
126   }
127 
128   m_to_utf8_handle = iconv_open("UTF-8", charset.c_str());
129   if (s_iconv_t_error_value == m_to_utf8_handle)
130     mxwarn(fmt::format(Y("Could not initialize the iconv library for the conversion from {0} to UTF-8. "
131                          "Some strings will not be converted to UTF-8 and the resulting Matroska file "
132                          "might not comply with the Matroska specs (error: {1}, {2}).\n"),
133                        charset, errno, strerror(errno)));
134 
135   m_from_utf8_handle = iconv_open(charset.c_str(), "UTF-8");
136   if (s_iconv_t_error_value == m_from_utf8_handle)
137     mxwarn(fmt::format(Y("Could not initialize the iconv library for the conversion from UTF-8 to {0}. "
138                          "Some strings cannot be converted from UTF-8 and might be displayed incorrectly (error: {1}, {2}).\n"),
139                        charset, errno, strerror(errno)));
140 }
141 
~iconv_charset_converter_c()142 iconv_charset_converter_c::~iconv_charset_converter_c() {
143   if (s_iconv_t_error_value != m_to_utf8_handle)
144     iconv_close(m_to_utf8_handle);
145 
146   if (s_iconv_t_error_value != m_from_utf8_handle)
147     iconv_close(m_from_utf8_handle);
148 }
149 
150 std::string
utf8(const std::string & source)151 iconv_charset_converter_c::utf8(const std::string &source) {
152   std::string recoded;
153   if (handle_string_with_bom(source, recoded))
154     return recoded;
155 
156   return m_is_utf8 ? source : iconv_charset_converter_c::convert(m_to_utf8_handle, source);
157 }
158 
159 std::string
native(const std::string & source)160 iconv_charset_converter_c::native(const std::string &source) {
161   return m_is_utf8 ? source : iconv_charset_converter_c::convert(m_from_utf8_handle, source);
162 }
163 
164 std::string
convert(iconv_t handle,const std::string & source)165 iconv_charset_converter_c::convert(iconv_t handle,
166                                    const std::string &source) {
167   if (s_iconv_t_error_value == handle)
168     return source;
169 
170   int length        = source.length() * 4;
171   char *destination = (char *)safemalloc(length + 1);
172   memset(destination, 0, length + 1);
173 
174   iconv(handle, nullptr, nullptr, nullptr, nullptr); // Reset the iconv state.
175 
176   size_t length_source      = length / 4;
177   size_t length_destination = length;
178   char *source_copy         = safestrdup(source.c_str());
179   char *ptr_source          = source_copy;
180   char *ptr_destination     = destination;
181   iconv(handle, (ICONV_CONST char **)&ptr_source, &length_source, &ptr_destination, &length_destination);
182   iconv(handle, nullptr, nullptr, &ptr_destination, &length_destination);
183 
184   safefree(source_copy);
185   std::string result = destination;
186   safefree(destination);
187 
188   return result;
189 }
190 
191 bool
is_available(const std::string & charset)192 iconv_charset_converter_c::is_available(const std::string &charset) {
193   if (is_utf8_charset_name(charset))
194     return true;
195 
196   iconv_t handle = iconv_open("UTF-8", charset.c_str());
197   if (s_iconv_t_error_value == handle)
198     return false;
199 
200   iconv_close(handle);
201 
202   return true;
203 }
204 
205 // ------------------------------------------------------------
206 
207 #if defined(SYS_WINDOWS)
208 
windows_charset_converter_c(const std::string & charset)209 windows_charset_converter_c::windows_charset_converter_c(const std::string &charset)
210   : charset_converter_c(charset)
211   , m_is_utf8(is_utf8_charset_name(charset))
212   , m_code_page(extract_code_page(charset))
213 {
214 }
215 
~windows_charset_converter_c()216 windows_charset_converter_c::~windows_charset_converter_c() {
217 }
218 
219 std::string
utf8(const std::string & source)220 windows_charset_converter_c::utf8(const std::string &source) {
221   std::string recoded;
222   if (handle_string_with_bom(source, recoded))
223     return recoded;
224 
225   return m_is_utf8 ? source : windows_charset_converter_c::convert(m_code_page, CP_UTF8, source);
226 }
227 
228 std::string
native(const std::string & source)229 windows_charset_converter_c::native(const std::string &source) {
230   return m_is_utf8 ? source : windows_charset_converter_c::convert(CP_UTF8, m_code_page, source);
231 }
232 
233 std::string
convert(unsigned int source_code_page,unsigned int destination_code_page,const std::string & source)234 windows_charset_converter_c::convert(unsigned int source_code_page,
235                                      unsigned int destination_code_page,
236                                      const std::string &source) {
237   if (source_code_page == destination_code_page)
238     return source;
239 
240   int num_wide_chars = MultiByteToWideChar(source_code_page, 0, source.c_str(), -1, nullptr, 0);
241   wchar_t *wbuffer   = new wchar_t[num_wide_chars];
242   MultiByteToWideChar(source_code_page, 0, source.c_str(), -1, wbuffer, num_wide_chars);
243 
244   int num_bytes = WideCharToMultiByte(destination_code_page, 0, wbuffer, -1, nullptr, 0, nullptr, nullptr);
245   char *buffer  = new char[num_bytes];
246   WideCharToMultiByte(destination_code_page, 0, wbuffer, -1, buffer, num_bytes, nullptr, nullptr);
247 
248   std::string result = buffer;
249 
250   delete []wbuffer;
251   delete []buffer;
252 
253   return result;
254 }
255 
256 bool
is_available(const std::string & charset)257 windows_charset_converter_c::is_available(const std::string &charset) {
258   unsigned int code_page = extract_code_page(charset);
259   if (0 == code_page)
260     return false;
261 
262   return IsValidCodePage(code_page);
263 }
264 
265 unsigned int
extract_code_page(const std::string & charset)266 windows_charset_converter_c::extract_code_page(const std::string &charset) {
267   if (charset.substr(0, 2) != "CP")
268     return 0;
269 
270   std::string number_as_str = charset.substr(2, charset.length() - 2);
271   uint64_t number           = 0;
272   if (!mtx::string::parse_number(number_as_str.c_str(), number))
273     return 0;
274 
275   return number;
276 }
277 
278 #endif  // defined(SYS_WINDOWS)
279 
280 // ------------------------------------------------------------
281 
282 std::string
get_local_charset()283 get_local_charset() {
284   std::string lc_charset;
285 
286   setlocale(LC_CTYPE, "");
287 #if defined(COMP_MINGW) || defined(COMP_MSC)
288   lc_charset = fmt::format("CP{0}", GetACP());
289 #elif defined(SYS_SOLARIS)
290   int i;
291 
292   lc_charset = nl_langinfo(CODESET);
293   if (mtx::string::parse_number(lc_charset, i))
294     lc_charset = "ISO"s + lc_charset + "-US"s;
295 #elif HAVE_NL_LANGINFO
296   lc_charset = nl_langinfo(CODESET);
297 #elif HAVE_LOCALE_CHARSET
298   lc_charset = locale_charset();
299 #endif
300 
301   return lc_charset;
302 }
303 
304 std::string
get_local_console_charset()305 get_local_console_charset() {
306 #if defined(SYS_WINDOWS)
307   return fmt::format("CP{0}", GetACP());
308 #else
309   return get_local_charset();
310 #endif
311 }
312