1 /*
2 mkvmerge -- utility for splicing together matroska files
3 from component media subtypes
4
5 Distributed under the GPL v2
6 see the file COPYING for details
7 or visit https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
8
9 locale handling functions
10
11 Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13
14 #include "common/common_pch.h"
15
16 #include <cerrno>
17 #include <clocale>
18 #if HAVE_NL_LANGINFO
19 # include <langinfo.h>
20 #elif HAVE_LOCALE_CHARSET
21 # include <libcharset.h>
22 #endif
23 #if defined(SYS_WINDOWS)
24 # include <windows.h>
25 #endif
26
27 #include <QRegularExpression>
28
29 #include "common/memory.h"
30 #include "common/mm_io.h"
31 #include "common/mm_mem_io.h"
32 #include "common/mm_proxy_io.h"
33 #include "common/mm_text_io.h"
34 #include "common/qt.h"
35 #include "common/strings/parsing.h"
36 #ifdef SYS_WINDOWS
37 # include "common/fs_sys_helpers.h"
38 # include "common/strings/formatting.h"
39 #endif
40
41 charset_converter_cptr g_cc_local_utf8;
42
43 std::map<std::string, charset_converter_cptr> charset_converter_c::s_converters;
44
charset_converter_c(std::string charset)45 charset_converter_c::charset_converter_c(std::string charset)
46 : m_charset{std::move(charset)}
47 {
48 }
49
50 std::string
utf8(const std::string & source)51 charset_converter_c::utf8(const std::string &source) {
52 return source;
53 }
54
55 std::string
native(const std::string & source)56 charset_converter_c::native(const std::string &source) {
57 return source;
58 }
59
60 std::string const &
get_charset() const61 charset_converter_c::get_charset()
62 const {
63 return m_charset;
64 }
65
66 charset_converter_cptr
init(const std::string & charset,bool ignore_errors)67 charset_converter_c::init(const std::string &charset,
68 bool ignore_errors) {
69 std::string actual_charset = charset.empty() ? get_local_charset() : charset;
70
71 auto converter = s_converters.find(actual_charset);
72 if (converter != s_converters.end())
73 return (*converter).second;
74
75 #if defined(SYS_WINDOWS)
76 if (windows_charset_converter_c::is_available(actual_charset))
77 return charset_converter_cptr(new windows_charset_converter_c(actual_charset));
78 #endif
79
80 if (ignore_errors && !iconv_charset_converter_c::is_available(actual_charset))
81 return {};
82
83 return charset_converter_cptr(new iconv_charset_converter_c(actual_charset));
84 }
85
86 bool
is_utf8_charset_name(const std::string & charset)87 charset_converter_c::is_utf8_charset_name(const std::string &charset) {
88 return Q(charset).contains(QRegularExpression{"^utf-?8$", QRegularExpression::CaseInsensitiveOption});
89 }
90
91 void
enable_byte_order_marker_detection(bool enable)92 charset_converter_c::enable_byte_order_marker_detection(bool enable) {
93 m_detect_byte_order_marker = enable;
94 }
95
96 bool
handle_string_with_bom(const std::string & source,std::string & recoded)97 charset_converter_c::handle_string_with_bom(const std::string &source,
98 std::string &recoded) {
99 if (!m_detect_byte_order_marker)
100 return false;
101
102 if (!mm_text_io_c::has_byte_order_marker(source))
103 return false;
104
105 recoded.clear();
106 mm_text_io_c io(std::make_shared<mm_mem_io_c>(reinterpret_cast<const unsigned char *>(source.c_str()), source.length()));
107 std::string line;
108 while (io.getline2(line))
109 recoded += line;
110
111 return true;
112 }
113
114 // ------------------------------------------------------------
115 static iconv_t const s_iconv_t_error_value = reinterpret_cast<iconv_t>(-1);
116
iconv_charset_converter_c(const std::string & charset)117 iconv_charset_converter_c::iconv_charset_converter_c(const std::string &charset)
118 : charset_converter_c(charset)
119 , m_is_utf8(false)
120 , m_to_utf8_handle(s_iconv_t_error_value)
121 , m_from_utf8_handle(s_iconv_t_error_value)
122 {
123 if (is_utf8_charset_name(charset)) {
124 m_is_utf8 = true;
125 return;
126 }
127
128 m_to_utf8_handle = iconv_open("UTF-8", charset.c_str());
129 if (s_iconv_t_error_value == m_to_utf8_handle)
130 mxwarn(fmt::format(Y("Could not initialize the iconv library for the conversion from {0} to UTF-8. "
131 "Some strings will not be converted to UTF-8 and the resulting Matroska file "
132 "might not comply with the Matroska specs (error: {1}, {2}).\n"),
133 charset, errno, strerror(errno)));
134
135 m_from_utf8_handle = iconv_open(charset.c_str(), "UTF-8");
136 if (s_iconv_t_error_value == m_from_utf8_handle)
137 mxwarn(fmt::format(Y("Could not initialize the iconv library for the conversion from UTF-8 to {0}. "
138 "Some strings cannot be converted from UTF-8 and might be displayed incorrectly (error: {1}, {2}).\n"),
139 charset, errno, strerror(errno)));
140 }
141
~iconv_charset_converter_c()142 iconv_charset_converter_c::~iconv_charset_converter_c() {
143 if (s_iconv_t_error_value != m_to_utf8_handle)
144 iconv_close(m_to_utf8_handle);
145
146 if (s_iconv_t_error_value != m_from_utf8_handle)
147 iconv_close(m_from_utf8_handle);
148 }
149
150 std::string
utf8(const std::string & source)151 iconv_charset_converter_c::utf8(const std::string &source) {
152 std::string recoded;
153 if (handle_string_with_bom(source, recoded))
154 return recoded;
155
156 return m_is_utf8 ? source : iconv_charset_converter_c::convert(m_to_utf8_handle, source);
157 }
158
159 std::string
native(const std::string & source)160 iconv_charset_converter_c::native(const std::string &source) {
161 return m_is_utf8 ? source : iconv_charset_converter_c::convert(m_from_utf8_handle, source);
162 }
163
164 std::string
convert(iconv_t handle,const std::string & source)165 iconv_charset_converter_c::convert(iconv_t handle,
166 const std::string &source) {
167 if (s_iconv_t_error_value == handle)
168 return source;
169
170 int length = source.length() * 4;
171 char *destination = (char *)safemalloc(length + 1);
172 memset(destination, 0, length + 1);
173
174 iconv(handle, nullptr, nullptr, nullptr, nullptr); // Reset the iconv state.
175
176 size_t length_source = length / 4;
177 size_t length_destination = length;
178 char *source_copy = safestrdup(source.c_str());
179 char *ptr_source = source_copy;
180 char *ptr_destination = destination;
181 iconv(handle, (ICONV_CONST char **)&ptr_source, &length_source, &ptr_destination, &length_destination);
182 iconv(handle, nullptr, nullptr, &ptr_destination, &length_destination);
183
184 safefree(source_copy);
185 std::string result = destination;
186 safefree(destination);
187
188 return result;
189 }
190
191 bool
is_available(const std::string & charset)192 iconv_charset_converter_c::is_available(const std::string &charset) {
193 if (is_utf8_charset_name(charset))
194 return true;
195
196 iconv_t handle = iconv_open("UTF-8", charset.c_str());
197 if (s_iconv_t_error_value == handle)
198 return false;
199
200 iconv_close(handle);
201
202 return true;
203 }
204
205 // ------------------------------------------------------------
206
207 #if defined(SYS_WINDOWS)
208
windows_charset_converter_c(const std::string & charset)209 windows_charset_converter_c::windows_charset_converter_c(const std::string &charset)
210 : charset_converter_c(charset)
211 , m_is_utf8(is_utf8_charset_name(charset))
212 , m_code_page(extract_code_page(charset))
213 {
214 }
215
~windows_charset_converter_c()216 windows_charset_converter_c::~windows_charset_converter_c() {
217 }
218
219 std::string
utf8(const std::string & source)220 windows_charset_converter_c::utf8(const std::string &source) {
221 std::string recoded;
222 if (handle_string_with_bom(source, recoded))
223 return recoded;
224
225 return m_is_utf8 ? source : windows_charset_converter_c::convert(m_code_page, CP_UTF8, source);
226 }
227
228 std::string
native(const std::string & source)229 windows_charset_converter_c::native(const std::string &source) {
230 return m_is_utf8 ? source : windows_charset_converter_c::convert(CP_UTF8, m_code_page, source);
231 }
232
233 std::string
convert(unsigned int source_code_page,unsigned int destination_code_page,const std::string & source)234 windows_charset_converter_c::convert(unsigned int source_code_page,
235 unsigned int destination_code_page,
236 const std::string &source) {
237 if (source_code_page == destination_code_page)
238 return source;
239
240 int num_wide_chars = MultiByteToWideChar(source_code_page, 0, source.c_str(), -1, nullptr, 0);
241 wchar_t *wbuffer = new wchar_t[num_wide_chars];
242 MultiByteToWideChar(source_code_page, 0, source.c_str(), -1, wbuffer, num_wide_chars);
243
244 int num_bytes = WideCharToMultiByte(destination_code_page, 0, wbuffer, -1, nullptr, 0, nullptr, nullptr);
245 char *buffer = new char[num_bytes];
246 WideCharToMultiByte(destination_code_page, 0, wbuffer, -1, buffer, num_bytes, nullptr, nullptr);
247
248 std::string result = buffer;
249
250 delete []wbuffer;
251 delete []buffer;
252
253 return result;
254 }
255
256 bool
is_available(const std::string & charset)257 windows_charset_converter_c::is_available(const std::string &charset) {
258 unsigned int code_page = extract_code_page(charset);
259 if (0 == code_page)
260 return false;
261
262 return IsValidCodePage(code_page);
263 }
264
265 unsigned int
extract_code_page(const std::string & charset)266 windows_charset_converter_c::extract_code_page(const std::string &charset) {
267 if (charset.substr(0, 2) != "CP")
268 return 0;
269
270 std::string number_as_str = charset.substr(2, charset.length() - 2);
271 uint64_t number = 0;
272 if (!mtx::string::parse_number(number_as_str.c_str(), number))
273 return 0;
274
275 return number;
276 }
277
278 #endif // defined(SYS_WINDOWS)
279
280 // ------------------------------------------------------------
281
// Determines the name of the charset of the current locale.
//
// The LC_CTYPE category is first set from the environment. Which source is
// queried afterwards depends on the platform: the ANSI code page (as
// "CP<number>") for MinGW/MSVC builds, nl_langinfo(CODESET) or
// locale_charset() elsewhere. On Solaris a codeset that parses as a number
// is wrapped as "ISO<number>-US". If none of the sources is compiled in,
// an empty string is returned.
std::string
get_local_charset() {
  setlocale(LC_CTYPE, "");

#if defined(COMP_MINGW) || defined(COMP_MSC)
  return fmt::format("CP{0}", GetACP());

#elif defined(SYS_SOLARIS)
  std::string codeset = nl_langinfo(CODESET);
  int numeric_codeset{};

  if (mtx::string::parse_number(codeset, numeric_codeset))
    codeset = "ISO"s + codeset + "-US"s;

  return codeset;

#elif HAVE_NL_LANGINFO
  return nl_langinfo(CODESET);

#elif HAVE_LOCALE_CHARSET
  return locale_charset();

#else
  return {};
#endif
}
303
304 std::string
get_local_console_charset()305 get_local_console_charset() {
306 #if defined(SYS_WINDOWS)
307 return fmt::format("CP{0}", GetACP());
308 #else
309 return get_local_charset();
310 #endif
311 }
312