1 /*
2 * mptString.cpp
3 * -------------
4 * Purpose: Small string-related utilities, number and message formatting.
5 * Notes : Currently none.
6 * Authors: OpenMPT Devs
7 * The OpenMPT source code is released under the BSD license. Read LICENSE for more details.
8 */
9
10 #include "stdafx.h"
11 #include "mptString.h"
12
13 #include "mpt/string/types.hpp"
14 #include "mpt/string/utility.hpp"
15 #include "mpt/string_transcode/transcode.hpp"
16
17 #include <locale>
18 #include <string>
19 #include <vector>
20
21 #include <cstdlib>
22
23 #if defined(MODPLUG_TRACKER)
24 #include <cwctype>
25 #endif // MODPLUG_TRACKER
26
27 #if defined(MODPLUG_TRACKER)
28 #include <wctype.h>
29 #endif // MODPLUG_TRACKER
30
31 #if MPT_OS_WINDOWS
32 #include <windows.h>
33 #endif // MPT_OS_WINDOWS
34
35
36 OPENMPT_NAMESPACE_BEGIN
37
38
39
40 /*
41
42
43
44 Quick guide to the OpenMPT string type jungle
45 =============================================
46
47
48
49 This quick guide is only meant as a hint. There may be valid reasons to not
50 honor the recommendations found here. Staying consistent with surrounding and/or
51 related code sections may also be important.
52
53
54
55 List of string types
56 --------------------
57
58 * std::string (OpenMPT, libopenmpt)
	C++ string of unspecified 8bit encoding. Try to always document the
60 encoding if not clear from context. Do not use unless there is an obvious
61 reason to do so.
62
63 * std::wstring (OpenMPT)
64 UTF16 (on windows) or UTF32 (otherwise). Do not use unless there is an
65 obvious reason to do so.
66
67 * mpt::lstring (OpenMPT)
68 OpenMPT locale string type. The encoding is always CP_ACP. Do not use unless
69 there is an obvious reason to do so.
70
71 * char* (OpenMPT, libopenmpt)
72 C string of unspecified encoding. Use only for static literals or in
73 performance critical inner loops where full control and avoidance of memory
74 allocations is required.
75
76 * wchar_t* (OpenMPT)
77 C wide string. Use only if Unicode is required for static literals or in
78 performance critical inner loops where full control and avoidance of memory
79 allocation is required.
80
81 * mpt::winstring (OpenMPT)
82 OpenMPT type-safe string to interface with native WinAPI, either encoded in
83 locale/CP_ACP (if !UNICODE) or UTF16 (if UNICODE).
84
85 * CString (OpenMPT)
86 MFC string type, either encoded in locale/CP_ACP (if !UNICODE) or UTF16 (if
87 UNICODE). Specify literals with _T(""). Use in MFC GUI code.
88
89 * CStringA (OpenMPT)
90 MFC ANSI string type. The encoding is always CP_ACP. Do not use.
91
92 * CStringW (OpenMPT)
93 MFC Unicode string type. Do not use.
94
95 * mpt::PathString (OpenMPT, libopenmpt)
96 String type representing paths and filenames. Always use for these in order
97 to avoid potentially lossy conversions. Use P_("") macro for
98 literals.
99
100 * mpt::ustring (OpenMPT, libopenmpt)
101 The default unicode string type. Can be encoded in UTF8 or UTF16 or UTF32,
	depending on MPT_USTRING_MODE_* and sizeof(wchar_t). Literals can be written as
103 U_(""). Use as your default string type if no other string type is
104 a measurably better fit.
105
106 * MPT_UTF8 (OpenMPT, libopenmpt)
107 Macro that generates a mpt::ustring from string literals containing
108 non-ascii characters. In order to keep the source code in ascii encoding,
109 always express non-ascii characters using explicit \x23 escaping. Note that
110 depending on the underlying type of mpt::ustring, MPT_UTF8 *requires* a
111 runtime conversion. Only use for string literals containing non-ascii
112 characters (use MPT_USTRING otherwise).
113
114 * MPT_ULITERAL / MPT_UCHAR / mpt::uchar (OpenMPT, libopenmpt)
115 Macros which generate string literals, char literals and the char literal
116 type respectively. These are especially useful in constexpr contexts or
117 global data where MPT_USTRING is either unusable or requires a global
	constructor to run. Do NOT use as a performance optimization in place of
119 MPT_USTRING however, because MPT_USTRING can be converted to C++11/14 user
120 defined literals eventually, while MPT_ULITERAL cannot because of constexpr
121 requirements.
122
123 * mpt::RawPathString (OpenMPT, libopenmpt)
124 Internal representation of mpt::PathString. Only use for parsing path
125 fragments.
126
127 * mpt::u8string (OpenMPT, libopenmpt)
128 Internal representation of mpt::ustring. Do not use directly. Ever.
129
130 * std::basic_string<char> (OpenMPT)
131 Same as std::string. Do not use std::basic_string in the templated form.
132
133 * std::basic_string<wchar_t> (OpenMPT)
134 Same as std::wstring. Do not use std::basic_string in the templated form.
135
136 The following string types are available in order to avoid the need to overload
137 functions on a huge variety of string types. Use only ever as function argument
138 types.
139 Note that the locale charset is not available on all libopenmpt builds (in which
140 case the option is ignored or a sensible fallback is used; these types are
141 always available).
142 All these types publicly inherit from mpt::ustring and do not contain any
143 additional state. This means that they work the same way as mpt::ustring does
144 and do support type-slicing for both, read and write accesses.
145 These types only add conversion constructors for all string types that have a
146 defined encoding and for all 8bit string types using the specified encoding
147 heuristic.
148
149 * AnyUnicodeString (OpenMPT, libopenmpt)
150 Is constructible from any Unicode string.
151
152 * AnyString (OpenMPT, libopenmpt)
153 Tries to do the smartest auto-magic we can do.
154
155 * AnyStringLocale (OpenMPT, libopenmpt)
156 char-based strings are assumed to be in locale encoding.
157
158 * AnyStringUTF8orLocale (OpenMPT, libopenmpt)
159 char-based strings are tried in UTF8 first, if this fails, locale is used.
160
161 * AnyStringUTF8 (OpenMPT, libopenmpt)
162 char-based strings are assumed to be in UTF8.
163
164
165
166 Encoding of 8bit strings
167 ------------------------
168
169 8bit strings have an unspecified encoding. When the string is contained within a
170 CSoundFile object, the encoding is most likely CSoundFile::GetCharsetInternal(),
171 otherwise, try to gather the most probable encoding from surrounding or related
172 code sections.
173
174
175
176 Decision tree to help deciding which string type to use
177 -------------------------------------------------------
178
179 if in libopenmpt
180 if in libopenmpt c++ interface
181 T = std::string, the encoding is utf8
182 elif in libopenmpt c interface
183 T = char*, the encoding is utf8
184 elif performance critical inner loop
185 T = char*, document the encoding if not clear from context
186 elif string literal containing non-ascii characters
187 T = MPT_UTF8
188 elif path or file
189 if parsing path fragments
190 T = mpt::RawPathString
191 template your function on the concrete underlying string type
192 (std::string and std::wstring) or use preprocessor MPT_OS_WINDOWS
193 else
194 T = mpt::PathString
195 fi
196 else
197 T = mpt::ustring
198 fi
199 else
200 if performance critical inner loop
201 if needs unicode support
202 T = mpt::uchar* / MPT_ULITERAL
203 else
204 T = char*, document the encoding if not clear from context
205 fi
206 elif string literal containing non-ascii characters
207 T = MPT_UTF8
208 elif path or file
209 if parsing path fragments
210 T = mpt::RawPathString
211 template your function on the concrete underlying string type
212 (std::string and std::wstring) or use preprocessor MPT_OS_WINDOWS
213 else
214 T = mpt::PathString
215 fi
216 elif winapi interfacing code
217 T = mpt::winstring
218 elif mfc/gui code
219 T = CString
220 else
221 if constexpr context or global data
222 T = mpt::uchar* / MPT_ULITERAL
223 else
224 T = mpt::ustring
225 fi
226 fi
227 fi
228
229 This boils down to: Prefer mpt::PathString and mpt::ustring, and only use any
230 other string type if there is an obvious reason to do so.
231
232
233
234 Character set conversions
235 -------------------------
236
237 Character set conversions in OpenMPT are always fuzzy.
238
239 Behaviour in case of an invalid source encoding and behaviour in case of an
240 unrepresentable destination encoding can be any of the following:
241 * The character is replaced by some replacement character ('?' or L'\ufffd' in
242 most cases).
243 * The character is replaced by a similar character (either semantically
	similar or visually similar).
245 * The character is transcribed with some ASCII text.
246 * The character is discarded.
247 * Conversion stops at this very character.
248
Additionally, conversion may stop or continue on \0 characters in the middle of
250 the string.
251
252 Behaviour can vary from one conversion tuple to any other.
253
254 If you need to ensure lossless conversion, do a roundtrip conversion and check
255 for equality.
256
257
258
259 Unicode handling
260 ----------------
261
262 OpenMPT is generally not aware of and does not handle different Unicode
263 normalization forms.
264 You should be aware of the following possibilities:
265 * Conversion between UTF8, UTF16, UTF32 may or may not change between NFC and
266 NFD.
267 * Conversion from any non-Unicode 8bit encoding can result in both, NFC or NFD
268 forms.
269 * Conversion to any non-Unicode 8bit encoding may or may not involve
270 conversion to NFC, NFD, NFKC or NFKD during the conversion. This in
271 particular means that conversion of decomposed german umlauts to ISO8859-1
272 may fail.
273 * Changing the normalization form of path strings may render the file
274 inaccessible.
275
276 Unicode BOM may or may not be preserved and/or discarded during conversion.
277
278 Invalid Unicode code points may be treated as invalid or as valid characters
279 when converting between different Unicode encodings.
280
281
282
283 Interfacing with WinAPI
284 -----------------------
285
286 When in MFC code, use CString.
287 When in non MFC code, either use std::wstring when directly interfacing with
288 APIs only available in WCHAR variants, or use mpt::winstring and
289 mpt::WinStringBuf helpers otherwise.
290 Specify TCHAR string literals with _T("foo") in mptrack/, and with TEXT("foo")
291 in common/ or sounddev/. _T() requires <tchar.h> which is specific to the MSVC
292 runtime and not portable across compilers. TEXT() is from <windows.h>. We use
293 _T() in mptrack/ only because it is shorter.
294
295
296
297 */
298
299
300
301 namespace mpt { namespace String {
302
303
304
// Shorthand for the table definitions below: expands to mpt::char_value() of a character literal.
// It is undefined again (#undef C) after the last table.
#define C(x) (mpt::char_value((x)))

// AMS1 actually only supports ASCII plus the modified control characters and no high chars at all.
// Just default to CP437 for those to keep things simple.
// The array is indexed by the 8bit character value; each entry is the character it is mapped to
// (used as the charset table for Charset::CP437AMS in EncodeImpl / DecodeImpl).
// In the first two rows, entry 0x00 is a space and entries 0x04, 0x06, 0x0e, 0x0f, 0x14 and 0x19
// hold 0x00e4, 0x00e5, 0x00c4, 0x00c5, 0x00f6 and 0x00d6; all other entries below 0x20 map to their own value.
static constexpr char32_t CharsetTableCP437AMS[256] = {
	C(' '),0x0001,0x0002,0x0003,0x00e4,0x0005,0x00e5,0x0007,0x0008,0x0009,0x000a,0x000b,0x000c,0x000d,0x00c4,0x00c5, // differs from CP437
	0x0010,0x0011,0x0012,0x0013,0x00f6,0x0015,0x0016,0x0017,0x0018,0x00d6,0x001a,0x001b,0x001c,0x001d,0x001e,0x001f, // differs from CP437
	0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002a,0x002b,0x002c,0x002d,0x002e,0x002f,
	0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003a,0x003b,0x003c,0x003d,0x003e,0x003f,
	0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004a,0x004b,0x004c,0x004d,0x004e,0x004f,
	0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005a,0x005b,0x005c,0x005d,0x005e,0x005f,
	0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,
	0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2302,
	0x00c7,0x00fc,0x00e9,0x00e2,0x00e4,0x00e0,0x00e5,0x00e7,0x00ea,0x00eb,0x00e8,0x00ef,0x00ee,0x00ec,0x00c4,0x00c5,
	0x00c9,0x00e6,0x00c6,0x00f4,0x00f6,0x00f2,0x00fb,0x00f9,0x00ff,0x00d6,0x00dc,0x00a2,0x00a3,0x00a5,0x20a7,0x0192,
	0x00e1,0x00ed,0x00f3,0x00fa,0x00f1,0x00d1,0x00aa,0x00ba,0x00bf,0x2310,0x00ac,0x00bd,0x00bc,0x00a1,0x00ab,0x00bb,
	0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255d,0x255c,0x255b,0x2510,
	0x2514,0x2534,0x252c,0x251c,0x2500,0x253c,0x255e,0x255f,0x255a,0x2554,0x2569,0x2566,0x2560,0x2550,0x256c,0x2567,
	0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256b,0x256a,0x2518,0x250c,0x2588,0x2584,0x258c,0x2590,0x2580,
	0x03b1,0x00df,0x0393,0x03c0,0x03a3,0x03c3,0x00b5,0x03c4,0x03a6,0x0398,0x03a9,0x03b4,0x221e,0x03c6,0x03b5,0x2229,
	0x2261,0x00b1,0x2265,0x2264,0x2320,0x2321,0x00f7,0x2248,0x00b0,0x2219,0x00b7,0x221a,0x207f,0x00b2,0x25a0,0x00a0
};
327
// AMS2: Looking at Velvet Studio's bitmap font (TPIC32.PCX), these appear to be the only supported non-ASCII chars.
// The array is indexed by the 8bit character value; each entry is the character it is mapped to
// (used as the charset table for Charset::CP437AMS2 in EncodeImpl / DecodeImpl).
// In the first two rows, entries 0x01..0x03 hold 0x00a9, 0x221a and 0x00b7, entries 0x04..0x13 hold
// the characters '0'..'9' and 'A'..'F', entry 0x15 holds 0x00a7, and the remaining entries below
// 0x20 (including 0x00) are spaces.
static constexpr char32_t CharsetTableCP437AMS2[256] = {
	C(' '),0x00a9,0x221a,0x00b7,C('0'),C('1'),C('2'),C('3'),C('4'),C('5'),C('6'),C('7'),C('8'),C('9'),C('A'),C('B'), // differs from CP437
	C('C'),C('D'),C('E'),C('F'),C(' '),0x00a7,C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '), // differs from CP437
	0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002a,0x002b,0x002c,0x002d,0x002e,0x002f,
	0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003a,0x003b,0x003c,0x003d,0x003e,0x003f,
	0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004a,0x004b,0x004c,0x004d,0x004e,0x004f,
	0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005a,0x005b,0x005c,0x005d,0x005e,0x005f,
	0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,
	0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2302,
	0x00c7,0x00fc,0x00e9,0x00e2,0x00e4,0x00e0,0x00e5,0x00e7,0x00ea,0x00eb,0x00e8,0x00ef,0x00ee,0x00ec,0x00c4,0x00c5,
	0x00c9,0x00e6,0x00c6,0x00f4,0x00f6,0x00f2,0x00fb,0x00f9,0x00ff,0x00d6,0x00dc,0x00a2,0x00a3,0x00a5,0x20a7,0x0192,
	0x00e1,0x00ed,0x00f3,0x00fa,0x00f1,0x00d1,0x00aa,0x00ba,0x00bf,0x2310,0x00ac,0x00bd,0x00bc,0x00a1,0x00ab,0x00bb,
	0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255d,0x255c,0x255b,0x2510,
	0x2514,0x2534,0x252c,0x251c,0x2500,0x253c,0x255e,0x255f,0x255a,0x2554,0x2569,0x2566,0x2560,0x2550,0x256c,0x2567,
	0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256b,0x256a,0x2518,0x250c,0x2588,0x2584,0x258c,0x2590,0x2580,
	0x03b1,0x00df,0x0393,0x03c0,0x03a3,0x03c3,0x00b5,0x03c4,0x03a6,0x0398,0x03a9,0x03b4,0x221e,0x03c6,0x03b5,0x2229,
	0x2261,0x00b1,0x2265,0x2264,0x2320,0x2321,0x00f7,0x2248,0x00b0,0x2219,0x00b7,0x221a,0x207f,0x00b2,0x25a0,0x00a0
};

// The table shorthand is not needed past this point.
#undef C
349
350
351 // templated on 8bit strings because of type-safe variants
352 template<typename Tdststring>
EncodeImpl(Charset charset,const mpt::widestring & src)353 static Tdststring EncodeImpl(Charset charset, const mpt::widestring &src)
354 {
355 static_assert(sizeof(typename Tdststring::value_type) == sizeof(char));
356 static_assert(mpt::is_character<typename Tdststring::value_type>::value);
357 switch(charset)
358 {
359 #if defined(MPT_ENABLE_CHARSET_LOCALE)
360 case Charset::Locale: return mpt::encode<Tdststring>(mpt::logical_encoding::locale, src); break;
361 #endif
362 case Charset::UTF8: return mpt::encode<Tdststring>(mpt::common_encoding::utf8, src); break;
363 case Charset::ASCII: return mpt::encode<Tdststring>(mpt::common_encoding::ascii, src); break;
364 case Charset::ISO8859_1: return mpt::encode<Tdststring>(mpt::common_encoding::iso8859_1, src); break;
365 case Charset::ISO8859_15: return mpt::encode<Tdststring>(mpt::common_encoding::iso8859_15, src); break;
366 case Charset::CP850: return mpt::encode<Tdststring>(mpt::common_encoding::cp850, src); break;
367 case Charset::CP437: return mpt::encode<Tdststring>(mpt::common_encoding::cp437, src); break;
368 case Charset::CP437AMS: return mpt::encode<Tdststring>(CharsetTableCP437AMS, src); break;
369 case Charset::CP437AMS2: return mpt::encode<Tdststring>(CharsetTableCP437AMS2, src); break;
370 case Charset::Windows1252: return mpt::encode<Tdststring>(mpt::common_encoding::windows1252, src); break;
371 }
372 return Tdststring();
373 }
374
375
376 // templated on 8bit strings because of type-safe variants
377 template<typename Tsrcstring>
DecodeImpl(Charset charset,const Tsrcstring & src)378 static mpt::widestring DecodeImpl(Charset charset, const Tsrcstring &src)
379 {
380 static_assert(sizeof(typename Tsrcstring::value_type) == sizeof(char));
381 static_assert(mpt::is_character<typename Tsrcstring::value_type>::value);
382 switch(charset)
383 {
384 #if defined(MPT_ENABLE_CHARSET_LOCALE)
385 case Charset::Locale: return mpt::decode<Tsrcstring>(mpt::logical_encoding::locale, src); break;
386 #endif
387 case Charset::UTF8: return mpt::decode<Tsrcstring>(mpt::common_encoding::utf8, src); break;
388 case Charset::ASCII: return mpt::decode<Tsrcstring>(mpt::common_encoding::ascii, src); break;
389 case Charset::ISO8859_1: return mpt::decode<Tsrcstring>(mpt::common_encoding::iso8859_1, src); break;
390 case Charset::ISO8859_15: return mpt::decode<Tsrcstring>(mpt::common_encoding::iso8859_15, src); break;
391 case Charset::CP850: return mpt::decode<Tsrcstring>(mpt::common_encoding::cp850, src); break;
392 case Charset::CP437: return mpt::decode<Tsrcstring>(mpt::common_encoding::cp437, src); break;
393 case Charset::CP437AMS: return mpt::decode<Tsrcstring>(CharsetTableCP437AMS, src); break;
394 case Charset::CP437AMS2: return mpt::decode<Tsrcstring>(CharsetTableCP437AMS2, src); break;
395 case Charset::Windows1252: return mpt::decode<Tsrcstring>(mpt::common_encoding::windows1252, src); break;
396 }
397 return mpt::widestring();
398 }
399
400
401 // templated on 8bit strings because of type-safe variants
402 template<typename Tdststring, typename Tsrcstring>
ConvertImpl(Charset to,Charset from,const Tsrcstring & src)403 static Tdststring ConvertImpl(Charset to, Charset from, const Tsrcstring &src)
404 {
405 static_assert(sizeof(typename Tdststring::value_type) == sizeof(char));
406 static_assert(sizeof(typename Tsrcstring::value_type) == sizeof(char));
407 if(to == from)
408 {
409 const typename Tsrcstring::value_type * src_beg = src.data();
410 const typename Tsrcstring::value_type * src_end = src_beg + src.size();
411 return Tdststring(reinterpret_cast<const typename Tdststring::value_type *>(src_beg), reinterpret_cast<const typename Tdststring::value_type *>(src_end));
412 }
413 return EncodeImpl<Tdststring>(to, DecodeImpl(from, src));
414 }
415
416
417
418 } // namespace String
419
420
IsUTF8(const std::string & str)421 bool IsUTF8(const std::string &str)
422 {
423 return mpt::is_utf8(str);
424 }
425
426
427 #if MPT_WSTRING_CONVERT
ToWide(Charset from,const std::string & str)428 std::wstring ToWide(Charset from, const std::string &str)
429 {
430 return String::DecodeImpl(from, str);
431 }
432 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToWide(const mpt::lstring & str)433 std::wstring ToWide(const mpt::lstring &str)
434 {
435 return String::DecodeImpl(Charset::Locale, str);
436 }
437 #endif // MPT_ENABLE_CHARSET_LOCALE
438 #endif
439
440 #if MPT_WSTRING_CONVERT
ToCharset(Charset to,const std::wstring & str)441 std::string ToCharset(Charset to, const std::wstring &str)
442 {
443 return String::EncodeImpl<std::string>(to, str);
444 }
445 #endif
ToCharset(Charset to,Charset from,const std::string & str)446 std::string ToCharset(Charset to, Charset from, const std::string &str)
447 {
448 return String::ConvertImpl<std::string>(to, from, str);
449 }
450 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToCharset(Charset to,const mpt::lstring & str)451 std::string ToCharset(Charset to, const mpt::lstring &str)
452 {
453 return String::ConvertImpl<std::string>(to, Charset::Locale, str);
454 }
455 #endif // MPT_ENABLE_CHARSET_LOCALE
456
457 #if defined(MPT_ENABLE_CHARSET_LOCALE)
458 #if MPT_WSTRING_CONVERT
ToLocale(const std::wstring & str)459 mpt::lstring ToLocale(const std::wstring &str)
460 {
461 return String::EncodeImpl<mpt::lstring>(Charset::Locale, str);
462 }
463 #endif
ToLocale(Charset from,const std::string & str)464 mpt::lstring ToLocale(Charset from, const std::string &str)
465 {
466 return String::ConvertImpl<mpt::lstring>(Charset::Locale, from, str);
467 }
468 #endif // MPT_ENABLE_CHARSET_LOCALE
469
470 #if MPT_OS_WINDOWS
471 #if MPT_WSTRING_CONVERT
ToWin(const std::wstring & str)472 mpt::winstring ToWin(const std::wstring &str)
473 {
474 #ifdef UNICODE
475 return str;
476 #else
477 return ToLocale(str);
478 #endif
479 }
480 #endif
ToWin(Charset from,const std::string & str)481 mpt::winstring ToWin(Charset from, const std::string &str)
482 {
483 #ifdef UNICODE
484 return ToWide(from, str);
485 #else
486 return ToLocale(from, str);
487 #endif
488 }
489 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Converts a locale-encoded mpt::lstring into the native WinAPI string type:
// via ToWide() in UNICODE builds, passed through unchanged otherwise.
mpt::winstring ToWin(const mpt::lstring &str)
{
	#if defined(UNICODE)
		mpt::winstring native = ToWide(str);
	#else // !UNICODE
		mpt::winstring native = str;
	#endif // UNICODE
	return native;
}
498 #endif // MPT_ENABLE_CHARSET_LOCALE
499 #endif // MPT_OS_WINDOWS
500
501
502 #if defined(MPT_WITH_MFC)
503
ToCString(const std::wstring & str)504 CString ToCString(const std::wstring &str)
505 {
506 #ifdef UNICODE
507 return str.c_str();
508 #else
509 return ToCharset(Charset::Locale, str).c_str();
510 #endif
511 }
ToCString(Charset from,const std::string & str)512 CString ToCString(Charset from, const std::string &str)
513 {
514 #ifdef UNICODE
515 return ToWide(from, str).c_str();
516 #else
517 return ToCharset(Charset::Locale, from, str).c_str();
518 #endif
519 }
ToWide(const CString & str)520 std::wstring ToWide(const CString &str)
521 {
522 #ifdef UNICODE
523 return str.GetString();
524 #else
525 return ToWide(Charset::Locale, str.GetString());
526 #endif
527 }
ToCharset(Charset to,const CString & str)528 std::string ToCharset(Charset to, const CString &str)
529 {
530 #ifdef UNICODE
531 return ToCharset(to, str.GetString());
532 #else
533 return ToCharset(to, Charset::Locale, str.GetString());
534 #endif
535 }
536 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToCString(const mpt::lstring & str)537 CString ToCString(const mpt::lstring &str)
538 {
539 #ifdef UNICODE
540 return ToWide(str).c_str();
541 #else
542 return str.c_str();
543 #endif
544 }
ToLocale(const CString & str)545 mpt::lstring ToLocale(const CString &str)
546 {
547 #ifdef UNICODE
548 return String::EncodeImpl<mpt::lstring>(Charset::Locale, str.GetString());
549 #else
550 return str.GetString();
551 #endif
552 }
553 #endif // MPT_ENABLE_CHARSET_LOCALE
554 #if MPT_OS_WINDOWS
ToWin(const CString & str)555 mpt::winstring ToWin(const CString &str)
556 {
557 return str.GetString();
558 }
559 #endif // MPT_OS_WINDOWS
560
561 #endif // MPT_WITH_MFC
562
563
564 #if MPT_USTRING_MODE_WIDE
565 // inline
566 #else // !MPT_USTRING_MODE_WIDE
567 #if MPT_WSTRING_CONVERT
ToUnicode(const std::wstring & str)568 mpt::ustring ToUnicode(const std::wstring &str)
569 {
570 return String::EncodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
571 }
572 #endif
ToUnicode(Charset from,const std::string & str)573 mpt::ustring ToUnicode(Charset from, const std::string &str)
574 {
575 return String::ConvertImpl<mpt::ustring>(mpt::Charset::UTF8, from, str);
576 }
577 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Non-wide mpt::ustring mode: converts a locale-encoded mpt::lstring to a UTF8 mpt::ustring.
mpt::ustring ToUnicode(const mpt::lstring &str)
{
	mpt::ustring unicode = String::ConvertImpl<mpt::ustring, mpt::lstring>(mpt::Charset::UTF8, mpt::Charset::Locale, str);
	return unicode;
}
582 #endif // MPT_ENABLE_CHARSET_LOCALE
583 #if defined(MPT_WITH_MFC)
ToUnicode(const CString & str)584 mpt::ustring ToUnicode(const CString &str)
585 {
586 #ifdef UNICODE
587 return String::EncodeImpl<mpt::ustring>(mpt::Charset::UTF8, str.GetString());
588 #else // !UNICODE
589 return String::ConvertImpl<mpt::ustring, std::string>(mpt::Charset::UTF8, mpt::Charset::Locale, str.GetString());
590 #endif // UNICODE
591 }
592 #endif // MPT_WITH_MFC
593 #endif // MPT_USTRING_MODE_WIDE
594
595 #if MPT_USTRING_MODE_WIDE
596 // nothing, std::wstring overloads will catch all stuff
597 #else // !MPT_USTRING_MODE_WIDE
598 #if MPT_WSTRING_CONVERT
ToWide(const mpt::ustring & str)599 std::wstring ToWide(const mpt::ustring &str)
600 {
601 return String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
602 }
603 #endif
ToCharset(Charset to,const mpt::ustring & str)604 std::string ToCharset(Charset to, const mpt::ustring &str)
605 {
606 return String::ConvertImpl<std::string, mpt::ustring>(to, mpt::Charset::UTF8, str);
607 }
608 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Non-wide mpt::ustring mode: converts a UTF8 mpt::ustring into a locale-encoded mpt::lstring.
mpt::lstring ToLocale(const mpt::ustring &str)
{
	mpt::lstring localized = String::ConvertImpl<mpt::lstring, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str);
	return localized;
}
613 #endif // MPT_ENABLE_CHARSET_LOCALE
614 #if MPT_OS_WINDOWS
// Non-wide mpt::ustring mode: converts a UTF8 mpt::ustring into the native WinAPI string type:
// decoded to wide characters in UNICODE builds, converted to locale encoding otherwise.
mpt::winstring ToWin(const mpt::ustring &str)
{
	#if defined(UNICODE)
		mpt::winstring native = String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
	#else // !UNICODE
		mpt::winstring native = String::ConvertImpl<mpt::lstring, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str);
	#endif // UNICODE
	return native;
}
623 #endif // MPT_OS_WINDOWS
624 #if defined(MPT_WITH_MFC)
ToCString(const mpt::ustring & str)625 CString ToCString(const mpt::ustring &str)
626 {
627 #ifdef UNICODE
628 return String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str).c_str();
629 #else // !UNICODE
630 return String::ConvertImpl<std::string, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str).c_str();
631 #endif // UNICODE
632 }
633 #endif // MPT_WITH_MFC
634 #endif // MPT_USTRING_MODE_WIDE
635
636
637
638
639
CharsetFromCodePage(uint16 codepage,mpt::Charset fallback,bool * isFallback=nullptr)640 static mpt::Charset CharsetFromCodePage(uint16 codepage, mpt::Charset fallback, bool * isFallback = nullptr)
641 {
642 mpt::Charset result = fallback;
643 switch(codepage)
644 {
645 case 65001:
646 result = mpt::Charset::UTF8;
647 if(isFallback) *isFallback = false;
648 break;
649 case 20127:
650 result = mpt::Charset::ASCII;
651 if(isFallback) *isFallback = false;
652 break;
653 case 28591:
654 result = mpt::Charset::ISO8859_1;
655 if(isFallback) *isFallback = false;
656 break;
657 case 28605:
658 result = mpt::Charset::ISO8859_15;
659 if(isFallback) *isFallback = false;
660 break;
661 case 437:
662 result = mpt::Charset::CP437;
663 if(isFallback) *isFallback = false;
664 break;
665 case 1252:
666 result = mpt::Charset::Windows1252;
667 if(isFallback) *isFallback = false;
668 break;
669 default:
670 result = fallback;
671 if(isFallback) *isFallback = true;
672 break;
673 }
674 return result;
675 }
676
ToUnicode(uint16 codepage,mpt::Charset fallback,const std::string & str)677 mpt::ustring ToUnicode(uint16 codepage, mpt::Charset fallback, const std::string &str)
678 {
679 #if MPT_OS_WINDOWS
680 mpt::ustring result;
681 bool noCharsetMatch = true;
682 mpt::Charset charset = mpt::CharsetFromCodePage(codepage, fallback, &noCharsetMatch);
683 if(noCharsetMatch && mpt::has_codepage(codepage))
684 {
685 result = mpt::ToUnicode(mpt::decode<std::string>(codepage, str));
686 } else
687 {
688 result = mpt::ToUnicode(charset, str);
689 }
690 return result;
691 #else // !MPT_OS_WINDOWS
692 return mpt::ToUnicode(mpt::CharsetFromCodePage(codepage, fallback), str);
693 #endif // MPT_OS_WINDOWS
694 }
695
696
697
698
699
ToLowerCaseAscii(char c)700 char ToLowerCaseAscii(char c)
701 {
702 return mpt::to_lower_ascii(c);
703 }
704
ToUpperCaseAscii(char c)705 char ToUpperCaseAscii(char c)
706 {
707 return mpt::to_upper_ascii(c);
708 }
709
ToLowerCaseAscii(std::string s)710 std::string ToLowerCaseAscii(std::string s)
711 {
712 std::transform(s.begin(), s.end(), s.begin(), static_cast<char(*)(char)>(&mpt::ToLowerCaseAscii));
713 return s;
714 }
715
ToUpperCaseAscii(std::string s)716 std::string ToUpperCaseAscii(std::string s)
717 {
718 std::transform(s.begin(), s.end(), s.begin(), static_cast<char(*)(char)>(&mpt::ToUpperCaseAscii));
719 return s;
720 }
721
CompareNoCaseAscii(const char * a,const char * b,std::size_t n)722 int CompareNoCaseAscii(const char *a, const char *b, std::size_t n)
723 {
724 while(n--)
725 {
726 unsigned char ac = mpt::char_value(mpt::ToLowerCaseAscii(*a));
727 unsigned char bc = mpt::char_value(mpt::ToLowerCaseAscii(*b));
728 if(ac != bc)
729 {
730 return ac < bc ? -1 : 1;
731 } else if(!ac && !bc)
732 {
733 return 0;
734 }
735 ++a;
736 ++b;
737 }
738 return 0;
739 }
740
CompareNoCaseAscii(std::string_view a,std::string_view b)741 int CompareNoCaseAscii(std::string_view a, std::string_view b)
742 {
743 for(std::size_t i = 0; i < std::min(a.length(), b.length()); ++i)
744 {
745 unsigned char ac = mpt::char_value(mpt::ToLowerCaseAscii(a[i]));
746 unsigned char bc = mpt::char_value(mpt::ToLowerCaseAscii(b[i]));
747 if(ac != bc)
748 {
749 return ac < bc ? -1 : 1;
750 } else if(!ac && !bc)
751 {
752 return 0;
753 }
754 }
755 if(a.length() == b.length())
756 {
757 return 0;
758 }
759 return a.length() < b.length() ? -1 : 1;
760 }
761
CompareNoCaseAscii(const std::string & a,const std::string & b)762 int CompareNoCaseAscii(const std::string &a, const std::string &b)
763 {
764 return CompareNoCaseAscii(std::string_view(a), std::string_view(b));
765 }
766
767
768 #if defined(MODPLUG_TRACKER)
769
// Returns a lower-case copy of s.
// MFC builds use CString::MakeLower() (CStringW in non-UNICODE builds);
// other builds apply std::towlower() to every wide character.
mpt::ustring ToLowerCase(const mpt::ustring &s)
{
	#if defined(MPT_WITH_MFC) && defined(UNICODE)
		CString lowered = mpt::ToCString(s);
		lowered.MakeLower();
		return mpt::ToUnicode(lowered);
	#elif defined(MPT_WITH_MFC) // !UNICODE
		CStringW lowered = mpt::ToWide(s).c_str();
		lowered.MakeLower();
		return mpt::ToUnicode(lowered.GetString());
	#else // !MPT_WITH_MFC
		std::wstring wide = mpt::ToWide(s);
		for(wchar_t &wc : wide)
		{
			wc = static_cast<wchar_t>(std::towlower(wc));
		}
		return mpt::ToUnicode(wide);
	#endif // MPT_WITH_MFC
}
788
// Returns an upper-case copy of s.
// MFC builds use CString::MakeUpper() (CStringW in non-UNICODE builds);
// other builds apply std::towupper() to every wide character.
mpt::ustring ToUpperCase(const mpt::ustring &s)
{
	#if defined(MPT_WITH_MFC)
		#if defined(UNICODE)
			CString tmp = mpt::ToCString(s);
			tmp.MakeUpper();
			return mpt::ToUnicode(tmp);
		#else // !UNICODE
			CStringW tmp = mpt::ToWide(s).c_str();
			tmp.MakeUpper();
			return mpt::ToUnicode(tmp.GetString());
		#endif // UNICODE
	#else // !MPT_WITH_MFC
		std::wstring ws = mpt::ToWide(s);
		// Must be towupper: this branch previously called towlower and therefore lower-cased the string.
		std::transform(ws.begin(), ws.end(), ws.begin(), &std::towupper);
		return mpt::ToUnicode(ws);
	#endif // MPT_WITH_MFC
}
807
808 #endif // MODPLUG_TRACKER
809
810
811
812 } // namespace mpt
813
814
815
816 OPENMPT_NAMESPACE_END
817