1 /*
2  * mptString.cpp
3  * -------------
4  * Purpose: Small string-related utilities, number and message formatting.
5  * Notes  : Currently none.
6  * Authors: OpenMPT Devs
7  * The OpenMPT source code is released under the BSD license. Read LICENSE for more details.
8  */
9 
10 #include "stdafx.h"
11 #include "mptString.h"
12 
13 #include "mpt/string/types.hpp"
14 #include "mpt/string/utility.hpp"
15 #include "mpt/string_transcode/transcode.hpp"
16 
17 #include <locale>
18 #include <string>
19 #include <vector>
20 
21 #include <cstdlib>
22 
23 #if defined(MODPLUG_TRACKER)
24 #include <cwctype>
25 #endif // MODPLUG_TRACKER
26 
27 #if defined(MODPLUG_TRACKER)
28 #include <wctype.h>
29 #endif // MODPLUG_TRACKER
30 
31 #if MPT_OS_WINDOWS
32 #include <windows.h>
33 #endif // MPT_OS_WINDOWS
34 
35 
36 OPENMPT_NAMESPACE_BEGIN
37 
38 
39 
40 /*
41 
42 
43 
44 Quick guide to the OpenMPT string type jungle
45 =============================================
46 
47 
48 
49 This quick guide is only meant as a hint. There may be valid reasons to not
50 honor the recommendations found here. Staying consistent with surrounding and/or
51 related code sections may also be important.
52 
53 
54 
55 List of string types
56 --------------------
57 
58  *  std::string (OpenMPT, libopenmpt)
    C++ string of unspecified 8bit encoding. Try to always document the
60     encoding if not clear from context. Do not use unless there is an obvious
61     reason to do so.
62 
63  *  std::wstring (OpenMPT)
64     UTF16 (on windows) or UTF32 (otherwise). Do not use unless there is an
65     obvious reason to do so.
66 
67  *  mpt::lstring (OpenMPT)
68     OpenMPT locale string type. The encoding is always CP_ACP. Do not use unless
69     there is an obvious reason to do so.
70 
71  *  char* (OpenMPT, libopenmpt)
72     C string of unspecified encoding. Use only for static literals or in
73     performance critical inner loops where full control and avoidance of memory
74     allocations is required.
75 
76  *  wchar_t* (OpenMPT)
77     C wide string. Use only if Unicode is required for static literals or in
78     performance critical inner loops where full control and avoidance of memory
79     allocation is required.
80 
81  *  mpt::winstring (OpenMPT)
82     OpenMPT type-safe string to interface with native WinAPI, either encoded in
83     locale/CP_ACP (if !UNICODE) or UTF16 (if UNICODE).
84 
85  *  CString (OpenMPT)
86     MFC string type, either encoded in locale/CP_ACP (if !UNICODE) or UTF16 (if
87     UNICODE). Specify literals with _T(""). Use in MFC GUI code.
88 
89  *  CStringA (OpenMPT)
90     MFC ANSI string type. The encoding is always CP_ACP. Do not use.
91 
92  *  CStringW (OpenMPT)
93     MFC Unicode string type. Do not use.
94 
95  *  mpt::PathString (OpenMPT, libopenmpt)
96     String type representing paths and filenames. Always use for these in order
97     to avoid potentially lossy conversions. Use P_("") macro for
98     literals.
99 
100  *  mpt::ustring (OpenMPT, libopenmpt)
101     The default unicode string type. Can be encoded in UTF8 or UTF16 or UTF32,
    depending on MPT_USTRING_MODE_* and sizeof(wchar_t). Literals can be written as
103     U_(""). Use as your default string type if no other string type is
104     a measurably better fit.
105 
106  *  MPT_UTF8 (OpenMPT, libopenmpt)
107     Macro that generates a mpt::ustring from string literals containing
108     non-ascii characters. In order to keep the source code in ascii encoding,
109     always express non-ascii characters using explicit \x23 escaping. Note that
110     depending on the underlying type of mpt::ustring, MPT_UTF8 *requires* a
111     runtime conversion. Only use for string literals containing non-ascii
112     characters (use MPT_USTRING otherwise).
113 
114  *  MPT_ULITERAL / MPT_UCHAR / mpt::uchar (OpenMPT, libopenmpt)
115     Macros which generate string literals, char literals and the char literal
116     type respectively. These are especially useful in constexpr contexts or
117     global data where MPT_USTRING is either unusable or requires a global
    constructor to run. Do NOT use as a performance optimization in place of
119     MPT_USTRING however, because MPT_USTRING can be converted to C++11/14 user
120     defined literals eventually, while MPT_ULITERAL cannot because of constexpr
121     requirements.
122 
123  *  mpt::RawPathString (OpenMPT, libopenmpt)
124     Internal representation of mpt::PathString. Only use for parsing path
125     fragments.
126 
127  *  mpt::u8string (OpenMPT, libopenmpt)
128     Internal representation of mpt::ustring. Do not use directly. Ever.
129 
130  *  std::basic_string<char> (OpenMPT)
131     Same as std::string. Do not use std::basic_string in the templated form.
132 
133  *  std::basic_string<wchar_t> (OpenMPT)
134     Same as std::wstring. Do not use std::basic_string in the templated form.
135 
136 The following string types are available in order to avoid the need to overload
137 functions on a huge variety of string types. Use only ever as function argument
138 types.
139 Note that the locale charset is not available on all libopenmpt builds (in which
140 case the option is ignored or a sensible fallback is used; these types are
141 always available).
142 All these types publicly inherit from mpt::ustring and do not contain any
143 additional state. This means that they work the same way as mpt::ustring does
144 and do support type-slicing for both, read and write accesses.
145 These types only add conversion constructors for all string types that have a
146 defined encoding and for all 8bit string types using the specified encoding
147 heuristic.
148 
149  *  AnyUnicodeString (OpenMPT, libopenmpt)
150     Is constructible from any Unicode string.
151 
152  *  AnyString (OpenMPT, libopenmpt)
153     Tries to do the smartest auto-magic we can do.
154 
155  *  AnyStringLocale (OpenMPT, libopenmpt)
156     char-based strings are assumed to be in locale encoding.
157 
158  *  AnyStringUTF8orLocale (OpenMPT, libopenmpt)
159     char-based strings are tried in UTF8 first, if this fails, locale is used.
160 
161  *  AnyStringUTF8 (OpenMPT, libopenmpt)
162     char-based strings are assumed to be in UTF8.
163 
164 
165 
166 Encoding of 8bit strings
167 ------------------------
168 
169 8bit strings have an unspecified encoding. When the string is contained within a
170 CSoundFile object, the encoding is most likely CSoundFile::GetCharsetInternal(),
171 otherwise, try to gather the most probable encoding from surrounding or related
172 code sections.
173 
174 
175 
176 Decision tree to help deciding which string type to use
177 -------------------------------------------------------
178 
179 if in libopenmpt
180  if in libopenmpt c++ interface
181   T = std::string, the encoding is utf8
182  elif in libopenmpt c interface
183   T = char*, the encoding is utf8
184  elif performance critical inner loop
185   T = char*, document the encoding if not clear from context
186  elif string literal containing non-ascii characters
187   T = MPT_UTF8
188  elif path or file
189   if parsing path fragments
190    T = mpt::RawPathString
191        template your function on the concrete underlying string type
192        (std::string and std::wstring) or use preprocessor MPT_OS_WINDOWS
193   else
194    T = mpt::PathString
195   fi
196  else
197   T = mpt::ustring
198  fi
199 else
200  if performance critical inner loop
201   if needs unicode support
202    T = mpt::uchar* / MPT_ULITERAL
203   else
204    T = char*, document the encoding if not clear from context
205   fi
206  elif string literal containing non-ascii characters
207   T = MPT_UTF8
208  elif path or file
209   if parsing path fragments
210    T = mpt::RawPathString
211        template your function on the concrete underlying string type
212        (std::string and std::wstring) or use preprocessor MPT_OS_WINDOWS
213   else
214    T = mpt::PathString
215   fi
216  elif winapi interfacing code
217   T = mpt::winstring
218  elif mfc/gui code
219   T = CString
220  else
221   if constexpr context or global data
222    T = mpt::uchar* / MPT_ULITERAL
223   else
224    T = mpt::ustring
225   fi
226  fi
227 fi
228 
229 This boils down to: Prefer mpt::PathString and mpt::ustring, and only use any
230 other string type if there is an obvious reason to do so.
231 
232 
233 
234 Character set conversions
235 -------------------------
236 
237 Character set conversions in OpenMPT are always fuzzy.
238 
239 Behaviour in case of an invalid source encoding and behaviour in case of an
240 unrepresentable destination encoding can be any of the following:
241  *  The character is replaced by some replacement character ('?' or L'\ufffd' in
242     most cases).
243  *  The character is replaced by a similar character (either semantically
    similar or visually similar).
245  *  The character is transcribed with some ASCII text.
246  *  The character is discarded.
247  *  Conversion stops at this very character.
248 
Additionally, conversion may stop or continue on \0 characters in the middle of
250 the string.
251 
252 Behaviour can vary from one conversion tuple to any other.
253 
254 If you need to ensure lossless conversion, do a roundtrip conversion and check
255 for equality.
256 
257 
258 
259 Unicode handling
260 ----------------
261 
262 OpenMPT is generally not aware of and does not handle different Unicode
263 normalization forms.
264 You should be aware of the following possibilities:
265  *  Conversion between UTF8, UTF16, UTF32 may or may not change between NFC and
266     NFD.
267  *  Conversion from any non-Unicode 8bit encoding can result in both, NFC or NFD
268     forms.
269  *  Conversion to any non-Unicode 8bit encoding may or may not involve
270     conversion to NFC, NFD, NFKC or NFKD during the conversion. This in
271     particular means that conversion of decomposed german umlauts to ISO8859-1
272     may fail.
273  *  Changing the normalization form of path strings may render the file
274     inaccessible.
275 
276 Unicode BOM may or may not be preserved and/or discarded during conversion.
277 
278 Invalid Unicode code points may be treated as invalid or as valid characters
279 when converting between different Unicode encodings.
280 
281 
282 
283 Interfacing with WinAPI
284 -----------------------
285 
286 When in MFC code, use CString.
287 When in non MFC code, either use std::wstring when directly interfacing with
288 APIs only available in WCHAR variants, or use mpt::winstring and
289 mpt::WinStringBuf helpers otherwise.
290 Specify TCHAR string literals with _T("foo") in mptrack/, and with TEXT("foo")
291 in common/ or sounddev/. _T() requires <tchar.h> which is specific to the MSVC
292 runtime and not portable across compilers. TEXT() is from <windows.h>. We use
293 _T() in mptrack/ only because it is shorter.
294 
295 
296 
297 */
298 
299 
300 
301 namespace mpt { namespace String {
302 
303 
304 
305 #define C(x) (mpt::char_value((x)))
306 
// AMS1 actually only supports ASCII plus the modified control characters and no high chars at all.
// Just default to CP437 for those to keep things simple.
// 256-entry table of char32_t code points; it is handed to mpt::encode / mpt::decode
// as the conversion table for Charset::CP437AMS.
// NOTE(review): presumably indexed by the 8bit character value (16 entries per row) - confirm against mpt::decode.
// The rows marked "differs from CP437" hold the entries that deviate from plain CP437.
static constexpr char32_t CharsetTableCP437AMS[256] = {
	C(' '),0x0001,0x0002,0x0003,0x00e4,0x0005,0x00e5,0x0007,0x0008,0x0009,0x000a,0x000b,0x000c,0x000d,0x00c4,0x00c5, // differs from CP437
	0x0010,0x0011,0x0012,0x0013,0x00f6,0x0015,0x0016,0x0017,0x0018,0x00d6,0x001a,0x001b,0x001c,0x001d,0x001e,0x001f, // differs from CP437
	0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002a,0x002b,0x002c,0x002d,0x002e,0x002f,
	0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003a,0x003b,0x003c,0x003d,0x003e,0x003f,
	0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004a,0x004b,0x004c,0x004d,0x004e,0x004f,
	0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005a,0x005b,0x005c,0x005d,0x005e,0x005f,
	0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,
	0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2302,
	0x00c7,0x00fc,0x00e9,0x00e2,0x00e4,0x00e0,0x00e5,0x00e7,0x00ea,0x00eb,0x00e8,0x00ef,0x00ee,0x00ec,0x00c4,0x00c5,
	0x00c9,0x00e6,0x00c6,0x00f4,0x00f6,0x00f2,0x00fb,0x00f9,0x00ff,0x00d6,0x00dc,0x00a2,0x00a3,0x00a5,0x20a7,0x0192,
	0x00e1,0x00ed,0x00f3,0x00fa,0x00f1,0x00d1,0x00aa,0x00ba,0x00bf,0x2310,0x00ac,0x00bd,0x00bc,0x00a1,0x00ab,0x00bb,
	0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255d,0x255c,0x255b,0x2510,
	0x2514,0x2534,0x252c,0x251c,0x2500,0x253c,0x255e,0x255f,0x255a,0x2554,0x2569,0x2566,0x2560,0x2550,0x256c,0x2567,
	0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256b,0x256a,0x2518,0x250c,0x2588,0x2584,0x258c,0x2590,0x2580,
	0x03b1,0x00df,0x0393,0x03c0,0x03a3,0x03c3,0x00b5,0x03c4,0x03a6,0x0398,0x03a9,0x03b4,0x221e,0x03c6,0x03b5,0x2229,
	0x2261,0x00b1,0x2265,0x2264,0x2320,0x2321,0x00f7,0x2248,0x00b0,0x2219,0x00b7,0x221a,0x207f,0x00b2,0x25a0,0x00a0
};
327 
// AMS2: Looking at Velvet Studio's bitmap font (TPIC32.PCX), these appear to be the only supported non-ASCII chars.
// 256-entry table of char32_t code points; it is handed to mpt::encode / mpt::decode
// as the conversion table for Charset::CP437AMS2.
// NOTE(review): presumably indexed by the 8bit character value (16 entries per row) - confirm against mpt::decode.
// The rows marked "differs from CP437" hold the entries that deviate from plain CP437.
static constexpr char32_t CharsetTableCP437AMS2[256] = {
	C(' '),0x00a9,0x221a,0x00b7,C('0'),C('1'),C('2'),C('3'),C('4'),C('5'),C('6'),C('7'),C('8'),C('9'),C('A'),C('B'), // differs from CP437
	C('C'),C('D'),C('E'),C('F'),C(' '),0x00a7,C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '),C(' '), // differs from CP437
	0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002a,0x002b,0x002c,0x002d,0x002e,0x002f,
	0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003a,0x003b,0x003c,0x003d,0x003e,0x003f,
	0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004a,0x004b,0x004c,0x004d,0x004e,0x004f,
	0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005a,0x005b,0x005c,0x005d,0x005e,0x005f,
	0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,
	0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2302,
	0x00c7,0x00fc,0x00e9,0x00e2,0x00e4,0x00e0,0x00e5,0x00e7,0x00ea,0x00eb,0x00e8,0x00ef,0x00ee,0x00ec,0x00c4,0x00c5,
	0x00c9,0x00e6,0x00c6,0x00f4,0x00f6,0x00f2,0x00fb,0x00f9,0x00ff,0x00d6,0x00dc,0x00a2,0x00a3,0x00a5,0x20a7,0x0192,
	0x00e1,0x00ed,0x00f3,0x00fa,0x00f1,0x00d1,0x00aa,0x00ba,0x00bf,0x2310,0x00ac,0x00bd,0x00bc,0x00a1,0x00ab,0x00bb,
	0x2591,0x2592,0x2593,0x2502,0x2524,0x2561,0x2562,0x2556,0x2555,0x2563,0x2551,0x2557,0x255d,0x255c,0x255b,0x2510,
	0x2514,0x2534,0x252c,0x251c,0x2500,0x253c,0x255e,0x255f,0x255a,0x2554,0x2569,0x2566,0x2560,0x2550,0x256c,0x2567,
	0x2568,0x2564,0x2565,0x2559,0x2558,0x2552,0x2553,0x256b,0x256a,0x2518,0x250c,0x2588,0x2584,0x258c,0x2590,0x2580,
	0x03b1,0x00df,0x0393,0x03c0,0x03a3,0x03c3,0x00b5,0x03c4,0x03a6,0x0398,0x03a9,0x03b4,0x221e,0x03c6,0x03b5,0x2229,
	0x2261,0x00b1,0x2265,0x2264,0x2320,0x2321,0x00f7,0x2248,0x00b0,0x2219,0x00b7,0x221a,0x207f,0x00b2,0x25a0,0x00a0
};
347 
348 #undef C
349 
350 
351 // templated on 8bit strings because of type-safe variants
352 template<typename Tdststring>
EncodeImpl(Charset charset,const mpt::widestring & src)353 static Tdststring EncodeImpl(Charset charset, const mpt::widestring &src)
354 {
355 	static_assert(sizeof(typename Tdststring::value_type) == sizeof(char));
356 	static_assert(mpt::is_character<typename Tdststring::value_type>::value);
357 	switch(charset)
358 	{
359 #if defined(MPT_ENABLE_CHARSET_LOCALE)
360 		case Charset::Locale:      return mpt::encode<Tdststring>(mpt::logical_encoding::locale, src); break;
361 #endif
362 		case Charset::UTF8:        return mpt::encode<Tdststring>(mpt::common_encoding::utf8, src); break;
363 		case Charset::ASCII:       return mpt::encode<Tdststring>(mpt::common_encoding::ascii, src); break;
364 		case Charset::ISO8859_1:   return mpt::encode<Tdststring>(mpt::common_encoding::iso8859_1, src); break;
365 		case Charset::ISO8859_15:  return mpt::encode<Tdststring>(mpt::common_encoding::iso8859_15, src); break;
366 		case Charset::CP850:       return mpt::encode<Tdststring>(mpt::common_encoding::cp850, src); break;
367 		case Charset::CP437:       return mpt::encode<Tdststring>(mpt::common_encoding::cp437, src); break;
368 		case Charset::CP437AMS:    return mpt::encode<Tdststring>(CharsetTableCP437AMS, src); break;
369 		case Charset::CP437AMS2:   return mpt::encode<Tdststring>(CharsetTableCP437AMS2, src); break;
370 		case Charset::Windows1252: return mpt::encode<Tdststring>(mpt::common_encoding::windows1252, src); break;
371 	}
372 	return Tdststring();
373 }
374 
375 
376 // templated on 8bit strings because of type-safe variants
377 template<typename Tsrcstring>
DecodeImpl(Charset charset,const Tsrcstring & src)378 static mpt::widestring DecodeImpl(Charset charset, const Tsrcstring &src)
379 {
380 	static_assert(sizeof(typename Tsrcstring::value_type) == sizeof(char));
381 	static_assert(mpt::is_character<typename Tsrcstring::value_type>::value);
382 	switch(charset)
383 	{
384 #if defined(MPT_ENABLE_CHARSET_LOCALE)
385 		case Charset::Locale:      return mpt::decode<Tsrcstring>(mpt::logical_encoding::locale, src); break;
386 #endif
387 		case Charset::UTF8:        return mpt::decode<Tsrcstring>(mpt::common_encoding::utf8, src); break;
388 		case Charset::ASCII:       return mpt::decode<Tsrcstring>(mpt::common_encoding::ascii, src); break;
389 		case Charset::ISO8859_1:   return mpt::decode<Tsrcstring>(mpt::common_encoding::iso8859_1, src); break;
390 		case Charset::ISO8859_15:  return mpt::decode<Tsrcstring>(mpt::common_encoding::iso8859_15, src); break;
391 		case Charset::CP850:       return mpt::decode<Tsrcstring>(mpt::common_encoding::cp850, src); break;
392 		case Charset::CP437:       return mpt::decode<Tsrcstring>(mpt::common_encoding::cp437, src); break;
393 		case Charset::CP437AMS:    return mpt::decode<Tsrcstring>(CharsetTableCP437AMS, src); break;
394 		case Charset::CP437AMS2:   return mpt::decode<Tsrcstring>(CharsetTableCP437AMS2, src); break;
395 		case Charset::Windows1252: return mpt::decode<Tsrcstring>(mpt::common_encoding::windows1252, src); break;
396 	}
397 	return mpt::widestring();
398 }
399 
400 
401 // templated on 8bit strings because of type-safe variants
402 template<typename Tdststring, typename Tsrcstring>
ConvertImpl(Charset to,Charset from,const Tsrcstring & src)403 static Tdststring ConvertImpl(Charset to, Charset from, const Tsrcstring &src)
404 {
405 	static_assert(sizeof(typename Tdststring::value_type) == sizeof(char));
406 	static_assert(sizeof(typename Tsrcstring::value_type) == sizeof(char));
407 	if(to == from)
408 	{
409 		const typename Tsrcstring::value_type * src_beg = src.data();
410 		const typename Tsrcstring::value_type * src_end = src_beg + src.size();
411 		return Tdststring(reinterpret_cast<const typename Tdststring::value_type *>(src_beg), reinterpret_cast<const typename Tdststring::value_type *>(src_end));
412 	}
413 	return EncodeImpl<Tdststring>(to, DecodeImpl(from, src));
414 }
415 
416 
417 
418 } // namespace String
419 
420 
IsUTF8(const std::string & str)421 bool IsUTF8(const std::string &str)
422 {
423 	return mpt::is_utf8(str);
424 }
425 
426 
427 #if MPT_WSTRING_CONVERT
ToWide(Charset from,const std::string & str)428 std::wstring ToWide(Charset from, const std::string &str)
429 {
430 	return String::DecodeImpl(from, str);
431 }
432 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToWide(const mpt::lstring & str)433 std::wstring ToWide(const mpt::lstring &str)
434 {
435 	return String::DecodeImpl(Charset::Locale, str);
436 }
437 #endif // MPT_ENABLE_CHARSET_LOCALE
438 #endif
439 
440 #if MPT_WSTRING_CONVERT
ToCharset(Charset to,const std::wstring & str)441 std::string ToCharset(Charset to, const std::wstring &str)
442 {
443 	return String::EncodeImpl<std::string>(to, str);
444 }
445 #endif
ToCharset(Charset to,Charset from,const std::string & str)446 std::string ToCharset(Charset to, Charset from, const std::string &str)
447 {
448 	return String::ConvertImpl<std::string>(to, from, str);
449 }
450 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToCharset(Charset to,const mpt::lstring & str)451 std::string ToCharset(Charset to, const mpt::lstring &str)
452 {
453 	return String::ConvertImpl<std::string>(to, Charset::Locale, str);
454 }
455 #endif // MPT_ENABLE_CHARSET_LOCALE
456 
457 #if defined(MPT_ENABLE_CHARSET_LOCALE)
458 #if MPT_WSTRING_CONVERT
ToLocale(const std::wstring & str)459 mpt::lstring ToLocale(const std::wstring &str)
460 {
461 	return String::EncodeImpl<mpt::lstring>(Charset::Locale, str);
462 }
463 #endif
ToLocale(Charset from,const std::string & str)464 mpt::lstring ToLocale(Charset from, const std::string &str)
465 {
466 	return String::ConvertImpl<mpt::lstring>(Charset::Locale, from, str);
467 }
468 #endif // MPT_ENABLE_CHARSET_LOCALE
469 
470 #if MPT_OS_WINDOWS
471 #if MPT_WSTRING_CONVERT
ToWin(const std::wstring & str)472 mpt::winstring ToWin(const std::wstring &str)
473 {
474 	#ifdef UNICODE
475 		return str;
476 	#else
477 		return ToLocale(str);
478 	#endif
479 }
480 #endif
ToWin(Charset from,const std::string & str)481 mpt::winstring ToWin(Charset from, const std::string &str)
482 {
483 	#ifdef UNICODE
484 		return ToWide(from, str);
485 	#else
486 		return ToLocale(from, str);
487 	#endif
488 }
489 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Converts a locale-encoded string to mpt::winstring.
// UNICODE builds go through ToWide(); all other builds return the input unchanged.
mpt::winstring ToWin(const mpt::lstring &str)
{
#if !defined(UNICODE)
	return str;
#else // UNICODE
	return ToWide(str);
#endif // !UNICODE
}
498 #endif // MPT_ENABLE_CHARSET_LOCALE
499 #endif // MPT_OS_WINDOWS
500 
501 
502 #if defined(MPT_WITH_MFC)
503 
ToCString(const std::wstring & str)504 CString ToCString(const std::wstring &str)
505 {
506 	#ifdef UNICODE
507 		return str.c_str();
508 	#else
509 		return ToCharset(Charset::Locale, str).c_str();
510 	#endif
511 }
ToCString(Charset from,const std::string & str)512 CString ToCString(Charset from, const std::string &str)
513 {
514 	#ifdef UNICODE
515 		return ToWide(from, str).c_str();
516 	#else
517 		return ToCharset(Charset::Locale, from, str).c_str();
518 	#endif
519 }
ToWide(const CString & str)520 std::wstring ToWide(const CString &str)
521 {
522 	#ifdef UNICODE
523 		return str.GetString();
524 	#else
525 		return ToWide(Charset::Locale, str.GetString());
526 	#endif
527 }
ToCharset(Charset to,const CString & str)528 std::string ToCharset(Charset to, const CString &str)
529 {
530 	#ifdef UNICODE
531 		return ToCharset(to, str.GetString());
532 	#else
533 		return ToCharset(to, Charset::Locale, str.GetString());
534 	#endif
535 }
536 #if defined(MPT_ENABLE_CHARSET_LOCALE)
ToCString(const mpt::lstring & str)537 CString ToCString(const mpt::lstring &str)
538 {
539 	#ifdef UNICODE
540 		return ToWide(str).c_str();
541 	#else
542 		return str.c_str();
543 	#endif
544 }
ToLocale(const CString & str)545 mpt::lstring ToLocale(const CString &str)
546 {
547 	#ifdef UNICODE
548 		return String::EncodeImpl<mpt::lstring>(Charset::Locale, str.GetString());
549 	#else
550 		return str.GetString();
551 	#endif
552 }
553 #endif // MPT_ENABLE_CHARSET_LOCALE
554 #if MPT_OS_WINDOWS
ToWin(const CString & str)555 mpt::winstring ToWin(const CString &str)
556 {
557 	return str.GetString();
558 }
559 #endif // MPT_OS_WINDOWS
560 
561 #endif // MPT_WITH_MFC
562 
563 
564 #if MPT_USTRING_MODE_WIDE
565 // inline
566 #else // !MPT_USTRING_MODE_WIDE
567 #if MPT_WSTRING_CONVERT
ToUnicode(const std::wstring & str)568 mpt::ustring ToUnicode(const std::wstring &str)
569 {
570 	return String::EncodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
571 }
572 #endif
ToUnicode(Charset from,const std::string & str)573 mpt::ustring ToUnicode(Charset from, const std::string &str)
574 {
575 	return String::ConvertImpl<mpt::ustring>(mpt::Charset::UTF8, from, str);
576 }
577 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Converts a locale-encoded string (Charset::Locale) to a UTF8 mpt::ustring
// (only compiled when mpt::ustring is not a wide string).
mpt::ustring ToUnicode(const mpt::lstring &str)
{
	mpt::ustring converted = String::ConvertImpl<mpt::ustring, mpt::lstring>(mpt::Charset::UTF8, mpt::Charset::Locale, str);
	return converted;
}
582 #endif // MPT_ENABLE_CHARSET_LOCALE
583 #if defined(MPT_WITH_MFC)
ToUnicode(const CString & str)584 mpt::ustring ToUnicode(const CString &str)
585 {
586 	#ifdef UNICODE
587 		return String::EncodeImpl<mpt::ustring>(mpt::Charset::UTF8, str.GetString());
588 	#else // !UNICODE
589 		return String::ConvertImpl<mpt::ustring, std::string>(mpt::Charset::UTF8, mpt::Charset::Locale, str.GetString());
590 	#endif // UNICODE
591 }
592 #endif // MPT_WITH_MFC
593 #endif // MPT_USTRING_MODE_WIDE
594 
595 #if MPT_USTRING_MODE_WIDE
596 // nothing, std::wstring overloads will catch all stuff
597 #else // !MPT_USTRING_MODE_WIDE
598 #if MPT_WSTRING_CONVERT
ToWide(const mpt::ustring & str)599 std::wstring ToWide(const mpt::ustring &str)
600 {
601 	return String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
602 }
603 #endif
ToCharset(Charset to,const mpt::ustring & str)604 std::string ToCharset(Charset to, const mpt::ustring &str)
605 {
606 	return String::ConvertImpl<std::string, mpt::ustring>(to, mpt::Charset::UTF8, str);
607 }
608 #if defined(MPT_ENABLE_CHARSET_LOCALE)
// Converts a UTF8 mpt::ustring to a locale-encoded string (Charset::Locale)
// (only compiled when mpt::ustring is not a wide string).
mpt::lstring ToLocale(const mpt::ustring &str)
{
	mpt::lstring converted = String::ConvertImpl<mpt::lstring, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str);
	return converted;
}
613 #endif // MPT_ENABLE_CHARSET_LOCALE
614 #if MPT_OS_WINDOWS
// Converts a UTF8 mpt::ustring to mpt::winstring.
// UNICODE builds decode to a wide string; all other builds convert to Charset::Locale.
mpt::winstring ToWin(const mpt::ustring &str)
{
#if !defined(UNICODE)
	return String::ConvertImpl<mpt::lstring, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str);
#else // UNICODE
	return String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str);
#endif // !UNICODE
}
623 #endif // MPT_OS_WINDOWS
624 #if defined(MPT_WITH_MFC)
ToCString(const mpt::ustring & str)625 CString ToCString(const mpt::ustring &str)
626 {
627 	#ifdef UNICODE
628 		return String::DecodeImpl<mpt::ustring>(mpt::Charset::UTF8, str).c_str();
629 	#else // !UNICODE
630 		return String::ConvertImpl<std::string, mpt::ustring>(mpt::Charset::Locale, mpt::Charset::UTF8, str).c_str();
631 	#endif // UNICODE
632 }
633 #endif // MPT_WITH_MFC
634 #endif // MPT_USTRING_MODE_WIDE
635 
636 
637 
638 
639 
CharsetFromCodePage(uint16 codepage,mpt::Charset fallback,bool * isFallback=nullptr)640 static mpt::Charset CharsetFromCodePage(uint16 codepage, mpt::Charset fallback, bool * isFallback = nullptr)
641 {
642 	mpt::Charset result = fallback;
643 	switch(codepage)
644 	{
645 	case 65001:
646 		result = mpt::Charset::UTF8;
647 		if(isFallback) *isFallback = false;
648 		break;
649 	case 20127:
650 		result = mpt::Charset::ASCII;
651 		if(isFallback) *isFallback = false;
652 		break;
653 	case 28591:
654 		result = mpt::Charset::ISO8859_1;
655 		if(isFallback) *isFallback = false;
656 		break;
657 	case 28605:
658 		result = mpt::Charset::ISO8859_15;
659 		if(isFallback) *isFallback = false;
660 		break;
661 	case 437:
662 		result = mpt::Charset::CP437;
663 		if(isFallback) *isFallback = false;
664 		break;
665 	case 1252:
666 		result = mpt::Charset::Windows1252;
667 		if(isFallback) *isFallback = false;
668 		break;
669 	default:
670 		result = fallback;
671 		if(isFallback) *isFallback = true;
672 		break;
673 	}
674 	return result;
675 }
676 
ToUnicode(uint16 codepage,mpt::Charset fallback,const std::string & str)677 mpt::ustring ToUnicode(uint16 codepage, mpt::Charset fallback, const std::string &str)
678 {
679 	#if MPT_OS_WINDOWS
680 		mpt::ustring result;
681 		bool noCharsetMatch = true;
682 		mpt::Charset charset = mpt::CharsetFromCodePage(codepage, fallback, &noCharsetMatch);
683 		if(noCharsetMatch && mpt::has_codepage(codepage))
684 		{
685 			result = mpt::ToUnicode(mpt::decode<std::string>(codepage, str));
686 		} else
687 		{
688 			result = mpt::ToUnicode(charset, str);
689 		}
690 		return result;
691 	#else // !MPT_OS_WINDOWS
692 		return mpt::ToUnicode(mpt::CharsetFromCodePage(codepage, fallback), str);
693 	#endif // MPT_OS_WINDOWS
694 }
695 
696 
697 
698 
699 
ToLowerCaseAscii(char c)700 char ToLowerCaseAscii(char c)
701 {
702 	return mpt::to_lower_ascii(c);
703 }
704 
ToUpperCaseAscii(char c)705 char ToUpperCaseAscii(char c)
706 {
707 	return mpt::to_upper_ascii(c);
708 }
709 
ToLowerCaseAscii(std::string s)710 std::string ToLowerCaseAscii(std::string s)
711 {
712 	std::transform(s.begin(), s.end(), s.begin(), static_cast<char(*)(char)>(&mpt::ToLowerCaseAscii));
713 	return s;
714 }
715 
ToUpperCaseAscii(std::string s)716 std::string ToUpperCaseAscii(std::string s)
717 {
718 	std::transform(s.begin(), s.end(), s.begin(), static_cast<char(*)(char)>(&mpt::ToUpperCaseAscii));
719 	return s;
720 }
721 
CompareNoCaseAscii(const char * a,const char * b,std::size_t n)722 int CompareNoCaseAscii(const char *a, const char *b, std::size_t n)
723 {
724 	while(n--)
725 	{
726 		unsigned char ac = mpt::char_value(mpt::ToLowerCaseAscii(*a));
727 		unsigned char bc = mpt::char_value(mpt::ToLowerCaseAscii(*b));
728 		if(ac != bc)
729 		{
730 			return ac < bc ? -1 : 1;
731 		} else if(!ac && !bc)
732 		{
733 			return 0;
734 		}
735 		++a;
736 		++b;
737 	}
738 	return 0;
739 }
740 
CompareNoCaseAscii(std::string_view a,std::string_view b)741 int CompareNoCaseAscii(std::string_view a, std::string_view b)
742 {
743 	for(std::size_t i = 0; i < std::min(a.length(), b.length()); ++i)
744 	{
745 		unsigned char ac = mpt::char_value(mpt::ToLowerCaseAscii(a[i]));
746 		unsigned char bc = mpt::char_value(mpt::ToLowerCaseAscii(b[i]));
747 		if(ac != bc)
748 		{
749 			return ac < bc ? -1 : 1;
750 		} else if(!ac && !bc)
751 		{
752 			return 0;
753 		}
754 	}
755 	if(a.length() == b.length())
756 	{
757 		return 0;
758 	}
759 	return a.length() < b.length() ? -1 : 1;
760 }
761 
CompareNoCaseAscii(const std::string & a,const std::string & b)762 int CompareNoCaseAscii(const std::string &a, const std::string &b)
763 {
764 	return CompareNoCaseAscii(std::string_view(a), std::string_view(b));
765 }
766 
767 
768 #if defined(MODPLUG_TRACKER)
769 
// Returns a lower-case copy of s.
// MFC builds delegate to CString::MakeLower() (UNICODE) or
// CStringW::MakeLower() (non-UNICODE); builds without MFC convert to
// std::wstring and apply std::towlower to every element.
mpt::ustring ToLowerCase(const mpt::ustring &s)
{
#if defined(MPT_WITH_MFC) && defined(UNICODE)
	CString lowered = mpt::ToCString(s);
	lowered.MakeLower();
	return mpt::ToUnicode(lowered);
#elif defined(MPT_WITH_MFC)
	CStringW lowered = mpt::ToWide(s).c_str();
	lowered.MakeLower();
	return mpt::ToUnicode(lowered.GetString());
#else // !MPT_WITH_MFC
	std::wstring wide = mpt::ToWide(s);
	std::transform(wide.begin(), wide.end(), wide.begin(), [](wchar_t wc) { return static_cast<wchar_t>(std::towlower(wc)); });
	return mpt::ToUnicode(wide);
#endif // MPT_WITH_MFC
}
788 
// Returns an upper-case copy of s.
// MFC builds delegate to CString::MakeUpper() (UNICODE) or
// CStringW::MakeUpper() (non-UNICODE); builds without MFC convert to
// std::wstring and apply std::towupper to every element.
mpt::ustring ToUpperCase(const mpt::ustring &s)
{
	#if defined(MPT_WITH_MFC)
		#if defined(UNICODE)
			CString tmp = mpt::ToCString(s);
			tmp.MakeUpper();
			return mpt::ToUnicode(tmp);
		#else // !UNICODE
			CStringW tmp = mpt::ToWide(s).c_str();
			tmp.MakeUpper();
			return mpt::ToUnicode(tmp.GetString());
		#endif // UNICODE
	#else // !MPT_WITH_MFC
		std::wstring ws = mpt::ToWide(s);
		// This branch must use towupper: calling towlower here would make
		// ToUpperCase() return a lower-cased string on non-MFC builds.
		std::transform(ws.begin(), ws.end(), ws.begin(), [](wchar_t wc) { return static_cast<wchar_t>(std::towupper(wc)); });
		return mpt::ToUnicode(ws);
	#endif // MPT_WITH_MFC
}
807 
808 #endif // MODPLUG_TRACKER
809 
810 
811 
812 } // namespace mpt
813 
814 
815 
816 OPENMPT_NAMESPACE_END
817