1 /*
2  *  Copyright (C) 2005-2018 Team Kodi
3  *  This file is part of Kodi - https://kodi.tv
4  *
5  *  SPDX-License-Identifier: GPL-2.0-or-later
6  *  See LICENSES/README.md for more information.
7  */
8 //-----------------------------------------------------------------------
9 //
10 //  File:      StringUtils.cpp
11 //
12 //  Purpose:   ATL split string utility
13 //  Author:    Paul J. Weiss
14 //
15 //  Modified to use J O'Leary's std::string class by kraqh3d
16 //
17 //------------------------------------------------------------------------
18 
19 #ifdef HAVE_NEW_CROSSGUID
20 #include <guid.hpp>
21 #else
22 #include <guid.h>
23 #endif
24 
25 #if defined(TARGET_ANDROID)
26 #include <androidjni/JNIThreading.h>
27 #endif
28 
29 #include "CharsetConverter.h"
30 #include "LangInfo.h"
31 #include "StringUtils.h"
32 #include "Util.h"
33 
#include <algorithm>
#include <array>
#include <assert.h>
#include <functional>
#include <inttypes.h>
#include <iomanip>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
45 
46 #include <fstrcmp.h>
47 #include <memory.h>
48 
49 // don't move or std functions end up in PCRE namespace
50 // clang-format off
51 #include "utils/RegExp.h"
52 // clang-format on
53 
// Initial buffer length used by both StringUtils::FormatV overloads. It is
// applied as an element count, so the wchar_t overload starts with 512 wide
// characters rather than 512 bytes.
#define FORMAT_BLOCK_SIZE 512 // # of bytes for initial allocation for printf

// Anchored pattern for a GUID written as 8-4-4-4-12 hexadecimal digits. The
// leading '{' and the trailing '}' are each optional independently of the other.
static constexpr const char* ADDON_GUID_RE = "^(\\{){0,1}[0-9a-fA-F]{8}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{4}\\-[0-9a-fA-F]{12}(\\}){0,1}$";

/* empty string for use in returns by ref */
const std::string StringUtils::Empty = "";
60 
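//  Copyright (c) Leigh Brasington 2012.  All rights reserved.
//  This code may be used and reproduced without written permission.
//  http://www.leighb.com/tounicupper.htm
//
//  The tables were constructed from
//  http://publib.boulder.ibm.com/infocenter/iseries/v7r1m0/index.jsp?topic=%2Fnls%2Frbagslowtoupmaptable.htm
//
//  unicode_lowers and unicode_uppers are parallel arrays: entry i of one is the
//  case counterpart of entry i of the other. unicode_lowers is sorted in
//  ascending order; unicode_uppers follows the order of unicode_lowers and is
//  therefore NOT sorted (and contains 0x0049 twice).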
67 
68 static constexpr wchar_t unicode_lowers[] = {
69   (wchar_t)0x0061, (wchar_t)0x0062, (wchar_t)0x0063, (wchar_t)0x0064, (wchar_t)0x0065, (wchar_t)0x0066, (wchar_t)0x0067, (wchar_t)0x0068, (wchar_t)0x0069,
70   (wchar_t)0x006A, (wchar_t)0x006B, (wchar_t)0x006C, (wchar_t)0x006D, (wchar_t)0x006E, (wchar_t)0x006F, (wchar_t)0x0070, (wchar_t)0x0071, (wchar_t)0x0072,
71   (wchar_t)0x0073, (wchar_t)0x0074, (wchar_t)0x0075, (wchar_t)0x0076, (wchar_t)0x0077, (wchar_t)0x0078, (wchar_t)0x0079, (wchar_t)0x007A, (wchar_t)0x00E0,
72   (wchar_t)0x00E1, (wchar_t)0x00E2, (wchar_t)0x00E3, (wchar_t)0x00E4, (wchar_t)0x00E5, (wchar_t)0x00E6, (wchar_t)0x00E7, (wchar_t)0x00E8, (wchar_t)0x00E9,
73   (wchar_t)0x00EA, (wchar_t)0x00EB, (wchar_t)0x00EC, (wchar_t)0x00ED, (wchar_t)0x00EE, (wchar_t)0x00EF, (wchar_t)0x00F0, (wchar_t)0x00F1, (wchar_t)0x00F2,
74   (wchar_t)0x00F3, (wchar_t)0x00F4, (wchar_t)0x00F5, (wchar_t)0x00F6, (wchar_t)0x00F8, (wchar_t)0x00F9, (wchar_t)0x00FA, (wchar_t)0x00FB, (wchar_t)0x00FC,
75   (wchar_t)0x00FD, (wchar_t)0x00FE, (wchar_t)0x00FF, (wchar_t)0x0101, (wchar_t)0x0103, (wchar_t)0x0105, (wchar_t)0x0107, (wchar_t)0x0109, (wchar_t)0x010B,
76   (wchar_t)0x010D, (wchar_t)0x010F, (wchar_t)0x0111, (wchar_t)0x0113, (wchar_t)0x0115, (wchar_t)0x0117, (wchar_t)0x0119, (wchar_t)0x011B, (wchar_t)0x011D,
77   (wchar_t)0x011F, (wchar_t)0x0121, (wchar_t)0x0123, (wchar_t)0x0125, (wchar_t)0x0127, (wchar_t)0x0129, (wchar_t)0x012B, (wchar_t)0x012D, (wchar_t)0x012F,
78   (wchar_t)0x0131, (wchar_t)0x0133, (wchar_t)0x0135, (wchar_t)0x0137, (wchar_t)0x013A, (wchar_t)0x013C, (wchar_t)0x013E, (wchar_t)0x0140, (wchar_t)0x0142,
79   (wchar_t)0x0144, (wchar_t)0x0146, (wchar_t)0x0148, (wchar_t)0x014B, (wchar_t)0x014D, (wchar_t)0x014F, (wchar_t)0x0151, (wchar_t)0x0153, (wchar_t)0x0155,
80   (wchar_t)0x0157, (wchar_t)0x0159, (wchar_t)0x015B, (wchar_t)0x015D, (wchar_t)0x015F, (wchar_t)0x0161, (wchar_t)0x0163, (wchar_t)0x0165, (wchar_t)0x0167,
81   (wchar_t)0x0169, (wchar_t)0x016B, (wchar_t)0x016D, (wchar_t)0x016F, (wchar_t)0x0171, (wchar_t)0x0173, (wchar_t)0x0175, (wchar_t)0x0177, (wchar_t)0x017A,
82   (wchar_t)0x017C, (wchar_t)0x017E, (wchar_t)0x0183, (wchar_t)0x0185, (wchar_t)0x0188, (wchar_t)0x018C, (wchar_t)0x0192, (wchar_t)0x0199, (wchar_t)0x01A1,
83   (wchar_t)0x01A3, (wchar_t)0x01A5, (wchar_t)0x01A8, (wchar_t)0x01AD, (wchar_t)0x01B0, (wchar_t)0x01B4, (wchar_t)0x01B6, (wchar_t)0x01B9, (wchar_t)0x01BD,
84   (wchar_t)0x01C6, (wchar_t)0x01C9, (wchar_t)0x01CC, (wchar_t)0x01CE, (wchar_t)0x01D0, (wchar_t)0x01D2, (wchar_t)0x01D4, (wchar_t)0x01D6, (wchar_t)0x01D8,
85   (wchar_t)0x01DA, (wchar_t)0x01DC, (wchar_t)0x01DF, (wchar_t)0x01E1, (wchar_t)0x01E3, (wchar_t)0x01E5, (wchar_t)0x01E7, (wchar_t)0x01E9, (wchar_t)0x01EB,
86   (wchar_t)0x01ED, (wchar_t)0x01EF, (wchar_t)0x01F3, (wchar_t)0x01F5, (wchar_t)0x01FB, (wchar_t)0x01FD, (wchar_t)0x01FF, (wchar_t)0x0201, (wchar_t)0x0203,
87   (wchar_t)0x0205, (wchar_t)0x0207, (wchar_t)0x0209, (wchar_t)0x020B, (wchar_t)0x020D, (wchar_t)0x020F, (wchar_t)0x0211, (wchar_t)0x0213, (wchar_t)0x0215,
88   (wchar_t)0x0217, (wchar_t)0x0253, (wchar_t)0x0254, (wchar_t)0x0257, (wchar_t)0x0258, (wchar_t)0x0259, (wchar_t)0x025B, (wchar_t)0x0260, (wchar_t)0x0263,
89   (wchar_t)0x0268, (wchar_t)0x0269, (wchar_t)0x026F, (wchar_t)0x0272, (wchar_t)0x0275, (wchar_t)0x0283, (wchar_t)0x0288, (wchar_t)0x028A, (wchar_t)0x028B,
90   (wchar_t)0x0292, (wchar_t)0x03AC, (wchar_t)0x03AD, (wchar_t)0x03AE, (wchar_t)0x03AF, (wchar_t)0x03B1, (wchar_t)0x03B2, (wchar_t)0x03B3, (wchar_t)0x03B4,
91   (wchar_t)0x03B5, (wchar_t)0x03B6, (wchar_t)0x03B7, (wchar_t)0x03B8, (wchar_t)0x03B9, (wchar_t)0x03BA, (wchar_t)0x03BB, (wchar_t)0x03BC, (wchar_t)0x03BD,
92   (wchar_t)0x03BE, (wchar_t)0x03BF, (wchar_t)0x03C0, (wchar_t)0x03C1, (wchar_t)0x03C3, (wchar_t)0x03C4, (wchar_t)0x03C5, (wchar_t)0x03C6, (wchar_t)0x03C7,
93   (wchar_t)0x03C8, (wchar_t)0x03C9, (wchar_t)0x03CA, (wchar_t)0x03CB, (wchar_t)0x03CC, (wchar_t)0x03CD, (wchar_t)0x03CE, (wchar_t)0x03E3, (wchar_t)0x03E5,
94   (wchar_t)0x03E7, (wchar_t)0x03E9, (wchar_t)0x03EB, (wchar_t)0x03ED, (wchar_t)0x03EF, (wchar_t)0x0430, (wchar_t)0x0431, (wchar_t)0x0432, (wchar_t)0x0433,
95   (wchar_t)0x0434, (wchar_t)0x0435, (wchar_t)0x0436, (wchar_t)0x0437, (wchar_t)0x0438, (wchar_t)0x0439, (wchar_t)0x043A, (wchar_t)0x043B, (wchar_t)0x043C,
96   (wchar_t)0x043D, (wchar_t)0x043E, (wchar_t)0x043F, (wchar_t)0x0440, (wchar_t)0x0441, (wchar_t)0x0442, (wchar_t)0x0443, (wchar_t)0x0444, (wchar_t)0x0445,
97   (wchar_t)0x0446, (wchar_t)0x0447, (wchar_t)0x0448, (wchar_t)0x0449, (wchar_t)0x044A, (wchar_t)0x044B, (wchar_t)0x044C, (wchar_t)0x044D, (wchar_t)0x044E,
98   (wchar_t)0x044F, (wchar_t)0x0451, (wchar_t)0x0452, (wchar_t)0x0453, (wchar_t)0x0454, (wchar_t)0x0455, (wchar_t)0x0456, (wchar_t)0x0457, (wchar_t)0x0458,
99   (wchar_t)0x0459, (wchar_t)0x045A, (wchar_t)0x045B, (wchar_t)0x045C, (wchar_t)0x045E, (wchar_t)0x045F, (wchar_t)0x0461, (wchar_t)0x0463, (wchar_t)0x0465,
100   (wchar_t)0x0467, (wchar_t)0x0469, (wchar_t)0x046B, (wchar_t)0x046D, (wchar_t)0x046F, (wchar_t)0x0471, (wchar_t)0x0473, (wchar_t)0x0475, (wchar_t)0x0477,
101   (wchar_t)0x0479, (wchar_t)0x047B, (wchar_t)0x047D, (wchar_t)0x047F, (wchar_t)0x0481, (wchar_t)0x0491, (wchar_t)0x0493, (wchar_t)0x0495, (wchar_t)0x0497,
102   (wchar_t)0x0499, (wchar_t)0x049B, (wchar_t)0x049D, (wchar_t)0x049F, (wchar_t)0x04A1, (wchar_t)0x04A3, (wchar_t)0x04A5, (wchar_t)0x04A7, (wchar_t)0x04A9,
103   (wchar_t)0x04AB, (wchar_t)0x04AD, (wchar_t)0x04AF, (wchar_t)0x04B1, (wchar_t)0x04B3, (wchar_t)0x04B5, (wchar_t)0x04B7, (wchar_t)0x04B9, (wchar_t)0x04BB,
104   (wchar_t)0x04BD, (wchar_t)0x04BF, (wchar_t)0x04C2, (wchar_t)0x04C4, (wchar_t)0x04C8, (wchar_t)0x04CC, (wchar_t)0x04D1, (wchar_t)0x04D3, (wchar_t)0x04D5,
105   (wchar_t)0x04D7, (wchar_t)0x04D9, (wchar_t)0x04DB, (wchar_t)0x04DD, (wchar_t)0x04DF, (wchar_t)0x04E1, (wchar_t)0x04E3, (wchar_t)0x04E5, (wchar_t)0x04E7,
106   (wchar_t)0x04E9, (wchar_t)0x04EB, (wchar_t)0x04EF, (wchar_t)0x04F1, (wchar_t)0x04F3, (wchar_t)0x04F5, (wchar_t)0x04F9, (wchar_t)0x0561, (wchar_t)0x0562,
107   (wchar_t)0x0563, (wchar_t)0x0564, (wchar_t)0x0565, (wchar_t)0x0566, (wchar_t)0x0567, (wchar_t)0x0568, (wchar_t)0x0569, (wchar_t)0x056A, (wchar_t)0x056B,
108   (wchar_t)0x056C, (wchar_t)0x056D, (wchar_t)0x056E, (wchar_t)0x056F, (wchar_t)0x0570, (wchar_t)0x0571, (wchar_t)0x0572, (wchar_t)0x0573, (wchar_t)0x0574,
109   (wchar_t)0x0575, (wchar_t)0x0576, (wchar_t)0x0577, (wchar_t)0x0578, (wchar_t)0x0579, (wchar_t)0x057A, (wchar_t)0x057B, (wchar_t)0x057C, (wchar_t)0x057D,
110   (wchar_t)0x057E, (wchar_t)0x057F, (wchar_t)0x0580, (wchar_t)0x0581, (wchar_t)0x0582, (wchar_t)0x0583, (wchar_t)0x0584, (wchar_t)0x0585, (wchar_t)0x0586,
111   (wchar_t)0x10D0, (wchar_t)0x10D1, (wchar_t)0x10D2, (wchar_t)0x10D3, (wchar_t)0x10D4, (wchar_t)0x10D5, (wchar_t)0x10D6, (wchar_t)0x10D7, (wchar_t)0x10D8,
112   (wchar_t)0x10D9, (wchar_t)0x10DA, (wchar_t)0x10DB, (wchar_t)0x10DC, (wchar_t)0x10DD, (wchar_t)0x10DE, (wchar_t)0x10DF, (wchar_t)0x10E0, (wchar_t)0x10E1,
113   (wchar_t)0x10E2, (wchar_t)0x10E3, (wchar_t)0x10E4, (wchar_t)0x10E5, (wchar_t)0x10E6, (wchar_t)0x10E7, (wchar_t)0x10E8, (wchar_t)0x10E9, (wchar_t)0x10EA,
114   (wchar_t)0x10EB, (wchar_t)0x10EC, (wchar_t)0x10ED, (wchar_t)0x10EE, (wchar_t)0x10EF, (wchar_t)0x10F0, (wchar_t)0x10F1, (wchar_t)0x10F2, (wchar_t)0x10F3,
115   (wchar_t)0x10F4, (wchar_t)0x10F5, (wchar_t)0x1E01, (wchar_t)0x1E03, (wchar_t)0x1E05, (wchar_t)0x1E07, (wchar_t)0x1E09, (wchar_t)0x1E0B, (wchar_t)0x1E0D,
116   (wchar_t)0x1E0F, (wchar_t)0x1E11, (wchar_t)0x1E13, (wchar_t)0x1E15, (wchar_t)0x1E17, (wchar_t)0x1E19, (wchar_t)0x1E1B, (wchar_t)0x1E1D, (wchar_t)0x1E1F,
117   (wchar_t)0x1E21, (wchar_t)0x1E23, (wchar_t)0x1E25, (wchar_t)0x1E27, (wchar_t)0x1E29, (wchar_t)0x1E2B, (wchar_t)0x1E2D, (wchar_t)0x1E2F, (wchar_t)0x1E31,
118   (wchar_t)0x1E33, (wchar_t)0x1E35, (wchar_t)0x1E37, (wchar_t)0x1E39, (wchar_t)0x1E3B, (wchar_t)0x1E3D, (wchar_t)0x1E3F, (wchar_t)0x1E41, (wchar_t)0x1E43,
119   (wchar_t)0x1E45, (wchar_t)0x1E47, (wchar_t)0x1E49, (wchar_t)0x1E4B, (wchar_t)0x1E4D, (wchar_t)0x1E4F, (wchar_t)0x1E51, (wchar_t)0x1E53, (wchar_t)0x1E55,
120   (wchar_t)0x1E57, (wchar_t)0x1E59, (wchar_t)0x1E5B, (wchar_t)0x1E5D, (wchar_t)0x1E5F, (wchar_t)0x1E61, (wchar_t)0x1E63, (wchar_t)0x1E65, (wchar_t)0x1E67,
121   (wchar_t)0x1E69, (wchar_t)0x1E6B, (wchar_t)0x1E6D, (wchar_t)0x1E6F, (wchar_t)0x1E71, (wchar_t)0x1E73, (wchar_t)0x1E75, (wchar_t)0x1E77, (wchar_t)0x1E79,
122   (wchar_t)0x1E7B, (wchar_t)0x1E7D, (wchar_t)0x1E7F, (wchar_t)0x1E81, (wchar_t)0x1E83, (wchar_t)0x1E85, (wchar_t)0x1E87, (wchar_t)0x1E89, (wchar_t)0x1E8B,
123   (wchar_t)0x1E8D, (wchar_t)0x1E8F, (wchar_t)0x1E91, (wchar_t)0x1E93, (wchar_t)0x1E95, (wchar_t)0x1EA1, (wchar_t)0x1EA3, (wchar_t)0x1EA5, (wchar_t)0x1EA7,
124   (wchar_t)0x1EA9, (wchar_t)0x1EAB, (wchar_t)0x1EAD, (wchar_t)0x1EAF, (wchar_t)0x1EB1, (wchar_t)0x1EB3, (wchar_t)0x1EB5, (wchar_t)0x1EB7, (wchar_t)0x1EB9,
125   (wchar_t)0x1EBB, (wchar_t)0x1EBD, (wchar_t)0x1EBF, (wchar_t)0x1EC1, (wchar_t)0x1EC3, (wchar_t)0x1EC5, (wchar_t)0x1EC7, (wchar_t)0x1EC9, (wchar_t)0x1ECB,
126   (wchar_t)0x1ECD, (wchar_t)0x1ECF, (wchar_t)0x1ED1, (wchar_t)0x1ED3, (wchar_t)0x1ED5, (wchar_t)0x1ED7, (wchar_t)0x1ED9, (wchar_t)0x1EDB, (wchar_t)0x1EDD,
127   (wchar_t)0x1EDF, (wchar_t)0x1EE1, (wchar_t)0x1EE3, (wchar_t)0x1EE5, (wchar_t)0x1EE7, (wchar_t)0x1EE9, (wchar_t)0x1EEB, (wchar_t)0x1EED, (wchar_t)0x1EEF,
128   (wchar_t)0x1EF1, (wchar_t)0x1EF3, (wchar_t)0x1EF5, (wchar_t)0x1EF7, (wchar_t)0x1EF9, (wchar_t)0x1F00, (wchar_t)0x1F01, (wchar_t)0x1F02, (wchar_t)0x1F03,
129   (wchar_t)0x1F04, (wchar_t)0x1F05, (wchar_t)0x1F06, (wchar_t)0x1F07, (wchar_t)0x1F10, (wchar_t)0x1F11, (wchar_t)0x1F12, (wchar_t)0x1F13, (wchar_t)0x1F14,
130   (wchar_t)0x1F15, (wchar_t)0x1F20, (wchar_t)0x1F21, (wchar_t)0x1F22, (wchar_t)0x1F23, (wchar_t)0x1F24, (wchar_t)0x1F25, (wchar_t)0x1F26, (wchar_t)0x1F27,
131   (wchar_t)0x1F30, (wchar_t)0x1F31, (wchar_t)0x1F32, (wchar_t)0x1F33, (wchar_t)0x1F34, (wchar_t)0x1F35, (wchar_t)0x1F36, (wchar_t)0x1F37, (wchar_t)0x1F40,
132   (wchar_t)0x1F41, (wchar_t)0x1F42, (wchar_t)0x1F43, (wchar_t)0x1F44, (wchar_t)0x1F45, (wchar_t)0x1F51, (wchar_t)0x1F53, (wchar_t)0x1F55, (wchar_t)0x1F57,
133   (wchar_t)0x1F60, (wchar_t)0x1F61, (wchar_t)0x1F62, (wchar_t)0x1F63, (wchar_t)0x1F64, (wchar_t)0x1F65, (wchar_t)0x1F66, (wchar_t)0x1F67, (wchar_t)0x1F80,
134   (wchar_t)0x1F81, (wchar_t)0x1F82, (wchar_t)0x1F83, (wchar_t)0x1F84, (wchar_t)0x1F85, (wchar_t)0x1F86, (wchar_t)0x1F87, (wchar_t)0x1F90, (wchar_t)0x1F91,
135   (wchar_t)0x1F92, (wchar_t)0x1F93, (wchar_t)0x1F94, (wchar_t)0x1F95, (wchar_t)0x1F96, (wchar_t)0x1F97, (wchar_t)0x1FA0, (wchar_t)0x1FA1, (wchar_t)0x1FA2,
136   (wchar_t)0x1FA3, (wchar_t)0x1FA4, (wchar_t)0x1FA5, (wchar_t)0x1FA6, (wchar_t)0x1FA7, (wchar_t)0x1FB0, (wchar_t)0x1FB1, (wchar_t)0x1FD0, (wchar_t)0x1FD1,
137   (wchar_t)0x1FE0, (wchar_t)0x1FE1, (wchar_t)0x24D0, (wchar_t)0x24D1, (wchar_t)0x24D2, (wchar_t)0x24D3, (wchar_t)0x24D4, (wchar_t)0x24D5, (wchar_t)0x24D6,
138   (wchar_t)0x24D7, (wchar_t)0x24D8, (wchar_t)0x24D9, (wchar_t)0x24DA, (wchar_t)0x24DB, (wchar_t)0x24DC, (wchar_t)0x24DD, (wchar_t)0x24DE, (wchar_t)0x24DF,
139   (wchar_t)0x24E0, (wchar_t)0x24E1, (wchar_t)0x24E2, (wchar_t)0x24E3, (wchar_t)0x24E4, (wchar_t)0x24E5, (wchar_t)0x24E6, (wchar_t)0x24E7, (wchar_t)0x24E8,
140   (wchar_t)0x24E9, (wchar_t)0xFF41, (wchar_t)0xFF42, (wchar_t)0xFF43, (wchar_t)0xFF44, (wchar_t)0xFF45, (wchar_t)0xFF46, (wchar_t)0xFF47, (wchar_t)0xFF48,
141   (wchar_t)0xFF49, (wchar_t)0xFF4A, (wchar_t)0xFF4B, (wchar_t)0xFF4C, (wchar_t)0xFF4D, (wchar_t)0xFF4E, (wchar_t)0xFF4F, (wchar_t)0xFF50, (wchar_t)0xFF51,
142   (wchar_t)0xFF52, (wchar_t)0xFF53, (wchar_t)0xFF54, (wchar_t)0xFF55, (wchar_t)0xFF56, (wchar_t)0xFF57, (wchar_t)0xFF58, (wchar_t)0xFF59, (wchar_t)0xFF5A
143 };
144 
145 static const wchar_t unicode_uppers[] = {
146   (wchar_t)0x0041, (wchar_t)0x0042, (wchar_t)0x0043, (wchar_t)0x0044, (wchar_t)0x0045, (wchar_t)0x0046, (wchar_t)0x0047, (wchar_t)0x0048, (wchar_t)0x0049,
147   (wchar_t)0x004A, (wchar_t)0x004B, (wchar_t)0x004C, (wchar_t)0x004D, (wchar_t)0x004E, (wchar_t)0x004F, (wchar_t)0x0050, (wchar_t)0x0051, (wchar_t)0x0052,
148   (wchar_t)0x0053, (wchar_t)0x0054, (wchar_t)0x0055, (wchar_t)0x0056, (wchar_t)0x0057, (wchar_t)0x0058, (wchar_t)0x0059, (wchar_t)0x005A, (wchar_t)0x00C0,
149   (wchar_t)0x00C1, (wchar_t)0x00C2, (wchar_t)0x00C3, (wchar_t)0x00C4, (wchar_t)0x00C5, (wchar_t)0x00C6, (wchar_t)0x00C7, (wchar_t)0x00C8, (wchar_t)0x00C9,
150   (wchar_t)0x00CA, (wchar_t)0x00CB, (wchar_t)0x00CC, (wchar_t)0x00CD, (wchar_t)0x00CE, (wchar_t)0x00CF, (wchar_t)0x00D0, (wchar_t)0x00D1, (wchar_t)0x00D2,
151   (wchar_t)0x00D3, (wchar_t)0x00D4, (wchar_t)0x00D5, (wchar_t)0x00D6, (wchar_t)0x00D8, (wchar_t)0x00D9, (wchar_t)0x00DA, (wchar_t)0x00DB, (wchar_t)0x00DC,
152   (wchar_t)0x00DD, (wchar_t)0x00DE, (wchar_t)0x0178, (wchar_t)0x0100, (wchar_t)0x0102, (wchar_t)0x0104, (wchar_t)0x0106, (wchar_t)0x0108, (wchar_t)0x010A,
153   (wchar_t)0x010C, (wchar_t)0x010E, (wchar_t)0x0110, (wchar_t)0x0112, (wchar_t)0x0114, (wchar_t)0x0116, (wchar_t)0x0118, (wchar_t)0x011A, (wchar_t)0x011C,
154   (wchar_t)0x011E, (wchar_t)0x0120, (wchar_t)0x0122, (wchar_t)0x0124, (wchar_t)0x0126, (wchar_t)0x0128, (wchar_t)0x012A, (wchar_t)0x012C, (wchar_t)0x012E,
155   (wchar_t)0x0049, (wchar_t)0x0132, (wchar_t)0x0134, (wchar_t)0x0136, (wchar_t)0x0139, (wchar_t)0x013B, (wchar_t)0x013D, (wchar_t)0x013F, (wchar_t)0x0141,
156   (wchar_t)0x0143, (wchar_t)0x0145, (wchar_t)0x0147, (wchar_t)0x014A, (wchar_t)0x014C, (wchar_t)0x014E, (wchar_t)0x0150, (wchar_t)0x0152, (wchar_t)0x0154,
157   (wchar_t)0x0156, (wchar_t)0x0158, (wchar_t)0x015A, (wchar_t)0x015C, (wchar_t)0x015E, (wchar_t)0x0160, (wchar_t)0x0162, (wchar_t)0x0164, (wchar_t)0x0166,
158   (wchar_t)0x0168, (wchar_t)0x016A, (wchar_t)0x016C, (wchar_t)0x016E, (wchar_t)0x0170, (wchar_t)0x0172, (wchar_t)0x0174, (wchar_t)0x0176, (wchar_t)0x0179,
159   (wchar_t)0x017B, (wchar_t)0x017D, (wchar_t)0x0182, (wchar_t)0x0184, (wchar_t)0x0187, (wchar_t)0x018B, (wchar_t)0x0191, (wchar_t)0x0198, (wchar_t)0x01A0,
160   (wchar_t)0x01A2, (wchar_t)0x01A4, (wchar_t)0x01A7, (wchar_t)0x01AC, (wchar_t)0x01AF, (wchar_t)0x01B3, (wchar_t)0x01B5, (wchar_t)0x01B8, (wchar_t)0x01BC,
161   (wchar_t)0x01C4, (wchar_t)0x01C7, (wchar_t)0x01CA, (wchar_t)0x01CD, (wchar_t)0x01CF, (wchar_t)0x01D1, (wchar_t)0x01D3, (wchar_t)0x01D5, (wchar_t)0x01D7,
162   (wchar_t)0x01D9, (wchar_t)0x01DB, (wchar_t)0x01DE, (wchar_t)0x01E0, (wchar_t)0x01E2, (wchar_t)0x01E4, (wchar_t)0x01E6, (wchar_t)0x01E8, (wchar_t)0x01EA,
163   (wchar_t)0x01EC, (wchar_t)0x01EE, (wchar_t)0x01F1, (wchar_t)0x01F4, (wchar_t)0x01FA, (wchar_t)0x01FC, (wchar_t)0x01FE, (wchar_t)0x0200, (wchar_t)0x0202,
164   (wchar_t)0x0204, (wchar_t)0x0206, (wchar_t)0x0208, (wchar_t)0x020A, (wchar_t)0x020C, (wchar_t)0x020E, (wchar_t)0x0210, (wchar_t)0x0212, (wchar_t)0x0214,
165   (wchar_t)0x0216, (wchar_t)0x0181, (wchar_t)0x0186, (wchar_t)0x018A, (wchar_t)0x018E, (wchar_t)0x018F, (wchar_t)0x0190, (wchar_t)0x0193, (wchar_t)0x0194,
166   (wchar_t)0x0197, (wchar_t)0x0196, (wchar_t)0x019C, (wchar_t)0x019D, (wchar_t)0x019F, (wchar_t)0x01A9, (wchar_t)0x01AE, (wchar_t)0x01B1, (wchar_t)0x01B2,
167   (wchar_t)0x01B7, (wchar_t)0x0386, (wchar_t)0x0388, (wchar_t)0x0389, (wchar_t)0x038A, (wchar_t)0x0391, (wchar_t)0x0392, (wchar_t)0x0393, (wchar_t)0x0394,
168   (wchar_t)0x0395, (wchar_t)0x0396, (wchar_t)0x0397, (wchar_t)0x0398, (wchar_t)0x0399, (wchar_t)0x039A, (wchar_t)0x039B, (wchar_t)0x039C, (wchar_t)0x039D,
169   (wchar_t)0x039E, (wchar_t)0x039F, (wchar_t)0x03A0, (wchar_t)0x03A1, (wchar_t)0x03A3, (wchar_t)0x03A4, (wchar_t)0x03A5, (wchar_t)0x03A6, (wchar_t)0x03A7,
170   (wchar_t)0x03A8, (wchar_t)0x03A9, (wchar_t)0x03AA, (wchar_t)0x03AB, (wchar_t)0x038C, (wchar_t)0x038E, (wchar_t)0x038F, (wchar_t)0x03E2, (wchar_t)0x03E4,
171   (wchar_t)0x03E6, (wchar_t)0x03E8, (wchar_t)0x03EA, (wchar_t)0x03EC, (wchar_t)0x03EE, (wchar_t)0x0410, (wchar_t)0x0411, (wchar_t)0x0412, (wchar_t)0x0413,
172   (wchar_t)0x0414, (wchar_t)0x0415, (wchar_t)0x0416, (wchar_t)0x0417, (wchar_t)0x0418, (wchar_t)0x0419, (wchar_t)0x041A, (wchar_t)0x041B, (wchar_t)0x041C,
173   (wchar_t)0x041D, (wchar_t)0x041E, (wchar_t)0x041F, (wchar_t)0x0420, (wchar_t)0x0421, (wchar_t)0x0422, (wchar_t)0x0423, (wchar_t)0x0424, (wchar_t)0x0425,
174   (wchar_t)0x0426, (wchar_t)0x0427, (wchar_t)0x0428, (wchar_t)0x0429, (wchar_t)0x042A, (wchar_t)0x042B, (wchar_t)0x042C, (wchar_t)0x042D, (wchar_t)0x042E,
175   (wchar_t)0x042F, (wchar_t)0x0401, (wchar_t)0x0402, (wchar_t)0x0403, (wchar_t)0x0404, (wchar_t)0x0405, (wchar_t)0x0406, (wchar_t)0x0407, (wchar_t)0x0408,
176   (wchar_t)0x0409, (wchar_t)0x040A, (wchar_t)0x040B, (wchar_t)0x040C, (wchar_t)0x040E, (wchar_t)0x040F, (wchar_t)0x0460, (wchar_t)0x0462, (wchar_t)0x0464,
177   (wchar_t)0x0466, (wchar_t)0x0468, (wchar_t)0x046A, (wchar_t)0x046C, (wchar_t)0x046E, (wchar_t)0x0470, (wchar_t)0x0472, (wchar_t)0x0474, (wchar_t)0x0476,
178   (wchar_t)0x0478, (wchar_t)0x047A, (wchar_t)0x047C, (wchar_t)0x047E, (wchar_t)0x0480, (wchar_t)0x0490, (wchar_t)0x0492, (wchar_t)0x0494, (wchar_t)0x0496,
179   (wchar_t)0x0498, (wchar_t)0x049A, (wchar_t)0x049C, (wchar_t)0x049E, (wchar_t)0x04A0, (wchar_t)0x04A2, (wchar_t)0x04A4, (wchar_t)0x04A6, (wchar_t)0x04A8,
180   (wchar_t)0x04AA, (wchar_t)0x04AC, (wchar_t)0x04AE, (wchar_t)0x04B0, (wchar_t)0x04B2, (wchar_t)0x04B4, (wchar_t)0x04B6, (wchar_t)0x04B8, (wchar_t)0x04BA,
181   (wchar_t)0x04BC, (wchar_t)0x04BE, (wchar_t)0x04C1, (wchar_t)0x04C3, (wchar_t)0x04C7, (wchar_t)0x04CB, (wchar_t)0x04D0, (wchar_t)0x04D2, (wchar_t)0x04D4,
182   (wchar_t)0x04D6, (wchar_t)0x04D8, (wchar_t)0x04DA, (wchar_t)0x04DC, (wchar_t)0x04DE, (wchar_t)0x04E0, (wchar_t)0x04E2, (wchar_t)0x04E4, (wchar_t)0x04E6,
183   (wchar_t)0x04E8, (wchar_t)0x04EA, (wchar_t)0x04EE, (wchar_t)0x04F0, (wchar_t)0x04F2, (wchar_t)0x04F4, (wchar_t)0x04F8, (wchar_t)0x0531, (wchar_t)0x0532,
184   (wchar_t)0x0533, (wchar_t)0x0534, (wchar_t)0x0535, (wchar_t)0x0536, (wchar_t)0x0537, (wchar_t)0x0538, (wchar_t)0x0539, (wchar_t)0x053A, (wchar_t)0x053B,
185   (wchar_t)0x053C, (wchar_t)0x053D, (wchar_t)0x053E, (wchar_t)0x053F, (wchar_t)0x0540, (wchar_t)0x0541, (wchar_t)0x0542, (wchar_t)0x0543, (wchar_t)0x0544,
186   (wchar_t)0x0545, (wchar_t)0x0546, (wchar_t)0x0547, (wchar_t)0x0548, (wchar_t)0x0549, (wchar_t)0x054A, (wchar_t)0x054B, (wchar_t)0x054C, (wchar_t)0x054D,
187   (wchar_t)0x054E, (wchar_t)0x054F, (wchar_t)0x0550, (wchar_t)0x0551, (wchar_t)0x0552, (wchar_t)0x0553, (wchar_t)0x0554, (wchar_t)0x0555, (wchar_t)0x0556,
188   (wchar_t)0x10A0, (wchar_t)0x10A1, (wchar_t)0x10A2, (wchar_t)0x10A3, (wchar_t)0x10A4, (wchar_t)0x10A5, (wchar_t)0x10A6, (wchar_t)0x10A7, (wchar_t)0x10A8,
189   (wchar_t)0x10A9, (wchar_t)0x10AA, (wchar_t)0x10AB, (wchar_t)0x10AC, (wchar_t)0x10AD, (wchar_t)0x10AE, (wchar_t)0x10AF, (wchar_t)0x10B0, (wchar_t)0x10B1,
190   (wchar_t)0x10B2, (wchar_t)0x10B3, (wchar_t)0x10B4, (wchar_t)0x10B5, (wchar_t)0x10B6, (wchar_t)0x10B7, (wchar_t)0x10B8, (wchar_t)0x10B9, (wchar_t)0x10BA,
191   (wchar_t)0x10BB, (wchar_t)0x10BC, (wchar_t)0x10BD, (wchar_t)0x10BE, (wchar_t)0x10BF, (wchar_t)0x10C0, (wchar_t)0x10C1, (wchar_t)0x10C2, (wchar_t)0x10C3,
192   (wchar_t)0x10C4, (wchar_t)0x10C5, (wchar_t)0x1E00, (wchar_t)0x1E02, (wchar_t)0x1E04, (wchar_t)0x1E06, (wchar_t)0x1E08, (wchar_t)0x1E0A, (wchar_t)0x1E0C,
193   (wchar_t)0x1E0E, (wchar_t)0x1E10, (wchar_t)0x1E12, (wchar_t)0x1E14, (wchar_t)0x1E16, (wchar_t)0x1E18, (wchar_t)0x1E1A, (wchar_t)0x1E1C, (wchar_t)0x1E1E,
194   (wchar_t)0x1E20, (wchar_t)0x1E22, (wchar_t)0x1E24, (wchar_t)0x1E26, (wchar_t)0x1E28, (wchar_t)0x1E2A, (wchar_t)0x1E2C, (wchar_t)0x1E2E, (wchar_t)0x1E30,
195   (wchar_t)0x1E32, (wchar_t)0x1E34, (wchar_t)0x1E36, (wchar_t)0x1E38, (wchar_t)0x1E3A, (wchar_t)0x1E3C, (wchar_t)0x1E3E, (wchar_t)0x1E40, (wchar_t)0x1E42,
196   (wchar_t)0x1E44, (wchar_t)0x1E46, (wchar_t)0x1E48, (wchar_t)0x1E4A, (wchar_t)0x1E4C, (wchar_t)0x1E4E, (wchar_t)0x1E50, (wchar_t)0x1E52, (wchar_t)0x1E54,
197   (wchar_t)0x1E56, (wchar_t)0x1E58, (wchar_t)0x1E5A, (wchar_t)0x1E5C, (wchar_t)0x1E5E, (wchar_t)0x1E60, (wchar_t)0x1E62, (wchar_t)0x1E64, (wchar_t)0x1E66,
198   (wchar_t)0x1E68, (wchar_t)0x1E6A, (wchar_t)0x1E6C, (wchar_t)0x1E6E, (wchar_t)0x1E70, (wchar_t)0x1E72, (wchar_t)0x1E74, (wchar_t)0x1E76, (wchar_t)0x1E78,
199   (wchar_t)0x1E7A, (wchar_t)0x1E7C, (wchar_t)0x1E7E, (wchar_t)0x1E80, (wchar_t)0x1E82, (wchar_t)0x1E84, (wchar_t)0x1E86, (wchar_t)0x1E88, (wchar_t)0x1E8A,
200   (wchar_t)0x1E8C, (wchar_t)0x1E8E, (wchar_t)0x1E90, (wchar_t)0x1E92, (wchar_t)0x1E94, (wchar_t)0x1EA0, (wchar_t)0x1EA2, (wchar_t)0x1EA4, (wchar_t)0x1EA6,
201   (wchar_t)0x1EA8, (wchar_t)0x1EAA, (wchar_t)0x1EAC, (wchar_t)0x1EAE, (wchar_t)0x1EB0, (wchar_t)0x1EB2, (wchar_t)0x1EB4, (wchar_t)0x1EB6, (wchar_t)0x1EB8,
202   (wchar_t)0x1EBA, (wchar_t)0x1EBC, (wchar_t)0x1EBE, (wchar_t)0x1EC0, (wchar_t)0x1EC2, (wchar_t)0x1EC4, (wchar_t)0x1EC6, (wchar_t)0x1EC8, (wchar_t)0x1ECA,
203   (wchar_t)0x1ECC, (wchar_t)0x1ECE, (wchar_t)0x1ED0, (wchar_t)0x1ED2, (wchar_t)0x1ED4, (wchar_t)0x1ED6, (wchar_t)0x1ED8, (wchar_t)0x1EDA, (wchar_t)0x1EDC,
204   (wchar_t)0x1EDE, (wchar_t)0x1EE0, (wchar_t)0x1EE2, (wchar_t)0x1EE4, (wchar_t)0x1EE6, (wchar_t)0x1EE8, (wchar_t)0x1EEA, (wchar_t)0x1EEC, (wchar_t)0x1EEE,
205   (wchar_t)0x1EF0, (wchar_t)0x1EF2, (wchar_t)0x1EF4, (wchar_t)0x1EF6, (wchar_t)0x1EF8, (wchar_t)0x1F08, (wchar_t)0x1F09, (wchar_t)0x1F0A, (wchar_t)0x1F0B,
206   (wchar_t)0x1F0C, (wchar_t)0x1F0D, (wchar_t)0x1F0E, (wchar_t)0x1F0F, (wchar_t)0x1F18, (wchar_t)0x1F19, (wchar_t)0x1F1A, (wchar_t)0x1F1B, (wchar_t)0x1F1C,
207   (wchar_t)0x1F1D, (wchar_t)0x1F28, (wchar_t)0x1F29, (wchar_t)0x1F2A, (wchar_t)0x1F2B, (wchar_t)0x1F2C, (wchar_t)0x1F2D, (wchar_t)0x1F2E, (wchar_t)0x1F2F,
208   (wchar_t)0x1F38, (wchar_t)0x1F39, (wchar_t)0x1F3A, (wchar_t)0x1F3B, (wchar_t)0x1F3C, (wchar_t)0x1F3D, (wchar_t)0x1F3E, (wchar_t)0x1F3F, (wchar_t)0x1F48,
209   (wchar_t)0x1F49, (wchar_t)0x1F4A, (wchar_t)0x1F4B, (wchar_t)0x1F4C, (wchar_t)0x1F4D, (wchar_t)0x1F59, (wchar_t)0x1F5B, (wchar_t)0x1F5D, (wchar_t)0x1F5F,
210   (wchar_t)0x1F68, (wchar_t)0x1F69, (wchar_t)0x1F6A, (wchar_t)0x1F6B, (wchar_t)0x1F6C, (wchar_t)0x1F6D, (wchar_t)0x1F6E, (wchar_t)0x1F6F, (wchar_t)0x1F88,
211   (wchar_t)0x1F89, (wchar_t)0x1F8A, (wchar_t)0x1F8B, (wchar_t)0x1F8C, (wchar_t)0x1F8D, (wchar_t)0x1F8E, (wchar_t)0x1F8F, (wchar_t)0x1F98, (wchar_t)0x1F99,
212   (wchar_t)0x1F9A, (wchar_t)0x1F9B, (wchar_t)0x1F9C, (wchar_t)0x1F9D, (wchar_t)0x1F9E, (wchar_t)0x1F9F, (wchar_t)0x1FA8, (wchar_t)0x1FA9, (wchar_t)0x1FAA,
213   (wchar_t)0x1FAB, (wchar_t)0x1FAC, (wchar_t)0x1FAD, (wchar_t)0x1FAE, (wchar_t)0x1FAF, (wchar_t)0x1FB8, (wchar_t)0x1FB9, (wchar_t)0x1FD8, (wchar_t)0x1FD9,
214   (wchar_t)0x1FE8, (wchar_t)0x1FE9, (wchar_t)0x24B6, (wchar_t)0x24B7, (wchar_t)0x24B8, (wchar_t)0x24B9, (wchar_t)0x24BA, (wchar_t)0x24BB, (wchar_t)0x24BC,
215   (wchar_t)0x24BD, (wchar_t)0x24BE, (wchar_t)0x24BF, (wchar_t)0x24C0, (wchar_t)0x24C1, (wchar_t)0x24C2, (wchar_t)0x24C3, (wchar_t)0x24C4, (wchar_t)0x24C5,
216   (wchar_t)0x24C6, (wchar_t)0x24C7, (wchar_t)0x24C8, (wchar_t)0x24C9, (wchar_t)0x24CA, (wchar_t)0x24CB, (wchar_t)0x24CC, (wchar_t)0x24CD, (wchar_t)0x24CE,
217   (wchar_t)0x24CF, (wchar_t)0xFF21, (wchar_t)0xFF22, (wchar_t)0xFF23, (wchar_t)0xFF24, (wchar_t)0xFF25, (wchar_t)0xFF26, (wchar_t)0xFF27, (wchar_t)0xFF28,
218   (wchar_t)0xFF29, (wchar_t)0xFF2A, (wchar_t)0xFF2B, (wchar_t)0xFF2C, (wchar_t)0xFF2D, (wchar_t)0xFF2E, (wchar_t)0xFF2F, (wchar_t)0xFF30, (wchar_t)0xFF31,
219   (wchar_t)0xFF32, (wchar_t)0xFF33, (wchar_t)0xFF34, (wchar_t)0xFF35, (wchar_t)0xFF36, (wchar_t)0xFF37, (wchar_t)0xFF38, (wchar_t)0xFF39, (wchar_t)0xFF3A
220 };
221 
222 
FormatV(const char * fmt,va_list args)223 std::string StringUtils::FormatV(const char *fmt, va_list args)
224 {
225   if (!fmt || !fmt[0])
226     return "";
227 
228   int size = FORMAT_BLOCK_SIZE;
229   va_list argCopy;
230 
231   while (true)
232   {
233     char *cstr = reinterpret_cast<char*>(malloc(sizeof(char) * size));
234     if (!cstr)
235       return "";
236 
237     va_copy(argCopy, args);
238     int nActual = vsnprintf(cstr, size, fmt, argCopy);
239     va_end(argCopy);
240 
241     if (nActual > -1 && nActual < size) // We got a valid result
242     {
243       std::string str(cstr, nActual);
244       free(cstr);
245       return str;
246     }
247     free(cstr);
248 #ifndef TARGET_WINDOWS
249     if (nActual > -1)                   // Exactly what we will need (glibc 2.1)
250       size = nActual + 1;
251     else                                // Let's try to double the size (glibc 2.0)
252       size *= 2;
253 #else  // TARGET_WINDOWS
254     va_copy(argCopy, args);
255     size = _vscprintf(fmt, argCopy);
256     va_end(argCopy);
257     if (size < 0)
258       return "";
259     else
260       size++; // increment for null-termination
261 #endif // TARGET_WINDOWS
262   }
263 
264   return ""; // unreachable
265 }
266 
FormatV(const wchar_t * fmt,va_list args)267 std::wstring StringUtils::FormatV(const wchar_t *fmt, va_list args)
268 {
269   if (!fmt || !fmt[0])
270     return L"";
271 
272   int size = FORMAT_BLOCK_SIZE;
273   va_list argCopy;
274 
275   while (true)
276   {
277     wchar_t *cstr = reinterpret_cast<wchar_t*>(malloc(sizeof(wchar_t) * size));
278     if (!cstr)
279       return L"";
280 
281     va_copy(argCopy, args);
282     int nActual = vswprintf(cstr, size, fmt, argCopy);
283     va_end(argCopy);
284 
285     if (nActual > -1 && nActual < size) // We got a valid result
286     {
287       std::wstring str(cstr, nActual);
288       free(cstr);
289       return str;
290     }
291     free(cstr);
292 
293 #ifndef TARGET_WINDOWS
294     if (nActual > -1)                   // Exactly what we will need (glibc 2.1)
295       size = nActual + 1;
296     else                                // Let's try to double the size (glibc 2.0)
297       size *= 2;
298 #else  // TARGET_WINDOWS
299     va_copy(argCopy, args);
300     size = _vscwprintf(fmt, argCopy);
301     va_end(argCopy);
302     if (size < 0)
303       return L"";
304     else
305       size++; // increment for null-termination
306 #endif // TARGET_WINDOWS
307   }
308 
309   return L"";
310 }
311 
// Three-way comparison callback in the shape bsearch()/qsort() expect.
// Both arguments must point at a wchar_t. Returns -1, 0 or 1 when the first
// value is respectively smaller than, equal to or greater than the second.
int compareWchar (const void* a, const void* b)
{
  const wchar_t lhs = *static_cast<const wchar_t*>(a);
  const wchar_t rhs = *static_cast<const wchar_t*>(b);

  // (lhs > rhs) and (lhs < rhs) are each 0 or 1, so the difference is -1, 0 or 1.
  return static_cast<int>(lhs > rhs) - static_cast<int>(lhs < rhs);
}
320 
tolowerUnicode(const wchar_t & c)321 wchar_t tolowerUnicode(const wchar_t& c)
322 {
323   wchar_t* p = (wchar_t*) bsearch (&c, unicode_uppers, sizeof(unicode_uppers) / sizeof(wchar_t), sizeof(wchar_t), compareWchar);
324   if (p)
325     return *(unicode_lowers + (p - unicode_uppers));
326 
327   return c;
328 }
329 
toupperUnicode(const wchar_t & c)330 wchar_t toupperUnicode(const wchar_t& c)
331 {
332   wchar_t* p = (wchar_t*) bsearch (&c, unicode_lowers, sizeof(unicode_lowers) / sizeof(wchar_t), sizeof(wchar_t), compareWchar);
333   if (p)
334     return *(unicode_uppers + (p - unicode_lowers));
335 
336   return c;
337 }
338 
ToUpper(std::string & str)339 void StringUtils::ToUpper(std::string &str)
340 {
341   std::transform(str.begin(), str.end(), str.begin(), ::toupper);
342 }
343 
ToUpper(std::wstring & str)344 void StringUtils::ToUpper(std::wstring &str)
345 {
346   transform(str.begin(), str.end(), str.begin(), toupperUnicode);
347 }
348 
ToLower(std::string & str)349 void StringUtils::ToLower(std::string &str)
350 {
351   transform(str.begin(), str.end(), str.begin(), ::tolower);
352 }
353 
ToLower(std::wstring & str)354 void StringUtils::ToLower(std::wstring &str)
355 {
356   transform(str.begin(), str.end(), str.begin(), tolowerUnicode);
357 }
358 
ToCapitalize(std::string & str)359 void StringUtils::ToCapitalize(std::string &str)
360 {
361   std::wstring wstr;
362   g_charsetConverter.utf8ToW(str, wstr);
363   ToCapitalize(wstr);
364   g_charsetConverter.wToUTF8(wstr, str);
365 }
366 
ToCapitalize(std::wstring & str)367 void StringUtils::ToCapitalize(std::wstring &str)
368 {
369   const std::locale& loc = g_langInfo.GetSystemLocale();
370   bool isFirstLetter = true;
371   for (std::wstring::iterator it = str.begin(); it < str.end(); ++it)
372   {
373     // capitalize after spaces and punctuation characters (except apostrophes)
374     if (std::isspace(*it, loc) || (std::ispunct(*it, loc) && *it != '\''))
375       isFirstLetter = true;
376     else if (isFirstLetter)
377     {
378       *it = std::toupper(*it, loc);
379       isFirstLetter = false;
380     }
381   }
382 }
383 
EqualsNoCase(const std::string & str1,const std::string & str2)384 bool StringUtils::EqualsNoCase(const std::string &str1, const std::string &str2)
385 {
386   // before we do the char-by-char comparison, first compare sizes of both strings.
387   // This led to a 33% improvement in benchmarking on average. (size() just returns a member of std::string)
388   if (str1.size() != str2.size())
389     return false;
390   return EqualsNoCase(str1.c_str(), str2.c_str());
391 }
392 
EqualsNoCase(const std::string & str1,const char * s2)393 bool StringUtils::EqualsNoCase(const std::string &str1, const char *s2)
394 {
395   return EqualsNoCase(str1.c_str(), s2);
396 }
397 
EqualsNoCase(const char * s1,const char * s2)398 bool StringUtils::EqualsNoCase(const char *s1, const char *s2)
399 {
400   char c2; // we need only one char outside the loop
401   do
402   {
403     const char c1 = *s1++; // const local variable should help compiler to optimize
404     c2 = *s2++;
405     if (c1 != c2 && ::tolower(c1) != ::tolower(c2)) // This includes the possibility that one of the characters is the null-terminator, which implies a string mismatch.
406       return false;
407   } while (c2 != '\0'); // At this point, we know c1 == c2, so there's no need to test them both.
408   return true;
409 }
410 
CompareNoCase(const std::string & str1,const std::string & str2,size_t n)411 int StringUtils::CompareNoCase(const std::string& str1, const std::string& str2, size_t n /* = 0 */)
412 {
413   return CompareNoCase(str1.c_str(), str2.c_str(), n);
414 }
415 
CompareNoCase(const char * s1,const char * s2,size_t n)416 int StringUtils::CompareNoCase(const char* s1, const char* s2, size_t n /* = 0 */)
417 {
418   char c2; // we need only one char outside the loop
419   size_t index = 0;
420   do
421   {
422     const char c1 = *s1++; // const local variable should help compiler to optimize
423     c2 = *s2++;
424     index++;
425     if (c1 != c2 && ::tolower(c1) != ::tolower(c2)) // This includes the possibility that one of the characters is the null-terminator, which implies a string mismatch.
426       return ::tolower(c1) - ::tolower(c2);
427   } while (c2 != '\0' &&
428            index != n); // At this point, we know c1 == c2, so there's no need to test them both.
429   return 0;
430 }
431 
Left(const std::string & str,size_t count)432 std::string StringUtils::Left(const std::string &str, size_t count)
433 {
434   count = std::max((size_t)0, std::min(count, str.size()));
435   return str.substr(0, count);
436 }
437 
Mid(const std::string & str,size_t first,size_t count)438 std::string StringUtils::Mid(const std::string &str, size_t first, size_t count /* = string::npos */)
439 {
440   if (first + count > str.size())
441     count = str.size() - first;
442 
443   if (first > str.size())
444     return std::string();
445 
446   assert(first + count <= str.size());
447 
448   return str.substr(first, count);
449 }
450 
Right(const std::string & str,size_t count)451 std::string StringUtils::Right(const std::string &str, size_t count)
452 {
453   count = std::max((size_t)0, std::min(count, str.size()));
454   return str.substr(str.size() - count);
455 }
456 
Trim(std::string & str)457 std::string& StringUtils::Trim(std::string &str)
458 {
459   TrimLeft(str);
460   return TrimRight(str);
461 }
462 
Trim(std::string & str,const char * const chars)463 std::string& StringUtils::Trim(std::string &str, const char* const chars)
464 {
465   TrimLeft(str, chars);
466   return TrimRight(str, chars);
467 }
468 
// Whitespace test that is safe to run over the bytes of a UTF-8 string.
// Bytes with the high bit set (lead/continuation bytes of multi-byte UTF-8
// sequences) are never reported as whitespace; only 7-bit values are handed
// to ::isspace(). Without this guard the "TrimX" functions failed on Win32
// and OS X with UTF-8 strings.
// Returns 1 for whitespace, 0 otherwise.
static int isspace_c(char c)
{
  const bool isSevenBit = (c & 0x80) == 0;
  if (!isSevenBit)
    return 0;
  return ::isspace(c) ? 1 : 0;
}
475 
TrimLeft(std::string & str)476 std::string& StringUtils::TrimLeft(std::string &str)
477 {
478   str.erase(str.begin(),
479             std::find_if(str.begin(), str.end(), [](char s) { return isspace_c(s) == 0; }));
480   return str;
481 }
482 
TrimLeft(std::string & str,const char * const chars)483 std::string& StringUtils::TrimLeft(std::string &str, const char* const chars)
484 {
485   size_t nidx = str.find_first_not_of(chars);
486   str.erase(0, nidx);
487   return str;
488 }
489 
TrimRight(std::string & str)490 std::string& StringUtils::TrimRight(std::string &str)
491 {
492   str.erase(std::find_if(str.rbegin(), str.rend(), [](char s) { return isspace_c(s) == 0; }).base(),
493             str.end());
494   return str;
495 }
496 
TrimRight(std::string & str,const char * const chars)497 std::string& StringUtils::TrimRight(std::string &str, const char* const chars)
498 {
499   size_t nidx = str.find_last_not_of(chars);
500   str.erase(str.npos == nidx ? 0 : ++nidx);
501   return str;
502 }
503 
ReturnDigits(const std::string & str)504 int StringUtils::ReturnDigits(const std::string& str)
505 {
506   std::stringstream ss;
507   for (const auto& character : str)
508   {
509     if (isdigit(character))
510       ss << character;
511   }
512   return atoi(ss.str().c_str());
513 }
514 
RemoveDuplicatedSpacesAndTabs(std::string & str)515 std::string& StringUtils::RemoveDuplicatedSpacesAndTabs(std::string& str)
516 {
517   std::string::iterator it = str.begin();
518   bool onSpace = false;
519   while(it != str.end())
520   {
521     if (*it == '\t')
522       *it = ' ';
523 
524     if (*it == ' ')
525     {
526       if (onSpace)
527       {
528         it = str.erase(it);
529         continue;
530       }
531       else
532         onSpace = true;
533     }
534     else
535       onSpace = false;
536 
537     ++it;
538   }
539   return str;
540 }
541 
Replace(std::string & str,char oldChar,char newChar)542 int StringUtils::Replace(std::string &str, char oldChar, char newChar)
543 {
544   int replacedChars = 0;
545   for (std::string::iterator it = str.begin(); it != str.end(); ++it)
546   {
547     if (*it == oldChar)
548     {
549       *it = newChar;
550       replacedChars++;
551     }
552   }
553 
554   return replacedChars;
555 }
556 
Replace(std::string & str,const std::string & oldStr,const std::string & newStr)557 int StringUtils::Replace(std::string &str, const std::string &oldStr, const std::string &newStr)
558 {
559   if (oldStr.empty())
560     return 0;
561 
562   int replacedChars = 0;
563   size_t index = 0;
564 
565   while (index < str.size() && (index = str.find(oldStr, index)) != std::string::npos)
566   {
567     str.replace(index, oldStr.size(), newStr);
568     index += newStr.size();
569     replacedChars++;
570   }
571 
572   return replacedChars;
573 }
574 
Replace(std::wstring & str,const std::wstring & oldStr,const std::wstring & newStr)575 int StringUtils::Replace(std::wstring &str, const std::wstring &oldStr, const std::wstring &newStr)
576 {
577   if (oldStr.empty())
578     return 0;
579 
580   int replacedChars = 0;
581   size_t index = 0;
582 
583   while (index < str.size() && (index = str.find(oldStr, index)) != std::string::npos)
584   {
585     str.replace(index, oldStr.size(), newStr);
586     index += newStr.size();
587     replacedChars++;
588   }
589 
590   return replacedChars;
591 }
592 
StartsWith(const std::string & str1,const std::string & str2)593 bool StringUtils::StartsWith(const std::string &str1, const std::string &str2)
594 {
595   return str1.compare(0, str2.size(), str2) == 0;
596 }
597 
StartsWith(const std::string & str1,const char * s2)598 bool StringUtils::StartsWith(const std::string &str1, const char *s2)
599 {
600   return StartsWith(str1.c_str(), s2);
601 }
602 
StartsWith(const char * s1,const char * s2)603 bool StringUtils::StartsWith(const char *s1, const char *s2)
604 {
605   while (*s2 != '\0')
606   {
607     if (*s1 != *s2)
608       return false;
609     s1++;
610     s2++;
611   }
612   return true;
613 }
614 
StartsWithNoCase(const std::string & str1,const std::string & str2)615 bool StringUtils::StartsWithNoCase(const std::string &str1, const std::string &str2)
616 {
617   return StartsWithNoCase(str1.c_str(), str2.c_str());
618 }
619 
StartsWithNoCase(const std::string & str1,const char * s2)620 bool StringUtils::StartsWithNoCase(const std::string &str1, const char *s2)
621 {
622   return StartsWithNoCase(str1.c_str(), s2);
623 }
624 
StartsWithNoCase(const char * s1,const char * s2)625 bool StringUtils::StartsWithNoCase(const char *s1, const char *s2)
626 {
627   while (*s2 != '\0')
628   {
629     if (::tolower(*s1) != ::tolower(*s2))
630       return false;
631     s1++;
632     s2++;
633   }
634   return true;
635 }
636 
EndsWith(const std::string & str1,const std::string & str2)637 bool StringUtils::EndsWith(const std::string &str1, const std::string &str2)
638 {
639   if (str1.size() < str2.size())
640     return false;
641   return str1.compare(str1.size() - str2.size(), str2.size(), str2) == 0;
642 }
643 
EndsWith(const std::string & str1,const char * s2)644 bool StringUtils::EndsWith(const std::string &str1, const char *s2)
645 {
646   size_t len2 = strlen(s2);
647   if (str1.size() < len2)
648     return false;
649   return str1.compare(str1.size() - len2, len2, s2) == 0;
650 }
651 
EndsWithNoCase(const std::string & str1,const std::string & str2)652 bool StringUtils::EndsWithNoCase(const std::string &str1, const std::string &str2)
653 {
654   if (str1.size() < str2.size())
655     return false;
656   const char *s1 = str1.c_str() + str1.size() - str2.size();
657   const char *s2 = str2.c_str();
658   while (*s2 != '\0')
659   {
660     if (::tolower(*s1) != ::tolower(*s2))
661       return false;
662     s1++;
663     s2++;
664   }
665   return true;
666 }
667 
EndsWithNoCase(const std::string & str1,const char * s2)668 bool StringUtils::EndsWithNoCase(const std::string &str1, const char *s2)
669 {
670   size_t len2 = strlen(s2);
671   if (str1.size() < len2)
672     return false;
673   const char *s1 = str1.c_str() + str1.size() - len2;
674   while (*s2 != '\0')
675   {
676     if (::tolower(*s1) != ::tolower(*s2))
677       return false;
678     s1++;
679     s2++;
680   }
681   return true;
682 }
683 
Split(const std::string & input,const std::string & delimiter,unsigned int iMaxStrings)684 std::vector<std::string> StringUtils::Split(const std::string& input, const std::string& delimiter, unsigned int iMaxStrings)
685 {
686   std::vector<std::string> result;
687   SplitTo(std::back_inserter(result), input, delimiter, iMaxStrings);
688   return result;
689 }
690 
Split(const std::string & input,const char delimiter,size_t iMaxStrings)691 std::vector<std::string> StringUtils::Split(const std::string& input, const char delimiter, size_t iMaxStrings)
692 {
693   std::vector<std::string> result;
694   SplitTo(std::back_inserter(result), input, delimiter, iMaxStrings);
695   return result;
696 }
697 
Split(const std::string & input,const std::vector<std::string> & delimiters)698 std::vector<std::string> StringUtils::Split(const std::string& input, const std::vector<std::string>& delimiters)
699 {
700   std::vector<std::string> result;
701   SplitTo(std::back_inserter(result), input, delimiters);
702   return result;
703 }
704 
SplitMulti(const std::vector<std::string> & input,const std::vector<std::string> & delimiters,size_t iMaxStrings)705 std::vector<std::string> StringUtils::SplitMulti(const std::vector<std::string>& input,
706                                                  const std::vector<std::string>& delimiters,
707                                                  size_t iMaxStrings /* = 0 */)
708 {
709   if (input.empty())
710     return std::vector<std::string>();
711 
712   std::vector<std::string> results(input);
713 
714   if (delimiters.empty() || (iMaxStrings > 0 && iMaxStrings <= input.size()))
715     return results;
716 
717   std::vector<std::string> strings1;
718   if (iMaxStrings == 0)
719   {
720     for (size_t di = 0; di < delimiters.size(); di++)
721     {
722       for (size_t i = 0; i < results.size(); i++)
723       {
724         std::vector<std::string> substrings = StringUtils::Split(results[i], delimiters[di]);
725         for (size_t j = 0; j < substrings.size(); j++)
726           strings1.push_back(substrings[j]);
727       }
728       results = strings1;
729       strings1.clear();
730     }
731     return results;
732   }
733 
734   // Control the number of strings input is split into, keeping the original strings.
735   // Note iMaxStrings > input.size()
736   int64_t iNew = iMaxStrings - results.size();
737   for (size_t di = 0; di < delimiters.size(); di++)
738   {
739     for (size_t i = 0; i < results.size(); i++)
740     {
741       if (iNew > 0)
742       {
743         std::vector<std::string> substrings = StringUtils::Split(results[i], delimiters[di], iNew + 1);
744         iNew = iNew - substrings.size() + 1;
745         for (size_t j = 0; j < substrings.size(); j++)
746           strings1.push_back(substrings[j]);
747       }
748       else
749         strings1.push_back(results[i]);
750     }
751     results = strings1;
752     iNew = iMaxStrings - results.size();
753     strings1.clear();
754     if ((iNew <= 0))
755       break;  //Stop trying any more delimiters
756   }
757   return results;
758 }
759 
760 // returns the number of occurrences of strFind in strInput.
FindNumber(const std::string & strInput,const std::string & strFind)761 int StringUtils::FindNumber(const std::string& strInput, const std::string &strFind)
762 {
763   size_t pos = strInput.find(strFind, 0);
764   int numfound = 0;
765   while (pos != std::string::npos)
766   {
767     numfound++;
768     pos = strInput.find(strFind, pos + 1);
769   }
770   return numfound;
771 }
772 
773 // Plane maps for MySQL utf8_general_ci (now known as utf8mb3_general_ci) collation
774 // Derived from https://github.com/MariaDB/server/blob/10.5/strings/ctype-utf8.c
775 
776 // clang-format off
// Weight table for plane 0x00: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0000..U+00FF - the lookup code is outside this excerpt, confirm.
// Under that reading, lowercase ASCII letters and accented Latin-1 letters
// carry the value of their unaccented uppercase base letter (entries 0x61 and
// 0xE0 both hold 0x0041); an entry equal to its own index is left unchanged.
static const uint16_t plane00[] = {
  0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
  0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
  0x0060, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
  0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
  0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x039C, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
  0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x00C6, 0x0043, 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049,
  0x00D0, 0x004E, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x00D7, 0x00D8, 0x0055, 0x0055, 0x0055, 0x0055, 0x0059, 0x00DE, 0x0053,
  0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x00C6, 0x0043, 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049,
  0x00D0, 0x004E, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x00F7, 0x00D8, 0x0055, 0x0055, 0x0055, 0x0055, 0x0059, 0x00DE, 0x0059
};
795 
// Weight table for plane 0x01: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0100..U+01FF - the lookup code is outside this excerpt, confirm.
// Many entries hold a plain ASCII capital (the first six are all 0x0041);
// an entry equal to 0x0100 + index is left unchanged.
static const uint16_t plane01[] = {
  0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0043, 0x0043, 0x0043, 0x0043, 0x0043, 0x0043, 0x0043, 0x0043, 0x0044, 0x0044,
  0x0110, 0x0110, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0047, 0x0047, 0x0047, 0x0047,
  0x0047, 0x0047, 0x0047, 0x0047, 0x0048, 0x0048, 0x0126, 0x0126, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049, 0x0049,
  0x0049, 0x0049, 0x0132, 0x0132, 0x004A, 0x004A, 0x004B, 0x004B, 0x0138, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x013F,
  0x013F, 0x0141, 0x0141, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x0149, 0x014A, 0x014A, 0x004F, 0x004F, 0x004F, 0x004F,
  0x004F, 0x004F, 0x0152, 0x0152, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053,
  0x0053, 0x0053, 0x0054, 0x0054, 0x0054, 0x0054, 0x0166, 0x0166, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055,
  0x0055, 0x0055, 0x0055, 0x0055, 0x0057, 0x0057, 0x0059, 0x0059, 0x0059, 0x005A, 0x005A, 0x005A, 0x005A, 0x005A, 0x005A, 0x0053,
  0x0180, 0x0181, 0x0182, 0x0182, 0x0184, 0x0184, 0x0186, 0x0187, 0x0187, 0x0189, 0x018A, 0x018B, 0x018B, 0x018D, 0x018E, 0x018F,
  0x0190, 0x0191, 0x0191, 0x0193, 0x0194, 0x01F6, 0x0196, 0x0197, 0x0198, 0x0198, 0x019A, 0x019B, 0x019C, 0x019D, 0x019E, 0x019F,
  0x004F, 0x004F, 0x01A2, 0x01A2, 0x01A4, 0x01A4, 0x01A6, 0x01A7, 0x01A7, 0x01A9, 0x01AA, 0x01AB, 0x01AC, 0x01AC, 0x01AE, 0x0055,
  0x0055, 0x01B1, 0x01B2, 0x01B3, 0x01B3, 0x01B5, 0x01B5, 0x01B7, 0x01B8, 0x01B8, 0x01BA, 0x01BB, 0x01BC, 0x01BC, 0x01BE, 0x01F7,
  0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C4, 0x01C4, 0x01C4, 0x01C7, 0x01C7, 0x01C7, 0x01CA, 0x01CA, 0x01CA, 0x0041, 0x0041, 0x0049,
  0x0049, 0x004F, 0x004F, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x018E, 0x0041, 0x0041,
  0x0041, 0x0041, 0x00C6, 0x00C6, 0x01E4, 0x01E4, 0x0047, 0x0047, 0x004B, 0x004B, 0x004F, 0x004F, 0x004F, 0x004F, 0x01B7, 0x01B7,
  0x004A, 0x01F1, 0x01F1, 0x01F1, 0x0047, 0x0047, 0x01F6, 0x01F7, 0x004E, 0x004E, 0x0041, 0x0041, 0x00C6, 0x00C6, 0x00D8, 0x00D8
};
814 
// Weight table for plane 0x02: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0200..U+02FF - the lookup code is outside this excerpt, confirm.
// The leading rows mostly hold plain ASCII capitals (the first four entries
// are 0x0041); from 0xA0 onwards every entry equals 0x0200 + index, i.e. is
// left unchanged.
static const uint16_t plane02[] = {
  0x0041, 0x0041, 0x0041, 0x0041, 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, 0x004F, 0x004F, 0x004F, 0x004F,
  0x0052, 0x0052, 0x0052, 0x0052, 0x0055, 0x0055, 0x0055, 0x0055, 0x0053, 0x0053, 0x0054, 0x0054, 0x021C, 0x021C, 0x0048, 0x0048,
  0x0220, 0x0221, 0x0222, 0x0222, 0x0224, 0x0224, 0x0041, 0x0041, 0x0045, 0x0045, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F,
  0x004F, 0x004F, 0x0059, 0x0059, 0x0234, 0x0235, 0x0236, 0x0237, 0x0238, 0x0239, 0x023A, 0x023B, 0x023C, 0x023D, 0x023E, 0x023F,
  0x0240, 0x0241, 0x0242, 0x0243, 0x0244, 0x0245, 0x0246, 0x0247, 0x0248, 0x0249, 0x024A, 0x024B, 0x024C, 0x024D, 0x024E, 0x024F,
  0x0250, 0x0251, 0x0252, 0x0181, 0x0186, 0x0255, 0x0189, 0x018A, 0x0258, 0x018F, 0x025A, 0x0190, 0x025C, 0x025D, 0x025E, 0x025F,
  0x0193, 0x0261, 0x0262, 0x0194, 0x0264, 0x0265, 0x0266, 0x0267, 0x0197, 0x0196, 0x026A, 0x026B, 0x026C, 0x026D, 0x026E, 0x019C,
  0x0270, 0x0271, 0x019D, 0x0273, 0x0274, 0x019F, 0x0276, 0x0277, 0x0278, 0x0279, 0x027A, 0x027B, 0x027C, 0x027D, 0x027E, 0x027F,
  0x01A6, 0x0281, 0x0282, 0x01A9, 0x0284, 0x0285, 0x0286, 0x0287, 0x01AE, 0x0289, 0x01B1, 0x01B2, 0x028C, 0x028D, 0x028E, 0x028F,
  0x0290, 0x0291, 0x01B7, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, 0x0298, 0x0299, 0x029A, 0x029B, 0x029C, 0x029D, 0x029E, 0x029F,
  0x02A0, 0x02A1, 0x02A2, 0x02A3, 0x02A4, 0x02A5, 0x02A6, 0x02A7, 0x02A8, 0x02A9, 0x02AA, 0x02AB, 0x02AC, 0x02AD, 0x02AE, 0x02AF,
  0x02B0, 0x02B1, 0x02B2, 0x02B3, 0x02B4, 0x02B5, 0x02B6, 0x02B7, 0x02B8, 0x02B9, 0x02BA, 0x02BB, 0x02BC, 0x02BD, 0x02BE, 0x02BF,
  0x02C0, 0x02C1, 0x02C2, 0x02C3, 0x02C4, 0x02C5, 0x02C6, 0x02C7, 0x02C8, 0x02C9, 0x02CA, 0x02CB, 0x02CC, 0x02CD, 0x02CE, 0x02CF,
  0x02D0, 0x02D1, 0x02D2, 0x02D3, 0x02D4, 0x02D5, 0x02D6, 0x02D7, 0x02D8, 0x02D9, 0x02DA, 0x02DB, 0x02DC, 0x02DD, 0x02DE, 0x02DF,
  0x02E0, 0x02E1, 0x02E2, 0x02E3, 0x02E4, 0x02E5, 0x02E6, 0x02E7, 0x02E8, 0x02E9, 0x02EA, 0x02EB, 0x02EC, 0x02ED, 0x02EE, 0x02EF,
  0x02F0, 0x02F1, 0x02F2, 0x02F3, 0x02F4, 0x02F5, 0x02F6, 0x02F7, 0x02F8, 0x02F9, 0x02FA, 0x02FB, 0x02FC, 0x02FD, 0x02FE, 0x02FF
};
833 
// Weight table for plane 0x03: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0300..U+03FF - the lookup code is outside this excerpt, confirm.
// Entries 0x00..0x7F equal 0x0300 + index except 0x45 (0x0399); in the upper
// half many entries hold a value from the 0x0391..0x03A9 range instead of
// their own index (e.g. entry 0xB1 holds 0x0391).
static const uint16_t plane03[] = {
  0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F,
  0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F,
  0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F,
  0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F,
  0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0399, 0x0346, 0x0347, 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F,
  0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F,
  0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F,
  0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F,
  0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0391, 0x0387, 0x0395, 0x0397, 0x0399, 0x038B, 0x039F, 0x038D, 0x03A5, 0x03A9,
  0x0399, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
  0x03A0, 0x03A1, 0x03A2, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x0399, 0x03A5, 0x0391, 0x0395, 0x0397, 0x0399,
  0x03A5, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
  0x03A0, 0x03A1, 0x03A3, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x0399, 0x03A5, 0x039F, 0x03A5, 0x03A9, 0x03CF,
  0x0392, 0x0398, 0x03D2, 0x03D2, 0x03D2, 0x03A6, 0x03A0, 0x03D7, 0x03D8, 0x03D9, 0x03DA, 0x03DA, 0x03DC, 0x03DC, 0x03DE, 0x03DE,
  0x03E0, 0x03E0, 0x03E2, 0x03E2, 0x03E4, 0x03E4, 0x03E6, 0x03E6, 0x03E8, 0x03E8, 0x03EA, 0x03EA, 0x03EC, 0x03EC, 0x03EE, 0x03EE,
  0x039A, 0x03A1, 0x03A3, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF
};
852 
// Weight table for plane 0x04: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0400..U+04FF - the lookup code is outside this excerpt, confirm.
// Rows 0x30 and 0x40 repeat the values of rows 0x10 and 0x20 (0x0410..0x042F),
// and row 0x50 repeats row 0x00; in the later rows pairs of neighbouring
// entries usually share one value.
static const uint16_t plane04[] = {
  0x0415, 0x0415, 0x0402, 0x0413, 0x0404, 0x0405, 0x0406, 0x0406, 0x0408, 0x0409, 0x040A, 0x040B, 0x041A, 0x0418, 0x0423, 0x040F,
  0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
  0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
  0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
  0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
  0x0415, 0x0415, 0x0402, 0x0413, 0x0404, 0x0405, 0x0406, 0x0406, 0x0408, 0x0409, 0x040A, 0x040B, 0x041A, 0x0418, 0x0423, 0x040F,
  0x0460, 0x0460, 0x0462, 0x0462, 0x0464, 0x0464, 0x0466, 0x0466, 0x0468, 0x0468, 0x046A, 0x046A, 0x046C, 0x046C, 0x046E, 0x046E,
  0x0470, 0x0470, 0x0472, 0x0472, 0x0474, 0x0474, 0x0474, 0x0474, 0x0478, 0x0478, 0x047A, 0x047A, 0x047C, 0x047C, 0x047E, 0x047E,
  0x0480, 0x0480, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048C, 0x048E, 0x048E,
  0x0490, 0x0490, 0x0492, 0x0492, 0x0494, 0x0494, 0x0496, 0x0496, 0x0498, 0x0498, 0x049A, 0x049A, 0x049C, 0x049C, 0x049E, 0x049E,
  0x04A0, 0x04A0, 0x04A2, 0x04A2, 0x04A4, 0x04A4, 0x04A6, 0x04A6, 0x04A8, 0x04A8, 0x04AA, 0x04AA, 0x04AC, 0x04AC, 0x04AE, 0x04AE,
  0x04B0, 0x04B0, 0x04B2, 0x04B2, 0x04B4, 0x04B4, 0x04B6, 0x04B6, 0x04B8, 0x04B8, 0x04BA, 0x04BA, 0x04BC, 0x04BC, 0x04BE, 0x04BE,
  0x04C0, 0x0416, 0x0416, 0x04C3, 0x04C3, 0x04C5, 0x04C6, 0x04C7, 0x04C7, 0x04C9, 0x04CA, 0x04CB, 0x04CB, 0x04CD, 0x04CE, 0x04CF,
  0x0410, 0x0410, 0x0410, 0x0410, 0x04D4, 0x04D4, 0x0415, 0x0415, 0x04D8, 0x04D8, 0x04D8, 0x04D8, 0x0416, 0x0416, 0x0417, 0x0417,
  0x04E0, 0x04E0, 0x0418, 0x0418, 0x0418, 0x0418, 0x041E, 0x041E, 0x04E8, 0x04E8, 0x04E8, 0x04E8, 0x042D, 0x042D, 0x0423, 0x0423,
  0x0423, 0x0423, 0x0423, 0x0423, 0x0427, 0x0427, 0x04F6, 0x04F7, 0x042B, 0x042B, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF
};
871 
// Weight table for plane 0x05: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+0500..U+05FF - the lookup code is outside this excerpt, confirm.
// Every entry equals 0x0500 + index except 0x61..0x86, which hold
// 0x0531..0x0556 (the values of entries 0x31..0x56).
static const uint16_t plane05[] = {
  0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F,
  0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F,
  0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F,
  0x0530, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053A, 0x053B, 0x053C, 0x053D, 0x053E, 0x053F,
  0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, 0x0548, 0x0549, 0x054A, 0x054B, 0x054C, 0x054D, 0x054E, 0x054F,
  0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0557, 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F,
  0x0560, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053A, 0x053B, 0x053C, 0x053D, 0x053E, 0x053F,
  0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, 0x0548, 0x0549, 0x054A, 0x054B, 0x054C, 0x054D, 0x054E, 0x054F,
  0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0587, 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F,
  0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F,
  0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF,
  0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
  0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF,
  0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
  0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF,
  0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF
};
890 
// Weight table for plane 0x1E: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+1E00..U+1EFF - the lookup code is outside this excerpt, confirm.
// Nearly every entry holds a plain ASCII capital (0x0041..0x005A); only the
// entries 0x9A, 0x9C..0x9F and 0xFA..0xFF equal 0x1E00 + index.
static const uint16_t plane1E[] = {
  0x0041, 0x0041, 0x0042, 0x0042, 0x0042, 0x0042, 0x0042, 0x0042, 0x0043, 0x0043, 0x0044, 0x0044, 0x0044, 0x0044, 0x0044, 0x0044,
  0x0044, 0x0044, 0x0044, 0x0044, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0046, 0x0046,
  0x0047, 0x0047, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0048, 0x0049, 0x0049, 0x0049, 0x0049,
  0x004B, 0x004B, 0x004B, 0x004B, 0x004B, 0x004B, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x004C, 0x004D, 0x004D,
  0x004D, 0x004D, 0x004D, 0x004D, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x004E, 0x004F, 0x004F, 0x004F, 0x004F,
  0x004F, 0x004F, 0x004F, 0x004F, 0x0050, 0x0050, 0x0050, 0x0050, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052, 0x0052,
  0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0053, 0x0054, 0x0054, 0x0054, 0x0054, 0x0054, 0x0054,
  0x0054, 0x0054, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0056, 0x0056, 0x0056, 0x0056,
  0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0057, 0x0058, 0x0058, 0x0058, 0x0058, 0x0059, 0x0059,
  0x005A, 0x005A, 0x005A, 0x005A, 0x005A, 0x005A, 0x0048, 0x0054, 0x0057, 0x0059, 0x1E9A, 0x0053, 0x1E9C, 0x1E9D, 0x1E9E, 0x1E9F,
  0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041,
  0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045,
  0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, 0x004F, 0x004F, 0x004F, 0x004F,
  0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F,
  0x004F, 0x004F, 0x004F, 0x004F, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055, 0x0055,
  0x0055, 0x0055, 0x0059, 0x0059, 0x0059, 0x0059, 0x0059, 0x0059, 0x0059, 0x0059, 0x1EFA, 0x1EFB, 0x1EFC, 0x1EFD, 0x1EFE, 0x1EFF
};
909 
// Weight table for plane 0x1F: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+1F00..U+1FFF - the lookup code is outside this excerpt, confirm.
// Most entries hold a value from the 0x0391..0x03A9 range (the first sixteen
// are all 0x0391); the remaining entries hold values in the 0x1Fxx range.
static const uint16_t plane1F[] = {
  0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391,
  0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0x1F16, 0x1F17, 0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0x0395, 0x1F1E, 0x1F1F,
  0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397,
  0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399,
  0x039F, 0x039F, 0x039F, 0x039F, 0x039F, 0x039F, 0x1F46, 0x1F47, 0x039F, 0x039F, 0x039F, 0x039F, 0x039F, 0x039F, 0x1F4E, 0x1F4F,
  0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x1F58, 0x03A5, 0x1F5A, 0x03A5, 0x1F5C, 0x03A5, 0x1F5E, 0x03A5,
  0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9,
  0x0391, 0x1FBB, 0x0395, 0x1FC9, 0x0397, 0x1FCB, 0x0399, 0x1FDB, 0x039F, 0x1FF9, 0x03A5, 0x1FEB, 0x03A9, 0x1FFB, 0x1F7E, 0x1F7F,
  0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391,
  0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397, 0x0397,
  0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9, 0x03A9,
  0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x1FB5, 0x0391, 0x0391, 0x0391, 0x0391, 0x0391, 0x1FBB, 0x0391, 0x1FBD, 0x0399, 0x1FBF,
  0x1FC0, 0x1FC1, 0x0397, 0x0397, 0x0397, 0x1FC5, 0x0397, 0x0397, 0x0395, 0x1FC9, 0x0397, 0x1FCB, 0x0397, 0x1FCD, 0x1FCE, 0x1FCF,
  0x0399, 0x0399, 0x0399, 0x1FD3, 0x1FD4, 0x1FD5, 0x0399, 0x0399, 0x0399, 0x0399, 0x0399, 0x1FDB, 0x1FDC, 0x1FDD, 0x1FDE, 0x1FDF,
  0x03A5, 0x03A5, 0x03A5, 0x1FE3, 0x03A1, 0x03A1, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x03A5, 0x1FEB, 0x03A1, 0x1FED, 0x1FEE, 0x1FEF,
  0x1FF0, 0x1FF1, 0x03A9, 0x03A9, 0x03A9, 0x1FF5, 0x03A9, 0x03A9, 0x039F, 0x1FF9, 0x03A9, 0x1FFB, 0x03A9, 0x1FFD, 0x1FFE, 0x1FFF
};
928 
// Weight table for plane 0x21: 256 entries, 16 per row.
// NOTE(review): presumably indexed by the low byte of the code point, i.e.
// covering U+2100..U+21FF - the lookup code is outside this excerpt, confirm.
// Every entry equals 0x2100 + index except 0x70..0x7F, which hold
// 0x2160..0x216F (the values of entries 0x60..0x6F).
static const uint16_t plane21[] = {
  0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F,
  0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F,
  0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F,
  0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F,
  0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F,
  0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F,
  0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216A, 0x216B, 0x216C, 0x216D, 0x216E, 0x216F,
  0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216A, 0x216B, 0x216C, 0x216D, 0x216E, 0x216F,
  0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F,
  0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F,
  0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF,
  0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF,
  0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF,
  0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, 0x21D8, 0x21D9, 0x21DA, 0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF,
  0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF,
  0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7, 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF
};
947 
// Collation weights for code points U+2400..U+24FF, indexed by the low byte of the code point.
// Every entry holds its own code point except indexes 0xD0..0xE9 (U+24D0..U+24E9), which hold
// the values U+24B6..U+24CF so that those characters compare equal to that range.
static const uint16_t plane24[] = {
  0x2400, 0x2401, 0x2402, 0x2403, 0x2404, 0x2405, 0x2406, 0x2407, 0x2408, 0x2409, 0x240A, 0x240B, 0x240C, 0x240D, 0x240E, 0x240F,
  0x2410, 0x2411, 0x2412, 0x2413, 0x2414, 0x2415, 0x2416, 0x2417, 0x2418, 0x2419, 0x241A, 0x241B, 0x241C, 0x241D, 0x241E, 0x241F,
  0x2420, 0x2421, 0x2422, 0x2423, 0x2424, 0x2425, 0x2426, 0x2427, 0x2428, 0x2429, 0x242A, 0x242B, 0x242C, 0x242D, 0x242E, 0x242F,
  0x2430, 0x2431, 0x2432, 0x2433, 0x2434, 0x2435, 0x2436, 0x2437, 0x2438, 0x2439, 0x243A, 0x243B, 0x243C, 0x243D, 0x243E, 0x243F,
  0x2440, 0x2441, 0x2442, 0x2443, 0x2444, 0x2445, 0x2446, 0x2447, 0x2448, 0x2449, 0x244A, 0x244B, 0x244C, 0x244D, 0x244E, 0x244F,
  0x2450, 0x2451, 0x2452, 0x2453, 0x2454, 0x2455, 0x2456, 0x2457, 0x2458, 0x2459, 0x245A, 0x245B, 0x245C, 0x245D, 0x245E, 0x245F,
  0x2460, 0x2461, 0x2462, 0x2463, 0x2464, 0x2465, 0x2466, 0x2467, 0x2468, 0x2469, 0x246A, 0x246B, 0x246C, 0x246D, 0x246E, 0x246F,
  0x2470, 0x2471, 0x2472, 0x2473, 0x2474, 0x2475, 0x2476, 0x2477, 0x2478, 0x2479, 0x247A, 0x247B, 0x247C, 0x247D, 0x247E, 0x247F,
  0x2480, 0x2481, 0x2482, 0x2483, 0x2484, 0x2485, 0x2486, 0x2487, 0x2488, 0x2489, 0x248A, 0x248B, 0x248C, 0x248D, 0x248E, 0x248F,
  0x2490, 0x2491, 0x2492, 0x2493, 0x2494, 0x2495, 0x2496, 0x2497, 0x2498, 0x2499, 0x249A, 0x249B, 0x249C, 0x249D, 0x249E, 0x249F,
  0x24A0, 0x24A1, 0x24A2, 0x24A3, 0x24A4, 0x24A5, 0x24A6, 0x24A7, 0x24A8, 0x24A9, 0x24AA, 0x24AB, 0x24AC, 0x24AD, 0x24AE, 0x24AF,
  0x24B0, 0x24B1, 0x24B2, 0x24B3, 0x24B4, 0x24B5, 0x24B6, 0x24B7, 0x24B8, 0x24B9, 0x24BA, 0x24BB, 0x24BC, 0x24BD, 0x24BE, 0x24BF,
  0x24C0, 0x24C1, 0x24C2, 0x24C3, 0x24C4, 0x24C5, 0x24C6, 0x24C7, 0x24C8, 0x24C9, 0x24CA, 0x24CB, 0x24CC, 0x24CD, 0x24CE, 0x24CF,
  0x24B6, 0x24B7, 0x24B8, 0x24B9, 0x24BA, 0x24BB, 0x24BC, 0x24BD, 0x24BE, 0x24BF, 0x24C0, 0x24C1, 0x24C2, 0x24C3, 0x24C4, 0x24C5,
  0x24C6, 0x24C7, 0x24C8, 0x24C9, 0x24CA, 0x24CB, 0x24CC, 0x24CD, 0x24CE, 0x24CF, 0x24EA, 0x24EB, 0x24EC, 0x24ED, 0x24EE, 0x24EF,
  0x24F0, 0x24F1, 0x24F2, 0x24F3, 0x24F4, 0x24F5, 0x24F6, 0x24F7, 0x24F8, 0x24F9, 0x24FA, 0x24FB, 0x24FC, 0x24FD, 0x24FE, 0x24FF
};
966 
// Collation weights for code points U+FF00..U+FFFF, indexed by the low byte of the code point.
// Every entry holds its own code point except indexes 0x41..0x5A (U+FF41..U+FF5A), which hold
// the values U+FF21..U+FF3A so that those characters compare equal to that range.
static const uint16_t planeFF[] = {
  0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F,
  0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F,
  0xFF20, 0xFF21, 0xFF22, 0xFF23, 0xFF24, 0xFF25, 0xFF26, 0xFF27, 0xFF28, 0xFF29, 0xFF2A, 0xFF2B, 0xFF2C, 0xFF2D, 0xFF2E, 0xFF2F,
  0xFF30, 0xFF31, 0xFF32, 0xFF33, 0xFF34, 0xFF35, 0xFF36, 0xFF37, 0xFF38, 0xFF39, 0xFF3A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F,
  0xFF40, 0xFF21, 0xFF22, 0xFF23, 0xFF24, 0xFF25, 0xFF26, 0xFF27, 0xFF28, 0xFF29, 0xFF2A, 0xFF2B, 0xFF2C, 0xFF2D, 0xFF2E, 0xFF2F,
  0xFF30, 0xFF31, 0xFF32, 0xFF33, 0xFF34, 0xFF35, 0xFF36, 0xFF37, 0xFF38, 0xFF39, 0xFF3A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F,
  0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 0xFF6D, 0xFF6E, 0xFF6F,
  0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F,
  0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F,
  0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F,
  0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF,
  0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, 0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF,
  0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF,
  0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF,
  0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 0xFFE5, 0xFFE6, 0xFFE7, 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF,
  0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF
};
985 
// Maps the high byte of a 16-bit code point (index 0x00..0xFF) to the weight table for that
// block of 256 code points. Only indexes 0x00-0x05, 0x1E, 0x1F, 0x21, 0x24 and 0xFF have a
// table; every other entry is NULL, meaning no weight table exists for that block.
static const uint16_t* const planemap[256] = {
    plane00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, plane1E, plane1F, NULL,
    plane21, NULL,    NULL,    plane24, NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    NULL,    NULL,    NULL,    NULL,    NULL, NULL, NULL,    NULL,    NULL,
    NULL,    NULL,    planeFF
};
1012 // clang-format on
1013 
GetCollationWeight(const wchar_t & r)1014 static wchar_t GetCollationWeight(const wchar_t& r)
1015 {
1016   // Lookup the "weight" of a UTF8 char, equivalent lowercase ascii letter, in the plane map,
1017   // the character comparison value used by using "accent folding" collation utf8_general_ci
1018   // in MySQL (AKA utf8mb3_general_ci in MariaDB 10)
1019   auto index = r >> 8;
1020   if (index > 255)
1021     return 0xFFFD;
1022   auto plane = planemap[index];
1023   if (plane == nullptr)
1024     return r;
1025   return static_cast<wchar_t>(plane[r & 0xFF]);
1026 }
1027 
1028 // Compares separately the numeric and alphabetic parts of a wide string.
1029 // returns negative if left < right, positive if left > right
1030 // and 0 if they are identical.
1031 // See also the equivalent StringUtils::AlphaNumericCollation() for UFT8 data
AlphaNumericCompare(const wchar_t * left,const wchar_t * right)1032 int64_t StringUtils::AlphaNumericCompare(const wchar_t* left, const wchar_t* right)
1033 {
1034   const wchar_t *l = left;
1035   const wchar_t *r = right;
1036   const wchar_t *ld, *rd;
1037   wchar_t lc, rc;
1038   int64_t lnum, rnum;
1039   bool lsym, rsym;
1040   while (*l != 0 && *r != 0)
1041   {
1042     // check if we have a numerical value
1043     if (*l >= L'0' && *l <= L'9' && *r >= L'0' && *r <= L'9')
1044     {
1045       ld = l;
1046       lnum = *ld++ - L'0';
1047       while (*ld >= L'0' && *ld <= L'9' && ld < l + 15)
1048       { // compare only up to 15 digits
1049         lnum *= 10;
1050         lnum += *ld++ - L'0';
1051       }
1052       rd = r;
1053       rnum = *rd++ - L'0';
1054       while (*rd >= L'0' && *rd <= L'9' && rd < r + 15)
1055       { // compare only up to 15 digits
1056         rnum *= 10;
1057         rnum += *rd++ - L'0';
1058       }
1059       // do we have numbers?
1060       if (lnum != rnum)
1061       { // yes - and they're different!
1062         return lnum - rnum;
1063       }
1064       l = ld;
1065       r = rd;
1066       continue;
1067     }
1068 
1069     lc = *l;
1070     rc = *r;
1071     // Put ascii punctuation and symbols e.g. !#$&()*+,-./:;<=>?@[\]^_ `{|}~ above the other
1072     // alphanumeric ascii, rather than some being mixed between the numbers and letters, and
1073     // above all other unicode letters, symbols and punctuation.
1074     // (Locale collation of these chars varies across platforms)
1075     lsym = (lc >= 32 && lc < L'0') || (lc > L'9' && lc < L'A') ||
1076            (lc > L'Z' && lc < L'a') || (lc > L'z' && lc < 128);
1077     rsym = (rc >= 32 && rc < L'0') || (rc > L'9' && rc < L'A') ||
1078            (rc > L'Z' && rc < L'a') || (rc > L'z' && rc < 128);
1079     if (lsym && !rsym)
1080       return -1;
1081     if (!lsym && rsym)
1082       return 1;
1083     if (lsym && rsym)
1084     {
1085       if (lc != rc)
1086         return lc - rc;
1087       else
1088       { // Same symbol advance to next wchar
1089         l++;
1090         r++;
1091         continue;
1092       }
1093     }
1094     if (!g_langInfo.UseLocaleCollation())
1095     {
1096       // Apply case sensitive accent folding collation to non-ascii chars.
1097       // This mimics utf8_general_ci collation, and provides simple collation of LATIN-1 chars
1098       // for any platformthat doesn't have a language specific collate facet implemented
1099       if (lc > 128)
1100         lc = GetCollationWeight(lc);
1101       if (rc > 128)
1102         rc = GetCollationWeight(rc);
1103     }
1104     // Do case less comparison, convert ascii upper case to lower case
1105     if (lc >= L'A' && lc <= L'Z')
1106       lc += L'a' - L'A';
1107     if (rc >= L'A' && rc <= L'Z')
1108       rc += L'a' - L'A';
1109 
1110     if (lc != rc)
1111     {
1112       if (!g_langInfo.UseLocaleCollation())
1113       {
1114         // Compare unicode (having applied accent folding collation to non-ascii chars).
1115         int i = wcsncmp(&lc, &rc, 1);
1116         return i;
1117       }
1118       else
1119       {
1120         // Fetch collation facet from locale to do comparison of wide char although on some
1121         // platforms this is not langauge specific but just compares unicode
1122         const std::collate<wchar_t>& coll =
1123             std::use_facet<std::collate<wchar_t>>(g_langInfo.GetSystemLocale());
1124         int cmp_res = coll.compare(&lc, &lc + 1, &rc, &rc + 1);
1125         if (cmp_res != 0)
1126           return cmp_res;
1127       }
1128     }
1129     l++; r++;
1130   }
1131   if (*r)
1132   { // r is longer
1133     return -1;
1134   }
1135   else if (*l)
1136   { // l is longer
1137     return 1;
1138   }
1139   return 0; // files are the same
1140 }
1141 
/*
  Decode the UTF8 character that z points to and return its Unicode code point.
  nKey is the number of bytes available at z (at least 1). bytes receives the number of
  continuation bytes consumed after the first byte (0 for a single byte character).
  Over-long encodings of values below 0x80, surrogates (0xD800-0xDFFF) and the values 0xFFFE
  and 0xFFFF are returned as the replacement character 0xFFFD.
  This only works right if z points to a well-formed UTF8 string.
  Byte-0    Byte-1    Byte-2    Byte-3     Value
  0xxxxxxx                                 00000000 00000000 0xxxxxxx
  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
*/
static uint32_t UTF8ToUnicode(const unsigned char* z, int nKey, unsigned char& bytes)
{
  // Payload bits of the first byte of a multi-byte UTF8 character, indexed by (byte - 0xc0)
  // clang-format off
  static const unsigned char utf8Trans1[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
  };
  // clang-format on

  bytes = 0;
  const unsigned char lead = z[0];
  // Single byte characters (and stray continuation bytes) are returned as they are
  if (lead < 0xc0)
    return lead;

  uint32_t codepoint = utf8Trans1[lead - 0xc0];
  int consumed = 1;
  // Fold in 6 bits from every continuation byte (10xxxxxx) that follows
  for (; consumed < nKey && (z[consumed] & 0xc0) == 0x80; ++consumed)
    codepoint = (codepoint << 6) + (z[consumed] & 0x3f);

  const bool overlong = codepoint < 0x80;
  const bool surrogate = (codepoint & 0xFFFFF800) == 0xD800;
  const bool nonCharacter = (codepoint & 0xFFFFFFFE) == 0xFFFE;
  if (overlong || surrogate || nonCharacter)
    codepoint = 0xFFFD;

  bytes = static_cast<unsigned char>(consumed - 1);
  return codepoint;
}
1186 
1187 /*
1188   SQLite collating function, see sqlite3_create_collation
1189   The equivalent of AlphaNumericCompare() but for comparing UTF8 encoded data
1190 
1191   This only processes enough data to find a difference, and avoids expensive data conversions.
1192   When sorting in memory item data is converted once to wstring in advance prior to sorting, the
1193   SQLite callback function can not do that kind of preparation. Instead, in order to use
1194   AlphaNumericCompare(), it would have to repeatedly convert the full input data to wstring for
1195   every pair comparison made. That approach was found to be 10 times slower than using this
1196   separate routine.
1197 */
AlphaNumericCollation(int nKey1,const void * pKey1,int nKey2,const void * pKey2)1198 int StringUtils::AlphaNumericCollation(int nKey1, const void* pKey1, int nKey2, const void* pKey2)
1199 {
1200   // Get exact matches of shorter text to start of larger test fast
1201   int n = std::min(nKey1, nKey2);
1202   int r = memcmp(pKey1, pKey2, n);
1203   if (r == 0)
1204     return nKey1 - nKey2;
1205 
1206   //Not a binary match, so process character at a time
1207   const unsigned char* zA = static_cast<const unsigned char*>(pKey1);
1208   const unsigned char* zB = static_cast<const unsigned char*>(pKey2);
1209   wchar_t lc, rc;
1210   unsigned char bytes;
1211   int64_t lnum, rnum;
1212   bool lsym, rsym;
1213   int ld, rd;
1214   int i = 0;
1215   int j = 0;
1216   // Looping Unicode point at a time through potentially 1 to 4 multi-byte encoded UTF8 data
1217   while (i < nKey1 && j < nKey2)
1218   {
1219     // Check if we have numerical values, compare only up to 15 digits
1220     if (isdigit(zA[i]) && isdigit(zB[j]))
1221     {
1222       lnum = zA[i] - '0';
1223       ld = i + 1;
1224       while (ld < nKey1 && isdigit(zA[ld]) && ld < i + 15)
1225       {
1226         lnum *= 10;
1227         lnum += zA[ld] - '0';
1228         ld++;
1229       }
1230       rnum = zB[j] - '0';
1231       rd = j + 1;
1232       while (rd < nKey2 && isdigit(zB[rd]) && rd < j + 15)
1233       {
1234         rnum *= 10;
1235         rnum += zB[rd] - '0';
1236         rd++;
1237       }
1238       // do we have numbers?
1239       if (lnum != rnum)
1240       { // yes - and they're different!
1241         return static_cast<int>(lnum - rnum);
1242       }
1243       // Advance to after digits
1244       i = ld;
1245       j = rd;
1246       continue;
1247     }
1248     // Put ascii punctuation and symbols e.g. !#$&()*+,-./:;<=>?@[\]^_ `{|}~ before the other
1249     // alphanumeric ascii, rather than some being mixed between the numbers and letters, and
1250     // above all other unicode letters, symbols and punctuation.
1251     // (Locale collation of these chars varies across platforms)
1252     lsym = (zA[i] >= 32 && zA[i] < '0') || (zA[i] > '9' && zA[i] < 'A') ||
1253            (zA[i] > 'Z' && zA[i] < 'a') || (zA[i] > 'z' && zA[i] < 128);
1254     rsym = (zB[j] >= 32 && zB[j] < '0') || (zB[j] > '9' && zB[j] < 'A') ||
1255            (zB[j] > 'Z' && zB[j] < 'a') || (zB[j] > 'z' && zB[j] < 128);
1256     if (lsym && !rsym)
1257       return -1;
1258     if (!lsym && rsym)
1259       return 1;
1260     if (lsym && rsym)
1261     {
1262       if (zA[i] != zB[j])
1263         return zA[i] - zB[j];
1264       else
1265       { // Same symbol advance to next
1266         i++;
1267         j++;
1268         continue;
1269       }
1270     }
1271     //Decode single (1 to 4 bytes) UTF8 character to Unicode
1272     lc = UTF8ToUnicode(&zA[i], nKey1 - i, bytes);
1273     i += bytes;
1274     rc = UTF8ToUnicode(&zB[j], nKey2 - j, bytes);
1275     j += bytes;
1276     if (!g_langInfo.UseLocaleCollation())
1277     {
1278       // Apply case sensitive accent folding collation to non-ascii chars.
1279       // This mimics utf8_general_ci collation, and provides simple collation of LATIN-1 chars
1280       // for any platform that doesn't have a language specific collate facet implemented
1281       if (lc > 128)
1282         lc = GetCollationWeight(lc);
1283       if (rc > 128)
1284         rc = GetCollationWeight(rc);
1285     }
1286     // Caseless comparison so convert ascii upper case to lower case
1287     if (lc >= 'A' && lc <= 'Z')
1288       lc += 'a' - 'A';
1289     if (rc >= 'A' && rc <= 'Z')
1290       rc += 'a' - 'A';
1291 
1292     if (lc != rc)
1293     {
1294       if (!g_langInfo.UseLocaleCollation() || (lc <= 128 && rc <= 128))
1295         // Compare unicode (having applied accent folding collation to non-ascii chars).
1296         return lc - rc;
1297       else
1298       {
1299         // Fetch collation facet from locale to do comparison of wide char although on some
1300         // platforms this is not langauge specific but just compares unicode
1301         const std::collate<wchar_t>& coll =
1302             std::use_facet<std::collate<wchar_t>>(g_langInfo.GetSystemLocale());
1303         int cmp_res = coll.compare(&lc, &lc + 1, &rc, &rc + 1);
1304         if (cmp_res != 0)
1305           return cmp_res;
1306       }
1307     }
1308     i++;
1309     j++;
1310   }
1311   // Compared characters of shortest are the same as longest, length determines order
1312   return (nKey1 - nKey2);
1313 }
1314 
DateStringToYYYYMMDD(const std::string & dateString)1315 int StringUtils::DateStringToYYYYMMDD(const std::string &dateString)
1316 {
1317   std::vector<std::string> days = StringUtils::Split(dateString, '-');
1318   if (days.size() == 1)
1319     return atoi(days[0].c_str());
1320   else if (days.size() == 2)
1321     return atoi(days[0].c_str())*100+atoi(days[1].c_str());
1322   else if (days.size() == 3)
1323     return atoi(days[0].c_str())*10000+atoi(days[1].c_str())*100+atoi(days[2].c_str());
1324   else
1325     return -1;
1326 }
1327 
ISODateToLocalizedDate(const std::string & strIsoDate)1328 std::string StringUtils::ISODateToLocalizedDate(const std::string& strIsoDate)
1329 {
1330   // Convert ISO8601 date strings YYYY, YYYY-MM, or YYYY-MM-DD to (partial) localized date strings
1331   CDateTime date;
1332   std::string formattedDate = strIsoDate;
1333   if (formattedDate.size() == 10)
1334   {
1335     date.SetFromDBDate(strIsoDate);
1336     formattedDate = date.GetAsLocalizedDate();
1337   }
1338   else if (formattedDate.size() == 7)
1339   {
1340     std::string strFormat = date.GetAsLocalizedDate(false);
1341     std::string tempdate;
1342     // find which date separator we are using.  Can be -./
1343     size_t pos = strFormat.find_first_of("-./");
1344     if (pos != std::string::npos)
1345     {
1346       bool yearFirst = strFormat.find("1601") == 0; // true if year comes first
1347       std::string sep = strFormat.substr(pos, 1);
1348       if (yearFirst)
1349       { // build formatted date with year first, then separator and month
1350         tempdate = formattedDate.substr(0, 4);
1351         tempdate += sep;
1352         tempdate += formattedDate.substr(5, 2);
1353       }
1354       else
1355       {
1356         tempdate = formattedDate.substr(5, 2);
1357         tempdate += sep;
1358         tempdate += formattedDate.substr(0, 4);
1359       }
1360       formattedDate = tempdate;
1361     }
1362   // return either just the year or the locally formatted version of the ISO date
1363   }
1364   return formattedDate;
1365 }
1366 
TimeStringToSeconds(const std::string & timeString)1367 long StringUtils::TimeStringToSeconds(const std::string &timeString)
1368 {
1369   std::string strCopy(timeString);
1370   StringUtils::Trim(strCopy);
1371   if(StringUtils::EndsWithNoCase(strCopy, " min"))
1372   {
1373     // this is imdb format of "XXX min"
1374     return 60 * atoi(strCopy.c_str());
1375   }
1376   else
1377   {
1378     std::vector<std::string> secs = StringUtils::Split(strCopy, ':');
1379     int timeInSecs = 0;
1380     for (unsigned int i = 0; i < 3 && i < secs.size(); i++)
1381     {
1382       timeInSecs *= 60;
1383       timeInSecs += atoi(secs[i].c_str());
1384     }
1385     return timeInSecs;
1386   }
1387 }
1388 
SecondsToTimeString(long lSeconds,TIME_FORMAT format)1389 std::string StringUtils::SecondsToTimeString(long lSeconds, TIME_FORMAT format)
1390 {
1391   bool isNegative = lSeconds < 0;
1392   lSeconds = std::abs(lSeconds);
1393 
1394   std::string strHMS;
1395   if (format == TIME_FORMAT_SECS)
1396     strHMS = StringUtils::Format("%i", lSeconds);
1397   else if (format == TIME_FORMAT_MINS)
1398     strHMS = StringUtils::Format("%i", lrintf(static_cast<float>(lSeconds) / 60.0f));
1399   else if (format == TIME_FORMAT_HOURS)
1400     strHMS = StringUtils::Format("%i", lrintf(static_cast<float>(lSeconds) / 3600.0f));
1401   else if (format & TIME_FORMAT_M)
1402     strHMS += StringUtils::Format("%i", lSeconds % 3600 / 60);
1403   else
1404   {
1405     int hh = lSeconds / 3600;
1406     lSeconds = lSeconds % 3600;
1407     int mm = lSeconds / 60;
1408     int ss = lSeconds % 60;
1409 
1410     if (format == TIME_FORMAT_GUESS)
1411       format = (hh >= 1) ? TIME_FORMAT_HH_MM_SS : TIME_FORMAT_MM_SS;
1412     if (format & TIME_FORMAT_HH)
1413       strHMS += StringUtils::Format("%2.2i", hh);
1414     else if (format & TIME_FORMAT_H)
1415       strHMS += StringUtils::Format("%i", hh);
1416     if (format & TIME_FORMAT_MM)
1417       strHMS += StringUtils::Format(strHMS.empty() ? "%2.2i" : ":%2.2i", mm);
1418     if (format & TIME_FORMAT_SS)
1419       strHMS += StringUtils::Format(strHMS.empty() ? "%2.2i" : ":%2.2i", ss);
1420   }
1421 
1422   if (isNegative)
1423     strHMS = "-" + strHMS;
1424 
1425   return strHMS;
1426 }
1427 
IsNaturalNumber(const std::string & str)1428 bool StringUtils::IsNaturalNumber(const std::string& str)
1429 {
1430   size_t i = 0, n = 0;
1431   // allow whitespace,digits,whitespace
1432   while (i < str.size() && isspace((unsigned char) str[i]))
1433     i++;
1434   while (i < str.size() && isdigit((unsigned char) str[i]))
1435   {
1436     i++; n++;
1437   }
1438   while (i < str.size() && isspace((unsigned char) str[i]))
1439     i++;
1440   return i == str.size() && n > 0;
1441 }
1442 
IsInteger(const std::string & str)1443 bool StringUtils::IsInteger(const std::string& str)
1444 {
1445   size_t i = 0, n = 0;
1446   // allow whitespace,-,digits,whitespace
1447   while (i < str.size() && isspace((unsigned char) str[i]))
1448     i++;
1449   if (i < str.size() && str[i] == '-')
1450     i++;
1451   while (i < str.size() && isdigit((unsigned char) str[i]))
1452   {
1453     i++; n++;
1454   }
1455   while (i < str.size() && isspace((unsigned char) str[i]))
1456     i++;
1457   return i == str.size() && n > 0;
1458 }
1459 
asciidigitvalue(char chr)1460 int StringUtils::asciidigitvalue(char chr)
1461 {
1462   if (!isasciidigit(chr))
1463     return -1;
1464 
1465   return chr - '0';
1466 }
1467 
asciixdigitvalue(char chr)1468 int StringUtils::asciixdigitvalue(char chr)
1469 {
1470   int v = asciidigitvalue(chr);
1471   if (v >= 0)
1472     return v;
1473   if (chr >= 'a' && chr <= 'f')
1474     return chr - 'a' + 10;
1475   if (chr >= 'A' && chr <= 'F')
1476     return chr - 'A' + 10;
1477 
1478   return -1;
1479 }
1480 
1481 
RemoveCRLF(std::string & strLine)1482 void StringUtils::RemoveCRLF(std::string& strLine)
1483 {
1484   StringUtils::TrimRight(strLine, "\n\r");
1485 }
1486 
SizeToString(int64_t size)1487 std::string StringUtils::SizeToString(int64_t size)
1488 {
1489   std::string strLabel;
1490   const char prefixes[] = {' ', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'};
1491   unsigned int i = 0;
1492   double s = (double)size;
1493   while (i < ARRAY_SIZE(prefixes) && s >= 1000.0)
1494   {
1495     s /= 1024.0;
1496     i++;
1497   }
1498 
1499   if (!i)
1500     strLabel = StringUtils::Format("%.lf B", s);
1501   else if (i == ARRAY_SIZE(prefixes))
1502   {
1503     if (s >= 1000.0)
1504       strLabel = StringUtils::Format(">999.99 %cB", prefixes[i - 1]);
1505     else
1506       strLabel = StringUtils::Format("%.2lf %cB", s, prefixes[i - 1]);
1507   }
1508   else if (s >= 100.0)
1509     strLabel = StringUtils::Format("%.1lf %cB", s, prefixes[i]);
1510   else
1511     strLabel = StringUtils::Format("%.2lf %cB", s, prefixes[i]);
1512 
1513   return strLabel;
1514 }
1515 
BinaryStringToString(const std::string & in)1516 std::string StringUtils::BinaryStringToString(const std::string& in)
1517 {
1518   std::string out;
1519   out.reserve(in.size() / 2);
1520   for (const char *cur = in.c_str(), *end = cur + in.size(); cur != end; ++cur) {
1521     if (*cur == '\\') {
1522       ++cur;
1523       if (cur == end) {
1524         break;
1525       }
1526       if (isdigit(*cur)) {
1527         char* end;
1528         unsigned long num = strtol(cur, &end, 10);
1529         cur = end - 1;
1530         out.push_back(num);
1531         continue;
1532       }
1533     }
1534     out.push_back(*cur);
1535   }
1536   return out;
1537 }
1538 
ToHexadecimal(const std::string & in)1539 std::string StringUtils::ToHexadecimal(const std::string& in)
1540 {
1541   std::ostringstream ss;
1542   ss << std::hex;
1543   for (unsigned char ch : in) {
1544     ss << std::setw(2) << std::setfill('0') << static_cast<unsigned long> (ch);
1545   }
1546   return ss.str();
1547 }
1548 
// Checks whether the NUL terminated UTF8 data at str starts with a latin letter.
// Returns the length in bytes of that letter (1 or 2), or -1 if it does not.
int IsUTF8Letter(const unsigned char *str)
{
  // reference:
  // unicode -> utf8 table: http://www.utf8-chartable.de/
  // latin characters in unicode: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode
  const unsigned char lead = str[0];
  if (lead == 0)
    return -1;

  // Single byte character: only the ascii letters qualify
  if (lead < 0x80)
  {
    const bool asciiLetter = (lead >= 'a' && lead <= 'z') || (lead >= 'A' && lead <= 'Z');
    return asciiLetter ? 1 : -1;
  }

  const unsigned char trail = str[1];
  if (trail == 0)
    return -1;

  const bool trailInRange = trail >= 0x80 && trail <= 0xBF;
  switch (lead)
  {
    case 0xC3:
      // latin 1 letter table: http://en.wikipedia.org/wiki/C1_Controls_and_Latin-1_Supplement
      // (0x97 and 0xB7 are excluded)
      return (trailInRange && trail != 0x97 && trail != 0xB7) ? 2 : -1;
    case 0xC4:
    case 0xC5:
    case 0xC6:
    case 0xC7:
      // latin extended A table: http://en.wikipedia.org/wiki/Latin_Extended-A
    case 0xC8:
    case 0xC9:
      // latin extended B table: http://en.wikipedia.org/wiki/Latin_Extended-B
      return trailInRange ? 2 : -1;
    case 0xCA:
      // International Phonetic Alphabet: http://en.wikipedia.org/wiki/IPA_Extensions_(Unicode_block)
      return (trail >= 0x80 && trail <= 0xAF) ? 2 : -1;
    default:
      return -1;
  }
}
1578 
FindWords(const char * str,const char * wordLowerCase)1579 size_t StringUtils::FindWords(const char *str, const char *wordLowerCase)
1580 {
1581   // NOTE: This assumes word is lowercase!
1582   const unsigned char *s = (const unsigned char *)str;
1583   do
1584   {
1585     // start with a compare
1586     const unsigned char *c = s;
1587     const unsigned char *w = (const unsigned char *)wordLowerCase;
1588     bool same = true;
1589     while (same && *c && *w)
1590     {
1591       unsigned char lc = *c++;
1592       if (lc >= 'A' && lc <= 'Z')
1593         lc += 'a'-'A';
1594 
1595       if (lc != *w++) // different
1596         same = false;
1597     }
1598     if (same && *w == 0)  // only the same if word has been exhausted
1599       return (const char *)s - str;
1600 
1601     // otherwise, skip current word (composed by latin letters) or number
1602     int l;
1603     if (*s >= '0' && *s <= '9')
1604     {
1605       ++s;
1606       while (*s >= '0' && *s <= '9') ++s;
1607     }
1608     else if ((l = IsUTF8Letter(s)) > 0)
1609     {
1610       s += l;
1611       while ((l = IsUTF8Letter(s)) > 0) s += l;
1612     }
1613     else
1614       ++s;
1615     while (*s && *s == ' ') s++;
1616 
1617     // and repeat until we're done
1618   } while (*s);
1619 
1620   return std::string::npos;
1621 }
1622 
1623 // assumes it is called from after the first open bracket is found
FindEndBracket(const std::string & str,char opener,char closer,int startPos)1624 int StringUtils::FindEndBracket(const std::string &str, char opener, char closer, int startPos)
1625 {
1626   int blocks = 1;
1627   for (unsigned int i = startPos; i < str.size(); i++)
1628   {
1629     if (str[i] == opener)
1630       blocks++;
1631     else if (str[i] == closer)
1632     {
1633       blocks--;
1634       if (!blocks)
1635         return i;
1636     }
1637   }
1638 
1639   return (int)std::string::npos;
1640 }
1641 
WordToDigits(std::string & word)1642 void StringUtils::WordToDigits(std::string &word)
1643 {
1644   static const char word_to_letter[] = "22233344455566677778889999";
1645   StringUtils::ToLower(word);
1646   for (unsigned int i = 0; i < word.size(); ++i)
1647   { // NB: This assumes ascii, which probably needs extending at some  point.
1648     char letter = word[i];
1649     if ((letter >= 'a' && letter <= 'z')) // assume contiguous letter range
1650     {
1651       word[i] = word_to_letter[letter-'a'];
1652     }
1653     else if (letter < '0' || letter > '9') // We want to keep 0-9!
1654     {
1655       word[i] = ' ';  // replace everything else with a space
1656     }
1657   }
1658 }
1659 
CreateUUID()1660 std::string StringUtils::CreateUUID()
1661 {
1662 #ifdef HAVE_NEW_CROSSGUID
1663   return xg::newGuid().str();
1664 #else
1665   static GuidGenerator guidGenerator;
1666   auto guid = guidGenerator.newGuid();
1667 
1668   std::stringstream strGuid; strGuid << guid;
1669   return strGuid.str();
1670 #endif
1671 }
1672 
ValidateUUID(const std::string & uuid)1673 bool StringUtils::ValidateUUID(const std::string &uuid)
1674 {
1675   CRegExp guidRE;
1676   guidRE.RegComp(ADDON_GUID_RE);
1677   return (guidRE.RegFind(uuid.c_str()) == 0);
1678 }
1679 
CompareFuzzy(const std::string & left,const std::string & right)1680 double StringUtils::CompareFuzzy(const std::string &left, const std::string &right)
1681 {
1682   return (0.5 + fstrcmp(left.c_str(), right.c_str()) * (left.length() + right.length())) / 2.0;
1683 }
1684 
FindBestMatch(const std::string & str,const std::vector<std::string> & strings,double & matchscore)1685 int StringUtils::FindBestMatch(const std::string &str, const std::vector<std::string> &strings, double &matchscore)
1686 {
1687   int best = -1;
1688   matchscore = 0;
1689 
1690   int i = 0;
1691   for (std::vector<std::string>::const_iterator it = strings.begin(); it != strings.end(); ++it, i++)
1692   {
1693     int maxlength = std::max(str.length(), it->length());
1694     double score = StringUtils::CompareFuzzy(str, *it) / maxlength;
1695     if (score > matchscore)
1696     {
1697       matchscore = score;
1698       best = i;
1699     }
1700   }
1701   return best;
1702 }
1703 
// Returns true when at least one entry of keywords occurs as a substring of
// str. The comparison is an exact (case-sensitive) std::string::find, so an
// empty keyword always matches. An empty keywords list yields false.
bool StringUtils::ContainsKeyword(const std::string &str, const std::vector<std::string> &keywords)
{
  return std::any_of(keywords.begin(), keywords.end(),
                     [&str](const std::string& keyword)
                     { return str.find(keyword) != std::string::npos; });
}
1713 
// Counts the characters of a NUL-terminated UTF-8 string by counting every
// byte that is not a continuation byte (bit pattern 10xxxxxx).
// The input is not validated; s must not be a null pointer.
size_t StringUtils::utf8_strlen(const char *s)
{
  size_t count = 0;
  for (const char* cursor = s; *cursor != '\0'; ++cursor)
  {
    const bool isContinuationByte = (*cursor & 0xC0) == 0x80;
    if (!isContinuationByte)
      ++count;
  }
  return count;
}
1724 
Paramify(const std::string & param)1725 std::string StringUtils::Paramify(const std::string &param)
1726 {
1727   std::string result = param;
1728   // escape backspaces
1729   StringUtils::Replace(result, "\\", "\\\\");
1730   // escape double quotes
1731   StringUtils::Replace(result, "\"", "\\\"");
1732 
1733   // add double quotes around the whole string
1734   return "\"" + result + "\"";
1735 }
1736 
Tokenize(const std::string & input,const std::string & delimiters)1737 std::vector<std::string> StringUtils::Tokenize(const std::string &input, const std::string &delimiters)
1738 {
1739   std::vector<std::string> tokens;
1740   Tokenize(input, tokens, delimiters);
1741   return tokens;
1742 }
1743 
// Splits input into tokens, treating every character of delimiters as a
// separator. Runs of separators (also leading and trailing ones) are skipped,
// so no empty tokens are produced. Any previous content of tokens is dropped.
void StringUtils::Tokenize(const std::string& input, std::vector<std::string>& tokens, const std::string& delimiters)
{
  tokens.clear();

  // start at the first character that is not a delimiter
  for (auto tokenStart = input.find_first_not_of(delimiters); tokenStart != std::string::npos;)
  {
    // the token runs up to the next delimiter or to the end of the input
    const auto tokenEnd = input.find_first_of(delimiters, tokenStart);
    tokens.emplace_back(input, tokenStart, tokenEnd - tokenStart);

    if (tokenEnd == std::string::npos)
      break; // last token reached the end of the input

    // jump over the delimiter run that follows the token
    tokenStart = input.find_first_not_of(delimiters, tokenEnd);
  }
}
1759 
Tokenize(const std::string & input,const char delimiter)1760 std::vector<std::string> StringUtils::Tokenize(const std::string &input, const char delimiter)
1761 {
1762   std::vector<std::string> tokens;
1763   Tokenize(input, tokens, delimiter);
1764   return tokens;
1765 }
1766 
// Splits input into tokens separated by delimiter. Runs of the delimiter
// (also leading and trailing ones) are skipped, so no empty tokens are
// produced. Any previous content of tokens is dropped.
void StringUtils::Tokenize(const std::string& input, std::vector<std::string>& tokens, const char delimiter)
{
  tokens.clear();

  // position of the first character that is not the delimiter
  auto start = input.find_first_not_of(delimiter);
  while (start != std::string::npos)
  {
    const auto stop = input.find(delimiter, start);
    if (stop == std::string::npos)
    {
      // no further delimiter: the rest of the input is the last token
      tokens.push_back(input.substr(start));
      break;
    }

    tokens.push_back(input.substr(start, stop - start));
    // continue behind the delimiter run that follows the token
    start = input.find_first_not_of(delimiter, stop);
  }
}
1782 
// Parses an unsigned 64 bit integer from the start of str.
//
// Leading whitespace is skipped and parsing stops at the first character that
// cannot belong to the number, so "12abc" yields 12.
//
// Returns the parsed value, or fallback when no number could be extracted
// (empty or non-numeric input, or a value that does not fit into uint64_t).
uint64_t StringUtils::ToUint64(const std::string& str, uint64_t fallback) noexcept
{
  std::istringstream iss(str);

  // Since C++11 a failed extraction overwrites its target (with 0, or with
  // the maximum value on overflow). Parsing straight into a variable that was
  // pre-set to fallback therefore loses the fallback, so parse into a scratch
  // value and only use it when the stream reports success.
  uint64_t parsed = 0;
  if (iss >> parsed)
    return parsed;

  return fallback;
}
1790 
FormatFileSize(uint64_t bytes)1791 std::string StringUtils::FormatFileSize(uint64_t bytes)
1792 {
1793   const std::array<std::string, 6> units{{"B", "kB", "MB", "GB", "TB", "PB"}};
1794   if (bytes < 1000)
1795     return Format("%" PRIu64 "B", bytes);
1796 
1797   size_t i = 0;
1798   double value = static_cast<double>(bytes);
1799   while (i + 1 < units.size() && value >= 999.5)
1800   {
1801     ++i;
1802     value /= 1024.0;
1803   }
1804   unsigned int decimals = value < 9.995 ? 2 : (value < 99.95 ? 1 : 0);
1805   auto frmt = "%." + Format("%u", decimals) + "f%s";
1806   return Format(frmt.c_str(), value, units[i].c_str());
1807 }
1808 
GetOriginalLocale()1809 const std::locale& StringUtils::GetOriginalLocale() noexcept
1810 {
1811   return g_langInfo.GetOriginalLocale();
1812 }
1813