1 /*
2 * Copyright (C) 2013-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9 #include "Utf8Utils.h"
10
11
checkStrForUtf8(const std::string & str)12 CUtf8Utils::utf8CheckResult CUtf8Utils::checkStrForUtf8(const std::string& str)
13 {
14 const char* const strC = str.c_str();
15 const size_t len = str.length();
16 size_t pos = 0;
17 bool isPlainAscii = true;
18
19 while (pos < len)
20 {
21 const size_t chrLen = SizeOfUtf8Char(strC + pos);
22 if (chrLen == 0)
23 return hiAscii; // non valid UTF-8 sequence
24 else if (chrLen > 1)
25 isPlainAscii = false;
26
27 pos += chrLen;
28 }
29
30 if (isPlainAscii)
31 return plainAscii; // only single-byte characters (valid for US-ASCII and for UTF-8)
32
33 return utf8string; // valid UTF-8 with at least one valid UTF-8 multi-byte sequence
34 }
35
36
37
FindValidUtf8Char(const std::string & str,const size_t startPos)38 size_t CUtf8Utils::FindValidUtf8Char(const std::string& str, const size_t startPos /*= 0*/)
39 {
40 const char* strC = str.c_str();
41 const size_t len = str.length();
42
43 size_t pos = startPos;
44 while (pos < len)
45 {
46 if (SizeOfUtf8Char(strC + pos))
47 return pos;
48
49 pos++;
50 }
51
52 return std::string::npos;
53 }
54
RFindValidUtf8Char(const std::string & str,const size_t startPos)55 size_t CUtf8Utils::RFindValidUtf8Char(const std::string& str, const size_t startPos)
56 {
57 const size_t len = str.length();
58 if (!len)
59 return std::string::npos;
60
61 const char* strC = str.c_str();
62 size_t pos = (startPos >= len) ? len - 1 : startPos;
63 while (pos < len) // pos is unsigned, after zero pos becomes large then len
64 {
65 if (SizeOfUtf8Char(strC + pos))
66 return pos;
67
68 pos--;
69 }
70
71 return std::string::npos;
72 }
73
SizeOfUtf8Char(const std::string & str,const size_t charStart)74 inline size_t CUtf8Utils::SizeOfUtf8Char(const std::string& str, const size_t charStart /*= 0*/)
75 {
76 if (charStart >= str.length())
77 return std::string::npos;
78
79 return SizeOfUtf8Char(str.c_str() + charStart);
80 }
81
82 // must be used only internally in class!
83 // str must be null-terminated
SizeOfUtf8Char(const char * const str)84 inline size_t CUtf8Utils::SizeOfUtf8Char(const char* const str)
85 {
86 if (!str)
87 return 0;
88
89 const unsigned char* const strU = (const unsigned char*)str;
90 const unsigned char chr = strU[0];
91
92 /* this is an implementation of http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G27506 */
93
94 /* U+0000 - U+007F in UTF-8 */
95 if (chr <= 0x7F)
96 return 1;
97
98 /* U+0080 - U+07FF in UTF-8 */ /* binary representation and range */
99 if (chr >= 0xC2 && chr <= 0xDF /* C2=1100 0010 - DF=1101 1111 */
100 // as str is null terminated,
101 && ((strU[1] & 0xC0) == 0x80)) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
102 return 2; // valid UTF-8 2 bytes sequence
103
104 /* U+0800 - U+0FFF in UTF-8 */
105 if (chr == 0xE0 /* E0=1110 0000 */
106 && (strU[1] & 0xE0) == 0xA0 /* E0=1110 0000, A0=1010 0000 - BF=1011 1111 */
107 && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
108 return 3; // valid UTF-8 3 bytes sequence
109
110 /* U+1000 - U+CFFF in UTF-8 */
111 /* skip U+D000 - U+DFFF (handled later) */
112 /* U+E000 - U+FFFF in UTF-8 */
113 if (((chr >= 0xE1 && chr <= 0xEC) /* E1=1110 0001 - EC=1110 1100 */
114 || chr == 0xEE || chr == 0xEF) /* EE=1110 1110 - EF=1110 1111 */
115 && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
116 && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
117 return 3; // valid UTF-8 3 bytes sequence
118
119 /* U+D000 - U+D7FF in UTF-8 */
120 /* note: range U+D800 - U+DFFF is reserved and invalid */
121 if (chr == 0xED /* ED=1110 1101 */
122 && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
123 && (strU[2] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
124 return 3; // valid UTF-8 3 bytes sequence
125
126 /* U+10000 - U+3FFFF in UTF-8 */
127 if (chr == 0xF0 /* F0=1111 0000 */
128 && (strU[1] & 0xE0) == 0x80 /* E0=1110 0000, 80=1000 0000 - 9F=1001 1111 */
129 && strU[2] >= 0x90 && strU[2] <= 0xBF /* 90=1001 0000 - BF=1011 1111 */
130 && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
131 return 4; // valid UTF-8 4 bytes sequence
132
133 /* U+40000 - U+FFFFF in UTF-8 */
134 if (chr >= 0xF1 && chr <= 0xF3 /* F1=1111 0001 - F3=1111 0011 */
135 && (strU[1] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
136 && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
137 && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
138 return 4; // valid UTF-8 4 bytes sequence
139
140 /* U+100000 - U+10FFFF in UTF-8 */
141 if (chr == 0xF4 /* F4=1111 0100 */
142 && (strU[1] & 0xF0) == 0x80 /* F0=1111 0000, 80=1000 0000 - 8F=1000 1111 */
143 && (strU[2] & 0xC0) == 0x80 /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
144 && (strU[3] & 0xC0) == 0x80) /* C0=1100 0000, 80=1000 0000 - BF=1011 1111 */
145 return 4; // valid UTF-8 4 bytes sequence
146
147 return 0; // invalid UTF-8 char sequence
148 }
149